2021-07-27 04:58:07 +00:00
|
|
|
import csv
|
|
|
|
import json
|
|
|
|
import re
|
2021-07-27 04:30:44 +00:00
|
|
|
from os.path import isfile
|
2021-07-27 04:42:37 +00:00
|
|
|
from app.models import *
|
2021-07-27 07:17:39 +00:00
|
|
|
from app.formats import *
|
2021-07-27 04:30:44 +00:00
|
|
|
|
2021-07-27 04:42:37 +00:00
|
|
|
from django.db import transaction
|
|
|
|
|
|
|
|
|
2021-07-27 04:30:44 +00:00
|
|
|
# Correct commas inside of a linked field
|
|
|
|
def fix_bracketed_lists(data):
    """Replace commas inside parenthesised groups with ' /'.

    A group is a run such as ``(a, b)``: an opening parenthesis, a comma
    somewhere before the closing parenthesis, and no whitespace other than
    spaces directly after that comma. Every comma inside a matched group
    becomes ' /', so a later split on commas keeps the group in one piece.

    Args:
        data: the text to clean up.

    Returns:
        The text with each matched group rewritten; text without such
        groups is returned unchanged.
    """
    bracketed_group = re.compile(r'\([^\s]*,[ ]*[^\s]*\)')
    result = data
    # Matches are collected from the original text; each one is then
    # substituted everywhere it occurs in the running result.
    for match in bracketed_group.finditer(data):
        group = match.group(0)
        result = result.replace(group, group.replace(',', ' /'))
    return result
|
|
|
|
|
2021-07-27 04:58:07 +00:00
|
|
|
|
2021-07-27 04:30:44 +00:00
|
|
|
# Check for valid link
|
|
|
|
def fix_url(link):
    """Return *link* with an 'http://' prefix added when it lacks one.

    The prefix is only added when the link is longer than three characters
    and does not already start with 'http'; anything else is returned
    unchanged.

    Args:
        link: the URL text to check.

    Returns:
        The original or prefixed link.
    """
    needs_scheme = len(link) > 3 and not link.startswith('http')
    return 'http://' + link if needs_scheme else link
|
|
|
|
|
2021-07-27 04:58:07 +00:00
|
|
|
|
2021-07-27 04:30:44 +00:00
|
|
|
# Create linked objects
|
|
|
|
def add_linked(person, field, obj, data):
    """Attach named ``obj`` records listed in *data* to *field*.

    The text is passed through ``fix_bracketed_lists``, lower-cased and
    split on ';' and ','. Each stripped name of at least three characters
    is looked up by ``name``; when no record exists one is created and
    saved. The record is then appended to *field*.

    Args:
        person: not used by this function; kept for the callers' signature.
        field: collection the records are appended to (must offer
            ``append``).
        obj: model class queried through ``obj.objects`` and instantiated
            for missing names.
        data: delimited text listing the names.
    """
    # TODO: fuzzy matching instead of lower()
    normalized = fix_bracketed_lists(data).lower().replace(';', ',')
    for chunk in normalized.split(','):
        name = chunk.strip()
        if len(name) >= 3:
            target = obj.objects.filter(name=name).first()
            if not target:
                # No record with this name yet: create and persist one.
                target = obj()
                target.name = name
                target.save()
            field.append(target)
|
|
|
|
|
2021-07-27 04:58:07 +00:00
|
|
|
|
2021-07-27 04:30:44 +00:00
|
|
|
# Fetch an object by source_id (numeric identifier used in source DB)
|
|
|
|
def get_by_id(rowid, obj, first=True):
    """Fetch ``obj`` records by source_id (numeric identifier of the source DB).

    Args:
        rowid: the identifier, either an ``int`` or a string of decimal
            digits. Surrounding whitespace in a string is ignored.
        obj: model class queried through ``obj.objects.filter``.
        first: when true return only the first match, otherwise return
            the whole filtered result.

    Returns:
        A ``(result, source_id)`` tuple, where ``source_id`` is the
        identifier as an ``int``. ``(None, None)`` is returned when *rowid*
        is not a usable number.
    """
    if isinstance(rowid, str):
        candidate = rowid.strip()
        # isdecimal() only accepts characters int() can parse; isdigit()
        # also accepts e.g. superscript digits, which make int() raise.
        if candidate.isdecimal():
            rowid = int(candidate)
    # bool is an int subclass but is never a valid identifier here.
    if not isinstance(rowid, int) or isinstance(rowid, bool):
        return None, None
    matches = obj.objects.filter(source_id=rowid)
    if first:
        return matches.first(), rowid
    return matches, rowid
|
|
|
|
|
2021-07-27 04:58:07 +00:00
|
|
|
|
2021-07-27 04:30:44 +00:00
|
|
|
# Quick check of the number of lines
|
|
|
|
def get_total_rows_csv(filename):
    """Quickly count the lines in a text file.

    Every line is counted, so for a CSV file the header line is included.
    The file is read as UTF-8 with undecodable bytes ignored, the same way
    ``refresh_data`` opens it, so stray bytes cannot abort the count.

    Args:
        filename: path of the file to scan.

    Returns:
        The number of lines; 0 for an empty file.
    """
    with open(filename, 'rt', encoding='utf-8', errors='ignore') as f:
        return sum(1 for _ in f)
|
|
|
|
|
2021-07-27 04:58:07 +00:00
|
|
|
|
2021-07-27 04:30:44 +00:00
|
|
|
# Search index routine
|
|
|
|
def reindex_data():
    """Rebuild the search index: call ``index()`` then ``save()`` on every Person."""
    for person in Person.objects.all():
        person.index()
        person.save()
|
2021-07-27 04:30:44 +00:00
|
|
|
|
2021-07-27 04:58:07 +00:00
|
|
|
|
2021-07-27 04:30:44 +00:00
|
|
|
# Data update routine
|
|
|
|
def refresh_data(filename, fmt=None):
    """Import or update records from a data file, reporting progress.

    This is a generator: iterate it to run the import. CSV files are
    handled per ``fmt['dataformat']`` (person, resource and range details,
    plus person-resource and person-range links); GeoJSON files update the
    name and countries of existing ranges.

    Args:
        filename: path of the CSV or GeoJSON file to load.
        fmt: mapping describing the file. The keys read here are
            'filename', 'extension', 'dataformat' and, for CSV files,
            'required' (column names every row must contain). Despite the
            ``None`` default it is needed for anything beyond reporting a
            missing file.

    Yields:
        ``(position, fraction)`` once per CSV row or GeoJSON feature,
        ``(message, "error")`` right before the import is aborted, and
        ``(None, None)`` once the import has finished.

    Returns:
        The running count (records saved for CSV, features visited for
        GeoJSON), delivered as the generator's ``StopIteration`` value.
    """
    count = 0
    rowcount = 0

    if not isfile(filename):
        # fmt defaults to None: fall back to the path so the abort message
        # is still produced instead of a TypeError.
        label = fmt['filename'] if fmt else filename
        msg = "Missing data: %s - refresh aborted." % label
        print(msg)
        yield msg, "error"
        return

    if fmt['extension'] == 'csv':
        totalrows = get_total_rows_csv(filename)
        with open(filename, 'rt', encoding='utf-8', errors='ignore') as csvfile:
            for row in csv.DictReader(csvfile):
                rowcount += 1
                yield rowcount, rowcount / totalrows

                # Abort the whole import as soon as a required column is absent.
                for r in fmt['required']:
                    if r not in row:
                        msg = "Missing attribute in %s (%s)" % (r, fmt['filename'])
                        print(msg)
                        yield msg, "error"
                        return

                if fmt['dataformat'] is DataFormat.PERSON_DETAIL:
                    # Match by source id first, then by name, else create.
                    person, source_id = get_by_id(row['ID'], Person)
                    if not person:
                        person = Person.objects.filter(
                            first_name=row['First name'],
                            last_name=row['Last name'],
                        ).first()
                    if not person:
                        person = Person(
                            first_name=row['First name'],
                            last_name=row['Last name'],
                            source_id=row['ID'],
                        )

                    # Update data fields
                    person.source_id = source_id
                    person.title = row['Title']
                    person.organisation = row['Organisation English']
                    person.country = row['Country']
                    person.position = row['Position']
                    person.biography = row['Biography']
                    person.contact_email = row['e-mail 1']
                    person.personal_url = fix_url(row['URL'])

                    with transaction.atomic():
                        add_linked(person, person.research_methods, Method, row['Methods'])
                        add_linked(person, person.research_scales, Scale, row['Scale'])
                        add_linked(person, person.research_taxa, Taxon, row['Taxa'])
                        add_linked(person, person.research_fields, Field, row['Field of expertise'])

                    person.index()
                    person.save()
                    count = count + 1

                elif fmt['dataformat'] is DataFormat.RESOURCE_DETAIL:
                    res, source_id = get_by_id(row['ID'], Resource)
                    if not res:
                        res = Resource(source_id=source_id)
                    res.title = row['Title']
                    res.citation = row['Citation']
                    res.url = fix_url(row['URL'].strip('#'))  # remove weird #formatting#
                    res.abstract = row['Abstract']
                    res.save()
                    count = count + 1

                elif fmt['dataformat'] is DataFormat.RANGE_DETAIL:
                    rng, source_id = get_by_id(row['Range_ID'], Range)
                    if not rng:
                        rng = Range(source_id=source_id)
                    rng.gmba_id = row['GMBA_ID']
                    rng.name = row['RangeName']
                    rng.countries = row['Countries']
                    rng.save()
                    count = count + 1

                elif fmt['dataformat'] is DataFormat.PERSON_RESOURCE:
                    # Rows whose resource or person cannot be resolved are skipped.
                    rzs, source_id = get_by_id(row['Resource'], Resource, first=False)
                    if not rzs or not rzs.first():
                        continue
                    ppl, source_id = get_by_id(row['Person'], Person, first=False)
                    if not ppl or not ppl.first():
                        continue
                    for person in ppl:
                        for r in rzs:
                            # NOTE(review): Django many-to-many managers offer
                            # add(), not append() - confirm `resources`
                            # really supports append().
                            person.resources.append(r)
                        person.save()
                        count = count + 1

                elif fmt['dataformat'] is DataFormat.PERSON_RANGE:
                    # Rows whose range or person cannot be resolved are skipped.
                    rzs, source_id = get_by_id(row['MountainRange'], Range, first=False)
                    if not rzs or not rzs.first():
                        continue
                    ppl, source_id = get_by_id(row['Person'], Person, first=False)
                    if not ppl or not ppl.first():
                        continue
                    for person in ppl:
                        for r in rzs:
                            # NOTE(review): same append()/add() question as
                            # for `resources` - confirm against the model.
                            person.ranges.append(r)
                        person.save()
                        count = count + 1

    elif fmt['extension'] == 'geojson':
        ranges_missing = []
        # Load everything up front so the file is not held open across yields.
        with open(filename, 'rt', encoding='utf-8', errors='ignore') as jsonfile:
            jsondata = json.load(jsonfile)

        if fmt['dataformat'] is DataFormat.RANGE_SHAPES:
            features = jsondata['features']
            totalrows = len(features)
            for f in features:
                yield count, count / totalrows
                count = count + 1

                p = f['properties']
                rge = Range.objects.filter(gmba_id=p['GMBA_ID']).first()
                if not rge:
                    ranges_missing.append(p['GMBA_ID'])
                    continue
                rge.name = p['Name']
                # The last of these properties that is present wins.
                for c in ['Country_1', 'Country_2_']:
                    if c in p:
                        rge.countries = p[c]
                rge.save()

        if ranges_missing:
            print("Warning: %d ranges not found" % len(ranges_missing))
            # IDs may be numbers in the GeoJSON; join() needs strings.
            print("[%s]" % ', '.join(str(gmba_id) for gmba_id in ranges_missing))

    yield None, None
    return count
|