Django uses this to get the string representation of an object
192 lines
7.2 KiB
192 lines
7.2 KiB
import csv
import json
import re
from os.path import isfile
from app.models import *
from app.formats import *
from django.db import transaction
# Correct commas inside of a linked field
def fix_bracketed_lists(data):
    """Replace commas inside parenthesised groups with `` /``.

    Linked fields are later split on commas, so a comma inside brackets,
    e.g. ``"plants (trees,shrubs), fungi"``, would cut one entry in two.
    Each bracketed group matched by the pattern has its commas rewritten
    (``"(trees,shrubs)"`` -> ``"(trees /shrubs)"``) before the split.

    Only groups whose text before the comma contains no whitespace are
    matched; anything else is left untouched.

    Args:
        data: raw field text. ``None`` or an empty string is returned
            unchanged instead of raising inside ``re.findall``.

    Returns:
        The text with commas in matched bracket groups replaced.
    """
    if not data:
        return data
    for fix in re.findall(r'\([^\s]*,[ ]*[^\s]*\)', data):
        data = data.replace(fix, fix.replace(',', ' /'))
    return data
# Check for valid link
def fix_url(link):
    """Return *link* with an ``http://`` scheme prepended when it lacks one.

    Values of three characters or fewer are treated as placeholders (empty
    cells, "n/a", "-") and returned as they are. The scheme test is
    case-insensitive so ``HTTP://...`` is not prefixed a second time.

    Args:
        link: raw URL text from the source data; ``None`` is passed through.

    Returns:
        The (possibly prefixed) link, or the input unchanged.
    """
    if not link:
        return link
    if len(link) > 3 and not link.lower().startswith('http'):
        link = 'http://' + link
    # Cells holding several ';'-separated links are passed through whole.
    return link
# Create linked objects
def add_linked(person, field, obj, data):
    """Link *person* to the objects named in a delimited text field.

    ``data`` is split on commas and semicolons (after protecting commas
    inside brackets via ``fix_bracketed_lists``) and lower-cased. For each
    name of at least three characters the matching ``obj`` row is looked up
    by ``name``; if none exists a new one is created and saved. The object
    is then attached through ``field``.

    Args:
        person: the owning record. Not used directly here because the
            relation is supplied through ``field``; kept for callers.
        field: the relation to add to, e.g. ``person.research_methods``.
            NOTE(review): assumed to be a Django related manager exposing
            ``add()`` -- confirm against the model definitions.
        obj: model class with a ``name`` attribute and an ``objects`` manager.
        data: raw text from the source row; empty/``None`` is a no-op.
    """
    if not data:
        return
    # TODO: fuzzy matching instead of lower()
    items = fix_bracketed_lists(data).lower()
    for item in items.replace(';', ',').split(','):
        name = item.strip()
        # Fragments this short are separators or noise, not real names.
        if len(name) < 3:
            continue
        tgt = obj.objects.filter(name=name).first()
        if not tgt:
            tgt = obj()
            tgt.name = name
            # Must be persisted before it can take part in a relation.
            tgt.save()
        field.add(tgt)
# Fetch an object by source_id (numeric identifier used in source DB)
def get_by_id(rowid, obj, first=True):
    """Look up ``obj`` rows by their numeric ``source_id``.

    Args:
        rowid: an ``int``, or a string holding a decimal integer
            (surrounding whitespace is ignored). Anything else is treated
            as "no usable id".
        obj: model class whose ``objects.filter(source_id=...)`` is queried.
        first: when true return only the first match (or ``None``);
            otherwise return the whole filtered result.

    Returns:
        ``(match_or_matches, id_as_int)`` when ``rowid`` is usable,
        ``(None, None)`` otherwise.
    """
    if isinstance(rowid, str):
        text = rowid.strip()
        # isdecimal() only admits characters int() can parse; isdigit()
        # also admits e.g. superscript digits, which make int() raise.
        if text.isdecimal():
            rowid = int(text)
    # bool is a subclass of int; keep rejecting it as the exact type check did.
    if isinstance(rowid, int) and not isinstance(rowid, bool):
        matches = obj.objects.filter(source_id=rowid)
        if first:
            return matches.first(), rowid
        return matches, rowid
    return None, None
# Quick check of the number of lines
def get_total_rows_csv(filename):
    """Return the number of physical lines in *filename*.

    This is a cheap estimate of the row total for progress reporting: the
    header line is included and a quoted field spanning several lines is
    counted once per line. An empty file yields ``0``.

    The file is opened as UTF-8 with undecodable bytes ignored, matching
    how the CSV is read for import, so that counting cannot fail on bytes
    the import itself would tolerate.
    """
    with open(filename, 'rt', encoding='utf-8', errors='ignore') as f:
        return sum(1 for _ in f)
# Search index routine
def reindex_data():
# Iterates over every record returned by Person.objects.all().
# NOTE(review): the loop body is not present in this copy of the file, so
# what is done for each person cannot be determined from here -- restore it
# from the original source. The enumerate() index `i` is unused in the
# visible code.
for i, p in enumerate(Person.objects.all()):
# Data update routine
def refresh_data(filename, fmt=None):
# Generator that reads the data file `filename` as described by `fmt` and
# reports progress to the caller through yielded 2-tuples:
#   (message, "error")                 file missing / required column absent
#   (rowcount, rowcount / totalrows)   progress while reading a CSV file
#   (count, count / totalrows)         progress per GeoJSON feature
#   (None, None)                       final item
# Because this is a generator, the trailing `return count` reaches callers
# only as StopIteration.value.
#
# `fmt` is subscripted with 'filename', 'extension', 'dataformat' and
# 'required', so despite the None default a mapping must be supplied.
#
# NOTE(review): this copy of the function has lost its indentation and some
# statements appear to be missing (see the notes below). No call that
# persists the updated objects is visible anywhere in the body -- confirm
# against the original source before relying on this text.
count = 0
rowcount = 0
if not isfile(filename):
msg = "Missing data: %s - refresh aborted." % fmt['filename']
yield msg, "error"
return None
if fmt['extension'] == 'csv':
# Line total is used only as the denominator of the progress fraction.
totalrows = get_total_rows_csv(filename)
with open(filename, 'rt', encoding='utf-8', errors='ignore') as csvfile:
datareader = csv.DictReader(csvfile)
for row in datareader:
rowcount += 1
# NOTE(review): the statement guarded by this check seems to be missing;
# presumably empty rows are skipped and the yield below runs for every row.
if row is None:
yield rowcount, rowcount / totalrows
# # Ensure any new data is flushed from time to time
# if count % 25 == 0:
# db.session.commit()
# Abort the whole refresh if a column listed in fmt['required'] is absent.
for r in fmt['required']:
if not r in row:
msg = "Missing attribute in %s (%s)" % (r, fmt['filename'])
yield msg, "error"
return None
if fmt['dataformat'] is DataFormat.PERSON_DETAIL:
# Match by source id first, then fall back to first + last name,
# and only then build a new Person.
person, source_id = get_by_id(row['ID'], Person)
if not person:
person = Person.objects.filter(first_name=row['First name'], last_name=row['Last name']).first()
if not person:
person = Person(first_name=row['First name'], last_name=row['Last name'], source_id=row['ID'])
# Update data fields
person.source_id = source_id
person.title = row['Title']
person.organisation = row['Organisation English']
person.country = row['Country']
person.position = row['Position']
person.biography = row['Biography']
person.contact_email = row['e-mail 1']
person.personal_url = fix_url(row['URL'])
# The four linked-field updates are grouped in one database transaction.
with transaction.atomic():
add_linked(person, person.research_methods, Method, row['Methods'])
add_linked(person, person.research_scales, Scale, row['Scale'])
add_linked(person, person.research_taxa, Taxon, row['Taxa'])
add_linked(person, person.research_fields, Field, row['Field of expertise'])
count = count + 1
elif fmt['dataformat'] is DataFormat.RESOURCE_DETAIL:
res, source_id = get_by_id(row['ID'], Resource)
if not res: res = Resource(source_id=source_id)
res.title = row['Title']
res.citation = row['Citation']
res.url = fix_url(row['URL'].strip('#')) # remove weird #formatting#
res.abstract = row['Abstract']
count = count + 1
elif fmt['dataformat'] is DataFormat.RANGE_DETAIL:
rng, source_id = get_by_id(row['Range_ID'], Range)
if not rng: rng = Range(source_id=source_id)
rng.gmba_id = row['GMBA_ID']
rng.name = row['RangeName']
rng.countries = row['Countries']
count = count + 1
elif fmt['dataformat'] is DataFormat.PERSON_RESOURCE:
# Rows whose resource or person id resolves to nothing are skipped.
rzs, source_id = get_by_id(row['Resource'], Resource, first=False)
if not rzs or not rzs.first(): continue
ppl, source_id = get_by_id(row['Person'], Person, first=False)
if not ppl or not ppl.first(): continue
# NOTE(review): the statement that links each resource `r` to `person`
# is not visible here; only the counter update remains.
for person in ppl:
for r in rzs:
count = count + 1
elif fmt['dataformat'] is DataFormat.PERSON_RANGE:
rzs, source_id = get_by_id(row['MountainRange'], Range, first=False)
# NOTE(review): the body of this check seems to be missing; by analogy
# with the PERSON_RESOURCE branch it presumably skips the row.
if not rzs or not rzs.first():
ppl, source_id = get_by_id(row['Person'], Person, first=False)
if not ppl or not ppl.first(): continue
# NOTE(review): as above, the statement linking range `r` to `person`
# is not visible here.
for person in ppl:
for r in rzs:
count = count + 1
elif fmt['extension'] == 'geojson':
ranges_missing = []
with open(filename, 'rt', encoding='utf-8', errors='ignore') as jsonfile:
jsondata = json.load(jsonfile)
if fmt['dataformat'] is DataFormat.RANGE_SHAPES:
totalrows = len(jsondata['features'])
for f in jsondata['features']:
yield count, count / totalrows
count = count + 1
p = f['properties']
rge = Range.objects.filter(gmba_id=p['GMBA_ID']).first()
# NOTE(review): the handling of a missing range is not visible. Nothing in
# this copy ever appends to `ranges_missing`, and the assignments below
# would fail on None if they really ran inside this check.
if not rge:
rge.name = p['Name']
# The loop overwrites rge.countries, so when both keys are present the
# value under 'Country_2_' is the one kept.
for c in ['Country_1', 'Country_2_']:
if c in p: rge.countries = p[c]
print("Warning: %d ranges not found" % len(ranges_missing))
print("[%s]" % ', '.join(ranges_missing))
yield None, None
return count