import csv
import json
import re

from os.path import isfile

from app.models import *
from app.formats import *

from django.db import transaction


def fix_bracketed_lists(data):
    """Replace commas inside bracketed groups so they survive list splitting.

    A group is an opening parenthesis, a run of non-whitespace characters,
    a comma optionally followed by spaces, another run of non-whitespace
    characters and a closing parenthesis, e.g. ``(a, b)``. Every comma in
    such a group is rewritten as `` /``.

    Args:
        data: the raw text of a multi-valued field.

    Returns:
        The text with the commas of each bracketed group replaced.
    """
    for fix in re.findall(r'\([^\s]*,[ ]*[^\s]*\)', data):
        data = data.replace(fix, fix.replace(',', ' /'))
    return data


def fix_url(link):
    """Prefix a link with ``http://`` when it does not start with ``http``.

    Values of three characters or fewer are returned untouched, and so are
    empty or missing values (``csv.DictReader`` fills short rows with None).

    Args:
        link: the link text, possibly empty or None.

    Returns:
        The link, with a scheme prepended when one was added.
    """
    if not link:
        return link
    if len(link) > 3 and not link.startswith('http'):
        link = 'http://' + link
    return link


def add_linked(person, obj, data):
    """Fetch or create the named objects listed in a multi-valued field.

    The field is lower-cased and split on commas and semicolons; entries
    shorter than three characters after stripping are skipped. Each
    remaining name is looked up by ``name`` and created when absent.

    Args:
        person: unused; kept for interface compatibility.
        obj: the model class to query and instantiate.
        data: the raw field text, possibly empty or None.

    Returns:
        A list of ``obj`` instances in the order they appear in ``data``.
    """
    if not data:
        return []
    # TODO: fuzzy matching instead of lower()
    names = fix_bracketed_lists(data).lower().replace(';', ',').split(',')
    linked = []
    for raw_name in names:
        name = raw_name.strip()
        if len(name) < 3:
            continue
        tgt = obj.objects.filter(name=name).first()
        if not tgt:
            tgt = obj()
            tgt.name = name
            tgt.save()
        linked.append(tgt)
    return linked


def get_by_id(rowid, obj, first=True):
    """Fetch objects by source_id (numeric identifier used in source DB).

    Args:
        rowid: an int, or a string consisting only of decimal digits.
        obj: the model class to query.
        first: when true return the first match, otherwise the queryset.

    Returns:
        A ``(result, rowid)`` tuple with ``rowid`` converted to int, or
        ``(None, None)`` when ``rowid`` is not a usable identifier.
    """
    # isdecimal() rather than isdigit(): the latter also accepts characters
    # such as superscript digits, which int() rejects with ValueError.
    if isinstance(rowid, str) and rowid.isdecimal():
        rowid = int(rowid)
    # bool is a subclass of int but is not a valid identifier here.
    if isinstance(rowid, int) and not isinstance(rowid, bool):
        matches = obj.objects.filter(source_id=rowid)
        if first:
            return matches.first(), rowid
        return matches, rowid
    return None, None


def get_total_rows_csv(filename):
    """Count the lines of a file, returning 0 for an empty file.

    The file is opened exactly as the import reader opens it, so a file
    that can be imported can also be counted. The header line is included
    in the count.
    """
    with open(filename, 'rt', encoding='utf-8', errors='ignore') as f:
        return sum(1 for _ in f)


def reindex_data():
    """Rebuild the search index of every Person and save it."""
    for p in Person.objects.all():
        p.index()
        p.save()


def _import_person(row):
    """Create or update the Person described by one PERSON_DETAIL row."""
    person, source_id = get_by_id(row['ID'], Person)
    if not person:
        # Fall back to matching on the full name.
        person = Person.objects.filter(
            first_name=row['First name'],
            last_name=row['Last name']).first()
    if not person:
        person = Person(
            first_name=row['First name'],
            last_name=row['Last name'],
            source_id=row['ID'])

    # Update data fields
    person.source_id = source_id
    person.title = row['Title']
    person.organisation = row['Organisation English']
    person.country = row['Country']
    person.position = row['Position']
    person.biography = row['Biography']
    person.contact_email = row['e-mail 1']
    person.personal_url = fix_url(row['URL'])

    with transaction.atomic():
        person.research_methods = add_linked(person, Method, row['Methods'])
        person.research_scales = add_linked(person, Scale, row['Scale'])
        person.research_taxa = add_linked(person, Taxon, row['Taxa'])
        person.research_fields = add_linked(
            person, Field, row['Field of expertise'])
        person.index()
        person.save()


def _import_resource(row):
    """Create or update the Resource described by one RESOURCE_DETAIL row."""
    res, source_id = get_by_id(row['ID'], Resource)
    if not res:
        res = Resource(source_id=source_id)
    res.title = row['Title']
    res.citation = row['Citation']
    res.url = fix_url(row['URL'].strip('#'))  # remove weird #formatting#
    res.abstract = row['Abstract']
    res.save()


def _import_range(row):
    """Create or update the Range described by one RANGE_DETAIL row."""
    rng, source_id = get_by_id(row['Range_ID'], Range)
    if not rng:
        rng = Range(source_id=source_id)
    rng.gmba_id = row['GMBA_ID']
    rng.name = row['RangeName']
    rng.countries = row['Countries']
    rng.save()


def _link_people(row, column, model, attr):
    """Attach the ``model`` objects named in ``row[column]`` to people.

    Both sides are looked up by source_id; the people come from
    ``row['Person']``.

    Returns:
        The number of people saved, 0 when either side has no match.
    """
    targets, _ = get_by_id(row[column], model, first=False)
    if not targets or not targets.first():
        return 0
    people, _ = get_by_id(row['Person'], Person, first=False)
    if not people or not people.first():
        return 0

    saved = 0
    for person in people:
        # NOTE(review): the relation is reset and refilled through
        # list-style assignment and append() - confirm that the model
        # attribute supports both operations.
        setattr(person, attr, [])
        for target in targets:
            getattr(person, attr).append(target)
        person.save()
        saved += 1
    return saved


def refresh_data(filename, fmt=None):
    """Import a data file into the database, reporting progress as it goes.

    This is a generator. For CSV input it yields ``(row_number, fraction)``
    before each row is processed, for GeoJSON input ``(count, fraction)``
    before each feature. On failure it yields ``(message, "error")`` and
    stops. On completion it yields ``(None, None)``.

    Args:
        filename: path of the file to import.
        fmt: mapping describing the file; the keys read here are
            ``filename``, ``extension``, ``dataformat`` and ``required``.

    Returns:
        The number of records processed, as the generator's return value.
    """
    count = 0
    rowcount = 0

    if not isfile(filename):
        # fmt may be absent, so fall back to the path that was given.
        msg = "Missing data: %s - refresh aborted." % (
            fmt['filename'] if fmt else filename)
        print(msg)
        yield msg, "error"
        return

    if not fmt:
        msg = "Missing format for %s - refresh aborted." % filename
        print(msg)
        yield msg, "error"
        return

    if fmt['extension'] == 'csv':
        totalrows = get_total_rows_csv(filename)
        with open(filename, 'rt', encoding='utf-8',
                  errors='ignore') as csvfile:
            for row in csv.DictReader(csvfile):
                rowcount += 1
                if row is None:
                    continue
                yield rowcount, rowcount / totalrows

                for r in fmt['required']:
                    if r not in row:
                        msg = "Missing attribute in %s (%s)" % (
                            r, fmt['filename'])
                        print(msg)
                        yield msg, "error"
                        return

                dataformat = fmt['dataformat']
                if dataformat is DataFormat.PERSON_DETAIL:
                    _import_person(row)
                    count += 1
                elif dataformat is DataFormat.RESOURCE_DETAIL:
                    _import_resource(row)
                    count += 1
                elif dataformat is DataFormat.RANGE_DETAIL:
                    _import_range(row)
                    count += 1
                elif dataformat is DataFormat.PERSON_RESOURCE:
                    count += _link_people(
                        row, 'Resource', Resource, 'resources')
                elif dataformat is DataFormat.PERSON_RANGE:
                    count += _link_people(
                        row, 'MountainRange', Range, 'ranges')

    elif fmt['extension'] == 'geojson':
        ranges_missing = []
        with open(filename, 'rt', encoding='utf-8',
                  errors='ignore') as jsonfile:
            jsondata = json.load(jsonfile)

        if fmt['dataformat'] is DataFormat.RANGE_SHAPES:
            totalrows = len(jsondata['features'])
            for feature in jsondata['features']:
                yield count, count / totalrows
                count += 1
                props = feature['properties']
                rge = Range.objects.filter(gmba_id=props['GMBA_ID']).first()
                if not rge:
                    ranges_missing.append(props['GMBA_ID'])
                    continue
                rge.name = props['Name']
                # When both keys are present the later one wins.
                for key in ['Country_1', 'Country_2_']:
                    if key in props:
                        rge.countries = props[key]
                rge.save()

        if ranges_missing:
            print("Warning: %d ranges not found" % len(ranges_missing))
            # The identifiers may be numbers, which str.join() rejects.
            print("[%s]" % ', '.join(str(r) for r in ranges_missing))

    yield None, None
    return count