import csv
import json
import re

from os.path import isfile
from app.models import *
from app.formats import *
from django.db import transaction


# Correct commas inside of a linked field
def fix_bracketed_lists(data):
    """Replace commas inside parenthesised groups with ' /'.

    A group such as ``(a, b)`` becomes ``(a / b)`` so that a later split on
    commas does not cut the group in half.  Only groups whose two parts
    contain no whitespace (other than spaces after the comma) are matched.
    """
    for fix in re.findall(r'\([^\s]*,[ ]*[^\s]*\)', data):
        data = data.replace(fix, fix.replace(',', ' /'))
    return data


# Check for valid link
def fix_url(link):
    """Prefix ``http://`` to links longer than 3 characters lacking 'http'.

    Empty or ``None`` values are returned unchanged.
    """
    if not link:
        # csv.DictReader yields None for columns missing from a short row.
        return link
    if len(link) > 3 and not link.startswith('http'):
        link = 'http://' + link
    return link


# Create linked objects
def add_linked(person, obj, data):
    """Return the ``obj`` instances named in the delimited string ``data``.

    ``data`` is lower-cased and split on ',' and ';'.  Entries shorter than
    three characters are skipped.  Each remaining name is looked up with
    ``obj.objects.filter(name=...)`` and created (and saved) when absent.

    ``person`` is accepted for call compatibility but is not used here.
    """
    if not data:
        return []
    linked = []
    # TODO: fuzzy matching instead of lower()
    items = fix_bracketed_lists(data).lower().replace(';', ',').split(',')
    for item in items:
        name = item.strip()
        if len(name) < 3:
            continue
        tgt = obj.objects.filter(name=name).first()
        if not tgt:
            tgt = obj()
            tgt.name = name
            tgt.save()
        linked.append(tgt)
    return linked


# Fetch an object by source_id (numeric identifier used in source DB)
def get_by_id(rowid, obj, first=True):
    """Look up ``obj`` rows by ``source_id``.

    Returns ``(match, rowid)`` where ``rowid`` is the integer id and
    ``match`` is the first matching object (``first=True``) or the whole
    queryset (``first=False``).  Returns ``(None, None)`` when ``rowid`` is
    neither an int nor a string of decimal digits.
    """
    # isdecimal() rather than isdigit(): the latter accepts characters such
    # as superscript digits that int() rejects with ValueError.
    if type(rowid) is str and rowid.isdecimal():
        rowid = int(rowid)
    if type(rowid) is not int:
        return None, None
    matches = obj.objects.filter(source_id=rowid)
    if first:
        return matches.first(), rowid
    return matches, rowid


# Quick check of the number of lines
def get_total_rows_csv(filename):
    """Return the number of lines in ``filename`` (0 for an empty file).

    The file is decoded the same way refresh_data() reads it, so counting
    cannot fail on bytes that the import itself would tolerate.
    """
    with open(filename, 'rt', encoding='utf-8', errors='ignore') as f:
        return sum(1 for _ in f)


# Search index routine
def reindex_data():
    """Call ``index()`` and ``save()`` on every Person."""
    for p in Person.objects.all():
        p.index()
        p.save()


# Copy the plain columns of a PERSON_DETAIL row onto a Person
def _apply_person_fields(person, row, source_id):
    """Assign the scalar PERSON_DETAIL columns of ``row`` to ``person``.

    A missing or blank 'country' column falls back to the Country with
    id 0.  ``row`` is not modified.  The caller is responsible for saving.
    """
    person.source_id = source_id
    person.title = row['Title']
    print(row)
    person.organisation = row['Organisation English']
    print("Country = %s" % row['country'] if 'country' in row else '')
    country_id = row.get('country')
    if country_id is None or country_id.strip() == '':
        country_id = 0
    person.country = Country.objects.get(id=country_id)
    person.position = row['Position']
    person.biography = row['Biography']
    person.contact_email = row['e-mail 1']
    person.personal_url = fix_url(row['URL'])


# Import one PERSON_DETAIL row
def _import_person_detail(row, update_existing):
    """Create or update a Person from ``row`` and refresh its linked sets.

    The person is matched by source id, then by first and last name.  The
    scalar fields are written when the person is new, or when it already
    exists and ``update_existing`` is true.  Methods, scales, taxa and
    fields of expertise are always re-linked.
    """
    print(row)
    person, source_id = get_by_id(row['ID'], Person)
    if not person:
        person = Person.objects.filter(
            first_name=row['First name'],
            last_name=row['Last name'],
        ).first()

    if person:
        print("Fetched from DB")
        if update_existing:
            _apply_person_fields(person, row, source_id)
            person.save()
    else:
        print("Does not exist in DB")
        person = Person(
            first_name=row['First name'],
            last_name=row['Last name'],
            source_id=row['ID'],
        )
        # Fields are applied exactly once for a new person, even when
        # update_existing is set: applying them a second time is redundant.
        _apply_person_fields(person, row, source_id)
        person.save()
        print("Created")

    with transaction.atomic():
        research_methods = add_linked(person, Method, row['Methods'])
        methods_people = [
            MethodsPeople.objects.get_or_create(method_id=m.id, person_id=person.id)[0]
            for m in research_methods
        ]
        person.methodspeople_set.set(methods_people)

        research_scales = add_linked(person, Scale, row['Scale'])
        scales_people = [
            ScalesPeople.objects.get_or_create(scale_id=s.id, person_id=person.id)[0]
            for s in research_scales
        ]
        person.scalespeople_set.set(scales_people)

        research_taxa = add_linked(person, Taxon, row['Taxa'])
        taxa_people = [
            TaxaPeople.objects.get_or_create(taxon_id=t.id, person_id=person.id)[0]
            for t in research_taxa
        ]
        person.taxapeople_set.set(taxa_people)

        research_fields = add_linked(person, Field, row['Field of expertise'])
        fields_people = [
            FieldsPeople.objects.get_or_create(field_id=f.id, person_id=person.id)[0]
            for f in research_fields
        ]
        person.fieldspeople_set.set(fields_people)

        person.index()
        person.save()


# Import one RESOURCE_DETAIL row
def _import_resource_detail(row):
    """Create or update the Resource identified by ``row['ID']``."""
    res, source_id = get_by_id(row['ID'], Resource)
    if not res:
        res = Resource(source_id=source_id)
    res.title = row['Title']
    res.citation = row['Citation']
    # remove weird #formatting#
    res.url = fix_url((row['URL'] or '').strip('#'))
    res.abstract = row['Abstract']
    res.save()


# Import one RANGE_DETAIL row
def _import_range_detail(row):
    """Create or update the MountainRange identified by ``row['Range_ID']``."""
    rng, source_id = get_by_id(row['Range_ID'], MountainRange)
    if not rng:
        rng = MountainRange(source_id=source_id)
    rng.gmba_id = row['GMBA_ID']
    rng.name = row['RangeName']
    rng.countries = row['Countries']
    rng.save()


# Import one PERSON_RESOURCE row
def _import_person_resource(row):
    """Attach the row's resources to the row's people.

    Returns False (row skipped) when either side cannot be found.
    """
    rzs, _ = get_by_id(row['Resource'], Resource, first=False)
    if not rzs or not rzs.first():
        return False
    ppl, _ = get_by_id(row['Person'], Person, first=False)
    if not ppl or not ppl.first():
        return False
    for person in ppl:
        # NOTE(review): assumes Person.resources accepts plain list
        # assignment - confirm against the model definition.
        person.resources = []
        for r in rzs:
            person.resources.append(r)
        person.save()
    return True


# Import one PERSON_RANGE row
def _import_person_range(row):
    """Link the first matching person to mountain ranges from ``row``.

    Returns False (row skipped) when either side cannot be found.
    """
    rzs, source_id = get_by_id(row['MountainRange'], MountainRange, first=False)
    print(" range=%s, source_id=%s" % (rzs, source_id))
    if not rzs or not rzs.first():
        print(" --- No rzs, continue")
        return False
    ppl, source_id = get_by_id(row['Person'], Person, first=False)
    print(" +++ ppl=%s, source_id=%s" % (ppl, source_id))
    if not ppl or not ppl.first():
        print(" --- No ppl, continue")
        return False
    with transaction.atomic():
        person = ppl.first()
        # NOTE(review): add_linked() matches MountainRange by *name*, while
        # row['MountainRange'] is the value used above as a numeric
        # source_id, and the fetched `rzs` are otherwise unused.  This looks
        # unintended - confirm before changing.
        research_ranges = add_linked(person, MountainRange, row['MountainRange'])
        ranges_people = [
            PeopleRange.objects.get_or_create(range_id=r.id, person_id=person.id)[0]
            for r in research_ranges
        ]
        person.peoplerange_set.set(ranges_people)
        person.save()
    return True


# Data update routine
def refresh_data(filename, fmt=None, update_existing=False):
    """Import ``filename`` into the database, yielding progress as it goes.

    This is a generator.  It yields:

    * ``(n, fraction)`` before each CSV row / GeoJSON feature is processed,
    * ``(message, "error")`` followed by termination when the file is
      missing, no format is given, or a required column is absent,
    * ``(None, None)`` once at the end of a completed run.

    The number of imported records is the generator's return value.

    ``fmt`` is a mapping read for the keys 'filename', 'extension',
    'required' and 'dataformat'.  ``update_existing`` controls whether the
    scalar fields of already-known people are overwritten.
    """
    print("refresh_data")
    count = 0
    rowcount = 0

    if not isfile(filename):
        msg = "Missing data: %s - refresh aborted." % (
            fmt['filename'] if fmt else filename)
        print(msg)
        yield msg, "error"
        return

    if fmt is None:
        # The signature allows fmt to be omitted, but nothing can be
        # imported without it; report rather than raise TypeError.
        msg = "No data format given for %s - refresh aborted." % filename
        print(msg)
        yield msg, "error"
        return

    if fmt['extension'] == 'csv':
        totalrows = get_total_rows_csv(filename)
        with open(filename, 'rt', encoding='utf-8', errors='ignore') as csvfile:
            datareader = csv.DictReader(csvfile)
            for row in datareader:
                rowcount += 1
                if row is None:
                    continue
                yield rowcount, rowcount / totalrows

                for r in fmt['required']:
                    if r not in row:
                        msg = "Missing attribute in %s (%s)" % (r, fmt['filename'])
                        print(msg)
                        yield msg, "error"
                        return

                dataformat = fmt['dataformat']
                if dataformat is DataFormat.PERSON_DETAIL:
                    _import_person_detail(row, update_existing)
                    count = count + 1
                elif dataformat is DataFormat.RESOURCE_DETAIL:
                    _import_resource_detail(row)
                    count = count + 1
                elif dataformat is DataFormat.RANGE_DETAIL:
                    _import_range_detail(row)
                    count = count + 1
                elif dataformat is DataFormat.PERSON_RESOURCE:
                    if _import_person_resource(row):
                        count = count + 1
                elif dataformat is DataFormat.PERSON_RANGE:
                    if _import_person_range(row):
                        count = count + 1

    elif fmt['extension'] == 'geojson':
        ranges_missing = []
        with open(filename, 'rt', encoding='utf-8', errors='ignore') as jsonfile:
            jsondata = json.load(jsonfile)
            if fmt['dataformat'] is DataFormat.RANGE_SHAPES:
                totalrows = len(jsondata['features'])
                for f in jsondata['features']:
                    yield count, count / totalrows
                    count = count + 1
                    p = f['properties']
                    rge = MountainRange.objects.filter(gmba_id=p['GMBA_ID']).first()
                    if not rge:
                        ranges_missing.append(p['GMBA_ID'])
                        continue
                    rge.name = p['Name']
                    # When both keys are present the later one wins.
                    for c in ['Country_1', 'Country_2_']:
                        if c in p:
                            rge.countries = p[c]
                    rge.save()
        print("Warning: %d ranges not found" % len(ranges_missing))
        # GMBA_ID values may be numeric in the GeoJSON; join() needs strings.
        print("[%s]" % ', '.join(str(r) for r in ranges_missing))

    yield None, None
    return count