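# Source: gmba_django/app/convert.py
#
# (252 lines, 11 KiB, Python. This text appears to have been extracted from a
# repository history view: commit timestamps are interleaved with the code.)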

import csv
import json
import re
2021-07-27 04:30:44 +00:00
from os.path import isfile
from app.models import *
from app.formats import *
2021-07-27 04:30:44 +00:00
from django.db import transaction
2021-07-27 04:30:44 +00:00
# Correct commas inside of a linked field
def fix_bracketed_lists(data):
    """Replace commas inside bracketed groups so they survive list splitting.

    Each substring of the form ``(...,...)`` (no whitespace inside, apart
    from spaces directly after the comma) has its commas rewritten to
    `` /``. A later split on ``,`` then no longer cuts the bracketed group
    in half.

    Args:
        data: Raw text of a multi-valued field.

    Returns:
        The text with commas inside the matched groups replaced.
    """
    # The match text is replaced everywhere it occurs, not just at the
    # position where it was found.
    for fix in re.findall(r'\([^\s]*,[ ]*[^\s]*\)', data):
        data = data.replace(fix, fix.replace(',', ' /'))
    return data
2021-07-27 04:30:44 +00:00
# Check for valid link
def fix_url(link):
    """Prefix a bare link with ``http://`` when it has no http(s) scheme.

    Args:
        link: Link text from the source data; may be empty or ``None``.

    Returns:
        ``link`` unchanged when it is empty/``None``, has three characters
        or fewer, or already starts with ``http`` (in any letter case);
        otherwise ``'http://' + link``.
    """
    if not link:
        # Missing cell: nothing to fix, and len(None) would raise.
        return link
    # URL schemes are case-insensitive, so "HTTP://..." must not be prefixed
    # a second time.
    if len(link) > 3 and not link.lower().startswith('http'):
        link = 'http://' + link
    # NOTE: values holding several ';'-separated links are kept as they are.
    return link
2021-07-27 04:30:44 +00:00
# Create linked objects
def add_linked(person, obj, data):
    """Resolve a delimited text field into a list of ``obj`` instances.

    The text is lower-cased and split on ``,`` and ``;`` (commas inside
    bracketed groups are protected by ``fix_bracketed_lists``). Entries
    shorter than three characters are skipped. Each remaining entry is
    looked up by ``name``; when no instance exists, one is created and saved.

    Args:
        person: Not used by this function; kept for call compatibility.
        obj: Model class offering ``objects.filter(name=...)`` and a
            no-argument constructor.
        data: Raw field text; empty or ``None`` yields an empty list.

    Returns:
        List of ``obj`` instances, one per accepted entry, in input order.
    """
    field = []
    if not data:
        # A missing cell (e.g. a short CSV row) has nothing to link.
        return field

    # TODO: fuzzy matching instead of lower()
    items = fix_bracketed_lists(data).lower()
    items = items.replace(';', ',').split(',')

    for i in items:
        n = i.strip()
        if len(n) < 3:
            continue
        tgt = obj.objects.filter(name=n).first()
        if not tgt:
            tgt = obj()
            tgt.name = n
            tgt.save()
        field.append(tgt)
    return field
2021-07-27 04:30:44 +00:00
2021-07-27 04:30:44 +00:00
# Fetch an object by source_id (numeric identifier used in source DB)
def get_by_id(rowid, obj, first=True):
    """Look up ``obj`` records by their numeric ``source_id``.

    Args:
        rowid: Identifier as an ``int`` or a string of decimal digits
            (surrounding whitespace is ignored).
        obj: Model class offering ``objects.filter(source_id=...)``.
        first: When true return only the first match, otherwise the whole
            filter result.

    Returns:
        ``(match_or_matches, rowid_as_int)``, or ``(None, None)`` when
        ``rowid`` is not a usable integer identifier.
    """
    if isinstance(rowid, str):
        rowid = rowid.strip()
        # isdecimal() only accepts characters int() can parse; isdigit()
        # also accepts e.g. superscript digits, which make int() raise.
        if rowid.isdecimal():
            rowid = int(rowid)
    # Exact type check on purpose: bool is a subclass of int and must not
    # be treated as an identifier.
    if type(rowid) is int:
        matches = obj.objects.filter(source_id=rowid)
        if first:
            return matches.first(), rowid
        return matches, rowid
    return None, None
2021-07-27 04:30:44 +00:00
# Quick check of the number of lines
def get_total_rows_csv(filename):
    """Return the number of text lines in ``filename``.

    This is a plain line count (the header line is included and no CSV
    parsing is done), used as the denominator for progress reporting.
    The file is opened as utf-8 with undecodable bytes ignored, the same
    way ``refresh_data`` reads it. An empty file yields 0.

    Args:
        filename: Path of the file to count.

    Returns:
        Number of lines as an ``int``.
    """
    with open(filename, 'rt', encoding='utf-8', errors='ignore') as f:
        return sum(1 for _ in f)
2021-07-27 04:30:44 +00:00
# Search index routine
def reindex_data():
    """Re-run ``index()`` on every ``Person`` and save the result."""
    for p in Person.objects.all():
        p.index()
        p.save()
2021-07-27 04:30:44 +00:00
2021-07-27 04:30:44 +00:00
# Data update routine
2022-03-04 19:13:44 +00:00
def _apply_person_fields(person, row, source_id):
    """Copy the descriptive columns of a PERSON_DETAIL row onto ``person`` and save it."""
    person.source_id = source_id
    person.title = row['Title']
    print(row)
    person.organisation = row['Organisation English']
    print("Country = %s" % row['country'] if 'country' in row else '')
    country = row.get('country')
    # str() keeps this check safe when the row is processed a second time,
    # after the fallback below has already stored the integer 0.
    if country is None or str(country).strip() == '':
        # No usable country in the row: fall back to the Country with id 0.
        row['country'] = 0
    person.country = Country.objects.get(id=row['country'])
    person.position = row['Position']
    person.biography = row['Biography']
    person.contact_email = row['e-mail 1']
    person.personal_url = fix_url(row['URL'])
    person.save()


def refresh_data(filename, fmt=None, update_existing=False):
    """Import one data file into the database, reporting progress as it goes.

    This is a generator. It yields ``(rowcount, fraction)`` tuples while
    rows are processed, ``(message, "error")`` before aborting on a problem,
    and ``(None, None)`` once finished. The number of processed records is
    the generator's return value (``StopIteration.value``).

    NOTE(review): the indentation of this function was reconstructed from
    its syntax; the places where the nesting could not be determined with
    certainty carry a NOTE(review) comment below and should be confirmed.

    Args:
        filename: Path of the file to import.
        fmt: Format description; the keys ``filename``, ``extension``
            (``'csv'`` or ``'geojson'``), ``required`` and ``dataformat``
            are read. Despite the ``None`` default it is needed in practice.
        update_existing: When true, people already in the database are
            overwritten with the values from the file.
    """
    print("refresh_data")
    count = 0
    rowcount = 0
    if not isfile(filename):
        msg = "Missing data: %s - refresh aborted." % fmt['filename']
        print(msg)
        yield msg, "error"
        return None

    if fmt['extension'] == 'csv':
        totalrows = get_total_rows_csv(filename)
        with open(filename, 'rt', encoding='utf-8', errors='ignore') as csvfile:
            datareader = csv.DictReader(csvfile)
            for row in datareader:
                rowcount += 1
                if row is None:
                    continue
                yield rowcount, rowcount / totalrows

                for r in fmt['required']:
                    if r not in row:
                        msg = "Missing attribute in %s (%s)" % (r, fmt['filename'])
                        print(msg)
                        yield msg, "error"
                        return None

                if fmt['dataformat'] is DataFormat.PERSON_DETAIL:
                    print(row)
                    person, source_id = get_by_id(row['ID'], Person)
                    if not person:
                        # No match on source_id: fall back to an exact name match.
                        person = Person.objects.filter(first_name=row['First name'], last_name=row['Last name']).first()
                    if person:
                        print("Fetched from DB")
                    else:
                        print("Does not exist in DB")
                        person = Person(first_name=row['First name'], last_name=row['Last name'], source_id=row['ID'])
                        _apply_person_fields(person, row, source_id)
                        print("Created")

                    # Update data fields
                    if update_existing:
                        _apply_person_fields(person, row, source_id)
                        # NOTE(review): the linked sets are assumed to be
                        # refreshed only when update_existing is set - confirm
                        # against the original nesting.
                        with transaction.atomic():
                            research_methods = add_linked(person, Method, row['Methods'])
                            methods_people = [MethodsPeople.objects.get_or_create(method_id=m.id, person_id=person.id)[0] for m in research_methods]
                            person.methodspeople_set.set(methods_people)
                            research_scales = add_linked(person, Scale, row['Scale'])
                            scales_people = [ScalesPeople.objects.get_or_create(scale_id=s.id, person_id=person.id)[0] for s in research_scales]
                            person.scalespeople_set.set(scales_people)
                            research_taxa = add_linked(person, Taxon, row['Taxa'])
                            taxa_people = [TaxaPeople.objects.get_or_create(taxon_id=t.id, person_id=person.id)[0] for t in research_taxa]
                            person.taxapeople_set.set(taxa_people)
                            research_fields = add_linked(person, Field, row['Field of expertise'])
                            fields_people = [FieldsPeople.objects.get_or_create(field_id=f.id, person_id=person.id)[0] for f in research_fields]
                            person.fieldspeople_set.set(fields_people)

                    # NOTE(review): assumed to run for every row, not only
                    # when update_existing is set - confirm.
                    person.index()
                    person.save()
                    count = count + 1

                elif fmt['dataformat'] is DataFormat.RESOURCE_DETAIL:
                    res, source_id = get_by_id(row['ID'], Resource)
                    if not res:
                        res = Resource(source_id=source_id)
                    res.title = row['Title']
                    res.citation = row['Citation']
                    res.url = fix_url(row['URL'].strip('#'))  # remove weird #formatting#
                    res.abstract = row['Abstract']
                    res.save()
                    count = count + 1

                elif fmt['dataformat'] is DataFormat.RANGE_DETAIL:
                    rng, source_id = get_by_id(row['Range_ID'], MountainRange)
                    if not rng:
                        rng = MountainRange(source_id=source_id)
                    rng.gmba_id = row['GMBA_ID']
                    rng.name = row['RangeName']
                    rng.countries = row['Countries']
                    rng.save()
                    count = count + 1

                elif fmt['dataformat'] is DataFormat.PERSON_RESOURCE:
                    rzs, source_id = get_by_id(row['Resource'], Resource, first=False)
                    if not rzs or not rzs.first():
                        continue
                    ppl, source_id = get_by_id(row['Person'], Person, first=False)
                    if not ppl or not ppl.first():
                        continue
                    # NOTE(review): assumes Person.resources accepts list
                    # assignment and append(); if it is a Django many-to-many
                    # field this needs .set()/.add() - confirm against the model.
                    for person in ppl:
                        person.resources = []
                        for r in rzs:
                            person.resources.append(r)
                        person.save()
                    count = count + 1

                elif fmt['dataformat'] is DataFormat.PERSON_RANGE:
                    rzs, source_id = get_by_id(row['MountainRange'], MountainRange, first=False)
                    print(" range=%s, source_id=%s" % (rzs, source_id))
                    if not rzs or not rzs.first():
                        print(" --- No rzs, continue")
                        continue
                    ppl, source_id = get_by_id(row['Person'], Person, first=False)
                    print(" +++ ppl=%s, source_id=%s" % (ppl, source_id))
                    if not ppl or not ppl.first():
                        print(" --- No ppl, continue")
                        continue
                    # NOTE(review): rzs only serves as an existence check here.
                    # The links are built by add_linked(), which matches ranges
                    # by *name* (skipping entries under three characters) from
                    # the same column get_by_id() reads as a numeric source_id.
                    # Only the first matching person is updated. Confirm both
                    # are intended.
                    with transaction.atomic():
                        person = ppl.first()
                        research_ranges = add_linked(person, MountainRange, row['MountainRange'])
                        ranges_people = [PeopleRange.objects.get_or_create(range_id=r.id, person_id=person.id)[0] for r in research_ranges]
                        person.peoplerange_set.set(ranges_people)
                        person.save()
                    count = count + 1

    elif fmt['extension'] == 'geojson':
        ranges_missing = []
        with open(filename, 'rt', encoding='utf-8', errors='ignore') as jsonfile:
            jsondata = json.load(jsonfile)
        if fmt['dataformat'] is DataFormat.RANGE_SHAPES:
            totalrows = len(jsondata['features'])
            for f in jsondata['features']:
                yield count, count / totalrows
                count = count + 1
                p = f['properties']
                rge = MountainRange.objects.filter(gmba_id=p['GMBA_ID']).first()
                if not rge:
                    ranges_missing.append(p['GMBA_ID'])
                    continue
                rge.name = p['Name']
                # When both keys are present the later one wins.
                for c in ['Country_1', 'Country_2_']:
                    if c in p:
                        rge.countries = p[c]
                rge.save()
        print("Warning: %d ranges not found" % len(ranges_missing))
        # GeoJSON property values may be numbers, which str.join() rejects.
        print("[%s]" % ', '.join(str(g) for g in ranges_missing))

    yield None, None
    return count