gmba_django/app/convert.py

import csv
import json
import re
from os.path import isfile
from app.models import *
from app.formats import *

from django.db import transaction


# Correct commas inside of a linked field
def fix_bracketed_lists(data):
    """Replace commas inside parenthesised groups with ``' /'``.

    Linked fields are later split on commas, so a comma inside a bracketed
    group such as ``"(a, b)"`` would otherwise be cut in two. Only groups
    matched by the pattern below are rewritten; everything else in *data*
    is returned unchanged.
    """
    bracketed = re.compile(r'\([^\s]*,[ ]*[^\s]*\)')
    # The matches are collected from the original text up front; each matched
    # snippet is then substituted wherever it occurs in the running result.
    for snippet in bracketed.findall(data):
        data = data.replace(snippet, snippet.replace(',', ' /'))
    return data


# Check for valid link
def fix_url(link):
    """Return *link* with ``http://`` prepended when it lacks a scheme.

    Values of three characters or fewer (including the empty string) and
    values already starting with ``http`` are returned untouched.

    NOTE: a value holding several ';'-separated links is passed through
    unsplit.
    """
    needs_scheme = len(link) > 3 and not link.startswith('http')
    return 'http://' + link if needs_scheme else link


# Create linked objects
def add_linked(person, obj, data):
    """Look up or create one ``obj`` row per name listed in *data*.

    *data* is lower-cased and split on ',' and ';' (commas inside bracketed
    groups are neutralised first). Names shorter than three characters after
    stripping are skipped. Each remaining name is matched against
    ``obj.objects`` by exact ``name``; a new instance is built when nothing
    matches. Every resulting instance is saved and returned in input order.

    *person* is accepted for the caller's convenience but is not used here.
    """
    # TODO: fuzzy matching instead of lower()
    normalised = fix_bracketed_lists(data).lower().replace(';', ',')
    linked = []
    for raw_name in normalised.split(','):
        name = raw_name.strip()
        if len(name) < 3:
            continue
        target = obj.objects.filter(name=name).first()
        if not target:
            target = obj()
            target.name = name
        # Saved whether it was found or freshly created.
        target.save()
        linked.append(target)
    return linked


# Fetch an object by source_id (numeric identifier used in source DB)
def get_by_id(rowid, obj, first=True):
    """Query ``obj`` by ``source_id`` and return ``(result, rowid)``.

    A string made only of digits is converted to ``int`` first. If *rowid*
    is then not exactly an ``int``, ``(None, None)`` is returned and no query
    is made. Otherwise the result is the first match when *first* is true,
    or the whole filtered queryset when it is false, paired with the integer
    id that was used.
    """
    if type(rowid) is str and rowid.isdigit():
        rowid = int(rowid)
    # Exact type check on purpose: anything that is not a plain int is rejected.
    if type(rowid) is not int:
        return None, None
    matches = obj.objects.filter(source_id=rowid)
    return (matches.first() if first else matches), rowid


# Quick check of the number of lines
def get_total_rows_csv(filename):
    """Return the number of physical lines in *filename*.

    The count includes the header line, and is 0 for an empty file (the
    previous implementation raised ``UnboundLocalError`` in that case).

    The file is opened as UTF-8 with undecodable bytes ignored, which is the
    same way ``refresh_data`` reads it, so a file the importer can read never
    fails here on a decoding error.
    """
    with open(filename, 'rt', encoding='utf-8', errors='ignore') as f:
        return sum(1 for _ in f)


# Search index routine
def reindex_data():
    """Call ``index()`` and then ``save()`` on every ``Person``."""
    for person in Person.objects.all():
        person.index()
        person.save()


# Data update routine
def refresh_data(filename, fmt=None):
    """Import one data file into the database, reporting progress as it goes.

    This is a generator. It yields:

    * ``(position, fraction)`` once per CSV row / GeoJSON feature, where
      ``fraction`` is ``position / total``;
    * ``(message, "error")`` and then stops, when the file is missing or a
      required column is absent from a CSV row;
    * ``(None, None)`` as the final item of a run that was not aborted.

    Args:
        filename: path of the file to import.
        fmt: mapping describing the file. The keys read here are
            ``'filename'`` (used in messages), ``'extension'`` (``'csv'`` or
            ``'geojson'``), ``'required'`` (column names every CSV row must
            have) and ``'dataformat'`` (a ``DataFormat`` member selecting the
            import logic).

    Returns:
        The number of processed records, as the generator's return value
        (``None`` when the import was aborted).
    """
    count = 0
    rowcount = 0
    if not isfile(filename):
        # Fall back to the path itself so that a missing ``fmt`` cannot hide
        # the real problem behind a TypeError.
        label = fmt['filename'] if fmt else filename
        msg = "Missing data: %s  - refresh aborted." % label
        print(msg)
        yield msg, "error"
        return None
    if fmt['extension'] == 'csv':
        totalrows = get_total_rows_csv(filename)
        with open(filename, 'rt', encoding='utf-8', errors='ignore') as csvfile:
            datareader = csv.DictReader(csvfile)
            for row in datareader:
                rowcount += 1
                if row is None:
                    continue
                yield rowcount, rowcount / totalrows

                # Abort the whole import as soon as a required column is absent.
                for r in fmt['required']:
                    if r not in row:
                        msg = "Missing attribute in %s (%s)" % (r, fmt['filename'])
                        print(msg)
                        yield msg, "error"
                        return None

                if fmt['dataformat'] is DataFormat.PERSON_DETAIL:
                    # Match on source id first, then on full name, else create.
                    person, source_id = get_by_id(row['ID'], Person)
                    if not person:
                        person = Person.objects.filter(first_name=row['First name'], last_name=row['Last name']).first()
                    if not person:
                        person = Person(first_name=row['First name'], last_name=row['Last name'], source_id=row['ID'])

                    # Update data fields
                    person.source_id = source_id
                    person.title = row['Title']
                    person.organisation = row['Organisation English']
                    print("Country = %s" % row['country'])
                    # A blank country is looked up as id 0.
                    if row['country'] is None or row['country'].strip() == '':
                        row['country'] = 0
                    person.country = Country.objects.get(id=row['country'])
                    person.position = row['Position']
                    person.biography = row['Biography']
                    person.contact_email = row['e-mail 1']
                    person.personal_url = fix_url(row['URL'])

                    with transaction.atomic():
                        # A newly constructed Person has no id until it is
                        # saved; without this the link rows below would be
                        # created with person_id=None.
                        if person.id is None:
                            person.save()
                        for method in add_linked(person, Method, row['Methods']):
                            MethodsPeople.objects.get_or_create(method_id=method.id, person_id=person.id)
                        for scale in add_linked(person, Scale, row['Scale']):
                            ScalesPeople.objects.get_or_create(scale_id=scale.id, person_id=person.id)
                        for taxon in add_linked(person, Taxon, row['Taxa']):
                            TaxaPeople.objects.get_or_create(taxon_id=taxon.id, person_id=person.id)
                        for field in add_linked(person, Field, row['Field of expertise']):
                            FieldsPeople.objects.get_or_create(field_id=field.id, person_id=person.id)
                    person.index()
                    person.save()
                    count += 1

                elif fmt['dataformat'] is DataFormat.RESOURCE_DETAIL:
                    res, source_id = get_by_id(row['ID'], Resource)
                    if not res:
                        res = Resource(source_id=source_id)
                    res.title = row['Title']
                    res.citation = row['Citation']
                    res.url = fix_url(row['URL'].strip('#'))  # remove weird #formatting#
                    res.abstract = row['Abstract']
                    res.save()
                    count += 1

                elif fmt['dataformat'] is DataFormat.RANGE_DETAIL:
                    rng, source_id = get_by_id(row['Range_ID'], MountainRange)
                    if not rng:
                        rng = MountainRange(source_id=source_id)
                    rng.gmba_id = row['GMBA_ID']
                    rng.name = row['RangeName']
                    rng.countries = row['Countries']
                    rng.save()
                    count += 1

                elif fmt['dataformat'] is DataFormat.PERSON_RESOURCE:
                    # Skip rows whose resource or person cannot be resolved.
                    rzs, source_id = get_by_id(row['Resource'], Resource, first=False)
                    if not rzs or not rzs.first():
                        continue
                    ppl, source_id = get_by_id(row['Person'], Person, first=False)
                    if not ppl or not ppl.first():
                        continue
                    for person in ppl:
                        # NOTE(review): list assignment/append looks carried
                        # over from the SQLAlchemy version; if ``resources``
                        # is a Django many-to-many field this needs
                        # ``.set()`` / ``.add()`` instead - confirm against
                        # the model.
                        person.resources = []
                        for r in rzs:
                            person.resources.append(r)
                        person.save()
                        count += 1

                elif fmt['dataformat'] is DataFormat.PERSON_RANGE:
                    # Skip rows whose range or person cannot be resolved.
                    rzs, source_id = get_by_id(row['MountainRange'], MountainRange, first=False)
                    if not rzs or not rzs.first():
                        continue
                    ppl, source_id = get_by_id(row['Person'], Person, first=False)
                    if not ppl or not ppl.first():
                        continue
                    for person in ppl:
                        # NOTE(review): same concern as ``resources`` above -
                        # confirm how ``ranges`` is declared on the model.
                        person.ranges = []
                        for r in rzs:
                            person.ranges.append(r)
                        person.save()
                        count += 1

    elif fmt['extension'] == 'geojson':
        ranges_missing = []
        with open(filename, 'rt', encoding='utf-8', errors='ignore') as jsonfile:
            jsondata = json.load(jsonfile)
            if fmt['dataformat'] is DataFormat.RANGE_SHAPES:
                totalrows = len(jsondata['features'])
                for f in jsondata['features']:
                    yield count, count / totalrows
                    count += 1

                    p = f['properties']
                    rge = MountainRange.objects.filter(gmba_id=p['GMBA_ID']).first()
                    if not rge:
                        ranges_missing.append(p['GMBA_ID'])
                        continue
                    rge.name = p['Name']
                    # When both keys are present the later one wins.
                    for c in ['Country_1', 'Country_2_']:
                        if c in p:
                            rge.countries = p[c]
                    rge.save()
                print("Warning: %d ranges not found" % len(ranges_missing))
                # GMBA_ID values come straight from JSON and need not be
                # strings, so convert before joining.
                print("[%s]" % ', '.join(map(str, ranges_missing)))
    yield None, None
    return count
Fix issues in convert.py - Replace all query with objects - Reformat code 2021-07-27 04:58:07 +00:00			`import csv`
			`import json`
			`import re`
Add convert.py as is from gmba-connect 2021-07-27 04:30:44 +00:00			`from os.path import isfile`
Fix db errors in convert.py - use print for logger for the time being - replace db.session with obj.save methods - use transaction for no_flush case TODO: change query / filter / filter_by 2021-07-27 04:42:37 +00:00			`from app.models import *`
Create a simple class-based view for the HomePage 2021-07-27 07:17:39 +00:00			`from app.formats import *`
Add convert.py as is from gmba-connect 2021-07-27 04:30:44 +00:00
Fix db errors in convert.py - use print for logger for the time being - replace db.session with obj.save methods - use transaction for no_flush case TODO: change query / filter / filter_by 2021-07-27 04:42:37 +00:00			`from django.db import transaction`


Add convert.py as is from gmba-connect 2021-07-27 04:30:44 +00:00			`# Correct commas inside of a linked field`
			`def fix_bracketed_lists(data):`
			`for fix in re.findall(r'\([^\s],[ ][^\s]*\)', data):`
			`data = data.replace(fix, fix.replace(',', ' /'))`
			`return data`

Fix issues in convert.py - Replace all query with objects - Reformat code 2021-07-27 04:58:07 +00:00
Add convert.py as is from gmba-connect 2021-07-27 04:30:44 +00:00			`# Check for valid link`
			`def fix_url(link):`
			`if len(link) > 3 and not link.startswith('http'):`
			`link = 'http://' + link`
			`# link = link.split(';')[0]`
			`return link`

Fix issues in convert.py - Replace all query with objects - Reformat code 2021-07-27 04:58:07 +00:00
Add convert.py as is from gmba-connect 2021-07-27 04:30:44 +00:00			`# Create linked objects`
Update add_linked: add relationship fields manually 2021-07-28 11:32:58 +00:00			`def add_linked(person, obj, data):`
			`field = []`
Add convert.py as is from gmba-connect 2021-07-27 04:30:44 +00:00			`# TODO: fuzzy matching instead of lower()`
			`items = fix_bracketed_lists(data).lower()`
Fix issues in convert.py - Replace all query with objects - Reformat code 2021-07-27 04:58:07 +00:00			`items = items.replace(';', ',').split(',')`
Add convert.py as is from gmba-connect 2021-07-27 04:30:44 +00:00			`for i in items:`
			`n = i.strip()`
Fix issues in convert.py - Replace all query with objects - Reformat code 2021-07-27 04:58:07 +00:00			`if len(n) < 3:`
			`continue`
			`tgt = obj.objects.filter(name=n).first()`
Add convert.py as is from gmba-connect 2021-07-27 04:30:44 +00:00			`if not tgt:`
			`tgt = obj()`
			`tgt.name = n`
Fix db errors in convert.py - use print for logger for the time being - replace db.session with obj.save methods - use transaction for no_flush case TODO: change query / filter / filter_by 2021-07-27 04:42:37 +00:00			`tgt.save()`
Add convert.py as is from gmba-connect 2021-07-27 04:30:44 +00:00			`field.append(tgt)`
Update add_linked: add relationship fields manually 2021-07-28 11:32:58 +00:00			`return field`
Add convert.py as is from gmba-connect 2021-07-27 04:30:44 +00:00
Fix issues in convert.py - Replace all query with objects - Reformat code 2021-07-27 04:58:07 +00:00
Add convert.py as is from gmba-connect 2021-07-27 04:30:44 +00:00			`# Fetch an object by source_id (numeric identifier used in source DB)`
			`def get_by_id(rowid, obj, first=True):`
			`if type(rowid) is str and rowid.isdigit():`
			`rowid = int(rowid)`
			`if type(rowid) is int:`
Fix issues in convert.py - Replace all query with objects - Reformat code 2021-07-27 04:58:07 +00:00			`l = obj.objects.filter(source_id=rowid)`
			`if first:`
			`return l.first(), rowid`
			`else:`
			`return l, rowid`
Add convert.py as is from gmba-connect 2021-07-27 04:30:44 +00:00			`return None, None`

Fix issues in convert.py - Replace all query with objects - Reformat code 2021-07-27 04:58:07 +00:00
Add convert.py as is from gmba-connect 2021-07-27 04:30:44 +00:00			`# Quick check of the number of lines`
			`def get_total_rows_csv(filename):`
			`with open(filename) as f:`
			`for i, l in enumerate(f):`
			`pass`
			`return i + 1`

Fix issues in convert.py - Replace all query with objects - Reformat code 2021-07-27 04:58:07 +00:00
Add convert.py as is from gmba-connect 2021-07-27 04:30:44 +00:00			`# Search index routine`
			`def reindex_data():`
Fix issues in convert.py - Replace all query with objects - Reformat code 2021-07-27 04:58:07 +00:00			`for i, p in enumerate(Person.objects.all()):`
Add convert.py as is from gmba-connect 2021-07-27 04:30:44 +00:00			`p.index()`
Fix db errors in convert.py - use print for logger for the time being - replace db.session with obj.save methods - use transaction for no_flush case TODO: change query / filter / filter_by 2021-07-27 04:42:37 +00:00			`p.save()`
Add convert.py as is from gmba-connect 2021-07-27 04:30:44 +00:00
Fix issues in convert.py - Replace all query with objects - Reformat code 2021-07-27 04:58:07 +00:00
Add convert.py as is from gmba-connect 2021-07-27 04:30:44 +00:00			`# Data update routine`
			`def refresh_data(filename, fmt=None):`
			`count = 0`
			`rowcount = 0`
			`if not isfile(filename):`
			`msg = "Missing data: %s - refresh aborted." % fmt['filename']`
Fix db errors in convert.py - use print for logger for the time being - replace db.session with obj.save methods - use transaction for no_flush case TODO: change query / filter / filter_by 2021-07-27 04:42:37 +00:00			`print(msg)`
Fix issues in convert.py - Replace all query with objects - Reformat code 2021-07-27 04:58:07 +00:00			`yield msg, "error"`
Add convert.py as is from gmba-connect 2021-07-27 04:30:44 +00:00			`return None`
			`if fmt['extension'] == 'csv':`
			`totalrows = get_total_rows_csv(filename)`
			`with open(filename, 'rt', encoding='utf-8', errors='ignore') as csvfile:`
			`datareader = csv.DictReader(csvfile)`
			`for row in datareader:`
			`rowcount += 1`
Fix more formatting issues 2021-07-27 05:00:20 +00:00			`if row is None:`
			`continue`
Fix issues in convert.py - Replace all query with objects - Reformat code 2021-07-27 04:58:07 +00:00			`yield rowcount, rowcount / totalrows`
Add convert.py as is from gmba-connect 2021-07-27 04:30:44 +00:00
Fix db errors in convert.py - use print for logger for the time being - replace db.session with obj.save methods - use transaction for no_flush case TODO: change query / filter / filter_by 2021-07-27 04:42:37 +00:00			`# # Ensure any new data is flushed from time to time`
			`# if count % 25 == 0:`
			`# db.session.commit()`
Add convert.py as is from gmba-connect 2021-07-27 04:30:44 +00:00
			`for r in fmt['required']:`
			`if not r in row:`
			`msg = "Missing attribute in %s (%s)" % (r, fmt['filename'])`
Fix db errors in convert.py - use print for logger for the time being - replace db.session with obj.save methods - use transaction for no_flush case TODO: change query / filter / filter_by 2021-07-27 04:42:37 +00:00			`print(msg)`
Fix issues in convert.py - Replace all query with objects - Reformat code 2021-07-27 04:58:07 +00:00			`yield msg, "error"`
Add convert.py as is from gmba-connect 2021-07-27 04:30:44 +00:00			`return None`

			`if fmt['dataformat'] is DataFormat.PERSON_DETAIL:`
			`person, source_id = get_by_id(row['ID'], Person)`
			`if not person:`
Fix issues in convert.py - Replace all query with objects - Reformat code 2021-07-27 04:58:07 +00:00			`person = Person.objects.filter(first_name=row['First name'], last_name=row['Last name']).first()`
Add convert.py as is from gmba-connect 2021-07-27 04:30:44 +00:00			`if not person:`
			`person = Person(first_name=row['First name'], last_name=row['Last name'], source_id=row['ID'])`

			`# Update data fields`
			`person.source_id = source_id`
			`person.title = row['Title']`
			`person.organisation = row['Organisation English']`
Fix error refreshing Person 2021-12-15 11:10:09 +00:00			`print("Country = %s" % row['country'])`
			`if row['country'] is None or row['country'].strip() == '':`
			`row['country'] = 0`
			`c = Country.objects.get(id=row['country'])`
			`person.country = c`
Add convert.py as is from gmba-connect 2021-07-27 04:30:44 +00:00			`person.position = row['Position']`
			`person.biography = row['Biography']`
			`person.contact_email = row['e-mail 1']`
			`person.personal_url = fix_url(row['URL'])`

Fix db errors in convert.py - use print for logger for the time being - replace db.session with obj.save methods - use transaction for no_flush case TODO: change query / filter / filter_by 2021-07-27 04:42:37 +00:00			`with transaction.atomic():`
Update add_linked: add relationship fields manually 2021-07-28 11:32:58 +00:00			`research_methods = add_linked(person, Method, row['Methods'])`
Put research parameters into correct fields 2021-08-31 17:29:41 +00:00			`methods_people = [MethodsPeople.objects.get_or_create(method_id=m.id, person_id=person.id) for m in research_methods]`
Update add_linked: add relationship fields manually 2021-07-28 11:32:58 +00:00			`research_scales = add_linked(person, Scale, row['Scale'])`
Put research parameters into correct fields 2021-08-31 17:29:41 +00:00			`scales_people = [ScalesPeople.objects.get_or_create(scale_id=s.id, person_id=person.id) for s in research_scales]`
Update add_linked: add relationship fields manually 2021-07-28 11:32:58 +00:00			`research_taxa = add_linked(person, Taxon, row['Taxa'])`
Put research parameters into correct fields 2021-08-31 17:29:41 +00:00			`taxa_people = [TaxaPeople.objects.get_or_create(taxon_id=t.id, person_id=person.id) for t in research_taxa]`
Update add_linked: add relationship fields manually 2021-07-28 11:32:58 +00:00			`research_fields = add_linked(person, Field, row['Field of expertise'])`
Put research parameters into correct fields 2021-08-31 17:29:41 +00:00			`fields_people = [FieldsPeople.objects.get_or_create(field_id=f.id, person_id=person.id) for f in research_fields]`
Add convert.py as is from gmba-connect 2021-07-27 04:30:44 +00:00			`person.index()`
Fix db errors in convert.py - use print for logger for the time being - replace db.session with obj.save methods - use transaction for no_flush case TODO: change query / filter / filter_by 2021-07-27 04:42:37 +00:00			`person.save()`
Add convert.py as is from gmba-connect 2021-07-27 04:30:44 +00:00			`count = count + 1`

			`elif fmt['dataformat'] is DataFormat.RESOURCE_DETAIL:`
			`res, source_id = get_by_id(row['ID'], Resource)`
			`if not res: res = Resource(source_id=source_id)`
			`res.title = row['Title']`
			`res.citation = row['Citation']`
Fix issues in convert.py - Replace all query with objects - Reformat code 2021-07-27 04:58:07 +00:00			`res.url = fix_url(row['URL'].strip('#')) # remove weird #formatting#`
Add convert.py as is from gmba-connect 2021-07-27 04:30:44 +00:00			`res.abstract = row['Abstract']`
Fix db errors in convert.py - use print for logger for the time being - replace db.session with obj.save methods - use transaction for no_flush case TODO: change query / filter / filter_by 2021-07-27 04:42:37 +00:00			`res.save()`
Add convert.py as is from gmba-connect 2021-07-27 04:30:44 +00:00			`count = count + 1`

			`elif fmt['dataformat'] is DataFormat.RANGE_DETAIL:`
Fix error refreshing Person 2021-12-15 11:10:09 +00:00			`rng, source_id = get_by_id(row['Range_ID'], MountainRange)`
			`if not rng: rng = MountainRange(source_id=source_id)`
Add convert.py as is from gmba-connect 2021-07-27 04:30:44 +00:00			`rng.gmba_id = row['GMBA_ID']`
			`rng.name = row['RangeName']`
			`rng.countries = row['Countries']`
Fix db errors in convert.py - use print for logger for the time being - replace db.session with obj.save methods - use transaction for no_flush case TODO: change query / filter / filter_by 2021-07-27 04:42:37 +00:00			`rng.save()`
Add convert.py as is from gmba-connect 2021-07-27 04:30:44 +00:00			`count = count + 1`

			`elif fmt['dataformat'] is DataFormat.PERSON_RESOURCE:`
			`rzs, source_id = get_by_id(row['Resource'], Resource, first=False)`
			`if not rzs or not rzs.first(): continue`
			`ppl, source_id = get_by_id(row['Person'], Person, first=False)`
			`if not ppl or not ppl.first(): continue`
			`for person in ppl:`
Update add_linked: add relationship fields manually 2021-07-28 11:32:58 +00:00			`person.resources = []`
Fix more formatting issues 2021-07-27 05:00:20 +00:00			`for r in rzs:`
			`person.resources.append(r)`
Fix db errors in convert.py - use print for logger for the time being - replace db.session with obj.save methods - use transaction for no_flush case TODO: change query / filter / filter_by 2021-07-27 04:42:37 +00:00			`person.save()`
Add convert.py as is from gmba-connect 2021-07-27 04:30:44 +00:00			`count = count + 1`

			`elif fmt['dataformat'] is DataFormat.PERSON_RANGE:`
Fix error refreshing Person 2021-12-15 11:10:09 +00:00			`rzs, source_id = get_by_id(row['MountainRange'], MountainRange, first=False)`
Fix more formatting issues 2021-07-27 05:00:20 +00:00			`if not rzs or not rzs.first():`
			`continue`
Add convert.py as is from gmba-connect 2021-07-27 04:30:44 +00:00			`ppl, source_id = get_by_id(row['Person'], Person, first=False)`
			`if not ppl or not ppl.first(): continue`
			`for person in ppl:`
Update add_linked: add relationship fields manually 2021-07-28 11:32:58 +00:00			`person.ranges = []`
Fix more formatting issues 2021-07-27 05:00:20 +00:00			`for r in rzs:`
			`person.ranges.append(r)`
Fix db errors in convert.py - use print for logger for the time being - replace db.session with obj.save methods - use transaction for no_flush case TODO: change query / filter / filter_by 2021-07-27 04:42:37 +00:00			`person.save()`
Add convert.py as is from gmba-connect 2021-07-27 04:30:44 +00:00			`count = count + 1`

			`elif fmt['extension'] == 'geojson':`
			`ranges_missing = []`
			`with open(filename, 'rt', encoding='utf-8', errors='ignore') as jsonfile:`
			`jsondata = json.load(jsonfile)`
			`if fmt['dataformat'] is DataFormat.RANGE_SHAPES:`
			`totalrows = len(jsondata['features'])`
			`for f in jsondata['features']:`
Fix issues in convert.py - Replace all query with objects - Reformat code 2021-07-27 04:58:07 +00:00			`yield count, count / totalrows`
Add convert.py as is from gmba-connect 2021-07-27 04:30:44 +00:00			`count = count + 1`

			`p = f['properties']`
Fix error refreshing Person 2021-12-15 11:10:09 +00:00			`rge = MountainRange.objects.filter(gmba_id=p['GMBA_ID']).first()`
Add convert.py as is from gmba-connect 2021-07-27 04:30:44 +00:00			`if not rge:`
			`ranges_missing.append(p['GMBA_ID'])`
			`continue`
			`rge.name = p['Name']`
			`for c in ['Country_1', 'Country_2_']:`
			`if c in p: rge.countries = p[c]`
Fix db errors in convert.py - use print for logger for the time being - replace db.session with obj.save methods - use transaction for no_flush case TODO: change query / filter / filter_by 2021-07-27 04:42:37 +00:00			`rge.save()`
			`print("Warning: %d ranges not found" % len(ranges_missing))`
			`print("[%s]" % ', '.join(ranges_missing))`
Add convert.py as is from gmba-connect 2021-07-27 04:30:44 +00:00			`yield None, None`
			`return count`