gmba_django/app/management/commands/import.py
2022-03-17 14:03:35 +05:30

519 lines
22 KiB
Python
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import django.db.utils
from django.core.management.base import BaseCommand
from django.apps import apps
from django.db.utils import IntegrityError
from app.models import *
import csv
import json
class Command(BaseCommand):
help = 'Imports csv to DB'
csv_files_models_dict = {
"v2-LU_GMBA_SpeciesGroups.csv": "GMBA_SpeciesGroup",
"v2-LU_Countries.csv": "Country",
"v2-LU_Languages.csv": "Language",
"v2-LU_Sources.csv": "Source",
"v2-LU_RedListCategories.csv": "RedListCategory",
"v2-LU_RangeTypes.csv": "RangeType",
"v2-LU_PeopleStatus.csv": "PeopleStatus",
"v2-LU_TrendsQuantity.csv": "TrendsQuantity",
"v2-LU_TrendsQuality.csv": "TrendsQuality",
"v2-LU_TaxonUnit.csv": "TaxonUnit",
"v2-LU_TaxonStatus.csv": "TaxonStatus",
"v2-Ranges-cleaned.csv": "MountainRange",
"v2-AddElevations.csv": "AddElevation",
"v2-GMBA_Function.csv": "GMBA_function",
"v2-Gmba_V2_centroid.csv": "GMBA_V2_Centroid",
"v2-ImportGeom210915.csv": "ImportGeom210915",
"v2-LanguageLink.csv": "LanguageLink",
"v2-Keywords.csv": "Keyword",
"v2-NamesImport.csv": "NamesImport",
"v2-Organisations-cleaned.csv": "Organisation",
"v2-Peaks.csv": "Peak",
"v2-People.csv": "Person",
"v2-PeopleRanges.csv": "PeopleRange",
"v2-PeopleFunction.csv": "PeopleFunction",
"v2-Resources.csv": "Resource",
"v2-PeopleResources.csv": "PeopleResource",
"v2-RangeCountries.csv": "RangeCountry",
"v2-RangeNameTranslations.csv": "RangeNameTranslation",
"v2-RangeOnlineInfo.csv": "RangeOnlineInfo",
"v2-ResourceRanges.csv": "ResourceRange",
"v2-ResourceKeywords.csv": "ResourceKeyword",
"v2-Repositories.csv": "Repository",
"v2-Species.csv": "Species",
"v2-Searches.csv": "Search",
"v2-TaxonRange.csv": "TaxonRange",
"v2-SpeciesRange.csv": "SpeciesRange"
}
cols_to_django_fields = {
"ID": 'id',
"Source": 'source',
"RangeName": 'range_name_id',
"LanguageTranslation": 'language_translation_id',
"RangeNameTranslation": 'range_name_translation',
"GMBA_ID_v2": 'gmba_v2_id',
"Elev_Min": 'elev_min',
"Elev_Max": 'elev_max',
"Elev_Range": 'elev_range',
"TaxonStatus": 'taxon_status',
"InfoSource": 'info_source',
"URL": 'url',
"GMBA function": 'gmba_function',
"TaxonUnit": 'taxon_unit',
"Range_ID": 'id',
"RangeNameMap": 'range_name_map',
"RangeNameAscii": 'range_name_ascii',
"RangeNameLanguage": 'range_name_language',
"MotherRange": 'mother_range',
"Feature": 'feature',
"MapUnit": 'map_unit',
"Level": 'level',
"LevelText": 'level_text',
"Level_1": 'level_1',
"Level_2": 'level_2',
"Level_3": 'level_3',
"Latitude": 'latitude',
"Longitude": 'longitude',
"Orogeny": 'orogeny',
"Area": 'area',
"GMBA_V1_ID": 'GMBA_v1_id',
"Countries": 'countries',
"Peak_Elevation": 'peak_elevation',
"Peak_Name": 'peak_name',
"Peak_Latitude": 'peak_latitude',
"Peak_Longitude": 'peak_longitude',
"Comments": 'comments',
"Checked": 'checked',
"Range_AlternateID": 'range_alternate_id',
"GeologicRegion": 'geologic_region',
"GMBA_V2_ID": 'gmba_v2_id',
"GMBA_V2_ID_str": 'gmba_v2_id_str',
"WikiDataID": 'wiki_data_id',
"WikiDataURL": 'wiki_data_url',
"Select_300": 'select_300',
"Gmba_Narrow": 'gmba_narrow',
"Name_FR": 'name_fr',
"Name_DE": 'name_de',
"Name_ES": 'name_es',
"Name_PT": 'name_pt',
"Name_CN": 'name_cn',
"Name_RU": 'name_ru',
"Name_TR": 'name_tr',
"Perimeter": 'perimeter',
"ColorAll": 'color_all',
"ColorBasic": 'color_basic',
"Color300": 'color_300',
"Elev_Low": 'elev_low',
"Elev_High": 'elev_high',
"Elev_Avg": 'elev_avg',
"gridcode": 'gridcode',
"Trend": 'trend',
"RepositoryName": 'repository_name',
"RepositoryURL": 'repository_url',
"Resource": 'resource_id',
"Keyword": 'keyword',
"Keyword_ID": 'keyword_id',
"Mother": 'mother',
"CN": 'cn',
"DE": 'de',
"ES": 'es',
"FR": 'fr',
"PT": 'pt',
"RU": 'ru',
"TR": 'tr',
"ResourceTitle": 'resource_title_id',
"LanguageLetterCode": 'language_letter_code',
"LanguageNumberCode": 'language_number_code_id',
"OrgNum1": 'org_num1',
"Organisation Search": 'organisation_search',
"OrgAlphaSearch": 'org_alpha_search',
"Organisation English": 'organisation_english',
"Organisation 2": 'organisation_2',
"Organisation 3": 'organisation_3',
"Organisation Original": 'organisation_original',
"Acronym": 'acronym',
"Street": 'street',
"PO Box": 'po_box',
"Postcode": 'postcode',
"City": 'city',
"Region": 'region',
"SearchURL": 'search_url',
"LatLon": 'lat_long',
"URL Org": 'url',
"Tel Org": 'tel',
"Email Org": 'email',
"Country": 'country_id',
"Tags": 'tags',
"Description": 'description',
"Northing": 'northing',
"Easting": 'easting',
"Category": 'category',
"Subject": 'subject',
"Title": 'title',
"Citation": 'citation',
"Type": 'type',
"Abstract": 'abstract',
"AuthorKeywords": 'author_keywords',
"Lat": 'lat',
"Lon": 'lon',
"Stars": 'stars',
"PEGASuS_Check_map_with_author": 'PEGASuS_Check_map_with_author',
"PEGASuS_polygon_ID": 'PEGASuS_polygon_ID',
"PEGASuS_Polygon_comments": 'PEGASuS_Polygon_comments',
"PEGASuS_Assessment_ID": 'PEGASuS_Assessment_ID',
"GLORIA": 'gloria',
"GNOMO": 'gnomo',
"LTER": 'lter',
"LTSER": 'ltser',
"MIREN": 'miren',
"TEAM": 'team',
"Inventory": 'inventory',
"DOI": 'doi',
"ShortName": 'short_name',
"FormalName": 'formal_name',
"Membership within the UN System": 'membership_within_un_system',
"Membership within the UN System": 'membership_within_un_system',
"Continent": 'continent',
"EU_MS": 'eu_ms',
"EEA_MS": 'eea_ms',
"ISO3": 'iso3',
"ISO2": 'iso2',
"Point_Name": 'point_name',
"Elevation": 'elevation',
"Link": 'link',
"Repository": 'repository_id',
"SearchString": 'search_string',
"SearchDate": 'search_date',
"Result": 'result',
"NumberOfRecords": 'number_of_records',
"Stored": 'stored',
"SpeciesGroup": 'species_group',
"MrMrs": 'mr_mrs',
"First name": 'first_name',
"Last name": 'last_name',
"Full name": 'full_name',
"SearchName": 'search_name',
"e-mail 1": 'contact_email',
"e-mail 2": 'email_2',
"Skype": 'skype',
"Professional phone": 'professional_phone',
"Mobile number": 'mobile_number',
"Field of expertise": 'field_of_expertise',
"Biography": 'biography',
"Position": 'position',
"Status": 'status',
"Entry date": 'entry_date',
"Newsletter": 'news_letter',
"CountryLookup": 'country_lookup',
"Organisation": 'organization_id',
"Birds": 'birds',
"Mammals": 'mammals',
'Reptiles': 'reptiles',
'Amphibians': 'amphibians',
'Fish': 'fish',
'Insects': 'insects',
'Molluscs': 'molluscs',
'Crustaceans': 'crustaceans',
'Arachnids': 'arachnids',
'Angiosperms': 'angiosperms',
'Gymnosperms': 'gymnosperms',
'Fungi': 'fungi',
'Algae': 'algae',
'Microbes': 'microbes',
'Biological field sampling': 'biological_field_sampling',
'Data mining': 'data_mining',
'Remote sensing': 'remote_sensing',
'GIS': 'gis',
'Spatial analysis': 'spatial_analysis',
'Statistical analysis': 'statistical_analysis',
'Modelling': 'modelling',
'Assessment': 'assessment',
'Meta-analysis': 'meta_analysis',
'Synthesis': 'synthesis',
'Qualitative social science methods (interviews, surveys)': 'qualitative_ssm',
'Genetic analyses': 'genetic_analyses',
'Field site': 'field_site',
'Transect': 'transect',
'Mountain top': 'mountain_top',
'Mountain range': 'mountain_range',
'Landscape': 'landscape',
'Regional': 'regional',
'National': 'national',
'Global': '_global',
'Geographic area of expertise': 'geographic_area_of_expertise',
'ProfileOnWeb': 'profile_on_web',
'Updated': 'updated',
'ORCID': 'orcid',
'WebOfScience': 'web_of_science',
'Twitter': 'twitter',
'Instagram': 'instagram',
'ScientificName': 'scientific_name_id',
'Class': '_class',
'EnglishName': 'english_name',
'Language': 'language',
'Person': 'person_id',
'Field': 'field_id',
'Method': 'method_id',
'Scale': 'scale_id',
'Function': 'function_id',
'Range': 'range_id',
'Endemic': 'endemic',
'SourceURL': 'source_url',
'MountainRange': 'mountain_range',
'TaxonRangeID': 'id',
'SubRangeOrRegion': 'subrange_or_region',
'Taxon': 'taxon_id',
'Distribution': 'distribution',
'RedList': 'redlist',
'CountUnit': 'count_unit',
'NumberUnits': 'number_of_units',
'Remarks': 'remarks',
'RangeType': 'range_type',
'Role': 'role',
'RedListCategory': 'red_list_category'
}
def add_arguments(self, parser):
parser.add_argument('--path', type=str, help="file path")
parser.add_argument('--csv_folder_path', type=str, help="Path where the csvs are located")
parser.add_argument('--model_name', type=str, help="model name")
parser.add_argument('--app_name', type=str, help="django app name that the model is connected to", default='app')
parser.add_argument('--mother_range_reload', action='store_true', help="Whether we are trying to reload mother "
"range relationship")
parser.add_argument('--debug', action='store_true', help="Whether we want to debug")
parser.add_argument('--all', action='store_true', help="Imports all csvs")
# ./manage.py import --path /home/pcoder/Downloads/gmbadb/csvs/v2-LU_RedListCategories.csv --model_name RedListCategory --app_name app
def handle(self, *args, **options):
csv.register_dialect(
'mydialect',
delimiter=',',
quotechar='"',
doublequote=True,
skipinitialspace=True,
lineterminator='\n',
quoting=csv.QUOTE_MINIMAL)
csv_folder_path = '/home/pcoder/Downloads/gmbadb/csvs'
if options['csv_folder_path']:
csv_folder_path = options['csv_folder_path']
debug = options.get('debug', None)
if options.get('all'):
print("Doing an import of all csvs")
for csv_file_name, model_name in self.csv_files_models_dict.items():
print("Importing %s -- %s" % (csv_file_name, model_name))
models_to_ignore = ['Range', 'NamesImport', 'ImportGeom210915', 'Organization', 'AddElevation',
'GMBA_V2_Centroid', 'Person', 'PeopleRange', 'PeopleFunction', "PeopleResource",
"RangeCountry", "RangeNameTranslation", "RangeOnlineInfo", "ResourceRange",
"ResourceKeyword", "Repository"]
models_to_ignore = []
if model_name in models_to_ignore:
# we have already imported and do not want to spend more time redoing stuff
continue
if csv_folder_path.endswith('/'):
file_path = '%s%s' % (csv_folder_path, csv_file_name)
else:
file_path = '%s/%s' % (csv_folder_path, csv_file_name)
_model = apps.get_model(options.get('app_name', 'app'), model_name)
with open(file_path, 'r') as csv_file:
reader = csv.reader(csv_file, dialect='mydialect')
first = True
for row in reader:
if first:
# Assume the first row to be the header
header = row
header = [h.strip('"') for h in header]
first = False
continue
_object_dict = {str(self.cols_to_django_fields.get(key)): str(value.lstrip('"').rstrip('"')) for key, value in zip(header, row)}
_object_dict = handle_object_dict(_object_dict, model_name)
m = _model(**_object_dict)
try:
m.save()
except IntegrityError as ie:
print(str(ie))
if "UNIQUE constraint failed: range.gmba_v2_id" in str(ie):
print("======")
print("Could not save %s" % json.dumps(_object_dict))
print("======")
print("Done importing %s" % model_name)
else:
_model = apps.get_model(options.get('app_name', 'app'), options['model_name'])
model_name = options['model_name']
mother_range_reload = options.get('mother_range_reload')
k = ''
csv_file_name = ''
for k, v in self.csv_files_models_dict.items():
if v.strip().lower() == model_name.strip().lower():
csv_file_name = k
if csv_file_name == '':
raise Exception('Could not find a csv file name for model %s' % model_name)
if csv_folder_path.endswith('/'):
file_path = '%s%s' % (csv_folder_path, csv_file_name)
else:
file_path = '%s/%s' % (csv_folder_path, csv_file_name)
csv.register_dialect(
'mydialect',
delimiter=',',
quotechar='"',
doublequote=True,
skipinitialspace=True,
lineterminator='\n',
quoting=csv.QUOTE_MINIMAL)
with open(file_path, 'r', newline='') as csv_file:
reader = csv.reader(csv_file, dialect='mydialect')
first = True
error_row_count = 0
total = 0
for row in reader:
total += 1
if first:
# Assume the first row to be the header
header = row
header = [h.strip('"') for h in header]
first = False
continue
if mother_range_reload:
# We have loaded the range model already and we are attempting to construct the mother_range
# relationship
try:
this_range = MountainRange.objects.get(id=int(row[0]))
r = MountainRange.objects.get(id=int(row[5])) # default Range which exists already
this_range.mother_range = r
this_range.save()
except MountainRange.DoesNotExist as dne:
print("this range = %s, mother range --> %s" % (row[0], row[5]))
print(str(dne))
error_row_count += 1
except ValueError as ve:
print('0 = %s and 5 = %s' % (row[0], row[5]))
print(str(ve))
error_row_count += 1
continue
_object_dict = {self.cols_to_django_fields.get(key): value.lstrip('"').rstrip('"') for key, value in zip(header, row)}
_object_dict = handle_object_dict(_object_dict, model_name, debug=debug)
try:
m = _model(**_object_dict)
m.save()
except django.db.utils.IntegrityError as ex1:
error_row_count += 1
print(str(ex1))
print("**********************")
print(str(_object_dict))
print("**********************")
except Exception as ex:
error_row_count += 1
print('-----')
print(str(ex))
print('----------------')
continue
print("Done importing %s" % str(_model))
print("Total rows = %s, error rows = %s" % (total, error_row_count))
def handle_object_dict(object_dict, model_name, debug=False):
if model_name == 'Resource':
object_dict['url'] = object_dict['url'].strip("#")
for i in ['PEGASuS_Check_map_with_author', 'gloria', 'gnomo', 'lter', 'ltser', 'miren', 'team', 'inventory']:
if i in object_dict:
object_dict[i] = True if object_dict[i].lower().strip() == 'true' else False
if model_name == 'MountainRange':
# Reinstate range_name key
object_dict['range_name'] = object_dict['range_name_id']
object_dict.pop('range_name_id')
connected_fields = [{'range_name_language': Language}, {'feature': RangeType}]
for f in connected_fields:
for k, v in f.items():
if k in object_dict:
if debug:
print('Getting %s of %s' % (k, object_dict[k]))
if object_dict[k] == '':
object_dict[k] = 0
object_dict[k] = v.objects.get(id=int(object_dict[k]))
if 'mother_range' in object_dict:
if debug:
print('Getting mother_range of %s' % object_dict['mother_range'])
if object_dict['mother_range'] == '':
object_dict['mother_range'] = 0
try:
object_dict['mother_range'] = MountainRange.objects.get(id=0) # default Range which exists already
except MountainRange.DoesNotExist as dne:
print(str(dne))
for i in ['checked']:
if i in object_dict:
object_dict[i] = True if object_dict[i].lower().strip() == 'true' else False
for i in ['select_300']:
if i in object_dict:
object_dict[i] = True if object_dict[i].lower().strip() == 'x' else False
for i in ['gmba_narrow']:
if i in object_dict:
object_dict[i] = True if object_dict[i].lower().strip() == 'x' else False
# area field can't be empty
if 'area' in object_dict:
if object_dict['area'] == '':
object_dict['area'] = -1
if model_name == 'Keyword':
if object_dict['mother'] == '':
object_dict['mother'] = 0
try:
object_dict['mother'] = Keyword.objects.get(keyword_id=object_dict['mother']) # default Keyword which exists already
except Keyword.DoesNotExist as dne:
object_dict['mother'] = Keyword.objects.get(keyword_id=0)
print(str(dne))
if model_name == 'Organization' and 'country_id' in object_dict:
object_dict['country'] = object_dict['country_id']
object_dict.pop('country_id')
if model_name == 'PeopleRange' and 'mountain_range' in object_dict:
object_dict['range_id'] = object_dict['mountain_range']
object_dict.pop('mountain_range')
if model_name == 'Species' and 'scientific_name_id' in object_dict:
object_dict['scientific_name'] = object_dict['scientific_name_id']
object_dict.pop('scientific_name_id')
if model_name == 'TaxonRange' and 'taxon_id' in object_dict:
object_dict['taxon'] = object_dict['taxon_id']
object_dict.pop('taxon_id')
if model_name == 'Person' and 'organization_id' in object_dict:
if debug:
print("organization_id=%s" % object_dict['organization_id'])
if object_dict['organization_id'] == '' or object_dict['organization_id'] is None:
object_dict['organization_id'] = '-1'
else:
object_dict['organization_id'] = int(float(object_dict['organization_id']))
if 'status' in object_dict:
if debug:
print('Getting status of %s' % object_dict['status'])
if object_dict['status'] == '':
object_dict['status'] = 0
object_dict['status'] = PeopleStatus.objects.get(id=int(object_dict['status']))
if 'country_lookup' in object_dict:
if object_dict['country_lookup'].strip() == '' or object_dict['country_lookup'] is None:
object_dict['country_lookup'] = 0
if debug:
print('Getting country of %s' % object_dict['country_lookup'])
object_dict['country'] = Country.objects.get(id=int(object_dict['country_lookup']))
object_dict.pop('country_lookup')
for i in ['news_letter', 'birds', 'mammals', 'reptiles', 'amphibians', 'fish', 'insects',
'molluscs', 'crustaceans', 'arachnids', 'angiosperms', 'gymnosperms', 'fungi',
'algae', 'microbes', 'biological_field_sampling', 'data_mining', 'remote_sensing',
'gis', 'spatial_analysis', 'statistical_analysis', 'modelling', 'assessment',
'meta_analysis', 'synthesis', 'qualitative_ssm', 'genetic_analyses', 'field_site',
'transect', 'mountain_top', 'mountain_range', 'landscape', 'regional', 'national',
'_global', 'profile_on_web', 'updated']:
if i in object_dict:
object_dict[i] = True if object_dict[i].lower().strip() == 'true' else False
if object_dict is None:
if debug:
print("Object None for %s" % model_name)
# else:
# print(object_dict) if str(object_dict) is not None else "Str object dict is None"
return object_dict