gmba_django/app/management/commands/import.py

import django.db.utils
from django.core.management.base import BaseCommand
from django.apps import apps

from django.db.utils import IntegrityError

from app.models import *

import csv
import json


class Command(BaseCommand):
    help = 'Imports csv to DB'
    csv_files_models_dict = {
        "v2-LU_GMBA_SpeciesGroups.csv": "GMBA_SpeciesGroup",
        "v2-LU_Countries.csv": "Country",
        "v2-LU_Languages.csv": "Language",
        "v2-LU_Sources.csv": "Source",
        "v2-LU_RedListCategories.csv": "RedListCategory",
        "v2-LU_RangeTypes.csv": "RangeType",
        "v2-LU_PeopleStatus.csv": "PeopleStatus",
        "v2-LU_TrendsQuantity.csv": "TrendsQuantity",
        "v2-LU_TrendsQuality.csv": "TrendsQuality",
        "v2-LU_TaxonUnit.csv": "TaxonUnit",
        "v2-LU_TaxonStatus.csv": "TaxonStatus",

        "v2-Ranges-cleaned.csv": "MountainRange",
        "v2-AddElevations.csv": "AddElevation",

        "v2-GMBA_Function.csv": "GMBA_function",
        "v2-Gmba_V2_centroid.csv": "GMBA_V2_Centroid",
        "v2-ImportGeom210915.csv": "ImportGeom210915",
        "v2-LanguageLink.csv": "LanguageLink",
        "v2-Keywords.csv": "Keyword",
        "v2-NamesImport.csv": "NamesImport",
        "v2-Organisations-cleaned.csv": "Organisation",
        "v2-Peaks.csv": "Peak",

        "v2-People.csv": "Person",
        "v2-PeopleRanges.csv": "PeopleRange",
        "v2-PeopleFunction.csv": "PeopleFunction",
        "v2-Resources.csv": "Resource",

        "v2-PeopleResources.csv": "PeopleResource",
        "v2-RangeCountries.csv": "RangeCountry",
        "v2-RangeNameTranslations.csv": "RangeNameTranslation",
        "v2-RangeOnlineInfo.csv": "RangeOnlineInfo",
        "v2-ResourceRanges.csv": "ResourceRange",
        "v2-ResourceKeywords.csv": "ResourceKeyword",
        "v2-Repositories.csv": "Repository",

        "v2-Species.csv": "Species",
        "v2-Searches.csv": "Search",
        "v2-TaxonRange.csv": "TaxonRange",
        "v2-SpeciesRange.csv": "SpeciesRange"

    }
    cols_to_django_fields = {
        "ID": 'id',
        "Source": 'source',
        "RangeName": 'range_name_id',
        "LanguageTranslation": 'language_translation_id',
        "RangeNameTranslation": 'range_name_translation',
        "GMBA_ID_v2": 'gmba_v2_id',
        "Elev_Min": 'elev_min',
        "Elev_Max": 'elev_max',
        "Elev_Range": 'elev_range',
        "TaxonStatus": 'taxon_status',
        "InfoSource": 'info_source',
        "URL": 'url',
        "GMBA function": 'gmba_function',
        "TaxonUnit": 'taxon_unit',
        "Range_ID": 'id',
        "RangeNameMap": 'range_name_map',
        "RangeNameAscii": 'range_name_ascii',
        "RangeNameLanguage": 'range_name_language',
        "MotherRange": 'mother_range',
        "Feature": 'feature',
        "MapUnit": 'map_unit',
        "Level": 'level',
        "LevelText": 'level_text',
        "Level_1": 'level_1',
        "Level_2": 'level_2',
        "Level_3": 'level_3',
        "Latitude": 'latitude',
        "Longitude": 'longitude',
        "Orogeny": 'orogeny',
        "Area": 'area',
        "GMBA_V1_ID": 'GMBA_v1_id',
        "Countries": 'countries',
        "Peak_Elevation": 'peak_elevation',
        "Peak_Name": 'peak_name',
        "Peak_Latitude": 'peak_latitude',
        "Peak_Longitude": 'peak_longitude',
        "Comments": 'comments',
        "Checked": 'checked',
        "Range_AlternateID": 'range_alternate_id',
        "GeologicRegion": 'geologic_region',
        "GMBA_V2_ID": 'gmba_v2_id',
        "GMBA_V2_ID_str": 'gmba_v2_id_str',
        "WikiDataID": 'wiki_data_id',
        "WikiDataURL": 'wiki_data_url',
        "Select_300": 'select_300',
        "Gmba_Narrow": 'gmba_narrow',
        "Name_FR": 'name_fr',
        "Name_DE": 'name_de',
        "Name_ES": 'name_es',
        "Name_PT": 'name_pt',
        "Name_CN": 'name_cn',
        "Name_RU": 'name_ru',
        "Name_TR": 'name_tr',
        "Perimeter": 'perimeter',
        "ColorAll": 'color_all',
        "ColorBasic": 'color_basic',
        "Color300": 'color_300',
        "Elev_Low": 'elev_low',
        "Elev_High": 'elev_high',
        "Elev_Avg": 'elev_avg',
        "gridcode": 'gridcode',
        "Trend": 'trend',
        "RepositoryName": 'repository_name',
        "RepositoryURL": 'repository_url',
        "Resource": 'resource_id',
        "Keyword": 'keyword',
        "Keyword_ID": 'keyword_id',
        "Mother": 'mother',
        "CN": 'cn',
        "DE": 'de',
        "ES": 'es',
        "FR": 'fr',
        "PT": 'pt',
        "RU": 'ru',
        "TR": 'tr',
        "ResourceTitle": 'resource_title_id',
        "LanguageLetterCode": 'language_letter_code',
        "LanguageNumberCode": 'language_number_code_id',
        "OrgNum1": 'org_num1',
        "Organisation Search": 'organisation_search',
        "OrgAlphaSearch": 'org_alpha_search',
        "Organisation English": 'organisation_english',
        "Organisation 2": 'organisation_2',
        "Organisation 3": 'organisation_3',
        "Organisation Original": 'organisation_original',
        "Acronym": 'acronym',
        "Street": 'street',
        "PO Box": 'po_box',
        "Postcode": 'postcode',
        "City": 'city',
        "Region": 'region',
        "SearchURL": 'search_url',
        "LatLon": 'lat_long',
        "URL Org": 'url',
        "Tel Org": 'tel',
        "Email Org": 'email',
        "Country": 'country_id',
        "Tags": 'tags',
        "Description": 'description',
        "Northing": 'northing',
        "Easting": 'easting',
        "Category": 'category',
        "Subject": 'subject',
        "Title": 'title',
        "Citation": 'citation',
        "Type": 'type',
        "Abstract": 'abstract',
        "AuthorKeywords": 'author_keywords',
        "Lat": 'lat',
        "Lon": 'lon',
        "Stars": 'stars',
        "PEGASuS_Check_map_with_author": 'PEGASuS_Check_map_with_author',
        "PEGASuS_polygon_ID": 'PEGASuS_polygon_ID',
        "PEGASuS_Polygon_comments": 'PEGASuS_Polygon_comments',
        "PEGASuS_Assessment_ID": 'PEGASuS_Assessment_ID',
        "GLORIA": 'gloria',
        "GNOMO": 'gnomo',
        "LTER": 'lter',
        "LTSER": 'ltser',
        "MIREN": 'miren',
        "TEAM": 'team',
        "Inventory": 'inventory',
        "DOI": 'doi',
        "ShortName": 'short_name',
        "FormalName": 'formal_name',
        "Membership within the UN System": 'membership_within_un_system',
        "Membership within the UN System": 'membership_within_un_system',
        "Continent": 'continent',
        "EU_MS": 'eu_ms',
        "EEA_MS": 'eea_ms',
        "ISO3": 'iso3',
        "ISO2": 'iso2',
        "Point_Name": 'point_name',
        "Elevation": 'elevation',
        "Link": 'link',
        "Repository": 'repository_id',
        "SearchString": 'search_string',
        "SearchDate": 'search_date',
        "Result": 'result',
        "NumberOfRecords": 'number_of_records',
        "Stored": 'stored',
        "SpeciesGroup": 'species_group',
        "MrMrs": 'mr_mrs',
        "First name": 'first_name',
        "Last name": 'last_name',
        "Full name": 'full_name',
        "SearchName": 'search_name',
        "e-mail 1": 'contact_email',
        "e-mail 2": 'email_2',
        "Skype": 'skype',
        "Professional phone": 'professional_phone',
        "Mobile number": 'mobile_number',
        "Field of expertise": 'field_of_expertise',
        "Biography": 'biography',
        "Position": 'position',
        "Status": 'status',
        "Entry date": 'entry_date',
        "Newsletter": 'news_letter',
        "CountryLookup": 'country_lookup',
        "Organisation": 'organization_id',
        "Birds": 'birds',
        "Mammals": 'mammals',
        'Reptiles': 'reptiles',
        'Amphibians': 'amphibians',
        'Fish': 'fish',
        'Insects': 'insects',
        'Molluscs': 'molluscs',
        'Crustaceans': 'crustaceans',
        'Arachnids': 'arachnids',
        'Angiosperms': 'angiosperms',
        'Gymnosperms': 'gymnosperms',
        'Fungi': 'fungi',
        'Algae': 'algae',
        'Microbes': 'microbes',
        'Biological field sampling': 'biological_field_sampling',
        'Data mining': 'data_mining',
        'Remote sensing': 'remote_sensing',
        'GIS': 'gis',
        'Spatial analysis': 'spatial_analysis',
        'Statistical analysis': 'statistical_analysis',
        'Modelling': 'modelling',
        'Assessment': 'assessment',
        'Meta-analysis': 'meta_analysis',
        'Synthesis': 'synthesis',
        'Qualitative social science methods (interviews, surveys)': 'qualitative_ssm',
        'Genetic analyses': 'genetic_analyses',
        'Field site': 'field_site',
        'Transect': 'transect',
        'Mountain top': 'mountain_top',
        'Mountain range': 'mountain_range',
        'Landscape': 'landscape',
        'Regional': 'regional',
        'National': 'national',
        'Global': '_global',
        'Geographic area of expertise': 'geographic_area_of_expertise',
        'ProfileOnWeb': 'profile_on_web',
        'Updated': 'updated',
        'ORCID': 'orcid',
        'WebOfScience': 'web_of_science',
        'Twitter': 'twitter',
        'Instagram': 'instagram',
        'ScientificName': 'scientific_name_id',
        'Class': '_class',
        'EnglishName': 'english_name',
        'Language': 'language',
        'Person': 'person_id',
        'Field': 'field_id',
        'Method': 'method_id',
        'Scale': 'scale_id',
        'Function': 'function_id',
        'Range': 'range_id',
        'Endemic': 'endemic',
        'SourceURL': 'source_url',
        'MountainRange': 'mountain_range',
        'TaxonRangeID': 'id',
        'SubRangeOrRegion': 'subrange_or_region',
        'Taxon': 'taxon_id',
        'Distribution': 'distribution',
        'RedList': 'redlist',
        'CountUnit': 'count_unit',
        'NumberUnits': 'number_of_units',
        'Remarks': 'remarks',
        'RangeType': 'range_type',
        'Role': 'role',
        'RedListCategory': 'red_list_category'
    }

    def add_arguments(self, parser):
        parser.add_argument('--path', type=str, help="file path")
        parser.add_argument('--csv_folder_path', type=str, help="Path where the csvs are located")
        parser.add_argument('--model_name', type=str, help="model name")
        parser.add_argument('--app_name', type=str, help="django app name that the model is connected to", default='app')
        parser.add_argument('--mother_range_reload', action='store_true', help="Whether we are trying to reload mother "
                                                                               "range relationship")
        parser.add_argument('--debug', action='store_true', help="Whether we want to debug")
        parser.add_argument('--all', action='store_true', help="Imports all csvs")

    #  ./manage.py import --path /home/pcoder/Downloads/gmbadb/csvs/v2-LU_RedListCategories.csv --model_name RedListCategory --app_name app
    def handle(self, *args, **options):
        csv.register_dialect(
            'mydialect',
            delimiter=',',
            quotechar='"',
            doublequote=True,
            skipinitialspace=True,
            lineterminator='\n',
            quoting=csv.QUOTE_MINIMAL)
        csv_folder_path = '/home/pcoder/Downloads/gmbadb/csvs'
        if options['csv_folder_path']:
            csv_folder_path = options['csv_folder_path']
        debug = options.get('debug', None)
        if options.get('all'):
            print("Doing an import of all csvs")
            for csv_file_name, model_name in self.csv_files_models_dict.items():
                print("Importing %s -- %s" % (csv_file_name, model_name))
                models_to_ignore = ['Range', 'NamesImport', 'ImportGeom210915', 'Organization', 'AddElevation',
                                    'GMBA_V2_Centroid', 'Person', 'PeopleRange', 'PeopleFunction',  "PeopleResource",
                                    "RangeCountry", "RangeNameTranslation", "RangeOnlineInfo", "ResourceRange",
                                    "ResourceKeyword", "Repository"]
                models_to_ignore = []
                if model_name in models_to_ignore:
                    # we have already imported and do not want to spend more time redoing stuff
                    continue
                if csv_folder_path.endswith('/'):
                    file_path = '%s%s' % (csv_folder_path, csv_file_name)
                else:
                    file_path = '%s/%s' % (csv_folder_path, csv_file_name)
                _model = apps.get_model(options.get('app_name', 'app'), model_name)
                with open(file_path, 'r') as csv_file:
                    reader = csv.reader(csv_file, dialect='mydialect')
                    first = True
                    for row in reader:
                        if first:
                            # Assume the first row to be the header
                            header = row
                            header = [h.strip('"') for h in header]
                            first = False
                            continue
                        _object_dict = {str(self.cols_to_django_fields.get(key)): str(value.lstrip('"').rstrip('"')) for key, value in zip(header, row)}
                        _object_dict = handle_object_dict(_object_dict, model_name)
                        m = _model(**_object_dict)
                        try:
                            m.save()
                        except IntegrityError as ie:
                            print(str(ie))
                            if "UNIQUE constraint failed: range.gmba_v2_id" in str(ie):
                                print("======")
                                print("Could not save %s" % json.dumps(_object_dict))
                                print("======")
                    print("Done importing %s" % model_name)
        else:
            _model = apps.get_model(options.get('app_name', 'app'), options['model_name'])
            model_name = options['model_name']
            mother_range_reload = options.get('mother_range_reload')
            k = ''
            csv_file_name = ''
            for k, v in self.csv_files_models_dict.items():
                if v.strip().lower() == model_name.strip().lower():
                    csv_file_name = k
            if csv_file_name == '':
                raise Exception('Could not find a csv file name for model %s' % model_name)
            if csv_folder_path.endswith('/'):
                file_path = '%s%s' % (csv_folder_path, csv_file_name)
            else:
                file_path = '%s/%s' % (csv_folder_path, csv_file_name)
            csv.register_dialect(
                'mydialect',
                delimiter=',',
                quotechar='"',
                doublequote=True,
                skipinitialspace=True,
                lineterminator='\n',
                quoting=csv.QUOTE_MINIMAL)
            with open(file_path, 'r', newline='') as csv_file:
                reader = csv.reader(csv_file, dialect='mydialect')
                first = True
                error_row_count = 0
                total = 0
                for row in reader:
                    total += 1
                    if first:
                        # Assume the first row to be the header
                        header = row
                        header = [h.strip('"') for h in header]
                        first = False
                        continue
                    if mother_range_reload:
                        # We have loaded the range model already and we are attempting to construct the mother_range
                        # relationship
                        try:
                            this_range = MountainRange.objects.get(id=int(row[0]))
                            r = MountainRange.objects.get(id=int(row[5]))  # default Range which exists already
                            this_range.mother_range = r
                            this_range.save()
                        except MountainRange.DoesNotExist as dne:
                            print("this range = %s, mother range --> %s" % (row[0], row[5]))
                            print(str(dne))
                            error_row_count += 1
                        except ValueError as ve:
                            print('0 = %s and 5 = %s' % (row[0], row[5]))
                            print(str(ve))
                            error_row_count += 1
                        continue
                    _object_dict = {self.cols_to_django_fields.get(key): value.lstrip('"').rstrip('"') for key, value in zip(header, row)}
                    _object_dict = handle_object_dict(_object_dict, model_name, debug=debug)
                    try:
                        m = _model(**_object_dict)
                        m.save()
                    except django.db.utils.IntegrityError as ex1:
                        error_row_count += 1
                        print(str(ex1))
                        print("**********************")
                        print(str(_object_dict))
                        print("**********************")
                    except Exception as ex:
                        error_row_count += 1
                        print('-----')
                        print(str(ex))
                        print('----------------')
                        continue
                print("Done importing %s" % str(_model))
                print("Total rows = %s, error rows = %s" % (total, error_row_count))


def handle_object_dict(object_dict, model_name, debug=False):
    if model_name == 'Resource':
        object_dict['url'] = object_dict['url'].strip("#")
        for i in ['PEGASuS_Check_map_with_author', 'gloria', 'gnomo', 'lter', 'ltser', 'miren', 'team', 'inventory']:
            if i in object_dict:
                object_dict[i] = True if object_dict[i].lower().strip() == 'true' else False
    if model_name == 'MountainRange':
        # Reinstate range_name key
        object_dict['range_name'] = object_dict['range_name_id']
        object_dict.pop('range_name_id')
        connected_fields = [{'range_name_language': Language}, {'feature': RangeType}]
        for f in connected_fields:
            for k, v in f.items():
                if k in object_dict:
                    if debug:
                        print('Getting %s of %s' % (k, object_dict[k]))
                    if object_dict[k] == '':
                        object_dict[k] = 0
                    object_dict[k] = v.objects.get(id=int(object_dict[k]))
        if 'mother_range' in object_dict:
            if debug:
                print('Getting mother_range of %s' % object_dict['mother_range'])
            if object_dict['mother_range'] == '':
                object_dict['mother_range'] = 0
            try:
                object_dict['mother_range'] = MountainRange.objects.get(id=0)  # default Range which exists already
            except MountainRange.DoesNotExist as dne:
                print(str(dne))
        for i in ['checked']:
            if i in object_dict:
                object_dict[i] = True if object_dict[i].lower().strip() == 'true' else False
        for i in ['select_300']:
            if i in object_dict:
                object_dict[i] = True if object_dict[i].lower().strip() == 'x' else False
        for i in ['gmba_narrow']:
            if i in object_dict:
                object_dict[i] = True if object_dict[i].lower().strip() == 'x' else False
        # area field can't be empty
        if 'area' in object_dict:
            if object_dict['area'] == '':
                object_dict['area'] = -1
    if model_name == 'Keyword':
        if object_dict['mother'] == '':
            object_dict['mother'] = 0
        try:
            object_dict['mother'] = Keyword.objects.get(keyword_id=object_dict['mother'])  # default Keyword which exists already
        except Keyword.DoesNotExist as dne:
            object_dict['mother'] = Keyword.objects.get(keyword_id=0)
            print(str(dne))
    if model_name == 'Organization' and 'country_id' in object_dict:
        object_dict['country'] = object_dict['country_id']
        object_dict.pop('country_id')
    if model_name == 'PeopleRange' and 'mountain_range' in object_dict:
        object_dict['range_id'] = object_dict['mountain_range']
        object_dict.pop('mountain_range')
    if model_name == 'Species' and 'scientific_name_id' in object_dict:
        object_dict['scientific_name'] = object_dict['scientific_name_id']
        object_dict.pop('scientific_name_id')
    if model_name == 'TaxonRange' and 'taxon_id' in object_dict:
        object_dict['taxon'] = object_dict['taxon_id']
        object_dict.pop('taxon_id')
    if model_name == 'Person' and 'organization_id' in object_dict:
        if debug:
            print("organization_id=%s" % object_dict['organization_id'])
        if object_dict['organization_id'] == '' or object_dict['organization_id'] is None:
            object_dict['organization_id'] = '-1'
        else:
            object_dict['organization_id'] = int(float(object_dict['organization_id']))
        if 'status' in object_dict:
            if debug:
                print('Getting status of %s' % object_dict['status'])
            if object_dict['status'] == '':
                object_dict['status'] = 0
            object_dict['status'] = PeopleStatus.objects.get(id=int(object_dict['status']))
        if 'country_lookup' in object_dict:
            if object_dict['country_lookup'].strip() == '' or object_dict['country_lookup'] is None:
                object_dict['country_lookup'] = 0
            if debug:
                print('Getting country of %s' % object_dict['country_lookup'])
            object_dict['country'] = Country.objects.get(id=int(object_dict['country_lookup']))
            object_dict.pop('country_lookup')
        for i in ['news_letter', 'birds', 'mammals', 'reptiles', 'amphibians', 'fish', 'insects',
                  'molluscs', 'crustaceans', 'arachnids', 'angiosperms', 'gymnosperms', 'fungi',
                  'algae', 'microbes', 'biological_field_sampling', 'data_mining', 'remote_sensing',
                  'gis', 'spatial_analysis', 'statistical_analysis', 'modelling', 'assessment',
                  'meta_analysis', 'synthesis', 'qualitative_ssm', 'genetic_analyses', 'field_site',
                  'transect', 'mountain_top', 'mountain_range', 'landscape', 'regional', 'national',
                  '_global', 'profile_on_web', 'updated']:
            if i in object_dict:
                object_dict[i] = True if object_dict[i].lower().strip() == 'true' else False
    if object_dict is None:
        if debug:
            print("Object None for %s" % model_name)
    # else:
    #     print(object_dict) if str(object_dict) is not None else "Str object dict is None"
    return object_dict