From 73f63988ee72121deeed7ceec4d90b7e84e141fb Mon Sep 17 00:00:00 2001 From: lizkrznarich Date: Wed, 1 May 2024 16:14:46 -0500 Subject: [PATCH 1/3] bulk update handle case where name field value has too many * chars --- rorapi/common/csv_create.py | 29 +++++++++++++++++------------ rorapi/common/csv_update.py | 23 +++++++++++++---------- 2 files changed, 30 insertions(+), 22 deletions(-) diff --git a/rorapi/common/csv_create.py b/rorapi/common/csv_create.py index cc8799f..10ff03b 100644 --- a/rorapi/common/csv_create.py +++ b/rorapi/common/csv_create.py @@ -54,21 +54,26 @@ def new_record_from_csv(csv_data, version): if csv_data['names.types.' + v]: for n in csv_data['names.types.' + v].strip(';').split(';'): if LANG_DELIMITER in n: - name_val, lang = n.split("*") - if lang: - lang_errors, lang_code = get_lang_code(lang.strip()) - if lang_errors: - errors.append("Could not convert language value to ISO code: {}".format(lang)) + if n.count(LANG_DELIMITER) == 1: + name_val, lang = n.split("*") + if lang: + lang_errors, lang_code = get_lang_code(lang.strip()) + if lang_errors: + errors.append("Could not convert language value to ISO code: {}".format(lang)) + else: + name_val = None + lang_code = None + errors.append("Could not parse name value {} in names.types.{} because it contains multiple {} lang delimiter chars.".format(n, v, LANG_DELIMITER)) else: name_val = n lang_code = None - - name_obj = { - "types": [v], - "value": name_val.strip(), - "lang": lang_code - } - temp_names.append(name_obj) + if name_val: + name_obj = { + "types": [v], + "value": name_val.strip(), + "lang": lang_code + } + temp_names.append(name_obj) print("temp names 1:") print(temp_names) name_vals = [n['value'] for n in temp_names] diff --git a/rorapi/common/csv_update.py b/rorapi/common/csv_update.py index 51e8d5f..3e41716 100644 --- a/rorapi/common/csv_update.py +++ b/rorapi/common/csv_update.py @@ -247,18 +247,21 @@ def update_record_from_csv(csv_data, version): "lang": None } if LANG_DELIMITER in val: - print("has lang delim") - name_val, lang = val.split("*") - vals_obj["value"] = name_val.strip() - if lang: - lang_errors, lang_code = get_lang_code(lang.strip()) - if lang_errors: - errors.append("Could not convert language value to ISO code: {}".format(lang)) - else: - vals_obj["lang"] = lang_code + if val.count(LANG_DELIMITER) == 1: + name_val, lang = val.split("*") + vals_obj["value"] = name_val.strip() + if lang: + lang_errors, lang_code = get_lang_code(lang.strip()) + if lang_errors: + errors.append("Could not convert language value to ISO code: {}".format(lang)) + else: + vals_obj["lang"] = lang_code + else: + errors.append("Could not parse name value {} in names.types.{} because it contains multiple {} lang delimiter chars.".format(val, t, LANG_DELIMITER)) else: vals_obj["value"] = val.strip() - vals_obj_list.append(vals_obj) + if vals_obj["value"]: + vals_obj_list.append(vals_obj) actions_values[k] = vals_obj_list print("updated actions values") print(actions_values) From a0fe3a60bce1c906521910e3f71fbed58683cccd Mon Sep 17 00:00:00 2001 From: lizkrznarich Date: Thu, 25 Jul 2024 16:40:06 -0500 Subject: [PATCH 2/3] add error and sys exit if dump zip contains a directory --- rorapi/management/commands/getrordump.py | 12 +++++-- rorapi/management/commands/indexrordump.py | 37 ++++++++++++---------- 2 files changed, 29 insertions(+), 20 deletions(-) diff --git a/rorapi/management/commands/getrordump.py b/rorapi/management/commands/getrordump.py index 4bee42d..cb75d74 100644 --- a/rorapi/management/commands/getrordump.py +++ b/rorapi/management/commands/getrordump.py @@ -30,7 +30,7 @@ def get_ror_dump_sha(filename, use_test_data, github_headers): except: return None -def get_ror_dump_zip(filename, use_test_data, github_headers): +def get_ror_dump_zip(self, filename, use_test_data, github_headers): sha = get_ror_dump_sha(filename, use_test_data, github_headers) if sha: if use_test_data: @@ -46,9 +46,14 @@ def get_ror_dump_zip(filename, use_test_data, github_headers): file_decoded = base64.b64decode(response_json['content']) with open(filename + '.zip', 'wb') as zip_file: zip_file.write(file_decoded) + with zipfile.ZipFile(zip_file.name, 'r') as ror_zip: + filenames = ror_zip.namelist() + dir_names = [f for f in filenames if ('json' not in f and 'csv' not in f)] + if dir_names: + raise SystemExit(f"Dump zip has extra directory and cannot be indexed") return zip_file.name except: - return None + raise SystemExit(f"Something went wrong saving zip file") class Command(BaseCommand): help = 'Downloads a specified ROR data dump from Github' @@ -61,4 +66,5 @@ def handle(self, *args, **options): github_headers = AUTH_HEADERS else: github_headers = HEADERS - ror_dump_zip = get_ror_dump_zip(filename, use_test_data, github_headers) + ror_dump_zip = get_ror_dump_zip(self, filename, use_test_data, github_headers) + diff --git a/rorapi/management/commands/indexrordump.py b/rorapi/management/commands/indexrordump.py index f1ce85a..c116c76 100644 --- a/rorapi/management/commands/indexrordump.py +++ b/rorapi/management/commands/indexrordump.py @@ -114,23 +114,26 @@ def handle(self, *args, **options): for file in unzipped_files: if file.endswith(".json"): json_files.append(file) - for json_file in json_files: - index = None - json_path = os.path.join(DATA['WORKING_DIR'], filename, '') + json_file - if 'schema_v2' in json_file and (options['schema']==2 or options['schema'] is None): - self.stdout.write('Loading JSON') - with open(json_path, 'r') as it: - dataset = json.load(it) - self.stdout.write('Indexing ROR dataset ' + json_file) - index = ES_VARS['INDEX_V2'] - index_dump(self, json_file, index, dataset) - if 'schema_v2' not in json_file and (options['schema']==1 or options['schema'] is None): - self.stdout.write('Loading JSON') - with open(json_path, 'r') as it: - dataset = json.load(it) - self.stdout.write('Indexing ROR dataset ' + json_file) - index = ES_VARS['INDEX_V1'] - index_dump(self, json_file, index, dataset) + if json_files: + for json_file in json_files: + index = None + json_path = os.path.join(DATA['WORKING_DIR'], filename, '') + json_file + if 'schema_v2' in json_file and (options['schema']==2 or options['schema'] is None): + self.stdout.write('Loading JSON') + with open(json_path, 'r') as it: + dataset = json.load(it) + self.stdout.write('Indexing ROR dataset ' + json_file) + index = ES_VARS['INDEX_V2'] + index_dump(self, json_file, index, dataset) + if 'schema_v2' not in json_file and (options['schema']==1 or options['schema'] is None): + self.stdout.write('Loading JSON') + with open(json_path, 'r') as it: + dataset = json.load(it) + self.stdout.write('Indexing ROR dataset ' + json_file) + index = ES_VARS['INDEX_V1'] + index_dump(self, json_file, index, dataset) + else: + self.stdout.write("ROR data dump does not contain any JSON files") else: self.stdout.write("ROR data dump zip file does not exist") From 56877591f2a399cf7457283e08dac7c629dec658 Mon Sep 17 00:00:00 2001 From: lizkrznarich Date: Thu, 25 Jul 2024 18:15:51 -0500 Subject: [PATCH 3/3] pass msg back to view in case of error --- rorapi/management/commands/setup.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/rorapi/management/commands/setup.py b/rorapi/management/commands/setup.py index 3505e72..a87b7f4 100644 --- a/rorapi/management/commands/setup.py +++ b/rorapi/management/commands/setup.py @@ -52,11 +52,14 @@ def handle(self, *args, **options): sha = get_ror_dump_sha(filename, use_test_data) if sha: - GetRorDumpCommand().handle(*args, **options) - DeleteIndexCommand().handle(*args, **options) - CreateIndexCommand().handle(*args, **options) - IndexRorDumpCommand().handle(*args, **options) - msg = 'SUCCESS: ROR dataset {} indexed in version {}. Using test repo: {}'.format(filename, str(options['schema']), str(use_test_data)) + try: + GetRorDumpCommand().handle(*args, **options) + DeleteIndexCommand().handle(*args, **options) + CreateIndexCommand().handle(*args, **options) + IndexRorDumpCommand().handle(*args, **options) + msg = 'SUCCESS: ROR dataset {} indexed in version {}. Using test repo: {}'.format(filename, str(options['schema']), str(use_test_data)) + except: + msg = 'ERROR: Could not index ROR data dump. Check API logs for details.' else: msg = 'ERROR: ROR dataset for file {} not found. '.format(filename) \ +'Please generate the data dump first.'