diff --git a/rorapi/management/commands/getrordump.py b/rorapi/management/commands/getrordump.py
index 4bee42d..cb75d74 100644
--- a/rorapi/management/commands/getrordump.py
+++ b/rorapi/management/commands/getrordump.py
@@ -30,7 +30,7 @@ def get_ror_dump_sha(filename, use_test_data, github_headers):
     except:
         return None
 
-def get_ror_dump_zip(filename, use_test_data, github_headers):
+def get_ror_dump_zip(self, filename, use_test_data, github_headers):
     sha = get_ror_dump_sha(filename, use_test_data, github_headers)
     if sha:
         if use_test_data:
@@ -46,9 +46,14 @@ def get_ror_dump_zip(filename, use_test_data, github_headers):
             file_decoded = base64.b64decode(response_json['content'])
             with open(filename + '.zip', 'wb') as zip_file:
                 zip_file.write(file_decoded)
+            with zipfile.ZipFile(zip_file.name, 'r') as ror_zip:
+                filenames = ror_zip.namelist()
+                dir_names = [f for f in filenames if ('json' not in f and 'csv' not in f)]
+                if dir_names:
+                    raise SystemExit(f"Dump zip has extra directory and cannot be indexed")
             return zip_file.name
         except:
-            return None
+            raise SystemExit(f"Something went wrong saving zip file")
 
 class Command(BaseCommand):
     help = 'Downloads a specified ROR data dump from Github'
@@ -61,4 +66,5 @@ def handle(self, *args, **options):
             github_headers = AUTH_HEADERS
         else:
             github_headers = HEADERS
-        ror_dump_zip = get_ror_dump_zip(filename, use_test_data, github_headers)
+        ror_dump_zip = get_ror_dump_zip(self, filename, use_test_data, github_headers)
+
diff --git a/rorapi/management/commands/indexrordump.py b/rorapi/management/commands/indexrordump.py
index f1ce85a..c116c76 100644
--- a/rorapi/management/commands/indexrordump.py
+++ b/rorapi/management/commands/indexrordump.py
@@ -114,23 +114,26 @@ def handle(self, *args, **options):
             for file in unzipped_files:
                 if file.endswith(".json"):
                     json_files.append(file)
-            for json_file in json_files:
-                index = None
-                json_path = os.path.join(DATA['WORKING_DIR'], filename, '') + json_file
-                if 'schema_v2' in json_file and (options['schema']==2 or options['schema'] is None):
-                    self.stdout.write('Loading JSON')
-                    with open(json_path, 'r') as it:
-                        dataset = json.load(it)
-                    self.stdout.write('Indexing ROR dataset ' + json_file)
-                    index = ES_VARS['INDEX_V2']
-                    index_dump(self, json_file, index, dataset)
-                if 'schema_v2' not in json_file and (options['schema']==1 or options['schema'] is None):
-                    self.stdout.write('Loading JSON')
-                    with open(json_path, 'r') as it:
-                        dataset = json.load(it)
-                    self.stdout.write('Indexing ROR dataset ' + json_file)
-                    index = ES_VARS['INDEX_V1']
-                    index_dump(self, json_file, index, dataset)
+            if json_files:
+                for json_file in json_files:
+                    index = None
+                    json_path = os.path.join(DATA['WORKING_DIR'], filename, '') + json_file
+                    if 'schema_v2' in json_file and (options['schema']==2 or options['schema'] is None):
+                        self.stdout.write('Loading JSON')
+                        with open(json_path, 'r') as it:
+                            dataset = json.load(it)
+                        self.stdout.write('Indexing ROR dataset ' + json_file)
+                        index = ES_VARS['INDEX_V2']
+                        index_dump(self, json_file, index, dataset)
+                    if 'schema_v2' not in json_file and (options['schema']==1 or options['schema'] is None):
+                        self.stdout.write('Loading JSON')
+                        with open(json_path, 'r') as it:
+                            dataset = json.load(it)
+                        self.stdout.write('Indexing ROR dataset ' + json_file)
+                        index = ES_VARS['INDEX_V1']
+                        index_dump(self, json_file, index, dataset)
+            else:
+                self.stdout.write("ROR data dump does not contain any JSON files")
         else:
             self.stdout.write("ROR data dump zip file does not exist")
 
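For illustration only (not part of the patch above): a minimal standalone sketch of the zip-content check that the patched get_ror_dump_zip performs, assuming a hypothetical locally downloaded dump named 'ror-data.zip'. Any archive entry that is neither a JSON nor a CSV file is treated as an unexpected directory and aborts the run, mirroring the SystemExit behaviour added in the diff.

    import zipfile

    def check_dump_contents(zip_path):
        # List every entry in the downloaded dump archive.
        with zipfile.ZipFile(zip_path, 'r') as ror_zip:
            entries = ror_zip.namelist()
        # Entries that are neither JSON nor CSV (e.g. a nested directory)
        # mean the dump cannot be indexed as-is, so stop the command.
        unexpected = [e for e in entries if 'json' not in e and 'csv' not in e]
        if unexpected:
            raise SystemExit('Dump zip has extra directory and cannot be indexed')
        return entries

    # Hypothetical usage; 'ror-data.zip' is an assumed local file name.
    # check_dump_contents('ror-data.zip')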