Merge pull request #398 from ror-community/dump-index-error-if-extra-dir

Add an error and system exit if the dump zip contains a directory
ror-community · Jul 25, 2024 · 3713bcc
2 parents ea1f523 + a0fe3a6
commit 3713bcc
Show file tree

Hide file tree

Showing 2 changed files with 29 additions and 20 deletions.
diff --git a/rorapi/management/commands/getrordump.py b/rorapi/management/commands/getrordump.py
@@ -30,7 +30,7 @@ def get_ror_dump_sha(filename, use_test_data, github_headers):
     except:
         return None
 
-def get_ror_dump_zip(filename, use_test_data, github_headers):
+def get_ror_dump_zip(self, filename, use_test_data, github_headers):
     sha = get_ror_dump_sha(filename, use_test_data, github_headers)
     if sha:
         if use_test_data:
@@ -46,9 +46,14 @@ def get_ror_dump_zip(filename, use_test_data, github_headers):
             file_decoded = base64.b64decode(response_json['content'])
             with open(filename + '.zip', 'wb') as zip_file:
                 zip_file.write(file_decoded)
+            with zipfile.ZipFile(zip_file.name, 'r') as ror_zip:
+                filenames = ror_zip.namelist()
+                dir_names = [f for f in filenames if ('json' not in f and 'csv' not in f)]
+                if dir_names:
+                    raise SystemExit(f"Dump zip has extra directory and cannot be indexed")
             return zip_file.name
         except:
-            return None
+            raise SystemExit(f"Something went wrong saving zip file")
 
 class Command(BaseCommand):
     help = 'Downloads a specified ROR data dump from Github'
@@ -61,4 +66,5 @@ def handle(self, *args, **options):
             github_headers = AUTH_HEADERS
         else:
             github_headers = HEADERS
-        ror_dump_zip = get_ror_dump_zip(filename, use_test_data, github_headers)
+        ror_dump_zip = get_ror_dump_zip(self, filename, use_test_data, github_headers)
+
diff --git a/rorapi/management/commands/indexrordump.py b/rorapi/management/commands/indexrordump.py
@@ -114,23 +114,26 @@ def handle(self, *args, **options):
             for file in unzipped_files:
                 if file.endswith(".json"):
                     json_files.append(file)
-            for json_file in json_files:
-                index = None
-                json_path = os.path.join(DATA['WORKING_DIR'], filename, '') + json_file
-                if 'schema_v2' in json_file and (options['schema']==2 or options['schema'] is None):
-                    self.stdout.write('Loading JSON')
-                    with open(json_path, 'r') as it:
-                        dataset = json.load(it)
-                    self.stdout.write('Indexing ROR dataset ' + json_file)
-                    index = ES_VARS['INDEX_V2']
-                    index_dump(self, json_file, index, dataset)
-                if 'schema_v2' not in json_file and (options['schema']==1 or options['schema'] is None):
-                    self.stdout.write('Loading JSON')
-                    with open(json_path, 'r') as it:
-                        dataset = json.load(it)
-                    self.stdout.write('Indexing ROR dataset ' + json_file)
-                    index = ES_VARS['INDEX_V1']
-                    index_dump(self, json_file, index, dataset)
+            if json_files:
+                for json_file in json_files:
+                    index = None
+                    json_path = os.path.join(DATA['WORKING_DIR'], filename, '') + json_file
+                    if 'schema_v2' in json_file and (options['schema']==2 or options['schema'] is None):
+                        self.stdout.write('Loading JSON')
+                        with open(json_path, 'r') as it:
+                            dataset = json.load(it)
+                        self.stdout.write('Indexing ROR dataset ' + json_file)
+                        index = ES_VARS['INDEX_V2']
+                        index_dump(self, json_file, index, dataset)
+                    if 'schema_v2' not in json_file and (options['schema']==1 or options['schema'] is None):
+                        self.stdout.write('Loading JSON')
+                        with open(json_path, 'r') as it:
+                            dataset = json.load(it)
+                        self.stdout.write('Indexing ROR dataset ' + json_file)
+                        index = ES_VARS['INDEX_V1']
+                        index_dump(self, json_file, index, dataset)
+            else:
+                self.stdout.write("ROR data dump does not contain any JSON files")
 
         else:
             self.stdout.write("ROR data dump zip file does not exist")