Add django command for extracting annotations
twinkarma committed Oct 2, 2023
1 parent a9c28d8 commit baa6882
Showing 2 changed files with 87 additions and 27 deletions.
58 changes: 58 additions & 0 deletions backend/management/commands/download_annotations.py
@@ -0,0 +1,58 @@
import argparse

from django.core.management.base import BaseCommand

from backend.views import DownloadAnnotationsView


class Command(BaseCommand):

    help = "Download annotation data"

    def add_arguments(self, parser):
        parser.add_argument("output_path", type=str, help="Path of the output file")
        parser.add_argument("project_id", type=str, help="ID of the project")
        parser.add_argument("doc_type", type=str, help="Document type: all, training, test, or annotation")
        parser.add_argument("export_type", type=str, help="Export type: json, jsonl, or csv")
        parser.add_argument("anonymize", type=self.str2bool, help="Whether the data should be anonymized")
        parser.add_argument("-j", "--json_format", type=str, help="JSON format: raw (default) or gate")
        parser.add_argument("-n", "--num_entries_per_file", type=int, help="Number of entries to generate per file, default 500")

    def handle(self, *args, **options):
        # Reuse the zip-export logic from the annotations download view
        annotations_downloader = DownloadAnnotationsView()

        output_path = options["output_path"]
        project_id = options["project_id"]
        doc_type = options["doc_type"]
        export_type = options["export_type"]
        anonymize = options["anonymize"]
        json_format = options["json_format"] or "raw"
        num_entries_per_file = options["num_entries_per_file"] or 500

        print(f"Writing annotations to {output_path}\n"
              f" Project: {project_id}\n"
              f" Document type: {doc_type}\n"
              f" Export type: {export_type}\n"
              f" Anonymized: {anonymize}\n"
              f" JSON format: {json_format}\n"
              f" Num entries per file: {num_entries_per_file}\n")

        # Write the zipped annotation export directly to the given path
        with open(output_path, "wb") as z:
            annotations_downloader.write_zip_to_file(file_stream=z,
                                                     project_id=project_id,
                                                     doc_type=doc_type,
                                                     export_type=export_type,
                                                     json_format=json_format,
                                                     anonymize=anonymize,
                                                     documents_per_file=num_entries_per_file)

    def str2bool(self, v):
        # Accept common textual representations of a boolean from the command line
        if isinstance(v, bool):
            return v
        if v.lower() in ("yes", "true", "t", "y", "1"):
            return True
        elif v.lower() in ("no", "false", "f", "n", "0"):
            return False
        else:
            raise argparse.ArgumentTypeError("Boolean value expected.")
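
For reference, the command can also be invoked programmatically through Django's standard call_command API. The sketch below is illustrative only; the output path and project ID are placeholder values, not taken from this commit.

# Illustrative usage of the new command via Django's call_command.
# The output path and project ID below are placeholders.
from django.core.management import call_command

call_command(
    "download_annotations",
    "/tmp/project-1-annotations.zip",  # output_path
    "1",                               # project_id
    "all",                             # doc_type: all, training, test, or annotation
    "json",                            # export_type: json, jsonl, or csv
    "yes",                             # anonymize, parsed by str2bool
    json_format="raw",                 # -j/--json_format
    num_entries_per_file=500,          # -n/--num_entries_per_file
)

The equivalent shell invocation is python manage.py download_annotations <output_path> <project_id> <doc_type> <export_type> <anonymize>, optionally with -j raw|gate and -n <entries per file>.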


56 changes: 29 additions & 27 deletions backend/views.py
@@ -58,36 +58,10 @@ def get(self, request, project_id, doc_type, export_type, json_format, entries_p

     def generate_download(self, project_id, doc_type="all", export_type="json", json_format="raw", anonymize=True, chunk_size=512, documents_per_file=500):

-        project = Project.objects.get(pk=project_id)
-
         with tempfile.TemporaryFile() as z:
-            with ZipFile(z, "w") as zip:
-                all_docs = project.documents.all()
-                if doc_type == "training":
-                    all_docs = project.documents.filter(doc_type=DocumentType.TRAINING)
-                elif doc_type == "test":
-                    all_docs = project.documents.filter(doc_type=DocumentType.TEST)
-                elif doc_type == "annotation":
-                    all_docs = project.documents.filter(doc_type=DocumentType.ANNOTATION)
-
-
-                num_docs = all_docs.count()
-                num_slices = math.ceil(num_docs/documents_per_file)
-
-                for slice_index in range(num_slices):
-                    start_index = slice_index*documents_per_file
-                    end_index = ((slice_index+1)*documents_per_file)
-                    if end_index >= num_docs:
-                        end_index = num_docs
-
-                    slice_docs = all_docs[start_index:end_index]
-
-                    with tempfile.NamedTemporaryFile("w+") as f:
-                        self.write_docs_to_file(f, slice_docs, export_type, json_format, anonymize)
-                        zip.write(f.name, f"project-{project_id}-{doc_type}-{slice_index:04d}.{export_type}")
+            self.write_zip_to_file(z, project_id, doc_type, export_type, json_format, anonymize, documents_per_file)

             # Stream file output

             z.seek(0)
             while True:
                 c = z.read(chunk_size)
@@ -96,6 +70,34 @@ def generate_download(self, project_id, doc_type="all", export_type="json", json
                 else:
                     break

+
+    def write_zip_to_file(self, file_stream, project_id, doc_type="all", export_type="json", json_format="raw", anonymize=True, documents_per_file=500):
+
+        project = Project.objects.get(pk=project_id)
+        with ZipFile(file_stream, "w") as zip:
+            all_docs = project.documents.all()
+            if doc_type == "training":
+                all_docs = project.documents.filter(doc_type=DocumentType.TRAINING)
+            elif doc_type == "test":
+                all_docs = project.documents.filter(doc_type=DocumentType.TEST)
+            elif doc_type == "annotation":
+                all_docs = project.documents.filter(doc_type=DocumentType.ANNOTATION)
+
+            num_docs = all_docs.count()
+            num_slices = math.ceil(num_docs / documents_per_file)
+
+            for slice_index in range(num_slices):
+                start_index = slice_index * documents_per_file
+                end_index = ((slice_index + 1) * documents_per_file)
+                if end_index >= num_docs:
+                    end_index = num_docs
+
+                slice_docs = all_docs[start_index:end_index]
+
+                with tempfile.NamedTemporaryFile("w+") as f:
+                    self.write_docs_to_file(f, slice_docs, export_type, json_format, anonymize)
+                    zip.write(f.name, f"project-{project_id}-{doc_type}-{slice_index:04d}.{export_type}")
+
     def write_docs_to_file(self, file, documents, export_type, json_format, anonymize):
         if export_type == "json":
             self.write_docs_as_json(file, documents, json_format, anonymize)
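
The new write_zip_to_file splits the export into documents_per_file-sized zip entries, so with the default of 500 a project with 1,234 documents yields three members: project-<id>-all-0000.json, -0001.json and -0002.json. For context, a chunked generator such as generate_download above can also back a streamed HTTP download. The view below is a hypothetical sketch using Django's StreamingHttpResponse; it is not the project's existing get() endpoint, which this diff only shows in truncated form.

# Hypothetical sketch: streaming the zipped export over HTTP.
# The view function and its wiring are assumptions, not part of this commit.
from django.http import StreamingHttpResponse

from backend.views import DownloadAnnotationsView


def stream_annotations(request, project_id):
    downloader = DownloadAnnotationsView()
    response = StreamingHttpResponse(
        downloader.generate_download(project_id, doc_type="all", export_type="json"),
        content_type="application/zip",
    )
    response["Content-Disposition"] = f"attachment; filename=project-{project_id}.zip"
    return response

Because generate_download yields the archive in chunk_size pieces from a temporary file, memory use stays bounded by the chunk size rather than by the size of the full export.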
