Add django command for extracting annotations
twinkarma committed Oct 2, 2023
1 parent a9c28d8 commit baa6882
Showing 2 changed files with 87 additions and 27 deletions.
58 changes: 58 additions & 0 deletions backend/management/commands/download_annotations.py
@@ -0,0 +1,58 @@
import argparse

from django.core.management.base import BaseCommand

from backend.views import DownloadAnnotationsView


class Command(BaseCommand):

    help = "Download annotation data"

    def add_arguments(self, parser):
        parser.add_argument("output_path", type=str, help="Path of the output file")
        parser.add_argument("project_id", type=str, help="ID of the project")
        parser.add_argument("doc_type", type=str, help="Document type: all, training, test, or annotation")
        parser.add_argument("export_type", type=str, help="Export type: json, jsonl, or csv")
        parser.add_argument("anonymize", type=self.str2bool, help="Whether the data should be anonymized")
        parser.add_argument("-j", "--json_format", type=str, help="JSON format: raw (default) or gate")
        parser.add_argument("-n", "--num_entries_per_file", type=int, help="Number of entries to generate per file, default 500")

    def handle(self, *args, **options):
        # Reuse the zip-export logic from the annotations download view
        annotations_downloader = DownloadAnnotationsView()

        output_path = options["output_path"]
        project_id = options["project_id"]
        doc_type = options["doc_type"]
        export_type = options["export_type"]
        anonymize = options["anonymize"]
        json_format = options["json_format"] or "raw"
        num_entries_per_file = options["num_entries_per_file"] or 500

        print(f"Writing annotations to {output_path}\n"
              f" Project: {project_id}\n"
              f" Document type: {doc_type}\n"
              f" Export type: {export_type}\n"
              f" Anonymized: {anonymize}\n"
              f" JSON format: {json_format}\n"
              f" Num entries per file: {num_entries_per_file}\n")

        # Write the zipped annotation export directly to the given path
        with open(output_path, "wb") as z:
            annotations_downloader.write_zip_to_file(file_stream=z,
                                                     project_id=project_id,
                                                     doc_type=doc_type,
                                                     export_type=export_type,
                                                     json_format=json_format,
                                                     anonymize=anonymize,
                                                     documents_per_file=num_entries_per_file)

    def str2bool(self, v):
        # Accept common textual representations of a boolean from the command line
        if isinstance(v, bool):
            return v
        if v.lower() in ("yes", "true", "t", "y", "1"):
            return True
        elif v.lower() in ("no", "false", "f", "n", "0"):
            return False
        else:
            raise argparse.ArgumentTypeError("Boolean value expected.")
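
For reference, the command can also be invoked programmatically through Django's standard call_command API. The sketch below is illustrative only; the output path and project ID are placeholder values, not taken from this commit.

# Illustrative usage of the new command via Django's call_command.
# The output path and project ID below are placeholders.
from django.core.management import call_command

call_command(
    "download_annotations",
    "/tmp/project-1-annotations.zip",  # output_path
    "1",                               # project_id
    "all",                             # doc_type: all, training, test, or annotation
    "json",                            # export_type: json, jsonl, or csv
    "yes",                             # anonymize, parsed by str2bool
    json_format="raw",                 # -j/--json_format
    num_entries_per_file=500,          # -n/--num_entries_per_file
)

The equivalent shell invocation is python manage.py download_annotations <output_path> <project_id> <doc_type> <export_type> <anonymize>, optionally with -j raw|gate and -n <entries per file>.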


56 changes: 29 additions & 27 deletions backend/views.py
@@ -58,36 +58,10 @@ def get(self, request, project_id, doc_type, export_type, json_format, entries_p

     def generate_download(self, project_id, doc_type="all", export_type="json", json_format="raw", anonymize=True, chunk_size=512, documents_per_file=500):

-        project = Project.objects.get(pk=project_id)
-
         with tempfile.TemporaryFile() as z:
-            with ZipFile(z, "w") as zip:
-                all_docs = project.documents.all()
-                if doc_type == "training":
-                    all_docs = project.documents.filter(doc_type=DocumentType.TRAINING)
-                elif doc_type == "test":
-                    all_docs = project.documents.filter(doc_type=DocumentType.TEST)
-                elif doc_type == "annotation":
-                    all_docs = project.documents.filter(doc_type=DocumentType.ANNOTATION)
-
-
-                num_docs = all_docs.count()
-                num_slices = math.ceil(num_docs/documents_per_file)
-
-                for slice_index in range(num_slices):
-                    start_index = slice_index*documents_per_file
-                    end_index = ((slice_index+1)*documents_per_file)
-                    if end_index >= num_docs:
-                        end_index = num_docs
-
-                    slice_docs = all_docs[start_index:end_index]
-
-                    with tempfile.NamedTemporaryFile("w+") as f:
-                        self.write_docs_to_file(f, slice_docs, export_type, json_format, anonymize)
-                        zip.write(f.name, f"project-{project_id}-{doc_type}-{slice_index:04d}.{export_type}")
+            self.write_zip_to_file(z, project_id, doc_type, export_type, json_format, anonymize, documents_per_file)

             # Stream file output

             z.seek(0)
             while True:
                 c = z.read(chunk_size)
@@ -96,6 +70,34 @@ def generate_download(self, project_id, doc_type="all", export_type="json", json
                 else:
                     break

+
+    def write_zip_to_file(self, file_stream, project_id, doc_type="all", export_type="json", json_format="raw", anonymize=True, documents_per_file=500):
+
+        project = Project.objects.get(pk=project_id)
+        with ZipFile(file_stream, "w") as zip:
+            all_docs = project.documents.all()
+            if doc_type == "training":
+                all_docs = project.documents.filter(doc_type=DocumentType.TRAINING)
+            elif doc_type == "test":
+                all_docs = project.documents.filter(doc_type=DocumentType.TEST)
+            elif doc_type == "annotation":
+                all_docs = project.documents.filter(doc_type=DocumentType.ANNOTATION)
+
+            num_docs = all_docs.count()
+            num_slices = math.ceil(num_docs / documents_per_file)
+
+            for slice_index in range(num_slices):
+                start_index = slice_index * documents_per_file
+                end_index = ((slice_index + 1) * documents_per_file)
+                if end_index >= num_docs:
+                    end_index = num_docs
+
+                slice_docs = all_docs[start_index:end_index]
+
+                with tempfile.NamedTemporaryFile("w+") as f:
+                    self.write_docs_to_file(f, slice_docs, export_type, json_format, anonymize)
+                    zip.write(f.name, f"project-{project_id}-{doc_type}-{slice_index:04d}.{export_type}")
+
     def write_docs_to_file(self, file, documents, export_type, json_format, anonymize):
         if export_type == "json":
             self.write_docs_as_json(file, documents, json_format, anonymize)
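
The new write_zip_to_file splits the export into documents_per_file-sized zip entries, so with the default of 500 a project with 1,234 documents yields three members: project-<id>-all-0000.json, -0001.json and -0002.json. For context, a chunked generator such as generate_download above can also back a streamed HTTP download. The view below is a hypothetical sketch using Django's StreamingHttpResponse; it is not the project's existing get() endpoint, which this diff only shows in truncated form.

# Hypothetical sketch: streaming the zipped export over HTTP.
# The view function and its wiring are assumptions, not part of this commit.
from django.http import StreamingHttpResponse

from backend.views import DownloadAnnotationsView


def stream_annotations(request, project_id):
    downloader = DownloadAnnotationsView()
    response = StreamingHttpResponse(
        downloader.generate_download(project_id, doc_type="all", export_type="json"),
        content_type="application/zip",
    )
    response["Content-Disposition"] = f"attachment; filename=project-{project_id}.zip"
    return response

Because generate_download yields the archive in chunk_size pieces from a temporary file, memory use stays bounded by the chunk size rather than by the size of the full export.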
