Skip to content

Commit

Permalink
Merge pull request #384 from GateNLP/spread-docs-evenly
Browse files Browse the repository at this point in the history
Attempt to spread documents more evenly across annotators
  • Loading branch information
ianroberts authored Aug 4, 2023
2 parents 44098c8 + 04118ad commit a9c28d8
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 7 deletions.
15 changes: 12 additions & 3 deletions backend/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -737,9 +737,18 @@ def assign_annotator_task(self, user, doc_type=DocumentType.ANNOTATION):
Annotation task performs an extra check for remaining annotation task (num_annotation_tasks_remaining),
testing and training do not do this check, as the annotator must annotate all documents.
"""
if (DocumentType.ANNOTATION and self.num_annotation_tasks_remaining > 0) or \
DocumentType.TEST or DocumentType.TRAINING:
for doc in self.documents.filter(doc_type=doc_type).order_by('?'):
if (doc_type == DocumentType.ANNOTATION and self.num_annotation_tasks_remaining > 0) or \
doc_type == DocumentType.TEST or doc_type == DocumentType.TRAINING:
if doc_type == DocumentType.TEST or doc_type == DocumentType.TRAINING:
queryset = self.documents.filter(doc_type=doc_type).order_by('?')
else:
# Prefer documents which have fewer complete or pending annotations, in order to
# spread the annotators as evenly as possible across the available documents
queryset = self.documents.filter(doc_type=doc_type).alias(
occupied_annotations=Count("annotations", filter=Q(annotations__status=Annotation.COMPLETED)
| Q(annotations__status=Annotation.PENDING))
).order_by('occupied_annotations', '?')
for doc in queryset:
# Check that annotator hasn't annotated and that
# doc hasn't been fully annotated
if doc.user_can_annotate_document(user):
Expand Down
27 changes: 23 additions & 4 deletions backend/tests/test_rpc_endpoints.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from django.utils import timezone
import json
import logging

from backend.models import Annotation, Document, DocumentType, Project, AnnotatorProject, UserDocumentFormatPreference
from backend.rpc import create_project, update_project, add_project_document, add_document_annotation, \
Expand All @@ -28,7 +29,7 @@
from backend.tests.test_rpc_server import TestEndpoint



LOGGER = logging.getLogger(__name__)

class TestUserAuth(TestCase):

Expand Down Expand Up @@ -1379,7 +1380,7 @@ def setUp(self):
self.num_training_docs = 5
self.training_docs = []
for i in range(self.num_training_docs):
self.docs.append(Document.objects.create(project=self.proj,
self.training_docs.append(Document.objects.create(project=self.proj,
doc_type=DocumentType.TRAINING,
data={
"text": f"Document {i}",
Expand All @@ -1396,7 +1397,7 @@ def setUp(self):
self.num_test_docs = 10
self.test_docs = []
for i in range(self.num_test_docs):
self.docs.append(Document.objects.create(project=self.proj,
self.test_docs.append(Document.objects.create(project=self.proj,
doc_type=DocumentType.TEST,
data={
"text": f"Document {i}",
Expand Down Expand Up @@ -1609,10 +1610,11 @@ def test_annotations_per_doc_not_enforced_for_training_or_test(self):
self.proj.save()

docs_annotated_per_user = []
for (i, (ann_user, _)) in enumerate(self.annotators):
for (ann_user, _) in self.annotators:
# Add to project
self.assertTrue(add_project_annotator(self.manager_request, self.proj.id, ann_user.username))

for (i, (ann_user, _)) in enumerate(self.annotators):
# Every annotator should be able to complete every training document, even though
# max annotations per document is less than the total number of annotators
self.assertEqual(self.num_training_docs,
Expand All @@ -1623,6 +1625,7 @@ def test_annotations_per_doc_not_enforced_for_training_or_test(self):
self.assertEqual(self.num_training_docs,
self.proj.get_annotator_document_score(ann_user, DocumentType.TRAINING))

for (i, (ann_user, _)) in enumerate(self.annotators):
# Every annotator should be able to complete every test document, even though
# max annotations per document is less than the total number of annotators
self.assertEqual(self.num_test_docs,
Expand All @@ -1633,6 +1636,7 @@ def test_annotations_per_doc_not_enforced_for_training_or_test(self):
self.assertEqual(self.num_training_docs,
self.proj.get_annotator_document_score(ann_user, DocumentType.TRAINING))

for (i, (ann_user, _)) in enumerate(self.annotators):
# Now attempt to complete task normally
num_annotated = self.complete_annotations(self.num_docs, "Annotation", annotator=i)
docs_annotated_per_user.append(num_annotated)
Expand Down Expand Up @@ -1662,15 +1666,30 @@ def complete_annotations(self, num_annotations_to_complete, expected_doc_type_st

# Expect to get self.num_training_docs tasks
num_completed_tasks = 0
if expected_doc_type_str == 'Annotation':
all_docs = self.docs
elif expected_doc_type_str == 'Training':
all_docs = self.training_docs
else:
all_docs = self.test_docs

annotated_docs = {doc.pk: ' ' for doc in all_docs}
for i in range(num_annotations_to_complete):
task_context = get_annotation_task(ann_req)
if task_context:
self.assertEqual(expected_doc_type_str, task_context.get("document_type"),
f"Document type does not match in task {task_context!r}, " +
f"annotator {ann.username}, document {i}")
annotated_docs[task_context['document_id']] = "\u2714"
complete_annotation_task(ann_req, task_context["annotation_id"], {"sentiment": answer})
num_completed_tasks += 1

# Draw a nice markdown table of exactly which documents each annotator was given
if annotator == 0:
LOGGER.debug("Annotator | " + (" | ".join(str(i) for i in annotated_docs.keys())))
LOGGER.debug(" | ".join(["--"] * (len(annotated_docs)+1)))
LOGGER.debug(ann.username + " | " + (" | ".join(str(v) for v in annotated_docs.values())))

return num_completed_tasks

class TestAnnotationChange(TestEndpoint):
Expand Down

0 comments on commit a9c28d8

Please sign in to comment.