diff --git a/backend/models.py b/backend/models.py index d8e44ed1..24ffc0b6 100644 --- a/backend/models.py +++ b/backend/models.py @@ -980,22 +980,26 @@ def get_doc_annotation_dict(self, json_format="raw", anonymize=True): if json_format == "raw" or json_format == "csv": doc_dict = self.data.copy() elif json_format == "gate": + # GATE json format are expected to have an existing "features" field + features_dict = dict(self.data["features"]) if "features" in self.data and isinstance(self.data["features"], dict) else {} - ignore_keys = {"text", self.project.document_id_field} - features_dict = {key: value for key, value in self.data.items() if key not in ignore_keys} + # Add any non-compliant top-level fields into the "features" field instead + ignore_keys = {"text", "features", "offset_type", "annotation_sets", self.project.document_id_field} + features_dict.update({key: value for key, value in self.data.items() if key not in ignore_keys}) doc_dict = { "text": self.data["text"], "features": features_dict, - "offset_type": "p", + "offset_type": self.data["offset_type"] if "offset_type" in self.data else "p", # Use original offset type "name": get_value_from_key_path(self.data, self.project.document_id_field) } # Insert annotation sets into the doc dict annotations = self.annotations.filter(status=Annotation.COMPLETED) if json_format == "csv": + # Gets pre-existing annotations + annotation_sets = dict(self.data["annotations"]) if "annotations" in self.data else {} # Format annotations for CSV export - annotation_sets = {} for annotation in annotations: a_data = annotation.data annotation_dict = {} @@ -1008,19 +1012,21 @@ def get_doc_annotation_dict(self, json_format="raw", anonymize=True): annotation_dict["duration_seconds"] = annotation.time_to_complete if anonymize: - annotation_sets[str(annotation.user.id)] = annotation_dict + annotation_sets[f"{settings.ANONYMIZATION_PREFIX}{annotation.user.id}"] = annotation_dict else: annotation_sets[annotation.user.username] = 
annotation_dict doc_dict["annotations"] = annotation_sets else: + # Gets pre-existing annotations + annotation_sets = dict(self.data["annotation_sets"]) if "annotation_sets" in self.data else {} # Format for JSON in line with GATE formatting - annotation_sets = {} for annotation in annotations: a_data = annotation.data + anonymized_name = f"{settings.ANONYMIZATION_PREFIX}{annotation.user.id}" annotation_set = { - "name": annotation.user.id if anonymize else annotation.user.username, + "name": anonymized_name if anonymize else annotation.user.username, "annotations": [ { "type": "Document", @@ -1028,14 +1034,13 @@ def get_doc_annotation_dict(self, json_format="raw", anonymize=True): "end": 0, "id": 0, "duration_seconds": annotation.time_to_complete, - "features": { - "label": a_data - } + "features": a_data } ], "next_annid": 1, } - annotation_sets[annotation.user.username] = annotation_set + annotation_sets[anonymized_name if anonymize else annotation.user.username] = annotation_set + doc_dict["annotation_sets"] = annotation_sets # Add to the export the lists (possibly empty) of users who rejected, @@ -1047,7 +1052,7 @@ def get_doc_annotation_dict(self, json_format="raw", anonymize=True): ("aborted", Annotation.ABORTED), ]: teamware_status[key] = [ - annotation.user.id if anonymize else annotation.user.username + f"{settings.ANONYMIZATION_PREFIX}{annotation.user.id}" if anonymize else annotation.user.username for annotation in self.annotations.filter(status=status) ] if json_format == "csv": diff --git a/backend/tests/test_models.py b/backend/tests/test_models.py index 2002937e..2758df2c 100644 --- a/backend/tests/test_models.py +++ b/backend/tests/test_models.py @@ -1098,10 +1098,11 @@ def test_get_annotations_for_user_in_project(self): class TestDocumentAnnotationModelExport(TestCase): def setUp(self): + self.unanonymized_prefix = "namedperson" self.test_user = get_user_model().objects.create(username="project_creator") - self.annotator_names = [f"anno{i}" for i 
in range(3)] + self.annotator_names = [f"{self.unanonymized_prefix}{i}" for i in range(3)] self.annotators = [get_user_model().objects.create(username=u) for u in self.annotator_names] - self.annotator_ids = [a.id for a in self.annotators] + self.anon_annotator_names = [f"{settings.ANONYMIZATION_PREFIX}{a.id}" for a in self.annotators] self.project = Project.objects.create(owner=self.test_user) for i in range(10): document = Document.objects.create( @@ -1112,6 +1113,55 @@ def setUp(self): "feature1": "Testvalue 1", "feature2": "Testvalue 1", "feature3": "Testvalue 1", + "features": { + "gate_format_feature1": "Gate feature test value", + "gate_format_feature2": "Gate feature test value", + "gate_format_feature3": "Gate feature test value", + }, + "offset_type": "x", + "annotations": { + "existing_annotator1": { + "sentiment": "positive" + }, + f"{settings.ANONYMIZATION_PREFIX}{self.annotators[0].pk}": { + "sentiment": "positive" + } + + }, + "annotation_sets": { + "existing_annotator1": { + "name": "existing_annotator1", + "annotations": [ + { + "type": "Document", + "start": 0, + "end": 10, + "id": 0, + "features": { + "sentiment": "positive" + } + } + ], + "next_annid": 1 + }, + f"{settings.ANONYMIZATION_PREFIX}{self.annotators[0].pk}": { + "name": f"{settings.ANONYMIZATION_PREFIX}{self.annotators[0].pk}", + "annotations": [ + { + "type": "Document", + "start": 0, + "end": 10, + "id": 0, + "features": { + "sentiment": "positive" + } + } + ], + "next_annid": 1 + } + + } + } ) @@ -1147,6 +1197,8 @@ def setUp(self): def test_export_raw(self): for document in self.project.documents.all(): + # Fields should remain exactly the same as what's been uploaded + # aside from annotation_sets doc_dict = document.get_doc_annotation_dict("raw") print(doc_dict) self.assertTrue("id" in doc_dict) @@ -1154,46 +1206,79 @@ def test_export_raw(self): self.assertTrue("feature1" in doc_dict) self.assertTrue("feature2" in doc_dict) self.assertTrue("feature3" in doc_dict) + 
self.assertTrue("features" in doc_dict) + self.assertTrue("offset_type" in doc_dict) + self.assertTrue("annotations" in doc_dict) + doc_features = doc_dict["features"] + self.assertTrue("gate_format_feature1" in doc_features) + self.assertTrue("gate_format_feature2" in doc_features) + self.assertTrue("gate_format_feature3" in doc_features) + self.check_raw_gate_annotation_formatting(doc_dict) - self.check_teamware_status(doc_dict, self.annotator_ids) + self.check_teamware_status(doc_dict, self.anon_annotator_names) def test_export_gate(self): for document in self.project.documents.all(): + # All top-level fields apart from name, text, features and annotation_sets should be + # nested inside the features field doc_dict = document.get_doc_annotation_dict("gate") print(doc_dict) self.assertTrue("text" in doc_dict) self.assertTrue("features" in doc_dict) + self.assertFalse("annotations" in doc_dict) + self.assertEqual(doc_dict["offset_type"], "x") doc_features = doc_dict["features"] self.assertTrue("id" in doc_features) self.assertTrue("feature1" in doc_features) self.assertTrue("feature2" in doc_features) self.assertTrue("feature3" in doc_features) + self.assertTrue("annotations" in doc_features) + self.assertFalse("features" in doc_features, "Double nesting of features field") + self.assertFalse("offset_type" in doc_features, "Double nesting of offset_type field") + self.assertTrue("gate_format_feature1" in doc_features) + self.assertTrue("gate_format_feature2" in doc_features) + self.assertTrue("gate_format_feature3" in doc_features) self.check_raw_gate_annotation_formatting(doc_dict) - self.check_teamware_status(doc_features, self.annotator_ids) + self.check_teamware_status(doc_features, self.anon_annotator_names) + + def test_export_gate_with_no_offset_type(self): - def check_raw_gate_annotation_formatting(self, doc_dict): + for document in self.project.documents.all(): + document.data.pop("offset_type") + + doc_dict = document.get_doc_annotation_dict("gate") + 
self.assertEqual(doc_dict["offset_type"], "p", "offset_type should default to p") + + + def check_raw_gate_annotation_formatting(self, doc_dict: dict): self.assertTrue("annotation_sets" in doc_dict) - self.assertTrue(len(doc_dict["annotation_sets"]) == 3) + self.assertEqual(len(doc_dict["annotation_sets"]), 4, doc_dict) # Test annotation formatting for aset_key, aset_data in doc_dict["annotation_sets"].items(): - self.assertTrue("name" in aset_data) - self.assertTrue("annotations" in aset_data) - self.assertEqual(len(aset_data["annotations"]), 1) - anno_dict = aset_data["annotations"][0] - self.assertTrue("type" in anno_dict) - self.assertTrue("start" in anno_dict) - self.assertTrue("end" in anno_dict) - self.assertTrue("id" in anno_dict) - self.assertTrue("features" in anno_dict) - self.assertTrue("label" in anno_dict["features"]) - label_dict = anno_dict["features"]["label"] - self.assertTrue("text1" in label_dict) - self.assertTrue("checkbox1" in label_dict) + if aset_key != "existing_annotator1": + self.assertTrue("name" in aset_data) + self.assertTrue("annotations" in aset_data) + self.assertEqual(len(aset_data["annotations"]), 1) + anno_dict = aset_data["annotations"][0] + self.assertTrue("type" in anno_dict) + self.assertTrue("start" in anno_dict) + self.assertTrue("end" in anno_dict) + self.assertTrue("id" in anno_dict) + self.assertTrue("features" in anno_dict) + features_dict = anno_dict["features"] + self.assertTrue("text1" in features_dict) + self.assertTrue("checkbox1" in features_dict) + else: + # Check that existing annotation from document upload is carried over + self.assertEqual(aset_data["annotations"][0]["features"]["sentiment"], "positive") + + + def check_teamware_status(self, containing_dict, expected_value): self.assertTrue("teamware_status" in containing_dict) @@ -1219,31 +1304,44 @@ def test_export_csv(self): self.assertTrue("feature2" in doc_dict) self.assertTrue("feature3" in doc_dict) self.assertTrue("annotations" in doc_dict) - 
self.assertTrue(len(doc_dict["annotations"]) == 3) + self.assertEqual(len(doc_dict["annotations"]), 4, doc_dict) anno_set_dict = doc_dict["annotations"] for set_key in anno_set_dict: - self.assertTrue(isinstance(anno_set_dict[set_key]["text1"], str)) - self.assertTrue(isinstance(anno_set_dict[set_key]["checkbox1"], str)) + if set_key != "existing_annotator1": + self.assertTrue(isinstance(anno_set_dict[set_key]["text1"], str)) + self.assertTrue(isinstance(anno_set_dict[set_key]["checkbox1"], str)) + else: + self.assertEqual(anno_set_dict[set_key]["sentiment"], "positive") - self.check_teamware_status(doc_dict, ",".join(str(i) for i in self.annotator_ids)) + self.check_teamware_status(doc_dict, ",".join(str(i) for i in self.anon_annotator_names)) def test_export_raw_anonymized(self): for document in self.project.documents.all(): + # Mask any existing annotations that came with the document upload + document.data.pop("annotation_sets") + document.save() + doc_dict = document.get_doc_annotation_dict("raw", anonymize=True) for aset_key, aset_data in doc_dict["annotation_sets"].items(): - self.assertTrue(isinstance(aset_data.get("name", None), int)) + self.assertFalse(aset_key.startswith(self.unanonymized_prefix)) + self.assertFalse(aset_data.get("name", None).startswith(self.unanonymized_prefix)) - self.check_teamware_status(doc_dict, self.annotator_ids) + self.check_teamware_status(doc_dict, self.anon_annotator_names) def test_export_raw_deanonymized(self): for document in self.project.documents.all(): + # Mask any existing annotations that came with the document upload + document.data.pop("annotation_sets") + document.save() + doc_dict = document.get_doc_annotation_dict("raw", anonymize=False) for aset_key, aset_data in doc_dict["annotation_sets"].items(): - self.assertTrue(isinstance(aset_data.get("name", None), str)) + self.assertTrue(aset_key.startswith(self.unanonymized_prefix)) + self.assertTrue(aset_data.get("name", None).startswith(self.unanonymized_prefix)) # 
for non-anonymized export the rejected/aborted/timed_out status # uses names rather than ID numbers @@ -1252,20 +1350,30 @@ def test_export_raw_deanonymized(self): def test_export_gate_anonymized(self): for document in self.project.documents.all(): + # Mask any existing annotations that came with the document upload + document.data.pop("annotation_sets") + document.save() + doc_dict = document.get_doc_annotation_dict("gate", anonymize=True) for aset_key, aset_data in doc_dict["annotation_sets"].items(): - self.assertTrue(isinstance(aset_data.get("name", None), int)) + self.assertFalse(aset_key.startswith(self.unanonymized_prefix)) + self.assertFalse(aset_data.get("name", None).startswith(self.unanonymized_prefix)) - self.check_teamware_status(doc_dict["features"], self.annotator_ids) + self.check_teamware_status(doc_dict["features"], self.anon_annotator_names) def test_export_gate_deanonymized(self): for document in self.project.documents.all(): + # Mask any existing annotations that came with the document upload + document.data.pop("annotation_sets") + document.save() + doc_dict = document.get_doc_annotation_dict("gate", anonymize=False) for aset_key, aset_data in doc_dict["annotation_sets"].items(): - self.assertTrue(isinstance(aset_data.get("name", None), str)) + self.assertTrue(aset_key.startswith(self.unanonymized_prefix)) + self.assertTrue(aset_data.get("name", None).startswith(self.unanonymized_prefix)) # for non-anonymized export the rejected/aborted/timed_out status # uses names rather than ID numbers diff --git a/docs/docs/manageradminguide/documents_annotations_management.md b/docs/docs/manageradminguide/documents_annotations_management.md index c73fbb5a..7b852340 100644 --- a/docs/docs/manageradminguide/documents_annotations_management.md +++ b/docs/docs/manageradminguide/documents_annotations_management.md @@ -187,12 +187,11 @@ possible to determine which documents were annotated by _the same person_, just You can choose how documents are exported: * 
`.json` & `.jsonl` - JSON or JSON Lines files can be generated in the format of: - * `raw` - Exports unmodified JSON. If you've originally uploaded in GATE format then choose this option. - - An additional field named `annotation_sets` is added for storing annotations. The annotations are laid out in the - same way as GATE JSON format. For example if a document has been annotated by `user1` with labels and values - `text`:`Annotation text`, `radio`:`val3`, and `checkbox`:`["val2", "val4"]`, the non-anonymous export might look - like this: + * `raw` - Exports the original `JSON` combined with an additional field named `annotation_sets` for storing + annotations. The annotations are laid out in the same way as GATE + [bdocjs](https://gatenlp.github.io/gateplugin-Format_Bdoc/bdoc_document.html) format. For example if a document + has been annotated by `user1` with labels and values `text`:`Annotation text`, `radio`:`val3`, and + `checkbox`:`["val2", "val4"]`, the non-anonymous export might look like this: ```json { @@ -210,14 +209,12 @@ You can choose how documents are exported: "end":10, "id":0, "features":{ - "label":{ - "text":"Annotation text", - "radio":"val3", - "checkbox":[ - "val2", - "val4" - ] - } + "text":"Annotation text", + "radio":"val3", + "checkbox":[ + "val2", + "val4" + ] } } ], @@ -232,16 +229,18 @@ You can choose how documents are exported: } ``` - In anonymous mode the name `user1` would instead be the user's opaque numeric identifier (e.g. `105`). + In anonymous mode the name `user1` would instead be derived from the user's opaque numeric identifier (e.g. + `annotator105`). 
- The field `teamware_status` gives the ids or usernames (depending on the "anonymize" setting) of those annotators + The field `teamware_status` gives the usernames or anonymous IDs (depending on the "anonymize" setting) of those annotators who rejected the document, "timed out" because they did not complete their annotation in the time allowed by the project, or "aborted" for some other reason (e.g. they were removed from the project). - * `gate` - Convert documents to GATE JSON format and export. A `name` field is added that takes the ID value from the - ID field specified in the project configuration. Fields apart from `text` and the ID field specified in the project - config are placed in the `features` field, as is the `teamware_status` information. An `annotation_sets` field is - added for storing annotations. + * `gate` - Convert documents to GATE [bdocjs](https://gatenlp.github.io/gateplugin-Format_Bdoc/bdoc_document.html) + format and export. A `name` field is added that takes the `ID` value from the `ID field` specified in the + **project configuration**. Any top-level fields apart from `text`, `features`, `offset_type`, `annotation_sets`, + and the ID field specified in the project config are placed in the `features` field, as is the `teamware_status` + information. An `annotation_sets` field is added for storing annotations if it doesn't already exist. For example in the case of this uploaded JSON document: ```json @@ -271,6 +270,9 @@ You can choose how documents are exported: columns with the header of `annotations.username.label` and the status information is in columns named `teamware_status.rejected_by`, `teamware_status.timed_out` and `teamware_status.aborted`. +**Note: Documents that contain existing annotations (i.e. the `annotation_sets` field for `JSON` or `annotations` for `CSV`) are merged with the new sets of annotations. 
Be aware that if the document has a new annotation from an annotator with the same +username, the previous annotation will be overwritten. Existing annotations are also not anonymized when exporting the document.** + ## Deleting documents and annotations It is possible to click on the top left of corner of documents and annotations to select it, then click on the diff --git a/teamware/settings/base.py b/teamware/settings/base.py index fccf1e4f..66b0fe46 100644 --- a/teamware/settings/base.py +++ b/teamware/settings/base.py @@ -265,6 +265,11 @@ DELETED_USER_LASTNAME = "Deleted" DELETED_USER_EMAIL_DOMAIN = "teamware-deleted.com" +""" +Anonymization settings +""" +ANONYMIZATION_PREFIX = "annotator" + """ Frontend dev server configuration """