Merge pull request #204 from IndicoDataSolutions/mawelborn/results-cleanup

Result File Dataclasses Maintenance
IndicoDataSolutions · Nov 22, 2024 · eb26c5f · eb26c5f
2 parents a755eac + 8204509
commit eb26c5f
Show file tree

Hide file tree

Showing 4 changed files with 57 additions and 61 deletions.
diff --git a/indico_toolkit/results/document.py b/indico_toolkit/results/document.py
@@ -8,7 +8,6 @@ class Document:
     id: int
     name: str
     etl_output_url: str
-    full_text_url: str
 
     # Auto review changes must reproduce all model sections that were present in the
     # original result file. This may not be possible from the predictions alone--if a
@@ -26,14 +25,12 @@ def from_v1_dict(result: object) -> "Document":
         document_results = get(result, dict, "results", "document", "results")
         model_names = frozenset(document_results.keys())
         etl_output_url = get(result, str, "etl_output")
-        full_text_url = etl_output_url.replace("etl_output.json", "full_text.txt")
 
         return Document(
             # v1 result files don't include document IDs or filenames.
             id=None,  # type: ignore[arg-type]
             name=None,  # type: ignore[arg-type]
             etl_output_url=etl_output_url,
-            full_text_url=full_text_url,
             _model_sections=model_names,
         )
 
@@ -45,12 +42,10 @@ def from_v3_dict(document: object) -> "Document":
         model_results = get(document, dict, "model_results", "ORIGINAL")
         model_ids = frozenset(model_results.keys())
         etl_output_url = get(document, str, "etl_output")
-        full_text_url = etl_output_url.replace("etl_output.json", "full_text.txt")
 
         return Document(
             id=get(document, int, "submissionfile_id"),
             name=get(document, str, "input_filename"),
             etl_output_url=etl_output_url,
-            full_text_url=full_text_url,
             _model_sections=model_ids,
         )
diff --git a/indico_toolkit/results/normalization.py b/indico_toolkit/results/normalization.py
@@ -37,6 +37,7 @@ def normalize_v1_result(result: "Any") -> None:
             for prediction in review
             if prediction is not None
         )
+
         for prediction in predictions:
             # Predictions added in review lack a `confidence` section.
             if "confidence" not in prediction:
@@ -90,52 +91,53 @@ def normalize_v3_result(result: "Any") -> None:
     """
     Fix inconsistencies observed in v3 result files.
     """
-    for document in get(result, list, "submission_results"):
-        for review_results in get(document, dict, "model_results").values():
-            predictions: "Any" = (
-                prediction
-                for model_results in review_results.values()
-                for prediction in model_results
-            )
-            for prediction in predictions:
-                # Predictions added in review lack a `confidence` section.
-                if "confidence" not in prediction:
-                    prediction["confidence"] = {prediction["label"]: 0}
-
-                # Document Extractions added in review may lack spans.
-                if (
-                    "text" in prediction
-                    and "type" not in prediction
-                    and "spans" not in prediction
-                ):
-                    prediction["spans"] = [
-                        {
-                            "page_num": prediction["page_num"],
-                            "start": 0,
-                            "end": 0,
-                        }
-                    ]
-
-                # Form Extractions added in review may lack bounding boxes.
-                if "type" in prediction and "top" not in prediction:
-                    prediction["top"] = 0
-                    prediction["left"] = 0
-                    prediction["right"] = 0
-                    prediction["bottom"] = 0
-
-                # Prior to 6.11, some Extractions lack a `normalized` section after
-                # review.
-                if "text" in prediction and "normalized" not in prediction:
-                    prediction["normalized"] = {"formatted": prediction["text"]}
-
-                # Document Extractions that didn't go through a linked labels
-                # transformer lack a `groupings` section.
-                if (
-                    "text" in prediction
-                    and "type" not in prediction
-                    and "groupings" not in prediction
-                ):
-                    prediction["groupings"] = []
+    predictions: "Any" = (
+        prediction
+        for submission_result in get(result, list, "submission_results")
+        for model_result in get(submission_result, dict, "model_results").values()
+        for review_result in model_result.values()
+        for prediction in review_result
+    )
+
+    for prediction in predictions:
+        # Predictions added in review lack a `confidence` section.
+        if "confidence" not in prediction:
+            prediction["confidence"] = {prediction["label"]: 0}
+
+        # Document Extractions added in review may lack spans.
+        if (
+            "text" in prediction
+            and "type" not in prediction
+            and "spans" not in prediction
+        ):
+            prediction["spans"] = [
+                {
+                    "page_num": prediction["page_num"],
+                    "start": 0,
+                    "end": 0,
+                }
+            ]
+
+        # Form Extractions added in review may lack bounding boxes.
+        if "type" in prediction and "top" not in prediction:
+            prediction["top"] = 0
+            prediction["left"] = 0
+            prediction["right"] = 0
+            prediction["bottom"] = 0
+
+        # Prior to 6.11, some Extractions lack a `normalized` section after
+        # review.
+        if "text" in prediction and "normalized" not in prediction:
+            prediction["normalized"] = {"formatted": prediction["text"]}
+
+        # Document Extractions that didn't go through a linked labels
+        # transformer lack a `groupings` section.
+        if (
+            "text" in prediction
+            and "type" not in prediction
+            and "groupings" not in prediction
+        ):
+            prediction["groupings"] = []
 
     # Prior to 6.8, v3 result files don't include a `reviews` section.
     if not has(result, dict, "reviews"):

diff --git a/indico_toolkit/results/result.py b/indico_toolkit/results/result.py
@@ -19,10 +19,10 @@
 class Result:
     version: int
     submission_id: int
-    documents: "list[Document]"
-    models: "list[ModelGroup]"
+    documents: "tuple[Document, ...]"
+    models: "tuple[ModelGroup, ...]"
     predictions: "PredictionList[Prediction]"
-    reviews: "list[Review]"
+    reviews: "tuple[Review, ...]"
 
     @property
     def rejected(self) -> bool:
@@ -88,10 +88,10 @@ def from_v1_dict(result: object) -> "Result":
         return Result(
             version=version,
             submission_id=submission_id,
-            documents=[document],
-            models=models,
+            documents=(document,),
+            models=tuple(models),
             predictions=predictions,
-            reviews=sorted(reviews),
+            reviews=tuple(sorted(reviews)),
         )
 
     @staticmethod
@@ -141,8 +141,8 @@ def from_v3_dict(result: object) -> "Result":
         return Result(
             version=version,
             submission_id=submission_id,
-            documents=documents,
-            models=models,
+            documents=tuple(documents),
+            models=tuple(models),
             predictions=predictions,
-            reviews=reviews,
+            reviews=tuple(reviews),
         )
diff --git a/tests/results/test_predictionlist.py b/tests/results/test_predictionlist.py
@@ -21,7 +21,6 @@ def document() -> Document:
         id=2922,
         name="1040_filled.tiff",
         etl_output_url="indico-file:///storage/submission/2922/etl_output.json",
-        full_text_url="indico-file:///storage/submission/2922/full_text.txt",
         _model_sections=frozenset({"124", "123", "122", "121"}),
     )