Skip to content

Commit

Permalink
Merge pull request #204 from IndicoDataSolutions/mawelborn/results-cleanup
Browse files (browse the repository at this point in the history)

Result File Dataclasses Maintenance
  • Loading branch information
mawelborn authored Nov 22, 2024
2 parents a755eac + 8204509 commit eb26c5f
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 61 deletions.
5 changes: 0 additions & 5 deletions indico_toolkit/results/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ class Document:
id: int
name: str
etl_output_url: str
full_text_url: str

# Auto review changes must reproduce all model sections that were present in the
# original result file. This may not be possible from the predictions alone--if a
Expand All @@ -26,14 +25,12 @@ def from_v1_dict(result: object) -> "Document":
document_results = get(result, dict, "results", "document", "results")
model_names = frozenset(document_results.keys())
etl_output_url = get(result, str, "etl_output")
full_text_url = etl_output_url.replace("etl_output.json", "full_text.txt")

return Document(
# v1 result files don't include document IDs or filenames.
id=None, # type: ignore[arg-type]
name=None, # type: ignore[arg-type]
etl_output_url=etl_output_url,
full_text_url=full_text_url,
_model_sections=model_names,
)

Expand All @@ -45,12 +42,10 @@ def from_v3_dict(document: object) -> "Document":
model_results = get(document, dict, "model_results", "ORIGINAL")
model_ids = frozenset(model_results.keys())
etl_output_url = get(document, str, "etl_output")
full_text_url = etl_output_url.replace("etl_output.json", "full_text.txt")

return Document(
id=get(document, int, "submissionfile_id"),
name=get(document, str, "input_filename"),
etl_output_url=etl_output_url,
full_text_url=full_text_url,
_model_sections=model_ids,
)
94 changes: 48 additions & 46 deletions indico_toolkit/results/normalization.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ def normalize_v1_result(result: "Any") -> None:
for prediction in review
if prediction is not None
)

for prediction in predictions:
# Predictions added in review lack a `confidence` section.
if "confidence" not in prediction:
Expand Down Expand Up @@ -90,52 +91,53 @@ def normalize_v3_result(result: "Any") -> None:
"""
Fix inconsistencies observed in v3 result files.
"""
for document in get(result, list, "submission_results"):
for review_results in get(document, dict, "model_results").values():
predictions: "Any" = (
prediction
for model_results in review_results.values()
for prediction in model_results
)
for prediction in predictions:
# Predictions added in review lack a `confidence` section.
if "confidence" not in prediction:
prediction["confidence"] = {prediction["label"]: 0}

# Document Extractions added in review may lack spans.
if (
"text" in prediction
and "type" not in prediction
and "spans" not in prediction
):
prediction["spans"] = [
{
"page_num": prediction["page_num"],
"start": 0,
"end": 0,
}
]

# Form Extractions added in review may lack bounding boxes.
if "type" in prediction and "top" not in prediction:
prediction["top"] = 0
prediction["left"] = 0
prediction["right"] = 0
prediction["bottom"] = 0

# Prior to 6.11, some Extractions lack a `normalized` section after
# review.
if "text" in prediction and "normalized" not in prediction:
prediction["normalized"] = {"formatted": prediction["text"]}

# Document Extractions that didn't go through a linked labels
# transformer lack a `groupings` section.
if (
"text" in prediction
and "type" not in prediction
and "groupings" not in prediction
):
prediction["groupings"] = []
predictions: "Any" = (
prediction
for submission_result in get(result, list, "submission_results")
for model_result in get(submission_result, dict, "model_results").values()
for review_result in model_result.values()
for prediction in review_result
)

for prediction in predictions:
# Predictions added in review lack a `confidence` section.
if "confidence" not in prediction:
prediction["confidence"] = {prediction["label"]: 0}

# Document Extractions added in review may lack spans.
if (
"text" in prediction
and "type" not in prediction
and "spans" not in prediction
):
prediction["spans"] = [
{
"page_num": prediction["page_num"],
"start": 0,
"end": 0,
}
]

# Form Extractions added in review may lack bounding boxes.
if "type" in prediction and "top" not in prediction:
prediction["top"] = 0
prediction["left"] = 0
prediction["right"] = 0
prediction["bottom"] = 0

# Prior to 6.11, some Extractions lack a `normalized` section after
# review.
if "text" in prediction and "normalized" not in prediction:
prediction["normalized"] = {"formatted": prediction["text"]}

# Document Extractions that didn't go through a linked labels
# transformer lack a `groupings` section.
if (
"text" in prediction
and "type" not in prediction
and "groupings" not in prediction
):
prediction["groupings"] = []

# Prior to 6.8, v3 result files don't include a `reviews` section.
if not has(result, dict, "reviews"):
Expand Down
18 changes: 9 additions & 9 deletions indico_toolkit/results/result.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,10 @@
class Result:
version: int
submission_id: int
documents: "list[Document]"
models: "list[ModelGroup]"
documents: "tuple[Document, ...]"
models: "tuple[ModelGroup, ...]"
predictions: "PredictionList[Prediction]"
reviews: "list[Review]"
reviews: "tuple[Review, ...]"

@property
def rejected(self) -> bool:
Expand Down Expand Up @@ -88,10 +88,10 @@ def from_v1_dict(result: object) -> "Result":
return Result(
version=version,
submission_id=submission_id,
documents=[document],
models=models,
documents=(document,),
models=tuple(models),
predictions=predictions,
reviews=sorted(reviews),
reviews=tuple(sorted(reviews)),
)

@staticmethod
Expand Down Expand Up @@ -141,8 +141,8 @@ def from_v3_dict(result: object) -> "Result":
return Result(
version=version,
submission_id=submission_id,
documents=documents,
models=models,
documents=tuple(documents),
models=tuple(models),
predictions=predictions,
reviews=reviews,
reviews=tuple(reviews),
)
1 change: 0 additions & 1 deletion tests/results/test_predictionlist.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ def document() -> Document:
id=2922,
name="1040_filled.tiff",
etl_output_url="indico-file:///storage/submission/2922/etl_output.json",
full_text_url="indico-file:///storage/submission/2922/full_text.txt",
_model_sections=frozenset({"124", "123", "122", "121"}),
)

Expand Down

0 comments on commit eb26c5f

Please sign in to comment.