Merge pull request #166 from IndicoDataSolutions/result-file-dataclasses

Result File Dataclasses
IndicoDataSolutions · Sep 27, 2024 · 4dd0202 · 4dd0202
2 parents 9f367ec + 5b3f6cc
commit 4dd0202
Show file tree

Hide file tree

Showing 35 changed files with 6,494 additions and 65 deletions.
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml
@@ -0,0 +1,68 @@
+name: Python
+
+on:
+  push:
+    branches: [ "main" ]
+  pull_request:
+    branches: [ "main" ]
+
+jobs:
+  Python:
+    runs-on: ubuntu-22.04
+
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.9", "3.10", "3.11"]
+
+    env:
+      API_TOKEN: ${{ secrets.TEST_API_TOKEN }}
+      DATASET_ID: ${{ secrets.DATASET_ID }}
+      PDF_DATASET_ID: ${{ secrets.PDF_DATASET_ID }}
+      MODEL_NAME: ${{ secrets.MODEL_NAME }}
+      WORKFLOW_ID: ${{ secrets.WORKFLOW_ID }}
+      MODEL_ID: ${{ secrets.MODEL_ID }} 
+      MODEL_GROUP_ID: ${{ secrets.MODEL_GROUP_ID }}
+      TEACH_TASK_ID: ${{ secrets.TEACH_TASK_ID }}
+      HOST_URL: try.indico.io
+
+    steps:
+    - name: Checkout Commit
+      uses: actions/checkout@v3
+
+    - name: Install Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v3
+      with:
+        python-version: ${{ matrix.python-version }}
+
+    - name: Install Dependencies
+      run: |
+        python -m pip install --upgrade pip
+        python -m pip install -r requirements.txt
+        python -m pip install -e .[full]
+        python -m pip install flake8 pytest pytest-cov
+
+    - name: Run Tests And Build Coverage File
+      run: |
+        # The default shell for `run` is `bash -e` without pipefail, so a pipeline
+        # reports only tee's exit status. Enable pipefail so that failing tests
+        # fail this step instead of being masked by the pipe.
+        set -o pipefail
+        pytest --junitxml=pytest.xml --cov-report=term-missing:skip-covered --cov=indico_toolkit tests/ | tee pytest-coverage.txt
+
+    - name: Pytest Coverage Comment
+      id: coverageComment
+      uses: MishaKav/pytest-coverage-comment@main
+      with:
+        pytest-coverage-path: ./pytest-coverage.txt
+        junitxml-path: ./pytest.xml
+        title: Indico Toolkit Coverage Report
+        badge-title: Test Coverage
+        default-branch: main
+
+    - name: Check The Output Coverage
+      run: |
+        echo "Coverage Percentage - ${{ steps.coverageComment.outputs.coverage }}"
+        echo "Coverage Warnings - ${{ steps.coverageComment.outputs.warnings }}"
+        echo "Coverage Errors - ${{ steps.coverageComment.outputs.errors }}"
+        echo "Coverage Failures - ${{ steps.coverageComment.outputs.failures }}"
+        echo "Coverage Skipped - ${{ steps.coverageComment.outputs.skipped }}"
+        echo "Coverage Tests - ${{ steps.coverageComment.outputs.tests }}"
+        echo "Coverage Time - ${{ steps.coverageComment.outputs.time }}"
+        echo "Not Success Test Info - ${{ steps.coverageComment.outputs.notSuccessTestInfo }}"
diff --git a/examples/results_autoreview.py b/examples/results_autoreview.py
@@ -0,0 +1,73 @@
+"""
+Minimal auto review example for single-document submissions.
+"""
+from operator import attrgetter
+from typing import Any
+
+from indico import IndicoClient
+from indico.filters import SubmissionFilter
+from indico.queries import ListSubmissions, RetrieveStorageObject, SubmitReview
+
+from indico_toolkit import results
+
+
+def autoreview(result: results.Result) -> Any:
+    """
+    Apply simple auto review rules to a submission.
+    Assumes single-document submissions.
+    """
+    pre_review = result.pre_review
+    extractions = pre_review.extractions
+
+    # Downselect all labels from all models based on highest confidence.
+    for model, extractions in extractions.groupby(attrgetter("model")).items():
+        for label, extractions in extractions.groupby(attrgetter("label")).items():
+            # Order extractions by confidence descending.
+            ordered = extractions.orderby(attrgetter("confidence"), reverse=True)
+            ordered.reject()  # Reject all extractions.
+            ordered[0].unreject()  # Unreject the highest confidence extraction.
+
+    confidence_thresholds = {
+        "From": 0.99,
+        "To": 0.97,
+        "Subject": 0.90,
+        "Date": 0.99999,
+    }
+
+    # Auto accept predictions based on label's confidence threshold.
+    for label, threshold in confidence_thresholds.items():
+        extractions.where(label=label, min_confidence=threshold).accept()
+
+    # Reject all predictions with confidence below 75%.
+    extractions.where(max_confidence=0.75).reject()
+
+    # Apply name normalization to all predictions with the "Name" label.
+    extractions.where(label="Name").apply(normalize_name)
+
+    return pre_review.to_changes(result)
+
+
+def normalize_name(extraction: results.Extraction) -> None:
+    """
+    Normalize 'Last, First' to 'First Last'.
+    """
+    names = extraction.text.split(",")
+
+    if len(names) == 2:
+        last, first = names
+        extraction.text = first.strip() + " " + last.strip()
+
+
+if __name__ == "__main__":
+    client = IndicoClient()
+
+    for submission in client.call(
+        ListSubmissions(
+            workflow_ids=[123],
+            filters=SubmissionFilter(status="PENDING_AUTO_REVIEW"),
+        )
+    ):
+        result_dict = client.call(RetrieveStorageObject(submission.result_file))
+        result = results.load(result_dict)
+        changes = autoreview(result)
+        client.call(SubmitReview(submission.id, changes))
diff --git a/examples/results_dataclasses.py b/examples/results_dataclasses.py
@@ -0,0 +1,129 @@
+"""
+Overview of dataclasses and functionality available in the results module.
+"""
+from operator import attrgetter
+from pathlib import Path
+
+from indico import IndicoClient
+from indico.queries import GetSubmission, RetrieveStorageObject
+
+from indico_toolkit import results
+
+"""
+Loading Result Files
+"""
+
+# Result files can be loaded as Python-native dataclasses from result dictionaries
+# returned by the Indico client, from JSON strings, and from JSON files on disk.
+client = IndicoClient()
+submission = client.call(GetSubmission(123))
+result_dict = client.call(RetrieveStorageObject(submission.result_file))
+result = results.load(result_dict)
+
+result = results.load("""{"file_version": 1, ... }""")
+
+for result_file in Path("results_folder").glob("*.json"):
+    result = results.load(result_file)
+
+
+"""
+Example Results Traversal
+"""
+
+# Get the classification of a single-document submission that went through a
+# single-classification workflow.
+result.pre_review.classifications[0].label
+
+# Get the highest-confidence prediction for the Invoice Number field.
+invoice_numbers = result.pre_review.extractions.where(label="Invoice Number")
+invoice_number = invoice_numbers.orderby(attrgetter("confidence"), reverse=True)[0]
+invoice_number.text
+
+# Get all auto review predictions grouped by model.
+predictions_by_model = result.auto_review.groupby(attrgetter("model"))
+
+# Get all final extractions on page 5.
+result.final.extractions.where(predicate=lambda pred: pred.page == 5)
+
+
+"""
+Dataclass Reference
+"""
+
+# Result Dataclass
+result.id  # Submission ID
+result.version  # Result file version
+result.documents  # List of documents in this submission
+result.models  # List of models in this submission
+result.reviews  # List of reviews for this submission
+result.rejected  # Whether this submission was rejected in review
+
+result.predictions  # List of all model predictions
+result.pre_review  # List of raw model predictions
+result.auto_review  # List of predictions for auto review
+result.manual_review  # List of predictions for manual review
+result.admin_review  # List of predictions for admin review
+result.final  # List of final predictions
+
+
+# Review Dataclass
+if result.reviews:
+    review = result.reviews[0]
+    review.id
+    review.reviewer_id
+    review.notes
+    review.rejected
+    review.type
+
+
+# Document Dataclass
+document = result.documents[0]
+document.id
+document.name
+document.etl_output_url
+document.full_text_url
+
+
+# Prediction list Dataclass
+predictions = result.final
+predictions.classifications  # List of all classification predictions
+predictions.extractions  # List of all document extraction predictions
+predictions.form_extractions  # List of all form extraction predictions
+predictions.unbundlings  # List of all unbundling predictions
+
+predictions.apply()  # Apply a function to all predictions
+predictions.groupby()  # Group predictions into a dictionary by some attribute (e.g. label)
+predictions.orderby()  # Sort predictions by some attribute (e.g. confidence)
+predictions.where()  # Filter predictions by some predicate (e.g. model, label, confidence)
+predictions.to_changes(result)  # Get this list of predictions as changes for `SubmitReview`
+
+predictions.extractions.accept()  # Accept all extractions in this list (e.g. after filtering)
+predictions.extractions.reject()  # Reject all extractions in this list (e.g. after filtering)
+predictions.extractions.unaccept()  # Unaccept all extractions in this list (e.g. after filtering)
+predictions.extractions.unreject()  # Unreject all extractions in this list (e.g. after filtering)
+
+
+# Prediction Dataclass
+prediction = predictions[0]
+prediction.document
+prediction.model
+prediction.label
+prediction.confidence  # Confidence of the predicted label
+prediction.confidences  # Confidences of all labels
+prediction.extras  # Other attributes from the result file prediction dict that are not explicitly parsed
+
+
+# Extraction Dataclass (Subclass of Prediction)
+extraction = predictions.extractions[0]
+extraction.text
+extraction.start
+extraction.end
+extraction.page
+extraction.groups  # Any linked label groups this prediction is a part of
+extraction.accepted
+extraction.rejected
+
+extraction.accept()  # Mark this extraction as accepted for auto review
+extraction.reject()  # Mark this extraction as rejected for auto review
+extraction.unaccept()  # Mark this extraction as not accepted for auto review
+extraction.unreject()  # Mark this extraction as not rejected for auto review