Merge pull request #166 from IndicoDataSolutions/result-file-dataclasses
Result File Dataclasses
mawelborn authored Sep 27, 2024
2 parents 9f367ec + 5b3f6cc commit 4dd0202
Showing 35 changed files with 6,494 additions and 65 deletions.
64 changes: 0 additions & 64 deletions .github/workflows/build.yml

This file was deleted.

68 changes: 68 additions & 0 deletions .github/workflows/python.yaml
@@ -0,0 +1,68 @@
name: Python

on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]

jobs:
  Python:
    runs-on: ubuntu-22.04

    strategy:
      fail-fast: false
      matrix:
        python-version: ["3.9", "3.10", "3.11"]

    env:
      API_TOKEN: ${{ secrets.TEST_API_TOKEN }}
      DATASET_ID: ${{ secrets.DATASET_ID }}
      PDF_DATASET_ID: ${{ secrets.PDF_DATASET_ID }}
      MODEL_NAME: ${{ secrets.MODEL_NAME }}
      WORKFLOW_ID: ${{ secrets.WORKFLOW_ID }}
      MODEL_ID: ${{ secrets.MODEL_ID }}
      MODEL_GROUP_ID: ${{ secrets.MODEL_GROUP_ID }}
      TEACH_TASK_ID: ${{ secrets.TEACH_TASK_ID }}
      HOST_URL: try.indico.io

    steps:
      - name: Checkout Commit
        uses: actions/checkout@v3

      - name: Install Python ${{ matrix.python-version }}
        uses: actions/setup-python@v3
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install Dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install -r requirements.txt
          python -m pip install -e .[full]
          python -m pip install flake8 pytest pytest-cov
      - name: Run Tests And Build Coverage File
        run: |
          pytest --junitxml=pytest.xml --cov-report=term-missing:skip-covered --cov=indico_toolkit tests/ | tee pytest-coverage.txt
      - name: Pytest Coverage Comment
        id: coverageComment
        uses: MishaKav/pytest-coverage-comment@main
        with:
          pytest-coverage-path: ./pytest-coverage.txt
          junitxml-path: ./pytest.xml
          title: Indico Toolkit Coverage Report
          badge-title: Test Coverage
          default-branch: main

      - name: Check The Output Coverage
        run: |
          echo "Coverage Percentage - ${{ steps.coverageComment.outputs.coverage }}"
          echo "Coverage Warnings - ${{ steps.coverageComment.outputs.warnings }}"
          echo "Coverage Errors - ${{ steps.coverageComment.outputs.errors }}"
          echo "Coverage Failures - ${{ steps.coverageComment.outputs.failures }}"
          echo "Coverage Skipped - ${{ steps.coverageComment.outputs.skipped }}"
          echo "Coverage Tests - ${{ steps.coverageComment.outputs.tests }}"
          echo "Coverage Time - ${{ steps.coverageComment.outputs.time }}"
          echo "Not Success Test Info - ${{ steps.coverageComment.outputs.notSuccessTestInfo }}"
73 changes: 73 additions & 0 deletions examples/results_autoreview.py
@@ -0,0 +1,73 @@
"""
Minimal auto review example for single-document submissions.
"""
from operator import attrgetter
from typing import Any

from indico import IndicoClient
from indico.filters import SubmissionFilter
from indico.queries import ListSubmissions, RetrieveStorageObject, SubmitReview

from indico_toolkit import results


def autoreview(result: results.Result) -> Any:
    """
    Apply simple auto review rules to a submission.
    Assumes single-document submissions.
    """
    pre_review = result.pre_review
    extractions = pre_review.extractions

    # For each model, keep only the highest-confidence extraction per label.
    for model, model_extractions in extractions.groupby(attrgetter("model")).items():
        for label, label_extractions in model_extractions.groupby(attrgetter("label")).items():
            # Order extractions by confidence, descending.
            ordered = label_extractions.orderby(attrgetter("confidence"), reverse=True)
            ordered.reject()  # Reject all extractions for this label.
            ordered[0].unreject()  # Unreject the highest-confidence extraction.

    confidence_thresholds = {
        "From": 0.99,
        "To": 0.97,
        "Subject": 0.90,
        "Date": 0.99999,
    }

    # Auto accept predictions based on label's confidence threshold.
    for label, threshold in confidence_thresholds.items():
        extractions.where(label=label, min_confidence=threshold).accept()

    # Reject all predictions with confidence below 75%.
    extractions.where(max_confidence=0.75).reject()

    # Apply name normalization to all predictions with the "Name" label.
    extractions.where(label="Name").apply(normalize_name)

    return pre_review.to_changes(result)


def normalize_name(extraction: results.Extraction) -> None:
    """
    Normalize 'Last, First' to 'First Last'.
    """
    names = extraction.text.split(",")

    if len(names) == 2:
        last, first = names
        extraction.text = first.strip() + " " + last.strip()

if __name__ == "__main__":
    client = IndicoClient()

    for submission in client.call(
        ListSubmissions(
            workflow_ids=[123],
            filters=SubmissionFilter(status="PENDING_AUTO_REVIEW"),
        )
    ):
        result_dict = client.call(RetrieveStorageObject(submission.result_file))
        result = results.load(result_dict)
        changes = autoreview(result)
        client.call(SubmitReview(submission.id, changes))
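
For local experimentation, the same rules can be dry-run against a result file saved to disk instead of a live submission. The sketch below is only an illustration: saved_result.json is a hypothetical path, and it reuses results.load and the autoreview function defined above without calling SubmitReview.

from pathlib import Path

from indico_toolkit import results

# Hypothetical path to a result file downloaded earlier.
result = results.load(Path("saved_result.json"))

# Apply the same rules locally and inspect the changes instead of submitting them.
changes = autoreview(result)
print(changes)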
129 changes: 129 additions & 0 deletions examples/results_dataclasses.py
@@ -0,0 +1,129 @@
"""
Overview of dataclasses and functionality available in the results module.
"""
from operator import attrgetter
from pathlib import Path

from indico import IndicoClient
from indico.queries import GetSubmission, RetrieveStorageObject

from indico_toolkit import results

"""
Loading Result Files
"""

# Result files can be loaded as Python-native dataclasses from result dictionaries
# returned by the Indico client, from JSON strings, and from JSON files on disk.
client = IndicoClient()
submission = client.call(GetSubmission(123))
result_dict = client.call(RetrieveStorageObject(submission.result_file))
result = results.load(result_dict)

result = results.load("""{"file_version": 1, ... }""")

for result_file in Path("results_folder").glob("*.json"):
    result = results.load(result_file)


"""
Example Results Traversal
"""

# Get the classification of a single-document submission that went through a
# single-classification workflow.
result.pre_review.classifications[0].label

# Get the highest-confidence prediction for the Invoice Number field.
invoice_numbers = result.pre_review.extractions.where(label="Invoice Number")
invoice_number = invoice_numbers.orderby(attrgetter("confidence"), reverse=True)[0]
invoice_number.text

# Get all auto review predictions grouped by model.
predictions_by_model = result.auto_review.groupby(attrgetter("model"))

# Get all final extractions on page 5.
result.final.extractions.where(predicate=lambda pred: pred.page == 5)


"""
Dataclass Reference
"""

# Result Dataclass
result.id # Submission ID
result.version # Result file version
result.documents # List of documents in this submission
result.models # List of models used in this submission
result.reviews # List of reviews for this submission
result.rejected # Whether this submission was rejected in review

result.predictions # List of all model predictions
result.pre_review # List of raw model predictions
result.auto_review # List of predictions for auto review
result.manual_review # List of predictions for manual review
result.admin_review # List of predictions for admin review
result.final # List of final predictions


# Review Dataclass
if result.reviews:
    review = result.reviews[0]
    review.id
    review.reviewer_id
    review.notes
    review.rejected
    review.type


# Document Dataclass
document = result.documents[0]
document.id
document.name
document.etl_output_url
document.full_text_url


# Prediction list Dataclass
predictions = result.final
predictions.classifications # List of all classification predictions
predictions.extractions # List of all document extraction predictions
predictions.form_extractions # List of all form extraction predictions
predictions.unbundlings # List of all unbundling predictions

predictions.apply() # Apply a function to all predictions
predictions.groupby() # Group predictions into a dictionary by some attribute (e.g. label)
predictions.orderby() # Sort predictions by some attribute (e.g. confidence)
predictions.where() # Filter predictions by some predicate (e.g. model, label, confidence)
predictions.to_changes(result) # Get this list of predictions as changes for `SubmitReview`

predictions.extractions.accept() # Accept all extractions in this list (e.g. after filtering)
predictions.extractions.reject() # Reject all extractions in this list (e.g. after filtering)
predictions.extractions.unaccept() # Unaccept all extractions in this list (e.g. after filtering)
predictions.extractions.unreject() # Unreject all extractions in this list (e.g. after filtering)
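
# A minimal sketch of chaining these methods together, using only calls shown
# elsewhere in this file and in examples/results_autoreview.py; the 0.75
# threshold and the printed fields are illustrative assumptions.
low_confidence = predictions.extractions.where(max_confidence=0.75)
for label, group in low_confidence.groupby(attrgetter("label")).items():
    # Print each low-confidence extraction for this label.
    group.apply(lambda extraction: print(label, extraction.text, extraction.confidence))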


# Prediction Dataclass
prediction = predictions[0]
prediction.document
prediction.model
prediction.label
prediction.confidence # Confidence of the predicted label
prediction.confidences # Confidences of all labels
prediction.extras # Other attributes from the result file prediction dict that are not explicitly parsed


# Extraction Dataclass (Subclass of Prediction)
extraction = predictions.extractions[0]
extraction.text
extraction.start
extraction.end
extraction.page
extraction.groups # Any linked label groups this prediction is a part of
extraction.accepted
extraction.rejected

extraction.accept() # Mark this extraction as accepted for auto review
extraction.reject() # Mark this extraction as rejected for auto review
extraction.unaccept() # Mark this extraction as not accepted for auto review
extraction.unreject() # Mark this extraction as not rejected for auto review
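
# A minimal sketch combining the attributes above: summarize final extractions
# by page. Grouping by "page" assumes groupby accepts any prediction attribute
# via attrgetter, as the reference above suggests.
for page, page_extractions in result.final.extractions.groupby(attrgetter("page")).items():
    texts = [extraction.text for extraction in page_extractions]
    print(f"page {page}: {texts}")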