-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #166 from IndicoDataSolutions/result-file-dataclasses
Result File Dataclasses
- Loading branch information
Showing
35 changed files
with
6,494 additions
and
65 deletions.
There are no files selected for viewing
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
# CI workflow: installs the package and runs the test suite with coverage on
# every push and pull request targeting main, once per Python version in the
# matrix, then publishes a coverage report.
name: Python

on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]

jobs:
  Python:
    runs-on: ubuntu-22.04

    strategy:
      # Let every Python version finish even if one of them fails.
      fail-fast: false
      matrix:
        python-version: ["3.9", "3.10", "3.11"]

    env:
      API_TOKEN: ${{ secrets.TEST_API_TOKEN }}
      DATASET_ID: ${{ secrets.DATASET_ID }}
      PDF_DATASET_ID: ${{ secrets.PDF_DATASET_ID }}
      MODEL_NAME: ${{ secrets.MODEL_NAME }}
      WORKFLOW_ID: ${{ secrets.WORKFLOW_ID }}
      MODEL_ID: ${{ secrets.MODEL_ID }}
      MODEL_GROUP_ID: ${{ secrets.MODEL_GROUP_ID }}
      TEACH_TASK_ID: ${{ secrets.TEACH_TASK_ID }}
      HOST_URL: try.indico.io

    steps:
      - name: Checkout Commit
        uses: actions/checkout@v3

      - name: Install Python ${{ matrix.python-version }}
        uses: actions/setup-python@v3
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install Dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install -r requirements.txt
          # Quoted so the shell never treats the extras brackets as a glob.
          python -m pip install -e ".[full]"
          python -m pip install flake8 pytest pytest-cov

      - name: Run Tests And Build Coverage File
        id: tests
        run: |
          # The default shell is `bash -e` without pipefail, so the exit status
          # of the pipeline would be tee's and failing tests would pass the job.
          set -o pipefail
          pytest --junitxml=pytest.xml --cov-report=term-missing:skip-covered --cov=indico_toolkit tests/ | tee pytest-coverage.txt

      - name: Pytest Coverage Comment
        id: coverageComment
        # Still publish the report when tests fail, but not when the test step
        # never ran (e.g. dependency installation failed).
        if: ${{ !cancelled() && steps.tests.conclusion != 'skipped' }}
        uses: MishaKav/pytest-coverage-comment@main
        with:
          pytest-coverage-path: ./pytest-coverage.txt
          junitxml-path: ./pytest.xml
          title: Indico Toolkit Coverage Report
          badge-title: Test Coverage
          default-branch: main

      - name: Check The Output Coverage
        # Only echo the outputs when the coverage comment step produced them.
        if: ${{ !cancelled() && steps.coverageComment.conclusion == 'success' }}
        run: |
          echo "Coverage Percentage - ${{ steps.coverageComment.outputs.coverage }}"
          echo "Coverage Warnings - ${{ steps.coverageComment.outputs.warnings }}"
          echo "Coverage Errors - ${{ steps.coverageComment.outputs.errors }}"
          echo "Coverage Failures - ${{ steps.coverageComment.outputs.failures }}"
          echo "Coverage Skipped - ${{ steps.coverageComment.outputs.skipped }}"
          echo "Coverage Tests - ${{ steps.coverageComment.outputs.tests }}"
          echo "Coverage Time - ${{ steps.coverageComment.outputs.time }}"
          echo "Not Success Test Info - ${{ steps.coverageComment.outputs.notSuccessTestInfo }}"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
""" | ||
Minimal auto review example for single-document submissions. | ||
""" | ||
from operator import attrgetter | ||
from typing import Any | ||
|
||
from indico import IndicoClient | ||
from indico.filters import SubmissionFilter | ||
from indico.queries import ListSubmissions, RetrieveStorageObject, SubmitReview | ||
|
||
from indico_toolkit import results | ||
|
||
|
||
def autoreview(result: results.Result) -> Any:
    """
    Apply simple auto review rules to a submission.
    Assumes single-document submissions.

    The rules are applied to the pre-review extractions of `result`, in order:
    keep only the highest-confidence extraction per model and label, accept
    extractions that meet their label's confidence threshold, reject
    extractions selected by `max_confidence=0.75`, and normalize "Name"
    extractions.

    Returns the value of `pre_review.to_changes(result)`.
    """
    pre_review = result.pre_review
    extractions = pre_review.extractions

    # Downselect all labels from all models based on highest confidence.
    # The groups get their own names on purpose: rebinding `extractions` in the
    # loops would leave it pointing at the last group only, and every rule
    # below must run against the full list of pre-review extractions.
    for model_extractions in extractions.groupby(attrgetter("model")).values():
        by_label = model_extractions.groupby(attrgetter("label"))

        for label_extractions in by_label.values():
            # Order extractions by confidence descending.
            ordered = label_extractions.orderby(
                attrgetter("confidence"), reverse=True
            )
            ordered.reject()  # Reject all extractions.
            ordered[0].unreject()  # Unreject the highest confidence extraction.

    confidence_thresholds = {
        "From": 0.99,
        "To": 0.97,
        "Subject": 0.90,
        "Date": 0.99999,
    }

    # Auto accept predictions based on label's confidence threshold.
    for label, threshold in confidence_thresholds.items():
        extractions.where(label=label, min_confidence=threshold).accept()

    # Reject all predictions with confidence below 75%.
    extractions.where(max_confidence=0.75).reject()

    # Apply name normalization to all predictions with the "Name" label.
    extractions.where(label="Name").apply(normalize_name)

    return pre_review.to_changes(result)
|
||
|
||
def normalize_name(extraction: results.Extraction) -> None:
    """
    Rewrite an extraction's text from 'Last, First' to 'First Last' in place.

    Text that does not split into exactly two comma-separated parts is left
    unchanged.
    """
    parts = extraction.text.split(",")

    if len(parts) != 2:
        return

    surname, given = (part.strip() for part in parts)
    extraction.text = f"{given} {surname}"
|
||
|
||
if __name__ == "__main__":
    client = IndicoClient()

    # Query for the submissions of workflow 123 that are pending auto review.
    pending_auto_review = ListSubmissions(
        workflow_ids=[123],
        filters=SubmissionFilter(status="PENDING_AUTO_REVIEW"),
    )

    for submission in client.call(pending_auto_review):
        # Fetch the result file, auto review it, and submit the changes.
        raw_result = client.call(RetrieveStorageObject(submission.result_file))
        changes = autoreview(results.load(raw_result))
        client.call(SubmitReview(submission.id, changes))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,129 @@ | ||
""" | ||
Overview of dataclasses and functionality available in the results module. | ||
""" | ||
from operator import attrgetter | ||
from pathlib import Path | ||
|
||
from indico import IndicoClient | ||
from indico.queries import GetSubmission, RetrieveStorageObject | ||
|
||
from indico_toolkit import results | ||
|
||
""" | ||
Loading Result Files | ||
""" | ||
|
||
# Result files can be loaded as Python-native dataclasses from result dictionaries | ||
# returned by the Indico client, from JSON strings, and from JSON files on disk. | ||
client = IndicoClient() | ||
submission = client.call(GetSubmission(123)) | ||
result_dict = client.call(RetrieveStorageObject(submission.result_file)) | ||
result = results.load(result_dict) | ||
|
||
result = results.load("""{"file_version": 1, ... }""") | ||
|
||
for result_file in Path("results_folder").glob("*.json"): | ||
result = results.load(result_file) | ||
|
||
|
||
""" | ||
Example Results Traversal | ||
""" | ||
|
||
# Get the classification of a single-document submission that went through a | ||
# single-classification workflow. | ||
result.pre_review.classifications[0].label | ||
|
||
# Get the highest-confidence prediction for the Invoice Number field. | ||
invoice_numbers = result.pre_review.extractions.where(label="Invoice Number") | ||
invoice_number = invoice_numbers.orderby(attrgetter("confidence"), reverse=True)[0] | ||
invoice_number.text | ||
|
||
# Get all auto review predictions grouped by model. | ||
predictions_by_model = result.auto_review.groupby(attrgetter("model")) | ||
|
||
# Get all final extractions on page 5. | ||
result.final.extractions.where(predicate=lambda pred: pred.page == 5) | ||
|
||
|
||
""" | ||
Dataclass Reference | ||
""" | ||
|
||
# Result Dataclass | ||
result.id # Submission ID | ||
result.version # Result file version | ||
result.documents # List of documents in this submission | ||
result.models # List of models in this submission | ||
result.reviews # List of reviews for this submission | ||
result.rejected # Whether this submission was rejected in review | ||
|
||
result.predictions # List of all model predictions | ||
result.pre_review # List of raw model predictions | ||
result.auto_review # List of predictions for auto review | ||
result.manual_review # List of predictions for manual review | ||
result.admin_review # List of predictions for admin review | ||
result.final # List of final predictions | ||
|
||
|
||
# Review Dataclass | ||
if result.reviews: | ||
review = result.reviews[0] | ||
review.id | ||
review.reviewer_id | ||
review.notes | ||
review.rejected | ||
review.type | ||
|
||
|
||
# Document Dataclass | ||
document = result.documents[0] | ||
document.id | ||
document.name | ||
document.etl_output_url | ||
document.full_text_url | ||
|
||
|
||
# Prediction list Dataclass | ||
predictions = result.final | ||
predictions.classifications # List of all classification predictions | ||
predictions.extractions # List of all document extraction predictions | ||
predictions.form_extractions # List of all form extraction predictions | ||
predictions.unbundlings # List of all unbundling predictions | ||
|
||
predictions.apply() # Apply a function to all predictions | ||
predictions.groupby() # Group predictions into a dictionary by some attribute (e.g. label) | ||
predictions.orderby() # Sort predictions by some attribute (e.g. confidence) | ||
predictions.where() # Filter predictions by some predicate (e.g. model, label, confidence) | ||
predictions.to_changes(result) # Get this list of predictions as changes for `SubmitReview` | ||
|
||
predictions.extractions.accept() # Accept all extractions in this list (e.g. after filtering) | ||
predictions.extractions.reject() # Reject all extractions in this list (e.g. after filtering) | ||
predictions.extractions.unaccept() # Unaccept all extractions in this list (e.g. after filtering) | ||
predictions.extractions.unreject() # Unreject all extractions in this list (e.g. after filtering) | ||
|
||
|
||
# Prediction Dataclass | ||
prediction = predictions[0] | ||
prediction.document | ||
prediction.model | ||
prediction.label | ||
prediction.confidence # Confidence of the predicted label | ||
prediction.confidences # Confidences of all labels | ||
prediction.extras # Other attributes from the result file prediction dict that are not explicitly parsed | ||
|
||
|
||
# Extraction Dataclass (Subclass of Prediction) | ||
extraction = predictions.extractions[0] | ||
extraction.text | ||
extraction.start | ||
extraction.end | ||
extraction.page | ||
extraction.groups # Any linked label groups this prediction is a part of | ||
extraction.accepted | ||
extraction.rejected | ||
|
||
extraction.accept() # Mark this extraction as accepted for auto review | ||
extraction.reject() # Mark this extraction as rejected for auto review | ||
extraction.unaccept() # Mark this extraction as not accepted for auto review | ||
extraction.unreject() # Mark this extraction as not rejected for auto review |
Oops, something went wrong.