Skip to content

Commit

Permalink
Merge pull request #117 from ArneBinder/separate_dataset_tests
Browse files Browse the repository at this point in the history
separate test workflow for datasets
  • Loading branch information
ArneBinder authored Apr 15, 2024
2 parents 16410c9 + 195642e commit b1f4d92
Show file tree
Hide file tree
Showing 5 changed files with 305 additions and 2 deletions.
12 changes: 10 additions & 2 deletions .github/workflows/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,16 @@ name: Tests
on:
push:
branches: [main]
paths-ignore:
- "dataset_builders/**"
- "tests/dataset_builders/**"
- "tests/fixtures/dataset_builders/**"
pull_request:
branches: [main, "release/*"]
paths-ignore:
- "dataset_builders/**"
- "tests/dataset_builders/**"
- "tests/fixtures/dataset_builders/**"

jobs:
tests:
Expand All @@ -17,7 +25,7 @@ jobs:
os: ["ubuntu-latest"]
python-version: ["3.9"]

timeout-minutes: 30
timeout-minutes: 10

steps:
#----------------------------------------------
Expand Down Expand Up @@ -70,7 +78,7 @@ jobs:
- name: Run tests with coverage
run: |
source .venv/bin/activate
pytest -k "not slow" --cov --cov-report term-missing --cov-report xml:coverage.xml
pytest --ignore=tests/dataset_builders -k "not slow" --cov=src --cov-report term-missing --cov-report xml:coverage.xml
- name: Upload coverage reports to Codecov
uses: codecov/codecov-action@v3
with:
Expand Down
91 changes: 91 additions & 0 deletions .github/workflows/test_datasets.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@

name: Test Datasets

on:
push:
branches: [main]
paths:
- "src/dataset_builders/**"
- "data/datasets/**"
- "tests/dataset_builders/**"
- "tests/fixtures/dataset_builders/**"
- ".github/workflows/test_datasets.yaml"
pull_request:
branches: [main, "release/*"]
paths:
- "src/dataset_builders/**"
- "data/datasets/**"
- "tests/dataset_builders/**"
- "tests/fixtures/dataset_builders/**"
- ".github/workflows/test_datasets.yaml"

jobs:
tests:
runs-on: ${{ matrix.os }}

strategy:
fail-fast: false
matrix:
os: ["ubuntu-latest"]
python-version: ["3.9"]

timeout-minutes: 30

steps:
#----------------------------------------------
# check-out repo and set-up python
#----------------------------------------------
- name: Check out repository
uses: actions/checkout@v3
- name: Set up python
id: setup-python
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}

#----------------------------------------------
# ----- install & configure poetry -----
#----------------------------------------------
- name: Install Poetry
uses: snok/install-poetry@v1
with:
virtualenvs-create: true
virtualenvs-in-project: true
installer-parallel: true

#----------------------------------------------
# load cached venv if cache exists
#----------------------------------------------
- name: Load cached venv
id: cached-poetry-dependencies
uses: actions/cache@v3
with:
path: .venv
key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}

#----------------------------------------------
# install dependencies if cache does not exist
#----------------------------------------------
- name: Install dependencies
if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
run: poetry install --no-interaction --no-root

#----------------------------------------------
# install your root project, if required
#----------------------------------------------
- name: Install project
run: poetry install --no-interaction

#----------------------------------------------
# run test suite and upload coverage data
#----------------------------------------------
- name: Run tests with coverage
run: |
source .venv/bin/activate
pytest tests/dataset_builders -k "not slow" --cov=dataset_builders --cov-report term-missing --cov-report xml:coverage_datasets.xml
- name: Upload coverage reports to Codecov
uses: codecov/codecov-action@v3
with:
files: ./coverage_datasets.xml
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
6 changes: 6 additions & 0 deletions src/pie_datasets/builders/brat.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import datasets
from pie_modules.annotations import BinaryRelation, LabeledMultiSpan, LabeledSpan
from pytorch_ie import Document
from pytorch_ie.core import Annotation, AnnotationList, annotation_field
from pytorch_ie.documents import TextBasedDocument

Expand Down Expand Up @@ -305,3 +306,8 @@ def _generate_document(self, example, **kwargs):
return example_to_document(
example, merge_fragmented_spans=self.config.merge_fragmented_spans
)

def _generate_example(self, document: Document, **kwargs) -> Dict[str, Any]:
if not isinstance(document, (BratDocument, BratDocumentWithMergedSpans)):
raise TypeError(f"document type {type(document)} is not supported")
return document_to_example(document)
Empty file added tests/unit/builder/__init__.py
Empty file.
198 changes: 198 additions & 0 deletions tests/unit/builder/test_brat_builder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
from typing import Any

import pytest
from pie_modules.annotations import BinaryRelation, LabeledMultiSpan, LabeledSpan
from pytorch_ie import Annotation
from pytorch_ie.documents import TextBasedDocument

from pie_datasets.builders.brat import BratAttribute, BratBuilder

HF_EXAMPLES = [
{
"context": "Jane lives in Berlin.\n",
"file_name": "1",
"spans": {
"id": ["T1", "T2"],
"type": ["person", "city"],
"locations": [{"start": [0], "end": [4]}, {"start": [14], "end": [20]}],
"text": ["Jane", "Berlin"],
},
"relations": {"id": [], "type": [], "arguments": []},
"equivalence_relations": {"type": [], "targets": []},
"events": {"id": [], "type": [], "trigger": [], "arguments": []},
"attributions": {"id": [], "type": [], "target": [], "value": []},
"normalizations": {
"id": [],
"type": [],
"target": [],
"resource_id": [],
"entity_id": [],
},
"notes": {"id": [], "type": [], "target": [], "note": []},
},
{
"context": "Seattle is a rainy city. Jenny Durkan is the city's mayor.\n",
"file_name": "2",
"spans": {
"id": ["T1", "T2"],
"type": ["city", "person"],
"locations": [{"start": [0], "end": [7]}, {"start": [25], "end": [37]}],
"text": ["Seattle", "Jenny Durkan"],
},
"relations": {
"id": ["R1"],
"type": ["mayor_of"],
"arguments": [{"type": ["Arg1", "Arg2"], "target": ["T2", "T1"]}],
},
"equivalence_relations": {"type": [], "targets": []},
"events": {"id": [], "type": [], "trigger": [], "arguments": []},
"attributions": {
"id": ["A1", "A2"],
"type": ["factuality", "statement"],
"target": ["T1", "R1"],
"value": ["actual", "true"],
},
"normalizations": {
"id": [],
"type": [],
"target": [],
"resource_id": [],
"entity_id": [],
},
"notes": {"id": [], "type": [], "target": [], "note": []},
},
]


def resolve_annotation(annotation: Annotation) -> Any:
if annotation.target is None:
return None
if isinstance(annotation, LabeledMultiSpan):
return (
[annotation.target[start:end] for start, end in annotation.slices],
annotation.label,
)
elif isinstance(annotation, LabeledSpan):
return (annotation.target[annotation.start : annotation.end], annotation.label)
elif isinstance(annotation, BinaryRelation):
return (
resolve_annotation(annotation.head),
annotation.label,
resolve_annotation(annotation.tail),
)
elif isinstance(annotation, BratAttribute):
result = (resolve_annotation(annotation.annotation), annotation.label)
if annotation.value is not None:
return result + (annotation.value,)
else:
return result
else:
raise TypeError(f"Unknown annotation type: {type(annotation)}")


@pytest.fixture(scope="module", params=BratBuilder.BUILDER_CONFIGS)
def config_name(request) -> str:
return request.param.name


def test_config_names(config_name):
assert config_name in ["default", "merge_fragmented_spans"]


@pytest.fixture(scope="module")
def builder(config_name: str) -> BratBuilder:
return BratBuilder(name=config_name)


def test_builder(builder):
assert builder is not None


@pytest.fixture(scope="module", params=HF_EXAMPLES)
def hf_example(request) -> dict:
return request.param


def test_generate_document(builder, hf_example):
kwargs = dict()
generated_document = builder._generate_document(example=hf_example, **kwargs)
resolved_spans = [resolve_annotation(annotation=span) for span in generated_document.spans]
resolved_relations = [
resolve_annotation(relation) for relation in generated_document.relations
]
if hf_example == HF_EXAMPLES[0]:
assert len(generated_document.spans) == 2
assert len(generated_document.relations) == 0
assert len(generated_document.span_attributes) == 0
assert len(generated_document.relation_attributes) == 0

if builder.config.name == "default":
assert resolved_spans[0] == (["Jane"], "person")
assert resolved_spans[1] == (["Berlin"], "city")
elif builder.config.name == "merge_fragmented_spans":
assert resolved_spans[0] == ("Jane", "person")
assert resolved_spans[1] == ("Berlin", "city")
else:
raise ValueError(f"Unknown builder variant: {builder.name}")

elif hf_example == HF_EXAMPLES[1]:
assert len(generated_document.spans) == 2
assert len(generated_document.relations) == 1
assert len(generated_document.span_attributes) == 1
assert len(generated_document.relation_attributes) == 1

resolved_span_attributes = [
resolve_annotation(attribute) for attribute in generated_document.span_attributes
]
resolved_relation_attributes = [
resolve_annotation(attribute) for attribute in generated_document.relation_attributes
]

if builder.config.name == "default":
assert resolved_spans[0] == (["Seattle"], "city")
assert resolved_spans[1] == (["Jenny Durkan"], "person")
assert resolved_relations[0] == (
(["Jenny Durkan"], "person"),
"mayor_of",
(["Seattle"], "city"),
)
assert resolved_span_attributes[0] == ((["Seattle"], "city"), "factuality", "actual")
assert resolved_relation_attributes[0] == (
((["Jenny Durkan"], "person"), "mayor_of", (["Seattle"], "city")),
"statement",
"true",
)
elif builder.config.name == "merge_fragmented_spans":
assert resolved_spans[0] == ("Seattle", "city")
assert resolved_spans[1] == ("Jenny Durkan", "person")
assert resolved_relations[0] == (
("Jenny Durkan", "person"),
"mayor_of",
("Seattle", "city"),
)
assert resolved_span_attributes[0] == (("Seattle", "city"), "factuality", "actual")
assert resolved_relation_attributes[0] == (
(("Jenny Durkan", "person"), "mayor_of", ("Seattle", "city")),
"statement",
"true",
)
else:
raise ValueError(f"Unknown builder variant: {config_name}")
else:
raise ValueError(f"Unknown sample: {hf_example}")


def test_example_to_document_and_back_all(builder):
for hf_example in HF_EXAMPLES:
doc = builder._generate_document(hf_example)
assert isinstance(doc, builder.document_type)
hf_example_back = builder._generate_example(doc)
assert hf_example == hf_example_back


def test_document_to_example_wrong_type(builder):
doc = TextBasedDocument(text="Hello, world!")

with pytest.raises(TypeError) as exc_info:
builder._generate_example(doc)
assert str(exc_info.value) == f"document type {type(doc)} is not supported"

0 comments on commit b1f4d92

Please sign in to comment.