From 05988cd1f291e2eddd087ca1bc72265e0f59c56c Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Mon, 15 Apr 2024 17:31:49 +0200 Subject: [PATCH 1/9] outsource tests to test_datasets.yaml GitHub CI workflow --- .github/workflows/test.yaml | 10 +++- .github/workflows/test_datasets.yaml | 87 ++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/test_datasets.yaml diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 3ccbd453..3f435284 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -4,8 +4,14 @@ name: Tests on: push: branches: [main] + paths: + - "!dataset_builders/**" + - "!tests/dataset_builders/**" pull_request: branches: [main, "release/*"] + paths: + - "!dataset_builders/**" + - "!tests/dataset_builders/**" jobs: tests: @@ -17,7 +23,7 @@ jobs: os: ["ubuntu-latest"] python-version: ["3.9"] - timeout-minutes: 30 + timeout-minutes: 10 steps: #---------------------------------------------- @@ -70,7 +76,7 @@ jobs: - name: Run tests with coverage run: | source .venv/bin/activate - pytest -k "not slow" --cov --cov-report term-missing --cov-report xml:coverage.xml + pytest --ignore=tests/dataset_builders -k "not slow" --cov=src --cov-report term-missing --cov-report xml:coverage.xml - name: Upload coverage reports to Codecov uses: codecov/codecov-action@v3 with: diff --git a/.github/workflows/test_datasets.yaml b/.github/workflows/test_datasets.yaml new file mode 100644 index 00000000..9443d4f5 --- /dev/null +++ b/.github/workflows/test_datasets.yaml @@ -0,0 +1,87 @@ + +name: Test Datasets + +on: + push: + branches: [main] + paths: + - "!src/**" + - "!tests/**" + - "tests/dataset_builders/**" + pull_request: + branches: [main, "release/*"] + paths: + - "!src/**" + - "!tests/**" + - "tests/dataset_builders/**" + +jobs: + tests: + runs-on: ${{ matrix.os }} + + strategy: + fail-fast: false + matrix: + os: ["ubuntu-latest"] + python-version: ["3.9"] + + timeout-minutes: 30 + + steps: + #---------------------------------------------- + # check-out repo and set-up python + #---------------------------------------------- + - name: Check out repository + uses: actions/checkout@v3 + - name: Set up python + id: setup-python + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + #---------------------------------------------- + # ----- install & configure poetry ----- + #---------------------------------------------- + - name: Install Poetry + uses: snok/install-poetry@v1 + with: + virtualenvs-create: true + virtualenvs-in-project: true + installer-parallel: true + + #---------------------------------------------- + # load cached venv if cache exists + #---------------------------------------------- + - name: Load cached venv + id: cached-poetry-dependencies + uses: actions/cache@v3 + with: + path: .venv + key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }} + + #---------------------------------------------- + # install dependencies if cache does not exist + #---------------------------------------------- + - name: Install dependencies + if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' + run: poetry install --no-interaction --no-root + + #---------------------------------------------- + # install your root project, if required + #---------------------------------------------- + - name: Install project + run: poetry install --no-interaction + + #---------------------------------------------- + # run test suite and upload coverage data + #---------------------------------------------- + - name: Run tests with coverage + run: | + source .venv/bin/activate + pytest tests/dataset_builders -k "not slow" --cov=dataset_builders --cov-report term-missing --cov-report xml:coverage_datasets.xml + - name: Upload coverage reports to Codecov + uses: codecov/codecov-action@v3 + with: + files: ./coverage_datasets.xml + env: + CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} From 59f851df2f6eb33575f5e6d2bd3bfd9e51965480 Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Mon, 15 Apr 2024 17:44:22 +0200 Subject: [PATCH 2/9] use paths-ignore instead of paths --- .github/workflows/test.yaml | 12 ++++++------ .github/workflows/test_datasets.yaml | 16 ++++++++-------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 3f435284..0653b349 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -4,14 +4,14 @@ name: Tests on: push: branches: [main] - paths: - - "!dataset_builders/**" - - "!tests/dataset_builders/**" + paths-ignore: + - "dataset_builders/**" + - "tests/dataset_builders/**" pull_request: branches: [main, "release/*"] - paths: - - "!dataset_builders/**" - - "!tests/dataset_builders/**" + paths-ignore: + - "dataset_builders/**" + - "tests/dataset_builders/**" jobs: tests: diff --git a/.github/workflows/test_datasets.yaml b/.github/workflows/test_datasets.yaml index 9443d4f5..15365921 100644 --- a/.github/workflows/test_datasets.yaml +++ b/.github/workflows/test_datasets.yaml @@ -4,16 +4,16 @@ name: Test Datasets on: push: branches: [main] - paths: - - "!src/**" - - "!tests/**" - - "tests/dataset_builders/**" + paths-ignore: + - "src/**" + - "tests/**" + - "!tests/dataset_builders/**" pull_request: branches: [main, "release/*"] - paths: - - "!src/**" - - "!tests/**" - - "tests/dataset_builders/**" + paths-ignore: + - "src/**" + - "tests/**" + - "!tests/dataset_builders/**" jobs: tests: From 63ec3485fe204271f7569d28dd0c537e80073c8e Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Mon, 15 Apr 2024 17:52:55 +0200 Subject: [PATCH 3/9] use paths (instead of paths-ignore) for test_datasets.yaml --- .github/workflows/test_datasets.yaml | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/.github/workflows/test_datasets.yaml b/.github/workflows/test_datasets.yaml index 15365921..855e36b0 100644 --- a/.github/workflows/test_datasets.yaml +++ b/.github/workflows/test_datasets.yaml @@ -4,16 +4,18 @@ name: Test Datasets on: push: branches: [main] - paths-ignore: - - "src/**" - - "tests/**" - - "!tests/dataset_builders/**" + paths: + - "src/dataset_builders/**" + - "tests/dataset_builders/**" + - "data/datasets/**" + - ".github/workflows/test_datasets.yaml" pull_request: branches: [main, "release/*"] - paths-ignore: - - "src/**" - - "tests/**" - - "!tests/dataset_builders/**" + paths: + - "src/dataset_builders/**" + - "tests/dataset_builders/**" + - "data/datasets/**" + - ".github/workflows/test_datasets.yaml" jobs: tests: From c83d24cec4ac9cf14a68ada95434f6ab7315248b Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Mon, 15 Apr 2024 17:56:00 +0200 Subject: [PATCH 4/9] remove dataset_builders from source entry in .coveragerc --- .coveragerc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.coveragerc b/.coveragerc index d48bfce0..7bbf28e2 100644 --- a/.coveragerc +++ b/.coveragerc @@ -1,6 +1,6 @@ [run] -source = src,dataset_builders +source = src [report] exclude_lines = From c1c1777cfc41daa73f16c8f0213f7ff7efbdaba0 Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Mon, 15 Apr 2024 18:08:04 +0200 Subject: [PATCH 5/9] add fixture data locations to paths(-ignore) --- .github/workflows/test.yaml | 2 ++ .github/workflows/test_datasets.yaml | 6 ++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 0653b349..b3572991 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -7,11 +7,13 @@ on: paths-ignore: - "dataset_builders/**" - "tests/dataset_builders/**" + - "tests/fixtures/dataset_builders/**" pull_request: branches: [main, "release/*"] paths-ignore: - "dataset_builders/**" - "tests/dataset_builders/**" + - "tests/fixtures/dataset_builders/**" jobs: tests: diff --git a/.github/workflows/test_datasets.yaml b/.github/workflows/test_datasets.yaml index 855e36b0..855877a7 100644 --- a/.github/workflows/test_datasets.yaml +++ b/.github/workflows/test_datasets.yaml @@ -6,15 +6,17 @@ on: branches: [main] paths: - "src/dataset_builders/**" - - "tests/dataset_builders/**" - "data/datasets/**" + - "tests/dataset_builders/**" + - "tests/fixtures/dataset_builders/**" - ".github/workflows/test_datasets.yaml" pull_request: branches: [main, "release/*"] paths: - "src/dataset_builders/**" - - "tests/dataset_builders/**" - "data/datasets/**" + - "tests/dataset_builders/**" + - "tests/fixtures/dataset_builders/**" - ".github/workflows/test_datasets.yaml" jobs: From cbce8532fd16724a5783d65791bec3f23641dbc0 Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Mon, 15 Apr 2024 18:41:52 +0200 Subject: [PATCH 6/9] implement BratBuilder._generate_example() --- src/pie_datasets/builders/brat.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/pie_datasets/builders/brat.py b/src/pie_datasets/builders/brat.py index 9a09f296..69a16339 100644 --- a/src/pie_datasets/builders/brat.py +++ b/src/pie_datasets/builders/brat.py @@ -5,6 +5,7 @@ import datasets from pie_modules.annotations import BinaryRelation, LabeledMultiSpan, LabeledSpan +from pytorch_ie import Document from pytorch_ie.core import Annotation, AnnotationList, annotation_field from pytorch_ie.documents import TextBasedDocument @@ -305,3 +306,8 @@ def _generate_document(self, example, **kwargs): return example_to_document( example, merge_fragmented_spans=self.config.merge_fragmented_spans ) + + def _generate_example(self, document: Document, **kwargs) -> Dict[str, Any]: + if not isinstance(document, (BratDocument, BratDocumentWithMergedSpans)): + raise TypeError(f"document type {type(document)} is not supported") + return document_to_example(document) From eae9157e7c2670dcfe76fe5fa460a707a062a7bf Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Mon, 15 Apr 2024 18:42:32 +0200 Subject: [PATCH 7/9] add test_brat_builder.py --- tests/unit/builder/__init__.py | 0 tests/unit/builder/test_brat_builder.py | 190 ++++++++++++++++++++++++ 2 files changed, 190 insertions(+) create mode 100644 tests/unit/builder/__init__.py create mode 100644 tests/unit/builder/test_brat_builder.py diff --git a/tests/unit/builder/__init__.py b/tests/unit/builder/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/builder/test_brat_builder.py b/tests/unit/builder/test_brat_builder.py new file mode 100644 index 00000000..fd1cd3a1 --- /dev/null +++ b/tests/unit/builder/test_brat_builder.py @@ -0,0 +1,190 @@ +from typing import Any + +import pytest +from pie_modules.annotations import BinaryRelation, LabeledMultiSpan, LabeledSpan +from pytorch_ie import Annotation + +from pie_datasets.builders.brat import BratAttribute, BratBuilder + +HF_EXAMPLES = [ + { + "context": "Jane lives in Berlin.\n", + "file_name": "1", + "spans": { + "id": ["T1", "T2"], + "type": ["person", "city"], + "locations": [{"start": [0], "end": [4]}, {"start": [14], "end": [20]}], + "text": ["Jane", "Berlin"], + }, + "relations": {"id": [], "type": [], "arguments": []}, + "equivalence_relations": {"type": [], "targets": []}, + "events": {"id": [], "type": [], "trigger": [], "arguments": []}, + "attributions": {"id": [], "type": [], "target": [], "value": []}, + "normalizations": { + "id": [], + "type": [], + "target": [], + "resource_id": [], + "entity_id": [], + }, + "notes": {"id": [], "type": [], "target": [], "note": []}, + }, + { + "context": "Seattle is a rainy city. Jenny Durkan is the city's mayor.\n", + "file_name": "2", + "spans": { + "id": ["T1", "T2"], + "type": ["city", "person"], + "locations": [{"start": [0], "end": [7]}, {"start": [25], "end": [37]}], + "text": ["Seattle", "Jenny Durkan"], + }, + "relations": { + "id": ["R1"], + "type": ["mayor_of"], + "arguments": [{"type": ["Arg1", "Arg2"], "target": ["T2", "T1"]}], + }, + "equivalence_relations": {"type": [], "targets": []}, + "events": {"id": [], "type": [], "trigger": [], "arguments": []}, + "attributions": { + "id": ["A1", "A2"], + "type": ["factuality", "statement"], + "target": ["T1", "R1"], + "value": ["actual", "true"], + }, + "normalizations": { + "id": [], + "type": [], + "target": [], + "resource_id": [], + "entity_id": [], + }, + "notes": {"id": [], "type": [], "target": [], "note": []}, + }, +] + + +def resolve_annotation(annotation: Annotation) -> Any: + if annotation.target is None: + return None + if isinstance(annotation, LabeledMultiSpan): + return ( + [annotation.target[start:end] for start, end in annotation.slices], + annotation.label, + ) + elif isinstance(annotation, LabeledSpan): + return (annotation.target[annotation.start : annotation.end], annotation.label) + elif isinstance(annotation, BinaryRelation): + return ( + resolve_annotation(annotation.head), + annotation.label, + resolve_annotation(annotation.tail), + ) + elif isinstance(annotation, BratAttribute): + result = (resolve_annotation(annotation.annotation), annotation.label) + if annotation.value is not None: + return result + (annotation.value,) + else: + return result + else: + raise TypeError(f"Unknown annotation type: {type(annotation)}") + + +@pytest.fixture(scope="module", params=BratBuilder.BUILDER_CONFIGS) +def config_name(request) -> str: + return request.param.name + + +def test_config_names(config_name): + assert config_name in ["default", "merge_fragmented_spans"] + + +@pytest.fixture(scope="module") +def builder(config_name: str) -> BratBuilder: + return BratBuilder(name=config_name) + + +def test_builder(builder): + assert builder is not None + + +@pytest.fixture(scope="module", params=range(len(HF_EXAMPLES))) +def sample_idx(request) -> int: + return request.param + + +def test_generate_document(builder, sample_idx, config_name): + hf_example = HF_EXAMPLES[sample_idx] + kwargs = dict() + generated_document = builder._generate_document(example=hf_example, **kwargs) + resolved_spans = [resolve_annotation(annotation=span) for span in generated_document.spans] + resolved_relations = [ + resolve_annotation(relation) for relation in generated_document.relations + ] + if sample_idx == 0: + assert len(generated_document.spans) == 2 + assert len(generated_document.relations) == 0 + assert len(generated_document.span_attributes) == 0 + assert len(generated_document.relation_attributes) == 0 + + if config_name == "default": + assert resolved_spans[0] == (["Jane"], "person") + assert resolved_spans[1] == (["Berlin"], "city") + elif config_name == "merge_fragmented_spans": + assert resolved_spans[0] == ("Jane", "person") + assert resolved_spans[1] == ("Berlin", "city") + else: + raise ValueError(f"Unknown dataset variant: {config_name}") + + elif sample_idx == 1: + assert len(generated_document.spans) == 2 + assert len(generated_document.relations) == 1 + assert len(generated_document.span_attributes) == 1 + assert len(generated_document.relation_attributes) == 1 + + resolved_span_attributes = [ + resolve_annotation(attribute) for attribute in generated_document.span_attributes + ] + resolved_relation_attributes = [ + resolve_annotation(attribute) for attribute in generated_document.relation_attributes + ] + + if config_name == "default": + assert resolved_spans[0] == (["Seattle"], "city") + assert resolved_spans[1] == (["Jenny Durkan"], "person") + assert resolved_relations[0] == ( + (["Jenny Durkan"], "person"), + "mayor_of", + (["Seattle"], "city"), + ) + assert resolved_span_attributes[0] == ((["Seattle"], "city"), "factuality", "actual") + assert resolved_relation_attributes[0] == ( + ((["Jenny Durkan"], "person"), "mayor_of", (["Seattle"], "city")), + "statement", + "true", + ) + elif config_name == "merge_fragmented_spans": + assert resolved_spans[0] == ("Seattle", "city") + assert resolved_spans[1] == ("Jenny Durkan", "person") + assert resolved_relations[0] == ( + ("Jenny Durkan", "person"), + "mayor_of", + ("Seattle", "city"), + ) + assert resolved_span_attributes[0] == (("Seattle", "city"), "factuality", "actual") + assert resolved_relation_attributes[0] == ( + (("Jenny Durkan", "person"), "mayor_of", ("Seattle", "city")), + "statement", + "true", + ) + else: + raise ValueError(f"Unknown dataset variant: {config_name}") + else: + raise ValueError(f"Unknown sample index: {sample_idx}") + + +def test_example_to_document_and_back_all(builder): + for hf_example in HF_EXAMPLES: + doc = builder._generate_document(hf_example) + assert isinstance(doc, builder.document_type) + hf_example_back = builder._generate_example(doc) + assert hf_example == hf_example_back From 3aa71e9fadcbbafd5ff24884c4fcfa6063c67d9f Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Mon, 15 Apr 2024 19:13:02 +0200 Subject: [PATCH 8/9] add test_document_to_example_wrong_type() and simplify --- tests/unit/builder/test_brat_builder.py | 34 +++++++++++++++---------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/tests/unit/builder/test_brat_builder.py b/tests/unit/builder/test_brat_builder.py index fd1cd3a1..836acbe1 100644 --- a/tests/unit/builder/test_brat_builder.py +++ b/tests/unit/builder/test_brat_builder.py @@ -3,6 +3,7 @@ import pytest from pie_modules.annotations import BinaryRelation, LabeledMultiSpan, LabeledSpan from pytorch_ie import Annotation +from pytorch_ie.documents import TextBasedDocument from pie_datasets.builders.brat import BratAttribute, BratBuilder @@ -107,35 +108,34 @@ def test_builder(builder): assert builder is not None -@pytest.fixture(scope="module", params=range(len(HF_EXAMPLES))) -def sample_idx(request) -> int: +@pytest.fixture(scope="module", params=HF_EXAMPLES) +def hf_example(request) -> dict: return request.param -def test_generate_document(builder, sample_idx, config_name): - hf_example = HF_EXAMPLES[sample_idx] +def test_generate_document(builder, hf_example): kwargs = dict() generated_document = builder._generate_document(example=hf_example, **kwargs) resolved_spans = [resolve_annotation(annotation=span) for span in generated_document.spans] resolved_relations = [ resolve_annotation(relation) for relation in generated_document.relations ] - if sample_idx == 0: + if hf_example == HF_EXAMPLES[0]: assert len(generated_document.spans) == 2 assert len(generated_document.relations) == 0 assert len(generated_document.span_attributes) == 0 assert len(generated_document.relation_attributes) == 0 - if config_name == "default": + if builder.config.name == "default": assert resolved_spans[0] == (["Jane"], "person") assert resolved_spans[1] == (["Berlin"], "city") - elif config_name == "merge_fragmented_spans": + elif builder.config.name == "merge_fragmented_spans": assert resolved_spans[0] == ("Jane", "person") assert resolved_spans[1] == ("Berlin", "city") else: - raise ValueError(f"Unknown dataset variant: {config_name}") + raise ValueError(f"Unknown builder variant: {builder.name}") - elif sample_idx == 1: + elif hf_example == HF_EXAMPLES[1]: assert len(generated_document.spans) == 2 assert len(generated_document.relations) == 1 assert len(generated_document.span_attributes) == 1 @@ -148,7 +148,7 @@ def test_generate_document(builder, sample_idx, config_name): resolve_annotation(attribute) for attribute in generated_document.relation_attributes ] - if config_name == "default": + if builder.config.name == "default": assert resolved_spans[0] == (["Seattle"], "city") assert resolved_spans[1] == (["Jenny Durkan"], "person") assert resolved_relations[0] == ( @@ -162,7 +162,7 @@ def test_generate_document(builder, sample_idx, config_name): "statement", "true", ) - elif config_name == "merge_fragmented_spans": + elif builder.config.name == "merge_fragmented_spans": assert resolved_spans[0] == ("Seattle", "city") assert resolved_spans[1] == ("Jenny Durkan", "person") assert resolved_relations[0] == ( @@ -177,9 +177,9 @@ def test_generate_document(builder, sample_idx, config_name): "true", ) else: - raise ValueError(f"Unknown dataset variant: {config_name}") + raise ValueError(f"Unknown builder variant: {config_name}") else: - raise ValueError(f"Unknown sample index: {sample_idx}") + raise ValueError(f"Unknown sample: {hf_example}") def test_example_to_document_and_back_all(builder): @@ -188,3 +188,11 @@ def test_example_to_document_and_back_all(builder): assert isinstance(doc, builder.document_type) hf_example_back = builder._generate_example(doc) assert hf_example == hf_example_back + + +def test_document_to_example_wrong_type(builder): + doc = TextBasedDocument(text="Hello, world!") + + with pytest.raises(TypeError) as exc_info: + builder._generate_example(doc) + assert str(exc_info.value) == f"document type {type(doc)} is not supported" From 195642eb2c3e5d2ac3991ce5f986f1d5a05cb8c3 Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Mon, 15 Apr 2024 19:25:42 +0200 Subject: [PATCH 9/9] re-add dataset_builders to source in .coveragerc --- .coveragerc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.coveragerc b/.coveragerc index 7bbf28e2..d48bfce0 100644 --- a/.coveragerc +++ b/.coveragerc @@ -1,6 +1,6 @@ [run] -source = src +source = src,dataset_builders [report] exclude_lines =