Skip to content

Commit

Permalink
create core package
Browse files: browse the repository at this point in the history
  • Loading branch information
ArneBinder committed Nov 8, 2023
1 parent f2925d1 commit d813fbb
Show file tree
Hide file tree
Showing 13 changed files with 47 additions and 36 deletions.
14 changes: 2 additions & 12 deletions src/pie_datasets/__init__.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,3 @@
from .builder import ArrowBasedBuilder, GeneratorBasedBuilder
from .dataset import Dataset, IterableDataset
from .dataset_dict import DatasetDict
from .document_formatter import DocumentFormatter
# flake8: noqa

__all__ = [
"GeneratorBasedBuilder",
"ArrowBasedBuilder",
"Dataset",
"IterableDataset",
"DatasetDict",
"DocumentFormatter",
]
from .core import *
11 changes: 11 additions & 0 deletions src/pie_datasets/core/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from .builder import ArrowBasedBuilder, GeneratorBasedBuilder
from .dataset import Dataset, IterableDataset
from .dataset_dict import DatasetDict

__all__ = [
"GeneratorBasedBuilder",
"ArrowBasedBuilder",
"Dataset",
"IterableDataset",
"DatasetDict",
]
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from pytorch_ie.core.document import Document
from pytorch_ie.utils.hydra import resolve_target, serialize_document_type

from pie_datasets.dataset import Dataset, IterableDataset, get_pie_dataset_type
from .dataset import Dataset, IterableDataset, get_pie_dataset_type

logger = logging.getLogger(__name__)

Expand Down
File renamed without changes.
2 changes: 1 addition & 1 deletion src/pie_datasets/document/processing/regex_partitioner.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from pytorch_ie.documents import TextBasedDocument

from pie_datasets import Dataset, IterableDataset
from pie_datasets.dataset_dict import EnterDatasetMixin, ExitDatasetMixin
from pie_datasets.core.dataset_dict import EnterDatasetMixin, ExitDatasetMixin

logger = logging.getLogger(__name__)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from pytorch_ie.core import AnnotationList, annotation_field
from pytorch_ie.documents import TextDocument

from pie_datasets.builder import ArrowBasedBuilder
from pie_datasets import ArrowBasedBuilder
from tests import FIXTURES_ROOT


Expand Down
1 change: 1 addition & 0 deletions tests/unit/core/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
TEST_PACKAGE = "tests.unit.core"
12 changes: 8 additions & 4 deletions tests/unit/test_builder.py → tests/unit/core/test_builder.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import re
import tempfile
from dataclasses import dataclass
from pathlib import Path
from typing import Type

import pytest
Expand All @@ -10,11 +11,14 @@
from pytorch_ie.core import AnnotationList, annotation_field
from pytorch_ie.documents import TextBasedDocument, TextDocumentWithSpans

from pie_datasets.builder import PieDatasetBuilder
from pie_datasets.core.builder import PieDatasetBuilder
from tests import FIXTURES_ROOT
from tests.unit.core import TEST_PACKAGE

DATASETS_ROOT = FIXTURES_ROOT / "builder" / "datasets"

TEST_MODULE = f"{TEST_PACKAGE}.{Path(__file__).stem}"


def test_builder_class():
dataset_module = dataset_module_factory(str(DATASETS_ROOT / "single_config"))
Expand Down Expand Up @@ -193,7 +197,7 @@ class RenamedExampleDocument(TextBasedDocument):
builder = builder_cls(
cache_dir=tmp_cache_dir,
document_converters={
"tests.unit.test_builder.ExampleDocumentWithSimpleSpans": "tests.unit.test_builder.convert_example_document_to_example_document_with_simple_spans",
f"{TEST_MODULE}.ExampleDocumentWithSimpleSpans": f"{TEST_MODULE}.convert_example_document_to_example_document_with_simple_spans",
},
)
assert isinstance(builder, PieDatasetBuilder)
Expand All @@ -213,12 +217,12 @@ def test_builder_with_document_converters_resolve_wrong_document_type():
with pytest.raises(
TypeError,
match=re.escape(
"The key 'tests.unit.test_builder.NoDocumentType' for one of the converters can not be resolved to a document type."
f"The key '{TEST_MODULE}.NoDocumentType' for one of the converters can not be resolved to a document type."
),
):
builder = builder_cls(
cache_dir=tmp_cache_dir,
document_converters={
"tests.unit.test_builder.NoDocumentType": convert_example_document_to_example_document_with_simple_spans,
f"{TEST_MODULE}.NoDocumentType": convert_example_document_to_example_document_with_simple_spans,
},
)
14 changes: 10 additions & 4 deletions tests/unit/test_dataset.py → tests/unit/core/test_dataset.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from collections.abc import Iterator, Sequence
from dataclasses import dataclass
from pathlib import Path
from typing import Union

import numpy
Expand All @@ -16,8 +17,11 @@
from pytorch_ie.taskmodules import TransformerSpanClassificationTaskModule

from pie_datasets import Dataset, IterableDataset
from pie_datasets.dataset import get_pie_dataset_type
from pie_datasets.core.dataset import get_pie_dataset_type
from tests.conftest import TestDocument
from tests.unit.core import TEST_PACKAGE

TEST_MODULE = f"{TEST_PACKAGE}.{Path(__file__).stem}"


def test_dataset(maybe_iterable_dataset):
Expand Down Expand Up @@ -281,12 +285,14 @@ class TestDocumentWithSpans(TestDocument):
assert (
str(excinfo.value)
== "No valid key (either subclass or superclass) was found for the document type "
"'<class 'tests.unit.test_dataset.test_to_document_type_not_found.<locals>.TestDocumentWithSpans'>' "
f"'<class '{TEST_MODULE}.test_to_document_type_not_found.<locals>.TestDocumentWithSpans'>' "
"in the document_converters of the dataset. Available keys: "
"{<class 'tests.unit.test_dataset.TestDocumentWithLabel'>}. Consider adding a respective converter "
"{<class '"
+ TEST_MODULE
+ ".TestDocumentWithLabel'>}. Consider adding a respective converter "
"to the dataset with dataset.register_document_converter(my_converter_method) where "
"my_converter_method should accept <class 'tests.conftest.TestDocument'> as input and return "
"'<class 'tests.unit.test_dataset.test_to_document_type_not_found.<locals>.TestDocumentWithSpans'>'."
f"'<class '{TEST_MODULE}.test_to_document_type_not_found.<locals>.TestDocumentWithSpans'>'."
)


Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,17 @@
from pytorch_ie.documents import TextBasedDocument, TextDocument

from pie_datasets import Dataset, DatasetDict, IterableDataset
from pie_datasets.dataset_dict import (
from pie_datasets.core.dataset_dict import (
EnterDatasetDictMixin,
EnterDatasetMixin,
ExitDatasetDictMixin,
ExitDatasetMixin,
)
from tests import DATASET_BUILDERS_ROOT, FIXTURES_ROOT
from tests.conftest import CREATE_FIXTURE_DATA, TestDocument
from tests.unit.core import TEST_PACKAGE

TEST_MODULE = f"{TEST_PACKAGE}.{Path(__file__).stem}"

logger = logging.getLogger(__name__)

Expand All @@ -27,8 +30,6 @@
PIE_DATASET_PATH = DATASET_BUILDERS_ROOT / "pie" / DATASET_NAME
FIXTURE_DATA_PATH = FIXTURES_ROOT / "dataset_dict" / f"{DATASET_NAME}_extract"

TEST_CLASS_PREFIX = "tests.unit.test_dataset_dict"


@pytest.mark.skipif(condition=not CREATE_FIXTURE_DATA, reason="don't create fixture data again")
def test_create_fixture_data():
Expand Down Expand Up @@ -167,7 +168,7 @@ def map_fn(doc):

@pytest.mark.parametrize(
"function",
[map_fn, f"{TEST_CLASS_PREFIX}.map_fn"],
[map_fn, f"{TEST_MODULE}.map_fn"],
)
def test_map(dataset_dict, function):
dataset_dict_mapped = dataset_dict.map(function)
Expand Down Expand Up @@ -479,8 +480,8 @@ def test_register_document_converter(dataset_dict):

def test_register_document_converter_resolve(dataset_dict):
dataset_dict.register_document_converter(
f"{TEST_CLASS_PREFIX}.convert_to_document_with_label",
document_type=f"{TEST_CLASS_PREFIX}.TestDocumentWithLabel",
f"{TEST_MODULE}.convert_to_document_with_label",
document_type=f"{TEST_MODULE}.TestDocumentWithLabel",
)

for name, split in dataset_dict.items():
Expand All @@ -494,11 +495,11 @@ class NoDocument:
def test_register_document_converter_resolve_wrong_document_type(dataset_dict):
with pytest.raises(TypeError) as excinfo:
dataset_dict.register_document_converter(
convert_to_document_with_label, document_type=f"{TEST_CLASS_PREFIX}.NoDocument"
convert_to_document_with_label, document_type=f"{TEST_MODULE}.NoDocument"
)
assert (
str(excinfo.value)
== f"document_type must be or resolve to a subclass of Document, but is '{TEST_CLASS_PREFIX}.NoDocument'"
== f"document_type must be or resolve to a subclass of Document, but is '{TEST_MODULE}.NoDocument'"
)


Expand All @@ -518,9 +519,7 @@ def test_to_document_type(dataset_dict):

def test_to_document_resolve(dataset_dict):
dataset_dict.register_document_converter(convert_to_document_with_label)
dataset_dict_converted = dataset_dict.to_document_type(
f"{TEST_CLASS_PREFIX}.TestDocumentWithLabel"
)
dataset_dict_converted = dataset_dict.to_document_type(f"{TEST_MODULE}.TestDocumentWithLabel")
assert dataset_dict_converted.document_type == TestDocumentWithLabel
for split in dataset_dict_converted.values():
assert all(isinstance(doc, TestDocumentWithLabel) for doc in split)
Expand All @@ -529,11 +528,11 @@ def test_to_document_resolve(dataset_dict):
def test_to_document_type_resolve_wrong_document_type(dataset_dict):
dataset_dict.register_document_converter(convert_to_document_with_label)
with pytest.raises(TypeError) as excinfo:
dataset_dict.to_document_type(f"{TEST_CLASS_PREFIX}.NoDocument")
dataset_dict.to_document_type(f"{TEST_MODULE}.NoDocument")
assert (
str(excinfo.value)
== f"document_type must be a document type or a string that can be resolved to such a type, but got "
f"{TEST_CLASS_PREFIX}.NoDocument."
f"{TEST_MODULE}.NoDocument."
)


Expand Down

0 comments on commit d813fbb

Please sign in to comment.