refactor: IL-414 create new Huggingface dataset for each test (#691)
This should fix the flaky Hugging Face tests by creating a fresh, uniquely named Hugging Face dataset (and test repository) for each test and cleaning it up afterwards.

---------

Co-authored-by: FelixFehse <[email protected]>
FelixFehse and FelixFehseTNG authored Apr 4, 2024
1 parent 8a0e2ef commit 70c7c22
Showing 8 changed files with 70 additions and 23 deletions.
7 changes: 4 additions & 3 deletions CHANGELOG.md
@@ -7,9 +7,10 @@
### New Features
- feature: Error information is printed to the console on failed runs and evaluations.
- feature: The stack trace of a failed run/evaluation is included in the `FailedExampleRun`/`FailedExampleEvaluation` object
-- feature: The `Runner.run_dataset` and `Evaluator.evaluate_run` have an optional flag `abort_on_error` to stop running/evaluating when an error occurs.
-- feature: Added `Runner.failed_runs` and `Evaluator.failed_evaluations` to retrieve all failed run / evaluation lineages
-- feature: Added `.successful_example_outputs` and `.failed_example_outputs` to `RunRepository` to match the evaluation repository
+- feature: The `Runner.run_dataset(..)` and `Evaluator.evaluate_run(..)` have an optional flag `abort_on_error` to stop running/evaluating when an error occurs.
+- feature: Added `Runner.failed_runs(..)` and `Evaluator.failed_evaluations(..)` to retrieve all failed run / evaluation lineages
+- feature: Added `.successful_example_outputs(..)` and `.failed_example_outputs(..)` to `RunRepository` to match the evaluation repository
+- feature: Added optional argument to set an id when creating a `Dataset` via `DatasetRepository.create_dataset(..)`

### Fixes

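Illustrative usage of the new optional `id` argument on `DatasetRepository.create_dataset(..)` (not part of the commit; the import path, the in-memory repository class name, and the `Example` constructor fields are assumptions):

from intelligence_layer.evaluation import Example, InMemoryDatasetRepository  # assumed import path

repository = InMemoryDatasetRepository()

# With the default `id=None` the repository generates an ID;
# passing `id` explicitly lets the caller control it.
dataset = repository.create_dataset(
    examples=[Example(input="What is 1+1?", expected_output="2")],
    dataset_name="my-dataset",
    id="my-custom-dataset-id",
)
assert dataset.id == "my-custom-dataset-id"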
2 changes: 1 addition & 1 deletion scripts/test.sh
@@ -1,4 +1,4 @@
#!/usr/bin/env -S bash -eu -o pipefail

poetry run python3 -c "import nltk; nltk.download('punkt')"
-poetry run pytest -n 10
+poetry run pytest -n auto
@@ -17,13 +17,17 @@ class DatasetRepository(ABC):

    @abstractmethod
    def create_dataset(
-        self, examples: Iterable[Example[Input, ExpectedOutput]], dataset_name: str
+        self,
+        examples: Iterable[Example[Input, ExpectedOutput]],
+        dataset_name: str,
+        id: str | None = None,
    ) -> Dataset:
        """Creates a dataset from given :class:`Example`s and returns the ID of that dataset.

        Args:
            examples: An :class:`Iterable` of :class:`Example`s to be saved in the same dataset.
            dataset_name: A name for the dataset.
+            id: The dataset ID. If `None`, an ID will be generated.

        Returns:
            The created :class:`Dataset`.
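The docstring above states that an ID is generated when `id` is `None`. The `Dataset` model itself is not part of this diff; a plausible sketch of how such a default could be provided (an assumption about the model, not its actual definition):

from uuid import uuid4

from pydantic import BaseModel, Field


class Dataset(BaseModel):
    # Hypothetical: the real model may generate and store its ID differently.
    id: str = Field(default_factory=lambda: str(uuid4()))
    name: str

This would explain why the concrete repositories below first construct `Dataset(name=...)` and only overwrite `dataset.id` when a custom `id` is supplied.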
@@ -23,9 +23,15 @@ def __init__(self, filesystem: AbstractFileSystem, root_directory: Path) -> None
        super().__init__(file_system=filesystem, root_directory=root_directory)

    def create_dataset(
-        self, examples: Iterable[Example[Input, ExpectedOutput]], dataset_name: str
+        self,
+        examples: Iterable[Example[Input, ExpectedOutput]],
+        dataset_name: str,
+        id: str | None = None,
    ) -> Dataset:
        dataset = Dataset(name=dataset_name)
+        if id is not None:
+            dataset.id = id
+
        self.mkdir(self._dataset_directory(dataset.id))

        dataset_path = self._dataset_path(dataset.id)
@@ -139,13 +145,10 @@ def _write_data(
        file_path: Path,
        data_to_write: Iterable[PydanticSerializable],
    ) -> None:
-        with self._file_system.open(
-            self.path_to_str(file_path), "w", encoding="utf-8"
-        ) as file:
-            for data_chunk in data_to_write:
-                serialized_result = JsonSerializer(root=data_chunk)
-                json_string = serialized_result.model_dump_json() + "\n"
-                file.write(json_string)
+        data = "\n".join(
+            JsonSerializer(root=chunk).model_dump_json() for chunk in data_to_write
+        )
+        self.write_utf8(file_path, data, create_parents=True)


class FileDatasetRepository(FileSystemDatasetRepository):
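The rewritten `_write_data` above joins all serialized chunks into a single JSON-Lines string and delegates the write to `write_utf8`, which is defined outside this diff. A rough sketch of what that helper is assumed to do on top of the fsspec `AbstractFileSystem` (hypothetical; the real helper lives in the repository base class):

from pathlib import Path

from fsspec import AbstractFileSystem


def write_utf8(
    file_system: AbstractFileSystem,
    file_path: Path,
    content: str,
    create_parents: bool = False,
) -> None:
    # Hypothetical stand-in for the base-class method used above.
    if create_parents:
        file_system.makedirs(str(file_path.parent), exist_ok=True)
    with file_system.open(str(file_path), "w", encoding="utf-8") as file:
        file.write(content)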
@@ -19,9 +19,14 @@ def __init__(self) -> None:
        ] = {}

    def create_dataset(
-        self, examples: Iterable[Example[Input, ExpectedOutput]], dataset_name: str
+        self,
+        examples: Iterable[Example[Input, ExpectedOutput]],
+        dataset_name: str,
+        id: str | None = None,
    ) -> Dataset:
        dataset = Dataset(name=dataset_name)
+        if id is not None:
+            dataset.id = id
        if dataset.id in self._datasets_and_examples:
            raise ValueError(
                f"Created random dataset ID already exists for dataset {dataset}. This should not happen."
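With a caller-supplied `id`, the duplicate check above becomes reachable from the outside; a small illustrative test (hypothetical — the class name and import path are assumptions):

import pytest

from intelligence_layer.evaluation import Example, InMemoryDatasetRepository  # assumed import path


def test_duplicate_custom_dataset_id_raises() -> None:
    repository = InMemoryDatasetRepository()
    example = Example(input="What is 1+1?", expected_output="2")

    repository.create_dataset(examples=[example], dataset_name="first", id="fixed-id")
    with pytest.raises(ValueError):
        repository.create_dataset(examples=[example], dataset_name="second", id="fixed-id")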
@@ -28,7 +28,10 @@ def __init__(
        self._huggingface_dataset = huggingface_dataset

    def create_dataset(
-        self, examples: Iterable[Example[Input, ExpectedOutput]], dataset_name: str
+        self,
+        examples: Iterable[Example[Input, ExpectedOutput]],
+        dataset_name: str,
+        id: str | None = None,
    ) -> Dataset:
        raise NotImplementedError

20 changes: 20 additions & 0 deletions tests/evaluation/test_dataset_repository.py
@@ -24,6 +24,26 @@ def file_dataset_repository(tmp_path: Path) -> FileDatasetRepository:
]


+@mark.parametrize(
+    "repository_fixture",
+    test_repository_fixtures,
+)
+def test_dataset_repository_with_custom_id(
+    repository_fixture: str,
+    request: FixtureRequest,
+    dummy_string_example: Example[DummyStringInput, DummyStringOutput],
+) -> None:
+    dataset_repository: DatasetRepository = request.getfixturevalue(repository_fixture)
+
+    dataset = dataset_repository.create_dataset(
+        examples=[dummy_string_example],
+        dataset_name="test-dataset",
+        id="my-custom-dataset-id",
+    )
+
+    assert dataset.id == "my-custom-dataset-id"
+
+
@mark.parametrize(
    "repository_fixture",
    test_repository_fixtures,
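A companion sketch (not part of the commit) that would pin down the default behaviour of the same fixtures when no `id` is passed:

@mark.parametrize(
    "repository_fixture",
    test_repository_fixtures,
)
def test_dataset_repository_generates_id_when_none_is_given(
    repository_fixture: str,
    request: FixtureRequest,
    dummy_string_example: Example[DummyStringInput, DummyStringOutput],
) -> None:
    dataset_repository: DatasetRepository = request.getfixturevalue(repository_fixture)

    dataset = dataset_repository.create_dataset(
        examples=[dummy_string_example],
        dataset_name="test-dataset",
    )

    assert dataset.id  # an ID was generated automatically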
27 changes: 19 additions & 8 deletions tests/evaluation/test_hugging_face_dataset_repository.py
@@ -16,20 +16,26 @@ class DummyAggregatedEvaluation(BaseModel):

@fixture(scope="session")
def hugging_face_dataset_repository_id() -> str:
-    return "Aleph-Alpha/test-datasets"
+    return f"Aleph-Alpha/test-datasets-{str(uuid4())}"


-@fixture(scope="session")
+# @fixture(scope="session")
+@fixture
def hugging_face_dataset_repository(
    hugging_face_dataset_repository_id: str, hugging_face_token: str
-) -> HuggingFaceDatasetRepository:
+) -> Iterable[HuggingFaceDatasetRepository]:
    # this repository should already exist and does not have to be deleted after the tests
-    return HuggingFaceDatasetRepository(
+    repo = HuggingFaceDatasetRepository(
        repository_id=hugging_face_dataset_repository_id,
        token=hugging_face_token,
        private=True,
    )
+
+    try:
+        yield repo
+    finally:
+        repo.delete_repository()
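The new teardown calls `delete_repository()`, which is not shown in this diff. A hypothetical sketch of what it is assumed to do via the Hugging Face Hub client (the real method belongs to the repository class and may differ):

from huggingface_hub import HfApi


def delete_repository(repository_id: str, token: str) -> None:
    # Hypothetical: remove the private dataset repo that was created for this test.
    HfApi().delete_repo(repo_id=repository_id, token=token, repo_type="dataset")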


@fixture
def example_1() -> Example[str, str]:
@@ -50,14 +56,19 @@ def hugging_face_repository_with_dataset_and_examples(
    Tuple[HuggingFaceDatasetRepository, Dataset, Sequence[Example[str, str]]]
]:
    examples = [example_1, example_2]
-    dataset = hugging_face_dataset_repository.create_dataset(
-        examples=examples, dataset_name="test-hg-dataset"
-    )
+    id = str(uuid4())
+    try:
+        dataset = hugging_face_dataset_repository.create_dataset(
+            examples=examples, dataset_name="test-hg-dataset", id=id
+        )
+    except Exception as e:
+        hugging_face_dataset_repository.delete_dataset(id)
+        raise e

    try:
        yield hugging_face_dataset_repository, dataset, examples
    finally:
-        hugging_face_dataset_repository.delete_dataset(dataset.id)
+        hugging_face_dataset_repository.delete_dataset(id)


def test_hugging_face_repository_can_create_and_delete_a_repository(
