From 70c7c22fc677697276063f1549ac21fee981756b Mon Sep 17 00:00:00 2001
From: FelixFehse <155464791+FelixFehse@users.noreply.github.com>
Date: Thu, 4 Apr 2024 11:00:56 +0200
Subject: [PATCH] refactor: IL-414 create new Huggingface dataset for each
 test (#691)

This will hopefully fix the flaky Huggingface tests.

---------

Co-authored-by: FelixFehse
---
 CHANGELOG.md                                  |  7 ++---
 scripts/test.sh                               |  2 +-
 .../evaluation/dataset/dataset_repository.py  |  6 ++++-
 .../dataset/file_dataset_repository.py        | 19 +++++++------
 .../dataset/in_memory_dataset_repository.py   |  7 ++++-
 .../single_huggingface_dataset_repository.py  |  5 +++-
 tests/evaluation/test_dataset_repository.py   | 20 ++++++++++++++
 .../test_hugging_face_dataset_repository.py   | 27 +++++++++++++------
 8 files changed, 70 insertions(+), 23 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c6bb4f600..31e55d21d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,9 +7,10 @@
 ### New Features
 
 - feature: Error information is printed to the console on failed runs and evaluations.
 - feature: The stack trace of a failed run/evaluation is included in the `FailedExampleRun`/`FailedExampleEvaluation` object
-- feature: The `Runner.run_dataset` and `Evaluator.evaluate_run` have an optional flag `abort_on_error` to stop running/evaluating when an error occurs.
-- feature: Added `Runner.failed_runs` and `Evaluator.failed_evaluations` to retrieve all failed run / evaluation lineages
-- feature: Added `.successful_example_outputs` and `.failed_example_outputs` to `RunRepository` to match the evaluation repository
+- feature: The `Runner.run_dataset(..)` and `Evaluator.evaluate_run(..)` have an optional flag `abort_on_error` to stop running/evaluating when an error occurs.
+- feature: Added `Runner.failed_runs(..)` and `Evaluator.failed_evaluations(..)` to retrieve all failed run / evaluation lineages
+- feature: Added `.successful_example_outputs(..)` and `.failed_example_outputs(..)` to `RunRepository` to match the evaluation repository
+- feature: Added optional argument to set an id when creating a `Dataset` via `DatasetRepository.create_dataset(..)`
 
 ### Fixes
diff --git a/scripts/test.sh b/scripts/test.sh
index 82abaac1e..98660e4b7 100755
--- a/scripts/test.sh
+++ b/scripts/test.sh
@@ -1,4 +1,4 @@
 #!/usr/bin/env -S bash -eu -o pipefail
 
 poetry run python3 -c "import nltk; nltk.download('punkt')"
-poetry run pytest -n 10
+poetry run pytest -n auto
diff --git a/src/intelligence_layer/evaluation/dataset/dataset_repository.py b/src/intelligence_layer/evaluation/dataset/dataset_repository.py
index a97301e3f..aecf89b24 100644
--- a/src/intelligence_layer/evaluation/dataset/dataset_repository.py
+++ b/src/intelligence_layer/evaluation/dataset/dataset_repository.py
@@ -17,13 +17,17 @@ class DatasetRepository(ABC):
 
     @abstractmethod
     def create_dataset(
-        self, examples: Iterable[Example[Input, ExpectedOutput]], dataset_name: str
+        self,
+        examples: Iterable[Example[Input, ExpectedOutput]],
+        dataset_name: str,
+        id: str | None = None,
     ) -> Dataset:
         """Creates a dataset from given :class:`Example`s and returns the ID of that dataset.
 
         Args:
             examples: An :class:`Iterable` of :class:`Example`s to be saved in the same dataset.
             dataset_name: A name for the dataset.
+            id: The dataset ID. If `None`, an ID will be generated.
 
         Returns:
             The created :class:`Dataset`.
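The new optional `id` parameter above is what the per-test HuggingFace repositories further down rely on. A minimal sketch of how calling code might use it, assuming the `InMemoryDatasetRepository` from this patch and the usual `Example(input=..., expected_output=...)` constructor; the exact import path is an assumption, not taken from this diff:

```python
# Sketch only: import path and Example fields are assumed, not part of this patch.
from intelligence_layer.evaluation import Example, InMemoryDatasetRepository

repository = InMemoryDatasetRepository()

# Without `id`, the repository generates a random dataset ID, as before this patch.
generated = repository.create_dataset(
    examples=[Example(input="What is 1+1?", expected_output="2")],
    dataset_name="arithmetic",
)

# With `id`, the caller pins the dataset ID, e.g. so a test can clean it up later.
pinned = repository.create_dataset(
    examples=[Example(input="What is 2+2?", expected_output="4")],
    dataset_name="arithmetic",
    id="my-fixed-dataset-id",
)
assert pinned.id == "my-fixed-dataset-id"
```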
diff --git a/src/intelligence_layer/evaluation/dataset/file_dataset_repository.py b/src/intelligence_layer/evaluation/dataset/file_dataset_repository.py
index 833abce87..cbe2c4a80 100644
--- a/src/intelligence_layer/evaluation/dataset/file_dataset_repository.py
+++ b/src/intelligence_layer/evaluation/dataset/file_dataset_repository.py
@@ -23,9 +23,15 @@ def __init__(self, filesystem: AbstractFileSystem, root_directory: Path) -> None
         super().__init__(file_system=filesystem, root_directory=root_directory)
 
     def create_dataset(
-        self, examples: Iterable[Example[Input, ExpectedOutput]], dataset_name: str
+        self,
+        examples: Iterable[Example[Input, ExpectedOutput]],
+        dataset_name: str,
+        id: str | None = None,
     ) -> Dataset:
         dataset = Dataset(name=dataset_name)
+        if id is not None:
+            dataset.id = id
+
         self.mkdir(self._dataset_directory(dataset.id))
 
         dataset_path = self._dataset_path(dataset.id)
@@ -139,13 +145,10 @@ def _write_data(
         file_path: Path,
         data_to_write: Iterable[PydanticSerializable],
     ) -> None:
-        with self._file_system.open(
-            self.path_to_str(file_path), "w", encoding="utf-8"
-        ) as file:
-            for data_chunk in data_to_write:
-                serialized_result = JsonSerializer(root=data_chunk)
-                json_string = serialized_result.model_dump_json() + "\n"
-                file.write(json_string)
+        data = "\n".join(
+            JsonSerializer(root=chunk).model_dump_json() for chunk in data_to_write
+        )
+        self.write_utf8(file_path, data, create_parents=True)
 
 
 class FileDatasetRepository(FileSystemDatasetRepository):
diff --git a/src/intelligence_layer/evaluation/dataset/in_memory_dataset_repository.py b/src/intelligence_layer/evaluation/dataset/in_memory_dataset_repository.py
index 36bc35931..4504b98a5 100644
--- a/src/intelligence_layer/evaluation/dataset/in_memory_dataset_repository.py
+++ b/src/intelligence_layer/evaluation/dataset/in_memory_dataset_repository.py
@@ -19,9 +19,14 @@ def __init__(self) -> None:
         ] = {}
 
     def create_dataset(
-        self, examples: Iterable[Example[Input, ExpectedOutput]], dataset_name: str
+        self,
+        examples: Iterable[Example[Input, ExpectedOutput]],
+        dataset_name: str,
+        id: str | None = None,
     ) -> Dataset:
         dataset = Dataset(name=dataset_name)
+        if id is not None:
+            dataset.id = id
         if dataset.id in self._datasets_and_examples:
             raise ValueError(
                 f"Created random dataset ID already exists for dataset {dataset}. This should not happen."
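Aside: the `_write_data` rewrite above trades a streamed, chunk-by-chunk write for a single buffered `write_utf8` call. A self-contained sketch of the same JSONL pattern using only `pathlib` and `pydantic`; all names here are illustrative and not from the SDK:

```python
from pathlib import Path

from pydantic import BaseModel


class Record(BaseModel):
    """Stand-in for the PydanticSerializable payloads in the patch."""

    question: str
    answer: str


def write_jsonl(path: Path, records: list[Record]) -> None:
    # Serialize every record to one JSON document per line, join them up front,
    # and write in a single call, mirroring the switch to write_utf8 above.
    data = "\n".join(record.model_dump_json() for record in records)
    path.parent.mkdir(parents=True, exist_ok=True)  # analogous to create_parents=True
    path.write_text(data, encoding="utf-8")


write_jsonl(Path("out/records.jsonl"), [Record(question="What is 1+1?", answer="2")])
```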
diff --git a/src/intelligence_layer/evaluation/dataset/single_huggingface_dataset_repository.py b/src/intelligence_layer/evaluation/dataset/single_huggingface_dataset_repository.py
index 184878f9a..6c25dccbe 100644
--- a/src/intelligence_layer/evaluation/dataset/single_huggingface_dataset_repository.py
+++ b/src/intelligence_layer/evaluation/dataset/single_huggingface_dataset_repository.py
@@ -28,7 +28,10 @@ def __init__(
         self._huggingface_dataset = huggingface_dataset
 
     def create_dataset(
-        self, examples: Iterable[Example[Input, ExpectedOutput]], dataset_name: str
+        self,
+        examples: Iterable[Example[Input, ExpectedOutput]],
+        dataset_name: str,
+        id: str | None = None,
     ) -> Dataset:
         raise NotImplementedError
 
diff --git a/tests/evaluation/test_dataset_repository.py b/tests/evaluation/test_dataset_repository.py
index cf6ea7c70..c1f202a57 100644
--- a/tests/evaluation/test_dataset_repository.py
+++ b/tests/evaluation/test_dataset_repository.py
@@ -24,6 +24,26 @@ def file_dataset_repository(tmp_path: Path) -> FileDatasetRepository:
 ]
 
 
+@mark.parametrize(
+    "repository_fixture",
+    test_repository_fixtures,
+)
+def test_dataset_repository_with_custom_id(
+    repository_fixture: str,
+    request: FixtureRequest,
+    dummy_string_example: Example[DummyStringInput, DummyStringOutput],
+) -> None:
+    dataset_repository: DatasetRepository = request.getfixturevalue(repository_fixture)
+
+    dataset = dataset_repository.create_dataset(
+        examples=[dummy_string_example],
+        dataset_name="test-dataset",
+        id="my-custom-dataset-id",
+    )
+
+    assert dataset.id == "my-custom-dataset-id"
+
+
 @mark.parametrize(
     "repository_fixture",
     test_repository_fixtures,
diff --git a/tests/evaluation/test_hugging_face_dataset_repository.py b/tests/evaluation/test_hugging_face_dataset_repository.py
index 3473063a6..70a606509 100644
--- a/tests/evaluation/test_hugging_face_dataset_repository.py
+++ b/tests/evaluation/test_hugging_face_dataset_repository.py
@@ -16,20 +16,26 @@ class DummyAggregatedEvaluation(BaseModel):
 
 @fixture(scope="session")
 def hugging_face_dataset_repository_id() -> str:
-    return "Aleph-Alpha/test-datasets"
+    return f"Aleph-Alpha/test-datasets-{str(uuid4())}"
 
 
-@fixture(scope="session")
+# @fixture(scope="session")
+@fixture
 def hugging_face_dataset_repository(
     hugging_face_dataset_repository_id: str, hugging_face_token: str
-) -> HuggingFaceDatasetRepository:
+) -> Iterable[HuggingFaceDatasetRepository]:
     # this repository should already exist and does not have to be deleted after the tests
-    return HuggingFaceDatasetRepository(
+    repo = HuggingFaceDatasetRepository(
         repository_id=hugging_face_dataset_repository_id,
         token=hugging_face_token,
         private=True,
    )
+    try:
+        yield repo
+    finally:
+        repo.delete_repository()
+
 
 @fixture
 def example_1() -> Example[str, str]:
@@ -50,14 +56,19 @@ def hugging_face_repository_with_dataset_and_examples(
     Tuple[HuggingFaceDatasetRepository, Dataset, Sequence[Example[str, str]]]
 ]:
     examples = [example_1, example_2]
-    dataset = hugging_face_dataset_repository.create_dataset(
-        examples=examples, dataset_name="test-hg-dataset"
-    )
+    id = str(uuid4())
+    try:
+        dataset = hugging_face_dataset_repository.create_dataset(
+            examples=examples, dataset_name="test-hg-dataset", id=id
+        )
+    except Exception as e:
+        hugging_face_dataset_repository.delete_dataset(id)
+        raise e
 
     try:
         yield hugging_face_dataset_repository, dataset, examples
     finally:
-        hugging_face_dataset_repository.delete_dataset(dataset.id)
+        hugging_face_dataset_repository.delete_dataset(id)
 
 
 def test_hugging_face_repository_can_create_and_delete_a_repository(
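The fixture changes in this last file are the heart of the flakiness fix: each test run now gets a uniquely named repository (via `uuid4`) and deletes it afterwards, instead of sharing one session-scoped repository. A generic pytest sketch of the same create-yield-delete pattern, with a dummy class standing in for `HuggingFaceDatasetRepository`; all names are illustrative:

```python
from collections.abc import Iterator
from uuid import uuid4

from pytest import fixture


class FakeRemoteRepository:
    """Stand-in for HuggingFaceDatasetRepository; only the cleanup hook matters."""

    def __init__(self, repository_id: str) -> None:
        self.repository_id = repository_id

    def delete_repository(self) -> None:
        print(f"deleted {self.repository_id}")


@fixture
def remote_repository() -> Iterator[FakeRemoteRepository]:
    # A unique name per test avoids collisions between parallel pytest workers
    # (relevant now that test.sh runs `pytest -n auto`).
    repo = FakeRemoteRepository(repository_id=f"test-datasets-{uuid4()}")
    try:
        yield repo
    finally:
        # Runs even when the test body fails, so no orphaned repositories remain.
        repo.delete_repository()
```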