refactor: IL-414 create new Huggingface dataset for each test (#691)
This should fix the flaky Hugging Face tests by creating a fresh, uniquely named Hugging Face dataset (and test repository) for each test and cleaning it up afterwards.

---------

Co-authored-by: FelixFehse <[email protected]>
FelixFehse and FelixFehseTNG authored Apr 4, 2024
1 parent 8a0e2ef commit 70c7c22
Showing 8 changed files with 70 additions and 23 deletions.
7 changes: 4 additions & 3 deletions CHANGELOG.md
@@ -7,9 +7,10 @@
### New Features
- feature: Error information is printed to the console on failed runs and evaluations.
- feature: The stack trace of a failed run/evaluation is included in the `FailedExampleRun`/`FailedExampleEvaluation` object
-- feature: The `Runner.run_dataset` and `Evaluator.evaluate_run` have an optional flag `abort_on_error` to stop running/evaluating when an error occurs.
-- feature: Added `Runner.failed_runs` and `Evaluator.failed_evaluations` to retrieve all failed run / evaluation lineages
-- feature: Added `.successful_example_outputs` and `.failed_example_outputs` to `RunRepository` to match the evaluation repository
+- feature: The `Runner.run_dataset(..)` and `Evaluator.evaluate_run(..)` have an optional flag `abort_on_error` to stop running/evaluating when an error occurs.
+- feature: Added `Runner.failed_runs(..)` and `Evaluator.failed_evaluations(..)` to retrieve all failed run / evaluation lineages
+- feature: Added `.successful_example_outputs(..)` and `.failed_example_outputs(..)` to `RunRepository` to match the evaluation repository
+- feature: Added optional argument to set an id when creating a `Dataset` via `DatasetRepository.create_dataset(..)`

### Fixes

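Illustrative usage of the new optional `id` argument on `DatasetRepository.create_dataset(..)` (not part of the commit; the import path, the in-memory repository class name, and the `Example` constructor fields are assumptions):

from intelligence_layer.evaluation import Example, InMemoryDatasetRepository  # assumed import path

repository = InMemoryDatasetRepository()

# With the default `id=None` the repository generates an ID;
# passing `id` explicitly lets the caller control it.
dataset = repository.create_dataset(
    examples=[Example(input="What is 1+1?", expected_output="2")],
    dataset_name="my-dataset",
    id="my-custom-dataset-id",
)
assert dataset.id == "my-custom-dataset-id"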
2 changes: 1 addition & 1 deletion scripts/test.sh
@@ -1,4 +1,4 @@
#!/usr/bin/env -S bash -eu -o pipefail

poetry run python3 -c "import nltk; nltk.download('punkt')"
-poetry run pytest -n 10
+poetry run pytest -n auto
@@ -17,13 +17,17 @@ class DatasetRepository(ABC):

    @abstractmethod
    def create_dataset(
-        self, examples: Iterable[Example[Input, ExpectedOutput]], dataset_name: str
+        self,
+        examples: Iterable[Example[Input, ExpectedOutput]],
+        dataset_name: str,
+        id: str | None = None,
    ) -> Dataset:
        """Creates a dataset from given :class:`Example`s and returns the ID of that dataset.

        Args:
            examples: An :class:`Iterable` of :class:`Example`s to be saved in the same dataset.
            dataset_name: A name for the dataset.
+            id: The dataset ID. If `None`, an ID will be generated.

        Returns:
            The created :class:`Dataset`.
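The docstring above states that an ID is generated when `id` is `None`. The `Dataset` model itself is not part of this diff; a plausible sketch of how such a default could be provided (an assumption about the model, not its actual definition):

from uuid import uuid4

from pydantic import BaseModel, Field


class Dataset(BaseModel):
    # Hypothetical: the real model may generate and store its ID differently.
    id: str = Field(default_factory=lambda: str(uuid4()))
    name: str

This would explain why the concrete repositories below first construct `Dataset(name=...)` and only overwrite `dataset.id` when a custom `id` is supplied.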
@@ -23,9 +23,15 @@ def __init__(self, filesystem: AbstractFileSystem, root_directory: Path) -> None
        super().__init__(file_system=filesystem, root_directory=root_directory)

    def create_dataset(
-        self, examples: Iterable[Example[Input, ExpectedOutput]], dataset_name: str
+        self,
+        examples: Iterable[Example[Input, ExpectedOutput]],
+        dataset_name: str,
+        id: str | None = None,
    ) -> Dataset:
        dataset = Dataset(name=dataset_name)
+        if id is not None:
+            dataset.id = id
+
        self.mkdir(self._dataset_directory(dataset.id))

        dataset_path = self._dataset_path(dataset.id)
@@ -139,13 +145,10 @@ def _write_data(
        file_path: Path,
        data_to_write: Iterable[PydanticSerializable],
    ) -> None:
-        with self._file_system.open(
-            self.path_to_str(file_path), "w", encoding="utf-8"
-        ) as file:
-            for data_chunk in data_to_write:
-                serialized_result = JsonSerializer(root=data_chunk)
-                json_string = serialized_result.model_dump_json() + "\n"
-                file.write(json_string)
+        data = "\n".join(
+            JsonSerializer(root=chunk).model_dump_json() for chunk in data_to_write
+        )
+        self.write_utf8(file_path, data, create_parents=True)


class FileDatasetRepository(FileSystemDatasetRepository):
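The rewritten `_write_data` above joins all serialized chunks into a single JSON-Lines string and delegates the write to `write_utf8`, which is defined outside this diff. A rough sketch of what that helper is assumed to do on top of the fsspec `AbstractFileSystem` (hypothetical; the real helper lives in the repository base class):

from pathlib import Path

from fsspec import AbstractFileSystem


def write_utf8(
    file_system: AbstractFileSystem,
    file_path: Path,
    content: str,
    create_parents: bool = False,
) -> None:
    # Hypothetical stand-in for the base-class method used above.
    if create_parents:
        file_system.makedirs(str(file_path.parent), exist_ok=True)
    with file_system.open(str(file_path), "w", encoding="utf-8") as file:
        file.write(content)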
@@ -19,9 +19,14 @@ def __init__(self) -> None:
        ] = {}

    def create_dataset(
-        self, examples: Iterable[Example[Input, ExpectedOutput]], dataset_name: str
+        self,
+        examples: Iterable[Example[Input, ExpectedOutput]],
+        dataset_name: str,
+        id: str | None = None,
    ) -> Dataset:
        dataset = Dataset(name=dataset_name)
+        if id is not None:
+            dataset.id = id
        if dataset.id in self._datasets_and_examples:
            raise ValueError(
                f"Created random dataset ID already exists for dataset {dataset}. This should not happen."
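With a caller-supplied `id`, the duplicate check above becomes reachable from the outside; a small illustrative test (hypothetical — the class name and import path are assumptions):

import pytest

from intelligence_layer.evaluation import Example, InMemoryDatasetRepository  # assumed import path


def test_duplicate_custom_dataset_id_raises() -> None:
    repository = InMemoryDatasetRepository()
    example = Example(input="What is 1+1?", expected_output="2")

    repository.create_dataset(examples=[example], dataset_name="first", id="fixed-id")
    with pytest.raises(ValueError):
        repository.create_dataset(examples=[example], dataset_name="second", id="fixed-id")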
@@ -28,7 +28,10 @@ def __init__(
        self._huggingface_dataset = huggingface_dataset

    def create_dataset(
-        self, examples: Iterable[Example[Input, ExpectedOutput]], dataset_name: str
+        self,
+        examples: Iterable[Example[Input, ExpectedOutput]],
+        dataset_name: str,
+        id: str | None = None,
    ) -> Dataset:
        raise NotImplementedError

20 changes: 20 additions & 0 deletions tests/evaluation/test_dataset_repository.py
@@ -24,6 +24,26 @@ def file_dataset_repository(tmp_path: Path) -> FileDatasetRepository:
]


+@mark.parametrize(
+    "repository_fixture",
+    test_repository_fixtures,
+)
+def test_dataset_repository_with_custom_id(
+    repository_fixture: str,
+    request: FixtureRequest,
+    dummy_string_example: Example[DummyStringInput, DummyStringOutput],
+) -> None:
+    dataset_repository: DatasetRepository = request.getfixturevalue(repository_fixture)
+
+    dataset = dataset_repository.create_dataset(
+        examples=[dummy_string_example],
+        dataset_name="test-dataset",
+        id="my-custom-dataset-id",
+    )
+
+    assert dataset.id == "my-custom-dataset-id"
+
+
@mark.parametrize(
    "repository_fixture",
    test_repository_fixtures,
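A companion sketch (not part of the commit) that would pin down the default behaviour of the same fixtures when no `id` is passed:

@mark.parametrize(
    "repository_fixture",
    test_repository_fixtures,
)
def test_dataset_repository_generates_id_when_none_is_given(
    repository_fixture: str,
    request: FixtureRequest,
    dummy_string_example: Example[DummyStringInput, DummyStringOutput],
) -> None:
    dataset_repository: DatasetRepository = request.getfixturevalue(repository_fixture)

    dataset = dataset_repository.create_dataset(
        examples=[dummy_string_example],
        dataset_name="test-dataset",
    )

    assert dataset.id  # an ID was generated automatically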
27 changes: 19 additions & 8 deletions tests/evaluation/test_hugging_face_dataset_repository.py
@@ -16,20 +16,26 @@ class DummyAggregatedEvaluation(BaseModel):

@fixture(scope="session")
def hugging_face_dataset_repository_id() -> str:
-    return "Aleph-Alpha/test-datasets"
+    return f"Aleph-Alpha/test-datasets-{str(uuid4())}"


-@fixture(scope="session")
+# @fixture(scope="session")
+@fixture
def hugging_face_dataset_repository(
    hugging_face_dataset_repository_id: str, hugging_face_token: str
-) -> HuggingFaceDatasetRepository:
+) -> Iterable[HuggingFaceDatasetRepository]:
    # this repository should already exist and does not have to be deleted after the tests
-    return HuggingFaceDatasetRepository(
+    repo = HuggingFaceDatasetRepository(
        repository_id=hugging_face_dataset_repository_id,
        token=hugging_face_token,
        private=True,
    )
+
+    try:
+        yield repo
+    finally:
+        repo.delete_repository()
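The new teardown calls `delete_repository()`, which is not shown in this diff. A hypothetical sketch of what it is assumed to do via the Hugging Face Hub client (the real method belongs to the repository class and may differ):

from huggingface_hub import HfApi


def delete_repository(repository_id: str, token: str) -> None:
    # Hypothetical: remove the private dataset repo that was created for this test.
    HfApi().delete_repo(repo_id=repository_id, token=token, repo_type="dataset")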


@fixture
def example_1() -> Example[str, str]:
@@ -50,14 +56,19 @@ def hugging_face_repository_with_dataset_and_examples(
    Tuple[HuggingFaceDatasetRepository, Dataset, Sequence[Example[str, str]]]
]:
    examples = [example_1, example_2]
-    dataset = hugging_face_dataset_repository.create_dataset(
-        examples=examples, dataset_name="test-hg-dataset"
-    )
+    id = str(uuid4())
+    try:
+        dataset = hugging_face_dataset_repository.create_dataset(
+            examples=examples, dataset_name="test-hg-dataset", id=id
+        )
+    except Exception as e:
+        hugging_face_dataset_repository.delete_dataset(id)
+        raise e

    try:
        yield hugging_face_dataset_repository, dataset, examples
    finally:
-        hugging_face_dataset_repository.delete_dataset(dataset.id)
+        hugging_face_dataset_repository.delete_dataset(id)


def test_hugging_face_repository_can_create_and_delete_a_repository(
