From 70c7c22fc677697276063f1549ac21fee981756b Mon Sep 17 00:00:00 2001
From: FelixFehse <155464791+FelixFehse@users.noreply.github.com>
Date: Thu, 4 Apr 2024 11:00:56 +0200
Subject: [PATCH] refactor: IL-414 create new Huggingface dataset for each
 test (#691)

This will hopefully fix the flaky Huggingface tests.

---------

Co-authored-by: FelixFehse
---
 CHANGELOG.md                                  |  7 ++---
 scripts/test.sh                               |  2 +-
 .../evaluation/dataset/dataset_repository.py  |  6 ++++-
 .../dataset/file_dataset_repository.py        | 19 +++++++------
 .../dataset/in_memory_dataset_repository.py   |  7 ++++-
 .../single_huggingface_dataset_repository.py  |  5 +++-
 tests/evaluation/test_dataset_repository.py   | 20 ++++++++++++++
 .../test_hugging_face_dataset_repository.py   | 27 +++++++++++++------
 8 files changed, 70 insertions(+), 23 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c6bb4f600..31e55d21d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,9 +7,10 @@
 ### New Features
 
 - feature: Error information is printed to the console on failed runs and evaluations.
 - feature: The stack trace of a failed run/evaluation is included in the `FailedExampleRun`/`FailedExampleEvaluation` object
-- feature: The `Runner.run_dataset` and `Evaluator.evaluate_run` have an optional flag `abort_on_error` to stop running/evaluating when an error occurs.
-- feature: Added `Runner.failed_runs` and `Evaluator.failed_evaluations` to retrieve all failed run / evaluation lineages
-- feature: Added `.successful_example_outputs` and `.failed_example_outputs` to `RunRepository` to match the evaluation repository
+- feature: The `Runner.run_dataset(..)` and `Evaluator.evaluate_run(..)` have an optional flag `abort_on_error` to stop running/evaluating when an error occurs.
+- feature: Added `Runner.failed_runs(..)` and `Evaluator.failed_evaluations(..)` to retrieve all failed run / evaluation lineages
+- feature: Added `.successful_example_outputs(..)` and `.failed_example_outputs(..)` to `RunRepository` to match the evaluation repository
+- feature: Added optional argument to set an id when creating a `Dataset` via `DatasetRepository.create_dataset(..)`
 
 ### Fixes
diff --git a/scripts/test.sh b/scripts/test.sh
index 82abaac1e..98660e4b7 100755
--- a/scripts/test.sh
+++ b/scripts/test.sh
@@ -1,4 +1,4 @@
 #!/usr/bin/env -S bash -eu -o pipefail
 
 poetry run python3 -c "import nltk; nltk.download('punkt')"
-poetry run pytest -n 10
+poetry run pytest -n auto
diff --git a/src/intelligence_layer/evaluation/dataset/dataset_repository.py b/src/intelligence_layer/evaluation/dataset/dataset_repository.py
index a97301e3f..aecf89b24 100644
--- a/src/intelligence_layer/evaluation/dataset/dataset_repository.py
+++ b/src/intelligence_layer/evaluation/dataset/dataset_repository.py
@@ -17,13 +17,17 @@ class DatasetRepository(ABC):
 
     @abstractmethod
     def create_dataset(
-        self, examples: Iterable[Example[Input, ExpectedOutput]], dataset_name: str
+        self,
+        examples: Iterable[Example[Input, ExpectedOutput]],
+        dataset_name: str,
+        id: str | None = None,
     ) -> Dataset:
         """Creates a dataset from given :class:`Example`s and returns the ID of that dataset.
 
         Args:
             examples: An :class:`Iterable` of :class:`Example`s to be saved in the same dataset.
             dataset_name: A name for the dataset.
+            id: The dataset ID. If `None`, an ID will be generated.
 
         Returns:
             The created :class:`Dataset`.
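The new optional `id` parameter above is what the per-test HuggingFace repositories further down rely on. A minimal sketch of how calling code might use it, assuming the `InMemoryDatasetRepository` from this patch and the usual `Example(input=..., expected_output=...)` constructor; the exact import path is an assumption, not taken from this diff:

```python
# Sketch only: import path and Example fields are assumed, not part of this patch.
from intelligence_layer.evaluation import Example, InMemoryDatasetRepository

repository = InMemoryDatasetRepository()

# Without `id`, the repository generates a random dataset ID, as before this patch.
generated = repository.create_dataset(
    examples=[Example(input="What is 1+1?", expected_output="2")],
    dataset_name="arithmetic",
)

# With `id`, the caller pins the dataset ID, e.g. so a test can clean it up later.
pinned = repository.create_dataset(
    examples=[Example(input="What is 2+2?", expected_output="4")],
    dataset_name="arithmetic",
    id="my-fixed-dataset-id",
)
assert pinned.id == "my-fixed-dataset-id"
```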
diff --git a/src/intelligence_layer/evaluation/dataset/file_dataset_repository.py b/src/intelligence_layer/evaluation/dataset/file_dataset_repository.py
index 833abce87..cbe2c4a80 100644
--- a/src/intelligence_layer/evaluation/dataset/file_dataset_repository.py
+++ b/src/intelligence_layer/evaluation/dataset/file_dataset_repository.py
@@ -23,9 +23,15 @@ def __init__(self, filesystem: AbstractFileSystem, root_directory: Path) -> None
         super().__init__(file_system=filesystem, root_directory=root_directory)
 
     def create_dataset(
-        self, examples: Iterable[Example[Input, ExpectedOutput]], dataset_name: str
+        self,
+        examples: Iterable[Example[Input, ExpectedOutput]],
+        dataset_name: str,
+        id: str | None = None,
     ) -> Dataset:
         dataset = Dataset(name=dataset_name)
+        if id is not None:
+            dataset.id = id
+
         self.mkdir(self._dataset_directory(dataset.id))
 
         dataset_path = self._dataset_path(dataset.id)
@@ -139,13 +145,10 @@ def _write_data(
         file_path: Path,
         data_to_write: Iterable[PydanticSerializable],
     ) -> None:
-        with self._file_system.open(
-            self.path_to_str(file_path), "w", encoding="utf-8"
-        ) as file:
-            for data_chunk in data_to_write:
-                serialized_result = JsonSerializer(root=data_chunk)
-                json_string = serialized_result.model_dump_json() + "\n"
-                file.write(json_string)
+        data = "\n".join(
+            JsonSerializer(root=chunk).model_dump_json() for chunk in data_to_write
+        )
+        self.write_utf8(file_path, data, create_parents=True)
 
 
 class FileDatasetRepository(FileSystemDatasetRepository):
diff --git a/src/intelligence_layer/evaluation/dataset/in_memory_dataset_repository.py b/src/intelligence_layer/evaluation/dataset/in_memory_dataset_repository.py
index 36bc35931..4504b98a5 100644
--- a/src/intelligence_layer/evaluation/dataset/in_memory_dataset_repository.py
+++ b/src/intelligence_layer/evaluation/dataset/in_memory_dataset_repository.py
@@ -19,9 +19,14 @@ def __init__(self) -> None:
         ] = {}
 
     def create_dataset(
-        self, examples: Iterable[Example[Input, ExpectedOutput]], dataset_name: str
+        self,
+        examples: Iterable[Example[Input, ExpectedOutput]],
+        dataset_name: str,
+        id: str | None = None,
     ) -> Dataset:
         dataset = Dataset(name=dataset_name)
+        if id is not None:
+            dataset.id = id
         if dataset.id in self._datasets_and_examples:
             raise ValueError(
                 f"Created random dataset ID already exists for dataset {dataset}. This should not happen."
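Aside: the `_write_data` rewrite above trades a streamed, chunk-by-chunk write for a single buffered `write_utf8` call. A self-contained sketch of the same JSONL pattern using only `pathlib` and `pydantic`; all names here are illustrative and not from the SDK:

```python
from pathlib import Path

from pydantic import BaseModel


class Record(BaseModel):
    """Stand-in for the PydanticSerializable payloads in the patch."""

    question: str
    answer: str


def write_jsonl(path: Path, records: list[Record]) -> None:
    # Serialize every record to one JSON document per line, join them up front,
    # and write in a single call, mirroring the switch to write_utf8 above.
    data = "\n".join(record.model_dump_json() for record in records)
    path.parent.mkdir(parents=True, exist_ok=True)  # analogous to create_parents=True
    path.write_text(data, encoding="utf-8")


write_jsonl(Path("out/records.jsonl"), [Record(question="What is 1+1?", answer="2")])
```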
diff --git a/src/intelligence_layer/evaluation/dataset/single_huggingface_dataset_repository.py b/src/intelligence_layer/evaluation/dataset/single_huggingface_dataset_repository.py
index 184878f9a..6c25dccbe 100644
--- a/src/intelligence_layer/evaluation/dataset/single_huggingface_dataset_repository.py
+++ b/src/intelligence_layer/evaluation/dataset/single_huggingface_dataset_repository.py
@@ -28,7 +28,10 @@ def __init__(
         self._huggingface_dataset = huggingface_dataset
 
     def create_dataset(
-        self, examples: Iterable[Example[Input, ExpectedOutput]], dataset_name: str
+        self,
+        examples: Iterable[Example[Input, ExpectedOutput]],
+        dataset_name: str,
+        id: str | None = None,
     ) -> Dataset:
         raise NotImplementedError
 
diff --git a/tests/evaluation/test_dataset_repository.py b/tests/evaluation/test_dataset_repository.py
index cf6ea7c70..c1f202a57 100644
--- a/tests/evaluation/test_dataset_repository.py
+++ b/tests/evaluation/test_dataset_repository.py
@@ -24,6 +24,26 @@ def file_dataset_repository(tmp_path: Path) -> FileDatasetRepository:
 ]
 
 
+@mark.parametrize(
+    "repository_fixture",
+    test_repository_fixtures,
+)
+def test_dataset_repository_with_custom_id(
+    repository_fixture: str,
+    request: FixtureRequest,
+    dummy_string_example: Example[DummyStringInput, DummyStringOutput],
+) -> None:
+    dataset_repository: DatasetRepository = request.getfixturevalue(repository_fixture)
+
+    dataset = dataset_repository.create_dataset(
+        examples=[dummy_string_example],
+        dataset_name="test-dataset",
+        id="my-custom-dataset-id",
+    )
+
+    assert dataset.id == "my-custom-dataset-id"
+
+
 @mark.parametrize(
     "repository_fixture",
     test_repository_fixtures,
diff --git a/tests/evaluation/test_hugging_face_dataset_repository.py b/tests/evaluation/test_hugging_face_dataset_repository.py
index 3473063a6..70a606509 100644
--- a/tests/evaluation/test_hugging_face_dataset_repository.py
+++ b/tests/evaluation/test_hugging_face_dataset_repository.py
@@ -16,20 +16,26 @@ class DummyAggregatedEvaluation(BaseModel):
 
 @fixture(scope="session")
 def hugging_face_dataset_repository_id() -> str:
-    return "Aleph-Alpha/test-datasets"
+    return f"Aleph-Alpha/test-datasets-{str(uuid4())}"
 
 
-@fixture(scope="session")
+# @fixture(scope="session")
+@fixture
 def hugging_face_dataset_repository(
     hugging_face_dataset_repository_id: str, hugging_face_token: str
-) -> HuggingFaceDatasetRepository:
+) -> Iterable[HuggingFaceDatasetRepository]:
     # this repository should already exist and does not have to be deleted after the tests
-    return HuggingFaceDatasetRepository(
+    repo = HuggingFaceDatasetRepository(
         repository_id=hugging_face_dataset_repository_id,
         token=hugging_face_token,
         private=True,
    )
+    try:
+        yield repo
+    finally:
+        repo.delete_repository()
+
 
 @fixture
 def example_1() -> Example[str, str]:
@@ -50,14 +56,19 @@ def hugging_face_repository_with_dataset_and_examples(
     Tuple[HuggingFaceDatasetRepository, Dataset, Sequence[Example[str, str]]]
 ]:
     examples = [example_1, example_2]
-    dataset = hugging_face_dataset_repository.create_dataset(
-        examples=examples, dataset_name="test-hg-dataset"
-    )
+    id = str(uuid4())
+    try:
+        dataset = hugging_face_dataset_repository.create_dataset(
+            examples=examples, dataset_name="test-hg-dataset", id=id
+        )
+    except Exception as e:
+        hugging_face_dataset_repository.delete_dataset(id)
+        raise e
 
     try:
         yield hugging_face_dataset_repository, dataset, examples
     finally:
-        hugging_face_dataset_repository.delete_dataset(dataset.id)
+        hugging_face_dataset_repository.delete_dataset(id)
 
 
 def test_hugging_face_repository_can_create_and_delete_a_repository(
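The fixture changes in this last file are the heart of the flakiness fix: each test run now gets a uniquely named repository (via `uuid4`) and deletes it afterwards, instead of sharing one session-scoped repository. A generic pytest sketch of the same create-yield-delete pattern, with a dummy class standing in for `HuggingFaceDatasetRepository`; all names are illustrative:

```python
from collections.abc import Iterator
from uuid import uuid4

from pytest import fixture


class FakeRemoteRepository:
    """Stand-in for HuggingFaceDatasetRepository; only the cleanup hook matters."""

    def __init__(self, repository_id: str) -> None:
        self.repository_id = repository_id

    def delete_repository(self) -> None:
        print(f"deleted {self.repository_id}")


@fixture
def remote_repository() -> Iterator[FakeRemoteRepository]:
    # A unique name per test avoids collisions between parallel pytest workers
    # (relevant now that test.sh runs `pytest -n auto`).
    repo = FakeRemoteRepository(repository_id=f"test-datasets-{uuid4()}")
    try:
        yield repo
    finally:
        # Runs even when the test body fails, so no orphaned repositories remain.
        repo.delete_repository()
```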