Skip to content

Commit

Permalink
feat(repository): add data selection methods to run, eval and aggrega…
Browse files Browse the repository at this point in the history
…tion repositories

Task: IL-304, IL-301
  • Loading branch information
MerlinKallenbornTNG authored and Valentina Galata committed Feb 29, 2024
1 parent 7d3cb60 commit 43c0a71
Show file tree
Hide file tree
Showing 11 changed files with 311 additions and 224 deletions.
19 changes: 16 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ The key features of the Intelligence Layer are:
To ensure this, we provide full comprehensibility, by seamlessly logging each step of every workflow.
This enhances your debugging capabilities and offers greater control post-deployment when examining model responses.

### Table of contents
## Table of contents

1. [Getting Started](#getting-started)
2. [Getting started with the Jupyter Notebooks](#getting-started-with-the-jupyter-notebooks)
Expand All @@ -24,8 +24,9 @@ The key features of the Intelligence Layer are:
6. [How to make your own use case](#how-to-make-your-own-use-case)
7. [Running the Trace Viewer](#running-the-trace-viewer)
8. [Using the Intelligence Layer in Docker](#using-the-intelligence-layer-in-docker)
9. [References](#references)
10. [License](#license)
9. [For developers](#for-developers)
10. [References](#references)
11. [License](#license)

## Getting started

Expand Down Expand Up @@ -251,6 +252,18 @@ secrets:
You can read more about this in the [official documentation](https://docs.docker.com/engine/swarm/secrets/).
## For Developers
### Python: Naming Conventions
We follow the [PEP 8 – Style Guide for Python Code](https://peps.python.org/pep-0008/).
In addition, there are the following naming conventions:
* Class method names:
* Use only nouns (substantives) for the name of a method that has no side effects and returns an object
* E.g., `evaluation_overview` which returns an evaluation overview object
* Use a verb for a method name if it has side effects and returns nothing
* E.g., `store_evaluation_overview` which saves a given evaluation overview (and returns nothing)

## References

- Full documentation: https://aleph-alpha-intelligence-layer.readthedocs-hosted.com/en/latest/
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Any, Sequence
from typing import Any, Iterable, Optional, Sequence

from intelligence_layer.evaluation.data_storage.utils import FileBasedRepository
from intelligence_layer.evaluation.domain import (
Expand All @@ -10,21 +10,37 @@


class AggregationRepository(ABC):
"""Base aggregation repository interface.
Provides methods to store and load aggregated evaluation results.
"""

@abstractmethod
def aggregation_ids(self) -> Sequence[str]:
"""Returns the ids of all stored aggregation runs.
Having the id of an aggregation run, its overview can be retrieved with
:meth:`AggregationRepository.aggregation_overview`.
Returns:
The ids of all stored aggregation runs.
"""
pass

@abstractmethod
def aggregation_overview(
self, id: str, stat_type: type[AggregatedEvaluation]
) -> AggregationOverview[AggregatedEvaluation] | None:
"""Returns all failed :class:`ExampleResult` instances of a given run
self, aggregation_id: str, aggregation_type: type[AggregatedEvaluation]
) -> Optional[AggregationOverview[AggregatedEvaluation]]:
"""Returns a specific instance of :class:`AggregationOverview` of a given run
Args:
id: Identifier of the TODO
stat_type:
aggregation_id: Identifier of the aggregation overview
aggregation_type: Type of the aggregation
Returns:
:class:`AggregationOverview` if one was found, `None` otherwise.
"""
...
pass

@abstractmethod
def store_aggregation_overview(
Expand All @@ -35,7 +51,17 @@ def store_aggregation_overview(
Args:
overview: The overview to be persisted.
"""
...
pass

def aggregation_overviews(
self, aggregation_type: type[AggregatedEvaluation]
) -> Iterable[AggregationOverview[AggregatedEvaluation]]:
for aggregation_id in self.aggregation_ids():
aggregation_overview = self.aggregation_overview(
aggregation_id, aggregation_type
)
if aggregation_overview is not None:
yield aggregation_overview


class FileAggregationRepository(AggregationRepository, FileBasedRepository):
Expand All @@ -49,17 +75,18 @@ def _aggregation_directory(self, eval_id: str) -> Path:
path.mkdir(exist_ok=True)
return path

def _aggregation_overview_path(self, id: str) -> Path:
return self._aggregation_directory(id).with_suffix(".json")
def _aggregation_overview_path(self, aggregation_id: str) -> Path:
return self._aggregation_directory(aggregation_id).with_suffix(".json")

def aggregation_overview(
self, id: str, stat_type: type[AggregatedEvaluation]
self, aggregation_id: str, aggregation_type: type[AggregatedEvaluation]
) -> AggregationOverview[AggregatedEvaluation] | None:
file_path = self._aggregation_overview_path(id)
file_path = self._aggregation_overview_path(aggregation_id)
if not file_path.exists():
return None

content = self.read_utf8(file_path)
return AggregationOverview[stat_type].model_validate_json( # type:ignore
return AggregationOverview[aggregation_type].model_validate_json( # type:ignore
content
)

Expand All @@ -80,10 +107,13 @@ def __init__(self) -> None:
super().__init__()
self._aggregation_overviews: dict[str, AggregationOverview[Any]] = dict()

def aggregation_ids(self) -> Sequence[str]:
return list(self._aggregation_overviews.keys())

def aggregation_overview(
self, id: str, stat_type: type[AggregatedEvaluation]
self, aggregation_id: str, aggregation_type: type[AggregatedEvaluation]
) -> AggregationOverview[AggregatedEvaluation] | None:
return self._aggregation_overviews[id]
return self._aggregation_overviews[aggregation_id]

def store_aggregation_overview(
self, overview: AggregationOverview[AggregatedEvaluation]
Expand Down
124 changes: 83 additions & 41 deletions src/intelligence_layer/evaluation/data_storage/dataset_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,38 +11,82 @@


class DatasetRepository(ABC):
"""Base dataset repository interface.
Provides methods to store and load datasets and their examples.
"""

@abstractmethod
def create_dataset(
self,
examples: Iterable[Example[Input, ExpectedOutput]],
) -> str:
"""Creates a dataset from given examples and returns the ID of that dataset.
Args:
examples: An iterable of examples to be saved together under the same dataset ID.
Returns:
The ID of the created dataset.
"""
pass

@abstractmethod
def examples_by_id(
def example(
self,
dataset_id: str,
example_id: str,
input_type: type[Input],
expected_output_type: type[ExpectedOutput],
) -> Optional[Iterable[Example[Input, ExpectedOutput]]]:
) -> Optional[Example[Input, ExpectedOutput]]:
"""Returns an :class:`Example` identified by the given dataset ID and example ID.
Args:
dataset_id: Dataset ID.
example_id: Example ID.
input_type: Input type of the example.
expected_output_type: Expected output type of the example.
Returns:
:class:`Example` if one was found, `None` otherwise.
"""
pass

@abstractmethod
def example(
def examples(
self,
dataset_id: str,
example_id: str,
input_type: type[Input],
expected_output_type: type[ExpectedOutput],
) -> Optional[Example[Input, ExpectedOutput]]:
) -> Iterable[Example[Input, ExpectedOutput]]:
"""Returns all :class:`Example`s identified by the given dataset ID.
Args:
dataset_id: Dataset ID.
input_type: Input type of the example.
expected_output_type: Expected output type of the example.
Returns:
:class:`Iterable` of :class:`Example`\s.
"""
pass

@abstractmethod
def delete_dataset(self, dataset_id: str) -> None:
def dataset_ids(self) -> Iterable[str]:
"""Returns all dataset IDs.
Returns:
:class:`Iterable` of strings.
"""
pass

@abstractmethod
def list_datasets(self) -> Iterable[str]:
def delete_dataset(self, dataset_id: str) -> None:
"""Deletes a dataset identified by the given dataset ID.
Args:
dataset_id: Dataset ID of the dataset to delete.
"""
pass


Expand All @@ -55,8 +99,18 @@ def __init__(self, fs: AbstractFileSystem, root_directory: str) -> None:
self._fs = fs
self._root_directory = root_directory

def _dataset_path(self, dataset_id: str) -> str:
return self._root_directory + f"/{dataset_id}.jsonl"
def create_dataset(self, examples: Iterable[Example[Input, ExpectedOutput]]) -> str:
dataset_id = str(uuid4())
dataset_path = self._dataset_path(dataset_id)
if self._fs.exists(dataset_path):
raise ValueError(f"Dataset name {dataset_id} already taken")

with self._fs.open(dataset_path, "w", encoding="utf-8") as examples_file:
for example in examples:
serialized_result = JsonSerializer(root=example)
text = serialized_result.model_dump_json() + "\n"
examples_file.write(text)
return dataset_id

def example(
self,
Expand All @@ -70,35 +124,22 @@ def example(
return None

with self._fs.open(example_path, "r", encoding="utf-8") as examples_file:
# Mypy does not accept dynamic types
for example in examples_file:
# mypy does not accept dynamic types
validated_example = Example[input_type, expected_output_type].model_validate_json(json_data=example) # type: ignore
if validated_example.id == example_id:
return validated_example
return None

def create_dataset(self, examples: Iterable[Example[Input, ExpectedOutput]]) -> str:
dataset_id = str(uuid4())
dataset_path = self._dataset_path(dataset_id)
if self._fs.exists(dataset_path):
raise ValueError(f"Dataset name {dataset_id} already taken")

with self._fs.open(dataset_path, "w", encoding="utf-8") as examples_file:
for example in examples:
serialized_result = JsonSerializer(root=example)
text = serialized_result.model_dump_json() + "\n"
examples_file.write(text)
return dataset_id

def examples_by_id(
def examples(
self,
dataset_id: str,
input_type: type[Input],
expected_output_type: type[ExpectedOutput],
) -> Optional[Iterable[Example[Input, ExpectedOutput]]]:
) -> Iterable[Example[Input, ExpectedOutput]]:
example_path = self._dataset_path(dataset_id)
if not self._fs.exists(example_path):
return None
return []

with self._fs.open(example_path, "r", encoding="utf-8") as examples_file:
# Mypy does not accept dynamic types
Expand All @@ -113,19 +154,22 @@ def examples_by_id(
if example
)

def dataset_ids(self) -> Iterable[str]:
return [
Path(f["name"]).stem
for f in self._fs.ls(self._root_directory, detail=True)
if isinstance(f, Dict) and Path(f["name"]).suffix == ".jsonl"
]

def delete_dataset(self, dataset_id: str) -> None:
dataset_path = self._dataset_path(dataset_id)
try:
self._fs.rm(dataset_path, recursive=True)
except FileNotFoundError:
pass

def list_datasets(self) -> Iterable[str]:
return [
Path(f["name"]).stem
for f in self._fs.ls(self._root_directory, detail=True)
if isinstance(f, Dict) and Path(f["name"]).suffix == ".jsonl"
]
def _dataset_path(self, dataset_id: str) -> str:
return self._root_directory + f"/{dataset_id}.jsonl"


class InMemoryDatasetRepository(DatasetRepository):
Expand Down Expand Up @@ -153,14 +197,14 @@ def create_dataset(
self._datasets[name] = in_memory_examples
return name

def examples_by_id(
def examples(
self,
dataset_id: str,
input_type: type[Input],
expected_output_type: type[ExpectedOutput],
) -> Optional[Iterable[Example[Input, ExpectedOutput]]]:
) -> Iterable[Example[Input, ExpectedOutput]]:
return cast(
Optional[Iterable[Example[Input, ExpectedOutput]]],
Iterable[Example[Input, ExpectedOutput]],
self._datasets.get(dataset_id),
)

Expand All @@ -171,18 +215,16 @@ def example(
input_type: type[Input],
expected_output_type: type[ExpectedOutput],
) -> Example[Input, ExpectedOutput] | None:
examples = self.examples_by_id(dataset_id, input_type, expected_output_type)
if examples is None:
return None
examples = self.examples(dataset_id, input_type, expected_output_type)
filtered = (e for e in examples if e.id == example_id)
return next(filtered, None)

def dataset_ids(self) -> Iterable[str]:
return list(self._datasets.keys())

def delete_dataset(self, dataset_id: str) -> None:
self._datasets.pop(dataset_id, None)

def list_datasets(self) -> Iterable[str]:
return list(self._datasets.keys())


class FileDatasetRepository(FileSystemDatasetRepository):
def __init__(self, root_directory: Path) -> None:
Expand Down
Loading

0 comments on commit 43c0a71

Please sign in to comment.