Skip to content

Commit

Permalink
feat(repository): add data selection methods to run, eval and aggrega…
Browse files Browse the repository at this point in the history
…tion repositories

Task: IL-304, IL-301
  • Loading branch information
MerlinKallenbornTNG authored and Valentina Galata committed Feb 29, 2024
1 parent 7d3cb60 commit 43c0a71
Show file tree
Hide file tree
Showing 11 changed files with 311 additions and 224 deletions.
19 changes: 16 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ The key features of the Intelligence Layer are:
To ensure this, we provide full comprehensibility, by seamlessly logging each step of every workflow.
This enhances your debugging capabilities and offers greater control post-deployment when examining model responses.

### Table of contents
## Table of contents

1. [Getting Started](#getting-started)
2. [Getting started with the Jupyter Notebooks](#getting-started-with-the-jupyter-notebooks)
Expand All @@ -24,8 +24,9 @@ The key features of the Intelligence Layer are:
6. [How to make your own use case](#how-to-make-your-own-use-case)
7. [Running the Trace Viewer](#running-the-trace-viewer)
8. [Using the Intelligence Layer in Docker](#using-the-intelligence-layer-in-docker)
9. [References](#references)
10. [License](#license)
9. [For developers](#for-developers)
10. [References](#references)
11. [License](#license)

## Getting started

Expand Down Expand Up @@ -251,6 +252,18 @@ secrets:
You can read more about this in the [official documentation](https://docs.docker.com/engine/swarm/secrets/).
## For Developers
### Python: Naming Conventions
We follow the [PEP 8 – Style Guide for Python Code](https://peps.python.org/pep-0008/).
In addition, there are the following naming conventions:
* Class method names:
* Use only nouns (substantives) for the name of a method that has no side effects and returns an object
* E.g., `evaluation_overview` which returns an evaluation overview object
* Use a verb for a method name if it has side effects and returns nothing
* E.g., `store_evaluation_overview` which saves a given evaluation overview (and returns nothing)

## References

- Full documentation: https://aleph-alpha-intelligence-layer.readthedocs-hosted.com/en/latest/
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Any, Sequence
from typing import Any, Iterable, Optional, Sequence

from intelligence_layer.evaluation.data_storage.utils import FileBasedRepository
from intelligence_layer.evaluation.domain import (
Expand All @@ -10,21 +10,37 @@


class AggregationRepository(ABC):
"""Base aggregation repository interface.
Provides methods to store and load aggregated evaluation results.
"""

@abstractmethod
def aggregation_ids(self) -> Sequence[str]:
"""Returns the ids of all stored aggregation runs.
Having the id of an aggregation run, its overview can be retrieved with
:meth:`AggregationRepository.aggregation_overview`.
Returns:
The ids of all stored aggregation runs.
"""
pass

@abstractmethod
def aggregation_overview(
self, id: str, stat_type: type[AggregatedEvaluation]
) -> AggregationOverview[AggregatedEvaluation] | None:
"""Returns all failed :class:`ExampleResult` instances of a given run
self, aggregation_id: str, aggregation_type: type[AggregatedEvaluation]
) -> Optional[AggregationOverview[AggregatedEvaluation]]:
"""Returns a specific instance of :class:`AggregationOverview` of a given run
Args:
id: Identifier of the TODO
stat_type:
aggregation_id: Identifier of the aggregation overview
aggregation_type: Type of the aggregation
Returns:
:class:`AggregationOverview` if one was found, `None` otherwise.
"""
...
pass

@abstractmethod
def store_aggregation_overview(
Expand All @@ -35,7 +51,17 @@ def store_aggregation_overview(
Args:
overview: The overview to be persisted.
"""
...
pass

def aggregation_overviews(
self, aggregation_type: type[AggregatedEvaluation]
) -> Iterable[AggregationOverview[AggregatedEvaluation]]:
for aggregation_id in self.aggregation_ids():
aggregation_overview = self.aggregation_overview(
aggregation_id, aggregation_type
)
if aggregation_overview is not None:
yield aggregation_overview


class FileAggregationRepository(AggregationRepository, FileBasedRepository):
Expand All @@ -49,17 +75,18 @@ def _aggregation_directory(self, eval_id: str) -> Path:
path.mkdir(exist_ok=True)
return path

def _aggregation_overview_path(self, id: str) -> Path:
return self._aggregation_directory(id).with_suffix(".json")
def _aggregation_overview_path(self, aggregation_id: str) -> Path:
return self._aggregation_directory(aggregation_id).with_suffix(".json")

def aggregation_overview(
self, id: str, stat_type: type[AggregatedEvaluation]
self, aggregation_id: str, aggregation_type: type[AggregatedEvaluation]
) -> AggregationOverview[AggregatedEvaluation] | None:
file_path = self._aggregation_overview_path(id)
file_path = self._aggregation_overview_path(aggregation_id)
if not file_path.exists():
return None

content = self.read_utf8(file_path)
return AggregationOverview[stat_type].model_validate_json( # type:ignore
return AggregationOverview[aggregation_type].model_validate_json( # type:ignore
content
)

Expand All @@ -80,10 +107,13 @@ def __init__(self) -> None:
super().__init__()
self._aggregation_overviews: dict[str, AggregationOverview[Any]] = dict()

def aggregation_ids(self) -> Sequence[str]:
return list(self._aggregation_overviews.keys())

def aggregation_overview(
self, id: str, stat_type: type[AggregatedEvaluation]
self, aggregation_id: str, aggregation_type: type[AggregatedEvaluation]
) -> AggregationOverview[AggregatedEvaluation] | None:
return self._aggregation_overviews[id]
return self._aggregation_overviews[aggregation_id]

def store_aggregation_overview(
self, overview: AggregationOverview[AggregatedEvaluation]
Expand Down
124 changes: 83 additions & 41 deletions src/intelligence_layer/evaluation/data_storage/dataset_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,38 +11,82 @@


class DatasetRepository(ABC):
"""Base dataset repository interface.
Provides methods to store and load datasets and their examples.
"""

@abstractmethod
def create_dataset(
self,
examples: Iterable[Example[Input, ExpectedOutput]],
) -> str:
"""Creates a dataset from given examples and returns the ID of that dataset.
Args:
examples: An iterable of examples to be saved together under the same dataset ID.
Returns:
The ID of the created dataset.
"""
pass

@abstractmethod
def examples_by_id(
def example(
self,
dataset_id: str,
example_id: str,
input_type: type[Input],
expected_output_type: type[ExpectedOutput],
) -> Optional[Iterable[Example[Input, ExpectedOutput]]]:
) -> Optional[Example[Input, ExpectedOutput]]:
"""Returns an :class:`Example` identified by the given dataset ID and example ID.
Args:
dataset_id: Dataset ID.
example_id: Example ID.
input_type: Input type of the example.
expected_output_type: Expected output type of the example.
Returns:
:class:`Example` if one was found, `None` otherwise.
"""
pass

@abstractmethod
def example(
def examples(
self,
dataset_id: str,
example_id: str,
input_type: type[Input],
expected_output_type: type[ExpectedOutput],
) -> Optional[Example[Input, ExpectedOutput]]:
) -> Iterable[Example[Input, ExpectedOutput]]:
"""Returns all :class:`Example`s identified by the given dataset ID.
Args:
dataset_id: Dataset ID.
input_type: Input type of the example.
expected_output_type: Expected output type of the example.
Returns:
:class:`Iterable` of :class:`Example`\s.
"""
pass

@abstractmethod
def delete_dataset(self, dataset_id: str) -> None:
def dataset_ids(self) -> Iterable[str]:
"""Returns all dataset IDs.
Returns:
:class:`Iterable` of strings.
"""
pass

@abstractmethod
def list_datasets(self) -> Iterable[str]:
def delete_dataset(self, dataset_id: str) -> None:
"""Deletes a dataset identified by the given dataset ID.
Args:
dataset_id: Dataset ID of the dataset to delete.
"""
pass


Expand All @@ -55,8 +99,18 @@ def __init__(self, fs: AbstractFileSystem, root_directory: str) -> None:
self._fs = fs
self._root_directory = root_directory

def _dataset_path(self, dataset_id: str) -> str:
return self._root_directory + f"/{dataset_id}.jsonl"
def create_dataset(self, examples: Iterable[Example[Input, ExpectedOutput]]) -> str:
dataset_id = str(uuid4())
dataset_path = self._dataset_path(dataset_id)
if self._fs.exists(dataset_path):
raise ValueError(f"Dataset name {dataset_id} already taken")

with self._fs.open(dataset_path, "w", encoding="utf-8") as examples_file:
for example in examples:
serialized_result = JsonSerializer(root=example)
text = serialized_result.model_dump_json() + "\n"
examples_file.write(text)
return dataset_id

def example(
self,
Expand All @@ -70,35 +124,22 @@ def example(
return None

with self._fs.open(example_path, "r", encoding="utf-8") as examples_file:
# Mypy does not accept dynamic types
for example in examples_file:
# mypy does not accept dynamic types
validated_example = Example[input_type, expected_output_type].model_validate_json(json_data=example) # type: ignore
if validated_example.id == example_id:
return validated_example
return None

def create_dataset(self, examples: Iterable[Example[Input, ExpectedOutput]]) -> str:
dataset_id = str(uuid4())
dataset_path = self._dataset_path(dataset_id)
if self._fs.exists(dataset_path):
raise ValueError(f"Dataset name {dataset_id} already taken")

with self._fs.open(dataset_path, "w", encoding="utf-8") as examples_file:
for example in examples:
serialized_result = JsonSerializer(root=example)
text = serialized_result.model_dump_json() + "\n"
examples_file.write(text)
return dataset_id

def examples_by_id(
def examples(
self,
dataset_id: str,
input_type: type[Input],
expected_output_type: type[ExpectedOutput],
) -> Optional[Iterable[Example[Input, ExpectedOutput]]]:
) -> Iterable[Example[Input, ExpectedOutput]]:
example_path = self._dataset_path(dataset_id)
if not self._fs.exists(example_path):
return None
return []

with self._fs.open(example_path, "r", encoding="utf-8") as examples_file:
# Mypy does not accept dynamic types
Expand All @@ -113,19 +154,22 @@ def examples_by_id(
if example
)

def dataset_ids(self) -> Iterable[str]:
return [
Path(f["name"]).stem
for f in self._fs.ls(self._root_directory, detail=True)
if isinstance(f, Dict) and Path(f["name"]).suffix == ".jsonl"
]

def delete_dataset(self, dataset_id: str) -> None:
dataset_path = self._dataset_path(dataset_id)
try:
self._fs.rm(dataset_path, recursive=True)
except FileNotFoundError:
pass

def list_datasets(self) -> Iterable[str]:
return [
Path(f["name"]).stem
for f in self._fs.ls(self._root_directory, detail=True)
if isinstance(f, Dict) and Path(f["name"]).suffix == ".jsonl"
]
def _dataset_path(self, dataset_id: str) -> str:
return self._root_directory + f"/{dataset_id}.jsonl"


class InMemoryDatasetRepository(DatasetRepository):
Expand Down Expand Up @@ -153,14 +197,14 @@ def create_dataset(
self._datasets[name] = in_memory_examples
return name

def examples_by_id(
def examples(
self,
dataset_id: str,
input_type: type[Input],
expected_output_type: type[ExpectedOutput],
) -> Optional[Iterable[Example[Input, ExpectedOutput]]]:
) -> Iterable[Example[Input, ExpectedOutput]]:
return cast(
Optional[Iterable[Example[Input, ExpectedOutput]]],
Iterable[Example[Input, ExpectedOutput]],
self._datasets.get(dataset_id),
)

Expand All @@ -171,18 +215,16 @@ def example(
input_type: type[Input],
expected_output_type: type[ExpectedOutput],
) -> Example[Input, ExpectedOutput] | None:
examples = self.examples_by_id(dataset_id, input_type, expected_output_type)
if examples is None:
return None
examples = self.examples(dataset_id, input_type, expected_output_type)
filtered = (e for e in examples if e.id == example_id)
return next(filtered, None)

def dataset_ids(self) -> Iterable[str]:
return list(self._datasets.keys())

def delete_dataset(self, dataset_id: str) -> None:
self._datasets.pop(dataset_id, None)

def list_datasets(self) -> Iterable[str]:
return list(self._datasets.keys())


class FileDatasetRepository(FileSystemDatasetRepository):
def __init__(self, root_directory: Path) -> None:
Expand Down
Loading

0 comments on commit 43c0a71

Please sign in to comment.