
Commit

feature: add StudioRunnerRepository, StudioEvaluationRepository and StudioAggregationRepository
mveleci committed Aug 21, 2024
1 parent 90474e8 commit 430a277
Showing 16 changed files with 811 additions and 39 deletions.
5 changes: 4 additions & 1 deletion CHANGELOG.md
@@ -1,7 +1,10 @@
 # Changelog
 
 ## Unreleased
-- Add `DataClient` and `StudioDataRepository` as connectors to Studio for submitting data.
+- Add `DataClient` and `StudioDatasetRepository` as connectors to Studio for submitting data.
+- Add `StudioRunnerRepository` as a connector to Studio for submitting runs.
+- Add `StudioEvaluationRepository` as a connector to Studio for submitting evaluations.
+- Add `StudioAggregationRepository` as a connector to Studio for submitting aggregations.
 ### Breaking Changes
 ...

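The notebooks added below wire these connectors together. As a quick orientation, a minimal sketch assembled from their cells (token, URL, and repository id are placeholders):

```python
from pathlib import Path

from fsspec.implementations.local import LocalFileSystem

from intelligence_layer.connectors.data import DataClient
from intelligence_layer.evaluation.dataset.studio_dataset_repository import (
    StudioDatasetRepository,
)
from intelligence_layer.evaluation.run.studio_runner_repository import (
    StudioRunnerRepository,
)

# One client to the Studio data platform, shared by all repositories.
data_client = DataClient(
    token="<your_token>", base_data_platform_url="http://localhost:8080"
)
studio_dataset_repository = StudioDatasetRepository(
    repository_id="<your_repository_id>", data_client=data_client
)

# Run results are written locally under `runs/` and submitted to Studio.
run_repository = StudioRunnerRepository(
    file_system=LocalFileSystem(auto_mkdir=True),
    root_directory=Path("runs"),
    output_type=str,
    studio_dataset_repository=studio_dataset_repository,
)
```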
127 changes: 127 additions & 0 deletions src/documentation/how_tos/how_to_create_a_dataset_using_studio.ipynb
@@ -0,0 +1,127 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from collections.abc import Sequence\n",
"\n",
"from pydantic import BaseModel\n",
"\n",
"from intelligence_layer.evaluation import Example\n",
"from intelligence_layer.evaluation.dataset.studio_dataset_repository import StudioDatasetRepository\n",
"from intelligence_layer.connectors.data import DataClient"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# How to create a dataset\n",
"\n",
"0. Collect data for examples.\n",
"1. Convert data to `Example`s.\n",
"1. Create a `DatasetRepository`.\n",
"2. Store `Example`s to `DatasetRepository`.\n",
"3. Remember the dataset id."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Example"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"65421249-cdea-4a98-a5c8-0ed9280869d5\n",
"{'label2', 'label1'}\n",
"{'key_a': ['a', 'b'], 'key_b': 'value'}\n"
]
}
],
"source": [
"class StoryTaskInput(BaseModel): # Should already be implemented in your task\n",
" topic: str\n",
" targeted_word_count: int\n",
"\n",
"\n",
"class StoryTaskExpectedOutput(BaseModel): # Should already be implemented in your task\n",
" keywords: Sequence[str]\n",
"\n",
"\n",
"# Step 1\n",
"examples = [\n",
" Example(\n",
" input=StoryTaskInput(topic=\"rain\", targeted_word_count=42),\n",
" expected_output=StoryTaskExpectedOutput(keywords=[\"wet\"]),\n",
" metadata={\n",
" \"author\": \"Shakespeare\"\n",
" }, # the metadata is optional and can contain custom information\n",
" ),\n",
" # ...\n",
"]*10\n",
"\n",
"# Step 2 - Use FileDatasetRepository or HuggingFaceDatasetRepository for persistence\n",
"dataset_repository = StudioDatasetRepository(\n",
" repository_id=\"<repository_id>\",\n",
" data_client=DataClient(\n",
" token=\"your_token\",\n",
" base_data_platform_url=\"http://localhost:8080\",\n",
" ),\n",
")\n",
"\n",
"# Step 3\n",
"dataset = dataset_repository.create_dataset(\n",
" examples=examples,\n",
" dataset_name=\"StoryDataset\",\n",
" labels=set([\"label1\", \"label2\"]),\n",
" metadata=dict({\"key_a\": [\"a\", \"b\"], \"key_b\": \"value\"}),\n",
")\n",
"\n",
"# Step 4\n",
"print(dataset.id)\n",
"print(dataset.labels)\n",
"print(dataset.metadata)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "intelligence-layer-dgcJwC7l-py3.11",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
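A small variation on the cell above for anyone who prefers not to hard-code credentials; the `AA_TOKEN` and `DATA_PLATFORM_URL` variable names are assumptions for illustration, not something this commit prescribes:

```python
import os

from intelligence_layer.connectors.data import DataClient

# Hypothetical convention: read the token and URL from the environment.
data_client = DataClient(
    token=os.environ["AA_TOKEN"],
    base_data_platform_url=os.environ.get(
        "DATA_PLATFORM_URL", "http://localhost:8080"
    ),
)
```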
@@ -0,0 +1,130 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"from fsspec.implementations.local import LocalFileSystem\n",
"\n",
"from example_data import DummyEvaluationLogic, example_data, DummyEvaluation\n",
"\n",
"from intelligence_layer.evaluation import Evaluator, StudioEvaluationRepository\n",
"from intelligence_layer.evaluation.dataset.studio_dataset_repository import StudioDatasetRepository\n",
"from intelligence_layer.connectors.data.data import DataClient"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# How to evaluate runs\n",
"0. Run your tasks on the datasets where you want to evaluate them on (see [here](./how_to_run_a_task_on_a_dataset.ipynb))\n",
" - When evaluating multiple runs, all of them need the same data types \n",
"2. Initialize all necessary repositories for the `Evaluator`, and an `EvaluationLogic`.\n",
"3. Run the evaluator to evaluate all examples and create a single `EvaluationOverview`\n",
"4. (Optional) Save the evaluation id for later use"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Example"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating: 2it [00:00, 31300.78it/s]"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"Evaluating: 2it [00:00, 28532.68it/s]\n"
]
}
],
"source": [
"# Step 0\n",
"\n",
"my_example_data = example_data()\n",
"run_ids = [my_example_data.run_overview_1.id, my_example_data.run_overview_2.id]\n",
"\n",
"\n",
"# Step 1\n",
"studio_dataset_repository = StudioDatasetRepository(\n",
" repository_id=\"<your_repository_id>\",\n",
" data_client=DataClient(token=\"<your_token>\", base_data_platform_url=\"http://localhost:8080\"),\n",
")\n",
"dataset_repository = my_example_data.dataset_repository\n",
"run_repository = my_example_data.run_repository\n",
"evaluation_repository = StudioEvaluationRepository(\n",
" file_system=LocalFileSystem(True),\n",
" root_directory=Path(\"evals\"),\n",
" studio_dataset_repository=studio_dataset_repository,\n",
" evaluation_type=DummyEvaluation,\n",
")\n",
"evaluation_logic = DummyEvaluationLogic()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# Step 3\n",
"evaluator = Evaluator(\n",
" dataset_repository,\n",
" run_repository,\n",
" evaluation_repository,\n",
" \"My dummy evaluation\",\n",
" evaluation_logic,\n",
")\n",
"\n",
"evaluation_overview = evaluator.evaluate_runs(\n",
" *run_ids, labels=set({\"label\"}), metadata=dict({\"key\": \"value\"})\n",
")\n",
"\n",
"# Step 4\n",
"print(evaluation_overview.id)\n",
"print(evaluation_overview.metadata)\n",
"print(evaluation_overview.labels)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "intelligence-layer-d3iSWYpm-py3.10",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
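The `StudioEvaluationRepository` above persists locally through fsspec's `LocalFileSystem`, where `auto_mkdir=True` lets writes create missing parent directories. A tiny self-contained check of that behavior:

```python
from pathlib import Path

from fsspec.implementations.local import LocalFileSystem

fs = LocalFileSystem(auto_mkdir=True)

# With auto_mkdir=True, opening a nested path for writing creates the
# parent directories on the fly instead of raising FileNotFoundError.
target = Path("evals") / "nested" / "example.txt"
with fs.open(str(target), "w") as f:
    f.write("hello")

print(fs.exists(str(target)))  # True
```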
@@ -0,0 +1,101 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from example_data import DummyTask, example_data\n",
"\n",
"from intelligence_layer.evaluation.run.studio_runner_repository import (\n",
" StudioRunnerRepository, \n",
")\n",
"from intelligence_layer.evaluation.run.runner import Runner\n",
"from intelligence_layer.evaluation.dataset.studio_dataset_repository import StudioDatasetRepository\n",
"from intelligence_layer.connectors.data.data import DataClient\n",
"\n",
"from fsspec.implementations.local import LocalFileSystem"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# How to run a task on a dataset\n",
"0. Create a suitable dataset (see [here](./how_to_create_a_dataset.ipynb)) and a task (see [here](./how_to_implement_a_task.ipynb)).\n",
"1. Initialize the task and a `RunRepository`, and open the correct `DatasetRepository`\n",
" - The `DatasetRepository` needs to contain the dataset.\n",
" - The `RunRepository` stores results.\n",
"2. Use the `Runner` to run the task on the given dataset via `run_dataset`\n",
"3. Save the id of the resulting `RunOverview`\n",
"\n",
"### Example"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"\n",
"# Step 1 - repositories that connect this run to Studio\n",
"studio_dataset_repository = StudioDatasetRepository(\n",
" repository_id=\"<your_repository_id>\",\n",
" data_client=DataClient(token=\"<your_token>\", base_data_platform_url=\"http://localhost:8080\"),\n",
")\n",
"run_repository = StudioRunnerRepository(\n",
" file_system=LocalFileSystem(auto_mkdir=True),\n",
" root_directory=Path(\"runs\"),\n",
" output_type=str,\n",
" studio_dataset_repository=studio_dataset_repository,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Step 0\n",
"my_example_data = example_data()\n",
"print()\n",
"\n",
"# Step 1\n",
"dataset_repository = my_example_data.dataset_repository\n",
"\n",
"task = DummyTask()\n",
"\n",
"# Step 2\n",
"runner = Runner(task, dataset_repository, run_repository, \"MyRunDescription\")\n",
"run_overview = runner.run_dataset(my_example_data.dataset.id)\n",
"\n",
"# Step 3\n",
"print(run_overview.id)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "intelligence-layer-d3iSWYpm-py3.10",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
15 changes: 8 additions & 7 deletions src/intelligence_layer/connectors/data/data.py
Expand Up @@ -151,16 +151,17 @@ def create_dataset(self, repository_id: str, dataset: DatasetCreate) -> DataData
url = urljoin(
self.base_data_platform_url, f"api/v1/repositories/{repository_id}/datasets"
)
body = {
"sourceData": dataset.source_data,
"labels": ",".join(dataset.labels),
"name": dataset.name,
"totalDatapoints": dataset.total_datapoints,
"metadata": json.dumps(dataset.metadata) if dataset.metadata else None,
}
response = self._do_request(
"POST",
url,
files={
"source_data": dataset.source_data,
"labels": ",".join(dataset.labels),
"name": dataset.name,
"total_datapoints": dataset.total_datapoints,
"metadata": json.dumps(dataset.metadata) if dataset.metadata else None,
},
files={k: v for k, v in body.items() if v not in [None, ""]},
)
return DataDataset(**response.json())

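The refactored `create_dataset` builds the multipart payload once and strips empty fields before the request goes out. A standalone illustration of that filtering (values are made up):

```python
import json

metadata = None  # left unset in this illustration

# Mirrors the `body` dict built in create_dataset above.
body = {
    "sourceData": "…",
    "labels": ",".join(["label1", "label2"]),
    "name": "StoryDataset",
    "totalDatapoints": 10,
    "metadata": json.dumps(metadata) if metadata else None,
}

# Empty values never reach the request payload.
filtered = {k: v for k, v in body.items() if v not in [None, ""]}
print(sorted(filtered))  # ['labels', 'name', 'sourceData', 'totalDatapoints']
```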
