From bde54417edbddce3664b32c9998a5a93f8fcafed Mon Sep 17 00:00:00 2001
From: Florian Schepers
Date: Mon, 13 May 2024 18:07:59 +0200
Subject: [PATCH] fix: Change `confusion_matrix` in
 `SingleLabelClassifyAggregationLogic` such that it can be persisted on disc

TASK: IL-475
---
 CHANGELOG.md                              |  1 +
 .../examples/classify/classify.py         | 12 +--
 tests/conftest.py                         | 43 ++++++++++-
 tests/evaluation/conftest.py              | 40 +---------
 tests/use_cases/classify/test_classify.py | 76 ++++++++++++++++++-
 5 files changed, 124 insertions(+), 48 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 60e88b254..e66d35ce1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -32,6 +32,7 @@ Check the how-to for detailed information [here](./src/documentation/how_tos/how
 ### Fixes
 
 - Improve description of using artifactory tokens for installation of IL
+- Change `confusion_matrix` in `SingleLabelClassifyAggregationLogic` such that it can be persisted in a file repository
 
 ### Deprecations
 ...
diff --git a/src/intelligence_layer/examples/classify/classify.py b/src/intelligence_layer/examples/classify/classify.py
index d3dcb89e6..611e032f4 100644
--- a/src/intelligence_layer/examples/classify/classify.py
+++ b/src/intelligence_layer/examples/classify/classify.py
@@ -90,9 +90,9 @@ class AggregatedSingleLabelClassifyEvaluation(BaseModel):
     """
 
     percentage_correct: float
-    confusion_matrix: Mapping[tuple[str, str], int]
-    by_label: Mapping[str, AggregatedLabelInfo]
-    missing_labels: Mapping[str, int]
+    confusion_matrix: dict[str, dict[str, int]]
+    by_label: dict[str, AggregatedLabelInfo]
+    missing_labels: dict[str, int]
 
 
 class SingleLabelClassifyAggregationLogic(
@@ -105,14 +105,16 @@ def aggregate(
     ) -> AggregatedSingleLabelClassifyEvaluation:
         acc = MeanAccumulator()
         missing_labels: dict[str, int] = defaultdict(int)
-        confusion_matrix: dict[tuple[str, str], int] = defaultdict(int)
+        confusion_matrix: dict[str, dict[str, int]] = defaultdict(
+            lambda: defaultdict(int)
+        )
         by_label: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int))
         for evaluation in evaluations:
             acc.add(1.0 if evaluation.correct else 0.0)
             if evaluation.expected_label_missing:
                 missing_labels[evaluation.expected] += 1
             else:
-                confusion_matrix[(evaluation.predicted, evaluation.expected)] += 1
+                confusion_matrix[evaluation.predicted][evaluation.expected] += 1
             by_label[evaluation.predicted]["predicted"] += 1
             by_label[evaluation.expected]["expected"] += 1
 
diff --git a/tests/conftest.py b/tests/conftest.py
index e4eb66ae7..a80c8f7a1 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -17,13 +17,21 @@
     QdrantInMemoryRetriever,
     RetrieverType,
 )
-from intelligence_layer.core import LuminousControlModel, NoOpTracer, Task, TaskSpan
+from intelligence_layer.core import (
+    LuminousControlModel,
+    NoOpTracer,
+    Task,
+    TaskSpan,
+    utc_now,
+)
 from intelligence_layer.evaluation import (
     AsyncInMemoryEvaluationRepository,
+    EvaluationOverview,
     InMemoryAggregationRepository,
     InMemoryDatasetRepository,
     InMemoryEvaluationRepository,
     InMemoryRunRepository,
+    RunOverview,
 )
 
 
@@ -154,3 +162,36 @@ def in_memory_aggregation_repository() -> InMemoryAggregationRepository:
 @fixture()
 def async_in_memory_evaluation_repository() -> AsyncInMemoryEvaluationRepository:
     return AsyncInMemoryEvaluationRepository()
+
+
+@fixture
+def run_overview() -> RunOverview:
+    return RunOverview(
+        dataset_id="dataset-id",
+        id="run-id-1",
+        start=utc_now(),
+        end=utc_now(),
+        failed_example_count=0,
+        successful_example_count=3,
+        description="test run overview 1",
+    )
+
+
+@fixture
+def evaluation_id() -> str:
+    return "evaluation-id-1"
+
+
+@fixture
+def evaluation_overview(
+    evaluation_id: str, run_overview: RunOverview
+) -> EvaluationOverview:
+    return EvaluationOverview(
+        id=evaluation_id,
+        start_date=utc_now(),
+        end_date=utc_now(),
+        successful_evaluation_count=1,
+        failed_evaluation_count=1,
+        run_overviews=frozenset([run_overview]),
+        description="test evaluation overview 1",
+    )
diff --git a/tests/evaluation/conftest.py b/tests/evaluation/conftest.py
index 052351de5..c15362589 100644
--- a/tests/evaluation/conftest.py
+++ b/tests/evaluation/conftest.py
@@ -1,4 +1,3 @@
-from datetime import datetime
 from os import getenv
 from pathlib import Path
 from typing import Iterable, Sequence
@@ -29,7 +28,6 @@
     InMemoryDatasetRepository,
     InMemoryRunRepository,
     Runner,
-    RunOverview,
 )
 from tests.conftest import DummyStringInput, DummyStringOutput
 
@@ -69,11 +67,6 @@ def sequence_examples() -> Iterable[Example[str, None]]:
     ]
 
 
-@fixture
-def evaluation_id() -> str:
-    return "evaluation-id-1"
-
-
 @fixture
 def successful_example_evaluation(
     evaluation_id: str,
@@ -115,45 +108,16 @@ def dummy_aggregated_evaluation() -> DummyAggregatedEvaluation:
     return DummyAggregatedEvaluation(score=0.5)
 
 
-@fixture
-def run_overview() -> RunOverview:
-    return RunOverview(
-        dataset_id="dataset-id",
-        id="run-id-1",
-        start=utc_now(),
-        end=utc_now(),
-        failed_example_count=0,
-        successful_example_count=3,
-        description="test run overview 1",
-    )
-
-
-@fixture
-def evaluation_overview(
-    evaluation_id: str, run_overview: RunOverview
-) -> EvaluationOverview:
-    return EvaluationOverview(
-        id=evaluation_id,
-        start_date=utc_now(),
-        end_date=utc_now(),
-        successful_evaluation_count=1,
-        failed_evaluation_count=1,
-        run_overviews=frozenset([run_overview]),
-        description="test evaluation overview 1",
-    )
-
-
 @fixture
 def aggregation_overview(
     evaluation_overview: EvaluationOverview,
     dummy_aggregated_evaluation: DummyAggregatedEvaluation,
 ) -> AggregationOverview[DummyAggregatedEvaluation]:
-    now = datetime.now()
     return AggregationOverview(
         evaluation_overviews=frozenset([evaluation_overview]),
         id="aggregation-id",
-        start=now,
-        end=now,
+        start=utc_now(),
+        end=utc_now(),
         successful_evaluation_count=5,
         crashed_during_evaluation_count=3,
         description="dummy-evaluator",
diff --git a/tests/use_cases/classify/test_classify.py b/tests/use_cases/classify/test_classify.py
index b54bd7334..8de01a67e 100644
--- a/tests/use_cases/classify/test_classify.py
+++ b/tests/use_cases/classify/test_classify.py
@@ -1,22 +1,28 @@
+from pathlib import Path
 from typing import Iterable, List, Sequence
+from uuid import uuid4
 
 from pytest import fixture
 
 from intelligence_layer.connectors import AlephAlphaClientProtocol
-from intelligence_layer.core import Task, TextChunk
+from intelligence_layer.core import Task, TextChunk, utc_now
 from intelligence_layer.evaluation import (
+    AggregationOverview,
     Aggregator,
     DatasetRepository,
+    EvaluationOverview,
     Evaluator,
     Example,
+    FileAggregationRepository,
     InMemoryAggregationRepository,
     InMemoryDatasetRepository,
     InMemoryEvaluationRepository,
+    InMemoryRunRepository,
     Runner,
-    RunRepository,
 )
 from intelligence_layer.examples import (
     AggregatedMultiLabelClassifyEvaluation,
+    AggregatedSingleLabelClassifyEvaluation,
     ClassifyInput,
     EmbeddingBasedClassify,
     LabelWithExamples,
@@ -25,6 +31,7 @@
     MultiLabelClassifyEvaluationLogic,
     MultiLabelClassifyOutput,
 )
+from intelligence_layer.examples.classify.classify import AggregatedLabelInfo
 
 
 @fixture
@@ -143,7 +150,7 @@ def multi_label_classify_aggregation_logic() -> MultiLabelClassifyAggregationLog
 @fixture
 def classify_evaluator(
     in_memory_dataset_repository: DatasetRepository,
-    in_memory_run_repository: RunRepository,
+    in_memory_run_repository: InMemoryRunRepository,
     in_memory_evaluation_repository: InMemoryEvaluationRepository,
     multi_label_classify_evaluation_logic: MultiLabelClassifyEvaluationLogic,
 ) -> Evaluator[
@@ -178,11 +185,28 @@ def classify_aggregator(
     )
 
 
+@fixture
+def classify_aggregator_file_repo(
+    in_memory_evaluation_repository: InMemoryEvaluationRepository,
+    file_aggregation_repository: FileAggregationRepository,
+    multi_label_classify_aggregation_logic: MultiLabelClassifyAggregationLogic,
+) -> Aggregator[
+    MultiLabelClassifyEvaluation,
+    AggregatedMultiLabelClassifyEvaluation,
+]:
+    return Aggregator(
+        in_memory_evaluation_repository,
+        file_aggregation_repository,
+        "multi-label-classify",
+        multi_label_classify_aggregation_logic,
+    )
+
+
 @fixture
 def classify_runner(
     embedding_based_classify: Task[ClassifyInput, MultiLabelClassifyOutput],
     in_memory_dataset_repository: DatasetRepository,
-    in_memory_run_repository: RunRepository,
+    in_memory_run_repository: InMemoryRunRepository,
 ) -> Runner[ClassifyInput, MultiLabelClassifyOutput]:
     return Runner(
         embedding_based_classify,
@@ -240,3 +264,47 @@ def test_multi_label_classify_evaluator_full_dataset(
     assert {"positive", "negative", "finance", "school"} == set(
         aggregation_overview.statistics.class_metrics.keys()
     )
+
+
+def test_confusion_matrix_in_single_label_classify_aggregation_is_compatible_with_file_repository(
+    evaluation_overview: EvaluationOverview,
+    tmp_path: Path,
+) -> None:
+    aggregated_single_label_classify_evaluation = (
+        AggregatedSingleLabelClassifyEvaluation(
+            percentage_correct=0.123,
+            confusion_matrix={
+                "happy": {"happy": 1},
+                "sad": {"sad": 2},
+                "angry": {"sad": 1},
+            },
+            by_label={
+                "happy": AggregatedLabelInfo(expected_count=1, predicted_count=1),
+                "sad": AggregatedLabelInfo(expected_count=3, predicted_count=2),
+                "angry": AggregatedLabelInfo(expected_count=0, predicted_count=1),
+            },
+            missing_labels={"tired": 1},
+        )
+    )
+
+    aggregation_overview = AggregationOverview(
+        id=str(uuid4()),
+        evaluation_overviews=frozenset([evaluation_overview]),
+        start=utc_now(),
+        end=utc_now(),
+        successful_evaluation_count=5,
+        crashed_during_evaluation_count=3,
+        statistics=aggregated_single_label_classify_evaluation,
+        description="dummy-aggregator",
+    )
+
+    aggregation_file_repository = FileAggregationRepository(tmp_path)
+    aggregation_file_repository.store_aggregation_overview(aggregation_overview)
+
+    aggregation_overview_from_file_repository = (
+        aggregation_file_repository.aggregation_overview(
+            aggregation_overview.id, AggregatedSingleLabelClassifyEvaluation
+        )
+    )
+
+    assert aggregation_overview_from_file_repository == aggregation_overview
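
Note on the fix (illustration, not part of the patch): JSON object keys must be strings, so a confusion matrix keyed by `tuple[str, str]` cannot be round-tripped through a JSON-backed aggregation repository, while the new nested `dict[str, dict[str, int]]` shape can. The sketch below uses the plain `json` module as a stand-in for the repository's serialization (an assumption for the sketch, not the library's actual code path) and mirrors the label counts used in the new test:

# Illustration of the key-shape constraint behind this fix. The plain `json`
# module stands in for the repository's serialization; the label counts
# mirror the new test.
import json
from collections import defaultdict

pairs = [("happy", "happy"), ("sad", "sad"), ("sad", "sad"), ("angry", "sad")]

# Old shape: confusion matrix keyed by (predicted, expected) tuples.
flat: dict[tuple[str, str], int] = defaultdict(int)
# New shape: nested dict keyed by predicted label, then expected label.
nested: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int))

for predicted, expected in pairs:
    flat[(predicted, expected)] += 1
    nested[predicted][expected] += 1

try:
    json.dumps(flat)
except TypeError as error:
    # json.dumps only accepts str/int/float/bool/None keys, so tuples fail here.
    print(f"cannot persist tuple-keyed matrix: {error}")

restored = json.loads(json.dumps(nested))  # string keys round-trip cleanly
assert restored == {"happy": {"happy": 1}, "sad": {"sad": 2}, "angry": {"sad": 1}}

The `defaultdict(lambda: defaultdict(int))` used in `aggregate` produces exactly this string-keyed shape, so the aggregated evaluation can be stored and reloaded by a file repository without a custom encoder.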