Skip to content

Commit

Permalink
Fixes for rebase onto main
Browse files Browse the repository at this point in the history
  • Loading branch information
SebastianNiehusTNG committed Apr 3, 2024
1 parent 169969a commit 24ec873
Show file tree
Hide file tree
Showing 5 changed files with 18 additions and 20 deletions.
2 changes: 1 addition & 1 deletion src/examples/data/classify_examples.json
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@
},
{
"label": "Human Resources",
"message": "I want to take a week off immediatly"
"message": "I want to take a week off immediately"
},
{
"label": "Human Resources",
Expand Down
2 changes: 1 addition & 1 deletion src/examples/evaluation.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@
" examples=[\n",
" Example(\n",
" input=ClassifyInput(chunk=TextChunk(item[\"text\"]), labels=all_labels),\n",
" expected_output=item[\"label_name\"],\n",
" expected_output=item[\"label_name\"][0],\n",
" )\n",
" for item in data\n",
" ],\n",
Expand Down
15 changes: 8 additions & 7 deletions src/examples/user_journey.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,9 @@
"source": [
"from intelligence_layer.core import TextChunk, InMemoryTracer\n",
"from intelligence_layer.use_cases import PromptBasedClassify, ClassifyInput\n",
"from dotenv import load_dotenv\n",
"\n",
"load_dotenv()\n",
"\n",
"# instantiating the default task\n",
"prompt_based_classify = PromptBasedClassify()\n",
Expand Down Expand Up @@ -146,8 +148,9 @@
" expected_output=example[\"label\"],\n",
" )\n",
" for example in labeled_examples\n",
" ]\n",
")"
" ],\n",
" dataset_name=\"MyDataset\",\n",
").id"
]
},
{
Expand Down Expand Up @@ -179,8 +182,6 @@
"metadata": {},
"outputs": [],
"source": [
"from dotenv import load_dotenv\n",
"\n",
"from intelligence_layer.evaluation import (\n",
" Evaluator,\n",
" InMemoryEvaluationRepository,\n",
Expand All @@ -194,7 +195,6 @@
" SingleLabelClassifyAggregationLogic,\n",
")\n",
"\n",
"load_dotenv()\n",
"\n",
"# we need a few repositories to store runs, evals and aggregated evaluations\n",
"run_repository = InMemoryRunRepository()\n",
Expand Down Expand Up @@ -367,8 +367,9 @@
" expected_output=example[\"label\"],\n",
" )\n",
" for example in labeled_examples\n",
" ]\n",
")"
" ],\n",
" dataset_name=\"CleanedDataset\",\n",
").id"
]
},
{
Expand Down
9 changes: 3 additions & 6 deletions src/intelligence_layer/use_cases/classify/classify.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,24 +84,21 @@ class PerformanceScores(BaseModel):


class AggregatedLabelInfo(BaseModel):
scores: PerformanceScores
expected_share: float
actual_share: float
expected_count: int
predicted_count: int


class AggregatedSingleLabelClassifyEvaluation(BaseModel):
"""The aggregated evaluation of a single label classify implementation against a dataset.
Attributes:
percentage_correct: Percentage of answers that were considered to be correct.
js_divergence: Divergence between expected and predicted distributions (Jensen-Shannon divergence).
confusion_matrix: How often each label was confused with each other.
by_label: Each label along with a couple of aggregated statistics.
missing_labels: Each label missing from the results accompanied by the missing count.
"""

percentage_correct: float
js_divergence: float
confusion_matrix: Mapping[tuple[str, str], int]
by_label: Mapping[str, AggregatedLabelInfo]
missing_labels: Mapping[str, int]
Expand All @@ -118,7 +115,7 @@ def aggregate(
acc = MeanAccumulator()
missing_labels: dict[str, int] = defaultdict(int)
confusion_matrix: dict[tuple[str, str], int] = defaultdict(int)
by_label: dict[str, Mapping[str, int]] = defaultdict(lambda: defaultdict(int))
by_label: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int))
for evaluation in evaluations:
acc.add(1.0 if evaluation.correct else 0.0)
if evaluation.expected_label_missing:
Expand Down
10 changes: 5 additions & 5 deletions tests/use_cases/classify/test_prompt_based_classify.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def classify_evaluator(
) -> Evaluator[
ClassifyInput,
SingleLabelClassifyOutput,
Sequence[str],
str,
SingleLabelClassifyEvaluation,
]:
return Evaluator(
Expand Down Expand Up @@ -197,7 +197,7 @@ def test_can_evaluate_classify(
chunk=TextChunk("This is good"),
labels=frozenset({"positive", "negative"}),
),
expected_output=["positive"],
expected_output="positive",
)

dataset_id = in_memory_dataset_repository.create_dataset(
Expand Down Expand Up @@ -230,20 +230,20 @@ def test_can_aggregate_evaluations(
in_memory_dataset_repository: InMemoryDatasetRepository,
classify_runner: Runner[ClassifyInput, SingleLabelClassifyOutput],
) -> None:
positive_lst: Sequence[str] = ["positive"]
positive: str = "positive"
correct_example = Example(
input=ClassifyInput(
chunk=TextChunk("This is good"),
labels=frozenset({"positive", "negative"}),
),
expected_output=positive_lst,
expected_output=positive,
)
incorrect_example = Example(
input=ClassifyInput(
chunk=TextChunk("This is extremely bad"),
labels=frozenset({"positive", "negative"}),
),
expected_output=positive_lst,
expected_output=positive,
)
dataset_id = in_memory_dataset_repository.create_dataset(
examples=[correct_example, incorrect_example], dataset_name="test-dataset"
Expand Down

0 comments on commit 24ec873

Please sign in to comment.