From 6ec891870784ea2835a5fbb44db9f35212ae7c7f Mon Sep 17 00:00:00 2001
From: Sebastian Niehus
Date: Mon, 6 May 2024 12:11:10 +0200
Subject: [PATCH] feat: WIP Fix generics and types, some refactoring

TASK: IL-394
---
 .../evaluation/evaluation/elo_evaluator.py    |  82 +++---
 .../evaluation/elo_graders/elo_grader.py      |  56 +++++
 .../evaluation/elo_graders/elo_qa_grader.py   | 141 +++++++++++
 tests/evaluation/test_elo_evaluator.py        | 237 +++++++++++++-----
 4 files changed, 401 insertions(+), 115 deletions(-)
 create mode 100644 src/intelligence_layer/evaluation/evaluation/elo_graders/elo_grader.py
 create mode 100644 src/intelligence_layer/evaluation/evaluation/elo_graders/elo_qa_grader.py

diff --git a/src/intelligence_layer/evaluation/evaluation/elo_evaluator.py b/src/intelligence_layer/evaluation/evaluation/elo_evaluator.py
index 3c3347bcc..99ef3daa5 100644
--- a/src/intelligence_layer/evaluation/evaluation/elo_evaluator.py
+++ b/src/intelligence_layer/evaluation/evaluation/elo_evaluator.py
@@ -1,65 +1,41 @@
-from abc import abstractmethod
-from typing import Sequence
-
-from pydantic import BaseModel
+from itertools import combinations
 
 from intelligence_layer.core import Input, NoOpTracer, Output, Tracer
 from intelligence_layer.evaluation import EvaluationLogic
-from intelligence_layer.evaluation.aggregation.elo import MatchOutcome
-from intelligence_layer.evaluation.dataset.domain import Example
+from intelligence_layer.evaluation.dataset.domain import Example, ExpectedOutput
+from intelligence_layer.evaluation.evaluation.elo_graders.elo_grader import (
+    EloGrader,
+    Matches,
+)
 from intelligence_layer.evaluation.run.domain import SuccessfulExampleOutput
 
 
-class Match(BaseModel):
-    player_a: str
-    player_b: str
-    outcome: MatchOutcome
+class EloEvaluationLogic(EvaluationLogic[Input, Output, ExpectedOutput, Matches]):
+    """Evaluation logic for a pair-wise ELO comparison.
 
+    Args:
+        grader: The :class:`Task` that performs the grading, i.e. the actual comparison of two run outputs.
+ tracer: :class:`Tracer` for tracking and debugging -class EloEvaluationLogic(EvaluationLogic[Input, Output, str, list[Match]]): - # def __init__( - # self, - # # client: Client, - # tracer: Tracer = NoOpTracer(), - # ): - # self._tracer = tracer - # # self._grader = Grader( ## TODO - # # LlamaControlModel(name="llama-2-70b-chat", client=client) - # # ) + """ - @abstractmethod - def do_evaluate( + def __init__( self, - example: Example[Input, str], - *output: SuccessfulExampleOutput[Output], - ) -> list[Match]: - # pairs = combinations(output, 2) - # return Matches( - # matches=[ - # self._run_grader(first, second, example) - # for [first, second] in pairs - # if self._high_priority_runs is None ##TODO: Adapts to iterative elo class - # or len(self._high_priority_runs) == 0 - # or first.run_id in self._high_priority_runs - # or second.run_id in self._high_priority_runs - # ] - # ) - pass + grader: EloGrader[Input, Output, ExpectedOutput], + tracer: Tracer = NoOpTracer(), + ): + self.tracer = tracer + self.grader = grader - def _run_grader( + def do_evaluate( self, - first: SuccessfulExampleOutput[Output], - second: SuccessfulExampleOutput[Output], - example: Example[Input, str], - ) -> Match: - pass - # if random.choice([True, False]): - # first, second = second, first - # - # - # - # return Match( - # outcome='str', - # player_a=first.run_id, - # player_b=second.run_id, - # ) + example: Example[Input, ExpectedOutput], + *output: SuccessfulExampleOutput[Output], + ) -> Matches: + pairs = combinations(output, 2) + return Matches( + matches=[ + self.grader.run_grader(first, second, example) + for [first, second] in pairs + ] + ) diff --git a/src/intelligence_layer/evaluation/evaluation/elo_graders/elo_grader.py b/src/intelligence_layer/evaluation/evaluation/elo_graders/elo_grader.py new file mode 100644 index 000000000..0512b27ed --- /dev/null +++ b/src/intelligence_layer/evaluation/evaluation/elo_graders/elo_grader.py @@ -0,0 +1,56 @@ +from abc import abstractmethod +from typing import Generic, Sequence + +from pydantic import BaseModel + +from intelligence_layer.core import NoOpTracer, Task, Tracer +from intelligence_layer.core.task import Input, Output +from intelligence_layer.evaluation import MatchOutcome +from intelligence_layer.evaluation.dataset.domain import Example, ExpectedOutput +from intelligence_layer.evaluation.run.domain import SuccessfulExampleOutput + + +class Match(BaseModel): + player_a: str + player_b: str + outcome: MatchOutcome + + +class Matches(BaseModel): + matches: Sequence[Match] + + +class EloGradingInput(BaseModel): + instruction: str + first_completion: str + second_completion: str + + +class EloGrader( + Task[EloGradingInput, MatchOutcome], + Generic[ + Input, + Output, + ExpectedOutput, + ], +): + def __init__(self, tracer: Tracer = NoOpTracer()): + self.tracer = tracer + + # @abstractmethod + # def create_grading_input( + # self, + # first: SuccessfulExampleOutput[Output], + # second: SuccessfulExampleOutput[Output], + # example: Example[Input, ExpectedOutput], + # ) -> EloGradingInput: + # pass + + @abstractmethod + def run_grader( + self, + first: SuccessfulExampleOutput[Output], + second: SuccessfulExampleOutput[Output], + example: Example[Input, ExpectedOutput], # TODO Generalize away from Llama + ) -> Match: + pass diff --git a/src/intelligence_layer/evaluation/evaluation/elo_graders/elo_qa_grader.py b/src/intelligence_layer/evaluation/evaluation/elo_graders/elo_qa_grader.py new file mode 100644 index 000000000..75e27c767 --- /dev/null +++ 
b/src/intelligence_layer/evaluation/evaluation/elo_graders/elo_qa_grader.py
@@ -0,0 +1,141 @@
+from liquid import Template
+
+from intelligence_layer.core.detect_language import Language
+from intelligence_layer.core.model import ControlModel, Llama2InstructModel
+from intelligence_layer.evaluation.dataset.domain import Example, ExpectedOutput
+from intelligence_layer.evaluation.evaluation.elo_graders.elo_grader import (
+    EloGrader,
+    EloGradingInput,
+    Match,
+)
+from intelligence_layer.evaluation.run.domain import SuccessfulExampleOutput
+from intelligence_layer.examples.qa.single_chunk_qa import (
+    QA_INSTRUCTIONS,
+    SingleChunkQaInput,
+    SingleChunkQaOutput,
+)
+
+
+class EloQaGrader(
+    EloGrader[SingleChunkQaInput, SingleChunkQaOutput, SingleChunkQaOutput]
+):
+    INPUT_TEMPLATE = """
+Your task is to compare two answers to an instruction on one metric.
+
+Please make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.
+
+The Instruction for the answers was:{instruction}
+
+Evaluation Procedure:
+1. Read both answers carefully and identify the main facts and details they present.
+2. Check if the answers contain any factual errors that are not supported by the instruction.
+3. Evaluate which answer is more correct.
+
+Answer A:{first_completion}
+
+Answer B:{second_completion}
+
+Which answer is more correct given the Instruction and Evaluation Procedure, Answer A or Answer B?
+
+Response: Answer """
+    VALUES = [
+        " A",
+        " B",
+    ]  # The space before the A and B is important due to tokenization
+
+    def __init__(self, model: ControlModel = Llama2InstructModel()):
+        super().__init__()
+        self._model = model
+
+    def _create_grading_input(
+        self,
+        first: SuccessfulExampleOutput[SingleChunkQaOutput],
+        second: SuccessfulExampleOutput[SingleChunkQaOutput],
+        example: Example[SingleChunkQaInput, ExpectedOutput],
+    ) -> EloGradingInput:
+        qa_instruction = Template(
+            QA_INSTRUCTIONS[Language("en")].unformatted_instruction
+        ).render(question=example.input.question)
+
+        no_answer = "There is no answer."
+ return EloGradingInput( + instruction=f"{example.input.chunk} {qa_instruction}", + first_completion=( + first.output.answer if first.output.answer is not None else no_answer + ), + second_completion=( + second.output.answer if second.output.answer is not None else no_answer + ), + ) + + # def do_run(self, input: EloGradingInput, task_span: TaskSpan) -> MatchOutcome: + # text = self.INPUT_TEMPLATE.format( + # instruction=input.instruction, + # first_completion=input.first_completion, + # second_completion=input.second_completion, + # ) + # + # complete_input = self._create_complete_input(Prompt.from_text(text)) + # complete_output = self._model.complete_task().run(complete_input, task_span) + # + # return self._calculate_winners(complete_output) + # + def run_grader( + self, + first: SuccessfulExampleOutput[SingleChunkQaOutput], + second: SuccessfulExampleOutput[SingleChunkQaOutput], + example: Example[SingleChunkQaInput, ExpectedOutput], + ) -> Match: + grading_input = self._create_grading_input(first, second, example) + + return Match( + outcome=self.do_run( + grading_input, + self.tracer.task_span( + task_name="elo_qa_run_grader", input=grading_input + ), + ), + player_a=first.run_id, + player_b=second.run_id, + ) + + # + # def _create_complete_input(self, prompt: Prompt) -> CompleteInput: + # return CompleteInput( + # prompt=prompt, + # maximum_tokens=1, + # log_probs=3, + # disable_optimizations=True, + # ) + # + # def _calculate_winners(self, complete_output: CompleteOutput) -> MatchOutcome: + # default_log_prob = float("-inf") + # + # def get_normalized_prob( + # log_prob_list: Sequence[Mapping[str, float | None]] | None, + # ) -> float: + # assert log_prob_list is not None + # log_probs = log_prob_list[0] + # values = [ + # math.exp(log_probs.get(str(key), default_log_prob) or default_log_prob) + # for key in self.VALUES + # ] + # if all(v == 0 for v in values): + # raise ValueError( + # f"LLM evaluation response does not contain logprobs for the required tokens for the values: {self.VALUES}" + # ) + # normalized_A_prob = values[0] / sum(values) + # return normalized_A_prob + # + # def categorize_value(value: float) -> MatchOutcome: + # if value > 0.7: + # return MatchOutcome.A_WINS + # elif 0.3 > value: + # return MatchOutcome.B_WINS + # else: + # return MatchOutcome.DRAW + # + # normalized_probability = get_normalized_prob( + # complete_output.completions[0].log_probs + # ) + # return categorize_value(normalized_probability) diff --git a/tests/evaluation/test_elo_evaluator.py b/tests/evaluation/test_elo_evaluator.py index e9f57c9f1..b186b61be 100644 --- a/tests/evaluation/test_elo_evaluator.py +++ b/tests/evaluation/test_elo_evaluator.py @@ -1,16 +1,9 @@ -from itertools import combinations - from intelligence_layer.evaluation.aggregation.elo import MatchOutcome -from intelligence_layer.evaluation.dataset.domain import Example - -from intelligence_layer.evaluation.evaluation.elo_evaluator import ( - EloEvaluationLogic, - Match, -) from intelligence_layer.evaluation.run.domain import SuccessfulExampleOutput -def choose_winner(first: SuccessfulExampleOutput[str], second: SuccessfulExampleOutput[str] +def choose_winner( + first: SuccessfulExampleOutput[str], second: SuccessfulExampleOutput[str] ) -> MatchOutcome: if first.run_id < second.run_id: return MatchOutcome.A_WINS @@ -20,56 +13,176 @@ def choose_winner(first: SuccessfulExampleOutput[str], second: SuccessfulExample return MatchOutcome.DRAW -class LexicographicELoComparisonEvaluationLogic( - EloEvaluationLogic[str, str] -): - 
def do_evaluate( - self, - example: Example[str, str], - *output: SuccessfulExampleOutput[str], - ) -> list[Match]: - pairs = combinations(output, 2) - return [ - Match( - outcome=choose_winner(first, second), - player_a=first.run_id, - player_b=second.run_id, - ) - for [first, second] in pairs - ] - - -def test_choose_winner_should_return_contestant_with_lower_run_id(): - # Given - contestant_a = SuccessfulExampleOutput[str](run_id="a", example_id="", output="") - contestant_b = SuccessfulExampleOutput[str](run_id="b", example_id="", output="") - contestant_a2 = SuccessfulExampleOutput[str](run_id="a", example_id="", output="") - contestant_c = SuccessfulExampleOutput[str](run_id="c", example_id="", output="") - # When - match_a_wins = choose_winner(contestant_a, contestant_b) - match_b_wins = choose_winner(contestant_c, contestant_b) - match_draw = choose_winner(contestant_a, contestant_a2) - # Then - assert match_a_wins == MatchOutcome.A_WINS - assert match_b_wins == MatchOutcome.B_WINS - assert match_draw == MatchOutcome.DRAW - - -def test_do_evaluate_should_build_correct_matches(): - - example = Example(input=None, expected_output=None) - contestant_a = SuccessfulExampleOutput[str](run_id="a", example_id="_", output="_") - contestant_b = SuccessfulExampleOutput[str](run_id="b", example_id="_", output="_") - contestant_c = SuccessfulExampleOutput[str](run_id="c", example_id="_", output="_") - contestants = [contestant_a, contestant_b, contestant_c] - - evaluation_logic = LexicographicELoComparisonEvaluationLogic() - - matches = evaluation_logic.do_evaluate(example, *contestants) - - for match in matches: - assert isinstance(match, Match) - if match.player_a < match.player_b: - assert match.outcome == MatchOutcome.A_WINS - elif match.player_a > match.player_b: - assert match.outcome == MatchOutcome.B_WINS \ No newline at end of file +# class LexicographicELoComparisonEvaluationLogic(EloEvaluationLogic[str, str, str]): +# def _run_grader( +# self, +# example: Example[str, str], +# *output: SuccessfulExampleOutput[str], +# ) -> Matches: +# pairs = combinations(output, 2) +# return Matches( +# matches=[ +# Match( +# outcome=choose_winner(first, second), +# player_a=first.run_id, +# player_b=second.run_id, +# ) +# for [first, second] in pairs +# ] +# ) +# +# +# @fixture +# def in_memory_dataset_repository() -> InMemoryDatasetRepository: +# return InMemoryDatasetRepository() +# +# +# @fixture +# def in_memory_run_repository() -> InMemoryRunRepository: +# return InMemoryRunRepository() +# +# +# @fixture +# def in_memory_evaluation_repository() -> InMemoryEvaluationRepository: +# return InMemoryEvaluationRepository() +# +# +# @fixture +# def elo_evaluation_logic() -> EloEvaluationLogic: +# return LexicographicELoComparisonEvaluationLogic(grader=Task[None, None]) +# +# +# @fixture +# def elo_evaluator( +# in_memory_dataset_repository: InMemoryDatasetRepository, +# in_memory_run_repository: InMemoryRunRepository, +# in_memory_evaluation_repository: InMemoryEvaluationRepository, +# elo_evaluation_logic: EloEvaluationLogic, +# ) -> Evaluator: +# return Evaluator( +# in_memory_dataset_repository, +# in_memory_run_repository, +# in_memory_evaluation_repository, +# "Testing", +# elo_evaluation_logic, +# ) +# +# +# @fixture +# def qa_outputs() -> Sequence[SingleChunkQaOutput]: +# return [ +# SingleChunkQaOutput(answer=answer, highlights=[]) +# for answer in [ +# "Surface micromachining builds microstructures.", +# "Surface micromachining builds microstructures. 
This is done by deposition and etching structural layers over a substrate.", +# "Surface micromachining builds microstructures by deposition and etching structural layers over a substrate. This is different from Bulk micromachining, in which a silicon substrate wafer is selectively etched to produce structures.", +# ] +# ] +# +# +# @fixture +# def qa_setup( +# in_memory_dataset_repository: InMemoryDatasetRepository, +# in_memory_run_repository: InMemoryRunRepository, +# qa_outputs: Sequence[SingleChunkQaOutput], +# ) -> Tuple[Sequence[str], str]: +# qa_input_text = """Surface micromachining builds microstructures by deposition and etching structural layers over a substrate.[1] This is different from Bulk micromachining, in which a silicon substrate wafer is selectively etched to produce structures.""" +# qa_input = SingleChunkQaInput( +# chunk=qa_input_text, question="What is micromachining?", language=Language("en") +# ) +# expected_output = "Surface micromachining builds microstructures by deposition and etching structural layers over a substrate." +# +# example_id = "some-example-id" +# dataset_id = in_memory_dataset_repository.create_dataset( +# examples=[ +# Example(input=qa_input, expected_output=expected_output, id=example_id) +# ], +# dataset_name="some-example-dataset-name", +# ).id +# +# run_ids = [f"some-run-id-{i}" for i in range(len(qa_outputs))] +# for i, output in enumerate(qa_outputs): +# in_memory_run_repository.store_example_output( +# example_output=ExampleOutput( +# run_id=run_ids[i], +# example_id=example_id, +# output=output, +# ) +# ) +# in_memory_run_repository.store_run_overview( +# RunOverview( +# dataset_id=dataset_id, +# id=run_ids[i], +# start=utc_now(), +# end=utc_now(), +# failed_example_count=0, +# successful_example_count=len(qa_outputs), +# description="runner", +# ) +# ) +# +# return run_ids, dataset_id +# +# +# def test_choose_winner_should_return_contestant_with_lower_run_id(): +# # Given +# contestant_a = SuccessfulExampleOutput[str](run_id="a", example_id="", output="") +# contestant_b = SuccessfulExampleOutput[str](run_id="b", example_id="", output="") +# contestant_a2 = SuccessfulExampleOutput[str](run_id="a", example_id="", output="") +# contestant_c = SuccessfulExampleOutput[str](run_id="c", example_id="", output="") +# # When +# match_a_wins = choose_winner(contestant_a, contestant_b) +# match_b_wins = choose_winner(contestant_c, contestant_b) +# match_draw = choose_winner(contestant_a, contestant_a2) +# # Then +# assert match_a_wins == MatchOutcome.A_WINS +# assert match_b_wins == MatchOutcome.B_WINS +# assert match_draw == MatchOutcome.DRAW +# +# +# def test_do_evaluate_should_build_correct_matches(): +# example = Example(input=None, expected_output=None) +# contestant_a = SuccessfulExampleOutput[str](run_id="a", example_id="_", output="_") +# contestant_b = SuccessfulExampleOutput[str](run_id="b", example_id="_", output="_") +# contestant_c = SuccessfulExampleOutput[str](run_id="c", example_id="_", output="_") +# contestants = [contestant_a, contestant_b, contestant_c] +# +# evaluation_logic = LexicographicELoComparisonEvaluationLogic( +# grader=Task[None, None] +# ) +# +# matches = evaluation_logic._run_grader(example, *contestants).matches +# +# for match in matches: +# assert isinstance(match, Match) +# if match.player_a < match.player_b: +# assert match.outcome == MatchOutcome.A_WINS +# elif match.player_a > match.player_b: +# assert match.outcome == MatchOutcome.B_WINS +# +# +# def test_full_elo_eval_run( +# qa_setup: Tuple[Sequence[str], 
str], # TODO: Better name +# elo_evaluator: Evaluator, +# in_memory_dataset_repository: InMemoryDatasetRepository, +# in_memory_run_repository: InMemoryRunRepository, +# in_memory_evaluation_repository: InMemoryEvaluationRepository, +# ) -> None: +# run_ids, _ = qa_setup +# +# evaluation_overview = elo_evaluator.evaluate_runs(run_ids[0], run_ids[1]) +# +# new_elo_qa_evaluator = Evaluator( +# in_memory_dataset_repository, +# in_memory_run_repository, +# in_memory_evaluation_repository, +# "Testing", +# evaluation_logic=EloEvaluationLogic(grader=Task[str, str]), +# ) +# +# # new_evaluation_overview = new_elo_qa_evaluator.evaluate_runs(*run_ids) +# +# print(evaluation_overview) +# print(new_elo_qa_evaluator) +# +# # TODO check if above code runs and add assertions
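
Note on intended usage: the end-to-end test above is still commented out, so the sketch below only illustrates how the pieces introduced in this patch are meant to fit together, assuming the interfaces land as written here. LexicographicGrader is a hypothetical stand-in that mirrors choose_winner from the test module (it is not part of this patch), the trivial do_run override exists solely to satisfy the abstract Task interface, and the TaskSpan import from intelligence_layer.core is an assumption.

from intelligence_layer.core import TaskSpan
from intelligence_layer.evaluation.aggregation.elo import MatchOutcome
from intelligence_layer.evaluation.dataset.domain import Example
from intelligence_layer.evaluation.evaluation.elo_evaluator import EloEvaluationLogic
from intelligence_layer.evaluation.evaluation.elo_graders.elo_grader import (
    EloGrader,
    EloGradingInput,
    Match,
)
from intelligence_layer.evaluation.run.domain import SuccessfulExampleOutput


class LexicographicGrader(EloGrader[str, str, str]):
    """Hypothetical grader: the output with the lexicographically smaller run_id wins."""

    def do_run(self, input: EloGradingInput, task_span: TaskSpan) -> MatchOutcome:
        # Present only to satisfy the Task interface; this grader never calls a model.
        return MatchOutcome.DRAW

    def run_grader(
        self,
        first: SuccessfulExampleOutput[str],
        second: SuccessfulExampleOutput[str],
        example: Example[str, str],
    ) -> Match:
        # Compare run ids instead of model outputs, as in choose_winner above.
        if first.run_id < second.run_id:
            outcome = MatchOutcome.A_WINS
        elif first.run_id > second.run_id:
            outcome = MatchOutcome.B_WINS
        else:
            outcome = MatchOutcome.DRAW
        return Match(outcome=outcome, player_a=first.run_id, player_b=second.run_id)


evaluation_logic = EloEvaluationLogic(grader=LexicographicGrader())
example = Example(input="some input", expected_output="some expected output")
outputs = [
    SuccessfulExampleOutput[str](run_id=run_id, example_id=example.id, output="")
    for run_id in ("a", "b", "c")
]
# combinations(outputs, 2) yields (a, b), (a, c), (b, c) -> three pairwise matches
matches = evaluation_logic.do_evaluate(example, *outputs).matches
assert all(match.outcome == MatchOutcome.A_WINS for match in matches)

EloQaGrader plugs into the same seam: its run_grader builds an EloGradingInput from the two answers and asks the control model which completion is better, rather than comparing run ids.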