[AIC-py][eval] add structured model-based evals
- add `make_structured_llm_metric()`, which takes some configuration and creates a metric
  that queries an LLM with the text to evaluate and some evaluation criteria,
  and returns a structured eval result (see the usage sketch below)
- wrap it with `make_openai_structured_llm_metric()`
- add an example, `gpt3_5_text_ratings()`
- add tests that mock the OpenAI call
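For orientation, here is a hypothetical sketch of how the new factory might be called. The parameter names and the module path are assumptions for illustration; the actual signature of `make_openai_structured_llm_metric()` does not appear in this diff.

# Hypothetical usage sketch; parameter names and module path are assumed,
# not taken from this commit.
from aiconfig.eval.common import TextRatingsData
from aiconfig.eval.metrics import make_openai_structured_llm_metric

text_ratings = make_openai_structured_llm_metric(
    eval_llm_name="gpt-3.5-turbo",            # assumed: the model that judges the text
    pydantic_basemodel_type=TextRatingsData,  # assumed: schema of the structured result
    metric_description="Rate the text for conciseness.",  # assumed
)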
jonathanlastmileai committed Dec 18, 2023
1 parent 38ca486 commit 85febe4
Showing 7 changed files with 599 additions and 184 deletions.
python/requirements.txt (1 addition & 1 deletion)
@@ -14,6 +14,6 @@ nest_asyncio
 prompt_toolkit
 mock
 pytest-asyncio
-lastmile-utils==0.0.10
+lastmile-utils==0.0.11
 hypothesis==6.91.0
 nltk
python/src/aiconfig/eval/api/__init__.py (2 additions & 1 deletion)
@@ -9,14 +9,15 @@
     TestSuiteWithInputsSettings,
 )
 """
-from .. import metrics
+from .. import common, metrics
 
 # pyright: reportWildcardImportFromLibrary=false
 from ..lib import TestSuiteWithInputsSettings, run_test_suite_outputs_only, run_test_suite_with_inputs
 from ..metrics import Metric, brevity, substring_match
 
 __all__ = [
     "Metric",
+    "common",
     "metrics",
     "brevity",
     "substring_match",
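With `common` added to the exports, both submodules are importable straight from the api package. A minimal sketch of the import surface, using only names listed in `__all__` above:

# Names listed in __all__ above, imported from the public api package.
from aiconfig.eval.api import Metric, brevity, common, metrics, substring_match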
python/src/aiconfig/eval/common.py (38 additions & 2 deletions)
@@ -1,14 +1,21 @@
 import json
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import Any, Generic, Protocol, TypeVar
+from typing import Any, Generic, NewType, Protocol, Type, TypeVar
 
 import lastmile_utils.lib.core.api as cu
-from pydantic import root_validator
+import result
+from aiconfig.eval import common
+from pydantic import BaseModel, root_validator
+from result import Result
 
 T_InputDatum = TypeVar("T_InputDatum", contravariant=True)
 T_OutputDatum = TypeVar("T_OutputDatum", contravariant=True)
 
+T_BaseModel = TypeVar("T_BaseModel", bound=BaseModel)
+
+SerializedJSON = NewType("SerializedJSON", str)
+
 
 @dataclass
 class CustomMetricValue(ABC):
@@ -21,9 +28,20 @@ class CustomMetricValue(ABC):
     """
 
 
+class CompletionTextToSerializedJSON(Protocol):
+    @abstractmethod
+    def __call__(self, output_datum: str) -> Result[common.SerializedJSON, str]:
+        pass
+
+
 MetricValue = int | float | str | bool | CustomMetricValue
 
 
+@dataclass
+class CustomMetricPydanticObject(CustomMetricValue, Generic[T_BaseModel]):
+    data: T_BaseModel
+
+
 class EvaluationFunction(Protocol, Generic[T_OutputDatum]):
     @abstractmethod
     async def __call__(self, output_datum: T_OutputDatum) -> MetricValue:
@@ -127,3 +145,21 @@ def check_value_range(cls, values: dict[str, Any]) -> dict[str, Any]:
             )
         else:
             return values
+
+
+class TextRatingsData(cu.Record):
+    conciseness_rating: int
+    conciseness_confidence: float
+    conciseness_reasoning: str
+
+
+def get_llm_structured_response(
+    input_text: str,
+    chat_completion_create: CompletionTextToSerializedJSON,
+    basemodel_type: Type[common.T_BaseModel],
+) -> Result[common.T_BaseModel, str]:
+    return result.do(
+        cu.safe_model_validate_json(response_ok, basemodel_type)
+        # get the serialized JSON response
+        for response_ok in chat_completion_create(input_text)
+    )
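To make the new pieces concrete, here is a minimal sketch that wires `get_llm_structured_response` to a stub completion function standing in for a real OpenAI call (much as the new tests mock it). The stub and its hard-coded JSON payload are assumptions for illustration, not code from this commit.

# Minimal sketch; the stub below stands in for a mocked OpenAI call.
from result import Ok, Result

from aiconfig.eval import common


def stub_chat_completion(output_datum: str) -> Result[common.SerializedJSON, str]:
    # Satisfies the CompletionTextToSerializedJSON protocol: return the
    # "LLM's" structured ratings as serialized JSON (hard-coded here).
    return Ok(
        common.SerializedJSON(
            '{"conciseness_rating": 4,'
            ' "conciseness_confidence": 0.9,'
            ' "conciseness_reasoning": "Short and direct."}'
        )
    )


ratings = common.get_llm_structured_response(
    input_text="The quick brown fox jumps over the lazy dog.",
    chat_completion_create=stub_chat_completion,
    basemodel_type=common.TextRatingsData,
)
# ratings is Result[TextRatingsData, str]: Ok(TextRatingsData(...)) when the
# JSON validates against the model, Err(...) otherwise.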
