[AIC-py][eval] add structured model-based evals
- add `make_structured_llm_metric()`, which takes some configuration and creates a metric
  that queries an LLM with the text to evaluate and some evaluation criteria,
  and returns a structured eval result (see the usage sketch below)
- wrap it with `make_openai_structured_llm_metric()`
- add an example, `gpt3_5_text_ratings()`
- add tests that mock the OpenAI call
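For orientation, here is a hypothetical sketch of how the new factory might be called. The parameter names and the module path are assumptions for illustration; the actual signature of `make_openai_structured_llm_metric()` does not appear in this diff.

# Hypothetical usage sketch; parameter names and module path are assumed,
# not taken from this commit.
from aiconfig.eval.common import TextRatingsData
from aiconfig.eval.metrics import make_openai_structured_llm_metric

text_ratings = make_openai_structured_llm_metric(
    eval_llm_name="gpt-3.5-turbo",            # assumed: the model that judges the text
    pydantic_basemodel_type=TextRatingsData,  # assumed: schema of the structured result
    metric_description="Rate the text for conciseness.",  # assumed
)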
jonathanlastmileai committed Dec 18, 2023
1 parent 38ca486 commit 85febe4
Showing 7 changed files with 599 additions and 184 deletions.
python/requirements.txt (1 addition & 1 deletion)
@@ -14,6 +14,6 @@ nest_asyncio
 prompt_toolkit
 mock
 pytest-asyncio
-lastmile-utils==0.0.10
+lastmile-utils==0.0.11
 hypothesis==6.91.0
 nltk
python/src/aiconfig/eval/api/__init__.py (2 additions & 1 deletion)
@@ -9,14 +9,15 @@
     TestSuiteWithInputsSettings,
 )
 """
-from .. import metrics
+from .. import common, metrics
 
 # pyright: reportWildcardImportFromLibrary=false
 from ..lib import TestSuiteWithInputsSettings, run_test_suite_outputs_only, run_test_suite_with_inputs
 from ..metrics import Metric, brevity, substring_match
 
 __all__ = [
     "Metric",
+    "common",
     "metrics",
     "brevity",
     "substring_match",
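With `common` added to the exports, both submodules are importable straight from the api package. A minimal sketch of the import surface, using only names listed in `__all__` above:

# Names listed in __all__ above, imported from the public api package.
from aiconfig.eval.api import Metric, brevity, common, metrics, substring_match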
python/src/aiconfig/eval/common.py (38 additions & 2 deletions)
@@ -1,14 +1,21 @@
 import json
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import Any, Generic, Protocol, TypeVar
+from typing import Any, Generic, NewType, Protocol, Type, TypeVar
 
 import lastmile_utils.lib.core.api as cu
-from pydantic import root_validator
+import result
+from aiconfig.eval import common
+from pydantic import BaseModel, root_validator
+from result import Result
 
 T_InputDatum = TypeVar("T_InputDatum", contravariant=True)
 T_OutputDatum = TypeVar("T_OutputDatum", contravariant=True)
 
+T_BaseModel = TypeVar("T_BaseModel", bound=BaseModel)
+
+SerializedJSON = NewType("SerializedJSON", str)
+
 
 @dataclass
 class CustomMetricValue(ABC):
@@ -21,9 +28,20 @@ class CustomMetricValue(ABC):
     """
 
 
+class CompletionTextToSerializedJSON(Protocol):
+    @abstractmethod
+    def __call__(self, output_datum: str) -> Result[common.SerializedJSON, str]:
+        pass
+
+
 MetricValue = int | float | str | bool | CustomMetricValue
 
 
+@dataclass
+class CustomMetricPydanticObject(CustomMetricValue, Generic[T_BaseModel]):
+    data: T_BaseModel
+
+
 class EvaluationFunction(Protocol, Generic[T_OutputDatum]):
     @abstractmethod
     async def __call__(self, output_datum: T_OutputDatum) -> MetricValue:
@@ -127,3 +145,21 @@ def check_value_range(cls, values: dict[str, Any]) -> dict[str, Any]:
             )
         else:
             return values
+
+
+class TextRatingsData(cu.Record):
+    conciseness_rating: int
+    conciseness_confidence: float
+    conciseness_reasoning: str
+
+
+def get_llm_structured_response(
+    input_text: str,
+    chat_completion_create: CompletionTextToSerializedJSON,
+    basemodel_type: Type[common.T_BaseModel],
+) -> Result[common.T_BaseModel, str]:
+    return result.do(
+        cu.safe_model_validate_json(response_ok, basemodel_type)
+        # get the serialized JSON response
+        for response_ok in chat_completion_create(input_text)
+    )
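To make the new pieces concrete, here is a minimal sketch that wires `get_llm_structured_response` to a stub completion function standing in for a real OpenAI call (much as the new tests mock it). The stub and its hard-coded JSON payload are assumptions for illustration, not code from this commit.

# Minimal sketch; the stub below stands in for a mocked OpenAI call.
from result import Ok, Result

from aiconfig.eval import common


def stub_chat_completion(output_datum: str) -> Result[common.SerializedJSON, str]:
    # Satisfies the CompletionTextToSerializedJSON protocol: return the
    # "LLM's" structured ratings as serialized JSON (hard-coded here).
    return Ok(
        common.SerializedJSON(
            '{"conciseness_rating": 4,'
            ' "conciseness_confidence": 0.9,'
            ' "conciseness_reasoning": "Short and direct."}'
        )
    )


ratings = common.get_llm_structured_response(
    input_text="The quick brown fox jumps over the lazy dog.",
    chat_completion_create=stub_chat_completion,
    basemodel_type=common.TextRatingsData,
)
# ratings is Result[TextRatingsData, str]: Ok(TextRatingsData(...)) when the
# JSON validates against the model, Err(...) otherwise.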
