diff --git a/python/src/aiconfig/eval/metrics.py b/python/src/aiconfig/eval/metrics.py
index d51821d8a..da68341e1 100644
--- a/python/src/aiconfig/eval/metrics.py
+++ b/python/src/aiconfig/eval/metrics.py
@@ -1,6 +1,7 @@
 import json
-from functools import total_ordering
-from typing import Any, Generic, Type
+from abc import abstractmethod
+from functools import partial, total_ordering
+from typing import Any, Callable, Generic, Protocol, Type

 import lastmile_utils.lib.core.api as cu
 import nltk
@@ -9,7 +10,7 @@
 from aiconfig.eval.common import CustomMetricValue, EvaluationFunction, EvaluationMetricMetadata, T_BaseModel, T_OutputDatum, TextRatingsData
 from aiconfig.eval.openai import OpenAIChatCompletionCreate, default_openai_chat_completion_create, make_fn_completion_text_to_serialized_json
 from attr import dataclass
-from nltk.sentiment.vader import SentimentIntensityAnalyzer
+from nltk.sentiment.vader import SentimentIntensityAnalyzer as NLTKSentimentIntensityAnalyzer
 from result import Err, Ok, Result


@@ -35,43 +36,12 @@ def _check_substring(output_datum: str, substring: str, case_sensitive: bool) -> bool:
         return substring.lower() in output_datum.lower()


-def substring_match(substring: str, case_sensitive: bool = True) -> Metric[str]:
-    async def _fn(output_datum: str) -> bool:
-        return _check_substring(
-            output_datum=output_datum,
-            substring=substring,
-            case_sensitive=case_sensitive,
-        )
-
-    return Metric(
-        evaluation_fn=_fn,
-        metric_metadata=EvaluationMetricMetadata(
-            name="substring_match",
-            description="True (pass) if contains given substring",
-            best_value=True,
-            worst_value=False,
-            extra_metadata=dict(substring=substring, case_sensitive=case_sensitive),
-        ),
-    )
-
-
 async def _calculate_brevity(output_datum: str) -> int:
     if len(output_datum) == 0:
         raise ValueError("Brevity is meaningless for empty string.")
     return len(output_datum)


-brevity: Metric[str] = Metric(
-    evaluation_fn=_calculate_brevity,
-    metric_metadata=EvaluationMetricMetadata(
-        name="brevity",
-        description="Absolute text length",
-        best_value=1.0,
-        worst_value=float("inf"),
-    ),
-)
-
-
 @dataclass
 class TextSentimentScores(CustomMetricValue):
     mapping: dict[str, float]
@@ -103,55 +73,58 @@ def __lt__(self, other: Any) -> bool:
         return self.pos - self.neg < other.pos - other.neg


-async def _get_sentiment_scores(output_datum: str) -> TextSentimentScores:
-    nltk.download("vader_lexicon", quiet=True)  # type: ignore
-    sid = SentimentIntensityAnalyzer()
-    mapping: dict[str, float] = sid.polarity_scores(output_datum)  # type: ignore
+class GetPolarityScores(Protocol):
+    @abstractmethod
+    def __call__(self, text: str) -> dict[str, float]:
+        pass
+
+
+def _get_nltk_polarity_scores(text: str, model: str) -> dict[str, float]:
+    nltk.download(model, quiet=True)  # type: ignore
+    return NLTKSentimentIntensityAnalyzer().polarity_scores(text)  # type: ignore
+
+
+def _get_sentiment_scores(output_datum: str, get_polarity_scores: GetPolarityScores) -> TextSentimentScores:
+    mapping: dict[str, float] = get_polarity_scores(output_datum)
     highest: str = pd.Series(mapping).idxmax()  # type: ignore
     return TextSentimentScores(mapping=mapping, **mapping, highest=highest)


-async def _get_sentiment(output_datum: str) -> str:
-    scores = await _get_sentiment_scores(output_datum)
-    return scores.highest
+def make_get_sentiment_scores(get_polarity_scores: GetPolarityScores) -> EvaluationFunction[str]:
+    async def _f(output_datum: str) -> TextSentimentScores:
+        return _get_sentiment_scores(output_datum, get_polarity_scores)
+
+    return _f


-async def _get_overall_positive_sentiment(output_datum: str) -> TextOverallPositiveSentiment:
-    scores = await _get_sentiment_scores(output_datum)
-    return TextOverallPositiveSentiment(pos=scores.pos, neg=scores.neg)
+def make_get_sentiment_class(get_polarity_scores: GetPolarityScores) -> EvaluationFunction[str]:
+    async def _f(output_datum: str) -> str:
+        scores = _get_sentiment_scores(output_datum, get_polarity_scores)
+        return scores.highest


-sentiment_scores: Metric[str] = Metric(
-    evaluation_fn=_get_sentiment_scores,
-    metric_metadata=EvaluationMetricMetadata(
-        name="sentiment_scores",
-        description="Sentiment scores container object",
-        best_value=None,
-        worst_value=None,
-    ),
-)
+    return _f


-sentiment_class: Metric[str] = Metric(
-    evaluation_fn=_get_sentiment,
-    metric_metadata=EvaluationMetricMetadata(
-        name="sentiment_class",
-        description="top sentiment class",
-        best_value=None,
-        worst_value=None,
-    ),
-)
+def make_get_overall_positive_sentiment(get_polarity_scores: GetPolarityScores) -> EvaluationFunction[str]:
+    async def _f(output_datum: str) -> TextOverallPositiveSentiment:
+        scores = _get_sentiment_scores(output_datum, get_polarity_scores)
+        return TextOverallPositiveSentiment(pos=scores.pos, neg=scores.neg)
+
+    return _f


-sentiment_score_overall_positive: Metric[str] = Metric(
-    evaluation_fn=_get_overall_positive_sentiment,
-    metric_metadata=EvaluationMetricMetadata(
-        name="sentiment_score_overall_positive",
-        description="Positive minus negative",
-        best_value=TextOverallPositiveSentiment(pos=1.0, neg=0.0),
-        worst_value=TextOverallPositiveSentiment(pos=0.0, neg=1.0),
-    ),
-)
+
+def make_sentiment_scores_metric(
+    get_polarity_scores: GetPolarityScores,
+    make_evaluation_fn: Callable[[GetPolarityScores], EvaluationFunction[str]],
+    name: str,
+    description: str,
+    best_value: common.MetricValue | None = None,
+    worst_value: common.MetricValue | None = None,
+) -> Metric[str]:
+    return Metric(
+        evaluation_fn=make_evaluation_fn(get_polarity_scores),
+        metric_metadata=EvaluationMetricMetadata(name=name, description=description, best_value=best_value, worst_value=worst_value),
+    )


 def make_structured_llm_metric(
@@ -241,6 +214,12 @@ def _with_description(key: str, value: dict[str, str]) -> dict[str, str]:
     )


+## User interface
+
+
+# 1. functions that return metrics intended to be called directly
+
+
 def make_openai_structured_llm_metric(
     eval_llm_name: str,
     pydantic_basemodel_type: Type[T_BaseModel],
@@ -266,6 +245,39 @@
         raise ValueError(f"Error making metric: {e}")


+def substring_match(substring: str, case_sensitive: bool = True) -> Metric[str]:
+    async def _fn(output_datum: str) -> bool:
+        return _check_substring(
+            output_datum=output_datum,
+            substring=substring,
+            case_sensitive=case_sensitive,
+        )
+
+    return Metric(
+        evaluation_fn=_fn,
+        metric_metadata=EvaluationMetricMetadata(
+            name="substring_match",
+            description="True (pass) if contains given substring",
+            best_value=True,
+            worst_value=False,
+            extra_metadata=dict(substring=substring, case_sensitive=case_sensitive),
+        ),
+    )
+
+
+# 2. literal metrics
+
+brevity: Metric[str] = Metric(
+    evaluation_fn=_calculate_brevity,
+    metric_metadata=EvaluationMetricMetadata(
+        name="brevity",
+        description="Absolute text length",
+        best_value=1.0,
+        worst_value=float("inf"),
+    ),
+)
+
+
 gpt3_5_text_ratings = make_openai_structured_llm_metric(
     eval_llm_name="gpt-3.5-turbo-0613",
     pydantic_basemodel_type=TextRatingsData,
@@ -277,3 +289,26 @@
         conciseness_reasoning="reasoning behind the conciseness rating",
     ),
 )
+
+nltk_sentiment_scores_vader = make_sentiment_scores_metric(
+    get_polarity_scores=partial(_get_nltk_polarity_scores, model="vader_lexicon"),
+    make_evaluation_fn=make_get_sentiment_scores,
+    name="nltk_sentiment_scores_vader",
+    description="NLTK sentiment scores using Vader",
+)
+
+nltk_sentiment_class_vader = make_sentiment_scores_metric(
+    get_polarity_scores=partial(_get_nltk_polarity_scores, model="vader_lexicon"),
+    make_evaluation_fn=make_get_sentiment_class,
+    name="nltk_sentiment_class_vader",
+    description="Highest-probability NLTK sentiment class using Vader",
+)
+
+nltk_sentiment_score_overall_positive = make_sentiment_scores_metric(
+    get_polarity_scores=partial(_get_nltk_polarity_scores, model="vader_lexicon"),
+    make_evaluation_fn=make_get_overall_positive_sentiment,
+    name="nltk_sentiment_score_overall_positive",
+    description="Positive minus negative",
+    best_value=TextOverallPositiveSentiment(pos=1.0, neg=0.0),
+    worst_value=TextOverallPositiveSentiment(pos=0.0, neg=1.0),
+)
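Note on the refactor above: the module-level VADER metrics (sentiment_scores, sentiment_class, sentiment_score_overall_positive) are replaced by the make_sentiment_scores_metric factory, which accepts any callable satisfying the GetPolarityScores protocol, so the NLTK dependency can be injected or mocked (as the tests below do). A minimal usage sketch, assuming the module is importable as aiconfig.eval.metrics; the keyword_polarity_scores scorer is hypothetical and only for illustration:

    from aiconfig.eval import metrics

    # Hypothetical scorer satisfying GetPolarityScores: any callable that maps
    # text to a {sentiment class: score} dict works; no NLTK download needed.
    def keyword_polarity_scores(text: str) -> dict[str, float]:
        words = text.lower().split()
        pos = float(sum(w in {"good", "great", "amazing"} for w in words))
        neg = float(sum(w in {"bad", "awful", "bother"} for w in words))
        return {"pos": pos, "neu": 1.0, "neg": neg, "compound": pos - neg}

    # Build a Metric[str] that reports the highest-scoring class, mirroring how
    # nltk_sentiment_class_vader is constructed above.
    keyword_sentiment_class = metrics.make_sentiment_scores_metric(
        get_polarity_scores=keyword_polarity_scores,
        make_evaluation_fn=metrics.make_get_sentiment_class,
        name="keyword_sentiment_class",
        description="Highest-scoring sentiment class from a keyword scorer",
    )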
diff --git a/python/tests/test_eval.py b/python/tests/test_eval.py
index 98bf1126d..0f3357132 100644
--- a/python/tests/test_eval.py
+++ b/python/tests/test_eval.py
@@ -22,6 +22,28 @@
 brevity = metrics.brevity
 substring_match = metrics.substring_match

+MOCK_NLTK_SENTIMENT_SCORE_MAPPING = {
+    "nltk is amazing": {"pos": 0.9, "neu": 0.1, "neg": 0.0, "compound": 0.9},
+    "whats for dinner?": {"pos": 0.0, "neu": 0.9, "neg": 0.1, "compound": -0.9},
+    "oh, bother": {"pos": 0.0, "neu": 0.1, "neg": 0.9, "compound": -0.9},
+}
+
+
+def _compute_mock_sentiment_class_mapping(score_mapping: dict[str, dict[str, float]]) -> dict[str, str]:
+    out: dict[str, str] = {}
+    for k, scores in score_mapping.items():
+        max_class, max_score = "", float("-inf")
+        for class_, score in scores.items():
+            if score > max_score:
+                max_score = score
+                max_class = class_
+        out[k] = max_class
+
+    return out
+
+
+MOCK_NLTK_SENTIMENT_CLASS_MAPPING = _compute_mock_sentiment_class_mapping(MOCK_NLTK_SENTIMENT_SCORE_MAPPING)
+

 def set_pd():
     pd.set_option("display.max_rows", 50)
@@ -195,34 +217,57 @@ async def test_run_test_suite_with_inputs(data: st.DataObject):
         assert False, f"expected Ok, got Err({e})"


+def _make_mock_nltk_metrics() -> list[metrics.Metric[str]]:
+    def _mock_get_nltk_polarity_scores(text: str) -> dict[str, float]:
+        return MOCK_NLTK_SENTIMENT_SCORE_MAPPING[text]
+
+    mock_nltk_sentiment_scores_vader = metrics.make_sentiment_scores_metric(
+        get_polarity_scores=_mock_get_nltk_polarity_scores,
+        make_evaluation_fn=metrics.make_get_sentiment_scores,
+        name="nltk_sentiment_scores_vader",
+        description="NLTK sentiment scores using Vader",
+    )
+
+    mock_nltk_sentiment_class_vader = metrics.make_sentiment_scores_metric(
+        get_polarity_scores=_mock_get_nltk_polarity_scores,
+        make_evaluation_fn=metrics.make_get_sentiment_class,
+        name="nltk_sentiment_class_vader",
+        description="Highest-probability NLTK sentiment class using Vader",
+    )
+
+    mock_nltk_sentiment_score_overall_positive = metrics.make_sentiment_scores_metric(
+        get_polarity_scores=_mock_get_nltk_polarity_scores,
+        make_evaluation_fn=metrics.make_get_overall_positive_sentiment,
+        name="nltk_sentiment_score_overall_positive",
+        description="Positive minus negative",
+        best_value=metrics.TextOverallPositiveSentiment(pos=1.0, neg=0.0),
+        worst_value=metrics.TextOverallPositiveSentiment(pos=0.0, neg=1.0),
+    )
+
+    return [mock_nltk_sentiment_scores_vader, mock_nltk_sentiment_class_vader, mock_nltk_sentiment_score_overall_positive]
+
+
 @pytest.mark.asyncio
 async def test_custom_metric_type():
+    mock_nltk_metrics = _make_mock_nltk_metrics()
     user_test_suite_outputs_only = list(
         itertools.product(
             ["nltk is amazing", "whats for dinner?", "oh, bother"],
-            [
-                metrics.sentiment_scores,
-                metrics.sentiment_class,
-                metrics.sentiment_score_overall_positive,
-            ],
+            mock_nltk_metrics,
         )
     )
     df = await run_test_suite_outputs_only(user_test_suite_outputs_only)
     result = df.set_index(["metric_name", "aiconfig_output"]).value.unstack(0).to_dict()  # type: ignore
-    assert result["sentiment_class"] == {
-        "nltk is amazing": "pos",
-        "whats for dinner?": "neu",
-        "oh, bother": "neg",
-    }
+    assert result["nltk_sentiment_class_vader"] == MOCK_NLTK_SENTIMENT_CLASS_MAPPING

-    assert all(isinstance(v, metrics.TextSentimentScores) for v in result["sentiment_scores"].values())  # type: ignore
+    assert all(isinstance(v, metrics.TextSentimentScores) for v in result["nltk_sentiment_scores_vader"].values())  # type: ignore

-    assert all(isinstance(v, metrics.TextOverallPositiveSentiment) for v in result["sentiment_score_overall_positive"].values())  # type: ignore
+    assert all(isinstance(v, metrics.TextOverallPositiveSentiment) for v in result["nltk_sentiment_score_overall_positive"].values())  # type: ignore

     neutral = metrics.TextOverallPositiveSentiment(pos=0.0, neg=0.0)
-    assert result["sentiment_score_overall_positive"]["nltk is amazing"] > neutral
-    assert result["sentiment_score_overall_positive"]["oh, bother"] < neutral
+    assert result["nltk_sentiment_score_overall_positive"]["nltk is amazing"] > neutral
+    assert result["nltk_sentiment_score_overall_positive"]["oh, bother"] < neutral


 @pytest.mark.asyncio