[AIC-py][eval] refactor nltk score metrics for mocking
jonathanlastmileai committed Dec 18, 2023
1 parent 85febe4 commit c358470
Showing 2 changed files with 165 additions and 85 deletions.
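
In short, the NLTK sentiment metrics are no longer hard-wired to SentimentIntensityAnalyzer inside their evaluation functions. Polarity scoring is injected through a GetPolarityScores protocol into small factory functions, so tests can substitute a deterministic stub instead of downloading the VADER lexicon. A minimal sketch of the new construction path, using the factory signatures introduced below (the stub mapping is illustrative and not part of this commit):

from aiconfig.eval import metrics


def stub_polarity_scores(text: str) -> dict[str, float]:
    # Deterministic stand-in satisfying the GetPolarityScores protocol;
    # no nltk.download() call is needed.
    return {"pos": 1.0, "neu": 0.0, "neg": 0.0, "compound": 1.0}


stub_sentiment_class = metrics.make_sentiment_scores_metric(
    get_polarity_scores=stub_polarity_scores,
    make_evaluation_fn=metrics.make_get_sentiment_class,
    name="stub_sentiment_class",
    description="Highest-probability sentiment class from a stubbed model",
)
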
177 changes: 106 additions & 71 deletions python/src/aiconfig/eval/metrics.py
@@ -1,6 +1,7 @@
import json
from functools import total_ordering
from typing import Any, Generic, Type
from abc import abstractmethod
from functools import partial, total_ordering
from typing import Any, Callable, Generic, Protocol, Type

import lastmile_utils.lib.core.api as cu
import nltk
@@ -9,7 +10,7 @@
from aiconfig.eval.common import CustomMetricValue, EvaluationFunction, EvaluationMetricMetadata, T_BaseModel, T_OutputDatum, TextRatingsData
from aiconfig.eval.openai import OpenAIChatCompletionCreate, default_openai_chat_completion_create, make_fn_completion_text_to_serialized_json
from attr import dataclass
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer as NLTKSentimentIntensityAnalyzer
from result import Err, Ok, Result


@@ -35,43 +36,12 @@ def _check_substring(output_datum: str, substring: str, case_sensitive: bool) ->
return substring.lower() in output_datum.lower()


def substring_match(substring: str, case_sensitive: bool = True) -> Metric[str]:
async def _fn(output_datum: str) -> bool:
return _check_substring(
output_datum=output_datum,
substring=substring,
case_sensitive=case_sensitive,
)

return Metric(
evaluation_fn=_fn,
metric_metadata=EvaluationMetricMetadata(
name="substring_match",
description="True (pass) if contains given substring",
best_value=True,
worst_value=False,
extra_metadata=dict(substring=substring, case_sensitive=case_sensitive),
),
)


async def _calculate_brevity(output_datum: str) -> int:
if len(output_datum) == 0:
raise ValueError("Brevity is meaningless for empty string.")
return len(output_datum)


brevity: Metric[str] = Metric(
evaluation_fn=_calculate_brevity,
metric_metadata=EvaluationMetricMetadata(
name="brevity",
description="Absolute text length",
best_value=1.0,
worst_value=float("inf"),
),
)


@dataclass
class TextSentimentScores(CustomMetricValue):
mapping: dict[str, float]
@@ -103,55 +73,58 @@ def __lt__(self, other: Any) -> bool:
return self.pos - self.neg < other.pos - other.neg


async def _get_sentiment_scores(output_datum: str) -> TextSentimentScores:
nltk.download("vader_lexicon", quiet=True) # type: ignore
sid = SentimentIntensityAnalyzer()
mapping: dict[str, float] = sid.polarity_scores(output_datum) # type: ignore
class GetPolarityScores(Protocol):
@abstractmethod
def __call__(self, text: str) -> dict[str, float]:
pass


def _get_nltk_polarity_scores(text: str, model: str) -> dict[str, float]:
nltk.download(model, quiet=True) # type: ignore
return NLTKSentimentIntensityAnalyzer().polarity_scores(text) # type: ignore


def _get_sentiment_scores(output_datum: str, get_polarity_scores: GetPolarityScores) -> TextSentimentScores:
mapping: dict[str, float] = get_polarity_scores(output_datum)
highest: str = pd.Series(mapping).idxmax() # type: ignore
return TextSentimentScores(mapping=mapping, **mapping, highest=highest)


async def _get_sentiment(output_datum: str) -> str:
scores = await _get_sentiment_scores(output_datum)
return scores.highest
def make_get_sentiment_scores(get_polarity_scores: GetPolarityScores) -> EvaluationFunction[str]:
async def _f(output_datum: str) -> TextSentimentScores:
return _get_sentiment_scores(output_datum, get_polarity_scores)

return _f

async def _get_overall_positive_sentiment(output_datum: str) -> TextOverallPositiveSentiment:
scores = await _get_sentiment_scores(output_datum)
return TextOverallPositiveSentiment(pos=scores.pos, neg=scores.neg)

def make_get_sentiment_class(get_polarity_scores: GetPolarityScores) -> EvaluationFunction[str]:
async def _f(output_datum: str) -> str:
scores = _get_sentiment_scores(output_datum, get_polarity_scores)
return scores.highest

sentiment_scores: Metric[str] = Metric(
evaluation_fn=_get_sentiment_scores,
metric_metadata=EvaluationMetricMetadata(
name="sentiment_scores",
description="Sentiment scores container object",
best_value=None,
worst_value=None,
),
)
return _f


sentiment_class: Metric[str] = Metric(
evaluation_fn=_get_sentiment,
metric_metadata=EvaluationMetricMetadata(
name="sentiment_class",
description="top sentiment class",
best_value=None,
worst_value=None,
),
)
def make_get_overall_positive_sentiment(get_polarity_scores: GetPolarityScores) -> EvaluationFunction[str]:
async def _f(output_datum: str) -> TextOverallPositiveSentiment:
scores = _get_sentiment_scores(output_datum, get_polarity_scores)
return TextOverallPositiveSentiment(pos=scores.pos, neg=scores.neg)

return _f

sentiment_score_overall_positive: Metric[str] = Metric(
evaluation_fn=_get_overall_positive_sentiment,
metric_metadata=EvaluationMetricMetadata(
name="sentiment_score_overall_positive",
description="Positive minus negative",
best_value=TextOverallPositiveSentiment(pos=1.0, neg=0.0),
worst_value=TextOverallPositiveSentiment(pos=0.0, neg=1.0),
),
)

def make_sentiment_scores_metric(
get_polarity_scores: GetPolarityScores,
make_evaluation_fn: Callable[[GetPolarityScores], EvaluationFunction[str]],
name: str,
description: str,
best_value: common.MetricValue | None = None,
worst_value: common.MetricValue | None = None,
) -> Metric[str]:
return Metric(
evaluation_fn=make_evaluation_fn(get_polarity_scores),
metric_metadata=EvaluationMetricMetadata(name=name, description=description, best_value=best_value, worst_value=worst_value),
)


def make_structured_llm_metric(
@@ -241,6 +214,12 @@ def _with_description(key: str, value: dict[str, str]) -> dict[str, str]:
)


## User interface


# 1. functions that return metrics intended to be called directly


def make_openai_structured_llm_metric(
eval_llm_name: str,
pydantic_basemodel_type: Type[T_BaseModel],
@@ -266,6 +245,39 @@ def make_openai_structured_llm_metric(
raise ValueError(f"Error making metric: {e}")


def substring_match(substring: str, case_sensitive: bool = True) -> Metric[str]:
async def _fn(output_datum: str) -> bool:
return _check_substring(
output_datum=output_datum,
substring=substring,
case_sensitive=case_sensitive,
)

return Metric(
evaluation_fn=_fn,
metric_metadata=EvaluationMetricMetadata(
name="substring_match",
description="True (pass) if contains given substring",
best_value=True,
worst_value=False,
extra_metadata=dict(substring=substring, case_sensitive=case_sensitive),
),
)


# 2. literal metrics

brevity: Metric[str] = Metric(
evaluation_fn=_calculate_brevity,
metric_metadata=EvaluationMetricMetadata(
name="brevity",
description="Absolute text length",
best_value=1.0,
worst_value=float("inf"),
),
)


gpt3_5_text_ratings = make_openai_structured_llm_metric(
eval_llm_name="gpt-3.5-turbo-0613",
pydantic_basemodel_type=TextRatingsData,
@@ -277,3 +289,26 @@ def make_openai_structured_llm_metric(
conciseness_reasoning="reasoning behind the conciseness rating",
),
)

nltk_sentiment_scores_vader = make_sentiment_scores_metric(
get_polarity_scores=partial(_get_nltk_polarity_scores, model="vader_lexicon"),
make_evaluation_fn=make_get_sentiment_scores,
name="nltk_sentiment_scores_vader",
description="NLTK sentiment scores using Vader",
)

nltk_sentiment_class_vader = make_sentiment_scores_metric(
get_polarity_scores=partial(_get_nltk_polarity_scores, model="vader_lexicon"),
make_evaluation_fn=make_get_sentiment_class,
name="nltk_sentiment_class_vader",
description="Highest-probability NLTK sentiment class using Vader",
)

nltk_sentiment_score_overall_positive = make_sentiment_scores_metric(
get_polarity_scores=partial(_get_nltk_polarity_scores, model="vader_lexicon"),
make_evaluation_fn=make_get_overall_positive_sentiment,
name="nltk_sentiment_score_overall_positive",
description="Positive minus negative",
best_value=TextOverallPositiveSentiment(pos=1.0, neg=0.0),
worst_value=TextOverallPositiveSentiment(pos=0.0, neg=1.0),
)
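
The module now also exports prebuilt NLTK metrics (nltk_sentiment_scores_vader, nltk_sentiment_class_vader, nltk_sentiment_score_overall_positive) built from the real VADER polarity function. A rough usage sketch; it assumes Metric exposes the evaluation_fn it was constructed with as an attribute, and the first call downloads the vader_lexicon model:

import asyncio

from aiconfig.eval import metrics


async def main() -> None:
    # evaluation_fn is the async function passed to Metric(...) above (assumed attribute access).
    label = await metrics.nltk_sentiment_class_vader.evaluation_fn("nltk is amazing")
    print(label)  # highest-scoring key of the polarity mapping, e.g. "pos"


asyncio.run(main())
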
73 changes: 59 additions & 14 deletions python/tests/test_eval.py
@@ -22,6 +22,28 @@
brevity = metrics.brevity
substring_match = metrics.substring_match

MOCK_NLTK_SENTIMENT_SCORE_MAPPING = {
"nltk is amazing": {"pos": 0.9, "neu": 0.1, "neg": 0.0, "compound": 0.9},
"whats for dinner?": {"pos": 0.0, "neu": 0.9, "neg": 0.1, "compound": -0.9},
"oh, bother": {"pos": 0.0, "neu": 0.1, "neg": 0.9, "compound": -0.9},
}


def _compute_mock_sentiment_class_mapping(score_mapping: dict[str, dict[str, float]]) -> dict[str, str]:
out: dict[str, str] = {}
for k, scores in score_mapping.items():
max_class, max_score = "", float("-inf")
for class_, score in scores.items():
if score > max_score:
max_score = score
max_class = class_
out[k] = max_class

return out


MOCK_NLTK_SENTIMENT_CLASS_MAPPING = _compute_mock_sentiment_class_mapping(MOCK_NLTK_SENTIMENT_SCORE_MAPPING)
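
# For reference (derived from the mock mapping above, not present in the commit):
# _compute_mock_sentiment_class_mapping(MOCK_NLTK_SENTIMENT_SCORE_MAPPING) ==
#     {"nltk is amazing": "pos", "whats for dinner?": "neu", "oh, bother": "neg"}
# test_custom_metric_type below compares this against result["nltk_sentiment_class_vader"].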


def set_pd():
pd.set_option("display.max_rows", 50)
@@ -195,34 +217,57 @@ async def test_run_test_suite_with_inputs(data: st.DataObject):
assert False, f"expected Ok, got Err({e})"


def _make_mock_nltk_metrics() -> list[metrics.Metric[str]]:
def _mock_get_nltk_polarity_scores(text: str) -> dict[str, float]:
return MOCK_NLTK_SENTIMENT_SCORE_MAPPING[text]

mock_nltk_sentiment_scores_vader = metrics.make_sentiment_scores_metric(
get_polarity_scores=_mock_get_nltk_polarity_scores,
make_evaluation_fn=metrics.make_get_sentiment_scores,
name="nltk_sentiment_scores_vader",
description="NLTK sentiment scores using Vader",
)

mock_nltk_sentiment_class_vader = metrics.make_sentiment_scores_metric(
get_polarity_scores=_mock_get_nltk_polarity_scores,
make_evaluation_fn=metrics.make_get_sentiment_class,
name="nltk_sentiment_class_vader",
description="Highest-probability NLTK sentiment class using Vader",
)

mock_nltk_sentiment_score_overall_positive = metrics.make_sentiment_scores_metric(
get_polarity_scores=_mock_get_nltk_polarity_scores,
make_evaluation_fn=metrics.make_get_overall_positive_sentiment,
name="nltk_sentiment_score_overall_positive",
description="Positive minus negative",
best_value=metrics.TextOverallPositiveSentiment(pos=1.0, neg=0.0),
worst_value=metrics.TextOverallPositiveSentiment(pos=0.0, neg=1.0),
)

return [mock_nltk_sentiment_scores_vader, mock_nltk_sentiment_class_vader, mock_nltk_sentiment_score_overall_positive]


@pytest.mark.asyncio
async def test_custom_metric_type():
mock_nltk_metrics = _make_mock_nltk_metrics()
user_test_suite_outputs_only = list(
itertools.product(
["nltk is amazing", "whats for dinner?", "oh, bother"],
[
metrics.sentiment_scores,
metrics.sentiment_class,
metrics.sentiment_score_overall_positive,
],
mock_nltk_metrics,
)
)
df = await run_test_suite_outputs_only(user_test_suite_outputs_only)
result = df.set_index(["metric_name", "aiconfig_output"]).value.unstack(0).to_dict() # type: ignore
assert result["sentiment_class"] == {
"nltk is amazing": "pos",
"whats for dinner?": "neu",
"oh, bother": "neg",
}
assert result["nltk_sentiment_class_vader"] == MOCK_NLTK_SENTIMENT_CLASS_MAPPING

assert all(isinstance(v, metrics.TextSentimentScores) for v in result["sentiment_scores"].values()) # type: ignore
assert all(isinstance(v, metrics.TextSentimentScores) for v in result["nltk_sentiment_scores_vader"].values()) # type: ignore

assert all(isinstance(v, metrics.TextOverallPositiveSentiment) for v in result["sentiment_score_overall_positive"].values()) # type: ignore
assert all(isinstance(v, metrics.TextOverallPositiveSentiment) for v in result["nltk_sentiment_score_overall_positive"].values()) # type: ignore

neutral = metrics.TextOverallPositiveSentiment(pos=0.0, neg=0.0)

assert result["sentiment_score_overall_positive"]["nltk is amazing"] > neutral
assert result["sentiment_score_overall_positive"]["oh, bother"] < neutral
assert result["nltk_sentiment_score_overall_positive"]["nltk is amazing"] > neutral
assert result["nltk_sentiment_score_overall_positive"]["oh, bother"] < neutral


@pytest.mark.asyncio