diff --git a/python/src/aiconfig/eval/metrics.py b/python/src/aiconfig/eval/metrics.py
index d51821d8a..da68341e1 100644
--- a/python/src/aiconfig/eval/metrics.py
+++ b/python/src/aiconfig/eval/metrics.py
@@ -1,6 +1,7 @@
 import json
-from functools import total_ordering
-from typing import Any, Generic, Type
+from abc import abstractmethod
+from functools import partial, total_ordering
+from typing import Any, Callable, Generic, Protocol, Type

 import lastmile_utils.lib.core.api as cu
 import nltk
@@ -9,7 +10,7 @@
 from aiconfig.eval.common import CustomMetricValue, EvaluationFunction, EvaluationMetricMetadata, T_BaseModel, T_OutputDatum, TextRatingsData
 from aiconfig.eval.openai import OpenAIChatCompletionCreate, default_openai_chat_completion_create, make_fn_completion_text_to_serialized_json
 from attr import dataclass
-from nltk.sentiment.vader import SentimentIntensityAnalyzer
+from nltk.sentiment.vader import SentimentIntensityAnalyzer as NLTKSentimentIntensityAnalyzer
 from result import Err, Ok, Result


@@ -35,43 +36,12 @@ def _check_substring(output_datum: str, substring: str, case_sensitive: bool) -> bool:
         return substring.lower() in output_datum.lower()


-def substring_match(substring: str, case_sensitive: bool = True) -> Metric[str]:
-    async def _fn(output_datum: str) -> bool:
-        return _check_substring(
-            output_datum=output_datum,
-            substring=substring,
-            case_sensitive=case_sensitive,
-        )
-
-    return Metric(
-        evaluation_fn=_fn,
-        metric_metadata=EvaluationMetricMetadata(
-            name="substring_match",
-            description="True (pass) if contains given substring",
-            best_value=True,
-            worst_value=False,
-            extra_metadata=dict(substring=substring, case_sensitive=case_sensitive),
-        ),
-    )
-
-
 async def _calculate_brevity(output_datum: str) -> int:
     if len(output_datum) == 0:
         raise ValueError("Brevity is meaningless for empty string.")
     return len(output_datum)


-brevity: Metric[str] = Metric(
-    evaluation_fn=_calculate_brevity,
-    metric_metadata=EvaluationMetricMetadata(
-        name="brevity",
-        description="Absolute text length",
-        best_value=1.0,
-        worst_value=float("inf"),
-    ),
-)
-
-
 @dataclass
 class TextSentimentScores(CustomMetricValue):
     mapping: dict[str, float]
@@ -103,55 +73,58 @@ def __lt__(self, other: Any) -> bool:
         return self.pos - self.neg < other.pos - other.neg


-async def _get_sentiment_scores(output_datum: str) -> TextSentimentScores:
-    nltk.download("vader_lexicon", quiet=True)  # type: ignore
-    sid = SentimentIntensityAnalyzer()
-    mapping: dict[str, float] = sid.polarity_scores(output_datum)  # type: ignore
+class GetPolarityScores(Protocol):
+    @abstractmethod
+    def __call__(self, text: str) -> dict[str, float]:
+        pass
+
+
+def _get_nltk_polarity_scores(text: str, model: str) -> dict[str, float]:
+    nltk.download(model, quiet=True)  # type: ignore
+    return NLTKSentimentIntensityAnalyzer().polarity_scores(text)  # type: ignore
+
+
+def _get_sentiment_scores(output_datum: str, get_polarity_scores: GetPolarityScores) -> TextSentimentScores:
+    mapping: dict[str, float] = get_polarity_scores(output_datum)
     highest: str = pd.Series(mapping).idxmax()  # type: ignore
     return TextSentimentScores(mapping=mapping, **mapping, highest=highest)


-async def _get_sentiment(output_datum: str) -> str:
-    scores = await _get_sentiment_scores(output_datum)
-    return scores.highest
+def make_get_sentiment_scores(get_polarity_scores: GetPolarityScores) -> EvaluationFunction[str]:
+    async def _f(output_datum: str) -> TextSentimentScores:
+        return _get_sentiment_scores(output_datum, get_polarity_scores)
+
+    return _f


-async def _get_overall_positive_sentiment(output_datum: str) -> TextOverallPositiveSentiment:
-    scores = await _get_sentiment_scores(output_datum)
-    return TextOverallPositiveSentiment(pos=scores.pos, neg=scores.neg)
+def make_get_sentiment_class(get_polarity_scores: GetPolarityScores) -> EvaluationFunction[str]:
+    async def _f(output_datum: str) -> str:
+        scores = _get_sentiment_scores(output_datum, get_polarity_scores)
+        return scores.highest


-sentiment_scores: Metric[str] = Metric(
-    evaluation_fn=_get_sentiment_scores,
-    metric_metadata=EvaluationMetricMetadata(
-        name="sentiment_scores",
-        description="Sentiment scores container object",
-        best_value=None,
-        worst_value=None,
-    ),
-)
+    return _f


-sentiment_class: Metric[str] = Metric(
-    evaluation_fn=_get_sentiment,
-    metric_metadata=EvaluationMetricMetadata(
-        name="sentiment_class",
-        description="top sentiment class",
-        best_value=None,
-        worst_value=None,
-    ),
-)
+def make_get_overall_positive_sentiment(get_polarity_scores: GetPolarityScores) -> EvaluationFunction[str]:
+    async def _f(output_datum: str) -> TextOverallPositiveSentiment:
+        scores = _get_sentiment_scores(output_datum, get_polarity_scores)
+        return TextOverallPositiveSentiment(pos=scores.pos, neg=scores.neg)
+
+    return _f


-sentiment_score_overall_positive: Metric[str] = Metric(
-    evaluation_fn=_get_overall_positive_sentiment,
-    metric_metadata=EvaluationMetricMetadata(
-        name="sentiment_score_overall_positive",
-        description="Positive minus negative",
-        best_value=TextOverallPositiveSentiment(pos=1.0, neg=0.0),
-        worst_value=TextOverallPositiveSentiment(pos=0.0, neg=1.0),
-    ),
-)
+
+def make_sentiment_scores_metric(
+    get_polarity_scores: GetPolarityScores,
+    make_evaluation_fn: Callable[[GetPolarityScores], EvaluationFunction[str]],
+    name: str,
+    description: str,
+    best_value: common.MetricValue | None = None,
+    worst_value: common.MetricValue | None = None,
+) -> Metric[str]:
+    return Metric(
+        evaluation_fn=make_evaluation_fn(get_polarity_scores),
+        metric_metadata=EvaluationMetricMetadata(name=name, description=description, best_value=best_value, worst_value=worst_value),
+    )


 def make_structured_llm_metric(
@@ -241,6 +214,12 @@ def _with_description(key: str, value: dict[str, str]) -> dict[str, str]:
     )


+## User interface
+
+
+# 1. functions that return metrics intended to be called directly
+
+
 def make_openai_structured_llm_metric(
     eval_llm_name: str,
     pydantic_basemodel_type: Type[T_BaseModel],
@@ -266,6 +245,39 @@
         raise ValueError(f"Error making metric: {e}")


+def substring_match(substring: str, case_sensitive: bool = True) -> Metric[str]:
+    async def _fn(output_datum: str) -> bool:
+        return _check_substring(
+            output_datum=output_datum,
+            substring=substring,
+            case_sensitive=case_sensitive,
+        )
+
+    return Metric(
+        evaluation_fn=_fn,
+        metric_metadata=EvaluationMetricMetadata(
+            name="substring_match",
+            description="True (pass) if contains given substring",
+            best_value=True,
+            worst_value=False,
+            extra_metadata=dict(substring=substring, case_sensitive=case_sensitive),
+        ),
+    )
+
+
+# 2. literal metrics
+
+brevity: Metric[str] = Metric(
+    evaluation_fn=_calculate_brevity,
+    metric_metadata=EvaluationMetricMetadata(
+        name="brevity",
+        description="Absolute text length",
+        best_value=1.0,
+        worst_value=float("inf"),
+    ),
+)
+
+
 gpt3_5_text_ratings = make_openai_structured_llm_metric(
     eval_llm_name="gpt-3.5-turbo-0613",
     pydantic_basemodel_type=TextRatingsData,
@@ -277,3 +289,26 @@
         conciseness_reasoning="reasoning behind the conciseness rating",
     ),
 )
+
+nltk_sentiment_scores_vader = make_sentiment_scores_metric(
+    get_polarity_scores=partial(_get_nltk_polarity_scores, model="vader_lexicon"),
+    make_evaluation_fn=make_get_sentiment_scores,
+    name="nltk_sentiment_scores_vader",
+    description="NLTK sentiment scores using Vader",
+)
+
+nltk_sentiment_class_vader = make_sentiment_scores_metric(
+    get_polarity_scores=partial(_get_nltk_polarity_scores, model="vader_lexicon"),
+    make_evaluation_fn=make_get_sentiment_class,
+    name="nltk_sentiment_class_vader",
+    description="Highest-probability NLTK sentiment class using Vader",
+)
+
+nltk_sentiment_score_overall_positive = make_sentiment_scores_metric(
+    get_polarity_scores=partial(_get_nltk_polarity_scores, model="vader_lexicon"),
+    make_evaluation_fn=make_get_overall_positive_sentiment,
+    name="nltk_sentiment_score_overall_positive",
+    description="Positive minus negative",
+    best_value=TextOverallPositiveSentiment(pos=1.0, neg=0.0),
+    worst_value=TextOverallPositiveSentiment(pos=0.0, neg=1.0),
+)
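Note on the refactor above: the module-level VADER metrics (sentiment_scores, sentiment_class, sentiment_score_overall_positive) are replaced by the make_sentiment_scores_metric factory, which accepts any callable satisfying the GetPolarityScores protocol, so the NLTK dependency can be injected or mocked (as the tests below do). A minimal usage sketch, assuming the module is importable as aiconfig.eval.metrics; the keyword_polarity_scores scorer is hypothetical and only for illustration:

    from aiconfig.eval import metrics

    # Hypothetical scorer satisfying GetPolarityScores: any callable that maps
    # text to a {sentiment class: score} dict works; no NLTK download needed.
    def keyword_polarity_scores(text: str) -> dict[str, float]:
        words = text.lower().split()
        pos = float(sum(w in {"good", "great", "amazing"} for w in words))
        neg = float(sum(w in {"bad", "awful", "bother"} for w in words))
        return {"pos": pos, "neu": 1.0, "neg": neg, "compound": pos - neg}

    # Build a Metric[str] that reports the highest-scoring class, mirroring how
    # nltk_sentiment_class_vader is constructed above.
    keyword_sentiment_class = metrics.make_sentiment_scores_metric(
        get_polarity_scores=keyword_polarity_scores,
        make_evaluation_fn=metrics.make_get_sentiment_class,
        name="keyword_sentiment_class",
        description="Highest-scoring sentiment class from a keyword scorer",
    )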
diff --git a/python/tests/test_eval.py b/python/tests/test_eval.py
index 98bf1126d..0f3357132 100644
--- a/python/tests/test_eval.py
+++ b/python/tests/test_eval.py
@@ -22,6 +22,28 @@
 brevity = metrics.brevity
 substring_match = metrics.substring_match

+MOCK_NLTK_SENTIMENT_SCORE_MAPPING = {
+    "nltk is amazing": {"pos": 0.9, "neu": 0.1, "neg": 0.0, "compound": 0.9},
+    "whats for dinner?": {"pos": 0.0, "neu": 0.9, "neg": 0.1, "compound": -0.9},
+    "oh, bother": {"pos": 0.0, "neu": 0.1, "neg": 0.9, "compound": -0.9},
+}
+
+
+def _compute_mock_sentiment_class_mapping(score_mapping: dict[str, dict[str, float]]) -> dict[str, str]:
+    out: dict[str, str] = {}
+    for k, scores in score_mapping.items():
+        max_class, max_score = "", float("-inf")
+        for class_, score in scores.items():
+            if score > max_score:
+                max_score = score
+                max_class = class_
+        out[k] = max_class
+
+    return out
+
+
+MOCK_NLTK_SENTIMENT_CLASS_MAPPING = _compute_mock_sentiment_class_mapping(MOCK_NLTK_SENTIMENT_SCORE_MAPPING)
+

 def set_pd():
     pd.set_option("display.max_rows", 50)
@@ -195,34 +217,57 @@ async def test_run_test_suite_with_inputs(data: st.DataObject):
         assert False, f"expected Ok, got Err({e})"


+def _make_mock_nltk_metrics() -> list[metrics.Metric[str]]:
+    def _mock_get_nltk_polarity_scores(text: str) -> dict[str, float]:
+        return MOCK_NLTK_SENTIMENT_SCORE_MAPPING[text]
+
+    mock_nltk_sentiment_scores_vader = metrics.make_sentiment_scores_metric(
+        get_polarity_scores=_mock_get_nltk_polarity_scores,
+        make_evaluation_fn=metrics.make_get_sentiment_scores,
+        name="nltk_sentiment_scores_vader",
+        description="NLTK sentiment scores using Vader",
+    )
+
+    mock_nltk_sentiment_class_vader = metrics.make_sentiment_scores_metric(
+        get_polarity_scores=_mock_get_nltk_polarity_scores,
+        make_evaluation_fn=metrics.make_get_sentiment_class,
+        name="nltk_sentiment_class_vader",
+        description="Highest-probability NLTK sentiment class using Vader",
+    )
+
+    mock_nltk_sentiment_score_overall_positive = metrics.make_sentiment_scores_metric(
+        get_polarity_scores=_mock_get_nltk_polarity_scores,
+        make_evaluation_fn=metrics.make_get_overall_positive_sentiment,
+        name="nltk_sentiment_score_overall_positive",
+        description="Positive minus negative",
+        best_value=metrics.TextOverallPositiveSentiment(pos=1.0, neg=0.0),
+        worst_value=metrics.TextOverallPositiveSentiment(pos=0.0, neg=1.0),
+    )
+
+    return [mock_nltk_sentiment_scores_vader, mock_nltk_sentiment_class_vader, mock_nltk_sentiment_score_overall_positive]
+
+
 @pytest.mark.asyncio
 async def test_custom_metric_type():
+    mock_nltk_metrics = _make_mock_nltk_metrics()
     user_test_suite_outputs_only = list(
         itertools.product(
             ["nltk is amazing", "whats for dinner?", "oh, bother"],
-            [
-                metrics.sentiment_scores,
-                metrics.sentiment_class,
-                metrics.sentiment_score_overall_positive,
-            ],
+            mock_nltk_metrics,
         )
     )
     df = await run_test_suite_outputs_only(user_test_suite_outputs_only)
     result = df.set_index(["metric_name", "aiconfig_output"]).value.unstack(0).to_dict()  # type: ignore
-    assert result["sentiment_class"] == {
-        "nltk is amazing": "pos",
-        "whats for dinner?": "neu",
-        "oh, bother": "neg",
-    }
+    assert result["nltk_sentiment_class_vader"] == MOCK_NLTK_SENTIMENT_CLASS_MAPPING

-    assert all(isinstance(v, metrics.TextSentimentScores) for v in result["sentiment_scores"].values())  # type: ignore
+    assert all(isinstance(v, metrics.TextSentimentScores) for v in result["nltk_sentiment_scores_vader"].values())  # type: ignore

-    assert all(isinstance(v, metrics.TextOverallPositiveSentiment) for v in result["sentiment_score_overall_positive"].values())  # type: ignore
+    assert all(isinstance(v, metrics.TextOverallPositiveSentiment) for v in result["nltk_sentiment_score_overall_positive"].values())  # type: ignore

     neutral = metrics.TextOverallPositiveSentiment(pos=0.0, neg=0.0)
-    assert result["sentiment_score_overall_positive"]["nltk is amazing"] > neutral
-    assert result["sentiment_score_overall_positive"]["oh, bother"] < neutral
+    assert result["nltk_sentiment_score_overall_positive"]["nltk is amazing"] > neutral
+    assert result["nltk_sentiment_score_overall_positive"]["oh, bother"] < neutral


 @pytest.mark.asyncio