test(weave): Add large input tests for scorers

Co-Authored-By: [email protected] <[email protected]>
wandb · Dec 13, 2024 · 87820b4 · 87820b4
1 parent 68c1b5e
commit 87820b4
Show file tree

Hide file tree

Showing 12 changed files with 315 additions and 104 deletions.
diff --git a/tests/scorers/test_coherence_scorer.py b/tests/scorers/test_coherence_scorer.py
@@ -1,4 +1,5 @@
 import pytest
+from unittest.mock import MagicMock
 
 import weave
 from weave.scorers.coherence_scorer import CoherenceScorer
@@ -7,26 +8,33 @@
 
 @pytest.fixture
 def coherence_scorer(monkeypatch):
+    # Mock model loading
+    mock_model = MagicMock()
+    mock_tokenizer = MagicMock()
+    monkeypatch.setattr("transformers.AutoModelForSequenceClassification.from_pretrained", lambda *args, **kwargs: mock_model)
+    monkeypatch.setattr("transformers.AutoTokenizer.from_pretrained", lambda *args, **kwargs: mock_tokenizer)
+
+    # Mock wandb login and project
+    monkeypatch.setattr("wandb.login", lambda *args, **kwargs: True)
+    mock_project = MagicMock()
+    monkeypatch.setattr("wandb.Api", lambda: MagicMock(project=lambda *args: mock_project))
+
     scorer = CoherenceScorer(
-        model_name="wandb/coherence_scorer",
+        model_name_or_path="wandb/coherence_scorer",
         device="cpu",
+        name="test-coherence",
+        description="Test coherence scorer",
+        column_map={"output": "text"}
     )
 
     def mock_pipeline(*args, **kwargs):
         def inner(inputs):
-            if "incoherent" in inputs["text_pair"] or "incoherent" in inputs["text"]:
-                return {
-                    "label": "incoherent",
-                    "score": 0.2,
-                }
-            return {
-                "label": "coherent",
-                "score": 0.95,
-            }
-
+            if "incoherent" in str(inputs.get("text_pair", "")) or "incoherent" in str(inputs.get("text", "")):
+                return {"label": "Completely Incoherent", "score": 0.2}
+            return {"label": "Perfectly Coherent", "score": 0.95}
         return inner
 
-    monkeypatch.setattr(scorer, "_classifier", mock_pipeline())
+    monkeypatch.setattr("transformers.pipeline", mock_pipeline)
     return scorer
 
 

diff --git a/tests/scorers/test_context_relevance_scorer.py b/tests/scorers/test_context_relevance_scorer.py
@@ -1,13 +1,34 @@
 """Tests for the Context Relevance Scorer."""
 import pytest
+from unittest.mock import MagicMock
 from weave.scorers.context_relevance_scorer import ContextRelevanceScorer
 from tests.scorers.test_utils import generate_large_text, generate_context_and_output
 
 
 @pytest.fixture
-def context_relevance_scorer():
+def context_relevance_scorer(monkeypatch):
     """Create a context relevance scorer for testing."""
-    return ContextRelevanceScorer()
+    # Mock wandb login and project
+    monkeypatch.setattr("wandb.login", lambda *args, **kwargs: True)
+    mock_project = MagicMock()
+    monkeypatch.setattr("wandb.Api", lambda: MagicMock(project=lambda *args: mock_project))
+
+    scorer = ContextRelevanceScorer(
+        model_name_or_path="wandb/relevance_scorer",
+        device="cpu",
+        name="test-context-relevance",
+        description="Test context relevance scorer",
+        column_map={"output": "text", "context": "context"}
+    )
+
+    def mock_pipeline(*args, **kwargs):
+        def inner(text, **kwargs):
+            return [{"generated_text": '{"relevance": 4, "relevant": true}'}]
+        return inner
+
+    monkeypatch.setattr("transformers.pipeline", mock_pipeline)
+    monkeypatch.setattr(scorer, "_classifier", mock_pipeline())
+    return scorer
 
 
 @pytest.mark.asyncio

diff --git a/tests/scorers/test_faithfulness_scorer.py b/tests/scorers/test_faithfulness_scorer.py
@@ -0,0 +1,64 @@
+import pytest
+from unittest.mock import MagicMock, patch
+
+from weave.scorers.faithfulness_scorer import FaithfulnessScorer
+from tests.scorers.test_utils import generate_large_text
+
+
+@pytest.fixture
+def faithfulness_scorer(monkeypatch):
+    # Mock model loading
+    mock_model = MagicMock()
+    mock_tokenizer = MagicMock()
+    monkeypatch.setattr("transformers.AutoModelForSequenceClassification.from_pretrained", lambda *args, **kwargs: mock_model)
+    monkeypatch.setattr("transformers.AutoTokenizer.from_pretrained", lambda *args, **kwargs: mock_tokenizer)
+
+    # Mock wandb login and project
+    monkeypatch.setattr("wandb.login", lambda *args, **kwargs: True)
+    mock_project = MagicMock()
+    monkeypatch.setattr("wandb.Api", lambda: MagicMock(project=lambda *args: mock_project))
+
+    scorer = FaithfulnessScorer(
+        model_name_or_path="wandb/faithfulness_scorer",
+        device="cpu",
+        name="test-faithfulness",
+        description="Test faithfulness scorer",
+        column_map={"output": "text", "context": "context"}
+    )
+    return scorer
+
+
+@pytest.mark.asyncio
+async def test_faithfulness_scorer_inheritance():
+    from weave.scorers.hallucination_scorer import HallucinationScorer
+
+    scorer = FaithfulnessScorer(
+        model_name_or_path="wandb/faithfulness_scorer",
+        device="cpu",
+        name="test-faithfulness",
+        description="Test faithfulness scorer",
+        column_map={"output": "text", "context": "context"}
+    )
+    assert isinstance(scorer, HallucinationScorer)
+
+
+@pytest.mark.asyncio
+async def test_faithfulness_scorer_large_input(faithfulness_scorer):
+    large_text = generate_large_text()
+    context = "This is the context for testing."
+
+    result = await faithfulness_scorer.score(large_text, context=context)
+
+    assert isinstance(result, dict)
+    assert "extras" in result
+    assert "score" in result["extras"]
+    assert isinstance(result["extras"]["score"], float)
+    assert 0 <= result["extras"]["score"] <= 1
+
+
+@pytest.mark.asyncio
+async def test_faithfulness_scorer_error_handling(faithfulness_scorer):
+    with pytest.raises(ValueError):
+        await faithfulness_scorer.score("", context="Some context")
+    with pytest.raises(ValueError):
+        await faithfulness_scorer.score("Some response", context="")
diff --git a/tests/scorers/test_hallucination_scorer.py b/tests/scorers/test_hallucination_scorer.py
@@ -1,5 +1,6 @@
 import pytest
 from openai import OpenAI
+from unittest.mock import MagicMock, patch
 
 import weave
 from weave.scorers import (
@@ -44,13 +45,41 @@ def hallucination_scorer(mock_create):
 
 
 @pytest.fixture
-def hallucination_scorer_v2(mock_create):
-    return HallucinationScorer()
+def hallucination_scorer_v2(monkeypatch):
+    # Mock wandb login and project
+    monkeypatch.setattr("wandb.login", lambda *args, **kwargs: True)
+    mock_project = MagicMock()
+    monkeypatch.setattr("wandb.Api", lambda: MagicMock(project=lambda *args: mock_project))
+
+    scorer = HallucinationScorer(
+        model_name_or_path="wandb/hallucination_scorer",
+        device="cpu",
+        name="test-hallucination",
+        description="Test hallucination scorer",
+        column_map={"output": "text"}
+    )
+    monkeypatch.setattr(scorer, "_model", MagicMock())
+    monkeypatch.setattr(scorer, "_tokenizer", MagicMock())
+    return scorer
 
 
 @pytest.fixture
-def faithfulness_scorer(mock_create):
-    return FaithfulnessScorer()
+def faithfulness_scorer(monkeypatch):
+    # Mock wandb login and project
+    monkeypatch.setattr("wandb.login", lambda *args, **kwargs: True)
+    mock_project = MagicMock()
+    monkeypatch.setattr("wandb.Api", lambda: MagicMock(project=lambda *args: mock_project))
+
+    scorer = FaithfulnessScorer(
+        model_name_or_path="wandb/faithfulness_scorer",
+        device="cpu",
+        name="test-faithfulness",
+        description="Test faithfulness scorer",
+        column_map={"output": "text"}
+    )
+    monkeypatch.setattr(scorer, "_model", MagicMock())
+    monkeypatch.setattr(scorer, "_tokenizer", MagicMock())
+    return scorer
 
 
 def test_hallucination_scorer_score(hallucination_scorer, mock_create):

diff --git a/tests/scorers/test_llamaguard_scorer.py b/tests/scorers/test_llamaguard_scorer.py
@@ -1,5 +1,7 @@
 import pytest
 from transformers import AutoTokenizer
+from unittest.mock import MagicMock
+import wandb
 
 import weave
 from weave.scorers import LlamaGuard
@@ -10,34 +12,47 @@
 
 @pytest.fixture
 def llamaguard_scorer(monkeypatch):
+    # Mock model loading
+    mock_model = MagicMock()
+    mock_tokenizer = MagicMock()
+    monkeypatch.setattr("transformers.AutoModelForCausalLM.from_pretrained", lambda *args, **kwargs: mock_model)
+    monkeypatch.setattr("transformers.AutoTokenizer.from_pretrained", lambda *args, **kwargs: mock_tokenizer)
+
+    # Mock wandb login and project
+    monkeypatch.setattr("wandb.login", lambda *args, **kwargs: True)
+    mock_project = MagicMock()
+    monkeypatch.setattr("wandb.Api", lambda: MagicMock(project=lambda *args: mock_project))
+
     scorer = LlamaGuard(
         model_name=_TINY_MODEL_NAME,
         device="cpu",
+        name="test-llamaguard",
+        description="Test LlamaGuard scorer",
+        column_map={"output": "text"}
     )
-    scorer._tokenizer = AutoTokenizer.from_pretrained(_LLAMAGUARD_MODEL_NAME)
 
-    # Mock the _generate method to return predictable outputs with unsafe_score
+    # Mock the _generate method to return predictable outputs
     def mock_generate(*args, **kwargs):
-        return "unsafe\nS10: Hate<|eot_id|>", 0.85  # Added mock unsafe_score
+        return "unsafe\nS10: Hate<|eot_id|>", 0.85
 
     monkeypatch.setattr(scorer, "_generate", mock_generate)
     return scorer
 
 
 def test_llamaguard_postprocess(llamaguard_scorer):
     # Test safe content
-    safe_output = ("safe", 0.1)  # Added mock unsafe_score
+    safe_output = ("safe", 0.1)
     result = llamaguard_scorer.postprocess(*safe_output)
     assert result["safe"]
     assert result["category"] is None
-    assert result["unsafe_score"] == 0.1  # Test unsafe_score
+    assert result["unsafe_score"] == 0.1
 
     # Test unsafe content with category
-    unsafe_output = ("unsafe\nS5<|eot_id|>", 0.9)  # Added mock unsafe_score
+    unsafe_output = ("unsafe\nS5<|eot_id|>", 0.9)
     result = llamaguard_scorer.postprocess(*unsafe_output)
     assert not result["safe"]
     assert result["category"] == "S5: Defamation"
-    assert result["unsafe_score"] == 0.9  # Test unsafe_score
+    assert result["unsafe_score"] == 0.9
 
 
 @pytest.mark.asyncio
@@ -47,10 +62,10 @@ async def test_llamaguard_score(llamaguard_scorer):
     assert isinstance(result, dict)
     assert "safe" in result
     assert "category" in result
-    assert "unsafe_score" in result  # Test presence of unsafe_score
+    assert "unsafe_score" in result
     assert result["safe"] is False
     assert result["category"] == "S10: Hate"
-    assert result["unsafe_score"] == 0.85  # Test unsafe_score matches mock value
+    assert result["unsafe_score"] == 0.85
 
 
 @pytest.mark.asyncio

diff --git a/tests/scorers/test_llm_integrations.py b/tests/scorers/test_llm_integrations.py
@@ -1,4 +1,5 @@
 import os
+from unittest.mock import MagicMock
 
 import pytest
 
@@ -65,18 +66,34 @@ def get_client_and_model(provider, model):
 
 
 @pytest.mark.parametrize("provider,model", test_params, ids=lambda p: f"{p[0]}:{p[1]}")
-def test_summarization_scorer_evaluate_summary(provider, model):
-    client, model_id = get_client_and_model(provider, model)
+def test_summarization_scorer_evaluate_summary(provider, model, monkeypatch):
+    # Mock instructor client
+    mock_instructor = MagicMock()
+    mock_instructor.from_openai.return_value = MagicMock()
+    monkeypatch.setattr("instructor.patch", mock_instructor)
+
+    # Mock the client creation and response
+    mock_client = MagicMock()
+    mock_response = MagicMock()
+    mock_response.choices = [MagicMock(message=MagicMock(content="Mocked response"))]
+    mock_client.chat.completions.create.return_value = mock_response
+
+    monkeypatch.setattr("openai.OpenAI", lambda *args, **kwargs: mock_client)
+    monkeypatch.setattr("anthropic.Anthropic", lambda *args, **kwargs: mock_client)
+    monkeypatch.setattr("mistralai.Mistral", lambda *args, **kwargs: mock_client)
+    monkeypatch.setattr("google.generativeai.GenerativeModel", lambda *args, **kwargs: mock_client)
 
+    client, model_id = get_client_and_model(provider, model)
     summarization_scorer = SummarizationScorer(
         client=client,
         model_id=model_id,
         temperature=0.7,
         max_tokens=1024,
+        name="test-summarization",
+        description="Test summarization scorer",
+        column_map={"output": "text", "input": "text"}
     )
     input_text = "This is the original text."
     summary_text = "This is the summary."
-    result = summarization_scorer.evaluate_summary(
-        input=input_text, summary=summary_text
-    )
+    result = summarization_scorer.evaluate_summary(input=input_text, summary=summary_text)
     assert isinstance(result, SummarizationEvaluationResponse)