From 0b2bbf2f05e053f653b9ff4dad73c3bd34eae34f Mon Sep 17 00:00:00 2001 From: Morgan McGuire Date: Sat, 12 Oct 2024 22:47:35 +0100 Subject: [PATCH] feat(weave): fixes tests, summarization scorer re-write, re-names flow/scorer dir, create weave/scorers dir --- docs/docs/guides/integrations/langchain.md | 2 +- docs/docs/tutorial-eval.md | 4 +- docs/docs/tutorial-rag.md | 2 +- tests/scorers/test_hallucination_scorer.py | 42 ++++- tests/scorers/test_json_scorer.py | 28 ++-- tests/scorers/test_pydantic_scorer.py | 13 +- tests/scorers/test_ragas_scorer.py | 36 +++- tests/scorers/test_string_scorer.py | 9 +- tests/scorers/test_summarization_scorer.py | 74 +++++--- tests/trace/test_evaluations.py | 35 ++-- weave/__init__.py | 14 -- weave/flow/eval.py | 35 ++-- weave/flow/scorers/__init__.py | 48 +++--- weave/flow/scorers/classification_scorer.py | 2 +- weave/flow/scorers/hallucination_scorer.py | 103 ++++++++++-- weave/flow/scorers/json_scorer.py | 4 +- weave/flow/scorers/llm_scorer.py | 14 +- weave/flow/scorers/llm_utils.py | 19 ++- weave/flow/scorers/moderation_scorer.py | 2 +- weave/flow/scorers/pydantic_scorer.py | 2 +- weave/flow/scorers/ragas_scorer.py | 4 +- weave/flow/scorers/similarity_score.py | 4 +- weave/flow/scorers/string_scorer.py | 5 +- weave/flow/scorers/summarization_scorer.py | 177 +++++++++++++++++--- weave/flow/scorers/xml_scorer.py | 4 +- weave/scorers/__init__.py | 1 + 26 files changed, 505 insertions(+), 178 deletions(-) create mode 100644 weave/scorers/__init__.py diff --git a/docs/docs/guides/integrations/langchain.md b/docs/docs/guides/integrations/langchain.md index b382e793e70..4487a85dfd4 100644 --- a/docs/docs/guides/integrations/langchain.md +++ b/docs/docs/guides/integrations/langchain.md @@ -196,7 +196,7 @@ Evaluations help you measure the performance of your models. By using the [`weav ```python -from weave.flow.scorer import MultiTaskBinaryClassificationF1 +from weave.scorers import MultiTaskBinaryClassificationF1 sentences = [ "There are many fruits that were found on the recently discovered planet Goocrux. There are neoskizzles that grow there, which are purple and taste like candy.", diff --git a/docs/docs/tutorial-eval.md b/docs/docs/tutorial-eval.md index 929d29ab56a..0705443b1f0 100644 --- a/docs/docs/tutorial-eval.md +++ b/docs/docs/tutorial-eval.md @@ -94,7 +94,7 @@ Here `sentence` is passed to the model's predict function, and `target` is used ```python import weave -from weave.flow.scorer import MultiTaskBinaryClassificationF1 +from weave.scorers import MultiTaskBinaryClassificationF1 weave.init('intro-example') @@ -132,7 +132,7 @@ import asyncio # highlight-next-line import weave # highlight-next-line -from weave.flow.scorer import MultiTaskBinaryClassificationF1 +from weave.scorers import MultiTaskBinaryClassificationF1 import openai # We create a model class with one predict function. 
diff --git a/docs/docs/tutorial-rag.md b/docs/docs/tutorial-rag.md index 81cf3b2b9b2..466d6326549 100644 --- a/docs/docs/tutorial-rag.md +++ b/docs/docs/tutorial-rag.md @@ -182,7 +182,7 @@ On a high-level the steps to create custom Scorer are quite simple: ```python -from weave.flow.scorer import Scorer +from weave.scorers import Scorer from weave import WeaveList class CorrectnessLLMJudge(Scorer): diff --git a/tests/scorers/test_hallucination_scorer.py b/tests/scorers/test_hallucination_scorer.py index b486467f717..6cae7d157d8 100644 --- a/tests/scorers/test_hallucination_scorer.py +++ b/tests/scorers/test_hallucination_scorer.py @@ -1,24 +1,43 @@ import pytest from openai import OpenAI -from weave.flow.scorer.hallucination_scorer import ( +from weave.flow.scorers.hallucination_scorer import ( + HallucinationReasoning, HallucinationResponse, +) +from weave.scorers import ( HallucinationScorer, ) + # mock the create function @pytest.fixture def mock_create(monkeypatch): def _mock_create(*args, **kwargs): return HallucinationResponse( chain_of_thought="The output is consistent with the input data.", - is_hallucination=False + hallucination_reasonings=[ + HallucinationReasoning( + observation="My observation for this is that the output is consistent with the input data.", + hallucination_type="No Hallucination", + ) + ], + conclusion="The output is consistent with the input data.", + is_hallucination=False, ) - monkeypatch.setattr('weave.flow.scorer.hallucination_scorer.create', _mock_create) + + monkeypatch.setattr("weave.flow.scorers.hallucination_scorer.create", _mock_create) + @pytest.fixture def hallucination_scorer(mock_create): - return HallucinationScorer(client=OpenAI(api_key="DUMMY_API_KEY"), model_id="gpt-4o", temperature=0.7, max_tokens=4096) + return HallucinationScorer( + client=OpenAI(api_key="DUMMY_API_KEY"), + model_id="gpt-4o", + temperature=0.7, + max_tokens=4096, + ) + def test_hallucination_scorer_initialization(hallucination_scorer): assert isinstance(hallucination_scorer, HallucinationScorer) @@ -26,12 +45,19 @@ def test_hallucination_scorer_initialization(hallucination_scorer): assert hallucination_scorer.temperature == 0.7 assert hallucination_scorer.max_tokens == 4096 + def test_hallucination_scorer_score(hallucination_scorer, mock_create): output = "John's favorite cheese is cheddar." context = "John likes various types of cheese." - result = hallucination_scorer.score(output, context) + result = hallucination_scorer.score(output=output, context=context) assert isinstance(result, HallucinationResponse) assert not result.is_hallucination - assert "The output is consistent with the input data." == result.chain_of_thought - -# Add more tests as needed + assert isinstance(result.hallucination_reasonings, list) + assert isinstance(result.hallucination_reasonings[0], HallucinationReasoning) + assert result.chain_of_thought == "The output is consistent with the input data." + assert ( + result.hallucination_reasonings[0].observation + == "My observation for this is that the output is consistent with the input data." + ) + assert result.conclusion == "The output is consistent with the input data." 
+ assert result.hallucination_reasonings[0].hallucination_type == "No Hallucination" diff --git a/tests/scorers/test_json_scorer.py b/tests/scorers/test_json_scorer.py index 443e8d10872..6cd1cf480cf 100644 --- a/tests/scorers/test_json_scorer.py +++ b/tests/scorers/test_json_scorer.py @@ -1,44 +1,50 @@ -from weave.flow.scorer.json_scorer import JSONScorer +from weave.scorers import ValidJSONScorer def test_json_scorer_valid_json(): - scorer = JSONScorer() + scorer = ValidJSONScorer() output = '{"city": "San Francisco", "country": "USA"}' result = scorer.score(output) assert result["json_valid"] is True + def test_json_scorer_invalid_json(): - scorer = JSONScorer() + scorer = ValidJSONScorer() output = '{"city": "San Francisco", "country": "USA"' result = scorer.score(output) assert result["json_valid"] is False + def test_json_scorer_non_json_string(): - scorer = JSONScorer() + scorer = ValidJSONScorer() output = "Just a plain string." result = scorer.score(output) assert result["json_valid"] is False + def test_json_scorer_valid_json_list(): - scorer = JSONScorer() - output = '[1, 2, 3, 4, 5]' + scorer = ValidJSONScorer() + output = "[1, 2, 3, 4, 5]" result = scorer.score(output) assert result["json_valid"] is True + def test_json_scorer_nested_json(): - scorer = JSONScorer() + scorer = ValidJSONScorer() output = '{"person": {"name": "John", "age": 30}, "city": "New York"}' result = scorer.score(output) assert result["json_valid"] is True + def test_json_scorer_empty_object(): - scorer = JSONScorer() - output = '{}' + scorer = ValidJSONScorer() + output = "{}" result = scorer.score(output) assert result["json_valid"] is True + def test_json_scorer_empty_list(): - scorer = JSONScorer() - output = '[]' + scorer = ValidJSONScorer() + output = "[]" result = scorer.score(output) assert result["json_valid"] is True diff --git a/tests/scorers/test_pydantic_scorer.py b/tests/scorers/test_pydantic_scorer.py index 1a4112fc3f6..f9953ba6abd 100644 --- a/tests/scorers/test_pydantic_scorer.py +++ b/tests/scorers/test_pydantic_scorer.py @@ -1,46 +1,55 @@ import pytest from pydantic import BaseModel -from weave.flow.scorer.pydantic_scorer import PydanticScorer +from weave.scorers import PydanticScorer class User(BaseModel): name: str age: int + @pytest.fixture def user_scorer(): return PydanticScorer(model=User) + def test_pydantic_scorer_initialization(): scorer = PydanticScorer(model=User) assert isinstance(scorer, PydanticScorer) assert scorer.model == User + def test_pydantic_scorer_valid_json_string(user_scorer): valid_json = '{"name": "John", "age": 30}' assert user_scorer.score(valid_json) == {"valid_pydantic": True} + def test_pydantic_scorer_valid_dict(user_scorer): valid_dict = {"name": "John", "age": 30} assert user_scorer.score(valid_dict) == {"valid_pydantic": True} + def test_pydantic_scorer_invalid_json_string(user_scorer): invalid_json = '{"name": "John", "age": "thirty"}' assert user_scorer.score(invalid_json) == {"valid_pydantic": False} + def test_pydantic_scorer_invalid_dict(user_scorer): invalid_dict = {"name": "John", "age": "thirty"} assert user_scorer.score(invalid_dict) == {"valid_pydantic": False} + def test_pydantic_scorer_missing_field(user_scorer): missing_field = '{"name": "John"}' assert user_scorer.score(missing_field) == {"valid_pydantic": False} + def test_pydantic_scorer_extra_field(user_scorer): extra_field = '{"name": "John", "age": 30, "city": "New York"}' assert user_scorer.score(extra_field) == {"valid_pydantic": True} + def 
test_pydantic_scorer_invalid_input_type(user_scorer): invalid_input = 123 # Neither a string nor a dict - assert user_scorer.score(invalid_input) == {"valid_pydantic": False} \ No newline at end of file + assert user_scorer.score(invalid_input) == {"valid_pydantic": False} diff --git a/tests/scorers/test_ragas_scorer.py b/tests/scorers/test_ragas_scorer.py index 108db4f69f0..2144200d809 100644 --- a/tests/scorers/test_ragas_scorer.py +++ b/tests/scorers/test_ragas_scorer.py @@ -1,42 +1,60 @@ import pytest from openai import OpenAI -from weave.flow.scorer.ragas_scorer import ( - ContextEntityRecallScorer, - ContextRelevancyScorer, +from weave.flow.scorers.ragas_scorer import ( EntityExtractionResponse, RelevancyResponse, ) +from weave.scorers import ( + ContextEntityRecallScorer, + ContextRelevancyScorer, +) + # Mock the create function @pytest.fixture def mock_create(monkeypatch): def _mock_create(*args, **kwargs): # Retrieve the response_model to return appropriate mock responses - response_model = kwargs.get('response_model') + response_model = kwargs.get("response_model") if response_model == EntityExtractionResponse: return EntityExtractionResponse(entities=["Paris"]) elif response_model == RelevancyResponse: return RelevancyResponse( reasoning="The context directly answers the question.", - relevancy_score=1 + relevancy_score=1, ) else: return None - monkeypatch.setattr('weave.flow.scorer.ragas_scorer.create', _mock_create) + + monkeypatch.setattr("weave.flow.scorers.ragas_scorer.create", _mock_create) + @pytest.fixture def context_entity_recall_scorer(mock_create): - return ContextEntityRecallScorer(client=OpenAI(api_key="DUMMY_API_KEY"), model_id="gpt-4o", temperature=0.7, max_tokens=1024) + return ContextEntityRecallScorer( + client=OpenAI(api_key="DUMMY_API_KEY"), + model_id="gpt-4o", + temperature=0.7, + max_tokens=1024, + ) + @pytest.fixture def context_relevancy_scorer(mock_create): - return ContextRelevancyScorer(client=OpenAI(api_key="DUMMY_API_KEY"), model_id="gpt-4o", temperature=0.7, max_tokens=1024) + return ContextRelevancyScorer( + client=OpenAI(api_key="DUMMY_API_KEY"), + model_id="gpt-4o", + temperature=0.7, + max_tokens=1024, + ) + def test_context_entity_recall_scorer_initialization(context_entity_recall_scorer): assert isinstance(context_entity_recall_scorer, ContextEntityRecallScorer) assert context_entity_recall_scorer.model_id == "gpt-4o" + def test_context_entity_recall_scorer_score(context_entity_recall_scorer): output = "Paris is the capital of France." context = "The capital city of France is Paris." @@ -45,10 +63,12 @@ def test_context_entity_recall_scorer_score(context_entity_recall_scorer): assert "recall" in result assert result["recall"] == 1.0 # Assuming full recall in mock response + def test_context_relevancy_scorer_initialization(context_relevancy_scorer): assert isinstance(context_relevancy_scorer, ContextRelevancyScorer) assert context_relevancy_scorer.model_id == "gpt-4o" + def test_context_relevancy_scorer_score(context_relevancy_scorer): output = "What is the capital of France?" context = "Paris is the capital city of France." 
diff --git a/tests/scorers/test_string_scorer.py b/tests/scorers/test_string_scorer.py index 3c460cb04db..dfa05daf7e8 100644 --- a/tests/scorers/test_string_scorer.py +++ b/tests/scorers/test_string_scorer.py @@ -1,4 +1,4 @@ -from weave.flow.scorer.string_scorer import ( +from weave.scorers import ( LevenshteinScorer, StringMatchScorer, ) @@ -11,6 +11,7 @@ def test_string_match_scorer(): result = scorer.score(output, target) assert result["string_in_input"] is True + def test_string_match_scorer_false(): scorer = StringMatchScorer() output = "Alice" @@ -18,6 +19,7 @@ def test_string_match_scorer_false(): result = scorer.score(output, target) assert result["string_in_input"] is False + # def test_regex_scorer(): # scorer = RegexScorer(patterns="engineer") # output = "I am an engineer" @@ -36,6 +38,7 @@ def test_string_match_scorer_false(): # result = scorer.score(output) # assert result["string_match"] is False + def test_levenshtein_scorer(): scorer = LevenshteinScorer() output = "Hello" @@ -43,6 +46,7 @@ def test_levenshtein_scorer(): result = scorer.score(output, target) assert result["levenshtein_distance"] == 1 + def test_levenshtein_scorer_same_strings(): scorer = LevenshteinScorer() output = "Hello" @@ -50,9 +54,10 @@ def test_levenshtein_scorer_same_strings(): result = scorer.score(output, target) assert result["levenshtein_distance"] == 0 + def test_levenshtein_scorer_completely_different(): scorer = LevenshteinScorer() output = "Hello" target = "World" result = scorer.score(output, target) - assert result["levenshtein_distance"] == 4 \ No newline at end of file + assert result["levenshtein_distance"] == 4 diff --git a/tests/scorers/test_summarization_scorer.py b/tests/scorers/test_summarization_scorer.py index 4534056ecf8..60b026b3080 100644 --- a/tests/scorers/test_summarization_scorer.py +++ b/tests/scorers/test_summarization_scorer.py @@ -1,45 +1,81 @@ import pytest from openai import OpenAI -from weave.flow.scorer.summarization_scorer import ( +from weave.flow.scorers.summarization_scorer import ( EntityExtractionResponse, + SummarizationEvaluationResponse, +) +from weave.scorers import ( SummarizationScorer, ) -# mock the create function @pytest.fixture def mock_create(monkeypatch): def _mock_create(*args, **kwargs): - return EntityExtractionResponse( - entities=["entity1", "entity2"] - ) - monkeypatch.setattr('weave.flow.scorer.summarization_scorer.create', _mock_create) + response_model = kwargs.get("response_model") + if response_model == EntityExtractionResponse: + return EntityExtractionResponse(entities=["entity1", "entity2"]) + elif response_model == SummarizationEvaluationResponse: + return SummarizationEvaluationResponse( + think_step_by_step="This is some reasoning.", + summarization_evaluation="excellent", + ) + else: + return None + + # Patch the 'create' function wherever it is called + monkeypatch.setattr("weave.flow.scorers.summarization_scorer.create", _mock_create) + @pytest.fixture def summarization_scorer(mock_create): - return SummarizationScorer(client=OpenAI(api_key="DUMMY_API_KEY"), model_id="gpt-4o", temperature=0.7, max_tokens=1024) + return SummarizationScorer( + client=OpenAI(api_key="DUMMY_API_KEY"), + model_id="gpt-4o", + temperature=0.7, + max_tokens=1024, + ) + -def test_summarization_scorer_initialization(summarization_scorer, mock_create): +def test_summarization_scorer_evaluate_summary(summarization_scorer, mock_create): + input_text = "This is the original text." + summary_text = "This is the summary." 
+ result = summarization_scorer.evaluate_summary( + input=input_text, summary=summary_text + ) + assert isinstance(result, SummarizationEvaluationResponse) + assert result.summarization_evaluation == "excellent" + assert result.think_step_by_step == "This is some reasoning." + + +@pytest.mark.asyncio +async def test_summarization_scorer_score(summarization_scorer): + input_text = "This is the original text." + output_text = "This is the summary." + result = await summarization_scorer.score(input=input_text, output=output_text) + assert isinstance(result, dict) + assert "summarization_eval_score" in result + assert result["summarization_eval_score"] == 1.0 # "excellent" maps to 1.0 + assert "llm_eval_reasoning" in result + assert result["llm_eval_reasoning"] == "This is some reasoning." + assert "is_entity_dense" in result + assert isinstance(result["is_entity_dense"], bool) + assert "entity_density" in result + assert isinstance(result["entity_density"], float) + + +def test_summarization_scorer_initialization(summarization_scorer): assert isinstance(summarization_scorer, SummarizationScorer) assert summarization_scorer.model_id == "gpt-4o" assert summarization_scorer.temperature == 0.7 assert summarization_scorer.max_tokens == 1024 -def test_summarization_scorer_extract_entities(summarization_scorer, mock_create): + +def test_summarization_scorer_extract_entities(summarization_scorer): text = "This is a sample text with entities." entities = summarization_scorer.extract_entities(text) assert isinstance(entities, list) assert len(entities) == 2 assert "entity1" in entities assert "entity2" in entities - -def test_summarization_scorer_score(summarization_scorer): - input_text = "This is the original text with entities." - output_text = "This is a summary with some entities." 
- result = summarization_scorer.score(input=input_text, output=output_text) - assert isinstance(result, dict) - assert "recall" in result - assert 0 <= result["recall"] <= 1 - -# Add more tests as needed diff --git a/tests/trace/test_evaluations.py b/tests/trace/test_evaluations.py index 49f8426cc65..16c73aed5b3 100644 --- a/tests/trace/test_evaluations.py +++ b/tests/trace/test_evaluations.py @@ -749,17 +749,15 @@ def function_score(image, dc, model, obj, text, output) -> bool: assert "file_content_read" in access_log - @pytest.mark.asyncio async def test_evaluation_with_column_map(): - # Define a dummy scorer that uses column_map class DummyScorer(Scorer): @weave.op() def score(self, foo: str, bar: str, output: str, target: str) -> dict: # Return whether foo + bar equals output return {"match": (foo + bar) == output == target} - + # Create the scorer with column_map mapping 'foo'->'col1', 'bar'->'col2' dummy_scorer = DummyScorer(column_map={"foo": "col1", "bar": "col2"}) @@ -781,12 +779,13 @@ def model_function(col1, col2): eval_out = await evaluation.evaluate(model_function) # Check that 'DummyScorer' is in the results - assert 'DummyScorer' in eval_out + assert "DummyScorer" in eval_out # The expected summary should show that 3 out of 4 predictions matched expected_results = {"true_count": 3, "true_fraction": 0.75} - assert eval_out['DummyScorer']["match"] == expected_results, "The summary should reflect the correct number of matches" - + assert ( + eval_out["DummyScorer"]["match"] == expected_results + ), "The summary should reflect the correct number of matches" # Define another dummy scorer @@ -799,16 +798,20 @@ class DummyScorer(Scorer): def score(self, foo: str, bar: str, output: str, target: str) -> dict: # Return whether foo + bar equals output return {"match": (foo + bar) == output == target} + class AnotherDummyScorer(Scorer): @weave.op() def score(self, input1: str, input2: str, output: str) -> dict: # Return whether input1 == output reversed return {"match": input1 == output[::-1]} + # First scorer maps 'foo'->'col1', 'bar'->'col2' dummy_scorer = DummyScorer(column_map={"foo": "col1", "bar": "col2"}) # Second scorer maps 'input1'->'col2', 'input2'->'col1' - another_dummy_scorer = AnotherDummyScorer(column_map={"input1": "col2", "input2": "col1"}) + another_dummy_scorer = AnotherDummyScorer( + column_map={"input1": "col2", "input2": "col1"} + ) @weave.op() def model_function(col1, col2): @@ -821,18 +824,22 @@ def model_function(col1, col2): {"col1": "xyz", "col2": "zyx", "target": "zzzzzz"}, ] - evaluation = Evaluation(dataset=dataset, scorers=[dummy_scorer, another_dummy_scorer]) + evaluation = Evaluation( + dataset=dataset, scorers=[dummy_scorer, another_dummy_scorer] + ) # Run the evaluation eval_out = await evaluation.evaluate(model_function) # Check that both scorers are in the results - assert 'DummyScorer' in eval_out - assert 'AnotherDummyScorer' in eval_out + assert "DummyScorer" in eval_out + assert "AnotherDummyScorer" in eval_out # Assertions for the first scorer - expected_results_dummy = {"true_count": 1, "true_fraction": 1.0/3} - assert eval_out['DummyScorer']["match"] == expected_results_dummy, "All concatenations should match the target" + expected_results_dummy = {"true_count": 1, "true_fraction": 1.0 / 3} + assert ( + eval_out["DummyScorer"]["match"] == expected_results_dummy + ), "All concatenations should match the target" # Assertions for the second scorer # Since input1 == col2, and output is col1 + col2, we check if col2 == (col1 + col2)[::-1] @@ 
-842,4 +849,6 @@ def model_function(col1, col2):
     # Third row: col2 = "zyx", output = "xyzzyx", output[::-1] = "xyzzyx" -> "zyx" == "xyzzyx" is False
     # So all matches are False
     expected_results_another_dummy = {"true_count": 0, "true_fraction": 0.0}
-    assert eval_out['AnotherDummyScorer']["match"] == expected_results_another_dummy, "No matches should be found for AnotherDummyScorer"
+    assert (
+        eval_out["AnotherDummyScorer"]["match"] == expected_results_another_dummy
+    ), "No matches should be found for AnotherDummyScorer"
diff --git a/weave/__init__.py b/weave/__init__.py
index 7cf5b49de48..3b54ba97176 100644
--- a/weave/__init__.py
+++ b/weave/__init__.py
@@ -15,20 +15,6 @@ from weave.trace.util import Thread as Thread
 from weave.trace.util import ThreadPoolExecutor as ThreadPoolExecutor
 
-from typing import TYPE_CHECKING
-
-# Helper for IDEs
-if TYPE_CHECKING:
-    from weave.flow import scorers
-
-# Lazy import for the scorers module
-def __getattr__(name):
-    if name == "scorers":
-        from weave.flow import scorers
-        globals()["scorers"] = scorers
-        return scorers
-    raise AttributeError(f"module {__name__} has no attribute {name}")
-
 # Special object informing doc generation tooling which symbols
 # to document & to associate with this module.
 __docspec__ = [
diff --git a/weave/flow/eval.py b/weave/flow/eval.py
index 9a807d47f98..4c211e5d546 100644
--- a/weave/flow/eval.py
+++ b/weave/flow/eval.py
@@ -206,30 +206,45 @@ async def predict_and_score(
 
             # TODO: Check for input columns parameters in the signature of the scorer
-            if "model_output" not in score_arg_names and "output" not in score_arg_names:
+            if (
+                "model_output" not in score_arg_names
+                and "output" not in score_arg_names
+            ):
                 raise OpCallError(
                     f"Scorer {scorer_name} must have a 'model_output' or 'output' argument, to receive the output of the model function."
                 )
 
             if isinstance(example, dict):
-                # If we get a column_map from the scorer, it means that the scorer expects the input to have different names than the dataset columns
-                # So we need to remap the input names to the expected names in the scorer
-                # For instance, if the scorer expects "input" and "target" and we have a dataset with columns "question" and "expected"
-                # we need to remap {"question": "input", "expected": "target"}
-                # and pass those to the scorer
+                # The keys of `score_args` must match the parameter names of the scorer's `score` method.
+                # If scorer.column_map is set, then the user is indicating that the dataset column(s)
+                # being passed to the scorer have different names from the scorer's parameter names.
+ # So we need to remap the dataset columns to the expected parameter names in the scorer, + # + # column_map k:v pairs must be structured as `scorer param name : dataset column name` + # + # For instance, if the scorer expects "input" and "ground_truth" and we have a dataset + # with columns "question" and "answer", column_map should be defined as follows: + # {"input": "question", "ground_truth": "answer"} + # # input: is the full row, we have access to it via example # output: is the model output, we have access to it via model_output if isinstance(scorer, Scorer) and scorer.column_map is not None: - print(f"scorer.column_map: {scorer.column_map}") - print(f"score_arg_names: {score_arg_names}") - print(f"example: {example}") + print( + f"scorer.column_map: {scorer.column_map}" + ) # TODO: delete print statement + print( + f"score_arg_names: {score_arg_names}" + ) # TODO: delete print statement + print(f"example: {example}") # TODO: delete print statement score_args = { arg: example[scorer.column_map.get(arg, arg)] for arg in score_arg_names if scorer.column_map.get(arg, arg) in example } else: - score_args = {k: v for k, v in example.items() if k in score_arg_names} + score_args = { + k: v for k, v in example.items() if k in score_arg_names + } else: if len(score_arg_names) == 2: diff --git a/weave/flow/scorers/__init__.py b/weave/flow/scorers/__init__.py index 68c423eea3c..811880abc44 100644 --- a/weave/flow/scorers/__init__.py +++ b/weave/flow/scorers/__init__.py @@ -1,51 +1,51 @@ -from weave.flow.scorer.base_scorer import ( +from weave.flow.scorers.base_scorer import ( Scorer, auto_summarize, get_scorer_attributes, ) -from weave.flow.scorer.classification_scorer import ( +from weave.flow.scorers.classification_scorer import ( MultiTaskBinaryClassificationF1, transpose, ) -from weave.flow.scorer.hallucination_scorer import HallucinationScorer -from weave.flow.scorer.json_scorer import JSONScorer -from weave.flow.scorer.llm_scorer import ( +from weave.flow.scorers.hallucination_scorer import HallucinationScorer +from weave.flow.scorers.json_scorer import ValidJSONScorer +from weave.flow.scorers.llm_scorer import ( InstructorLLMScorer, LLMScorer, ) -from weave.flow.scorer.moderation_scorer import OpenAIModerationScorer -from weave.flow.scorer.pydantic_scorer import PydanticScorer -from weave.flow.scorer.ragas_scorer import ( +from weave.flow.scorers.moderation_scorer import OpenAIModerationScorer +from weave.flow.scorers.pydantic_scorer import PydanticScorer +from weave.flow.scorers.ragas_scorer import ( ContextEntityRecallScorer, ContextRelevancyScorer, ) -from weave.flow.scorer.similarity_score import EmbeddingSimilarityScorer -from weave.flow.scorer.string_scorer import ( +from weave.flow.scorers.similarity_score import EmbeddingSimilarityScorer +from weave.flow.scorers.string_scorer import ( LevenshteinScorer, RegexScorer, StringMatchScorer, ) -from weave.flow.scorer.summarization_scorer import SummarizationScorer -from weave.flow.scorer.xml_scorer import XMLScorer +from weave.flow.scorers.summarization_scorer import SummarizationScorer +from weave.flow.scorers.xml_scorer import ValidXMLScorer __all__ = [ - "Scorer", "auto_summarize", + "ContextEntityRecallScorer", + "ContextRelevancyScorer", + "EmbeddingSimilarityScorer", "get_scorer_attributes", - "MultiTaskBinaryClassificationF1", - "transpose", - "RegexScorer", - "StringMatchScorer", + "HallucinationScorer", + "InstructorLLMScorer", + "ValidJSONScorer", "LevenshteinScorer", - "JSONScorer", "LLMScorer", - "InstructorLLMScorer", - 
"EmbeddingSimilarityScorer", + "MultiTaskBinaryClassificationF1", "OpenAIModerationScorer", "PydanticScorer", - "HallucinationScorer", - "ContextEntityRecallScorer", - "ContextRelevancyScorer", + "RegexScorer", + "Scorer", + "StringMatchScorer", "SummarizationScorer", - "XMLScorer", + "transpose", + "ValidXMLScorer", ] diff --git a/weave/flow/scorers/classification_scorer.py b/weave/flow/scorers/classification_scorer.py index 622f576e678..4082b291029 100644 --- a/weave/flow/scorers/classification_scorer.py +++ b/weave/flow/scorers/classification_scorer.py @@ -2,7 +2,7 @@ from typing import Optional, Tuple import weave -from weave.flow.scorer.base_scorer import Scorer +from weave.flow.scorers.base_scorer import Scorer def p_r_f1(tp: int, fp: int, fn: int) -> Tuple[float, float, float]: diff --git a/weave/flow/scorers/hallucination_scorer.py b/weave/flow/scorers/hallucination_scorer.py index a2e4d38fa41..9534043d2a9 100644 --- a/weave/flow/scorers/hallucination_scorer.py +++ b/weave/flow/scorers/hallucination_scorer.py @@ -1,41 +1,114 @@ +from typing import List + from pydantic import BaseModel, Field import weave -from weave.flow.scorer.llm_scorer import InstructorLLMScorer -from weave.flow.scorer.llm_utils import OPENAI_DEFAULT_MODEL, create -from weave.flow.scorer.utils import stringify +from weave.flow.scorers.llm_scorer import InstructorLLMScorer +from weave.flow.scorers.llm_utils import OPENAI_DEFAULT_MODEL, create +from weave.flow.scorers.utils import stringify + +DEFAULT_HALLUCINATION_SYSTEM_PROMPT = """ +Given some from a user and an generated by an AI system, \ +determine if the contains any hallucinations. + +A "hallucination" is defined as information in the that is not supported by \ +the or is not factually or logically consistent with the . + +# Steps +1. Carefully read and understand the input data. +2. Examine the model output. +3. Compare the output to the input data, identifying any inconsistencies or additions. +4. Evaluate the logical connection between input and output. +5. Determine if any information in the output is not supported by or conflicts with the input. + +# Guidelines +- Focus on factual accuracy and logical consistency +- Consider both explicit and implicit information in the input data +- Be aware of potential misinterpretations or over-generalizations in the output +- Identify any information in the output that goes beyond the scope of the input + +# Examples +## Data to analyze + + +The cat is black and white. + + + +The cat has orange stripes. + + +## Analysis: +{ + "think_step_by_step": "The cat is black and white. The cat has orange stripes. \ +The output contradicts the input data because the input specifies black and white, \ +while the output mentions orange. The output also introduces a pattern not present in \ +the input.", + "reasoning": [ + { + "hallucination_type": "Color comparison", + "observation": "Input specifies black and white, output mentions orange" + }, + { + "hallucination_type": "Pattern analysis", + "observation": "Input doesn't mention any pattern, output introduces stripes" + } + ], + "conclusion": "The output contains two hallucinations: it contradicts the color information \ +and introduces a pattern not present in the input." 
+ "is_hallucination": true, +} + +# Notes +- Ensure each step in the reasoning process is clearly articulated +- Be objective and avoid assumptions not supported by the input data +- If the output contains factual information not present in the input, it may be a \ +hallucination even if it doesn't directly contradict the input +""" + +DEFAULT_HALLUCINATION_USER_PROMPT = """ +Analyze the following and and determine if the contains any hallucinations. +# Data to analyze -DEFAULT_SYSTEM_PROMPT = """You are tasked with auditing AI agents. Your role is to evaluate conversations, ensuring that the agent's responses are plausible, factually accurate, and non-controversial based on the user's input. If the agent chooses to decline providing an answer, this should be regarded as a valid response.""" -DEFAULT_USER_PROMPT = """Given some input_data and a output, determine if the output is a hallucination of the input_data. -## Input data {input_data} - + -## Model output {output} - -## Instructions -Think step by step before answering. Is the output an factually and logically consistent with the input_data? """ +class HallucinationReasoning(BaseModel): + hallucination_type: str = Field( + description="A short name for the type of hallucination." + ) + observation: str = Field( + description="An observation from the and that supports the hallucination." + ) + + class HallucinationResponse(BaseModel): chain_of_thought: str = Field( - description="Think step by step about whether the output is a hallucination of the dataset_row" + description="Think step by step about whether the contains hallucinations \ +based on the ." + ) + hallucination_reasonings: List[HallucinationReasoning] = Field( + description="A list of reasoning steps that lead to the conclusion about whether or not\ +the contains hallucinations." ) + conclusion: str = Field(description="The conclusion of the analysis.") is_hallucination: bool = Field( - description="Whether the model output is a hallucination of the dataset row" + description="Whether the contains hallucinations based on the ." 
) class HallucinationScorer(InstructorLLMScorer): """Scorer that checks if the model output is a hallucination of the dataset row.""" - system_prompt: str = DEFAULT_SYSTEM_PROMPT - user_prompt: str = DEFAULT_USER_PROMPT + system_prompt: str = DEFAULT_HALLUCINATION_SYSTEM_PROMPT + user_prompt: str = DEFAULT_HALLUCINATION_USER_PROMPT model_id: str = OPENAI_DEFAULT_MODEL temperature: float = 0.7 max_tokens: int = 4096 diff --git a/weave/flow/scorers/json_scorer.py b/weave/flow/scorers/json_scorer.py index 598b7a4f002..f40f2d66fe6 100644 --- a/weave/flow/scorers/json_scorer.py +++ b/weave/flow/scorers/json_scorer.py @@ -1,10 +1,10 @@ import json from typing import Any -from weave.flow.scorer.base_scorer import Scorer +from weave.flow.scorers.base_scorer import Scorer -class JSONScorer(Scorer): +class ValidJSONScorer(Scorer): """Score a JSON string.""" def score(self, output: Any, **kwargs: Any) -> dict: # type: ignore diff --git a/weave/flow/scorers/llm_scorer.py b/weave/flow/scorers/llm_scorer.py index 7bcf9cf9af6..d319670ae77 100644 --- a/weave/flow/scorers/llm_scorer.py +++ b/weave/flow/scorers/llm_scorer.py @@ -2,8 +2,8 @@ from pydantic import Field, field_validator -from weave.flow.scorer.base_scorer import Scorer -from weave.flow.scorer.llm_utils import instructor_client, _LLM_CLIENTS +from weave.flow.scorers.base_scorer import Scorer +from weave.flow.scorers.llm_utils import _LLM_CLIENTS_NAMES, instructor_client class LLMScorer(Scorer): @@ -16,9 +16,10 @@ class LLMScorer(Scorer): @field_validator("client") def validate_client(cls, v): # type: ignore - if not isinstance(v, _LLM_CLIENTS): + client_type_name = type(v).__name__ + if client_type_name not in _LLM_CLIENTS_NAMES: raise ValueError( - f"Invalid client type. Expected one of {_LLM_CLIENTS}, got {type(v)}" + f"Invalid client type. Expected one of {_LLM_CLIENTS_NAMES}, got {client_type_name}" ) return v @@ -39,8 +40,9 @@ class InstructorLLMScorer(Scorer): @field_validator("client") def validate_client(cls, v): # type: ignore - if not isinstance(v, _LLM_CLIENTS): + client_type_name = type(v).__name__ + if client_type_name not in _LLM_CLIENTS_NAMES: raise ValueError( - f"Invalid client type. Expected one of {_LLM_CLIENTS}, got {type(v)}" + f"Invalid client type. 
Expected one of {_LLM_CLIENTS_NAMES}, got {client_type_name}" ) return instructor_client(v) diff --git a/weave/flow/scorers/llm_utils.py b/weave/flow/scorers/llm_utils.py index e6bce53f8db..5d480f080b8 100644 --- a/weave/flow/scorers/llm_utils.py +++ b/weave/flow/scorers/llm_utils.py @@ -1,4 +1,6 @@ -from typing import TYPE_CHECKING, List, Optional, Union +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, List, Optional, Union from weave.trace.autopatch import autopatch @@ -26,6 +28,14 @@ else: _LLM_CLIENTS = object +_LLM_CLIENTS_NAMES = ( + "OpenAI", + "AsyncOpenAI", + "Anthropic", + "AsyncAnthropic", + "Mistral", +) + def instructor_client(client: _LLM_CLIENTS) -> "instructor.client": # type: ignore try: @@ -47,12 +57,12 @@ def instructor_client(client: _LLM_CLIENTS) -> "instructor.client": # type: ign raise ValueError(f"Unsupported client type: {client_type}") -def create(client: _LLM_CLIENTS, *args, **kwargs): # type: ignore +def create(client: instructor.client, *args, **kwargs) -> Any: # type: ignore return client.chat.completions.create(*args, **kwargs) def embed( - client: _LLM_CLIENTS, model_id: str, texts: Union[str, List[str]], **kwargs + client: _LLM_CLIENTS, model_id: str, texts: Union[str, List[str]], **kwargs: Any ) -> List[List[float]]: # type: ignore client_type = type(client).__name__.lower() if "openai" in client_type: @@ -71,12 +81,15 @@ def import_client(provider: str) -> Optional[_LLM_CLIENTS]: # type: ignore try: if provider == "openai": from openai import OpenAI + return OpenAI elif provider == "anthropic": import anthropic + return anthropic.Anthropic elif provider == "mistral": from mistralai import Mistral + return Mistral except ImportError: return None diff --git a/weave/flow/scorers/moderation_scorer.py b/weave/flow/scorers/moderation_scorer.py index 51b92b9f85b..8a8e4eee9da 100644 --- a/weave/flow/scorers/moderation_scorer.py +++ b/weave/flow/scorers/moderation_scorer.py @@ -3,7 +3,7 @@ from pydantic import field_validator import weave -from weave.flow.scorer.llm_scorer import LLMScorer +from weave.flow.scorers.llm_scorer import LLMScorer class OpenAIModerationScorer(LLMScorer): diff --git a/weave/flow/scorers/pydantic_scorer.py b/weave/flow/scorers/pydantic_scorer.py index 90fdffd6378..5566326774d 100644 --- a/weave/flow/scorers/pydantic_scorer.py +++ b/weave/flow/scorers/pydantic_scorer.py @@ -2,7 +2,7 @@ from pydantic import BaseModel, ValidationError -from weave.flow.scorer.base_scorer import Scorer +from weave.flow.scorers.base_scorer import Scorer class PydanticScorer(Scorer): diff --git a/weave/flow/scorers/ragas_scorer.py b/weave/flow/scorers/ragas_scorer.py index 6180697f59f..8b3493c3542 100644 --- a/weave/flow/scorers/ragas_scorer.py +++ b/weave/flow/scorers/ragas_scorer.py @@ -6,8 +6,8 @@ from pydantic import BaseModel, Field import weave -from weave.flow.scorer.llm_scorer import InstructorLLMScorer -from weave.flow.scorer.llm_utils import create +from weave.flow.scorers.llm_scorer import InstructorLLMScorer +from weave.flow.scorers.llm_utils import create class EntityExtractionResponse(BaseModel): diff --git a/weave/flow/scorers/similarity_score.py b/weave/flow/scorers/similarity_score.py index 722c16e98c2..82d8760b747 100644 --- a/weave/flow/scorers/similarity_score.py +++ b/weave/flow/scorers/similarity_score.py @@ -4,8 +4,8 @@ from pydantic import Field import weave -from weave.flow.scorer.llm_scorer import LLMScorer -from weave.flow.scorer.llm_utils import embed +from weave.flow.scorers.llm_scorer import LLMScorer 
+from weave.flow.scorers.llm_utils import embed
 
 
 class EmbeddingSimilarityScorer(LLMScorer):
diff --git a/weave/flow/scorers/string_scorer.py b/weave/flow/scorers/string_scorer.py
index a34fa561960..4dc58922668 100644
--- a/weave/flow/scorers/string_scorer.py
+++ b/weave/flow/scorers/string_scorer.py
@@ -4,7 +4,7 @@
 from pydantic import Field, model_validator
 
 import weave
-from weave.flow.scorer.base_scorer import Scorer
+from weave.flow.scorers.base_scorer import Scorer
 
 
 class StringMatchScorer(Scorer):
@@ -53,7 +53,7 @@ def score(
         text_to_search = "".join(text_to_search.split())
 
         match_found = any(
-            pattern.search(text_to_search) for pattern in compiled_patterns
+            pattern.search(str(text_to_search)) for pattern in compiled_patterns
         )
 
         return {"string_match": match_found}
@@ -70,6 +70,7 @@ def check_levenshtein(self):  # type: ignore
             from Levenshtein import distance
 
             self.distance = distance
+            return self
         except ImportError:
             raise ValueError(
                 "Levenshtein package not found. Please install it with `pip install Levenshtein`"
diff --git a/weave/flow/scorers/summarization_scorer.py b/weave/flow/scorers/summarization_scorer.py
index ee43a7f48b1..effc4a990a2 100644
--- a/weave/flow/scorers/summarization_scorer.py
+++ b/weave/flow/scorers/summarization_scorer.py
@@ -1,40 +1,121 @@
-from textwrap import dedent
-from typing import Any, List
+import asyncio
+from typing import Any, List, Literal
 
 from pydantic import BaseModel, Field
 
 import weave
-from weave.flow.scorer.llm_scorer import InstructorLLMScorer
-from weave.flow.scorer.llm_utils import create
+from weave.flow.scorers.llm_scorer import InstructorLLMScorer
+from weave.flow.scorers.llm_utils import create
+
+DEFAULT_EXTRACTION_SYSTEM_PROMPT = """
+Given a <text>, extract all the unique entities from the text without repetition.
+"""
+
+DEFAULT_EXTRACTION_USER_PROMPT = """
+Extract all the unique entities from the following <text> without repetition:
+<text>
+{text}
+</text>
+"""
+
+DEFAULT_SUMMARIZATION_EVALUATION_SYSTEM_PROMPT = """
+Given an <input> and a <summary>, evaluate the quality of the <summary>.
+
+# Considerations
+- Does the <summary> contain the key information in the <input>?
+- Is the <summary> concise and informative?
+- Is the <summary> grammatically correct?
+- Does the <summary> contain information or assertions that are not present in the <input>?
+
+# Scoring Rubric
+`excellent`: The <summary> contains all of the key information and entities in the <input>, \
+is concise and information dense, is grammatically correct and doesn't contain any \
+information or assertions that are not present in the <input>.
+
+`ok`: The <summary> contains most of the key information and entities in the <input>, \
+is somewhat concise and informative, is mostly grammatically correct and doesn't contain any \
+information or assertions that are not present in the <input>.
+
+`poor`: The <summary> misses most or all of the key information in the <input>, \
+or is very verbose or vague, or is not concise or informative, or has many grammatical errors, \
+or contains information or assertions that are not present in the <input>.
+"""
+
+DEFAULT_SUMMARIZATION_EVALUATION_USER_PROMPT = """
+Evaluate the quality of the following <summary> given the <input>:
+
+<input>
+{input}
+</input>
+
+<summary>
+{summary}
+</summary>
+"""
 
 
 class EntityExtractionResponse(BaseModel):
     entities: List[str] = Field(
-        description="A list of unique entities extracted from the text"
+        description="A list of unique entities extracted from the text."
     )
 
 
-class SummarizationScorer(InstructorLLMScorer):
-    """Estimates summary quality by computing the recall of entities in the model output compared to the input."""
+summarization_quality_options = Literal["poor", "ok", "excellent"]
+summarization_quality_mapping = {"poor": 0.0, "ok": 0.5, "excellent": 1.0}
+
+
+class SummarizationEvaluationResponse(BaseModel):
+    think_step_by_step: str = Field(
+        description="Think step-by-step about the quality of the <summary> before deciding \
+on the summarization_evaluation."
+    )
+    summarization_evaluation: summarization_quality_options = Field(
+        description="The evaluation of the summary"
+    )
 
-    extraction_prompt: str = dedent("""
-    Extract unique entities from the following text without repetition.
-    Text: {text}
-    Entities:
-    """)
 
+class SummarizationScorer(InstructorLLMScorer):
+    """
+    Estimates summary quality by both:
+    - Calculating the entity density of the summary, similar to how entity density is
+      used in the Chain of Density paper, https://arxiv.org/abs/2309.04269.
+    - Using an LLM to evaluate the summary quality.
+
+    column_map: A `scorer parameter name : dataset column name` mapping.
+
+    This summarization scorer expects the input column in the dataset to be named "input" \
+    and the output column in the dataset to be named "output".
+    You can specify a different mapping in the `column_map` argument. For example, \
+    if your dataset contains columns "news_article" and "news_summary" then you can \
+    specify `column_map={"input": "news_article", "output": "news_summary"}`.
+
+    Parameters to the `score` function:
+    - input: The text that was to be summarized
+    - output: The summary of the text
+    """
+    extraction_system_prompt: str = DEFAULT_EXTRACTION_SYSTEM_PROMPT
+    extraction_prompt: str = DEFAULT_EXTRACTION_USER_PROMPT
+    summarization_evaluation_system_prompt: str = (
+        DEFAULT_SUMMARIZATION_EVALUATION_SYSTEM_PROMPT
+    )
+    summarization_evaluation_prompt: str = DEFAULT_SUMMARIZATION_EVALUATION_USER_PROMPT
+    fast_model_id: str = "gpt-4o-mini"
+    entity_density_threshold: float = 0.08
     temperature: float = 0.7
     max_tokens: int = 1024
 
+    @weave.op
     def extract_entities(self, text: str) -> List[str]:
-        # Use LLM to extract entities
-        prompt = self.extraction_prompt.format(text=text)
+        """Use an LLM to extract entities"""
         response = create(
             self.client,
-            messages=[{"role": "user", "content": prompt}],
+            messages=[
+                {"role": "system", "content": self.extraction_system_prompt},
+                {"role": "user", "content": self.extraction_prompt.format(text=text)},
+            ],
             response_model=EntityExtractionResponse,
-            model=self.model_id,
+            model=self.fast_model_id,
             temperature=self.temperature,
             max_tokens=self.max_tokens,
         )
@@ -42,13 +123,57 @@ def extract_entities(self, text: str) -> List[str]:
         return entities
 
     @weave.op
-    def score(self, input: str, output: str, **kwargs: Any) -> dict:
-        # Extract entities
-        output_entities = self.extract_entities(output)
-        input_entities = self.extract_entities(input)
-        # Calculate recall
-        if not output_entities:
-            return {"recall": 0.0}
-        matches = set(output_entities) & set(input_entities)
-        recall = len(matches) / len(input_entities)
-        return {"recall": recall}
+    def evaluate_summary(
+        self, input: str, summary: str
+    ) -> SummarizationEvaluationResponse:
+        """Evaluate the quality of a summary using an LLM"""
+        return create(
+            self.client,
+            messages=[
+                {
+                    "role": "system",
+                    "content": self.summarization_evaluation_system_prompt,
+                },
+                {
+                    "role": "user",
+                    "content": self.summarization_evaluation_prompt.format(
+                        input=input, summary=summary
+                    ),
+                },
+            ],
+            response_model=SummarizationEvaluationResponse,
+            model=self.model_id,
+            temperature=self.temperature,
+            max_tokens=self.max_tokens,
+        )
+
+    def simple_word_tokenize(self, text: str) -> List[str]:
+        """Simple word tokenization"""
+        return text.split()
+
+    @weave.op
+    async def score(self, input: str, output: str, **kwargs: Any) -> dict:
+        """
+        - input: the piece of text that was to be summarized
+        - output: the generated summary of the input
+        """
+        extract_task = asyncio.to_thread(self.extract_entities, text=output)
+        evaluate_task = asyncio.to_thread(
+            self.evaluate_summary, input=input, summary=output
+        )
+        summary_entities, llm_eval = await asyncio.gather(extract_task, evaluate_task)
+
+        # LLM evaluation
+        result = {}
+        result["summarization_eval_score"] = summarization_quality_mapping.get(
+            llm_eval.summarization_evaluation.lower()
+        )
+        result["llm_eval_reasoning"] = llm_eval.think_step_by_step
+
+        # Entity density evaluation
+        summary_words = self.simple_word_tokenize(output)
+        entity_density = len(summary_entities) / len(summary_words)
+        result["is_entity_dense"] = entity_density >= self.entity_density_threshold
+        result["entity_density"] = entity_density
+
+        return result
diff --git a/weave/flow/scorers/xml_scorer.py b/weave/flow/scorers/xml_scorer.py
index 7bd42516e69..2ea8384477f 100644
--- a/weave/flow/scorers/xml_scorer.py
+++ b/weave/flow/scorers/xml_scorer.py
@@ -1,10 +1,10 @@
 import xml.etree.ElementTree as ET
 from typing import Union
 
-from weave.flow.scorer.base_scorer import Scorer
+from weave.flow.scorers.base_scorer import Scorer
 
 
-class XMLScorer(Scorer):
+class ValidXMLScorer(Scorer):
     """Score an XML string."""
 
     def score(self, output: Union[str, dict]) -> dict:  # type: ignore
diff --git a/weave/scorers/__init__.py b/weave/scorers/__init__.py
new file mode 100644
index 00000000000..a1db6897f34
--- /dev/null
+++ b/weave/scorers/__init__.py
@@ -0,0 +1 @@
+from weave.flow.scorers import *
\ No newline at end of file
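
For readers updating call sites after this patch, here is a minimal usage sketch of the relocated `weave.scorers` import path and the re-written `SummarizationScorer`, wired into an evaluation through `column_map` as described in the scorer's docstring. It only uses APIs exercised elsewhere in this patch (`weave.init`, `weave.Evaluation`, `weave.op`, the scorer constructor with `client`/`model_id`); the project name, dataset rows, and stand-in model are illustrative placeholders, not part of the change.

```python
# Illustrative sketch only: exercises the new `weave.scorers` import path and the
# SummarizationScorer's column_map support. The project name, dataset, and the
# stand-in model below are placeholder assumptions, not part of this patch.
import asyncio

import weave
from openai import OpenAI
from weave.scorers import SummarizationScorer

weave.init("summarization-demo")  # placeholder project name

# The dataset column "news_article" differs from the scorer's "input" parameter,
# so remap it via column_map ({"scorer param name": "dataset column name"}).
dataset = [
    {"news_article": "The city council approved the new transit budget on Tuesday."},
    {"news_article": "Researchers reported record-low winter sea ice extent this year."},
]

summarization_scorer = SummarizationScorer(
    client=OpenAI(),  # reads OPENAI_API_KEY from the environment
    model_id="gpt-4o",
    column_map={"input": "news_article"},
)


@weave.op()
def summarize(news_article: str) -> str:
    # Stand-in "model": crude truncation; a real setup would call an LLM here.
    return " ".join(news_article.split()[:8])


evaluation = weave.Evaluation(dataset=dataset, scorers=[summarization_scorer])

if __name__ == "__main__":
    asyncio.run(evaluation.evaluate(summarize))
```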