
Commit

lint
tcapelle committed Oct 11, 2024
1 parent 04496cc commit 8bafa5d
Showing 11 changed files with 211 additions and 113 deletions.
15 changes: 11 additions & 4 deletions weave/flow/scorer/__init__.py
@@ -1,19 +1,26 @@
from weave.flow.scorer.base_scorer import Scorer, auto_summarize, get_scorer_attributes
from weave.flow.scorer.classification_scorer import MultiTaskBinaryClassificationF1, transpose
from weave.flow.scorer.classification_scorer import (
MultiTaskBinaryClassificationF1,
transpose,
)
from weave.flow.scorer.hallucination_scorer import HallucinationScorer
from weave.flow.scorer.json_scorer import JSONScorer
from weave.flow.scorer.llm_scorer import (
LLMScorer,
InstructorLLMScorer,
LLMScorer,
)
from weave.flow.scorer.similarity_score import EmbeddingSimilarityScorer
from weave.flow.scorer.moderation_scorer import OpenAIModerationScorer
from weave.flow.scorer.pydantic_scorer import PydanticScorer
from weave.flow.scorer.ragas_scorer import (
ContextEntityRecallScorer,
ContextRelevancyScorer,
)
from weave.flow.scorer.string_scorer import RegexScorer, StringMatchScorer, LevenshteinScorer
from weave.flow.scorer.similarity_score import EmbeddingSimilarityScorer
from weave.flow.scorer.string_scorer import (
LevenshteinScorer,
RegexScorer,
StringMatchScorer,
)
from weave.flow.scorer.summarization_scorer import SummarizationScorer
from weave.flow.scorer.xml_scorer import XMLScorer
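
Since this __init__.py re-exports every scorer, downstream code can import them from the package root rather than from the individual modules. A minimal sketch, assuming only the re-exports shown above:

# Illustrative only (not part of this commit): package-level imports.
from weave.flow.scorer import (
    HallucinationScorer,
    LevenshteinScorer,
    OpenAIModerationScorer,
    StringMatchScorer,
)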

6 changes: 5 additions & 1 deletion weave/flow/scorer/base_scorer.py
@@ -11,7 +11,11 @@


class Scorer(Object):
column_map: Optional[dict[str, str]] = Field(default=None, description="A mapping from column names in the dataset to the names expected by the scorer")
column_map: Optional[dict[str, str]] = Field(
default=None,
description="A mapping from column names in the dataset to the names expected by the scorer",
)

def score(self, input: Any, target: Any, output: Any) -> Any:
raise NotImplementedError
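
For context, a minimal sketch of how this base class is meant to be extended; the ExactMatchScorer name and its return value are hypothetical, not part of this commit. The column_map field defined above lets a scorer remap dataset column names to the argument names it expects, as the HallucinationScorer example later in this diff does with column_map={"text": "context"}.

# Hypothetical subclass, for illustration only.
from typing import Any

from weave.flow.scorer.base_scorer import Scorer


class ExactMatchScorer(Scorer):
    def score(self, input: Any, target: Any, output: Any) -> Any:
        # Returning a dict of primitives keeps results easy to aggregate
        # (e.g. with auto_summarize from base_scorer).
        return {"exact_match": output == target}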

54 changes: 33 additions & 21 deletions weave/flow/scorer/hallucination_scorer.py
@@ -1,14 +1,12 @@
from pydantic import BaseModel, Field


import weave
from weave.flow.scorer.utils import stringify
from weave.flow.scorer.llm_scorer import InstructorLLMScorer
from weave.flow.scorer.llm_utils import OPENAI_DEFAULT_MODEL, create
from weave.flow.scorer.utils import stringify


DEFAULT_SYSTEM_PROMPT = """You are tasked with auditing AI agents. Your role is to evaluate conversations, ensuring that the agent's responses are plausible, factually accurate, and non-controversial based on the user's input. If the agent chooses to decline providing an answer, this should be regarded as a valid response."""
DEFAULT_USER_PROMPT = """Given some input_data and an output, determine if the output is a hallucination of the input_data.
DEFAULT_SYSTEM_PROMPT = """You are tasked with auditing AI agents. Your role is to evaluate conversations, ensuring that the agent's responses are plausible, factually accurate, and non-controversial based on the user's input. If the agent chooses to decline providing an answer, this should be regarded as a valid response."""
DEFAULT_USER_PROMPT = """Given some input_data and an output, determine if the output is a hallucination of the input_data.
## Input data
<input_data>
{input_data}
@@ -20,17 +18,22 @@
</output>
## Instructions
Think step by step before answering. Is the output factually and logically consistent with the input_data?
Think step by step before answering. Is the output factually and logically consistent with the input_data?
"""


class HallucinationResponse(BaseModel):
chain_of_thought: str = Field(description="Think step by step about whether the output is a hallucination of the dataset_row")
is_hallucination: bool = Field(description="Whether the model output is a hallucination of the dataset row")
chain_of_thought: str = Field(
description="Think step by step about whether the output is a hallucination of the dataset_row"
)
is_hallucination: bool = Field(
description="Whether the model output is a hallucination of the dataset row"
)


class HallucinationScorer(InstructorLLMScorer):
"""
Scorer that checks if the model output is a hallucination of the dataset row.
"""
"""Scorer that checks if the model output is a hallucination of the dataset row."""

system_prompt: str = DEFAULT_SYSTEM_PROMPT
user_prompt: str = DEFAULT_USER_PROMPT
model_id: str = OPENAI_DEFAULT_MODEL
@@ -39,13 +42,17 @@ class HallucinationScorer(InstructorLLMScorer):

@weave.op
def score(self, output: str, context: str) -> HallucinationResponse:

output = stringify(output)
response = create(
self.client,
messages=[
{"role": "system", "content": self.system_prompt},
{"role": "user", "content": self.user_prompt.format(input_data=context, output=output)},
{
"role": "user",
"content": self.user_prompt.format(
input_data=context, output=output
),
},
],
model=self.model_id,
response_model=HallucinationResponse,
@@ -57,28 +64,33 @@ def score(self, output: str, context: str) -> HallucinationResponse:

if __name__ == "__main__":
try:
import openai, os, weave, asyncio
import asyncio
import os

import openai

import weave

# weave.init("hallucination-scorer-2")

openai_client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
scorer = HallucinationScorer(client=openai_client, column_map={"text": "context"})
scorer = HallucinationScorer(
client=openai_client, column_map={"text": "context"}
)

output = "John favorite cheese is camembert"
dataset_row = {"text": "John doesn't like cheese"}
response = scorer.score(output, context=dataset_row)
print(response)

@weave.op
def model():
return "John favorite food is apples"

dataset = [{"text": "John doesn't like cheese"},
{"text": "John likes pizza"}]

dataset = [{"text": "John doesn't like cheese"}, {"text": "John likes pizza"}]

evaluation = weave.Evaluation(dataset=dataset, scorers=[scorer])
asyncio.run(evaluation.evaluate(model))

except Exception as e:
print(e)
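
The HallucinationResponse returned by scorer.score(...) in the example above is a Pydantic object, so its two fields can be read directly; a short sketch, reusing the names from that example:

# Illustrative only: inspecting the fields defined on HallucinationResponse.
response = scorer.score(output, context=dataset_row)
print(response.is_hallucination)  # bool: whether the output is judged a hallucination of the context
print(response.chain_of_thought)  # str: the model's step-by-step reasoning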

14 changes: 10 additions & 4 deletions weave/flow/scorer/llm_scorer.py
@@ -3,7 +3,8 @@
from pydantic import Field, field_validator

from weave.flow.scorer.base_scorer import Scorer
from weave.flow.scorer.llm_utils import instructor_client, _LLM_CLIENT_TYPES
from weave.flow.scorer.llm_utils import _LLM_CLIENT_TYPES, instructor_client


class LLMScorer(Scorer):
"""Score a model output using an LLM"""
@@ -21,20 +22,25 @@ def validate_client(cls, v):
)
return v


class InstructorLLMScorer(Scorer):
"""Score a model output using an LLM"""

client: Any = Field(
description="The LLM client to use, has to be instantiated with an api_key"
)
model_id: str = Field(description="The model to use")
temperature: float = Field(..., description="The temperature to use for the response")
max_tokens: int = Field(..., description="The maximum number of tokens in the response")
temperature: float = Field(
..., description="The temperature to use for the response"
)
max_tokens: int = Field(
..., description="The maximum number of tokens in the response"
)

@field_validator("client")
def validate_client(cls, v):
if not any(isinstance(v, client_type) for client_type in _LLM_CLIENT_TYPES):
raise ValueError(
f"Invalid client type. Expected one of {_LLM_CLIENT_TYPES}, got {type(v)}"
)
return instructor_client(v)
return instructor_client(v)
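
For orientation, a rough sketch of how InstructorLLMScorer's required fields and the create helper fit together, patterned on the HallucinationScorer above; the ToxicityScorer class, its prompt, and the example model id are hypothetical, not part of this commit.

# Hypothetical subclass, for illustration only.
from pydantic import BaseModel, Field

import weave
from weave.flow.scorer.llm_scorer import InstructorLLMScorer
from weave.flow.scorer.llm_utils import create


class ToxicityResponse(BaseModel):
    is_toxic: bool = Field(description="Whether the model output is toxic")


class ToxicityScorer(InstructorLLMScorer):
    @weave.op
    def score(self, output: str) -> ToxicityResponse:
        # The validator above has already wrapped self.client with instructor,
        # so create() accepts a response_model here.
        return create(
            self.client,
            messages=[
                {"role": "user", "content": f"Is the following output toxic? {output}"}
            ],
            model=self.model_id,
            response_model=ToxicityResponse,
        )


# temperature and max_tokens use Field(...), so they must be passed explicitly, e.g.:
# scorer = ToxicityScorer(
#     client=openai.OpenAI(), model_id="gpt-4o", temperature=0.0, max_tokens=256
# )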
40 changes: 29 additions & 11 deletions weave/flow/scorer/llm_utils.py
@@ -1,10 +1,10 @@
from typing import List, Union, TypeVar
from typing import List, TypeVar, Union

import instructor

from weave.trace.autopatch import autopatch

autopatch() # fix instructor tracing
autopatch()  # fix instructor tracing

# TODO: Gemini

@@ -44,6 +44,7 @@

_LLM_CLIENTS = TypeVar(Union[tuple(_LLM_CLIENT_TYPES)])


def instructor_client(client: _LLM_CLIENTS):
client_type = type(client).__name__.lower()
if "mistral" in client_type:
@@ -54,11 +55,15 @@ def instructor_client(client: _LLM_CLIENTS):
return instructor.from_anthropic(client)
else:
raise ValueError(f"Unsupported client type: {client_type}")



def create(client: _LLM_CLIENTS, *args, **kwargs):
return client.chat.completions.create(*args, **kwargs)

def embed(client: _LLM_CLIENTS, model_id: str, texts: Union[str, List[str]], **kwargs) -> List[List[float]]:

def embed(
client: _LLM_CLIENTS, model_id: str, texts: Union[str, List[str]], **kwargs
) -> List[List[float]]:
client_type = type(client).__name__.lower()
if "mistral" in client_type:
response = client.embeddings.create(model=model_id, inputs=texts, **kwargs)
@@ -69,6 +74,7 @@ def embed(client: _LLM_CLIENTS, model_id: str, texts: Union[str, List[str]], **k
else:
raise ValueError(f"Unsupported client type: {type(client).__name__.lower()}")


# Helper function for dynamic imports
def import_client(provider: str):
try:
@@ -90,13 +96,14 @@ def import_client(provider: str):

# Example usage:
if __name__ == "__main__":
import asyncio
import os

# Mistral example
MistralClient = import_client("mistral")
if MistralClient:
mistral_client = instructor_client(Mistral(api_key=os.environ.get("MISTRAL_API_KEY")))
mistral_client = instructor_client(
Mistral(api_key=os.environ.get("MISTRAL_API_KEY"))
)
mistral_response = mistral_client.chat.completions.create(
messages=[{"role": "user", "content": "What is the best French cheese?"}],
model=MISTRAL_DEFAULT_MODEL,
@@ -108,7 +115,9 @@ def import_client(provider: str):
# OpenAI example with system message
OpenAIClient = import_client("openai")
if OpenAIClient:
openai_client = instructor_client(OpenAIClient(api_key=os.environ.get("OPENAI_API_KEY")))
openai_client = instructor_client(
OpenAIClient(api_key=os.environ.get("OPENAI_API_KEY"))
)
openai_response = openai_client.chat.completions.create(
messages=[
{
@@ -129,7 +138,9 @@ def import_client(provider: str):
# Anthropic example with system message
AnthropicClient = import_client("anthropic")
if AnthropicClient:
anthropic_client = instructor_client(AnthropicClient(api_key=os.environ.get("ANTHROPIC_API_KEY")))
anthropic_client = instructor_client(
AnthropicClient(api_key=os.environ.get("ANTHROPIC_API_KEY"))
)
anthropic_response = anthropic_client.messages.create(
messages=[
{
@@ -147,11 +158,18 @@ def import_client(provider: str):
# Embedding example
if OpenAIClient:
openai_embed_client = OpenAIClient(api_key=os.environ.get("OPENAI_API_KEY"))
openai_embeddings = embed(openai_embed_client, OPENAI_DEFAULT_EMBEDDING_MODEL, ["Embed this sentence.", "As well as this one."])
openai_embeddings = embed(
openai_embed_client,
OPENAI_DEFAULT_EMBEDDING_MODEL,
["Embed this sentence.", "As well as this one."],
)
print("OpenAI embeddings:", openai_embeddings)

if MistralClient:
mistral_embed_client = MistralClient(api_key=os.environ.get("MISTRAL_API_KEY"))
mistral_embeddings = embed(mistral_embed_client, MISTRAL_DEFAULT_EMBEDDING_MODEL, ["Embed this sentence.", "As well as this one."])
mistral_embeddings = embed(
mistral_embed_client,
MISTRAL_DEFAULT_EMBEDDING_MODEL,
["Embed this sentence.", "As well as this one."],
)
print("Mistral embeddings:", mistral_embeddings)

16 changes: 11 additions & 5 deletions weave/flow/scorer/moderation_scorer.py
@@ -1,4 +1,5 @@
from typing import Any

from pydantic import field_validator

import weave
@@ -11,14 +12,17 @@ class OpenAIModerationScorer(LLMScorer):
@field_validator("client")
def validate_openai_client(cls, v):
try:
from openai import AsyncOpenAI, OpenAI # Ensure these are the correct imports
from openai import ( # Ensure these are the correct imports
AsyncOpenAI,
OpenAI,
)
except ImportError:
raise ValueError("Install openai to use this scorer")

if not isinstance(v, (OpenAI, AsyncOpenAI)):
raise ValueError("Moderation scoring only works with OpenAI or AsyncOpenAI")
return v

@weave.op
def score(self, output: Any) -> Any:
response = self.client.moderations.create(
@@ -34,8 +38,10 @@ def score(self, output: Any) -> Any:
import openai

client = openai.OpenAI()
scorer = OpenAIModerationScorer(client=client, model_id="omni-moderation-latest")
scorer = OpenAIModerationScorer(
client=client, model_id="omni-moderation-latest"
)
print(scorer.score("I should kill someone"))
except Exception as e:
print("Error:", e)
raise e
raise e