
Commit

lint
tcapelle committed Oct 11, 2024
1 parent 04496cc commit 8bafa5d
Showing 11 changed files with 211 additions and 113 deletions.
15 changes: 11 additions & 4 deletions weave/flow/scorer/__init__.py
@@ -1,19 +1,26 @@
from weave.flow.scorer.base_scorer import Scorer, auto_summarize, get_scorer_attributes
from weave.flow.scorer.classification_scorer import MultiTaskBinaryClassificationF1, transpose
from weave.flow.scorer.classification_scorer import (
MultiTaskBinaryClassificationF1,
transpose,
)
from weave.flow.scorer.hallucination_scorer import HallucinationScorer
from weave.flow.scorer.json_scorer import JSONScorer
from weave.flow.scorer.llm_scorer import (
LLMScorer,
InstructorLLMScorer,
LLMScorer,
)
from weave.flow.scorer.similarity_score import EmbeddingSimilarityScorer
from weave.flow.scorer.moderation_scorer import OpenAIModerationScorer
from weave.flow.scorer.pydantic_scorer import PydanticScorer
from weave.flow.scorer.ragas_scorer import (
ContextEntityRecallScorer,
ContextRelevancyScorer,
)
from weave.flow.scorer.string_scorer import RegexScorer, StringMatchScorer, LevenshteinScorer
from weave.flow.scorer.similarity_score import EmbeddingSimilarityScorer
from weave.flow.scorer.string_scorer import (
LevenshteinScorer,
RegexScorer,
StringMatchScorer,
)
from weave.flow.scorer.summarization_scorer import SummarizationScorer
from weave.flow.scorer.xml_scorer import XMLScorer
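
Since this __init__.py re-exports every scorer, downstream code can import them from the package root rather than from the individual modules. A minimal sketch, assuming only the re-exports shown above:

# Illustrative only (not part of this commit): package-level imports.
from weave.flow.scorer import (
    HallucinationScorer,
    LevenshteinScorer,
    OpenAIModerationScorer,
    StringMatchScorer,
)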

6 changes: 5 additions & 1 deletion weave/flow/scorer/base_scorer.py
@@ -11,7 +11,11 @@


class Scorer(Object):
column_map: Optional[dict[str, str]] = Field(default=None, description="A mapping from column names in the dataset to the names expected by the scorer")
column_map: Optional[dict[str, str]] = Field(
default=None,
description="A mapping from column names in the dataset to the names expected by the scorer",
)

def score(self, input: Any, target: Any, output: Any) -> Any:
raise NotImplementedError
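
For context, a minimal sketch of how this base class is meant to be extended; the ExactMatchScorer name and its return value are hypothetical, not part of this commit. The column_map field defined above lets a scorer remap dataset column names to the argument names it expects, as the HallucinationScorer example later in this diff does with column_map={"text": "context"}.

# Hypothetical subclass, for illustration only.
from typing import Any

from weave.flow.scorer.base_scorer import Scorer


class ExactMatchScorer(Scorer):
    def score(self, input: Any, target: Any, output: Any) -> Any:
        # Returning a dict of primitives keeps results easy to aggregate
        # (e.g. with auto_summarize from base_scorer).
        return {"exact_match": output == target}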

54 changes: 33 additions & 21 deletions weave/flow/scorer/hallucination_scorer.py
@@ -1,14 +1,12 @@
from pydantic import BaseModel, Field


import weave
from weave.flow.scorer.utils import stringify
from weave.flow.scorer.llm_scorer import InstructorLLMScorer
from weave.flow.scorer.llm_utils import OPENAI_DEFAULT_MODEL, create
from weave.flow.scorer.utils import stringify


DEFAULT_SYSTEM_PROMPT = """You are tasked with auditing AI agents. Your role is to evaluate conversations, ensuring that the agent's responses are plausible, factually accurate, and non-controversial based on the user's input. If the agent chooses to decline providing an answer, this should be regarded as a valid response."""
DEFAULT_USER_PROMPT = """Given some input_data and an output, determine if the output is a hallucination of the input_data.
DEFAULT_SYSTEM_PROMPT = """You are tasked with auditing AI agents. Your role is to evaluate conversations, ensuring that the agent's responses are plausible, factually accurate, and non-controversial based on the user's input. If the agent chooses to decline providing an answer, this should be regarded as a valid response."""
DEFAULT_USER_PROMPT = """Given some input_data and an output, determine if the output is a hallucination of the input_data.
## Input data
<input_data>
{input_data}
@@ -20,17 +18,22 @@
</output>
## Instructions
Think step by step before answering. Is the output factually and logically consistent with the input_data?
Think step by step before answering. Is the output factually and logically consistent with the input_data?
"""


class HallucinationResponse(BaseModel):
chain_of_thought: str = Field(description="Think step by step about whether the output is a hallucination of the dataset_row")
is_hallucination: bool = Field(description="Whether the model output is a hallucination of the dataset row")
chain_of_thought: str = Field(
description="Think step by step about whether the output is a hallucination of the dataset_row"
)
is_hallucination: bool = Field(
description="Whether the model output is a hallucination of the dataset row"
)


class HallucinationScorer(InstructorLLMScorer):
"""
Scorer that checks if the model output is a hallucination of the dataset row.
"""
"""Scorer that checks if the model output is a hallucination of the dataset row."""

system_prompt: str = DEFAULT_SYSTEM_PROMPT
user_prompt: str = DEFAULT_USER_PROMPT
model_id: str = OPENAI_DEFAULT_MODEL
@@ -39,13 +42,17 @@ class HallucinationScorer(InstructorLLMScorer):

@weave.op
def score(self, output: str, context: str) -> HallucinationResponse:

output = stringify(output)
response = create(
self.client,
messages=[
{"role": "system", "content": self.system_prompt},
{"role": "user", "content": self.user_prompt.format(input_data=context, output=output)},
{
"role": "user",
"content": self.user_prompt.format(
input_data=context, output=output
),
},
],
model=self.model_id,
response_model=HallucinationResponse,
@@ -57,28 +64,33 @@ def score(self, output: str, context: str) -> HallucinationResponse:

if __name__ == "__main__":
try:
import openai, os, weave, asyncio
import asyncio
import os

import openai

import weave

# weave.init("hallucination-scorer-2")

openai_client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
scorer = HallucinationScorer(client=openai_client, column_map={"text": "context"})
scorer = HallucinationScorer(
client=openai_client, column_map={"text": "context"}
)

output = "John favorite cheese is camembert"
dataset_row = {"text": "John doesn't like cheese"}
response = scorer.score(output, context=dataset_row)
print(response)

@weave.op
def model():
return "John favorite food is apples"

dataset = [{"text": "John doesn't like cheese"},
{"text": "John likes pizza"}]

dataset = [{"text": "John doesn't like cheese"}, {"text": "John likes pizza"}]

evaluation = weave.Evaluation(dataset=dataset, scorers=[scorer])
asyncio.run(evaluation.evaluate(model))

except Exception as e:
print(e)
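
The HallucinationResponse returned by scorer.score(...) in the example above is a Pydantic object, so its two fields can be read directly; a short sketch, reusing the names from that example:

# Illustrative only: inspecting the fields defined on HallucinationResponse.
response = scorer.score(output, context=dataset_row)
print(response.is_hallucination)  # bool: whether the output is judged a hallucination of the context
print(response.chain_of_thought)  # str: the model's step-by-step reasoning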

14 changes: 10 additions & 4 deletions weave/flow/scorer/llm_scorer.py
@@ -3,7 +3,8 @@
from pydantic import Field, field_validator

from weave.flow.scorer.base_scorer import Scorer
from weave.flow.scorer.llm_utils import instructor_client, _LLM_CLIENT_TYPES
from weave.flow.scorer.llm_utils import _LLM_CLIENT_TYPES, instructor_client


class LLMScorer(Scorer):
"""Score a model output using an LLM"""
@@ -21,20 +22,25 @@ def validate_client(cls, v):
)
return v


class InstructorLLMScorer(Scorer):
"""Score a model output using an LLM"""

client: Any = Field(
description="The LLM client to use, has to be instantiated with an api_key"
)
model_id: str = Field(description="The model to use")
temperature: float = Field(..., description="The temperature to use for the response")
max_tokens: int = Field(..., description="The maximum number of tokens in the response")
temperature: float = Field(
..., description="The temperature to use for the response"
)
max_tokens: int = Field(
..., description="The maximum number of tokens in the response"
)

@field_validator("client")
def validate_client(cls, v):
if not any(isinstance(v, client_type) for client_type in _LLM_CLIENT_TYPES):
raise ValueError(
f"Invalid client type. Expected one of {_LLM_CLIENT_TYPES}, got {type(v)}"
)
return instructor_client(v)
return instructor_client(v)
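
For orientation, a rough sketch of how InstructorLLMScorer's required fields and the create helper fit together, patterned on the HallucinationScorer above; the ToxicityScorer class, its prompt, and the example model id are hypothetical, not part of this commit.

# Hypothetical subclass, for illustration only.
from pydantic import BaseModel, Field

import weave
from weave.flow.scorer.llm_scorer import InstructorLLMScorer
from weave.flow.scorer.llm_utils import create


class ToxicityResponse(BaseModel):
    is_toxic: bool = Field(description="Whether the model output is toxic")


class ToxicityScorer(InstructorLLMScorer):
    @weave.op
    def score(self, output: str) -> ToxicityResponse:
        # The validator above has already wrapped self.client with instructor,
        # so create() accepts a response_model here.
        return create(
            self.client,
            messages=[
                {"role": "user", "content": f"Is the following output toxic? {output}"}
            ],
            model=self.model_id,
            response_model=ToxicityResponse,
        )


# temperature and max_tokens use Field(...), so they must be passed explicitly, e.g.:
# scorer = ToxicityScorer(
#     client=openai.OpenAI(), model_id="gpt-4o", temperature=0.0, max_tokens=256
# )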
40 changes: 29 additions & 11 deletions weave/flow/scorer/llm_utils.py
@@ -1,10 +1,10 @@
from typing import List, Union, TypeVar
from typing import List, TypeVar, Union

import instructor

from weave.trace.autopatch import autopatch

autopatch() # fix instructor tracing
autopatch()  # fix instructor tracing

# TODO: Gemini

@@ -44,6 +44,7 @@

_LLM_CLIENTS = TypeVar(Union[tuple(_LLM_CLIENT_TYPES)])


def instructor_client(client: _LLM_CLIENTS):
client_type = type(client).__name__.lower()
if "mistral" in client_type:
@@ -54,11 +55,15 @@ def instructor_client(client: _LLM_CLIENTS):
return instructor.from_anthropic(client)
else:
raise ValueError(f"Unsupported client type: {client_type}")



def create(client: _LLM_CLIENTS, *args, **kwargs):
return client.chat.completions.create(*args, **kwargs)

def embed(client: _LLM_CLIENTS, model_id: str, texts: Union[str, List[str]], **kwargs) -> List[List[float]]:

def embed(
client: _LLM_CLIENTS, model_id: str, texts: Union[str, List[str]], **kwargs
) -> List[List[float]]:
client_type = type(client).__name__.lower()
if "mistral" in client_type:
response = client.embeddings.create(model=model_id, inputs=texts, **kwargs)
@@ -69,6 +74,7 @@ def embed(client: _LLM_CLIENTS, model_id: str, texts: Union[str, List[str]], **k
else:
raise ValueError(f"Unsupported client type: {type(client).__name__.lower()}")


# Helper function for dynamic imports
def import_client(provider: str):
try:
@@ -90,13 +96,14 @@ def import_client(provider: str):

# Example usage:
if __name__ == "__main__":
import asyncio
import os

# Mistral example
MistralClient = import_client("mistral")
if MistralClient:
mistral_client = instructor_client(Mistral(api_key=os.environ.get("MISTRAL_API_KEY")))
mistral_client = instructor_client(
Mistral(api_key=os.environ.get("MISTRAL_API_KEY"))
)
mistral_response = mistral_client.chat.completions.create(
messages=[{"role": "user", "content": "What is the best French cheese?"}],
model=MISTRAL_DEFAULT_MODEL,
@@ -108,7 +115,9 @@ def import_client(provider: str):
# OpenAI example with system message
OpenAIClient = import_client("openai")
if OpenAIClient:
openai_client = instructor_client(OpenAIClient(api_key=os.environ.get("OPENAI_API_KEY")))
openai_client = instructor_client(
OpenAIClient(api_key=os.environ.get("OPENAI_API_KEY"))
)
openai_response = openai_client.chat.completions.create(
messages=[
{
@@ -129,7 +138,9 @@ def import_client(provider: str):
# Anthropic example with system message
AnthropicClient = import_client("anthropic")
if AnthropicClient:
anthropic_client = instructor_client(AnthropicClient(api_key=os.environ.get("ANTHROPIC_API_KEY")))
anthropic_client = instructor_client(
AnthropicClient(api_key=os.environ.get("ANTHROPIC_API_KEY"))
)
anthropic_response = anthropic_client.messages.create(
messages=[
{
@@ -147,11 +158,18 @@ def import_client(provider: str):
# Embedding example
if OpenAIClient:
openai_embed_client = OpenAIClient(api_key=os.environ.get("OPENAI_API_KEY"))
openai_embeddings = embed(openai_embed_client, OPENAI_DEFAULT_EMBEDDING_MODEL, ["Embed this sentence.", "As well as this one."])
openai_embeddings = embed(
openai_embed_client,
OPENAI_DEFAULT_EMBEDDING_MODEL,
["Embed this sentence.", "As well as this one."],
)
print("OpenAI embeddings:", openai_embeddings)

if MistralClient:
mistral_embed_client = MistralClient(api_key=os.environ.get("MISTRAL_API_KEY"))
mistral_embeddings = embed(mistral_embed_client, MISTRAL_DEFAULT_EMBEDDING_MODEL, ["Embed this sentence.", "As well as this one."])
mistral_embeddings = embed(
mistral_embed_client,
MISTRAL_DEFAULT_EMBEDDING_MODEL,
["Embed this sentence.", "As well as this one."],
)
print("Mistral embeddings:", mistral_embeddings)

16 changes: 11 additions & 5 deletions weave/flow/scorer/moderation_scorer.py
@@ -1,4 +1,5 @@
from typing import Any

from pydantic import field_validator

import weave
@@ -11,14 +12,17 @@ class OpenAIModerationScorer(LLMScorer):
@field_validator("client")
def validate_openai_client(cls, v):
try:
from openai import AsyncOpenAI, OpenAI # Ensure these are the correct imports
from openai import ( # Ensure these are the correct imports
AsyncOpenAI,
OpenAI,
)
except ImportError:
raise ValueError("Install openai to use this scorer")

if not isinstance(v, (OpenAI, AsyncOpenAI)):
raise ValueError("Moderation scoring only works with OpenAI or AsyncOpenAI")
return v

@weave.op
def score(self, output: Any) -> Any:
response = self.client.moderations.create(
@@ -34,8 +38,10 @@ def score(self, output: Any) -> Any:
import openai

client = openai.OpenAI()
scorer = OpenAIModerationScorer(client=client, model_id="omni-moderation-latest")
scorer = OpenAIModerationScorer(
client=client, model_id="omni-moderation-latest"
)
print(scorer.score("I should kill someone"))
except Exception as e:
print("Error:", e)
raise e
raise e