diff --git a/weave/scorers/hallucination_scorer.py b/weave/scorers/hallucination_scorer.py
index 0e73092209b..4644b4e3bfd 100644
--- a/weave/scorers/hallucination_scorer.py
+++ b/weave/scorers/hallucination_scorer.py
@@ -343,14 +343,12 @@ def score(self, query: str, context: str, output: str) -> dict:
             output=output,
         )
         if self.base_url:
-            output = self._score_via_api(messages)
-            output = output["data"]
-
+            res = self._score_via_api(messages)
+            res = res["data"]
         else:
             if self.use_hhem:
                 pairs = [(query + "\n\n" + context, output)]
                 pred = self.llm_model.predict(pairs)
-
                 score = pred.item()
                 return {
                     "flagged": score <= self.hhem_score_threshold,
@@ -372,7 +370,7 @@ def score(self, query: str, context: str, output: str) -> dict:
 
         with torch.no_grad():
             self.llm_model.eval()
-            output = self.llm_model.generate(
+            res = self.llm_model.generate(
                 inp_tokenized["input_ids"],
                 max_new_tokens=self.max_new_tokens,
                 attention_mask=inp_tokenized["attention_mask"],
@@ -388,7 +386,7 @@ def score(self, query: str, context: str, output: str) -> dict:
         false_token = 4245
 
         input_length = inp_tokenized["input_ids"].shape[1]
-        completion_tokens = output[0][input_length:].tolist()
+        completion_tokens = res[0][input_length:].tolist()
 
         is_hallucination = true_token in completion_tokens
         result = {
@@ -411,7 +409,7 @@ def score(self, query: str, context: str, output: str) -> dict:
             {
                 "completion": completion,
                 "completion_tokens": completion_tokens,
-                "total_tokens": len(output[0]),
+                "total_tokens": len(res[0]),
                 "total_completion_tokens": len(completion_tokens),
                 "scorer_worked": scorer_worked,
             }
diff --git a/weave/scorers/relevance_scorer.py b/weave/scorers/relevance_scorer.py
deleted file mode 100644
index 5b950076076..00000000000
--- a/weave/scorers/relevance_scorer.py
+++ /dev/null
@@ -1,374 +0,0 @@
-import json
-import os
-from typing import Any, Optional
-import numpy as np
-from pydantic import PrivateAttr
-
-import weave
-from weave.scorers.base_scorer import Scorer
-from weave.scorers.llm_utils import download_model, scorer_model_paths, set_device
-
-RELEVANCE_INSTRUCTIONS = """You are an expert evaluator assessing the relevance of LLM-generated outputs relative to their input context.
-Your goal is to provide a single relevance score and classification based on comprehensive analysis.
-Relevance measures how effectively a generated output addresses its input context across three core dimensions:
-
-1. **Semantic Alignment**
-   - How directly does the output address key input requirements?
-   - Does it maintain topical focus?
-   - Does it provide complete coverage of necessary information?
-   - Is unnecessary content avoided?
-
-2. **Structural Coherence**
-   - Does the output flow logically and show internal consistency?
-   - Is the presentation of information clear and organized?
-   - Is there a good balance between completeness and conciseness?
-
-3. **Contextual Integration**
-   - How well does the output use the provided context?
-   - Does the output align with the broader discourse?
-   - Is it consistent with background information?
-   - Does it fulfill task-specific requirements?
-
-## Evaluation Process
-
-1. Review all input context (instructions, prompts, documents, chat history)
-2. Identify core requirements and purpose
-3. Analyze the LLM output across all three dimensions
-4. Assign a single relevance score (1-5):
-   - 5: Exceptional relevance across all dimensions
-   - 4: Strong relevance with minor gaps
-   - 3: Adequate relevance with some issues
-   - 2: Significant relevance issues
-   - 1: Major relevance problems
-5. Classify as relevant (score ≥ 3.5) or not relevant (score < 3.5)
-
-## Task-Specific Considerations
-
-- **Summarization**: Focus on key information selection and density
-- **Q&A**: Emphasize answer accuracy and completeness
-- **Chat**: Consider conversation flow and context maintenance
-- **RAG**: Evaluate retrieved information integration
-
-## Output Format
-
-Provide evaluation results in the following JSON format:
-
-```json
-{
-    "relevance": [score from 1-5],
-    "relevant": [true/false]
-}
-```
-"""
-
-
-class OldRelevanceScorer(Scorer):
-    """
-    Use wandb/relevance_scorer to check if the model output is relevant.
-
-    Args:
-        model_name: The name of the relevance scorer model to use. Defaults to `wandb/relevance_scorer`.
-        device: The device to use for inference. Defaults to `None`, which will use `cuda` if available.
-    """
-
-    model_name_or_path: str = None
-    base_url: Optional[str] = None
-    device: str = None
-    _classifier: Any = PrivateAttr()
-    _tokenizer: Any = PrivateAttr()
-    _id2label: dict[int, str] = PrivateAttr()
-    _system_prompt: str = PrivateAttr()
-
-    def model_post_init(self, __context: Any) -> None:
-        try:
-            import torch
-            from transformers import pipeline
-        except ImportError:
-            print(
-                "The `transformers` package is required to use the ContextRelevanceScorer, please run `pip install transformers`"
-            )
-        if self.base_url:
-            print(f"Using external API at {self.base_url} for scoring.")
-            return # Skip local model loading if base_url is provided
-
-        """Initialize the coherence model and tokenizer."""
-        self.device = set_device(self.device)
-        if os.path.isdir(self.model_name_or_path):
-            self._local_model_path = self.model_name_or_path
-        else:
-            self._local_model_path = download_model(
-                scorer_model_paths["relevance_scorer"]
-            )
-
-        self._classifier = pipeline(
-            task="text-generation", model=self._local_model_path, device=self.device
-        )
-        self._tokenizer = self._classifier.tokenizer
-        self._id2label = {
-            0: "Unknown",
-            1: "Completely Irrelevant",
-            2: "Mostly Irrelevant",
-            3: "A Little Irrelevant",
-            4: "Mostly Relevant",
-            5: "Perfectly Relevant",
-        }
-        self._system_prompt = RELEVANCE_INSTRUCTIONS.strip()
-
-    @weave.op
-    def score_messages(self, messages: str) -> dict[str, Any]:
-        """Score a prompt response pair."""
-        generated_output = self._classifier(
-            messages,
-            max_new_tokens=20,
-            stop_strings=["}"],
-            tokenizer=self._tokenizer,
-            penalty_alpha=0.6,
-            top_k=4,
-        )
-        assistant_output = generated_output[0].get("generated_text", [])[-1]
-        classification = assistant_output.get("content", "")
-        try:
-            classification = json.loads(classification)
-            relevance = classification.get("relevance", 0)
-            relevance = int(relevance)
-            relevance = max(0, min(5, relevance))
-        except Exception:
-            relevance = 0
-
-        flagged = True
-        if relevance > 3:
-            flagged = False
-        return {
-            "flagged": flagged,
-            "extras": {
-                "relevance_id": relevance,
-                "relevance_label": self._id2label.get(relevance, "Unknown"),
-            },
-        }
-
-    def _format_messages(
-        self,
-        prompt: str,
-        completion: str,
-        context: Optional[list[str]],
-        chat_history: Optional[list[dict[str, str]]],
-    ) -> list[dict[str, str]]:
-        """Format the prompt for the model."""
-        chat_history = chat_history if isinstance(chat_history, list) else []
-        context = context if isinstance(context, list) else []
-        if context:
-            context = "\n".join(context).strip()
-            context = f"\n{context}\n"
-        else:
-            context = ""
-        prompt = f"{context}\n\n{prompt}".strip()
-
-        messages = chat_history + [{"role": "user", "content": prompt}]
-
-        messages = [
-            f"<|msg_start|>{message['role']}\n{message['content']}<|msg_end|>"
-            for message in messages
-        ]
-        messages = "\n".join(messages)
-
-        context = f"{messages}\n"
-        completion = f"{completion}\n"
-
-        context_and_completion = context + completion
-
-        return [
-            {"role": "system", "content": self._system_prompt},
-            {"role": "user", "content": context_and_completion},
-        ]
-
-    def _score_via_api(
-        self,
-        input: str,
-        output: str,
-        context: Optional[list[str]] = None,
-        chat_history: Optional[list[dict[str, str]]] = None,
-    ) -> dict[str, Any]:
-        import requests
-
-        response = requests.post(
-            self.base_url,
-            json={
-                "input": input,
-                "output": output,
-                "context": context,
-                "chat_history": chat_history,
-            },
-        )
-        response.raise_for_status()
-        return response.json()
-
-    @weave.op
-    def score(
-        self,
-        input: str,
-        output: str,
-        context: Optional[list[str]] = None,
-        chat_history: Optional[list[dict[str, str]]] = None,
-    ) -> dict[str, Any]:
-        if self.base_url:
-            return self._score_via_api(input, output, context, chat_history)
-        messages = self._format_messages(
-            prompt=input, completion=output, context=context, chat_history=chat_history
-        )
-        return self.score_messages(messages)
-
-class ContextRelevanceScorer(Scorer):
-    """
-    A scorer that evaluates the relevance of model outputs relative to input queries and context.
-
-    This scorer uses a fine-tuned model to analyze whether outputs are semantically relevant to their
-    input queries and context. It processes text in chunks and returns both binary relevance flags
-    and detailed span-level scores.
-
-    Args:
-        model_name_or_path (str): Path or name of model weights to load
-        base_url (Optional[str]): Optional URL for external API scoring instead of local model
-        device (str): Device to run model on, defaults to "cpu"
-        threshold (float): Threshold for relevance classification, defaults to 0.7
-        debug (bool): Enable debug logging, defaults to False
-
-    Returns:
-        dict: A dictionary containing:
-            - flagged (bool): Whether the output was flagged as irrelevant
-            - extras (dict): Contains:
-                - score (float): Overall relevance score
-                - all_spans (list, optional): If return_all_scores=True, includes list of relevant
-                  text spans and their scores
-
-    Example:
-        >>> scorer = ContextRelevanceScorer(model_name_or_path="path/to/model")
-        >>> result = scorer.score(
-        ...     query="What is the capital of France?",
-        ...     documents=["Paris is the capital of France."]
-        ... )
-        >>> print(result)
-        {
-            'flagged': False,
-            'extras': {
-                'score': 0.92,
-                'all_spans': [ # Only included if return_all_scores=True
-                    {'text': 'Paris is the capital of France', 'scores': 0.92}
-                ]
-            }
-        }
-    """
-    model_name_or_path: str = None
-    base_url: Optional[str] = None
-    device: str = "cpu"
-    threshold: float = 0.7
-    _model: Any = PrivateAttr()
-    _tokenizer: Any = PrivateAttr()
-
-    def model_post_init(self, __context: Any) -> None:
-        try:
-            import torch
-            from transformers import AutoModelForTokenClassification, AutoTokenizer
-        except ImportError:
-            print(
-                "The `transformers` and `torch` packages are required to use the ContextRelevanceScorer, please run `pip install transformers torch`"
-            )
-        """Initialize the model, tokenizer and device after pydantic initialization."""
-        if os.path.isdir(self.model_name_or_path):
-            self._local_model_path = self.model_name_or_path
-        else:
-            self._local_model_path = download_model(
-                scorer_model_paths["relevance_scorer"]
-            )
-        assert self._local_model_path, "Model path not found"
-        self._model = AutoModelForTokenClassification.from_pretrained(
-            self._local_model_path, device_map=self.device
-        )
-        self._tokenizer = AutoTokenizer.from_pretrained(self._local_model_path)
-        self._model.eval()
-        self.device = set_device(self.device)
-
-    def _score_document(
-        self,
-        query: str,
-        document: str,
-        threshold: float) -> tuple[list[dict[str, Any]], int, int]:
-        """Score a single document."""
-        import torch
-        with torch.no_grad():
-            input_text = query + f" {self._tokenizer.sep_token} " + document
-            model_inputs = self._tokenizer(
-                input_text,
-                truncation=True,
-                padding=False,
-                return_tensors="pt",
-                return_special_tokens_mask=True
-            )
-            model_inputs = {k: v.to(self.device) for k, v in model_inputs.items()}
-
-            special_tokens_mask = model_inputs.pop("special_tokens_mask")
-            combined_mask = ~((model_inputs["input_ids"] == 2).bool() | special_tokens_mask.bool()).cpu().numpy().flatten()
-            # we should mask the query up to the sep token,
-            # on the combined mask we have to search for the first False
-            # TODO: Check that this is now wrong
-            false_indices = np.where(~combined_mask)[0]
-            start = false_indices[0]
-            end = false_indices[1]
-            combined_mask[start:end] = False
-            results = self._model(**model_inputs)
-
-            logits = results.logits[0].detach()
-            probabilities = torch.nn.functional.softmax(logits, dim=-1).detach()
-
-            pred_mask = (probabilities[:,1] > threshold).cpu().numpy().astype(int).flatten()
-            label_mask = (pred_mask & combined_mask)
-            positive_probs = probabilities[:, 1].cpu().numpy()
-
-            transitions = np.diff(np.concatenate([[0], label_mask, [0]]))
-            starts = np.where(transitions == 1)[0]
-            ends = np.where(transitions == -1)[0]
-
-            spans_with_probs = []
-            token_ids = model_inputs["input_ids"].cpu().numpy()[0]
-
-            for start, end in zip(starts, ends):
-                span_text = self._tokenizer.decode(token_ids[start:end])
-                span_prob = positive_probs[start:end].mean()
-                spans_with_probs.append({
-                    "text": span_text,
-                    "score": float(span_prob)
-                })
-                print(span_text)
-                print("-"*100)
-            print("*"*100)
-        return spans_with_probs, int(label_mask.sum()), int(len(label_mask))
-
-    @weave.op
-    def score(
-        self,
-        output: str,
-        query: str,
-        documents: list[str],
-        return_all_scores: bool = False
-    ) -> tuple[list[dict[str, Any]], float]:
-        """Score multiple documents and compute weighted average relevance."""
-        all_spans = []
-        total_weighted_score = 0.0
-        total_length = 0
-
-        for doc in documents:
-            spans, relevant_tokens, total_tokens = self._score_document(query, doc, self.threshold)
-
-            all_spans.extend(spans)
-
-            if total_tokens > 0:
-                doc_score = relevant_tokens / total_tokens
-                doc_weight = total_tokens
-                total_weighted_score += doc_score * doc_weight
-                total_length += total_tokens
-
-        final_score = total_weighted_score / total_length if total_length > 0 else 0.0
-        output = {"flagged": final_score > self.threshold}
-        output['extras'] = {'score': final_score}
-        if return_all_scores:
-            output['extras']['all_spans'] = all_spans
-        return output
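For reviewers skimming the first hunk: the `output` → `res` rename in `hallucination_scorer.py` appears to stop the API/generation result from rebinding the `output` parameter, which is the completion being evaluated. Below is a minimal, self-contained sketch of that shadowing hazard; `judge()` and the field names are hypothetical stand-ins, not the weave API.

```python
def judge(query: str, context: str, output: str) -> bool:
    # Toy stand-in for the real judgement model: flag the completion
    # if it shares no words with the supplied context.
    return not (set(output.lower().split()) & set(context.lower().split()))


def score(query: str, context: str, output: str) -> dict:
    # Rebinding the parameter here (`output = judge(...)`) would discard the
    # completion under test; binding the result to a separate name keeps the
    # original text available for the returned metadata.
    res = judge(query, context, output)
    return {"flagged": res, "scored_output": output}


if __name__ == "__main__":
    print(score("What is the capital of France?",
                "Paris is the capital of France.",
                "Rome"))  # -> {'flagged': True, 'scored_output': 'Rome'}
```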