diff --git a/.github/workflows/cla.yaml b/.github/workflows/cla.yaml
index 8e0275169fb..f211f20e3d2 100644
--- a/.github/workflows/cla.yaml
+++ b/.github/workflows/cla.yaml
@@ -27,4 +27,4 @@ jobs:
# branch should not be protected
branch: 'cla'
# cannot use teams due to: https://github.com/contributor-assistant/github-action/issues/100
- allowlist: actions-user, altay, dannygoldstein, davidwallacejackson, jamie-rasmussen, jlzhao27, jo-fang, jwlee64, laxels, morganmcg1, nickpenaranda, scottire, shawnlewis, staceysv, tssweeney, vanpelt, vwrj, wandbmachine
+ allowlist: actions-user, altay, bdytx5, dannygoldstein, davidwallacejackson, jamie-rasmussen, jlzhao27, jo-fang, jwlee64, laxels, morganmcg1, nickpenaranda, scottire, shawnlewis, staceysv, tssweeney, vanpelt, vwrj, wandbmachine
diff --git a/docs/docs/guides/cookbooks/summarization/.gitignore b/docs/docs/guides/cookbooks/summarization/.gitignore
new file mode 100644
index 00000000000..8b137891791
--- /dev/null
+++ b/docs/docs/guides/cookbooks/summarization/.gitignore
@@ -0,0 +1 @@
+
diff --git a/docs/docs/guides/tracking/feedback.md b/docs/docs/guides/tracking/feedback.md
index 645c1399f17..ffbff6532c3 100644
--- a/docs/docs/guides/tracking/feedback.md
+++ b/docs/docs/guides/tracking/feedback.md
@@ -71,6 +71,40 @@ call.feedback.add_note("this is a note")
call.feedback.add("correctness", { "value": 5 })
```
+### Retrieving the call ID
+
+For scenarios where you need to add feedback immediately after a call, you can retrieve the call ID programmatically during or after the call execution. Here is how to get the ID of the current call from within the operation:
+
+```python
+import weave
+weave.init("uuid")
+
+@weave.op()
+def simple_operation(input_value):
+ # Perform some simple operation
+ output = f"Processed {input_value}"
+ # Get the current call ID
+ current_call = weave.get_current_call()
+ call_id = current_call.id
+ return output, call_id
+```
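+
+Calling this op then returns both the output and the call ID, which you can pass to whatever records your feedback (a quick sketch):
+
+```python
+output, call_id = simple_operation("example input")
+```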
+
+Additionally, you can use the `call()` method to execute the operation and retrieve the call ID after the function has executed:
+
+```python
+import weave
+weave.init("uuid")
+
+@weave.op()
+def simple_operation(input_value):
+ return f"Processed {input_value}"
+
+# Execute the operation and retrieve the result and call ID
+result, call = simple_operation.call("example input")
+call_id = call.id
+```
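+
+Because the `call` object is returned directly, you can attach feedback to it right away using the same APIs shown earlier (a minimal sketch; the note text and score are illustrative):
+
+```python
+call.feedback.add_note("looks correct")
+call.feedback.add("correctness", { "value": 5 })
+```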
+
### Querying feedback on a call
```python
diff --git a/docs/docs/reference/gen_notebooks/intro_notebook.md b/docs/docs/reference/gen_notebooks/01-intro_notebook.md
similarity index 100%
rename from docs/docs/reference/gen_notebooks/intro_notebook.md
rename to docs/docs/reference/gen_notebooks/01-intro_notebook.md
diff --git a/docs/docs/reference/gen_notebooks/chain_of_density.md b/docs/docs/reference/gen_notebooks/chain_of_density.md
new file mode 100644
index 00000000000..caa9e6da805
--- /dev/null
+++ b/docs/docs/reference/gen_notebooks/chain_of_density.md
@@ -0,0 +1,381 @@
+---
+title: Chain of Density Summarization
+---
+
+
+:::tip[This is a notebook]
+
+
+
+:::
+
+
+
+
+
+
+# Summarization using Chain of Density
+
+Summarizing complex technical documents while preserving crucial details is a challenging task. The Chain of Density (CoD) summarization technique offers a solution by iteratively refining summaries to be more concise and information-dense. This guide demonstrates how to implement CoD using Weave for tracking and evaluating the application.
+
+## What is Chain of Density Summarization?
+
+[![arXiv](https://img.shields.io/badge/arXiv-2309.04269-b31b1b.svg)](https://arxiv.org/abs/2309.04269)
+
+Chain of Density (CoD) is an iterative summarization technique that produces increasingly concise and information-dense summaries. It works by:
+
+1. Starting with an initial summary
+2. Iteratively refining the summary, making it more concise while preserving key information
+3. Increasing the density of entities and technical details with each iteration
+
+This approach is particularly useful for summarizing scientific papers or technical documents where preserving detailed information is crucial.
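+
+In pseudocode, the core loop is simply the following (a minimal sketch; `summarize_step` is a placeholder for the LLM call we implement later in this guide):
+
+```python
+def chain_of_density(document: str, instruction: str, iterations: int = 3) -> str:
+    summary = ""  # start from an empty summary
+    for i in range(1, iterations + 1):
+        # each pass rewrites the summary to be shorter and more entity-dense
+        summary = summarize_step(document, instruction, summary, iteration=i)
+    return summary
+```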
+
+## Why use Weave?
+
+In this tutorial, we'll use Weave to implement and evaluate a Chain of Density summarization pipeline for ArXiv papers. You'll learn how to:
+
+1. **Track your LLM pipeline**: Use Weave to automatically log inputs, outputs, and intermediate steps of your summarization process.
+2. **Evaluate LLM outputs**: Create rigorous, apples-to-apples evaluations of your summaries using Weave's built-in tools.
+3. **Build composable operations**: Combine and reuse Weave operations across different parts of your summarization pipeline.
+4. **Integrate seamlessly**: Add Weave to your existing Python code with minimal overhead.
+
+By the end of this tutorial, you'll have created a CoD summarization pipeline that leverages Weave's capabilities for model serving, evaluation, and result tracking.
+
+## Set up the environment
+
+First, let's set up our environment and import the necessary libraries:
+
+
+```python
+!pip install -qU anthropic weave pydantic requests PyPDF2 set-env-colab-kaggle-dotenv
+```
+
+>To get an Anthropic API key:
+> 1. Sign up for an account at https://www.anthropic.com
+> 2. Navigate to the API section in your account settings
+> 3. Generate a new API key
+> 4. Store the API key securely in your .env file
+
+
+```python
+import io
+import os
+from datetime import datetime, timezone
+
+import anthropic
+import requests
+from pydantic import BaseModel
+from PyPDF2 import PdfReader
+from set_env import set_env
+
+import weave
+
+set_env("WANDB_API_KEY")
+set_env("ANTHROPIC_API_KEY")
+
+weave.init("summarization-chain-of-density-cookbook")
+anthropic_client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
+```
+
+We're using Weave to track our experiment and Anthropic's Claude model for text generation. The `weave.init()` call sets up a new Weave project for our summarization task.
+
+## Define the ArxivPaper model
+
+We'll create a simple `ArxivPaper` class to represent our data:
+
+
+```python
+# Define ArxivPaper model
+class ArxivPaper(BaseModel):
+ entry_id: str
+ updated: datetime
+ published: datetime
+ title: str
+ authors: list[str]
+ summary: str
+ pdf_url: str
+
+
+# Create sample ArxivPaper
+arxiv_paper = ArxivPaper(
+ entry_id="http://arxiv.org/abs/2406.04744v1",
+ updated=datetime(2024, 6, 7, 8, 43, 7, tzinfo=timezone.utc),
+ published=datetime(2024, 6, 7, 8, 43, 7, tzinfo=timezone.utc),
+ title="CRAG -- Comprehensive RAG Benchmark",
+ authors=["Xiao Yang", "Kai Sun", "Hao Xin"], # Truncated for brevity
+ summary="Retrieval-Augmented Generation (RAG) has recently emerged as a promising solution...", # Truncated
+ pdf_url="https://arxiv.org/pdf/2406.04744",
+)
+```
+
+This class encapsulates the metadata and content of an ArXiv paper, which will be the input to our summarization pipeline.
+
+## Load PDF content
+
+To work with the full paper content, we'll add a function to load and extract text from PDFs:
+
+
+```python
+@weave.op()
+def load_pdf(pdf_url: str) -> str:
+ # Download the PDF
+ response = requests.get(pdf_url)
+ pdf_file = io.BytesIO(response.content)
+
+ # Read the PDF
+ pdf_reader = PdfReader(pdf_file)
+
+ # Extract text from all pages
+ text = ""
+ for page in pdf_reader.pages:
+ text += page.extract_text()
+
+ return text
+```
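+
+For example, the sample paper defined above can be loaded with (a quick sketch):
+
+```python
+paper_text = load_pdf(arxiv_paper.pdf_url)
+print(f"Loaded {len(paper_text)} characters")
+```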
+
+## Implement Chain of Density summarization
+
+Now, let's implement the core CoD summarization logic using Weave operations:
+
+
+```python
+# Chain of Density Summarization
+@weave.op()
+def summarize_current_summary(
+ document: str,
+ instruction: str,
+ current_summary: str = "",
+ iteration: int = 1,
+ model: str = "claude-3-sonnet-20240229",
+):
+ prompt = f"""
+ Document: {document}
+ Current summary: {current_summary}
+ Instruction to focus on: {instruction}
+ Iteration: {iteration}
+
+ Generate an increasingly concise, entity-dense, and highly technical summary from the provided document that specifically addresses the given instruction.
+ """
+ response = anthropic_client.messages.create(
+ model=model, max_tokens=4096, messages=[{"role": "user", "content": prompt}]
+ )
+ return response.content[0].text
+
+
+@weave.op()
+def iterative_density_summarization(
+ document: str,
+ instruction: str,
+ current_summary: str,
+ density_iterations: int,
+ model: str = "claude-3-sonnet-20240229",
+):
+ iteration_summaries = []
+ for iteration in range(1, density_iterations + 1):
+ current_summary = summarize_current_summary(
+ document, instruction, current_summary, iteration, model
+ )
+ iteration_summaries.append(current_summary)
+ return current_summary, iteration_summaries
+
+
+@weave.op()
+def final_summary(
+ instruction: str, current_summary: str, model: str = "claude-3-sonnet-20240229"
+):
+ prompt = f"""
+ Given this summary: {current_summary}
+ And this instruction to focus on: {instruction}
+ Create an extremely dense, final summary that captures all key technical information in the most concise form possible, while specifically addressing the given instruction.
+ """
+ return (
+ anthropic_client.messages.create(
+ model=model, max_tokens=4096, messages=[{"role": "user", "content": prompt}]
+ )
+ .content[0]
+ .text
+ )
+
+
+@weave.op()
+def chain_of_density_summarization(
+ document: str,
+ instruction: str,
+ current_summary: str = "",
+ model: str = "claude-3-sonnet-20240229",
+ density_iterations: int = 2,
+):
+ current_summary, iteration_summaries = iterative_density_summarization(
+ document, instruction, current_summary, density_iterations, model
+ )
+ final_summary_text = final_summary(instruction, current_summary, model)
+ return {
+ "final_summary": final_summary_text,
+ "accumulated_summary": current_summary,
+ "iteration_summaries": iteration_summaries,
+ }
+```
+
+Here's what each function does:
+
+- `summarize_current_summary`: Generates a single summary iteration based on the current state.
+- `iterative_density_summarization`: Applies the CoD technique by calling `summarize_current_summary` multiple times.
+- `chain_of_density_summarization`: Orchestrates the entire summarization process and returns the results.
+
+By using `@weave.op()` decorators, we ensure that Weave tracks the inputs, outputs, and execution of these functions.
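+
+To run the chain end to end on the sample paper, the operations compose directly (a sketch; the instruction string is illustrative):
+
+```python
+text = load_pdf(arxiv_paper.pdf_url)
+result = chain_of_density_summarization(
+    text,
+    "What was the approach to experimenting with different data mixtures?",
+    density_iterations=2,
+)
+print(result["final_summary"])
+```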
+
+
+## Create a Weave Model
+
+Now, let's wrap our summarization pipeline in a Weave Model:
+
+
+```python
+# Weave Model
+class ArxivChainOfDensityPipeline(weave.Model):
+ model: str = "claude-3-sonnet-20240229"
+ density_iterations: int = 3
+
+ @weave.op()
+ def predict(self, paper: ArxivPaper, instruction: str) -> dict:
+ text = load_pdf(paper["pdf_url"])
+ result = chain_of_density_summarization(
+ text,
+ instruction,
+ model=self.model,
+ density_iterations=self.density_iterations,
+ )
+ return result
+```
+
+This `ArxivChainOfDensityPipeline` class encapsulates our summarization logic as a Weave Model, providing several key benefits:
+
+1. Automatic experiment tracking: Weave captures inputs, outputs, and parameters for each run of the model.
+2. Versioning: Changes to the model's attributes or code are automatically versioned, creating a clear history of how your summarization pipeline evolves over time.
+3. Reproducibility: The versioning and tracking make it easy to reproduce any previous result or configuration of your summarization pipeline.
+4. Hyperparameter management: Model attributes (like `model` and `density_iterations`) are clearly defined and tracked across different runs, facilitating experimentation.
+5. Integration with Weave ecosystem: Using `weave.Model` allows seamless integration with other Weave tools, such as evaluations and serving capabilities.
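+
+For example, because hyperparameters such as `density_iterations` are ordinary model attributes, you can create pipeline variants without changing any code (a minimal sketch):
+
+```python
+fast_pipeline = ArxivChainOfDensityPipeline(density_iterations=1)
+thorough_pipeline = ArxivChainOfDensityPipeline(density_iterations=5)
+```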
+
+## Implement evaluation metrics
+
+To assess the quality of our summaries, we'll implement simple evaluation metrics:
+
+
+```python
+import json
+
+
+@weave.op()
+def evaluate_summary(
+ summary: str, instruction: str, model: str = "claude-3-sonnet-20240229"
+) -> dict:
+ prompt = f"""
+ Summary: {summary}
+ Instruction: {instruction}
+
+ Evaluate the summary based on the following criteria:
+ 1. Relevance (1-5): How well does the summary address the given instruction?
+ 2. Conciseness (1-5): How concise is the summary while retaining key information?
+ 3. Technical Accuracy (1-5): How accurately does the summary convey technical details?
+
+ Your response MUST be in the following JSON format:
+ {{
+ "relevance": {{
+ "score": ,
+ "explanation": ""
+ }},
+ "conciseness": {{
+ "score": ,
+ "explanation": ""
+ }},
+ "technical_accuracy": {{
+ "score": ,
+ "explanation": ""
+ }}
+ }}
+
+ Ensure that the scores are integers between 1 and 5, and that the explanations are concise.
+ """
+ response = anthropic_client.messages.create(
+ model=model, max_tokens=1000, messages=[{"role": "user", "content": prompt}]
+ )
+ print(response.content[0].text)
+
+ eval_dict = json.loads(response.content[0].text)
+
+ return {
+ "relevance": eval_dict["relevance"]["score"],
+ "conciseness": eval_dict["conciseness"]["score"],
+ "technical_accuracy": eval_dict["technical_accuracy"]["score"],
+ "average_score": sum(eval_dict[k]["score"] for k in eval_dict) / 3,
+ "evaluation_text": response.content[0].text,
+ }
+```
+
+These evaluation functions use the Claude model to assess the quality of the generated summaries based on relevance, conciseness, and technical accuracy.
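+
+As a quick sanity check, the scorer can be called directly on any summary string (a sketch; the summary and instruction below are illustrative placeholders):
+
+```python
+scores = evaluate_summary(
+    summary="CRAG is a benchmark for retrieval-augmented generation ...",
+    instruction="What was the approach to experimenting with different data mixtures?",
+)
+print(scores["average_score"])
+```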
+
+## Create a Weave Dataset and run evaluation
+
+To evaluate our pipeline, we'll create a Weave Dataset and run an evaluation:
+
+
+```python
+# Create a Weave Dataset
+dataset = weave.Dataset(
+ name="arxiv_papers",
+ rows=[
+ {
+ "paper": arxiv_paper,
+ "instruction": "What was the approach to experimenting with different data mixtures?",
+ },
+ ],
+)
+
+weave.publish(dataset)
+```
+
+For our evaluation, we'll use an LLM-as-a-judge approach. This technique involves using a language model to assess the quality of outputs generated by another model or system. It leverages the LLM's understanding and reasoning capabilities to provide nuanced evaluations, especially for tasks where traditional metrics may fall short.
+
+[![arXiv](https://img.shields.io/badge/arXiv-2306.05685-b31b1b.svg)](https://arxiv.org/abs/2306.05685)
+
+
+```python
+# Define the scorer function
+@weave.op()
+def quality_scorer(instruction: str, model_output: dict) -> dict:
+ result = evaluate_summary(model_output["final_summary"], instruction)
+ return result
+```
+
+
+```python
+# Run evaluation
+evaluation = weave.Evaluation(dataset=dataset, scorers=[quality_scorer])
+arxiv_chain_of_density_pipeline = ArxivChainOfDensityPipeline()
+results = await evaluation.evaluate(arxiv_chain_of_density_pipeline)
+```
+
+This code creates a dataset with our sample ArXiv paper, defines a quality scorer, and runs an evaluation of our summarization pipeline.
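+
+If you are running this outside a notebook (for example, from a Python script), wrap the coroutine with `asyncio.run` instead of `await` (a sketch):
+
+```python
+import asyncio
+
+results = asyncio.run(evaluation.evaluate(arxiv_chain_of_density_pipeline))
+```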
+
+## Conclusion
+
+In this example, we've demonstrated how to implement a Chain of Density summarization pipeline for ArXiv papers using Weave. We've shown how to:
+
+1. Create Weave operations for each step of the summarization process
+2. Wrap the pipeline in a Weave Model for easy tracking and evaluation
+3. Implement custom evaluation metrics using Weave operations
+4. Create a dataset and run an evaluation of the pipeline
+
+Weave's seamless integration allows us to track inputs, outputs, and intermediate steps throughout the summarization process, making it easier to debug, optimize, and evaluate our LLM application.
+You can extend this example to handle larger datasets, implement more sophisticated evaluation metrics, or integrate with other LLM workflows.
+
+
+ View Full Report on W&B
+
diff --git a/docs/docs/reference/gen_notebooks/dspy_prompt_optimization.md b/docs/docs/reference/gen_notebooks/dspy_prompt_optimization.md
new file mode 100644
index 00000000000..d39ccbaa14e
--- /dev/null
+++ b/docs/docs/reference/gen_notebooks/dspy_prompt_optimization.md
@@ -0,0 +1,282 @@
+---
+title: Prompt Optimization
+---
+
+
+:::tip[This is a notebook]
+
+
+
+:::
+
+
+
+
+
+
+# Optimizing LLM Workflows Using DSPy and Weave
+
+The [BIG-bench (Beyond the Imitation Game Benchmark)](https://github.com/google/BIG-bench) is a collaborative benchmark of more than 200 tasks intended to probe large language models and extrapolate their future capabilities. The [BIG-Bench Hard (BBH)](https://github.com/suzgunmirac/BIG-Bench-Hard) is a suite of the 23 most challenging BIG-Bench tasks, which can be quite difficult to solve with the current generation of language models.
+
+This tutorial demonstrates how we can improve the performance of our LLM workflow for the **causal judgement task** from the BIG-Bench Hard benchmark and evaluate our prompting strategies. We will use [DSPy](https://dspy-docs.vercel.app/) to implement our LLM workflow and optimize our prompting strategy. We will also use [Weave](../../introduction.md) to track our LLM workflow and evaluate our prompting strategies.
+
+## Installing the Dependencies
+
+We need the following libraries for this tutorial:
+
+- [DSPy](https://dspy-docs.vercel.app/) for building the LLM workflow and optimizing it.
+- [Weave](../../introduction.md) to track our LLM workflow and evaluate our prompting strategies.
+- [datasets](https://huggingface.co/docs/datasets/index) to access the Big-Bench Hard dataset from HuggingFace Hub.
+
+
+```python
+!pip install -qU dspy-ai weave datasets
+```
+
+Since we'll be using [OpenAI API](https://openai.com/index/openai-api/) as the LLM Vendor, we will also need an OpenAI API key. You can [sign up](https://platform.openai.com/signup) on the OpenAI platform to get your own API key.
+
+
+```python
+import os
+from getpass import getpass
+
+api_key = getpass("Enter your OpenAI API key: ")
+os.environ["OPENAI_API_KEY"] = api_key
+```
+
+## Enable Tracking using Weave
+
+Weave is currently integrated with DSPy, and including [`weave.init`](../../reference/python-sdk/weave/index.md) at the start of our code lets us automatically trace our DSPy functions which can be explored in the Weave UI. Check out the [Weave integration docs for DSPy](../../guides/integrations/dspy.md) to learn more.
+
+
+
+```python
+import weave
+
+weave.init(project_name="dspy-bigbench-hard")
+```
+
+In this tutorial, we use a metadata class inherited from [`weave.Object`](../../guides/tracking/objects.md) to manage our metadata.
+
+
+```python
+class Metadata(weave.Object):
+ dataset_address: str = "maveriq/bigbenchhard"
+ big_bench_hard_task: str = "causal_judgement"
+ num_train_examples: int = 50
+ openai_model: str = "gpt-3.5-turbo"
+ openai_max_tokens: int = 2048
+ max_bootstrapped_demos: int = 8
+ max_labeled_demos: int = 8
+
+
+metadata = Metadata()
+```
+
+:::tip Object Versioning
+The `Metadata` objects are automatically versioned and traced when functions consuming them are traced
+:::
+
+## Load the BIG-Bench Hard Dataset
+
+We will load this dataset from HuggingFace Hub, split it into training and validation sets, and [publish](../../guides/core-types/datasets.md) them on Weave. This will let us version the datasets and also use [`weave.Evaluation`](../../guides/core-types/evaluations.md) to evaluate our prompting strategy.
+
+
+```python
+import dspy
+from datasets import load_dataset
+
+
+@weave.op()
+def get_dataset(metadata: Metadata):
+    # load the BIG-Bench Hard dataset corresponding to the task from the Hugging Face Hub
+ dataset = load_dataset(metadata.dataset_address, metadata.big_bench_hard_task)[
+ "train"
+ ]
+
+ # create the training and validation datasets
+ rows = [{"question": data["input"], "answer": data["target"]} for data in dataset]
+ train_rows = rows[0 : metadata.num_train_examples]
+ val_rows = rows[metadata.num_train_examples :]
+
+ # create the training and validation examples consisting of `dspy.Example` objects
+ dspy_train_examples = [
+ dspy.Example(row).with_inputs("question") for row in train_rows
+ ]
+ dspy_val_examples = [dspy.Example(row).with_inputs("question") for row in val_rows]
+
+    # publish the datasets to Weave; this lets us version the data and use it for evaluation
+ weave.publish(
+ weave.Dataset(
+ name=f"bigbenchhard_{metadata.big_bench_hard_task}_train", rows=train_rows
+ )
+ )
+ weave.publish(
+ weave.Dataset(
+ name=f"bigbenchhard_{metadata.big_bench_hard_task}_val", rows=val_rows
+ )
+ )
+
+ return dspy_train_examples, dspy_val_examples
+
+
+dspy_train_examples, dspy_val_examples = get_dataset(metadata)
+```
+
+## The DSPy Program
+
+[DSPy](https://dspy-docs.vercel.app) is a framework that pushes building new LM pipelines away from manipulating free-form strings and closer to programming (composing modular operators to build text transformation graphs) where a compiler automatically generates optimized LM invocation strategies and prompts from a program.
+
+We will use the [`dspy.OpenAI`](https://dspy-docs.vercel.app/api/language_model_clients/OpenAI) abstraction to make LLM calls to [GPT3.5 Turbo](https://platform.openai.com/docs/models/gpt-3-5-turbo).
+
+
+```python
+system_prompt = """
+You are an expert in the field of causal reasoning. You are to analyze a given question carefully and answer in `Yes` or `No`.
+You should also provide a detailed explanation justifying your answer.
+"""
+
+llm = dspy.OpenAI(model="gpt-3.5-turbo", system_prompt=system_prompt)
+dspy.settings.configure(lm=llm)
+```
+
+### Writing the Causal Reasoning Signature
+
+A [signature](https://dspy-docs.vercel.app/docs/building-blocks/signatures) is a declarative specification of the input/output behavior of a [DSPy module](https://dspy-docs.vercel.app/docs/building-blocks/modules). Modules are task-adaptive components—akin to neural network layers—that abstract a particular text transformation.
+
+
+```python
+from pydantic import BaseModel, Field
+
+
+class Input(BaseModel):
+ query: str = Field(description="The question to be answered")
+
+
+class Output(BaseModel):
+ answer: str = Field(description="The answer for the question")
+ confidence: float = Field(
+ ge=0, le=1, description="The confidence score for the answer"
+ )
+ explanation: str = Field(description="The explanation for the answer")
+
+
+class QuestionAnswerSignature(dspy.Signature):
+ input: Input = dspy.InputField()
+ output: Output = dspy.OutputField()
+
+
+class CausalReasoningModule(dspy.Module):
+ def __init__(self):
+ self.prog = dspy.TypedPredictor(QuestionAnswerSignature)
+
+ @weave.op()
+ def forward(self, question) -> dict:
+ return self.prog(input=Input(query=question)).output.dict()
+```
+
+Let's test our LLM workflow, i.e., the `CausalReasoningModule` on an example from the causal reasoning subset of Big-Bench Hard.
+
+
+```python
+import rich
+
+baseline_module = CausalReasoningModule()
+
+prediction = baseline_module(dspy_train_examples[0]["question"])
+rich.print(prediction)
+```
+
+## Evaluating our DSPy Program
+
+Now that we have a baseline prompting strategy, let's evaluate it on our validation set using [`weave.Evaluation`](../../guides/core-types/evaluations.md) on a simple metric that matches the predicted answer with the ground truth. Weave will take each example, pass it through your application and score the output on multiple custom scoring functions. By doing this, you'll have a view of the performance of your application, and a rich UI to drill into individual outputs and scores.
+
+First, we need to create a simple Weave evaluation scoring function that tells whether the answer in the baseline module's output matches the ground truth answer. Scoring functions need to have a `model_output` keyword argument; the other arguments are user defined and are taken from the dataset examples, and only the keys matching those argument names are passed to the scorer.
+
+
+```python
+@weave.op()
+def weave_evaluation_scorer(answer: str, model_output: Output) -> dict:
+ return {"match": int(answer.lower() == model_output["answer"].lower())}
+```
+
+Next, we can simply define the evaluation and run it.
+
+
+```python
+validation_dataset = weave.ref(
+ f"bigbenchhard_{metadata.big_bench_hard_task}_val:v0"
+).get()
+
+evaluation = weave.Evaluation(
+ name="baseline_causal_reasoning_module",
+ dataset=validation_dataset,
+ scorers=[weave_evaluation_scorer],
+)
+
+await evaluation.evaluate(baseline_module.forward)
+```
+
+:::note
+If you're running from a python script, you can use the following code to run the evaluation:
+
+```python
+import asyncio
+asyncio.run(evaluation.evaluate(baseline_module.forward))
+```
+:::
+
+:::warning
+Running the evaluation on the causal reasoning dataset will cost approximately $0.24 in OpenAI credits.
+:::
+
+## Optimizing our DSPy Program
+
+Now that we have a baseline DSPy program, let us try to improve its performance for causal reasoning using a [DSPy teleprompter](https://dspy-docs.vercel.app/docs/building-blocks/optimizers) that can tune the parameters of a DSPy program to maximize the specified metrics. In this tutorial, we use the [BootstrapFewShot](https://dspy-docs.vercel.app/api/category/optimizers) teleprompter.
+
+
+```python
+from dspy.teleprompt import BootstrapFewShot
+
+
+@weave.op()
+def get_optimized_program(model: dspy.Module, metadata: Metadata) -> dspy.Module:
+ @weave.op()
+ def dspy_evaluation_metric(true, prediction, trace=None):
+ return prediction["answer"].lower() == true.answer.lower()
+
+ teleprompter = BootstrapFewShot(
+ metric=dspy_evaluation_metric,
+ max_bootstrapped_demos=metadata.max_bootstrapped_demos,
+ max_labeled_demos=metadata.max_labeled_demos,
+ )
+ return teleprompter.compile(model, trainset=dspy_train_examples)
+
+
+optimized_module = get_optimized_program(baseline_module, metadata)
+```
+
+:::warning
+Running the evaluation on the causal reasoning dataset will cost approximately $0.04 in OpenAI credits.
+:::
+
+Now that we have our optimized program (the optimized prompting strategy), let's evaluate it once again on our validation set and compare it with our baseline DSPy program.
+
+
+```python
+evaluation = weave.Evaluation(
+ name="optimized_causal_reasoning_module",
+ dataset=validation_dataset,
+ scorers=[weave_evaluation_scorer],
+)
+
+await evaluation.evaluate(optimized_module.forward)
+```
+
+Comparing the evaluation of the baseline program with the optimized one shows that the optimized program answers the causal reasoning questions with significantly more accuracy.
+
+## Conclusion
+
+In this tutorial, we learned how to use DSPy for prompt optimization alongside Weave for tracking and evaluation, comparing the original and optimized programs.
diff --git a/docs/docs/tutorial-tracing_2.md b/docs/docs/tutorial-tracing_2.md
index 108571cf650..da1980f1155 100644
--- a/docs/docs/tutorial-tracing_2.md
+++ b/docs/docs/tutorial-tracing_2.md
@@ -5,7 +5,6 @@ In the [Track LLM inputs & outputs](/quickstart) tutorial, the basics of trackin
In this tutorial you will learn how to:
- **Track data** as it flows though your application
- **Track metadata** at call time
-- **Export data** that was logged to Weave
## Tracking nested function calls
diff --git a/docs/notebooks/chain_of_density.ipynb b/docs/notebooks/chain_of_density.ipynb
new file mode 100644
index 00000000000..2fb6ecc3f45
--- /dev/null
+++ b/docs/notebooks/chain_of_density.ipynb
@@ -0,0 +1,542 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Summarization using Chain of Density\n",
+ "\n",
+ "Summarizing complex technical documents while preserving crucial details is a challenging task. The Chain of Density (CoD) summarization technique offers a solution by iteratively refining summaries to be more concise and information-dense. This guide demonstrates how to implement CoD using Weave for tracking and evaluating the application. "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## What is Chain of Density Summarization?\n",
+ "\n",
+ "[![arXiv](https://img.shields.io/badge/arXiv-2309.04269-b31b1b.svg)](https://arxiv.org/abs/2309.04269)\n",
+ "\n",
+ "Chain of Density (CoD) is an iterative summarization technique that produces increasingly concise and information-dense summaries. It works by:\n",
+ "\n",
+ "1. Starting with an initial summary\n",
+ "2. Iteratively refining the summary, making it more concise while preserving key information\n",
+ "3. Increasing the density of entities and technical details with each iteration\n",
+ "\n",
+ "This approach is particularly useful for summarizing scientific papers or technical documents where preserving detailed information is crucial."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Why use Weave?\n",
+ "\n",
+ "In this tutorial, we'll use Weave to implement and evaluate a Chain of Density summarization pipeline for ArXiv papers. You'll learn how to:\n",
+ "\n",
+ "1. **Track your LLM pipeline**: Use Weave to automatically log inputs, outputs, and intermediate steps of your summarization process.\n",
+ "2. **Evaluate LLM outputs**: Create rigorous, apples-to-apples evaluations of your summaries using Weave's built-in tools.\n",
+ "3. **Build composable operations**: Combine and reuse Weave operations across different parts of your summarization pipeline.\n",
+ "4. **Integrate seamlessly**: Add Weave to your existing Python code with minimal overhead.\n",
+ "\n",
+ "By the end of this tutorial, you'll have created a CoD summarization pipeline that leverages Weave's capabilities for model serving, evaluation, and result tracking."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Set up the environment\n",
+ "\n",
+ "First, let's set up our environment and import the necessary libraries:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!pip install -qU anthropic weave pydantic requests PyPDF2 set-env-colab-kaggle-dotenv"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ ">To get an Anthropic API key:\n",
+ "> 1. Sign up for an account at https://www.anthropic.com\n",
+ "> 2. Navigate to the API section in your account settings\n",
+ "> 3. Generate a new API key\n",
+ "> 4. Store the API key securely in your .env file"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import io\n",
+ "import os\n",
+ "from datetime import datetime, timezone\n",
+ "\n",
+ "import anthropic\n",
+ "import requests\n",
+ "from pydantic import BaseModel\n",
+ "from PyPDF2 import PdfReader\n",
+ "from set_env import set_env\n",
+ "\n",
+ "import weave\n",
+ "\n",
+ "set_env(\"WANDB_API_KEY\")\n",
+ "set_env(\"ANTHROPIC_API_KEY\")\n",
+ "\n",
+ "weave.init(\"summarization-chain-of-density-cookbook\")\n",
+ "anthropic_client = anthropic.Anthropic(api_key=os.getenv(\"ANTHROPIC_API_KEY\"))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We're using Weave to track our experiment and Anthropic's Claude model for text generation. The `weave.init()` call sets up a new Weave project for our summarization task."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Define the ArxivPaper model\n",
+ "\n",
+ "We'll create a simple `ArxivPaper` class to represent our data:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Define ArxivPaper model\n",
+ "class ArxivPaper(BaseModel):\n",
+ " entry_id: str\n",
+ " updated: datetime\n",
+ " published: datetime\n",
+ " title: str\n",
+ " authors: list[str]\n",
+ " summary: str\n",
+ " pdf_url: str\n",
+ "\n",
+ "\n",
+ "# Create sample ArxivPaper\n",
+ "arxiv_paper = ArxivPaper(\n",
+ " entry_id=\"http://arxiv.org/abs/2406.04744v1\",\n",
+ " updated=datetime(2024, 6, 7, 8, 43, 7, tzinfo=timezone.utc),\n",
+ " published=datetime(2024, 6, 7, 8, 43, 7, tzinfo=timezone.utc),\n",
+ " title=\"CRAG -- Comprehensive RAG Benchmark\",\n",
+ " authors=[\"Xiao Yang\", \"Kai Sun\", \"Hao Xin\"], # Truncated for brevity\n",
+ " summary=\"Retrieval-Augmented Generation (RAG) has recently emerged as a promising solution...\", # Truncated\n",
+ " pdf_url=\"https://arxiv.org/pdf/2406.04744\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This class encapsulates the metadata and content of an ArXiv paper, which will be the input to our summarization pipeline."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Load PDF content\n",
+ "\n",
+ "To work with the full paper content, we'll add a function to load and extract text from PDFs:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "@weave.op()\n",
+ "def load_pdf(pdf_url: str) -> str:\n",
+ " # Download the PDF\n",
+ " response = requests.get(pdf_url)\n",
+ " pdf_file = io.BytesIO(response.content)\n",
+ "\n",
+ " # Read the PDF\n",
+ " pdf_reader = PdfReader(pdf_file)\n",
+ "\n",
+ " # Extract text from all pages\n",
+ " text = \"\"\n",
+ " for page in pdf_reader.pages:\n",
+ " text += page.extract_text()\n",
+ "\n",
+ " return text"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Implement Chain of Density summarization\n",
+ "\n",
+ "Now, let's implement the core CoD summarization logic using Weave operations:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Chain of Density Summarization\n",
+ "@weave.op()\n",
+ "def summarize_current_summary(\n",
+ " document: str,\n",
+ " instruction: str,\n",
+ " current_summary: str = \"\",\n",
+ " iteration: int = 1,\n",
+ " model: str = \"claude-3-sonnet-20240229\",\n",
+ "):\n",
+ " prompt = f\"\"\"\n",
+ " Document: {document}\n",
+ " Current summary: {current_summary}\n",
+ " Instruction to focus on: {instruction}\n",
+ " Iteration: {iteration}\n",
+ "\n",
+ " Generate an increasingly concise, entity-dense, and highly technical summary from the provided document that specifically addresses the given instruction.\n",
+ " \"\"\"\n",
+ " response = anthropic_client.messages.create(\n",
+ " model=model, max_tokens=4096, messages=[{\"role\": \"user\", \"content\": prompt}]\n",
+ " )\n",
+ " return response.content[0].text\n",
+ "\n",
+ "\n",
+ "@weave.op()\n",
+ "def iterative_density_summarization(\n",
+ " document: str,\n",
+ " instruction: str,\n",
+ " current_summary: str,\n",
+ " density_iterations: int,\n",
+ " model: str = \"claude-3-sonnet-20240229\",\n",
+ "):\n",
+ " iteration_summaries = []\n",
+ " for iteration in range(1, density_iterations + 1):\n",
+ " current_summary = summarize_current_summary(\n",
+ " document, instruction, current_summary, iteration, model\n",
+ " )\n",
+ " iteration_summaries.append(current_summary)\n",
+ " return current_summary, iteration_summaries\n",
+ "\n",
+ "\n",
+ "@weave.op()\n",
+ "def final_summary(\n",
+ " instruction: str, current_summary: str, model: str = \"claude-3-sonnet-20240229\"\n",
+ "):\n",
+ " prompt = f\"\"\"\n",
+ " Given this summary: {current_summary}\n",
+ " And this instruction to focus on: {instruction}\n",
+ " Create an extremely dense, final summary that captures all key technical information in the most concise form possible, while specifically addressing the given instruction.\n",
+ " \"\"\"\n",
+ " return (\n",
+ " anthropic_client.messages.create(\n",
+ " model=model, max_tokens=4096, messages=[{\"role\": \"user\", \"content\": prompt}]\n",
+ " )\n",
+ " .content[0]\n",
+ " .text\n",
+ " )\n",
+ "\n",
+ "\n",
+ "@weave.op()\n",
+ "def chain_of_density_summarization(\n",
+ " document: str,\n",
+ " instruction: str,\n",
+ " current_summary: str = \"\",\n",
+ " model: str = \"claude-3-sonnet-20240229\",\n",
+ " density_iterations: int = 2,\n",
+ "):\n",
+ " current_summary, iteration_summaries = iterative_density_summarization(\n",
+ " document, instruction, current_summary, density_iterations, model\n",
+ " )\n",
+ " final_summary_text = final_summary(instruction, current_summary, model)\n",
+ " return {\n",
+ " \"final_summary\": final_summary_text,\n",
+ " \"accumulated_summary\": current_summary,\n",
+ " \"iteration_summaries\": iteration_summaries,\n",
+ " }"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Here's what each function does:\n",
+ "\n",
+ "- `summarize_current_summary`: Generates a single summary iteration based on the current state.\n",
+ "- `iterative_density_summarization`: Applies the CoD technique by calling `summarize_current_summary` multiple times.\n",
+ "- `chain_of_density_summarization`: Orchestrates the entire summarization process and returns the results.\n",
+ "\n",
+ "By using `@weave.op()` decorators, we ensure that Weave tracks the inputs, outputs, and execution of these functions.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Create a Weave Model\n",
+ "\n",
+ "Now, let's wrap our summarization pipeline in a Weave Model:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Weave Model\n",
+ "class ArxivChainOfDensityPipeline(weave.Model):\n",
+ " model: str = \"claude-3-sonnet-20240229\"\n",
+ " density_iterations: int = 3\n",
+ "\n",
+ " @weave.op()\n",
+ " def predict(self, paper: ArxivPaper, instruction: str) -> dict:\n",
+ " text = load_pdf(paper[\"pdf_url\"])\n",
+ " result = chain_of_density_summarization(\n",
+ " text,\n",
+ " instruction,\n",
+ " model=self.model,\n",
+ " density_iterations=self.density_iterations,\n",
+ " )\n",
+ " return result"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This `ArxivChainOfDensityPipeline` class encapsulates our summarization logic as a Weave Model, providing several key benefits:\n",
+ "\n",
+ "1. Automatic experiment tracking: Weave captures inputs, outputs, and parameters for each run of the model.\n",
+ "2. Versioning: Changes to the model's attributes or code are automatically versioned, creating a clear history of how your summarization pipeline evolves over time.\n",
+ "3. Reproducibility: The versioning and tracking make it easy to reproduce any previous result or configuration of your summarization pipeline.\n",
+ "4. Hyperparameter management: Model attributes (like `model` and `density_iterations`) are clearly defined and tracked across different runs, facilitating experimentation.\n",
+ "5. Integration with Weave ecosystem: Using `weave.Model` allows seamless integration with other Weave tools, such as evaluations and serving capabilities."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Implement evaluation metrics\n",
+ "\n",
+ "To assess the quality of our summaries, we'll implement simple evaluation metrics:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import json\n",
+ "\n",
+ "\n",
+ "@weave.op()\n",
+ "def evaluate_summary(\n",
+ " summary: str, instruction: str, model: str = \"claude-3-sonnet-20240229\"\n",
+ ") -> dict:\n",
+ " prompt = f\"\"\"\n",
+ " Summary: {summary}\n",
+ " Instruction: {instruction}\n",
+ "\n",
+ " Evaluate the summary based on the following criteria:\n",
+ " 1. Relevance (1-5): How well does the summary address the given instruction?\n",
+ " 2. Conciseness (1-5): How concise is the summary while retaining key information?\n",
+ " 3. Technical Accuracy (1-5): How accurately does the summary convey technical details?\n",
+ "\n",
+ " Your response MUST be in the following JSON format:\n",
+ " {{\n",
+ " \"relevance\": {{\n",
+ " \"score\": ,\n",
+ " \"explanation\": \"\"\n",
+ " }},\n",
+ " \"conciseness\": {{\n",
+ " \"score\": ,\n",
+ " \"explanation\": \"\"\n",
+ " }},\n",
+ " \"technical_accuracy\": {{\n",
+ " \"score\": ,\n",
+ " \"explanation\": \"\"\n",
+ " }}\n",
+ " }}\n",
+ "\n",
+ " Ensure that the scores are integers between 1 and 5, and that the explanations are concise.\n",
+ " \"\"\"\n",
+ " response = anthropic_client.messages.create(\n",
+ " model=model, max_tokens=1000, messages=[{\"role\": \"user\", \"content\": prompt}]\n",
+ " )\n",
+ " print(response.content[0].text)\n",
+ "\n",
+ " eval_dict = json.loads(response.content[0].text)\n",
+ "\n",
+ " return {\n",
+ " \"relevance\": eval_dict[\"relevance\"][\"score\"],\n",
+ " \"conciseness\": eval_dict[\"conciseness\"][\"score\"],\n",
+ " \"technical_accuracy\": eval_dict[\"technical_accuracy\"][\"score\"],\n",
+ " \"average_score\": sum(eval_dict[k][\"score\"] for k in eval_dict) / 3,\n",
+ " \"evaluation_text\": response.content[0].text,\n",
+ " }"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "These evaluation functions use the Claude model to assess the quality of the generated summaries based on relevance, conciseness, and technical accuracy."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Create a Weave Dataset and run evaluation\n",
+ "\n",
+ "To evaluate our pipeline, we'll create a Weave Dataset and run an evaluation:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create a Weave Dataset\n",
+ "dataset = weave.Dataset(\n",
+ " name=\"arxiv_papers\",\n",
+ " rows=[\n",
+ " {\n",
+ " \"paper\": arxiv_paper,\n",
+ " \"instruction\": \"What was the approach to experimenting with different data mixtures?\",\n",
+ " },\n",
+ " ],\n",
+ ")\n",
+ "\n",
+ "weave.publish(dataset)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "For our evaluation, we'll use an LLM-as-a-judge approach. This technique involves using a language model to assess the quality of outputs generated by another model or system. It leverages the LLM's understanding and reasoning capabilities to provide nuanced evaluations, especially for tasks where traditional metrics may fall short."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "[![arXiv](https://img.shields.io/badge/arXiv-2306.05685-b31b1b.svg)](https://arxiv.org/abs/2306.05685)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Define the scorer function\n",
+ "@weave.op()\n",
+ "def quality_scorer(instruction: str, model_output: dict) -> dict:\n",
+ " result = evaluate_summary(model_output[\"final_summary\"], instruction)\n",
+ " return result"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Run evaluation\n",
+ "evaluation = weave.Evaluation(dataset=dataset, scorers=[quality_scorer])\n",
+ "arxiv_chain_of_density_pipeline = ArxivChainOfDensityPipeline()\n",
+ "results = await evaluation.evaluate(arxiv_chain_of_density_pipeline)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This code creates a dataset with our sample ArXiv paper, defines a quality scorer, and runs an evaluation of our summarization pipeline."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Conclusion\n",
+ "\n",
+ "In this example, we've demonstrated how to implement a Chain of Density summarization pipeline for ArXiv papers using Weave. We've shown how to:\n",
+ "\n",
+ "1. Create Weave operations for each step of the summarization process\n",
+ "2. Wrap the pipeline in a Weave Model for easy tracking and evaluation\n",
+ "3. Implement custom evaluation metrics using Weave operations\n",
+ "4. Create a dataset and run an evaluation of the pipeline\n",
+ "\n",
+ "Weave's seamless integration allows us to track inputs, outputs, and intermediate steps throughout the summarization process, making it easier to debug, optimize, and evaluate our LLM application.\n",
+ "You can extend this example to handle larger datasets, implement more sophisticated evaluation metrics, or integrate with other LLM workflows.\n",
+ "\n",
+ "\n",
+ " View Full Report on W&B\n",
+ ""
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": ".venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/docs/notebooks/dspy_prompt_optimization.ipynb b/docs/notebooks/dspy_prompt_optimization.ipynb
index 94c6b485e15..573aaf7085a 100644
--- a/docs/notebooks/dspy_prompt_optimization.ipynb
+++ b/docs/notebooks/dspy_prompt_optimization.ipynb
@@ -6,15 +6,23 @@
"source": [
"\n",
"\n",
+ "\n",
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
"# Optimizing LLM Workflows Using DSPy and Weave\n",
"\n",
"The [BIG-bench (Beyond the Imitation Game Benchmark)](https://github.com/google/BIG-bench) is a collaborative benchmark intended to probe large language models and extrapolate their future capabilities consisting of more than 200 tasks. The [BIG-Bench Hard (BBH)](https://github.com/suzgunmirac/BIG-Bench-Hard) is a suite of 23 most challenging BIG-Bench tasks that can be quite difficult to be solved using the current generation of language models.\n",
"\n",
- "This tutorial demonstrates how we can improve the performance of our LLM workflow implemented on the **causal judgement task** from the BIG-bench Hard benchmark and evaluate our prompting strategies. We will use [DSPy](https://dspy-docs.vercel.app/) for implementing our LLM workflow and optimizing our prompting strategy. We will also use [Weave](../docs/introduction.md) to track our LLM workflow and evaluate our prompting strategies."
+    "This tutorial demonstrates how we can improve the performance of our LLM workflow for the **causal judgement task** from the BIG-Bench Hard benchmark and evaluate our prompting strategies. We will use [DSPy](https://dspy-docs.vercel.app/) to implement our LLM workflow and optimize our prompting strategy. We will also use [Weave](../../introduction.md) to track our LLM workflow and evaluate our prompting strategies."
]
},
{
@@ -26,7 +34,7 @@
"We need the following libraries for this tutorial:\n",
"\n",
"- [DSPy](https://dspy-docs.vercel.app/) for building the LLM workflow and optimizing it.\n",
- "- [Weave](../introduction.md) to track our LLM workflow and evaluate our prompting strategies.\n",
+ "- [Weave](../../introduction.md) to track our LLM workflow and evaluate our prompting strategies.\n",
"- [datasets](https://huggingface.co/docs/datasets/index) to access the Big-Bench Hard dataset from HuggingFace Hub."
]
},
@@ -65,7 +73,7 @@
"source": [
"## Enable Tracking using Weave\n",
"\n",
- "Weave is currently integrated with DSPy, and including [`weave.init`](../docs/reference/python-sdk/weave/index.md) at the start of our code lets us automatically trace our DSPy functions which can be explored in the Weave UI. Check out the [Weave integration docs for DSPy](../docs/guides/integrations/dspy.md) to learn more."
+ "Weave is currently integrated with DSPy, and including [`weave.init`](../../reference/python-sdk/weave/index.md) at the start of our code lets us automatically trace our DSPy functions which can be explored in the Weave UI. Check out the [Weave integration docs for DSPy](../../guides/integrations/dspy.md) to learn more.\n"
]
},
{
@@ -83,7 +91,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "In this tutorial, we use a metadata class inherited from [`weave.Model`](../docs/guides/core-types/models.md) to manage our metadata."
+ "In this tutorial, we use a metadata class inherited from [`weave.Object`](../../guides/tracking/objects.md) to manage our metadata."
]
},
{
@@ -92,7 +100,7 @@
"metadata": {},
"outputs": [],
"source": [
- "class Metadata(weave.Model):\n",
+ "class Metadata(weave.Object):\n",
" dataset_address: str = \"maveriq/bigbenchhard\"\n",
" big_bench_hard_task: str = \"causal_judgement\"\n",
" num_train_examples: int = 50\n",
@@ -109,9 +117,9 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "| ![](../static/img/dspy_prompt_optimiztion/metadata.gif) |\n",
- "|---|\n",
- "| The `Metadata` objects are automatically versioned and traced when functions consuming them are traced |"
+ ":::tip Object Versioning\n",
+ "The `Metadata` objects are automatically versioned and traced when functions consuming them are traced\n",
+ ":::"
]
},
{
@@ -120,7 +128,7 @@
"source": [
"## Load the BIG-Bench Hard Dataset\n",
"\n",
- "We will load this dataset from HuggingFace Hub, split into training and validation sets, and [publish](../docs/guides/core-types/datasets.md) them on Weave, this will let us version the datasets, and also use [`weave.Evaluation`](../docs/guides/core-types/evaluations.md) to evaluate our prompting strategy."
+    "We will load this dataset from HuggingFace Hub, split it into training and validation sets, and [publish](../../guides/core-types/datasets.md) them on Weave. This will let us version the datasets and also use [`weave.Evaluation`](../../guides/core-types/evaluations.md) to evaluate our prompting strategy."
]
},
{
@@ -169,15 +177,6 @@
"dspy_train_examples, dspy_val_examples = get_dataset(metadata)"
]
},
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "| ![](../static/img/dspy_prompt_optimiztion/datasets.gif) |\n",
- "|---|\n",
- "| The datasets, once published, can be explored in the Weave UI |"
- ]
- },
{
"cell_type": "markdown",
"metadata": {},
@@ -269,22 +268,13 @@
"rich.print(prediction)"
]
},
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "| ![](../static/img/dspy_prompt_optimiztion/dspy_module_trace.gif) |\n",
- "|---|\n",
- "| Here's how you can explore the traces of the `CausalReasoningModule` in the Weave UI |"
- ]
- },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Evaluating our DSPy Program\n",
"\n",
- "Now that we have a baseline prompting strategy, let's evaluate it on our validation set using [`weave.Evaluation`](../docs/guides/core-types/evaluations.md) on a simple metric that matches the predicted answer with the ground truth. Weave will take each example, pass it through your application and score the output on multiple custom scoring functions. By doing this, you'll have a view of the performance of your application, and a rich UI to drill into individual outputs and scores.\n",
+ "Now that we have a baseline prompting strategy, let's evaluate it on our validation set using [`weave.Evaluation`](../../guides/core-types/evaluations.md) on a simple metric that matches the predicted answer with the ground truth. Weave will take each example, pass it through your application and score the output on multiple custom scoring functions. By doing this, you'll have a view of the performance of your application, and a rich UI to drill into individual outputs and scores.\n",
"\n",
"First, we need to create a simple weave evaluation scoring function that tells whether the answer from the baseline module's output is the same as the ground truth answer or not. Scoring functions need to have a `model_output` keyword argument, but the other arguments are user defined and are taken from the dataset examples. It will only take the necessary keys by using a dictionary key based on the argument name."
]
@@ -387,10 +377,6 @@
"Running the evaluation causal reasoning dataset will cost approximately $0.04 in OpenAI credits.\n",
":::\n",
"\n",
- "| ![](../static/img/dspy_prompt_optimiztion/dspy_compile.png) |\n",
- "|---|\n",
- "| You can explore the traces of the optimization process in the Weave UI. |\n",
- "\n",
"Now that we have our optimized program (the optimized prompting strategy), let's evaluate it once again on our validation set and compare it with our baseline DSPy program."
]
},
@@ -413,15 +399,17 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "| ![](../static/img/dspy_prompt_optimiztion/eval_comparison.gif) |\n",
- "|---|\n",
- "| Comparing the evalution of the baseline program with the optimized one shows that the optimized program answers the causal reasoning questions with siginificantly more accuracy. |"
+    "Comparing the evaluation of the baseline program with the optimized one shows that the optimized program answers the causal reasoning questions with significantly more accuracy."
]
},
{
"cell_type": "markdown",
"metadata": {},
- "source": []
+ "source": [
+ "## Conclusion\n",
+ "\n",
+    "In this tutorial, we learned how to use DSPy for prompt optimization alongside Weave for tracking and evaluation, comparing the original and optimized programs."
+ ]
}
],
"metadata": {
diff --git a/docs/scripts/generate_notebooks.py b/docs/scripts/generate_notebooks.py
index ad3d8a10277..c3421dc8f73 100644
--- a/docs/scripts/generate_notebooks.py
+++ b/docs/scripts/generate_notebooks.py
@@ -54,7 +54,7 @@ def export_all_notebooks_in_primary_dir():
def main():
export_all_notebooks_in_primary_dir()
export_notebook(
- "./intro_notebook.ipynb", "./docs/reference/gen_notebooks/intro_notebook.md"
+ "./intro_notebook.ipynb", "./docs/reference/gen_notebooks/01-intro_notebook.md"
)
diff --git a/docs/static/img/dspy_prompt_optimiztion/datasets.gif b/docs/static/img/dspy_prompt_optimiztion/datasets.gif
deleted file mode 100644
index 239c7c74767..00000000000
Binary files a/docs/static/img/dspy_prompt_optimiztion/datasets.gif and /dev/null differ
diff --git a/docs/static/img/dspy_prompt_optimiztion/dspy_compile.png b/docs/static/img/dspy_prompt_optimiztion/dspy_compile.png
deleted file mode 100644
index 1afff4bf60d..00000000000
Binary files a/docs/static/img/dspy_prompt_optimiztion/dspy_compile.png and /dev/null differ
diff --git a/docs/static/img/dspy_prompt_optimiztion/dspy_module_trace.gif b/docs/static/img/dspy_prompt_optimiztion/dspy_module_trace.gif
deleted file mode 100644
index 970da65a1dd..00000000000
Binary files a/docs/static/img/dspy_prompt_optimiztion/dspy_module_trace.gif and /dev/null differ
diff --git a/docs/static/img/dspy_prompt_optimiztion/eval_comparison.gif b/docs/static/img/dspy_prompt_optimiztion/eval_comparison.gif
deleted file mode 100644
index 2faccf15427..00000000000
Binary files a/docs/static/img/dspy_prompt_optimiztion/eval_comparison.gif and /dev/null differ
diff --git a/docs/static/img/dspy_prompt_optimiztion/metadata.gif b/docs/static/img/dspy_prompt_optimiztion/metadata.gif
deleted file mode 100644
index 5a00a7375f2..00000000000
Binary files a/docs/static/img/dspy_prompt_optimiztion/metadata.gif and /dev/null differ
diff --git a/requirements.txt b/requirements.txt
index 1729b6cb1ed..e8cb5513f35 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -42,6 +42,9 @@ numpy<2.0.0
# Segment logging
analytics-python>=1.2.9
+# Used for ISO date parsing.
+python-dateutil>=2.8.2
+
# Used for version parsing in integrations.
packaging>=21.0
diff --git a/weave-js/src/common/components/elements/LegacyWBIcon.tsx b/weave-js/src/common/components/elements/LegacyWBIcon.tsx
index b1fce5a4895..fa440a9ba03 100644
--- a/weave-js/src/common/components/elements/LegacyWBIcon.tsx
+++ b/weave-js/src/common/components/elements/LegacyWBIcon.tsx
@@ -26,6 +26,10 @@ export interface LegacyWBIconProps {
style?: any;
'data-test'?: any;
+
+ role?: string;
+ ariaHidden?: string;
+ ariaLabel?: string;
}
const LegacyWBIconComp = React.forwardRef(
@@ -42,6 +46,10 @@ const LegacyWBIconComp = React.forwardRef(
onMouseLeave,
style,
'data-test': dataTest,
+ role,
+ title,
+ ariaHidden,
+ ariaLabel,
},
ref
) => {
@@ -59,6 +67,10 @@ const LegacyWBIconComp = React.forwardRef(
onMouseLeave,
style,
'data-test': dataTest,
+ role,
+ title,
+ 'aria-hidden': ariaHidden,
+ 'aria-label': ariaLabel,
};
if (ref == null) {
return ;
diff --git a/weave-js/src/components/FancyPage/FancyPageSidebarSection.tsx b/weave-js/src/components/FancyPage/FancyPageSidebarSection.tsx
index 4603188d2f0..709c3ccb781 100644
--- a/weave-js/src/components/FancyPage/FancyPageSidebarSection.tsx
+++ b/weave-js/src/components/FancyPage/FancyPageSidebarSection.tsx
@@ -136,7 +136,11 @@ const FancyPageSidebarSection = (props: FancyPageSidebarSectionProps) => {
}}>
-
+ {item.name}
@@ -159,7 +163,11 @@ const FancyPageSidebarSection = (props: FancyPageSidebarSectionProps) => {
const button = (
-
+ {item.name}
diff --git a/weave-js/src/components/LinearProgress.tsx b/weave-js/src/components/LinearProgress.tsx
new file mode 100644
index 00000000000..6440fcba06b
--- /dev/null
+++ b/weave-js/src/components/LinearProgress.tsx
@@ -0,0 +1,25 @@
+/**
+ * Styled linear progress bar.
+ */
+
+import MuiLinearProgress, {
+ LinearProgressProps as MuiLinearProgressProps,
+} from '@mui/material/LinearProgress';
+import React from 'react';
+
+import * as Colors from '../common/css/color.styles';
+
+export const LinearProgress = (props: MuiLinearProgressProps) => {
+ return (
+
+ );
+};
diff --git a/weave-js/src/components/PagePanelComponents/Home/Browse2/Browse2OpDefCode.tsx b/weave-js/src/components/PagePanelComponents/Home/Browse2/Browse2OpDefCode.tsx
index 86d25d00e0e..38aa07823d8 100644
--- a/weave-js/src/components/PagePanelComponents/Home/Browse2/Browse2OpDefCode.tsx
+++ b/weave-js/src/components/PagePanelComponents/Home/Browse2/Browse2OpDefCode.tsx
@@ -3,6 +3,7 @@ import Box from '@mui/material/Box';
import {Loading} from '@wandb/weave/components/Loading';
import React, {FC} from 'react';
+import {Alert} from '../../../Alert';
import {useWFHooks} from '../Browse3/pages/wfReactInterface/context';
export const Browse2OpDefCode: FC<{uri: string; maxRowsInView?: number}> = ({
@@ -25,6 +26,17 @@ export const Browse2OpDefCode: FC<{uri: string; maxRowsInView?: number}> = ({
);
}
+ if (text.result == null) {
+ return (
+
+ No code found for this operation
+
+ );
+ }
+
const inner = (
= ({title}) => {
+ const close = useClosePeek();
+ const {isPeeking} = useContext(WeaveflowPeekContext);
+ return (
+