diff --git a/docs/docs/reference/gen_notebooks/dspy_prompt_optimization.md b/docs/docs/reference/gen_notebooks/dspy_prompt_optimization.md index d39ccbaa14e..68e72904f44 100644 --- a/docs/docs/reference/gen_notebooks/dspy_prompt_optimization.md +++ b/docs/docs/reference/gen_notebooks/dspy_prompt_optimization.md @@ -51,34 +51,12 @@ os.environ["OPENAI_API_KEY"] = api_key Weave is currently integrated with DSPy, and including [`weave.init`](../../reference/python-sdk/weave/index.md) at the start of our code lets us automatically trace our DSPy functions which can be explored in the Weave UI. Check out the [Weave integration docs for DSPy](../../guides/integrations/dspy.md) to learn more. - ```python import weave weave.init(project_name="dspy-bigbench-hard") ``` -In this tutorial, we use a metadata class inherited from [`weave.Object`](../../guides/tracking/objects.md) to manage our metadata. - - -```python -class Metadata(weave.Object): - dataset_address: str = "maveriq/bigbenchhard" - big_bench_hard_task: str = "causal_judgement" - num_train_examples: int = 50 - openai_model: str = "gpt-3.5-turbo" - openai_max_tokens: int = 2048 - max_bootstrapped_demos: int = 8 - max_labeled_demos: int = 8 - - -metadata = Metadata() -``` - -:::tip Object Versioning -The `Metadata` objects are automatically versioned and traced when functions consuming them are traced -::: - ## Load the BIG-Bench Hard Dataset We will load this dataset from HuggingFace Hub, split into training and validation sets, and [publish](../../guides/core-types/datasets.md) them on Weave, this will let us version the datasets, and also use [`weave.Evaluation`](../../guides/core-types/evaluations.md) to evaluate our prompting strategy. @@ -90,16 +68,16 @@ from datasets import load_dataset @weave.op() -def get_dataset(metadata: Metadata): +def get_dataset(dataset_address: str, big_bench_hard_task: str, num_train_examples: int): # load the BIG-Bench Hard dataset corresponding to the task from Huggingface Hug - dataset = load_dataset(metadata.dataset_address, metadata.big_bench_hard_task)[ + dataset = load_dataset(dataset_address, big_bench_hard_task)[ "train" ] # create the training and validation datasets rows = [{"question": data["input"], "answer": data["target"]} for data in dataset] - train_rows = rows[0 : metadata.num_train_examples] - val_rows = rows[metadata.num_train_examples :] + train_rows = rows[0 : num_train_examples] + val_rows = rows[num_train_examples :] # create the training and validation examples consisting of `dspy.Example` objects dspy_train_examples = [ @@ -110,19 +88,23 @@ def get_dataset(metadata: Metadata): # publish the datasets to the Weave, this would let us version the data and use for evaluation weave.publish( weave.Dataset( - name=f"bigbenchhard_{metadata.big_bench_hard_task}_train", rows=train_rows + name=f"bigbenchhard_{big_bench_hard_task}_train", rows=train_rows ) ) weave.publish( weave.Dataset( - name=f"bigbenchhard_{metadata.big_bench_hard_task}_val", rows=val_rows + name=f"bigbenchhard_{big_bench_hard_task}_val", rows=val_rows ) ) return dspy_train_examples, dspy_val_examples -dspy_train_examples, dspy_val_examples = get_dataset(metadata) +dspy_train_examples, dspy_val_examples = get_dataset( + dataset_address="maveriq/bigbenchhard", + big_bench_hard_task="causal_judgement", + num_train_examples=50 +) ``` ## The DSPy Program @@ -131,17 +113,6 @@ dspy_train_examples, dspy_val_examples = get_dataset(metadata) We will use the [`dspy.OpenAI`](https://dspy-docs.vercel.app/api/language_model_clients/OpenAI) 
abstraction to make LLM calls to [GPT3.5 Turbo](https://platform.openai.com/docs/models/gpt-3-5-turbo).
-
-
-```python
-system_prompt = """
-You are an expert in the field of causal reasoning. You are to analyze the a given question carefully and answer in `Yes` or `No`.
-You should also provide a detailed explanation justifying your answer.
-"""
-
-llm = dspy.OpenAI(model="gpt-3.5-turbo", system_prompt=system_prompt)
-dspy.settings.configure(lm=llm)
-```
-
-
### Writing the Causal Reasoning Signature

A [signature](https://dspy-docs.vercel.app/docs/building-blocks/signatures) is a declarative specification of input/output behavior of a [DSPy module](https://dspy-docs.vercel.app/docs/building-blocks/modules) which are task-adaptive components—akin to neural network layers—that abstract any particular text transformation.

@@ -177,18 +148,57 @@ class CausalReasoningModule(dspy.Module):
        return self.prog(input=Input(query=question)).output.dict()
```

-Let's test our LLM workflow, i.e., the `CausalReasoningModule` on an example from the causal reasoning subset of Big-Bench Hard.
+Next, we write a [`weave.Model`](../../guides/core-types/models.md) that wraps the `CausalReasoningModule` and the OpenAI language model to form a complete LLM workflow.
+
+
+```python
+class WeaveCausalReasoningModel(weave.Model):
+    openai_model: str
+    system_prompt: str
+    program: dspy.Module
+    language_model: dspy.LM
+
+    def __init__(self, openai_model: str, system_prompt: str, program: dspy.Module):
+        super().__init__(
+            openai_model=openai_model,
+            system_prompt=system_prompt,
+            program=program,
+            language_model=dspy.OpenAI(
+                model=openai_model, system_prompt=system_prompt
+            )
+        )
+
+    @weave.op()
+    def predict(self, question: str) -> Output:
+        with dspy.context(lm=self.language_model):
+            return self.program(question)
+```
+
+Let's test our LLM workflow, i.e., the `WeaveCausalReasoningModel` on an example from the causal reasoning subset of Big-Bench Hard.


```python
import rich

-baseline_module = CausalReasoningModule()
+SYSTEM_PROMPT = """
+You are an expert in the field of causal reasoning. You are to analyze a given question carefully and answer in `Yes` or `No`.
+You should also provide a detailed explanation justifying your answer.
+"""

-prediction = baseline_module(dspy_train_examples[0]["question"])
+baseline_model = WeaveCausalReasoningModel(
+    openai_model="gpt-3.5-turbo",
+    system_prompt=SYSTEM_PROMPT,
+    program=CausalReasoningModule(),
+)
+
+prediction = baseline_model.predict(dspy_train_examples[0]["question"])
rich.print(prediction)
```

+| ![](https://i.imgur.com/woHRHjR.png) |
+|---|
+| Here's how you can explore the traces of the `CausalReasoningModule` in the Weave UI. |
+
## Evaluating our DSPy Program

Now that we have a baseline prompting strategy, let's evaluate it on our validation set using [`weave.Evaluation`](../../guides/core-types/evaluations.md) on a simple metric that matches the predicted answer with the ground truth. Weave will take each example, pass it through your application and score the output on multiple custom scoring functions. By doing this, you'll have a view of the performance of your application, and a rich UI to drill into individual outputs and scores.

@@ -206,9 +216,7 @@ Next, we can simply define the evaluation and run it.
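The `weave_evaluation_scorer` passed to `weave.Evaluation` below is defined earlier in the notebook, outside the hunks shown in this diff. As a minimal sketch of the exact-match metric described above, it might look like the following; the argument names (`answer` for the dataset column and `output` for the model prediction, which older Weave releases call `model_output`) are assumptions based on the surrounding code rather than the notebook's exact definition:

```python
import weave


@weave.op()
def weave_evaluation_scorer(answer: str, output: dict) -> dict:
    # Exact-match metric: compare the predicted answer against the
    # ground-truth `answer` column of the validation dataset, ignoring case.
    return {"match": int(output["answer"].lower() == answer.lower())}
```

Weave matches scorer arguments to dataset columns by name, so the `answer` column of the published validation dataset is passed in automatically alongside the model output.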
```python
-validation_dataset = weave.ref(
-    f"bigbenchhard_{metadata.big_bench_hard_task}_val:v0"
-).get()
+validation_dataset = weave.ref("bigbenchhard_causal_judgement_val:v0").get()

evaluation = weave.Evaluation(
    name="baseline_causal_reasoning_module",
@@ -216,7 +224,7 @@ evaluation = weave.Evaluation(
    scorers=[weave_evaluation_scorer],
)

-await evaluation.evaluate(baseline_module.forward)
+await evaluation.evaluate(baseline_model)
```

:::note
@@ -224,7 +232,7 @@ If you're running from a python script, you can use the following code to run th
```python
import asyncio
-asyncio.run(evaluation.evaluate(baseline_module.forward))
+asyncio.run(evaluation.evaluate(baseline_model))
```
:::

@@ -241,27 +249,47 @@ Now, that we have a baseline DSPy program, let us try to improve its performance
from dspy.teleprompt import BootstrapFewShot


-@weave.op()
-def get_optimized_program(model: dspy.Module, metadata: Metadata) -> dspy.Module:
-    @weave.op()
-    def dspy_evaluation_metric(true, prediction, trace=None):
-        return prediction["answer"].lower() == true.answer.lower()
+class CausalReasoningOptimizer(weave.Model):
+    model: WeaveCausalReasoningModel

-    teleprompter = BootstrapFewShot(
-        metric=dspy_evaluation_metric,
-        max_bootstrapped_demos=metadata.max_bootstrapped_demos,
-        max_labeled_demos=metadata.max_labeled_demos,
-    )
-    return teleprompter.compile(model, trainset=dspy_train_examples)
+    @weave.op()
+    def get_optimized_program(
+        self, max_bootstrapped_demos: int, max_labeled_demos: int
+    ) -> weave.Model:
+        @weave.op()
+        def dspy_evaluation_metric(true, prediction, trace=None):
+            return prediction["answer"].lower() == true.answer.lower()
+
+        teleprompter = BootstrapFewShot(
+            metric=dspy_evaluation_metric,
+            max_bootstrapped_demos=max_bootstrapped_demos,
+            max_labeled_demos=max_labeled_demos,
+        )
+        with dspy.context(lm=self.model.language_model):
+            optimized_program = teleprompter.compile(
+                self.model.program, trainset=dspy_train_examples
+            )
+        return WeaveCausalReasoningModel(
+            openai_model=self.model.openai_model,
+            system_prompt=self.model.system_prompt,
+            program=optimized_program
+        )


-optimized_module = get_optimized_program(baseline_module, metadata)
+optimizer = CausalReasoningOptimizer(model=baseline_model)
+optimized_model = optimizer.get_optimized_program(
+    max_bootstrapped_demos=8, max_labeled_demos=8
+)
```

:::warning
Running the evaluation causal reasoning dataset will cost approximately $0.04 in OpenAI credits.
:::

+| ![](https://i.imgur.com/uXvbROM.png) |
+|---|
+| You can explore the traces of the optimization process in the Weave UI. |
+
Now that we have our optimized program (the optimized prompting strategy), let's evaluate it once again on our validation set and compare it with our baseline DSPy program.

@@ -272,10 +300,16 @@ evaluation = weave.Evaluation(
    scorers=[weave_evaluation_scorer],
)

-await evaluation.evaluate(optimized_module.forward)
+await evaluation.evaluate(optimized_model)
```

-When coomparing the evalution of the baseline program with the optimized one shows that the optimized program answers the causal reasoning questions with siginificantly more accuracy.
+:::warning
+Running the evaluation on the causal reasoning dataset will cost approximately $0.30 in OpenAI credits.
+:::
+
+| ![](https://i.imgur.com/hneuY2K.png) |
+|---|
+| Comparing the evaluation of the baseline program with the optimized one shows that the optimized program answers the causal reasoning questions with significantly more accuracy.
| ## Conclusion diff --git a/docs/notebooks/dspy_prompt_optimization.ipynb b/docs/notebooks/dspy_prompt_optimization.ipynb index 573aaf7085a..d9dc79a8edd 100644 --- a/docs/notebooks/dspy_prompt_optimization.ipynb +++ b/docs/notebooks/dspy_prompt_optimization.ipynb @@ -87,41 +87,6 @@ "weave.init(project_name=\"dspy-bigbench-hard\")" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this tutorial, we use a metadata class inherited from [`weave.Object`](../../guides/tracking/objects.md) to manage our metadata." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class Metadata(weave.Object):\n", - " dataset_address: str = \"maveriq/bigbenchhard\"\n", - " big_bench_hard_task: str = \"causal_judgement\"\n", - " num_train_examples: int = 50\n", - " openai_model: str = \"gpt-3.5-turbo\"\n", - " openai_max_tokens: int = 2048\n", - " max_bootstrapped_demos: int = 8\n", - " max_labeled_demos: int = 8\n", - "\n", - "\n", - "metadata = Metadata()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - ":::tip Object Versioning\n", - "The `Metadata` objects are automatically versioned and traced when functions consuming them are traced\n", - ":::" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -142,16 +107,16 @@ "\n", "\n", "@weave.op()\n", - "def get_dataset(metadata: Metadata):\n", + "def get_dataset(\n", + " dataset_address: str, big_bench_hard_task: str, num_train_examples: int\n", + "):\n", " # load the BIG-Bench Hard dataset corresponding to the task from Huggingface Hug\n", - " dataset = load_dataset(metadata.dataset_address, metadata.big_bench_hard_task)[\n", - " \"train\"\n", - " ]\n", + " dataset = load_dataset(dataset_address, big_bench_hard_task)[\"train\"]\n", "\n", " # create the training and validation datasets\n", " rows = [{\"question\": data[\"input\"], \"answer\": data[\"target\"]} for data in dataset]\n", - " train_rows = rows[0 : metadata.num_train_examples]\n", - " val_rows = rows[metadata.num_train_examples :]\n", + " train_rows = rows[0:num_train_examples]\n", + " val_rows = rows[num_train_examples:]\n", "\n", " # create the training and validation examples consisting of `dspy.Example` objects\n", " dspy_train_examples = [\n", @@ -161,20 +126,20 @@ "\n", " # publish the datasets to the Weave, this would let us version the data and use for evaluation\n", " weave.publish(\n", - " weave.Dataset(\n", - " name=f\"bigbenchhard_{metadata.big_bench_hard_task}_train\", rows=train_rows\n", - " )\n", + " weave.Dataset(name=f\"bigbenchhard_{big_bench_hard_task}_train\", rows=train_rows)\n", " )\n", " weave.publish(\n", - " weave.Dataset(\n", - " name=f\"bigbenchhard_{metadata.big_bench_hard_task}_val\", rows=val_rows\n", - " )\n", + " weave.Dataset(name=f\"bigbenchhard_{big_bench_hard_task}_val\", rows=val_rows)\n", " )\n", "\n", " return dspy_train_examples, dspy_val_examples\n", "\n", "\n", - "dspy_train_examples, dspy_val_examples = get_dataset(metadata)" + "dspy_train_examples, dspy_val_examples = get_dataset(\n", + " dataset_address=\"maveriq/bigbenchhard\",\n", + " big_bench_hard_task=\"causal_judgement\",\n", + " num_train_examples=50,\n", + ")" ] }, { @@ -188,21 +153,6 @@ "We will use the [`dspy.OpenAI`](https://dspy-docs.vercel.app/api/language_model_clients/OpenAI) abstraction to make LLM calls to [GPT3.5 Turbo](https://platform.openai.com/docs/models/gpt-3-5-turbo)." 
] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "system_prompt = \"\"\"\n", - "You are an expert in the field of causal reasoning. You are to analyze the a given question carefully and answer in `Yes` or `No`.\n", - "You should also provide a detailed explanation justifying your answer.\n", - "\"\"\"\n", - "\n", - "llm = dspy.OpenAI(model=\"gpt-3.5-turbo\", system_prompt=system_prompt)\n", - "dspy.settings.configure(lm=llm)" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -248,12 +198,41 @@ ] }, { + "metadata": {}, "cell_type": "markdown", + "source": "Next, we write a [`weave.Model`](../docs/guides/core-types/models.md) that wraps the `CausalReasoningModule` and the OpenAI language model to form a complete LLM workflow." + }, + { "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, "source": [ - "Let's test our LLM workflow, i.e., the `CausalReasoningModule` on an example from the causal reasoning subset of Big-Bench Hard." + "class WeaveCausalReasoningModel(weave.Model):\n", + " openai_model: str\n", + " system_prompt: str\n", + " program: dspy.Module\n", + " language_model: dspy.LM\n", + "\n", + " def __init__(self, openai_model: str, system_prompt: str, program: dspy.Module):\n", + " super().__init__(\n", + " openai_model=openai_model,\n", + " system_prompt=system_prompt,\n", + " program=program,\n", + " language_model=dspy.OpenAI(model=openai_model, system_prompt=system_prompt),\n", + " )\n", + "\n", + " @weave.op()\n", + " def predict(self, question: str) -> Output:\n", + " with dspy.context(lm=self.language_model):\n", + " return self.program(question)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": "Let's test our LLM workflow, i.e., the `WeaveCausalReasoningModel` on an example from the causal reasoning subset of Big-Bench Hard." + }, { "cell_type": "code", "execution_count": null, @@ -262,9 +241,18 @@ "source": [ "import rich\n", "\n", - "baseline_module = CausalReasoningModule()\n", + "SYSTEM_PROMPT = \"\"\"\n", + "You are an expert in the field of causal reasoning. 
You are to analyze the a given question carefully and answer in `Yes` or `No`.\n", + "You should also provide a detailed explanation justifying your answer.\n", + "\"\"\"\n", "\n", - "prediction = baseline_module(dspy_train_examples[0][\"question\"])\n", + "baseline_model = WeaveCausalReasoningModel(\n", + " openai_model=\"gpt-3.5-turbo\",\n", + " system_prompt=SYSTEM_PROMPT,\n", + " program=CausalReasoningModule(),\n", + ")\n", + "\n", + "prediction = baseline_model.predict(dspy_train_examples[0][\"question\"])\n", "rich.print(prediction)" ] }, @@ -303,9 +291,7 @@ "metadata": {}, "outputs": [], "source": [ - "validation_dataset = weave.ref(\n", - " f\"bigbenchhard_{metadata.big_bench_hard_task}_val:v0\"\n", - ").get()\n", + "validation_dataset = weave.ref(\"bigbenchhard_causal_judgement_val:v0\").get()\n", "\n", "evaluation = weave.Evaluation(\n", " name=\"baseline_causal_reasoning_module\",\n", @@ -313,7 +299,7 @@ " scorers=[weave_evaluation_scorer],\n", ")\n", "\n", - "await evaluation.evaluate(baseline_module.forward)" + "await evaluation.evaluate(baseline_model)" ] }, { @@ -325,7 +311,7 @@ "\n", "```python\n", "import asyncio\n", - "asyncio.run(evaluation.evaluate(baseline_module.forward))\n", + "asyncio.run(evaluation.evaluate(baseline_model))\n", "```\n", ":::\n", "\n", @@ -352,21 +338,37 @@ "from dspy.teleprompt import BootstrapFewShot\n", "\n", "\n", - "@weave.op()\n", - "def get_optimized_program(model: dspy.Module, metadata: Metadata) -> dspy.Module:\n", - " @weave.op()\n", - " def dspy_evaluation_metric(true, prediction, trace=None):\n", - " return prediction[\"answer\"].lower() == true.answer.lower()\n", + "class CausalReasoningOptimizer(weave.Model):\n", + " model: WeaveCausalReasoningModel\n", "\n", - " teleprompter = BootstrapFewShot(\n", - " metric=dspy_evaluation_metric,\n", - " max_bootstrapped_demos=metadata.max_bootstrapped_demos,\n", - " max_labeled_demos=metadata.max_labeled_demos,\n", - " )\n", - " return teleprompter.compile(model, trainset=dspy_train_examples)\n", + " @weave.op()\n", + " def get_optimized_program(\n", + " self, max_bootstrapped_demos: int, max_labeled_demos: int\n", + " ) -> weave.Model:\n", + " @weave.op()\n", + " def dspy_evaluation_metric(true, prediction, trace=None):\n", + " return prediction[\"answer\"].lower() == true.answer.lower()\n", + "\n", + " teleprompter = BootstrapFewShot(\n", + " metric=dspy_evaluation_metric,\n", + " max_bootstrapped_demos=max_bootstrapped_demos,\n", + " max_labeled_demos=max_labeled_demos,\n", + " )\n", + " with dspy.context(lm=self.model.language_model):\n", + " optimized_program = teleprompter.compile(\n", + " self.model.program, trainset=dspy_train_examples\n", + " )\n", + " return WeaveCausalReasoningModel(\n", + " openai_model=self.model.openai_model,\n", + " system_prompt=self.model.system_prompt,\n", + " program=optimized_program,\n", + " )\n", "\n", "\n", - "optimized_module = get_optimized_program(baseline_module, metadata)" + "optimizer = CausalReasoningOptimizer(model=baseline_model)\n", + "optimized_model = optimizer.get_optimized_program(\n", + " max_bootstrapped_demos=8, max_labeled_demos=8\n", + ")" ] }, { @@ -392,7 +394,7 @@ " scorers=[weave_evaluation_scorer],\n", ")\n", "\n", - "await evaluation.evaluate(optimized_module.forward)" + "await evaluation.evaluate(optimized_model)" ] }, {