Commit

Merge branch 'master' into griffin/eval-no-model-name
gtarpenning committed Oct 4, 2024
2 parents efadfbc + 1dac5f2 commit c15c759
Showing 90 changed files with 2,262 additions and 1,521 deletions.
33 changes: 25 additions & 8 deletions .github/workflows/check-which-tests-to-run.yaml
@@ -29,16 +29,33 @@ jobs:
fetch-depth: 0
fetch-tags: true
ref: ${{ github.head_ref }}
- name: Git setup
- name: Get changed files
run: |
git fetch origin ${{ github.base_ref }}
if [ "${{ github.event_name }}" = "pull_request" ]; then
base_sha=$(git rev-parse origin/${{ github.base_ref }})
head_sha=$(git rev-parse HEAD)
changed_files=$(git diff --name-only $base_sha $head_sha)
# Fetch all branches
git fetch --all
# Determine the base branch and current commit
if [[ "${{ github.event_name }}" == "pull_request" ]]; then
# For pull requests
BASE_BRANCH="${{ github.base_ref }}"
CURRENT_COMMIT="${{ github.event.pull_request.head.sha }}"
else
changed_files=$(git diff --name-only HEAD^)
# For pushes
BASE_BRANCH=$(git remote show origin | sed -n '/HEAD branch/s/.*: //p')
CURRENT_COMMIT="${{ github.sha }}"
fi
echo "Base branch is $BASE_BRANCH"
# Find the common ancestor
MERGE_BASE=$(git merge-base origin/$BASE_BRANCH $CURRENT_COMMIT)
# Get changed files
changed_files=$(git diff --name-only $MERGE_BASE $CURRENT_COMMIT)
echo "Changed files:"
echo "$changed_files"
echo "changed_files<<EOF" >> $GITHUB_ENV
echo "$changed_files" >> $GITHUB_ENV
echo "EOF" >> $GITHUB_ENV
- id: weave_query
name: Weave Query Checks
run: |
@@ -62,7 +79,7 @@ jobs:
- id: trace_server
name: Weave Trace Server Checks
run: |
for path in ${{ env.CORE_INTEGRATION_PATHS }}; do
for path in ${{ env.TRACE_SERVER_PATHS }}; do
if echo "$changed_files" | grep -q "$path"; then
echo "run_tests=true" >> $GITHUB_OUTPUT
exit 0
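The merge-base approach in this workflow can be exercised in isolation. The following is a minimal sketch in a throwaway repository (branch and file names are hypothetical), showing why diffing from the common ancestor, rather than from the base branch tip, excludes files that changed only on the base branch after the fork point:

```shell
#!/usr/bin/env sh
set -e
# Build a scratch repo with a fork point, one feature commit,
# and one later commit on main.
repo=$(mktemp -d)
cd "$repo"
git init -q
git checkout -q -b main
git config user.email ci@example.com
git config user.name ci
echo shared > common.txt
git add common.txt && git commit -q -m "initial"
git checkout -q -b feature
echo new > feature.txt
git add feature.txt && git commit -q -m "feature work"
git checkout -q main
echo later > main-only.txt
git add main-only.txt && git commit -q -m "landed on main after the fork"
# Same pattern as the workflow step: diff from the common ancestor.
MERGE_BASE=$(git merge-base main feature)
changed_files=$(git diff --name-only "$MERGE_BASE" feature)
# Only feature.txt is reported; main-only.txt is excluded.
echo "$changed_files"
```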
2 changes: 1 addition & 1 deletion .github/workflows/cla.yaml
@@ -27,4 +27,4 @@ jobs:
# branch should not be protected
branch: "cla"
# cannot use teams due to: https://github.com/contributor-assistant/github-action/issues/100
allowlist: actions-user, altay, bdytx5, dannygoldstein, davidwallacejackson, jamie-rasmussen, jlzhao27, jo-fang, jwlee64, laxels, morganmcg1, nickpenaranda, scottire, shawnlewis, staceysv, tssweeney, vanpelt, vwrj, wandbmachine
allowlist: actions-user, altay, andrewtruong, bdytx5, dannygoldstein, davidwallacejackson, jamie-rasmussen, jlzhao27, jo-fang, jwlee64, laxels, morganmcg1, nickpenaranda, scottire, shawnlewis, staceysv, tssweeney, vanpelt, vwrj, wandbmachine, [email protected]
5 changes: 3 additions & 2 deletions .github/workflows/test.yaml
@@ -20,7 +20,8 @@ jobs:
name: Build Legacy (Query Service) test container
timeout-minutes: 30
runs-on: [self-hosted, builder]
# runs-on: ubuntu-latest
outputs:
build_needed: ${{ steps.build_check.outputs.build_needed }}
env:
REGISTRY: us-east4-docker.pkg.dev/weave-support-367421/weave-images
needs: check-which-tests-to-run
@@ -70,7 +71,7 @@ jobs:
matrix:
job_num: [0, 1]
# runs-on: ubuntu-latest
container: ${{ needs.build-container-query-service.outputs.build_needed == 'true' && 'us-east4-docker.pkg.dev/weave-support-367421/weave-images/weave-test-python-query-service:${{ github.sha }}' || 'ubuntu:latest' }}
container: ${{ needs.build-container-query-service.outputs.build_needed == 'true' && format('us-east4-docker.pkg.dev/weave-support-367421/weave-images/weave-test-python-query-service:{0}', github.sha) || null }}
services:
wandbservice:
image: us-central1-docker.pkg.dev/wandb-production/images/local-testcontainer:master
Binary file added docs/docs/media/dspy_optimization/1.png
Binary file added docs/docs/media/dspy_optimization/2.png
Binary file added docs/docs/media/dspy_optimization/3.png
Binary file added docs/docs/media/dspy_optimization/4.png
Binary file added docs/docs/media/dspy_optimization/5.png
Binary file added docs/docs/media/intro/1.png
Binary file added docs/docs/media/intro/10.png
Binary file added docs/docs/media/intro/2.png
Binary file added docs/docs/media/intro/3.png
Binary file added docs/docs/media/intro/4.png
Binary file added docs/docs/media/intro/5.png
Binary file added docs/docs/media/intro/6.png
Binary file added docs/docs/media/intro/7.png
Binary file added docs/docs/media/intro/8.png
Binary file added docs/docs/media/intro/9.png
Binary file added docs/docs/media/summarization/dataset.png
Binary file added docs/docs/media/summarization/eval_dash.png
Binary file added docs/docs/media/summarization/model.png
20 changes: 20 additions & 0 deletions docs/docs/reference/gen_notebooks/01-intro_notebook.md
@@ -72,6 +72,8 @@ weave.init('project-name') # initialize tracking for a specific W&B project

Add the @weave.op decorator to the functions you want to track

![](../../media/intro/1.png)
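As a rough sketch of the idea behind an op decorator (a toy stand-in for illustration, not weave's actual implementation), a tracking decorator records the inputs and outputs of every call:

```python
import functools

def traced(fn):
    """Toy tracker: records each call's inputs and output on the wrapper."""
    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        result = fn(*args, **kwargs)
        wrapper.calls.append({"inputs": (args, kwargs), "output": result})
        return result
    wrapper.calls = []
    return wrapper

@traced
def correct_grammar(sentence: str) -> str:
    return sentence.capitalize()  # placeholder for a real LLM call

correct_grammar("hello world")
print(correct_grammar.calls[0]["output"])  # Hello world
```

weave's real decorator additionally versions the code and streams calls to the dashboard; this sketch only shows the capture pattern.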


```python
from openai import OpenAI
Expand Down Expand Up @@ -102,6 +104,8 @@ You can find your interactive dashboard by clicking any of the 👆 wandb links

Here, we're automatically tracking all calls to `openai`. We automatically track a lot of LLM libraries, but it's really easy to add support for whatever LLM you're using, as you'll see below.

![](../../media/intro/2.png)


```python
import weave
@@ -128,6 +132,8 @@ Now that you've seen the basics, let's combine all of the above and track some d



![](../../media/intro/3.png)


```python
from openai import OpenAI
@@ -169,6 +175,8 @@ print(result)

Whenever your code crashes, weave will highlight what caused the issue. This is especially useful for finding things like JSON parsing issues that can occasionally happen when parsing data from LLM responses.

![](../../media/intro/4.png)
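The kind of defensive parsing that avoids such crashes can be sketched as follows; `parse_llm_json` and its fence-stripping behavior are illustrative assumptions, not part of weave:

```python
import json
import re

FENCE = "`" * 3  # built indirectly so this snippet can live inside a fenced doc

def parse_llm_json(text: str) -> dict:
    """Parse JSON from an LLM reply, tolerating markdown code fences."""
    match = re.search(r"`{3}(?:json)?\s*(.*?)\s*`{3}", text, re.DOTALL)
    candidate = match.group(1) if match else text
    try:
        return json.loads(candidate)
    except json.JSONDecodeError as err:
        # Re-raise with context so a trace viewer pinpoints the bad payload.
        raise ValueError(f"model returned invalid JSON: {err}") from err

reply = FENCE + 'json\n{"status": "ok"}\n' + FENCE
print(parse_llm_json(reply))  # {'status': 'ok'}
```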


```python
import json
@@ -221,6 +229,8 @@ Organizing experimentation is difficult when there are many moving pieces. You c

It is often useful to track and version data, just like you track and version code. For example, here we define a `SystemPrompt(weave.Object)` object that can be shared between teammates

![](../../media/intro/5.png)


```python
import weave
@@ -242,6 +252,8 @@ weave.publish(system_prompt)

Models are such a common object type that we have a special class to represent them: `weave.Model`. The only requirement is that we define a `predict` method.

![](../../media/intro/6.png)


```python
from openai import OpenAI
@@ -283,6 +295,8 @@ print(result)

Similar to models, a `weave.Dataset` object exists to help track, organize, and operate on datasets.

![](../../media/intro/7.png)


```python
dataset = weave.Dataset(
@@ -309,6 +323,8 @@ Notice that we saved a versioned `GrammarCorrector` object that captures the con

You can publish objects and then retrieve them in your code. You can even call functions from your retrieved objects!

![](../../media/intro/8.png)


```python
import weave
@@ -324,6 +340,8 @@ ref = weave.publish(corrector)
print(ref.uri())
```

![](../../media/intro/9.png)


```python
import weave
@@ -346,6 +364,8 @@ Evaluation-driven development helps you reliably iterate on an application. The

See a preview of the API below:

![](../../media/intro/10.png)


```python
import weave
12 changes: 11 additions & 1 deletion docs/docs/reference/gen_notebooks/chain_of_density.md
@@ -20,6 +20,8 @@ title: Chain of Density Summarization

Summarizing complex technical documents while preserving crucial details is a challenging task. The Chain of Density (CoD) summarization technique offers a solution by iteratively refining summaries to be more concise and information-dense. This guide demonstrates how to implement CoD using Weave for tracking and evaluating the application.

![](../../media/summarization/eval_dash.png)
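The iterative refinement loop can be sketched independently of any framework. In this minimal sketch, `call_model` is a stand-in for a real LLM call and the prompts are placeholders, not the notebook's actual prompts:

```python
def chain_of_density(text: str, call_model, iterations: int = 3) -> str:
    """Iteratively fold missing entities back into a fixed-length summary."""
    summary = call_model(f"Write an initial sparse summary of:\n{text}")
    for _ in range(iterations):
        # 1) ask which informative entities the current summary is missing
        missing = call_model(
            f"List 1-3 informative entities missing from this summary:\n"
            f"{summary}\n\nSource:\n{text}"
        )
        # 2) rewrite at the same length, fusing the missing entities in
        summary = call_model(
            f"Rewrite the summary at the same length, adding: {missing}\n\n"
            f"Current summary:\n{summary}"
        )
    return summary
```

Each pass keeps the summary length roughly constant while increasing entity density, which is the core of the CoD technique.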

## What is Chain of Density Summarization?

[![arXiv](https://img.shields.io/badge/arXiv-2309.04269-b31b1b.svg)](https://arxiv.org/abs/2309.04269)
@@ -139,6 +141,8 @@ def load_pdf(pdf_url: str) -> str:

Now, let's implement the core CoD summarization logic using Weave operations:

![](../../media/summarization/summarization_trace.png)


```python
# Chain of Density Summarization
@@ -231,6 +235,8 @@ By using `@weave.op()` decorators, we ensure that Weave tracks the inputs, outpu

Now, let's wrap our summarization pipeline in a Weave Model:

![](../../media/summarization/model.png)


```python
# Weave Model
@@ -240,7 +246,7 @@ class ArxivChainOfDensityPipeline(weave.Model):

@weave.op()
def predict(self, paper: ArxivPaper, instruction: str) -> dict:
text = load_pdf(paper["pdf_url"])
text = load_pdf(paper.pdf_url)
result = chain_of_density_summarization(
text,
instruction,
@@ -320,6 +326,8 @@ These evaluation functions use the Claude model to assess the quality of the gen

To evaluate our pipeline, we'll create a Weave Dataset and run an evaluation:

![](../../media/summarization/dataset.png)


```python
# Create a Weave Dataset
@@ -340,6 +348,8 @@ For our evaluation, we'll use an LLM-as-a-judge approach. This technique involve

[![arXiv](https://img.shields.io/badge/arXiv-2306.05685-b31b1b.svg)](https://arxiv.org/abs/2306.05685)

![](../../media/summarization/eval_dash.png)
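A minimal judge-style scorer might look like the following sketch; `call_judge` is a stand-in for a real model call, and the JSON reply format is an assumption for illustration rather than the notebook's exact scorer:

```python
import json

def judge_faithfulness(source: str, summary: str, call_judge) -> dict:
    """Score a summary via a judge model; `call_judge` returns raw reply text."""
    prompt = (
        "Rate the summary's faithfulness to the source on a 1-5 scale. "
        'Answer with JSON only: {"score": <int>, "reason": "<short reason>"}\n\n'
        f"Source:\n{source}\n\nSummary:\n{summary}"
    )
    result = json.loads(call_judge(prompt))
    if not 1 <= result["score"] <= 5:
        raise ValueError("judge returned an out-of-range score")
    return result
```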


```python
# Define the scorer function
10 changes: 10 additions & 0 deletions docs/docs/reference/gen_notebooks/dspy_prompt_optimization.md
@@ -125,6 +125,8 @@ def get_dataset(metadata: Metadata):
dspy_train_examples, dspy_val_examples = get_dataset(metadata)
```

![](../../media/dspy_optimization/1.png)

## The DSPy Program

[DSPy](https://dspy-docs.vercel.app) is a framework that pushes building new LM pipelines away from manipulating free-form strings and closer to programming (composing modular operators to build text transformation graphs) where a compiler automatically generates optimized LM invocation strategies and prompts from a program.
@@ -189,6 +191,8 @@ prediction = baseline_module(dspy_train_examples[0]["question"])
rich.print(prediction)
```

![](../../media/dspy_optimization/2.png)

## Evaluating our DSPy Program

Now that we have a baseline prompting strategy, let's evaluate it on our validation set using [`weave.Evaluation`](../../guides/core-types/evaluations.md) on a simple metric that matches the predicted answer with the ground truth. Weave will take each example, pass it through your application and score the output on multiple custom scoring functions. By doing this, you'll have a view of the performance of your application, and a rich UI to drill into individual outputs and scores.
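The matching metric described here can be as simple as a normalized string comparison; the function name and signature below are illustrative, not the notebook's exact scorer:

```python
def match_score(answer: str, model_output: str) -> dict:
    """Exact match after trimming whitespace and case-folding."""
    return {"match": answer.strip().casefold() == model_output.strip().casefold()}
```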
@@ -219,6 +223,8 @@ evaluation = weave.Evaluation(
await evaluation.evaluate(baseline_module.forward)
```

![](../../media/dspy_optimization/3.png)

:::note
If you're running from a python script, you can use the following code to run the evaluation:

@@ -258,6 +264,8 @@ def get_optimized_program(model: dspy.Module, metadata: Metadata) -> dspy.Module
optimized_module = get_optimized_program(baseline_module, metadata)
```

![](../../media/dspy_optimization/4.png)

:::warning
Running the evaluation on the causal reasoning dataset will cost approximately $0.04 in OpenAI credits.
:::
@@ -275,6 +283,8 @@ evaluation = weave.Evaluation(
await evaluation.evaluate(optimized_module.forward)
```

![](../../media/dspy_optimization/5.png)

Comparing the evaluation of the baseline program with the optimized one shows that the optimized program answers the causal reasoning questions with significantly more accuracy.

## Conclusion