[AIC-py] promptfoo integration prototype (#317)

Make a custom provider using AIConfig and configure it with promptfoo YAML.

Run (TS connector):
`npx promptfoo@latest eval -c typescript/scripts/eval/promptfoo/examples/travel/travel_promptfooconfig.yaml`

Run (Py connector):
`npx promptfoo@latest eval -c python/src/aiconfig/eval/promptfoo/examples/travel/travel_promptfooconfig.yaml`

jonathanlastmileai authored Dec 4, 2023
2 parents 73974ca + b566b9c commit ad454a9
Showing 10 changed files with 244 additions and 0 deletions.
30 changes: 30 additions & 0 deletions python/src/aiconfig/eval/promptfoo/README.md
@@ -0,0 +1,30 @@
# Promptfoo integration

Use case: I'm a SWE who wants to run my AIConfig against a set of test cases specified in a config file. Each test case specifies an input and a success condition of my choosing.

## Philosophy / design

Promptfoo has a clean interface (for both inputs and outputs) that addresses this use case well. Tests are specified in a YAML file, and the test suite runs with a single command. The same config file makes it easy to connect your test suite to an AIConfig with a small amount of code.

## How-to guide

1. Write your test cases in a Promptfoo config file. See examples/travel/travel_promptfooconfig.yaml for an example.
2. Define an AIConfig test suite settings file. It should contain the prompt name and the path to your aiconfig. See examples/travel/travel_aiconfig_test_suite_settings.json for an example.
3. Set your provider to point to run_aiconfig.py, passing your settings file as the argument (see examples/travel/travel_promptfooconfig.yaml), like this:

```yaml
providers:
- exec:python ../../run_aiconfig.py ./travel_aiconfig_test_suite_settings.json
```

4. Export your provider API key, if needed, so it's available to subprocess environments:
   `export OPENAI_API_KEY=...`

5. Make sure your shell environment (including subshells) has a Python 3 executable named `python` on its PATH. One way to do this is to set up an Anaconda/Miniconda environment.

6. Run `npx promptfoo@latest eval -c path/to/promptfooconfig.yaml`. You should see one passing test and one failing test. (A sketch of the underlying provider invocation follows this list.)
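
Under the hood, promptfoo's `exec:` provider runs the configured command once per test. The sketch below assumes it appends the rendered test prompt as the final CLI argument (inferred from the provider config above and from how run_aiconfig.py reads its arguments) and that you run from the examples/travel directory:

```python
# Minimal sketch (assumed invocation contract): promptfoo appends the
# rendered test prompt as the final CLI argument after the settings path.
import subprocess

completed = subprocess.run(
    [
        "python",
        "../../run_aiconfig.py",
        "./travel_aiconfig_test_suite_settings.json",
        "Empire State Building is on fifth avenue. What is the cross street?",
    ],
    capture_output=True,
    text=True,
    check=True,
)
# promptfoo runs its assertions against whatever the connector prints to stdout.
print(completed.stdout.strip())
```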

### Debugging / Troubleshooting
Help, something went wrong!
-> Try adding the `--verbose` flag at the end of the `npx` command, e.g. `npx promptfoo@latest eval -c path/to/promptfooconfig.yaml --verbose`.
4 changes: 4 additions & 0 deletions python/src/aiconfig/eval/promptfoo/examples/travel/travel_aiconfig_test_suite_settings.json
@@ -0,0 +1,4 @@
{
  "prompt_name": "get_activities",
  "aiconfig_path": "travel_parametrized_for_testing.aiconfig.json"
}
36 changes: 36 additions & 0 deletions python/src/aiconfig/eval/promptfoo/examples/travel/travel_parametrized_for_testing.aiconfig.json
@@ -0,0 +1,36 @@
{
  "name": "NYC Trip Planner",
  "description": "Intrepid explorer with ChatGPT and AIConfig",
  "schema_version": "latest",
  "metadata": {
    "models": {
      "gpt-3.5-turbo": {
        "model": "gpt-3.5-turbo",
        "top_p": 1,
        "temperature": 1
      },
      "gpt-4": {
        "model": "gpt-4",
        "max_tokens": 3000,
        "system_prompt": "You are an expert travel coordinator with exquisite taste."
      }
    },
    "default_model": "gpt-3.5-turbo"
  },
  "prompts": [
    {
      "name": "get_activities",
      "input": "{{the_query}}"
    },
    {
      "name": "gen_itinerary",
      "input": "Generate an itinerary ordered by {{order_by}} for these activities: {{get_activities.output}}.",
      "metadata": {
        "model": "gpt-4",
        "parameters": {
          "order_by": "geographic location"
        }
      }
    }
  ]
}
16 changes: 16 additions & 0 deletions python/src/aiconfig/eval/promptfoo/examples/travel/travel_promptfooconfig.yaml
@@ -0,0 +1,16 @@
prompts: ["{{question}}"]
providers:
- exec:python ../../run_aiconfig.py ./travel_aiconfig_test_suite_settings.json
tests:
- description: "Test if output contains the expected substring"
  vars:
    question: Empire State Building is on fifth avenue. What is the cross street?
  assert:
  - type: python
    value: output.lower().find('34th street') != -1
- description: "Test if output is equal to the expected value"
  vars:
    question: "which is the best borough?"
  assert:
  - type: python
    value: output == "Brooklyn"
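
A note on the assertions above: for `type: python` asserts, promptfoo evaluates `value` as a Python expression with the provider's stdout bound to `output`, and the test passes if the result is truthy (assumed semantics). A rough sketch with made-up model outputs:

```python
# Rough sketch of the `type: python` assertion semantics (assumed):
# `value` is evaluated with the provider's stdout bound to `output`.
# Both model outputs below are hypothetical.
output_1 = "It's at the corner of Fifth Avenue and 34th Street."
output_2 = "Many would say Brooklyn, but it depends on what you like."

print(output_1.lower().find("34th street") != -1)  # True  -> first test passes
print(output_2 == "Brooklyn")                      # False -> second test fails (not an exact match)
```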
41 changes: 41 additions & 0 deletions python/src/aiconfig/eval/promptfoo/run_aiconfig.py
@@ -0,0 +1,41 @@
import asyncio
import json
import os
import sys
from typing import Any

import openai
from dotenv import load_dotenv

from aiconfig import AIConfigRuntime


async def main():
    # promptfoo invokes this script with the settings path (argv[1]) and the
    # rendered test prompt (argv[2]); see the provider config in the README.
    settings_path = sys.argv[1]
    settings = _load_settings(settings_path)

    question = sys.argv[2]
    prompt_name = settings["prompt_name"]
    aiconfig_path = settings["aiconfig_path"]

    # Load the OpenAI API key from the environment (or a .env file).
    load_dotenv()
    openai.api_key = os.getenv("OPENAI_API_KEY")
    runtime = AIConfigRuntime.load(aiconfig_path)

    params = {
        "the_query": question,
    }

    # Run the named prompt with the test input and print the text of the first
    # output; promptfoo captures stdout and runs its assertions against it.
    result = await runtime.run(prompt_name, params)
    final_output = runtime.get_output_text(prompt_name, result[0])
    print(final_output)


def _load_settings(settings_path: str) -> dict[str, Any]:
    with open(settings_path) as f:
        settings = json.load(f)
    return settings


if __name__ == "__main__":
    asyncio.run(main())
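
You can sanity-check the connector without promptfoo by calling it directly with a settings file and a question (the question below is just an illustration). Because the aiconfig path in the settings file is resolved relative to the working directory, run it from examples/travel: `cd examples/travel && python ../../run_aiconfig.py ./travel_aiconfig_test_suite_settings.json "What is there to do near the Empire State Building?"`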
30 changes: 30 additions & 0 deletions typescript/scripts/eval/promptfoo/README.md
@@ -0,0 +1,30 @@
# Promptfoo integration

Use case: I'm a SWE who wants to run my AIConfig against a set of test cases specified in a config file. Each test case specifies an input and a success condition of my choosing.

## Philosophy / design

Promptfoo has a clean interface (for both inputs and outputs) that addresses this use case well. Tests are specified in a YAML file, and the test suite runs with a single command. The same config file makes it easy to connect your test suite to an AIConfig with a small amount of code.

## How-to guide

1. Write your test cases in a Promptfoo config file. See examples/travel/travel_promptfooconfig.yaml for an example.
2. Define an AIConfig test suite settings file. It should contain the prompt name and the path to your aiconfig. See examples/travel/travel_aiconfig_test_suite_settings.json for an example.
3. Set your provider to point to run_aiconfig.ts, passing your settings file as the argument (see examples/travel/travel_promptfooconfig.yaml), like this:

```yaml
providers:
- exec:npx ts-node ../../run_aiconfig.ts ./travel_aiconfig_test_suite_settings.json
```

4. Export your provider API key, if needed, so it's available to subprocess environments:
   `export OPENAI_API_KEY=...`

5. Run `cd typescript; npx promptfoo@latest eval -c path/to/promptfooconfig.yaml`. You should see one passing test and one failing test. (A sketch of the underlying provider invocation follows this list.)
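
As with the Python connector, promptfoo's `exec:` provider runs the command once per test with the rendered prompt appended as the final argument (assumed contract). A minimal sketch of the equivalent call, run from the examples/travel directory:

```python
# Minimal sketch (assumed invocation contract) of how promptfoo calls the
# TypeScript connector; run from typescript/scripts/eval/promptfoo/examples/travel.
import subprocess

completed = subprocess.run(
    [
        "npx", "ts-node", "../../run_aiconfig.ts",
        "./travel_aiconfig_test_suite_settings.json",
        "which is the best borough?",
    ],
    capture_output=True, text=True, check=True,
)
print(completed.stdout.strip())  # promptfoo asserts against this stdout
```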

### Debugging / Troubleshooting

Help, something went wrong!
-> Try adding the `--verbose` flag at the end of the `npx` command, e.g. `npx promptfoo@latest eval -c path/to/promptfooconfig.yaml --verbose`.
4 changes: 4 additions & 0 deletions typescript/scripts/eval/promptfoo/examples/travel/travel_aiconfig_test_suite_settings.json
@@ -0,0 +1,4 @@
{
  "prompt_name": "get_activities",
  "aiconfig_path": "examples/travel/travel_parametrized_for_testing.aiconfig.json"
}
36 changes: 36 additions & 0 deletions typescript/scripts/eval/promptfoo/examples/travel/travel_parametrized_for_testing.aiconfig.json
@@ -0,0 +1,36 @@
{
  "name": "NYC Trip Planner",
  "description": "Intrepid explorer with ChatGPT and AIConfig",
  "schema_version": "latest",
  "metadata": {
    "models": {
      "gpt-3.5-turbo": {
        "model": "gpt-3.5-turbo",
        "top_p": 1,
        "temperature": 1
      },
      "gpt-4": {
        "model": "gpt-4",
        "max_tokens": 3000,
        "system_prompt": "You are an expert travel coordinator with exquisite taste."
      }
    },
    "default_model": "gpt-3.5-turbo"
  },
  "prompts": [
    {
      "name": "get_activities",
      "input": "{{the_query}}"
    },
    {
      "name": "gen_itinerary",
      "input": "Generate an itinerary ordered by {{order_by}} for these activities: {{get_activities.output}}.",
      "metadata": {
        "model": "gpt-4",
        "parameters": {
          "order_by": "geographic location"
        }
      }
    }
  ]
}
16 changes: 16 additions & 0 deletions typescript/scripts/eval/promptfoo/examples/travel/travel_promptfooconfig.yaml
@@ -0,0 +1,16 @@
prompts: ["{{question}}"]
providers:
- exec:npx ts-node ../../run_aiconfig.ts ./travel_aiconfig_test_suite_settings.json
tests:
- description: "Test if output contains the expected substring"
  vars:
    question: Empire State Building is on fifth avenue. What is the cross street?
  assert:
  - type: python
    value: output.lower().find('34th street') != -1
- description: "Test if output is equal to the expected value"
  vars:
    question: "which is the best borough?"
  assert:
  - type: python
    value: output == "Brooklyn"
31 changes: 31 additions & 0 deletions typescript/scripts/eval/promptfoo/run_aiconfig.ts
@@ -0,0 +1,31 @@
import { AIConfigRuntime } from "../../../lib/config";
import fs from "fs";
import path from "path";

async function main() {
  // promptfoo invokes this script with the settings path (argv[2]) and the
  // rendered test prompt (argv[3]); see the provider config in the README.
  const settings_path = process.argv[2];
  const question = process.argv[3];
  const settings = _load_settings(settings_path);
  const prompt_name = settings["prompt_name"];
  const aiconfig_path = settings["aiconfig_path"];

  // The aiconfig path in the settings file is resolved relative to this script.
  const fullAIConfigPath = path.join(__dirname, aiconfig_path);
  const runtime = AIConfigRuntime.load(fullAIConfigPath);
  const params = {
    the_query: question,
  };
  const result = await runtime.run(prompt_name, params);

  // run() may return a single output or an array of outputs; take the first.
  const r0 = Array.isArray(result) ? result[0] : result;
  const final_output = runtime.getOutputText(prompt_name, r0);
  // promptfoo captures stdout and runs its assertions against it.
  console.log(final_output);
}

function _load_settings(settings_path: string) {
  const rawdata = fs.readFileSync(settings_path, "utf-8");
  const settings = JSON.parse(rawdata);
  return settings;
}

main();
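
To sanity-check the connector without promptfoo, you can invoke it directly (the question below is illustrative). The settings path is resolved relative to your working directory, while the aiconfig path inside it is resolved relative to this script, so from typescript/scripts/eval/promptfoo: `npx ts-node run_aiconfig.ts examples/travel/travel_aiconfig_test_suite_settings.json "which is the best borough?"`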
