From a585ad7e93afc51134ddbd81cfc3b18db1a62a03 Mon Sep 17 00:00:00 2001 From: Eugene Yurtsev Date: Wed, 22 Nov 2023 14:33:26 -0500 Subject: [PATCH] Update standard agent evaluator + update notebooks (#78) This updates the standard agent evaluator to use order_matters --- .../tool_usage/multiverse_math.ipynb | 1067 +++++---- .../tool_usage/relational_data.ipynb | 1975 +++++++---------- .../notebooks/tool_usage/typewriter_1.ipynb | 1115 ++++++---- .../notebooks/tool_usage/typewriter_26.ipynb | 1043 ++++++--- langchain_benchmarks/tool_usage/evaluators.py | 97 +- tests/unit_tests/tool_usage/test_evaluator.py | 115 + 6 files changed, 2966 insertions(+), 2446 deletions(-) create mode 100644 tests/unit_tests/tool_usage/test_evaluator.py diff --git a/docs/source/notebooks/tool_usage/multiverse_math.ipynb b/docs/source/notebooks/tool_usage/multiverse_math.ipynb index 2843dc75..8bc81073 100644 --- a/docs/source/notebooks/tool_usage/multiverse_math.ipynb +++ b/docs/source/notebooks/tool_usage/multiverse_math.ipynb @@ -1,481 +1,588 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "60bb467d-861d-4b07-a48d-8e5aa177c969", - "metadata": { - "tags": [] - }, - "source": [ - "# Multiverse Math\n", - "\n", - "\n", - "Let's see how to evaluate an agent's ability to use tools.\n", - "\n", - " Solve basic math question using the provided tools.\n", - "\n", - " Must use the provided tools to solve the math question.\n", - "\n", - " To make sure that innate knowledge is not used, the math operations have been altered to yield different results than expected.\n", - "\n", - " The modified operations should yield different results, but still retain appropriate properties. For example, the modified multiplication operation should still be commutative.\n", - "\n", - " Please note that the modified operations are not guaranteed to even make sense in the real world since not all properties will be retained (e.g., distributive property)." 
- ] - }, - { - "cell_type": "markdown", - "id": "03488ab1-31ed-41c2-8da2-46b02599b181", - "metadata": {}, - "source": [ - "For this code to work, please configure LangSmith environment variables with your credentials." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "1615b8ff-688a-4447-8c4c-d64ad02818ed", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "os.environ[\"LANGCHAIN_API_KEY\"] = \"sk-...\" # Your LangSmith API key" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "b39159d0-9ea1-414f-a9d8-4a7b22b3d2cc", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langchain_benchmarks import clone_public_dataset, registry" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "1aef2b32-a5df-421f-8be3-a2ef27372ece", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
Name Multiverse Math
Type ToolUsageTask
Dataset ID https://smith.langchain.com/public/594f9f60-30a0-49bf-b075-f44beabf546a/d
DescriptionAn environment that contains a few basic math operations, but with altered results.\n", - "\n", - "For example, mu...
" - ], - "text/plain": [ - "ToolUsageTask(name='Multiverse Math', dataset_id='https://smith.langchain.com/public/594f9f60-30a0-49bf-b075-f44beabf546a/d', description='An environment that contains a few basic math operations, but with altered results.\\n\\nFor example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. The basic operations retain some basic properties, such as commutativity, associativity, and distributivity; however, the results are different than expected.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math.\\n', create_environment=, instructions='You are requested to solve math questions in an alternate mathematical universe. The operations have been altered to yield different results than expected. Do not guess the answer or rely on your innate knowledge of math. Use the provided tools to answer the question. While associativity and commutativity apply, distributivity does not. Answer the question using the fewest possible tools. Only include the numeric response without any clarifications.')" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "task = registry[\"Multiverse Math\"]\n", - "task" - ] - }, - { - "cell_type": "markdown", - "id": "bc33a639-3caf-4314-8ea7-1c7c8b1d114d", - "metadata": {}, - "source": [ - "Clone the dataset associaetd with this task" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "70369f67-deb4-467a-801a-6d38c3d0460d", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Dataset Multiverse Math already exists. 
Skipping.\n", - "You can access the dataset at https://smith.langchain.com/o/e081f11e-fbd2-41b4-9fa8-5d76c76ef854/datasets/ddca73f1-ceda-4562-8c49-7ee0a9df2a01.\n" - ] - } - ], - "source": [ - "clone_public_dataset(task.dataset_id, dataset_name=task.name)" - ] - }, - { - "cell_type": "markdown", - "id": "b462f7b8-fd42-4613-ab5f-5f3cbbc37d28", - "metadata": {}, - "source": [ - "Let's build an agent that we can use for evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "6142cf4e-862c-47a3-aa75-81d7d3231308", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'question': 'how much is 3 + 5',\n", - " 'output': 'In this alternate mathematical universe, the result of adding 3 and 5 is 9.2.',\n", - " 'intermediate_steps': [(AgentActionMessageLog(tool='add', tool_input={'a': 3, 'b': 5}, log=\"\\nInvoking: `add` with `{'a': 3, 'b': 5}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n \"a\": 3,\\n \"b\": 5\\n}', 'name': 'add'}})]),\n", - " 9.2)]}" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from langchain_benchmarks.tool_usage import agents\n", - "\n", - "agent_factory = agents.OpenAIAgentFactory(task, model=\"gpt-3.5-turbo-16k\")\n", - "\n", - "# Let's test that our agent works\n", - "agent = agent_factory.create()\n", - "agent.invoke({\"question\": \"how much is 3 + 5\"})" - ] - }, - { - "cell_type": "markdown", - "id": "3821e4b0-8e67-418a-840c-470fcde42df0", - "metadata": {}, - "source": [ - "## Eval\n", - "\n", - "Let's evaluate an agent now" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "fb32763c-79ab-426a-8fc6-bf8ebb0dd432", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "View the evaluation results for project 'test-excellent-potato-37' at:\n", - 
"https://smith.langchain.com/o/e081f11e-fbd2-41b4-9fa8-5d76c76ef854/projects/p/e350cda0-4e1d-49eb-8483-574172d1c635?eval=true\n", - "\n", - "View all tests for Dataset Multiverse Math at:\n", - "https://smith.langchain.com/o/e081f11e-fbd2-41b4-9fa8-5d76c76ef854/datasets/ddca73f1-ceda-4562-8c49-7ee0a9df2a01\n", - "[------------------------------------------------->] 10/10\n", - " Eval quantiles:\n", - " 0.25 0.5 0.75 mean \\\n", - "Intermediate steps correctness 0.00000 0.00000 0.00000 0.10000 \n", - "# steps / # expected steps 5.00000 7.50000 8.62500 7.75000 \n", - "correctness 0.00000 0.00000 0.00000 0.10000 \n", - "execution_time 38.76436 38.76436 38.76436 38.76436 \n", - "\n", - " mode \n", - "Intermediate steps correctness 0.00000 \n", - "# steps / # expected steps 5.00000 \n", - "correctness 0.00000 \n", - "execution_time 38.76436 \n" - ] - } - ], - "source": [ - "from langsmith.client import Client\n", - "\n", - "from langchain_benchmarks.tool_usage import STANDARD_AGENT_EVALUATOR\n", - "\n", - "client = Client()\n", - "\n", - "test_run = client.run_on_dataset(\n", - " dataset_name=task.name,\n", - " llm_or_chain_factory=agent_factory.create,\n", - " evaluation=STANDARD_AGENT_EVALUATOR,\n", - " verbose=True,\n", - " tags=[\"gpt-3.5-turbo-16k\"],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "1b039225-01cf-481a-87a6-4e880e9b1dcd", - "metadata": {}, - "source": [ - "# Inspect\n", - "\n", - "You can take a look at the underlying results." 
- ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "6eb19db1-43b8-4866-a3d2-f211ba92ab8b", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "\n", - "df = test_run.to_dataframe()\n", - "df = pd.json_normalize(df.to_dict(orient=\"records\"))" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "7ab5a8b9-a937-4537-b879-704284df4494", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "0.1" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[\"correctness\"].mean()" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "ab7516ed-36b1-4c16-bf4a-cc49077460ad", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "df[\"num_expected_steps\"] = df[\"reference.expected_steps\"].apply(len)\n", - "df[\"actual_number_of_steps\"] = df[\"output.intermediate_steps\"].apply(len)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "50d7590d-20de-4768-ac90-adcdbfa70068", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Intermediate steps correctness# steps / # expected stepscorrectnessexecution_timeinput.questionoutput.questionoutput.outputoutput.intermediate_stepsreference.referencereference.expected_stepsnum_expected_stepsactual_number_of_steps
0015.0038.76436Add 2 and 3Add 2 and 3Agent stopped due to iteration limit or time l...[(tool='add' tool_input={'a': 2, 'b': 3} log=\"...6.20[add]115
1015.0038.76436Subtract 3 from 2Subtract 3 from 2Agent stopped due to iteration limit or time l...[(tool='subtract' tool_input={'a': 2, 'b': 3} ...-4.00[subtract]115
209.0138.76436What is -5 if evaluated using the negate funct...What is -5 if evaluated using the negate funct...-5.0\\n-5.0[(tool='negate' tool_input={'a': -5} log=\"\\nIn...-5.00[negate]19
311.0038.76436what is the result of 2 to the power of 3?what is the result of 2 to the power of 3?The result of 2 to the power of 3 is 32.[(tool='power' tool_input={'a': 2, 'b': 3} log...32.00[power]11
407.5038.76436I ate 1 apple and 2 oranges every day for 7 da...I ate 1 apple and 2 oranges every day for 7 da...Agent stopped due to iteration limit or time l...[(tool='add' tool_input={'a': 1, 'b': 2} log=\"...32.34[multiply, add]215
\n", - "
" - ], - "text/plain": [ - " Intermediate steps correctness # steps / # expected steps correctness \\\n", - "0 0 15.0 0 \n", - "1 0 15.0 0 \n", - "2 0 9.0 1 \n", - "3 1 1.0 0 \n", - "4 0 7.5 0 \n", - "\n", - " execution_time input.question \\\n", - "0 38.76436 Add 2 and 3 \n", - "1 38.76436 Subtract 3 from 2 \n", - "2 38.76436 What is -5 if evaluated using the negate funct... \n", - "3 38.76436 what is the result of 2 to the power of 3? \n", - "4 38.76436 I ate 1 apple and 2 oranges every day for 7 da... \n", - "\n", - " output.question \\\n", - "0 Add 2 and 3 \n", - "1 Subtract 3 from 2 \n", - "2 What is -5 if evaluated using the negate funct... \n", - "3 what is the result of 2 to the power of 3? \n", - "4 I ate 1 apple and 2 oranges every day for 7 da... \n", - "\n", - " output.output \\\n", - "0 Agent stopped due to iteration limit or time l... \n", - "1 Agent stopped due to iteration limit or time l... \n", - "2 -5.0\\n-5.0 \n", - "3 The result of 2 to the power of 3 is 32. \n", - "4 Agent stopped due to iteration limit or time l... \n", - "\n", - " output.intermediate_steps reference.reference \\\n", - "0 [(tool='add' tool_input={'a': 2, 'b': 3} log=\"... 6.20 \n", - "1 [(tool='subtract' tool_input={'a': 2, 'b': 3} ... -4.00 \n", - "2 [(tool='negate' tool_input={'a': -5} log=\"\\nIn... -5.00 \n", - "3 [(tool='power' tool_input={'a': 2, 'b': 3} log... 32.00 \n", - "4 [(tool='add' tool_input={'a': 1, 'b': 2} log=\"... 
32.34 \n", - "\n", - " reference.expected_steps num_expected_steps actual_number_of_steps \n", - "0 [add] 1 15 \n", - "1 [subtract] 1 15 \n", - "2 [negate] 1 9 \n", - "3 [power] 1 1 \n", - "4 [multiply, add] 2 15 " - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.2" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file + "cells": [ + { + "cell_type": "markdown", + "id": "60bb467d-861d-4b07-a48d-8e5aa177c969", + "metadata": { + "tags": [] + }, + "source": [ + "# Multiverse Math\n", + "\n", + "In this task, the agent is operating in an alternate universe in which the basic mathematical operations like addition and multiplication are different.\n", + "\n", + "The agent must use tools that allow it to carry out calculations in this universe.\n", + "\n", + "This task can help verify that an agent is able to ignore its own knowledge of math and instead correctly use information returned by the tools.\n", + "\n", + "The modified mathematical operations yield different results, but still retain some properties (e.g., the modified multiplication operation is still commutative).\n", + "\n", + "Please note that the modified operations are not guaranteed to even make sense in the real world since not all properties will be retained (e.g., distributive property)." + ] + }, + { + "cell_type": "markdown", + "id": "03488ab1-31ed-41c2-8da2-46b02599b181", + "metadata": {}, + "source": [ + "For this code to work, please configure LangSmith environment variables with your credentials."
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "1615b8ff-688a-4447-8c4c-d64ad02818ed", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "os.environ[\"LANGCHAIN_API_KEY\"] = \"sk-...\" # Your LangSmith API key" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "b39159d0-9ea1-414f-a9d8-4a7b22b3d2cc", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain_benchmarks import clone_public_dataset, registry" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "1aef2b32-a5df-421f-8be3-a2ef27372ece", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Name Multiverse Math
Type ToolUsageTask
Dataset ID 594f9f60-30a0-49bf-b075-f44beabf546a
DescriptionAn environment that contains a few basic math operations, but with altered results.\n", + "\n", + "For example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. The basic operations retain some basic properties, such as commutativity, associativity, and distributivity; however, the results are different than expected.\n", + "\n", + "The objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math.
" + ], + "text/plain": [ + "ToolUsageTask(name='Multiverse Math', dataset_id='https://smith.langchain.com/public/594f9f60-30a0-49bf-b075-f44beabf546a/d', description='An environment that contains a few basic math operations, but with altered results.\\n\\nFor example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. The basic operations retain some basic properties, such as commutativity, associativity, and distributivity; however, the results are different than expected.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math.\\n', create_environment=, instructions='You are requested to solve math questions in an alternate mathematical universe. The operations have been altered to yield different results than expected. Do not guess the answer or rely on your innate knowledge of math. Use the provided tools to answer the question. While associativity and commutativity apply, distributivity does not. Answer the question using the fewest possible tools. Only include the numeric response without any clarifications.')" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "task = registry[\"Multiverse Math\"]\n", + "task" + ] + }, + { + "cell_type": "markdown", + "id": "bc33a639-3caf-4314-8ea7-1c7c8b1d114d", + "metadata": {}, + "source": [ + "Clone the dataset associaetd with this task" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "70369f67-deb4-467a-801a-6d38c3d0460d", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset Multiverse Math already exists. 
Skipping.\n", + "You can access the dataset at https://smith.langchain.com/o/e081f11e-fbd2-41b4-9fa8-5d76c76ef854/datasets/ddca73f1-ceda-4562-8c49-7ee0a9df2a01.\n" + ] + } + ], + "source": [ + "clone_public_dataset(task.dataset_id, dataset_name=task.name)" + ] + }, + { + "cell_type": "markdown", + "id": "cede4edd-884d-4330-a186-5058b712394b", + "metadata": {}, + "source": [ + "## The Environment\n", + "\n", + "Let's check the environment" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "e2439d0c-ccb9-4f5b-a127-548725025a98", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[StructuredTool(name='multiply', description='multiply(a: float, b: float) -> float - Multiply two numbers; a * b.', args_schema=, func=),\n", + " StructuredTool(name='add', description='add(a: float, b: float) -> float - Add two numbers; a + b.', args_schema=, func=),\n", + " StructuredTool(name='divide', description='divide(a: float, b: float) -> float - Divide two numbers; a / b.', args_schema=, func=),\n", + " StructuredTool(name='subtract', description='subtract(a: float, b: float) -> float - Subtract two numbers; a - b.', args_schema=, func=),\n", + " StructuredTool(name='power', description='power(a: float, b: float) -> float - Raise a number to a power; a ** b.', args_schema=, func=)]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "env = task.create_environment()\n", + "env.tools[:5]" + ] + }, + { + "cell_type": "markdown", + "id": "1941e187-55ee-4d38-b529-4744ea2474b0", + "metadata": {}, + "source": [ + "Multiplying 2 x 4 = 8.8!!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f5a100bd-6e19-498f-8a36-393b5c19bcb9", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "8.8" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "env.tools[0].invoke({\"a\": 2, \"b\": 4})" + ] + }, + { + "cell_type": "markdown", + "id": "bc60ef11-6300-4a83-989e-ec5b7f196796", + "metadata": {}, + "source": [ + "The task instructions" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "31afb08b-17b8-4866-86c1-ee24e804415c", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'You are requested to solve math questions in an alternate mathematical universe. The operations have been altered to yield different results than expected. Do not guess the answer or rely on your innate knowledge of math. Use the provided tools to answer the question. While associativity and commutativity apply, distributivity does not. Answer the question using the fewest possible tools. Only include the numeric response without any clarifications.'" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "task.instructions" + ] + }, + { + "cell_type": "markdown", + "id": "92d65770-6a4f-4029-beba-5fa9aeb18809", + "metadata": {}, + "source": [ + "## Agent" + ] + }, + { + "cell_type": "markdown", + "id": "b462f7b8-fd42-4613-ab5f-5f3cbbc37d28", + "metadata": {}, + "source": [ + "Let's build an agent that we can use for evaluation." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "6142cf4e-862c-47a3-aa75-81d7d3231308", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'question': 'how much is 3 + 5',\n", + " 'output': 'The result of 3 + 5 in this alternate mathematical universe is 9.2.',\n", + " 'intermediate_steps': [(AgentActionMessageLog(tool='add', tool_input={'a': 3, 'b': 5}, log=\"\\nInvoking: `add` with `{'a': 3, 'b': 5}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n \"a\": 3,\\n \"b\": 5\\n}', 'name': 'add'}})]),\n", + " 9.2)]}" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from langchain_benchmarks.tool_usage import agents\n", + "\n", + "agent_factory = agents.OpenAIAgentFactory(task, model=\"gpt-3.5-turbo-16k\")\n", + "\n", + "# Let's test that our agent works\n", + "agent = agent_factory.create()\n", + "agent.invoke({\"question\": \"how much is 3 + 5\"})" + ] + }, + { + "cell_type": "markdown", + "id": "3821e4b0-8e67-418a-840c-470fcde42df0", + "metadata": {}, + "source": [ + "## Eval\n", + "\n", + "Let's evaluate an agent now" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "fb32763c-79ab-426a-8fc6-bf8ebb0dd432", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "View the evaluation results for project 'test-weary-wing-36' at:\n", + "https://smith.langchain.com/o/e081f11e-fbd2-41b4-9fa8-5d76c76ef854/projects/p/33124759-882e-4a5c-a121-736310a40a1f?eval=true\n", + "\n", + "View all tests for Dataset Multiverse Math at:\n", + "https://smith.langchain.com/o/e081f11e-fbd2-41b4-9fa8-5d76c76ef854/datasets/ddca73f1-ceda-4562-8c49-7ee0a9df2a01\n", + "[------------------------------------------------->] 10/10\n", + " Eval quantiles:\n", + " 0.25 0.5 0.75 mean \\\n", + "Intermediate steps correctness 0.250000 1.000000 1.000000 0.700000 
\n", + "# steps / # expected steps 1.000000 1.000000 1.000000 1.033333 \n", + "correctness 0.000000 0.000000 1.000000 0.400000 \n", + "execution_time 5.771554 5.771554 5.771554 5.771554 \n", + "\n", + " mode \n", + "Intermediate steps correctness 1.000000 \n", + "# steps / # expected steps 1.000000 \n", + "correctness 0.000000 \n", + "execution_time 5.771554 \n" + ] + } + ], + "source": [ + "from langsmith.client import Client\n", + "\n", + "from langchain_benchmarks.tool_usage import STANDARD_AGENT_EVALUATOR\n", + "\n", + "client = Client()\n", + "\n", + "test_run = client.run_on_dataset(\n", + " dataset_name=task.name,\n", + " llm_or_chain_factory=agent_factory.create,\n", + " evaluation=STANDARD_AGENT_EVALUATOR,\n", + " verbose=True,\n", + " tags=[\"gpt-3.5-turbo-16k\"],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "1b039225-01cf-481a-87a6-4e880e9b1dcd", + "metadata": {}, + "source": [ + "# Inspect\n", + "\n", + "You can take a look at the underlying results." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "6eb19db1-43b8-4866-a3d2-f211ba92ab8b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "df = test_run.to_dataframe()\n", + "df = pd.json_normalize(df.to_dict(orient=\"records\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "7ab5a8b9-a937-4537-b879-704284df4494", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.4" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[\"correctness\"].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "ab7516ed-36b1-4c16-bf4a-cc49077460ad", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df[\"num_expected_steps\"] = df[\"reference.expected_steps\"].apply(len)\n", + "df[\"actual_number_of_steps\"] = df[\"output.intermediate_steps\"].apply(len)" + ] + }, + { + "cell_type": 
"code", + "execution_count": 12, + "id": "50d7590d-20de-4768-ac90-adcdbfa70068", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Intermediate steps correctness# steps / # expected stepscorrectnessexecution_timeinput.questionoutput.questionoutput.outputoutput.intermediate_stepsreference.referencereference.expected_stepsnum_expected_stepsactual_number_of_steps
011.015.771554Add 2 and 3Add 2 and 3The sum of 2 and 3 in this alternate mathemati...[(tool='add' tool_input={'a': 2, 'b': 3} log=\"...6.20[add]11
111.005.771554Subtract 3 from 2Subtract 3 from 2The result of subtracting 3 from 2 in this alt...[(tool='subtract' tool_input={'a': 2, 'b': 3} ...-4.00[subtract]11
211.015.771554What is -5 if evaluated using the negate funct...What is -5 if evaluated using the negate funct...The result of evaluating -5 using the negate f...[(tool='negate' tool_input={'a': -5} log=\"\\nIn...-5.00[negate]11
311.005.771554what is the result of 2 to the power of 3?what is the result of 2 to the power of 3?The result of 2 to the power of 3 is 32.[(tool='power' tool_input={'a': 2, 'b': 3} log...32.00[power]11
401.005.771554I ate 1 apple and 2 oranges every day for 7 da...I ate 1 apple and 2 oranges every day for 7 da...You ate a total of 32.34 fruits.[(tool='add' tool_input={'a': 1, 'b': 2} log=\"...32.34[multiply, add]22
\n", + "
" + ], + "text/plain": [ + " Intermediate steps correctness # steps / # expected steps correctness \\\n", + "0 1 1.0 1 \n", + "1 1 1.0 0 \n", + "2 1 1.0 1 \n", + "3 1 1.0 0 \n", + "4 0 1.0 0 \n", + "\n", + " execution_time input.question \\\n", + "0 5.771554 Add 2 and 3 \n", + "1 5.771554 Subtract 3 from 2 \n", + "2 5.771554 What is -5 if evaluated using the negate funct... \n", + "3 5.771554 what is the result of 2 to the power of 3? \n", + "4 5.771554 I ate 1 apple and 2 oranges every day for 7 da... \n", + "\n", + " output.question \\\n", + "0 Add 2 and 3 \n", + "1 Subtract 3 from 2 \n", + "2 What is -5 if evaluated using the negate funct... \n", + "3 what is the result of 2 to the power of 3? \n", + "4 I ate 1 apple and 2 oranges every day for 7 da... \n", + "\n", + " output.output \\\n", + "0 The sum of 2 and 3 in this alternate mathemati... \n", + "1 The result of subtracting 3 from 2 in this alt... \n", + "2 The result of evaluating -5 using the negate f... \n", + "3 The result of 2 to the power of 3 is 32. \n", + "4 You ate a total of 32.34 fruits. \n", + "\n", + " output.intermediate_steps reference.reference \\\n", + "0 [(tool='add' tool_input={'a': 2, 'b': 3} log=\"... 6.20 \n", + "1 [(tool='subtract' tool_input={'a': 2, 'b': 3} ... -4.00 \n", + "2 [(tool='negate' tool_input={'a': -5} log=\"\\nIn... -5.00 \n", + "3 [(tool='power' tool_input={'a': 2, 'b': 3} log... 32.00 \n", + "4 [(tool='add' tool_input={'a': 1, 'b': 2} log=\"... 
32.34 \n", + "\n", + " reference.expected_steps num_expected_steps actual_number_of_steps \n", + "0 [add] 1 1 \n", + "1 [subtract] 1 1 \n", + "2 [negate] 1 1 \n", + "3 [power] 1 1 \n", + "4 [multiply, add] 2 2 " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/source/notebooks/tool_usage/relational_data.ipynb b/docs/source/notebooks/tool_usage/relational_data.ipynb index 1c233c70..a79cae7d 100644 --- a/docs/source/notebooks/tool_usage/relational_data.ipynb +++ b/docs/source/notebooks/tool_usage/relational_data.ipynb @@ -1,1165 +1,812 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "60bb467d-861d-4b07-a48d-8e5aa177c969", - "metadata": {}, - "source": [ - "# Relational Data \n", - "\n", - "\n", - "Let's see how to evaluate an agent's ability to use tools." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "b39159d0-9ea1-414f-a9d8-4a7b22b3d2cc", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langchain_benchmarks import clone_public_dataset, registry" - ] - }, - { - "cell_type": "markdown", - "id": "03488ab1-31ed-41c2-8da2-46b02599b181", - "metadata": {}, - "source": [ - "For this code to work, please configure LangSmith environment variables with your credentials." 
- ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "60f22779-a948-4833-8e8c-ace9ef17f56f", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
Name Tool Usage - Relational Data
Type ToolUsageTask
Dataset ID https://smith.langchain.com/public/1d89f4b3-5f73-48cf-a127-2fdeb22f6d84/d
DescriptionEnvironment with fake data about users and their locations and favorite foods.\n", - "\n", - "The environment provides a set of tools that can be used to query the data.\n", - "\n", - "The objective of this task is to evaluate the ability to use the provided tools to answer questions about relational data.\n", - "\n", - "The dataset contains 21 examples of varying difficulty. The difficulty is measured by the number of tools that need to be used to answer the question.\n", - "\n", - "Each example is composed of a question, a reference answer, and information about the sequence in which tools should be used to answer the question.\n", - "\n", - "Success is measured by the ability to answer the question correctly, and efficiently.
" - ], - "text/plain": [ - "ToolUsageTask(name='Tool Usage - Relational Data', dataset_id='https://smith.langchain.com/public/1d89f4b3-5f73-48cf-a127-2fdeb22f6d84/d', description='Environment with fake data about users and their locations and favorite foods.\\n\\nThe environment provides a set of tools that can be used to query the data.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to answer questions about relational data.\\n\\nThe dataset contains 21 examples of varying difficulty. The difficulty is measured by the number of tools that need to be used to answer the question.\\n\\nEach example is composed of a question, a reference answer, and information about the sequence in which tools should be used to answer the question.\\n\\nSuccess is measured by the ability to answer the question correctly, and efficiently.\\n', create_environment=, instructions=\"Please answer the user's question by using the tools provided. Do not guess the answer. Keep in mind that entities like users,foods and locations have both a name and an ID, which are not the same.\")" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "task = registry[\"Tool Usage - Relational Data\"]\n", - "task" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "49be36d2-343e-49df-8369-dd5bac405d5e", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Environment with fake data about users and their locations and favorite foods.\n", - "\n", - "The environment provides a set of tools that can be used to query the data.\n", - "\n", - "The objective of this task is to evaluate the ability to use the provided tools to answer questions about relational data.\n", - "\n", - "The dataset contains 21 examples of varying difficulty. 
The difficulty is measured by the number of tools that need to be used to answer the question.\n", - "\n", - "Each example is composed of a question, a reference answer, and information about the sequence in which tools should be used to answer the question.\n", - "\n", - "Success is measured by the ability to answer the question correctly, and efficiently.\n" - ] - } - ], - "source": [ - "print(task.description)" - ] - }, - { - "cell_type": "markdown", - "id": "bc33a639-3caf-4314-8ea7-1c7c8b1d114d", - "metadata": {}, - "source": [ - "Clone the dataset associaetd with this task" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "70369f67-deb4-467a-801a-6d38c3d0460d", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Dataset Tool Usage - Relational Data already exists. Skipping.\n", - "You can access the dataset at https://smith.langchain.com/o/e081f11e-fbd2-41b4-9fa8-5d76c76ef854/datasets/69c0e0d0-91b5-4183-bed0-6628e76964dc.\n" - ] - } - ], - "source": [ - "clone_public_dataset(task.dataset_id, dataset_name=task.name)" - ] - }, - { - "cell_type": "markdown", - "id": "b462f7b8-fd42-4613-ab5f-5f3cbbc37d28", - "metadata": {}, - "source": [ - "## Define an agent\n", - "\n", - "Let's build an agent that we can use for evaluation." 
- ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "09469813-17b6-4456-a913-486a01a4b295", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langchain_benchmarks.tool_usage import agents" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "0ae8c6be-899c-44a6-a89b-0fc04c2cb05c", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "agent_factory = agents.OpenAIAgentFactory(task, model=\"gpt-3.5-turbo-16k\")" - ] - }, - { - "cell_type": "markdown", - "id": "87a64f76-65ae-4367-b43f-f2be3431e7af", - "metadata": {}, - "source": [ - "Let's test that our agent works" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "612fb603-1401-426b-8a19-4453ad5b698a", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "agent = agent_factory()" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "0e4896fa-3633-44a1-857f-80a263cf2e03", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'question': 'who is bob?',\n", - " 'output': 'Bob is a user with the ID 21.',\n", - " 'intermediate_steps': [(AgentActionMessageLog(tool='find_users_by_name', tool_input={'name': 'bob'}, log=\"\\nInvoking: `find_users_by_name` with `{'name': 'bob'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n \"name\": \"bob\"\\n}', 'name': 'find_users_by_name'}})]),\n", - " [{'id': 21, 'name': 'Bob'},\n", - " {'id': 41, 'name': 'Donna'},\n", - " {'id': 1, 'name': 'Alice'},\n", - " {'id': 35, 'name': 'Charlie'},\n", - " {'id': 42, 'name': 'Eve'},\n", - " {'id': 43, 'name': 'Frank The Cat'}])]}" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "agent.invoke({\"question\": \"who is bob?\"})" - ] - }, - { - "cell_type": "markdown", - "id": "3821e4b0-8e67-418a-840c-470fcde42df0", - "metadata": {}, - "source": [ - "## Eval\n", - "\n", - "Let's evaluate an 
agent now" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "513042fe-2878-44f8-ae84-05b9d521c1de", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langsmith.client import Client\n", - "\n", - "from langchain_benchmarks.tool_usage import STANDARD_AGENT_EVALUATOR" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "2bedd9d1-fc06-4066-9f89-b874ae818d82", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "client = Client()" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "aab7514e-a6ef-4c21-b90f-d9cbefcf5af1", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "View the evaluation results for project 'test-warm-whip-57' at:\n", - "https://smith.langchain.com/o/e081f11e-fbd2-41b4-9fa8-5d76c76ef854/projects/p/048077f0-52ca-4bae-8792-ec5e2a817d38?eval=true\n", - "\n", - "View all tests for Dataset Tool Usage - Relational Data at:\n", - "https://smith.langchain.com/o/e081f11e-fbd2-41b4-9fa8-5d76c76ef854/datasets/69c0e0d0-91b5-4183-bed0-6628e76964dc\n", - "[------------------------------------------------->] 21/21\n", - " Eval quantiles:\n", - " 0.25 0.5 0.75 mean \\\n", - "Intermediate steps correctness 0.000000 1.000000 1.000000 0.714286 \n", - "# steps / # expected steps 1.000000 1.000000 1.000000 0.928571 \n", - "correctness 1.000000 1.000000 1.000000 0.809524 \n", - "execution_time 5.098939 5.098939 5.098939 5.098939 \n", - "\n", - " mode \n", - "Intermediate steps correctness 1.000000 \n", - "# steps / # expected steps 1.000000 \n", - "correctness 1.000000 \n", - "execution_time 5.098939 \n" - ] - } - ], - "source": [ - "test_run = client.run_on_dataset(\n", - " dataset_name=task.name,\n", - " llm_or_chain_factory=agent_factory.create,\n", - " evaluation=STANDARD_AGENT_EVALUATOR,\n", - " verbose=True,\n", - " tags=[\"openai-functions\"],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": 
"1b039225-01cf-481a-87a6-4e880e9b1dcd", - "metadata": {}, - "source": [ - "# Inspect\n", - "\n", - "Here, we'll take a look at the underlying results a little bit." - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "6eb19db1-43b8-4866-a3d2-f211ba92ab8b", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "\n", - "df = test_run.to_dataframe()\n", - "df = pd.json_normalize(df.to_dict(orient=\"records\"))" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "7ab5a8b9-a937-4537-b879-704284df4494", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "0.8095238095238095" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[\"correctness\"].mean()" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "ab7516ed-36b1-4c16-bf4a-cc49077460ad", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "df[\"num_expected_steps\"] = df[\"reference.expected_steps\"].apply(len)\n", - "df[\"actual_number_of_steps\"] = df[\"output.intermediate_steps\"].apply(len)" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "50d7590d-20de-4768-ac90-adcdbfa70068", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Intermediate steps correctness# steps / # expected stepscorrectnessexecution_timeinput.questionoutput.questionoutput.outputoutput.intermediate_stepsreference.referencereference.order_mattersreference.expected_stepsnum_expected_stepsactual_number_of_steps
001.015.098939do bob and alice live in the same city?do bob and alice live in the same city?No, Bob and Alice do not live in the same city...[(tool='find_users_by_name' tool_input={'name'...noFalse[find_users_by_name, get_user_location, get_ci...55
100.005.098939Is it likely that Donna is outside with an umb...Is it likely that Donna is outside with an umb...I'm sorry, but I don't have access to real-tim...[]yesFalse[find_users_by_name, get_user_location, get_cu...40
211.015.098939do alice and charlie use the same email provider?do alice and charlie use the same email provider?No, Alice and Charlie do not use the same emai...[(tool='find_users_by_name' tool_input={'name'...noTrue[find_users_by_name, get_user_email, get_user_...33
300.005.098939Is it likely that Donna is awake right now?Is it likely that Donna is awake right now?I'm sorry, but I don't have access to informat...[]yesTrue[find_users_by_name, get_user_location, get_cu...30
401.015.098939Donna is about to go outside. Does she need an...Donna is about to go outside. Does she need an...Donna is currently in a location where it is r...[(tool='find_users_by_name' tool_input={'name'...yesTrue[find_users_by_name, get_user_location, get_cu...33
\n", - "
" - ], - "text/plain": [ - " Intermediate steps correctness # steps / # expected steps correctness \\\n", - "0 0 1.0 1 \n", - "1 0 0.0 0 \n", - "2 1 1.0 1 \n", - "3 0 0.0 0 \n", - "4 0 1.0 1 \n", - "\n", - " execution_time input.question \\\n", - "0 5.098939 do bob and alice live in the same city? \n", - "1 5.098939 Is it likely that Donna is outside with an umb... \n", - "2 5.098939 do alice and charlie use the same email provider? \n", - "3 5.098939 Is it likely that Donna is awake right now? \n", - "4 5.098939 Donna is about to go outside. Does she need an... \n", - "\n", - " output.question \\\n", - "0 do bob and alice live in the same city? \n", - "1 Is it likely that Donna is outside with an umb... \n", - "2 do alice and charlie use the same email provider? \n", - "3 Is it likely that Donna is awake right now? \n", - "4 Donna is about to go outside. Does she need an... \n", - "\n", - " output.output \\\n", - "0 No, Bob and Alice do not live in the same city... \n", - "1 I'm sorry, but I don't have access to real-tim... \n", - "2 No, Alice and Charlie do not use the same emai... \n", - "3 I'm sorry, but I don't have access to informat... \n", - "4 Donna is currently in a location where it is r... \n", - "\n", - " output.intermediate_steps reference.reference \\\n", - "0 [(tool='find_users_by_name' tool_input={'name'... no \n", - "1 [] yes \n", - "2 [(tool='find_users_by_name' tool_input={'name'... no \n", - "3 [] yes \n", - "4 [(tool='find_users_by_name' tool_input={'name'... yes \n", - "\n", - " reference.order_matters reference.expected_steps \\\n", - "0 False [find_users_by_name, get_user_location, get_ci... \n", - "1 False [find_users_by_name, get_user_location, get_cu... \n", - "2 True [find_users_by_name, get_user_email, get_user_... \n", - "3 True [find_users_by_name, get_user_location, get_cu... \n", - "4 True [find_users_by_name, get_user_location, get_cu... 
\n", - "\n", - " num_expected_steps actual_number_of_steps \n", - "0 5 5 \n", - "1 4 0 \n", - "2 3 3 \n", - "3 3 0 \n", - "4 3 3 " - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "ffab97b7-eda2-408d-b611-596b637e627a", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "df = df.sort_values(\"actual_number_of_steps\", ascending=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "20eb92f0-9373-4741-a851-b21c41f8c203", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Intermediate steps correctness# steps / # expected stepscorrectnessexecution_timeinput.questionoutput.questionoutput.outputoutput.intermediate_stepsreference.referencereference.order_mattersreference.expected_stepsnum_expected_stepsactual_number_of_steps
001.015.098939do bob and alice live in the same city?do bob and alice live in the same city?No, Bob and Alice do not live in the same city...[(tool='find_users_by_name' tool_input={'name'...noFalse[find_users_by_name, get_user_location, get_ci...55
211.015.098939do alice and charlie use the same email provider?do alice and charlie use the same email provider?No, Alice and Charlie do not use the same emai...[(tool='find_users_by_name' tool_input={'name'...noTrue[find_users_by_name, get_user_email, get_user_...33
401.015.098939Donna is about to go outside. Does she need an...Donna is about to go outside. Does she need an...Donna is currently in a location where it is r...[(tool='find_users_by_name' tool_input={'name'...yesTrue[find_users_by_name, get_user_location, get_cu...33
501.005.098939whats the name of the city where bob lives?whats the name of the city where bob lives?The name of the city where Bob lives is New York.[(tool='list_user_ids' tool_input={} log='\\nIn...Los AngelesTrue[find_users_by_name, get_user_location, get_ci...33
611.015.098939what is the current users favorite color and n...what is the current users favorite color and n...The current user's favorite color is yellow an...[(tool='get_current_user_id' tool_input={} log...yellow and CharlieTrue[get_current_user_id, get_user_favorite_color,...33
701.515.098939Frank who is Even's friend is allergic to dair...Frank who is Even's friend is allergic to dair...Frank's favorite food is the salad, which cont...[(tool='find_users_by_name' tool_input={'name'...yesTrue[find_users_by_name, get_food_allergic_ingredi...23
1111.015.098939list the allergens in chocolatelist the allergens in chocolateThe allergens in chocolate are milk and soy.[(tool='find_foods_by_name' tool_input={'food'...milk, soyTrue[find_foods_by_name, get_food_allergic_ingredi...22
1511.015.098939what is alice's email address?what is alice's email address?Alice's email address is alice@gmail.com.[(tool='find_users_by_name' tool_input={'name'...alice@gmail.comTrue[find_users_by_name, get_user_email]22
1411.015.098939find donna's favorite colorfind donna's favorite colorDonna's favorite color is green.[(tool='find_users_by_name' tool_input={'name'...greenTrue[find_users_by_name, get_user_favorite_color]22
1311.015.098939weather in LA right now?weather in LA right now?The current weather in Los Angeles is sunny wi...[(tool='find_locations_by_name' tool_input={'c...Sunny, Temperature: 75°FTrue[find_locations_by_name, get_current_weather_f...22
1211.005.098939time in chicagotime in chicagoThe current time in Chicago is 11:15 AM.[(tool='find_locations_by_name' tool_input={'c...2023-11-14 11:15 AMTrue[find_locations_by_name, get_current_time_for_...22
1011.015.098939If i eat a serving of pizza, how many calories...If i eat a serving of pizza, how many calories...If you eat a serving of pizza, you will consum...[(tool='find_foods_by_name' tool_input={'food'...285 caloriesTrue[find_foods_by_name, get_food_calories]22
911.015.098939what is the current users favorite color?what is the current users favorite color?The current user's favorite color is yellow.[(tool='get_current_user_id' tool_input={} log...yellowTrue[get_current_user_id, get_user_favorite_color]22
811.015.098939eve ate a serving of sushi, what allergens was...eve ate a serving of sushi, what allergens was...Eve was exposed to the allergens fish and soy ...[(tool='find_foods_by_name' tool_input={'food'...fish, soyTrue[find_foods_by_name, get_food_allergic_ingredi...22
1611.015.098939How many users by the name of bob?How many users by the name of bob?There are 1 user(s) with the name \"Bob\".[(tool='find_users_by_name' tool_input={'name'...1True[find_users_by_name]11
1711.015.098939get the current user idget the current user idThe current user ID is 35.[(tool='get_current_user_id' tool_input={} log...35True[get_current_user_id]11
1811.015.098939what is eve's user id?what is eve's user id?Eve's user ID is 42.[(tool='find_users_by_name' tool_input={'name'...42True[find_users_by_name]11
1911.015.098939What is the name of food with id 6?What is the name of food with id 6?The name of the food with ID 6 is Pasta.[(tool='get_food_name' tool_input={'food_id': ...PastaTrue[get_food_name]11
2011.015.098939What is the city for location ID 1?What is the city for location ID 1?The city for location ID 1 is New York.[(tool='get_city_for_location' tool_input={'lo...New YorkTrue[get_city_for_location]11
100.005.098939Is it likely that Donna is outside with an umb...Is it likely that Donna is outside with an umb...I'm sorry, but I don't have access to real-tim...[]yesFalse[find_users_by_name, get_user_location, get_cu...40
300.005.098939Is it likely that Donna is awake right now?Is it likely that Donna is awake right now?I'm sorry, but I don't have access to informat...[]yesTrue[find_users_by_name, get_user_location, get_cu...30
\n", - "
" - ], - "text/plain": [ - " Intermediate steps correctness # steps / # expected steps correctness \\\n", - "0 0 1.0 1 \n", - "2 1 1.0 1 \n", - "4 0 1.0 1 \n", - "5 0 1.0 0 \n", - "6 1 1.0 1 \n", - "7 0 1.5 1 \n", - "11 1 1.0 1 \n", - "15 1 1.0 1 \n", - "14 1 1.0 1 \n", - "13 1 1.0 1 \n", - "12 1 1.0 0 \n", - "10 1 1.0 1 \n", - "9 1 1.0 1 \n", - "8 1 1.0 1 \n", - "16 1 1.0 1 \n", - "17 1 1.0 1 \n", - "18 1 1.0 1 \n", - "19 1 1.0 1 \n", - "20 1 1.0 1 \n", - "1 0 0.0 0 \n", - "3 0 0.0 0 \n", - "\n", - " execution_time input.question \\\n", - "0 5.098939 do bob and alice live in the same city? \n", - "2 5.098939 do alice and charlie use the same email provider? \n", - "4 5.098939 Donna is about to go outside. Does she need an... \n", - "5 5.098939 whats the name of the city where bob lives? \n", - "6 5.098939 what is the current users favorite color and n... \n", - "7 5.098939 Frank who is Even's friend is allergic to dair... \n", - "11 5.098939 list the allergens in chocolate \n", - "15 5.098939 what is alice's email address? \n", - "14 5.098939 find donna's favorite color \n", - "13 5.098939 weather in LA right now? \n", - "12 5.098939 time in chicago \n", - "10 5.098939 If i eat a serving of pizza, how many calories... \n", - "9 5.098939 what is the current users favorite color? \n", - "8 5.098939 eve ate a serving of sushi, what allergens was... \n", - "16 5.098939 How many users by the name of bob? \n", - "17 5.098939 get the current user id \n", - "18 5.098939 what is eve's user id? \n", - "19 5.098939 What is the name of food with id 6? \n", - "20 5.098939 What is the city for location ID 1? \n", - "1 5.098939 Is it likely that Donna is outside with an umb... \n", - "3 5.098939 Is it likely that Donna is awake right now? \n", - "\n", - " output.question \\\n", - "0 do bob and alice live in the same city? \n", - "2 do alice and charlie use the same email provider? \n", - "4 Donna is about to go outside. Does she need an... 
\n", - "5 whats the name of the city where bob lives? \n", - "6 what is the current users favorite color and n... \n", - "7 Frank who is Even's friend is allergic to dair... \n", - "11 list the allergens in chocolate \n", - "15 what is alice's email address? \n", - "14 find donna's favorite color \n", - "13 weather in LA right now? \n", - "12 time in chicago \n", - "10 If i eat a serving of pizza, how many calories... \n", - "9 what is the current users favorite color? \n", - "8 eve ate a serving of sushi, what allergens was... \n", - "16 How many users by the name of bob? \n", - "17 get the current user id \n", - "18 what is eve's user id? \n", - "19 What is the name of food with id 6? \n", - "20 What is the city for location ID 1? \n", - "1 Is it likely that Donna is outside with an umb... \n", - "3 Is it likely that Donna is awake right now? \n", - "\n", - " output.output \\\n", - "0 No, Bob and Alice do not live in the same city... \n", - "2 No, Alice and Charlie do not use the same emai... \n", - "4 Donna is currently in a location where it is r... \n", - "5 The name of the city where Bob lives is New York. \n", - "6 The current user's favorite color is yellow an... \n", - "7 Frank's favorite food is the salad, which cont... \n", - "11 The allergens in chocolate are milk and soy. \n", - "15 Alice's email address is alice@gmail.com. \n", - "14 Donna's favorite color is green. \n", - "13 The current weather in Los Angeles is sunny wi... \n", - "12 The current time in Chicago is 11:15 AM. \n", - "10 If you eat a serving of pizza, you will consum... \n", - "9 The current user's favorite color is yellow. \n", - "8 Eve was exposed to the allergens fish and soy ... \n", - "16 There are 1 user(s) with the name \"Bob\". \n", - "17 The current user ID is 35. \n", - "18 Eve's user ID is 42. \n", - "19 The name of the food with ID 6 is Pasta. \n", - "20 The city for location ID 1 is New York. \n", - "1 I'm sorry, but I don't have access to real-tim... 
\n", - "3 I'm sorry, but I don't have access to informat... \n", - "\n", - " output.intermediate_steps \\\n", - "0 [(tool='find_users_by_name' tool_input={'name'... \n", - "2 [(tool='find_users_by_name' tool_input={'name'... \n", - "4 [(tool='find_users_by_name' tool_input={'name'... \n", - "5 [(tool='list_user_ids' tool_input={} log='\\nIn... \n", - "6 [(tool='get_current_user_id' tool_input={} log... \n", - "7 [(tool='find_users_by_name' tool_input={'name'... \n", - "11 [(tool='find_foods_by_name' tool_input={'food'... \n", - "15 [(tool='find_users_by_name' tool_input={'name'... \n", - "14 [(tool='find_users_by_name' tool_input={'name'... \n", - "13 [(tool='find_locations_by_name' tool_input={'c... \n", - "12 [(tool='find_locations_by_name' tool_input={'c... \n", - "10 [(tool='find_foods_by_name' tool_input={'food'... \n", - "9 [(tool='get_current_user_id' tool_input={} log... \n", - "8 [(tool='find_foods_by_name' tool_input={'food'... \n", - "16 [(tool='find_users_by_name' tool_input={'name'... \n", - "17 [(tool='get_current_user_id' tool_input={} log... \n", - "18 [(tool='find_users_by_name' tool_input={'name'... \n", - "19 [(tool='get_food_name' tool_input={'food_id': ... \n", - "20 [(tool='get_city_for_location' tool_input={'lo... \n", - "1 [] \n", - "3 [] \n", - "\n", - " reference.reference reference.order_matters \\\n", - "0 no False \n", - "2 no True \n", - "4 yes True \n", - "5 Los Angeles True \n", - "6 yellow and Charlie True \n", - "7 yes True \n", - "11 milk, soy True \n", - "15 alice@gmail.com True \n", - "14 green True \n", - "13 Sunny, Temperature: 75°F True \n", - "12 2023-11-14 11:15 AM True \n", - "10 285 calories True \n", - "9 yellow True \n", - "8 fish, soy True \n", - "16 1 True \n", - "17 35 True \n", - "18 42 True \n", - "19 Pasta True \n", - "20 New York True \n", - "1 yes False \n", - "3 yes True \n", - "\n", - " reference.expected_steps num_expected_steps \\\n", - "0 [find_users_by_name, get_user_location, get_ci... 
5 \n", - "2 [find_users_by_name, get_user_email, get_user_... 3 \n", - "4 [find_users_by_name, get_user_location, get_cu... 3 \n", - "5 [find_users_by_name, get_user_location, get_ci... 3 \n", - "6 [get_current_user_id, get_user_favorite_color,... 3 \n", - "7 [find_users_by_name, get_food_allergic_ingredi... 2 \n", - "11 [find_foods_by_name, get_food_allergic_ingredi... 2 \n", - "15 [find_users_by_name, get_user_email] 2 \n", - "14 [find_users_by_name, get_user_favorite_color] 2 \n", - "13 [find_locations_by_name, get_current_weather_f... 2 \n", - "12 [find_locations_by_name, get_current_time_for_... 2 \n", - "10 [find_foods_by_name, get_food_calories] 2 \n", - "9 [get_current_user_id, get_user_favorite_color] 2 \n", - "8 [find_foods_by_name, get_food_allergic_ingredi... 2 \n", - "16 [find_users_by_name] 1 \n", - "17 [get_current_user_id] 1 \n", - "18 [find_users_by_name] 1 \n", - "19 [get_food_name] 1 \n", - "20 [get_city_for_location] 1 \n", - "1 [find_users_by_name, get_user_location, get_cu... 4 \n", - "3 [find_users_by_name, get_user_location, get_cu... 
3 \n", - "\n", - " actual_number_of_steps \n", - "0 5 \n", - "2 3 \n", - "4 3 \n", - "5 3 \n", - "6 3 \n", - "7 3 \n", - "11 2 \n", - "15 2 \n", - "14 2 \n", - "13 2 \n", - "12 2 \n", - "10 2 \n", - "9 2 \n", - "8 2 \n", - "16 1 \n", - "17 1 \n", - "18 1 \n", - "19 1 \n", - "20 1 \n", - "1 0 \n", - "3 0 " - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.2" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file + "cells": [ + { + "cell_type": "markdown", + "id": "60bb467d-861d-4b07-a48d-8e5aa177c969", + "metadata": {}, + "source": [ + "# Relational Data \n", + "\n", + "In this task, an agent is given access to a set of tools that can be used to make queries across 3 relational tables.\n", + "\n", + "The tables contain information about users, locations and foods. 
The agent must answer questions about the data using the provided tools.\n", + "\n", + "The underlying data looks like this:\n", + "\n", + "User data:\n", + "\n", + "```json\n", + "{\n", + " \"id\": 1,\n", + " \"name\": \"Alice\",\n", + " \"email\": \"alice@gmail.com\",\n", + " \"location\": 1,\n", + " \"favorite_color\": \"red\",\n", + " \"favorite_foods\": [1, 2, 3],\n", + "},\n", + "{\n", + " \"id\": 21,\n", + " \"name\": \"Bob\",\n", + " \"email\": \"bob@hotmail.com\",\n", + " \"location\": 2,\n", + " \"favorite_color\": \"orange\",\n", + " \"favorite_foods\": [4, 5, 6],\n", + "}\n", + "```\n", + "\n", + "Food data:\n", + "\n", + "```json\n", + "{\n", + " \"id\": 1,\n", + " \"name\": \"Pizza\",\n", + " \"calories\": 285, # Calories per serving\n", + " \"allergic_ingredients\": [\"Gluten\", \"Dairy\"],\n", + "},\n", + "{\n", + " \"id\": 2,\n", + " \"name\": \"Chocolate\",\n", + " \"calories\": 50, # Calories per serving\n", + " \"allergic_ingredients\": [\"Milk\", \"Soy\"],\n", + "},\n", + "```\n", + "\n", + "The tools allow to look up information based on ids (e.g., `get_user_email` takes a user id and returns the email),\n", + "and to search (e.g., `find_foods_by_name` takes a food name and returns a list of results." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "b39159d0-9ea1-414f-a9d8-4a7b22b3d2cc", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain_benchmarks import clone_public_dataset, registry" + ] + }, + { + "cell_type": "markdown", + "id": "03488ab1-31ed-41c2-8da2-46b02599b181", + "metadata": {}, + "source": [ + "For this code to work, please configure LangSmith environment variables with your credentials." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "60f22779-a948-4833-8e8c-ace9ef17f56f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "task = registry[\"Tool Usage - Relational Data\"]" + ] + }, + { + "cell_type": "markdown", + "id": "bc33a639-3caf-4314-8ea7-1c7c8b1d114d", + "metadata": {}, + "source": [ + "Clone the dataset associaetd with this task" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "70369f67-deb4-467a-801a-6d38c3d0460d", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset Tool Usage - Relational Data already exists. Skipping.\n", + "You can access the dataset at https://smith.langchain.com/o/e081f11e-fbd2-41b4-9fa8-5d76c76ef854/datasets/69c0e0d0-91b5-4183-bed0-6628e76964dc.\n" + ] + } + ], + "source": [ + "clone_public_dataset(task.dataset_id, dataset_name=task.name)" + ] + }, + { + "cell_type": "markdown", + "id": "110bdafa-bdab-4194-90c9-46416d14b2f9", + "metadata": {}, + "source": [ + "## The Environment\n", + "\n", + "Let's check the environment" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "27b6b0fd-639d-43a7-a730-9acdc5b2f102", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[StructuredTool(name='get_user_name', description=\"get_user_name(user_id: int) -> str - Get the name of the user with the given user ID.\\n\\n Args:\\n user_id: The user's ID.\\n\\n Returns:\\n The user's name.\", args_schema=, func=.get_user_name at 0x7fc3d10904c0>),\n", + " StructuredTool(name='list_user_ids', description='list_user_ids() -> List[str] - List all the user IDs.', args_schema=, func=.list_user_ids at 0x7fc3d1090670>),\n", + " StructuredTool(name='find_users_by_name', description='find_users_by_name(name: str) -> List[langchain_benchmarks.tool_usage.tasks.relational_data.SearchHit] - Find users with the given name.\\n\\n Args:\\n name: The name to search for.\\n\\n 
Returns:\\n The list of matching users.', args_schema=, func=.find_users_by_name at 0x7fc3d1bc3700>),\n", + " StructuredTool(name='find_locations_by_name', description='find_locations_by_name(city: str) -> List[langchain_benchmarks.tool_usage.tasks.relational_data.SearchHit] - Find locations with the given city name.', args_schema=, func=.find_locations_by_name at 0x7fc3d145a8b0>),\n", + " StructuredTool(name='find_foods_by_name', description='find_foods_by_name(food: str) -> List[langchain_benchmarks.tool_usage.tasks.relational_data.SearchHit] - Find foods with the given name.', args_schema=, func=.find_foods_by_name at 0x7fc3d145adc0>)]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "env = task.create_environment()\n", + "env.tools[:5]" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "7f1c1242-449c-4536-863d-b62bf6d2dff1", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'Bob'" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "env.tools[0].invoke({\"user_id\": 21})" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "854e139b-a120-4012-bdf4-6394e0b1c42d", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'id': 2, 'city': 'Los Angeles'},\n", + " {'id': 1, 'city': 'New York'},\n", + " {'id': 3, 'city': 'Chicago'},\n", + " {'id': 4, 'city': 'Houston'},\n", + " {'id': 5, 'city': 'Miami'}]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "env.tools[3].invoke({\"city\": \"LA\"})" + ] + }, + { + "cell_type": "markdown", + "id": "b462f7b8-fd42-4613-ab5f-5f3cbbc37d28", + "metadata": {}, + "source": [ + "## Define an agent\n", + "\n", + "Let's build an agent that we can use for evaluation." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "09469813-17b6-4456-a913-486a01a4b295", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain_benchmarks.tool_usage import agents" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "0ae8c6be-899c-44a6-a89b-0fc04c2cb05c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "agent_factory = agents.OpenAIAgentFactory(task, model=\"gpt-3.5-turbo-16k\")" + ] + }, + { + "cell_type": "markdown", + "id": "87a64f76-65ae-4367-b43f-f2be3431e7af", + "metadata": {}, + "source": [ + "Let's test that our agent works" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "612fb603-1401-426b-8a19-4453ad5b698a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "agent = agent_factory()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "0e4896fa-3633-44a1-857f-80a263cf2e03", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'question': 'who is bob?',\n", + " 'output': 'Bob is a user with the ID 21.',\n", + " 'intermediate_steps': [(AgentActionMessageLog(tool='find_users_by_name', tool_input={'name': 'bob'}, log=\"\\nInvoking: `find_users_by_name` with `{'name': 'bob'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n \"name\": \"bob\"\\n}', 'name': 'find_users_by_name'}})]),\n", + " [{'id': 21, 'name': 'Bob'},\n", + " {'id': 41, 'name': 'Donna'},\n", + " {'id': 1, 'name': 'Alice'},\n", + " {'id': 35, 'name': 'Charlie'},\n", + " {'id': 42, 'name': 'Eve'},\n", + " {'id': 43, 'name': 'Frank The Cat'}])]}" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "agent.invoke({\"question\": \"who is bob?\"})" + ] + }, + { + "cell_type": "markdown", + "id": "3821e4b0-8e67-418a-840c-470fcde42df0", + "metadata": {}, + "source": [ + "## Eval\n", + "\n", + "Let's 
evaluate an agent now" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "513042fe-2878-44f8-ae84-05b9d521c1de", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langsmith.client import Client\n", + "\n", + "from langchain_benchmarks.tool_usage import STANDARD_AGENT_EVALUATOR" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "2bedd9d1-fc06-4066-9f89-b874ae818d82", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "client = Client()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "aab7514e-a6ef-4c21-b90f-d9cbefcf5af1", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "View the evaluation results for project 'test-fixed-self-71' at:\n", + "https://smith.langchain.com/o/e081f11e-fbd2-41b4-9fa8-5d76c76ef854/projects/p/1c2f10b1-370d-4062-9397-bab8189e8b95?eval=true\n", + "\n", + "View all tests for Dataset Tool Usage - Relational Data at:\n", + "https://smith.langchain.com/o/e081f11e-fbd2-41b4-9fa8-5d76c76ef854/datasets/69c0e0d0-91b5-4183-bed0-6628e76964dc\n", + "[------------------------------------------------->] 21/21\n", + " Eval quantiles:\n", + " 0.25 0.5 0.75 mean \\\n", + "Intermediate steps correctness 1.000000 1.000000 1.000000 0.761905 \n", + "# steps / # expected steps 1.000000 1.000000 1.000000 0.928571 \n", + "correctness 1.000000 1.000000 1.000000 0.809524 \n", + "execution_time 4.253613 4.253613 4.253613 4.253613 \n", + "\n", + " mode \n", + "Intermediate steps correctness 1.000000 \n", + "# steps / # expected steps 1.000000 \n", + "correctness 1.000000 \n", + "execution_time 4.253613 \n" + ] + } + ], + "source": [ + "test_run = client.run_on_dataset(\n", + " dataset_name=task.name,\n", + " llm_or_chain_factory=agent_factory.create,\n", + " evaluation=STANDARD_AGENT_EVALUATOR,\n", + " verbose=True,\n", + " tags=[\"openai-functions\"],\n", + ")" + ] + }, + { + "cell_type": "markdown", + 
"id": "1b039225-01cf-481a-87a6-4e880e9b1dcd", + "metadata": {}, + "source": [ + "# Inspect\n", + "\n", + "Here, we'll take a look at the underlying results a little bit." + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "e5bea796-c204-42a1-904b-216b964a8936", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8095238095238095" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "df = test_run.to_dataframe()\n", + "df = pd.json_normalize(df.to_dict(orient=\"records\"))\n", + "\n", + "df[\"correctness\"].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "423292ca-1ca8-4753-b35b-0916d35802b9", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Intermediate steps correctness# steps / # expected stepscorrectnessexecution_timeinput.questionoutput.questionoutput.outputoutput.intermediate_stepsreference.referencereference.order_mattersreference.expected_stepsnum_expected_stepsactual_number_of_steps
011.014.253613do bob and alice live in the same city?do bob and alice live in the same city?No, Bob and Alice do not live in the same city...[(tool='find_users_by_name' tool_input={'name'...noFalse[find_users_by_name, get_user_location, get_ci...55
100.004.253613Is it likely that Donna is outside with an umb...Is it likely that Donna is outside with an umb...I'm sorry, but I don't have access to real-tim...[]yesFalse[find_users_by_name, get_user_location, get_cu...40
211.014.253613do alice and charlie use the same email provider?do alice and charlie use the same email provider?No, Alice and Charlie do not use the same emai...[(tool='find_users_by_name' tool_input={'name'...noTrue[find_users_by_name, get_user_email, get_user_...33
300.004.253613Is it likely that Donna is awake right now?Is it likely that Donna is awake right now?I'm sorry, but I don't have access to informat...[]yesTrue[find_users_by_name, get_user_location, get_cu...30
401.014.253613Donna is about to go outside. Does she need an...Donna is about to go outside. Does she need an...Donna is at location 4 and the current weather...[(tool='find_users_by_name' tool_input={'name'...yesTrue[find_users_by_name, get_user_location, get_cu...33
\n", + "
" + ], + "text/plain": [ + " Intermediate steps correctness # steps / # expected steps correctness \\\n", + "0 1 1.0 1 \n", + "1 0 0.0 0 \n", + "2 1 1.0 1 \n", + "3 0 0.0 0 \n", + "4 0 1.0 1 \n", + "\n", + " execution_time input.question \\\n", + "0 4.253613 do bob and alice live in the same city? \n", + "1 4.253613 Is it likely that Donna is outside with an umb... \n", + "2 4.253613 do alice and charlie use the same email provider? \n", + "3 4.253613 Is it likely that Donna is awake right now? \n", + "4 4.253613 Donna is about to go outside. Does she need an... \n", + "\n", + " output.question \\\n", + "0 do bob and alice live in the same city? \n", + "1 Is it likely that Donna is outside with an umb... \n", + "2 do alice and charlie use the same email provider? \n", + "3 Is it likely that Donna is awake right now? \n", + "4 Donna is about to go outside. Does she need an... \n", + "\n", + " output.output \\\n", + "0 No, Bob and Alice do not live in the same city... \n", + "1 I'm sorry, but I don't have access to real-tim... \n", + "2 No, Alice and Charlie do not use the same emai... \n", + "3 I'm sorry, but I don't have access to informat... \n", + "4 Donna is at location 4 and the current weather... \n", + "\n", + " output.intermediate_steps reference.reference \\\n", + "0 [(tool='find_users_by_name' tool_input={'name'... no \n", + "1 [] yes \n", + "2 [(tool='find_users_by_name' tool_input={'name'... no \n", + "3 [] yes \n", + "4 [(tool='find_users_by_name' tool_input={'name'... yes \n", + "\n", + " reference.order_matters reference.expected_steps \\\n", + "0 False [find_users_by_name, get_user_location, get_ci... \n", + "1 False [find_users_by_name, get_user_location, get_cu... \n", + "2 True [find_users_by_name, get_user_email, get_user_... \n", + "3 True [find_users_by_name, get_user_location, get_cu... \n", + "4 True [find_users_by_name, get_user_location, get_cu... 
\n", + "\n", + " num_expected_steps actual_number_of_steps \n", + "0 5 5 \n", + "1 4 0 \n", + "2 3 3 \n", + "3 3 0 \n", + "4 3 3 " + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[\"num_expected_steps\"] = df[\"reference.expected_steps\"].apply(len)\n", + "df[\"actual_number_of_steps\"] = df[\"output.intermediate_steps\"].apply(len)\n", + "\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "ffab97b7-eda2-408d-b611-596b637e627a", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Intermediate steps correctness# steps / # expected stepscorrectnessexecution_timeinput.questionoutput.questionoutput.outputoutput.intermediate_stepsreference.referencereference.order_mattersreference.expected_stepsnum_expected_stepsactual_number_of_steps
011.014.253613do bob and alice live in the same city?do bob and alice live in the same city?No, Bob and Alice do not live in the same city...[(tool='find_users_by_name' tool_input={'name'...noFalse[find_users_by_name, get_user_location, get_ci...55
211.014.253613do alice and charlie use the same email provider?do alice and charlie use the same email provider?No, Alice and Charlie do not use the same emai...[(tool='find_users_by_name' tool_input={'name'...noTrue[find_users_by_name, get_user_email, get_user_...33
401.014.253613Donna is about to go outside. Does she need an...Donna is about to go outside. Does she need an...Donna is at location 4 and the current weather...[(tool='find_users_by_name' tool_input={'name'...yesTrue[find_users_by_name, get_user_location, get_cu...33
501.004.253613whats the name of the city where bob lives?whats the name of the city where bob lives?The name of the city where Bob lives is New York.[(tool='list_user_ids' tool_input={} log='\\nIn...Los AngelesTrue[find_users_by_name, get_user_location, get_ci...33
611.014.253613what is the current users favorite color and n...what is the current users favorite color and n...The current user's favorite color is yellow an...[(tool='get_current_user_id' tool_input={} log...yellow and CharlieTrue[get_current_user_id, get_user_favorite_color,...33
\n", + "
" + ], + "text/plain": [ + " Intermediate steps correctness # steps / # expected steps correctness \\\n", + "0 1 1.0 1 \n", + "2 1 1.0 1 \n", + "4 0 1.0 1 \n", + "5 0 1.0 0 \n", + "6 1 1.0 1 \n", + "\n", + " execution_time input.question \\\n", + "0 4.253613 do bob and alice live in the same city? \n", + "2 4.253613 do alice and charlie use the same email provider? \n", + "4 4.253613 Donna is about to go outside. Does she need an... \n", + "5 4.253613 whats the name of the city where bob lives? \n", + "6 4.253613 what is the current users favorite color and n... \n", + "\n", + " output.question \\\n", + "0 do bob and alice live in the same city? \n", + "2 do alice and charlie use the same email provider? \n", + "4 Donna is about to go outside. Does she need an... \n", + "5 whats the name of the city where bob lives? \n", + "6 what is the current users favorite color and n... \n", + "\n", + " output.output \\\n", + "0 No, Bob and Alice do not live in the same city... \n", + "2 No, Alice and Charlie do not use the same emai... \n", + "4 Donna is at location 4 and the current weather... \n", + "5 The name of the city where Bob lives is New York. \n", + "6 The current user's favorite color is yellow an... \n", + "\n", + " output.intermediate_steps reference.reference \\\n", + "0 [(tool='find_users_by_name' tool_input={'name'... no \n", + "2 [(tool='find_users_by_name' tool_input={'name'... no \n", + "4 [(tool='find_users_by_name' tool_input={'name'... yes \n", + "5 [(tool='list_user_ids' tool_input={} log='\\nIn... Los Angeles \n", + "6 [(tool='get_current_user_id' tool_input={} log... yellow and Charlie \n", + "\n", + " reference.order_matters reference.expected_steps \\\n", + "0 False [find_users_by_name, get_user_location, get_ci... \n", + "2 True [find_users_by_name, get_user_email, get_user_... \n", + "4 True [find_users_by_name, get_user_location, get_cu... \n", + "5 True [find_users_by_name, get_user_location, get_ci... 
\n", + "6 True [get_current_user_id, get_user_favorite_color,... \n", + "\n", + " num_expected_steps actual_number_of_steps \n", + "0 5 5 \n", + "2 3 3 \n", + "4 3 3 \n", + "5 3 3 \n", + "6 3 3 " + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.sort_values(\"actual_number_of_steps\", ascending=False).head()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/source/notebooks/tool_usage/typewriter_1.ipynb b/docs/source/notebooks/tool_usage/typewriter_1.ipynb index b8a7effc..1105cac8 100644 --- a/docs/source/notebooks/tool_usage/typewriter_1.ipynb +++ b/docs/source/notebooks/tool_usage/typewriter_1.ipynb @@ -1,486 +1,631 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "60bb467d-861d-4b07-a48d-8e5aa177c969", - "metadata": { - "tags": [] - }, - "source": [ - "# Typewriter: Single Tool\n", - "\n", - "\n", - "Let's see how to evaluate an agent's ability to use tools.\n", - "\n", - " A task where the agent must type a given string one letter at a time.\n", - "\n", - " In this variation of the task, the agent is given a single function,\n", - " that takes a letter as an argument." 
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "b39159d0-9ea1-414f-a9d8-4a7b22b3d2cc", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langchain_benchmarks import clone_public_dataset, registry" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "1aef2b32-a5df-421f-8be3-a2ef27372ece", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
Name Tool Usage - Typewriter (1 tool)
Type ToolUsageTask
Dataset ID 59577193-8938-4ccf-92a7-e8a96bcf4f86
DescriptionEnvironment with a single tool that accepts a single letter as input, and prints it on a piece of virtual paper.\n", - "\n", - "The objective of this task is to evaluate the ability of the model to use the provided tools to repeat a given input string.\n", - "\n", - "For example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\n", - "\n", - "The dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.
" - ], - "text/plain": [ - "ToolUsageTask(name='Tool Usage - Typewriter (1 tool)', dataset_id='https://smith.langchain.com/public/59577193-8938-4ccf-92a7-e8a96bcf4f86/d', description=\"Environment with a single tool that accepts a single letter as input, and prints it on a piece of virtual paper.\\n\\nThe objective of this task is to evaluate the ability of the model to use the provided tools to repeat a given input string.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\", create_environment=, instructions=\"Repeat the given string using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must print the letters 'a', 'b', and 'c' one at a time and in that order. \")" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "task = registry[\"Tool Usage - Typewriter (1 tool)\"]\n", - "task" - ] - }, - { - "cell_type": "markdown", - "id": "bc33a639-3caf-4314-8ea7-1c7c8b1d114d", - "metadata": {}, - "source": [ - "Clone the dataset associaetd with this task" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "70369f67-deb4-467a-801a-6d38c3d0460d", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Dataset Tool Usage - Typewriter (1 tool) already exists. Skipping.\n", - "You can access the dataset at https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/82ca6840-cf23-4bb0-a9be-55237ebbe9d3.\n" - ] - } - ], - "source": [ - "clone_public_dataset(task.dataset_id, dataset_name=task.name)" - ] - }, - { - "cell_type": "markdown", - "id": "b462f7b8-fd42-4613-ab5f-5f3cbbc37d28", - "metadata": {}, - "source": [ - "Let's build an agent that we can use for evaluation." 
- ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "6142cf4e-862c-47a3-aa75-81d7d3231308", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'question': 'abc',\n", - " 'output': 'a, b, c',\n", - " 'intermediate_steps': [(AgentActionMessageLog(tool='type_letter', tool_input={'letter': 'a'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'a'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n \"letter\": \"a\"\\n}', 'name': 'type_letter'}})]),\n", - " 'OK'),\n", - " (AgentActionMessageLog(tool='type_letter', tool_input={'letter': 'b'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'b'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n \"letter\": \"b\"\\n}', 'name': 'type_letter'}})]),\n", - " 'OK'),\n", - " (AgentActionMessageLog(tool='type_letter', tool_input={'letter': 'c'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'c'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n \"letter\": \"c\"\\n}', 'name': 'type_letter'}})]),\n", - " 'OK')],\n", - " 'state': 'abc'}" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from langchain_benchmarks.tool_usage import agents\n", - "\n", - "agent_factory = agents.OpenAIAgentFactory(task, model=\"gpt-3.5-turbo-16k\")\n", - "\n", - "# Let's test that our agent works\n", - "agent = agent_factory.create()\n", - "agent.invoke({\"question\": \"abc\"})" - ] - }, - { - "cell_type": "markdown", - "id": "3821e4b0-8e67-418a-840c-470fcde42df0", - "metadata": {}, - "source": [ - "## Eval\n", - "\n", - "Let's evaluate an agent now" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "fb32763c-79ab-426a-8fc6-bf8ebb0dd432", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "View 
the evaluation results for project 'test-fresh-whip-11' at:\n", - "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/c0c32118-d413-409f-ac01-088632c0e6ab?eval=true\n", - "\n", - "View all tests for Dataset Tool Usage - Typewriter (1 tool) at:\n", - "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/82ca6840-cf23-4bb0-a9be-55237ebbe9d3\n", - "[------------------------------------------------->] 20/20\n", - " Eval quantiles:\n", - " 0.25 0.5 0.75 mean mode\n", - "Intermediate steps correctness 1.00 1.0 1.0 0.95 1.0\n", - "# steps / # expected steps 1.00 1.0 1.0 1.70 1.0\n", - "Correct Final State 1.00 1.0 1.0 0.95 1.0\n", - "correctness 0.75 1.0 1.0 0.75 1.0\n" - ] - } - ], - "source": [ - "from langsmith.client import Client\n", - "\n", - "from langchain_benchmarks.tool_usage import STANDARD_AGENT_EVALUATOR\n", - "\n", - "client = Client()\n", - "\n", - "test_run = client.run_on_dataset(\n", - " dataset_name=task.name,\n", - " llm_or_chain_factory=agent_factory.create,\n", - " evaluation=STANDARD_AGENT_EVALUATOR,\n", - " verbose=True,\n", - " tags=[\"gpt-3.5-turbo-16k\"],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "1b039225-01cf-481a-87a6-4e880e9b1dcd", - "metadata": {}, - "source": [ - "# Inspect\n", - "\n", - "You can take a look at the underlying results." 
- ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "6eb19db1-43b8-4866-a3d2-f211ba92ab8b", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "\n", - "df = test_run.to_dataframe()\n", - "df = pd.json_normalize(df.to_dict(orient=\"records\"))" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "7ab5a8b9-a937-4537-b879-704284df4494", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "0.75" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[\"correctness\"].mean()" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "ab7516ed-36b1-4c16-bf4a-cc49077460ad", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "df[\"num_expected_steps\"] = df[\"reference.expected_steps\"].apply(len)\n", - "df[\"actual_number_of_steps\"] = df[\"output.intermediate_steps\"].apply(len)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "50d7590d-20de-4768-ac90-adcdbfa70068", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Intermediate steps correctness# steps / # expected stepsCorrect Final Statecorrectnessinput.questionoutput.questionoutput.outputoutput.intermediate_stepsoutput.statereference.statereference.referencereference.expected_stepsnum_expected_stepsactual_number_of_steps
011.011communicationcommunicationcommunication[(tool='type_letter' tool_input={'letter': 'c'...communicationcommunicationcommunication[type_letter, type_letter, type_letter, type_l...1313
111.011informationinformationinformation[(tool='type_letter' tool_input={'letter': 'i'...informationinformationinformation[type_letter, type_letter, type_letter, type_l...1111
211.011dictionarydictionarydictionary[(tool='type_letter' tool_input={'letter': 'd'...dictionarydictionarydictionary[type_letter, type_letter, type_letter, type_l...1010
311.011universityuniversityu\\nn\\ni\\nv\\ne\\nr\\ns\\ni\\nt\\ny[(tool='type_letter' tool_input={'letter': 'u'...universityuniversityuniversity[type_letter, type_letter, type_letter, type_l...1010
411.011keyboardkeyboardkeyboard[(tool='type_letter' tool_input={'letter': 'k'...keyboardkeyboardkeyboard[type_letter, type_letter, type_letter, type_l...88
\n", - "
" - ], - "text/plain": [ - " Intermediate steps correctness # steps / # expected steps \\\n", - "0 1 1.0 \n", - "1 1 1.0 \n", - "2 1 1.0 \n", - "3 1 1.0 \n", - "4 1 1.0 \n", - "\n", - " Correct Final State correctness input.question output.question \\\n", - "0 1 1 communication communication \n", - "1 1 1 information information \n", - "2 1 1 dictionary dictionary \n", - "3 1 1 university university \n", - "4 1 1 keyboard keyboard \n", - "\n", - " output.output \\\n", - "0 communication \n", - "1 information \n", - "2 dictionary \n", - "3 u\\nn\\ni\\nv\\ne\\nr\\ns\\ni\\nt\\ny \n", - "4 keyboard \n", - "\n", - " output.intermediate_steps output.state \\\n", - "0 [(tool='type_letter' tool_input={'letter': 'c'... communication \n", - "1 [(tool='type_letter' tool_input={'letter': 'i'... information \n", - "2 [(tool='type_letter' tool_input={'letter': 'd'... dictionary \n", - "3 [(tool='type_letter' tool_input={'letter': 'u'... university \n", - "4 [(tool='type_letter' tool_input={'letter': 'k'... keyboard \n", - "\n", - " reference.state reference.reference \\\n", - "0 communication communication \n", - "1 information information \n", - "2 dictionary dictionary \n", - "3 university university \n", - "4 keyboard keyboard \n", - "\n", - " reference.expected_steps num_expected_steps \\\n", - "0 [type_letter, type_letter, type_letter, type_l... 13 \n", - "1 [type_letter, type_letter, type_letter, type_l... 11 \n", - "2 [type_letter, type_letter, type_letter, type_l... 10 \n", - "3 [type_letter, type_letter, type_letter, type_l... 10 \n", - "4 [type_letter, type_letter, type_letter, type_l... 
8 \n", - "\n", - " actual_number_of_steps \n", - "0 13 \n", - "1 11 \n", - "2 10 \n", - "3 10 \n", - "4 8 " - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "62bcf6c2-6449-4967-a4f4-2f3d90657a52", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.2" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file + "cells": [ + { + "cell_type": "markdown", + "id": "60bb467d-861d-4b07-a48d-8e5aa177c969", + "metadata": { + "tags": [] + }, + "source": [ + "# Typewriter: Single Tool\n", + "\n", + "In this task, an agent is given access to a single tool called \"type_letter\".\n", + "This tool takes one argument called \"letter\" which is expected to be a character.\n", + "\n", + "The agent must repeat the input string from the user, printing one\n", + "character at a time on a piece of virtual paper.\n", + "\n", + "The agent is evaluated based on its ability to print the correct string using\n", + "the \"type_letter\" tool." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "b39159d0-9ea1-414f-a9d8-4a7b22b3d2cc", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain_benchmarks import clone_public_dataset, registry" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "1aef2b32-a5df-421f-8be3-a2ef27372ece", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Name Tool Usage - Typewriter (1 tool)
Type ToolUsageTask
Dataset ID 59577193-8938-4ccf-92a7-e8a96bcf4f86
DescriptionEnvironment with a single tool that accepts a single letter as input, and prints it on a piece of virtual paper.\n", + "\n", + "The objective of this task is to evaluate the ability of the model to use the provided tools to repeat a given input string.\n", + "\n", + "For example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\n", + "\n", + "The dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.
" + ], + "text/plain": [ + "ToolUsageTask(name='Tool Usage - Typewriter (1 tool)', dataset_id='https://smith.langchain.com/public/59577193-8938-4ccf-92a7-e8a96bcf4f86/d', description=\"Environment with a single tool that accepts a single letter as input, and prints it on a piece of virtual paper.\\n\\nThe objective of this task is to evaluate the ability of the model to use the provided tools to repeat a given input string.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\", create_environment=, instructions=\"Repeat the given string using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must print the letters 'a', 'b', and 'c' one at a time and in that order. \")" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "task = registry[\"Tool Usage - Typewriter (1 tool)\"]\n", + "task" + ] + }, + { + "cell_type": "markdown", + "id": "bc33a639-3caf-4314-8ea7-1c7c8b1d114d", + "metadata": {}, + "source": [ + "Clone the dataset associated with this task" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "70369f67-deb4-467a-801a-6d38c3d0460d", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset Tool Usage - Typewriter (1 tool) already exists.
Skipping.\n", + "You can access the dataset at https://smith.langchain.com/o/e081f11e-fbd2-41b4-9fa8-5d76c76ef854/datasets/25850d74-d4e0-41ac-81a1-dfc78a79660b.\n" + ] + } + ], + "source": [ + "clone_public_dataset(task.dataset_id, dataset_name=task.name)" + ] + }, + { + "cell_type": "markdown", + "id": "fc78a3e1-80da-4607-98c3-a99c2037e7ca", + "metadata": {}, + "source": [ + "## The Environment\n", + "\n", + "The environment consists of a single tool and a virtual paper.\n", + "\n", + "The tool accepts a single letter as an input and prints the letter on the virtual paper. If successful, the tool returns the output \"OK\".\n", + "\n", + "To determine what's written on the paper, one needs to read the environment state." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "64e538ae-5cf2-4cd5-a312-25ee6924e869", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "env = task.create_environment()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "5516a34b-1e9b-4f1e-9462-cfc4d5bc29f9", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[StructuredTool(name='type_letter', description='type_letter(letter: str) -> str - Print the given letter on the paper.', args_schema=, func=.type_letter at 0x7f538cc0e040>)]" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "env.tools" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "80501e1a-f1f6-4b38-8637-894503029d86", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "tool = env.tools[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "3f352e32-fdb6-4d9e-b1c4-3d78b4f50646", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'OK'" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tool.invoke({\"letter\": \"a\"})" + ] + }, + { +
"cell_type": "code", + "execution_count": 22, + "id": "ec9c2e68-b55e-4087-bc1a-c38f4cfd401b", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'OK'" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tool.invoke({\"letter\": \"b\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "2cc5b174-25a4-4d5a-8535-56ecea62ea81", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'ab'" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "env.read_state()" + ] + }, + { + "cell_type": "markdown", + "id": "cd13d120-1bf9-481c-9392-c15ebdd9d77f", + "metadata": {}, + "source": [ + "## Agent" + ] + }, + { + "cell_type": "markdown", + "id": "b462f7b8-fd42-4613-ab5f-5f3cbbc37d28", + "metadata": {}, + "source": [ + "Let's build an agent that we can use for evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "6142cf4e-862c-47a3-aa75-81d7d3231308", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'question': 'abc',\n", + " 'output': 'a, b, c',\n", + " 'intermediate_steps': [(AgentActionMessageLog(tool='type_letter', tool_input={'letter': 'a'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'a'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n \"letter\": \"a\"\\n}', 'name': 'type_letter'}})]),\n", + " 'OK'),\n", + " (AgentActionMessageLog(tool='type_letter', tool_input={'letter': 'b'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'b'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n \"letter\": \"b\"\\n}', 'name': 'type_letter'}})]),\n", + " 'OK'),\n", + " (AgentActionMessageLog(tool='type_letter', tool_input={'letter': 'c'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'c'}`\\n\\n\\n\", 
message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n \"letter\": \"c\"\\n}', 'name': 'type_letter'}})]),\n", + " 'OK')],\n", + " 'state': 'abc'}" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from langchain_benchmarks.tool_usage import agents\n", + "\n", + "agent_factory = agents.OpenAIAgentFactory(task, model=\"gpt-3.5-turbo-16k\")\n", + "\n", + "# Let's test that our agent works\n", + "agent = agent_factory.create()\n", + "agent.invoke({\"question\": \"abc\"})" + ] + }, + { + "cell_type": "markdown", + "id": "3821e4b0-8e67-418a-840c-470fcde42df0", + "metadata": {}, + "source": [ + "## Eval\n", + "\n", + "Let's evaluate an agent now" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "fb32763c-79ab-426a-8fc6-bf8ebb0dd432", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "View the evaluation results for project 'test-shiny-curve-39' at:\n", + "https://smith.langchain.com/o/e081f11e-fbd2-41b4-9fa8-5d76c76ef854/projects/p/c66bbd6e-cce5-461d-9287-97391bd2f668?eval=true\n", + "\n", + "View all tests for Dataset Tool Usage - Typewriter (1 tool) at:\n", + "https://smith.langchain.com/o/e081f11e-fbd2-41b4-9fa8-5d76c76ef854/datasets/25850d74-d4e0-41ac-81a1-dfc78a79660b\n", + "[------------------------------------------------->] 20/20\n", + " Eval quantiles:\n", + " 0.25 0.5 0.75 mean \\\n", + "Intermediate steps correctness 1.000000 1.000000 1.000000 0.950000 \n", + "# steps / # expected steps 1.000000 1.000000 1.000000 1.700000 \n", + "Correct Final State 1.000000 1.000000 1.000000 0.950000 \n", + "correctness 1.000000 1.000000 1.000000 0.800000 \n", + "execution_time 34.058961 34.058961 34.058961 34.058961 \n", + "\n", + " mode \n", + "Intermediate steps correctness 1.000000 \n", + "# steps / # expected steps 1.000000 \n", + "Correct Final State 1.000000 \n", + "correctness 1.000000 \n", 
+ "execution_time 34.058961 \n" + ] + } + ], + "source": [ + "from langsmith.client import Client\n", + "\n", + "from langchain_benchmarks.tool_usage import STANDARD_AGENT_EVALUATOR\n", + "\n", + "client = Client()\n", + "\n", + "test_run = client.run_on_dataset(\n", + " dataset_name=task.name,\n", + " llm_or_chain_factory=agent_factory.create,\n", + " evaluation=STANDARD_AGENT_EVALUATOR,\n", + " verbose=True,\n", + " tags=[\"gpt-3.5-turbo-16k\"],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "1b039225-01cf-481a-87a6-4e880e9b1dcd", + "metadata": {}, + "source": [ + "# Inspect\n", + "\n", + "You can take a look at the underlying results." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "6eb19db1-43b8-4866-a3d2-f211ba92ab8b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "df = test_run.to_dataframe()\n", + "df = pd.json_normalize(df.to_dict(orient=\"records\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "7ab5a8b9-a937-4537-b879-704284df4494", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[\"correctness\"].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "ab7516ed-36b1-4c16-bf4a-cc49077460ad", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df[\"num_expected_steps\"] = df[\"reference.expected_steps\"].apply(len)\n", + "df[\"actual_number_of_steps\"] = df[\"output.intermediate_steps\"].apply(len)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "50d7590d-20de-4768-ac90-adcdbfa70068", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Intermediate steps correctness# steps / # expected stepsCorrect Final Statecorrectnessexecution_timeinput.questionoutput.questionoutput.outputoutput.intermediate_stepsoutput.statereference.statereference.referencereference.expected_stepsnum_expected_stepsactual_number_of_steps
0015.00034.058961aaAgent stopped due to iteration limit or time l...[(tool='type_letter' tool_input={'letter': 'a'...aaaaaaaaaaaaaaaaa[type_letter]115
111.01134.058961aaaaaa\\naa[(tool='type_letter' tool_input={'letter': 'a'...aaaaaa[type_letter, type_letter]22
211.01034.058961aaaaaaa\\na[(tool='type_letter' tool_input={'letter': 'a'...aaaaaaaaa[type_letter, type_letter, type_letter]33
311.01034.058961aaaaaaaaa\\na[(tool='type_letter' tool_input={'letter': 'a'...aaaaaaaaaaaa[type_letter, type_letter, type_letter, type_l...44
411.01134.058961dogdogd\\no\\ng[(tool='type_letter' tool_input={'letter': 'd'...dogdogdog[type_letter, type_letter, type_letter]33
\n", + "
" + ], + "text/plain": [ + " Intermediate steps correctness # steps / # expected steps \\\n", + "0 0 15.0 \n", + "1 1 1.0 \n", + "2 1 1.0 \n", + "3 1 1.0 \n", + "4 1 1.0 \n", + "\n", + " Correct Final State correctness execution_time input.question \\\n", + "0 0 0 34.058961 a \n", + "1 1 1 34.058961 aa \n", + "2 1 0 34.058961 aaa \n", + "3 1 0 34.058961 aaaa \n", + "4 1 1 34.058961 dog \n", + "\n", + " output.question output.output \\\n", + "0 a Agent stopped due to iteration limit or time l... \n", + "1 aa aa\\naa \n", + "2 aaa a\\na \n", + "3 aaaa a\\na \n", + "4 dog d\\no\\ng \n", + "\n", + " output.intermediate_steps output.state \\\n", + "0 [(tool='type_letter' tool_input={'letter': 'a'... aaaaaaaaaaaaaaa \n", + "1 [(tool='type_letter' tool_input={'letter': 'a'... aa \n", + "2 [(tool='type_letter' tool_input={'letter': 'a'... aaa \n", + "3 [(tool='type_letter' tool_input={'letter': 'a'... aaaa \n", + "4 [(tool='type_letter' tool_input={'letter': 'd'... dog \n", + "\n", + " reference.state reference.reference \\\n", + "0 a a \n", + "1 aa aa \n", + "2 aaa aaa \n", + "3 aaaa aaaa \n", + "4 dog dog \n", + "\n", + " reference.expected_steps num_expected_steps \\\n", + "0 [type_letter] 1 \n", + "1 [type_letter, type_letter] 2 \n", + "2 [type_letter, type_letter, type_letter] 3 \n", + "3 [type_letter, type_letter, type_letter, type_l... 
4 \n", + "4 [type_letter, type_letter, type_letter] 3 \n", + "\n", + " actual_number_of_steps \n", + "0 15 \n", + "1 2 \n", + "2 3 \n", + "3 4 \n", + "4 3 " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/source/notebooks/tool_usage/typewriter_26.ipynb b/docs/source/notebooks/tool_usage/typewriter_26.ipynb index 51a8632f..1c57b95d 100644 --- a/docs/source/notebooks/tool_usage/typewriter_26.ipynb +++ b/docs/source/notebooks/tool_usage/typewriter_26.ipynb @@ -1,284 +1,763 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "60bb467d-861d-4b07-a48d-8e5aa177c969", - "metadata": { - "tags": [] - }, - "source": [ - "# Typewriter: 26 Tools\n", - "\n", - "\n", - "Let's see how to evaluate an agent's ability to use tools.\n", - "\n", - " A task where the agent must type a given string one letter at a time.\n", - "\n", - " In this variation of the task, the agent is given access to 26 parameterless functions,\n", - " each representing a letter of the alphabet." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "845c77a6-9da6-494c-973f-0ee1dac67b19", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "os.environ[\"LANGCHAIN_API_KEY\"] = \"sk-...\" # Your api key." 
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "b39159d0-9ea1-414f-a9d8-4a7b22b3d2cc", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langchain_benchmarks import clone_public_dataset, registry" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "1aef2b32-a5df-421f-8be3-a2ef27372ece", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
Name Tool Usage - Typewriter (26 tools)
Type ToolUsageTask
Dataset ID 128af05e-aa00-4e3b-a958-d166dd450581
DescriptionEnvironment with 26 tools each tool represents a letter of the alphabet.\n", - "\n", - "The objective of this task is to evaluate the model's ability the use tools\n", - "for a simple repetition task.\n", - "\n", - "For example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\n", - "\n", - "The dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\n", - "\n", - "This is a variation of the typer writer task, where 26 parameterless tools are\n", - "given instead of a single tool that takes a letter as an argument.
" - ], - "text/plain": [ - "ToolUsageTask(name='Tool Usage - Typewriter (26 tools)', dataset_id='https://smith.langchain.com/public/128af05e-aa00-4e3b-a958-d166dd450581/d', description=\"Environment with 26 tools each tool represents a letter of the alphabet.\\n\\nThe objective of this task is to evaluate the model's ability the use tools\\nfor a simple repetition task.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\\nThis is a variation of the typer writer task, where 26 parameterless tools are\\ngiven instead of a single tool that takes a letter as an argument.\\n\", create_environment=, instructions=\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. Please invoke the functions without any arguments.\")" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "task = registry[\"Tool Usage - Typewriter (26 tools)\"]\n", - "task" - ] - }, - { - "cell_type": "markdown", - "id": "bc33a639-3caf-4314-8ea7-1c7c8b1d114d", - "metadata": {}, - "source": [ - "Clone the dataset associaetd with this task" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "70369f67-deb4-467a-801a-6d38c3d0460d", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Dataset Tool Usage - Typewriter (26 tools) already exists. 
Skipping.\n", - "You can access the dataset at https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/2f462c7a-f9b9-46e7-b96b-7469e965f478.\n" - ] - } - ], - "source": [ - "clone_public_dataset(task.dataset_id, dataset_name=task.name)" - ] - }, - { - "cell_type": "markdown", - "id": "b462f7b8-fd42-4613-ab5f-5f3cbbc37d28", - "metadata": {}, - "source": [ - "Let's build an agent that we can use for evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "61535a75-24f6-4727-9549-f76c263e9153", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "env = task.create_environment()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "6142cf4e-862c-47a3-aa75-81d7d3231308", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'question': 'foo',\n", - " 'output': \"Could not parse tool input: {'arguments': '', 'name': 'f'} because the `arguments` is not valid JSON.\",\n", - " 'intermediate_steps': [(AgentAction(tool='_Exception', tool_input='Invalid or incomplete response', log=\"Could not parse tool input: {'arguments': 'f', 'name': 'f'} because the `arguments` is not valid JSON.\"),\n", - " 'Invalid or incomplete response'),\n", - " (AgentAction(tool='_Exception', tool_input='Invalid or incomplete response', log=\"Could not parse tool input: {'arguments': '', 'name': 'f'} because the `arguments` is not valid JSON.\"),\n", - " 'Invalid or incomplete response'),\n", - " (AgentAction(tool='_Exception', tool_input='Invalid or incomplete response', log=\"Could not parse tool input: {'arguments': '', 'name': 'f'} because the `arguments` is not valid JSON.\"),\n", - " 'Invalid or incomplete response'),\n", - " (AgentAction(tool='_Exception', tool_input='Invalid or incomplete response', log=\"Could not parse tool input: {'arguments': '', 'name': 'f'} because the `arguments` is not valid JSON.\"),\n", - " 'Invalid or incomplete response'),\n", - " 
(AgentAction(tool='_Exception', tool_input='Invalid or incomplete response', log=\"Could not parse tool input: {'arguments': '', 'name': 'f'} because the `arguments` is not valid JSON.\"),\n", - " 'Invalid or incomplete response')],\n", - " 'state': ''}" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from langchain_benchmarks.tool_usage import agents\n", - "\n", - "agent_factory = agents.OpenAIAgentFactory(task, model=\"gpt-3.5-turbo-16k\")\n", - "\n", - "# Let's test that our agent works\n", - "agent = agent_factory()\n", - "agent.invoke({\"question\": \"foo\"})" - ] - }, - { - "cell_type": "markdown", - "id": "3821e4b0-8e67-418a-840c-470fcde42df0", - "metadata": {}, - "source": [ - "## Eval\n", - "\n", - "Let's evaluate an agent now" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "fb32763c-79ab-426a-8fc6-bf8ebb0dd432", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "View the evaluation results for project 'test-notable-artist-76' at:\n", - "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/5c828160-9f7f-4f01-84ea-05f8a498d031?eval=true\n", - "\n", - "View all tests for Dataset Tool Usage - Typewriter (26 tools) at:\n", - "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/2f462c7a-f9b9-46e7-b96b-7469e965f478\n", - "[> ] 0/20" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Chain failed for example 2d4e99fc-8495-468e-8429-6c25a2d176f3 with inputs {'question': 'keyboard'}\n", - "Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. 
(Please include the request ID b658bca90fb852f4d236fc368bc65bcc in your email.)', 'type': 'server_error', 'param': None, 'code': None}}\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[-------------------> ] 8/20" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Chain failed for example 8af5bd36-fc11-4b23-9019-f642cfaf8a01 with inputs {'question': 'horse'}\n", - "Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID 3c40664804cb6e8c84e0e8796dbc0a6d in your email.)', 'type': 'server_error', 'param': None, 'code': None}}\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[------------------------------------------------->] 20/20\n", - " Eval quantiles:\n", - " 0.25 0.5 0.75 mean mode\n", - "Intermediate steps correctness 0.000000 0.00 0.000 0.000000 0.00\n", - "# steps / # expected steps 0.703571 0.75 1.375 1.007551 0.75\n", - "Correct Final State 0.000000 0.00 0.000 0.055556 0.00\n", - "correctness 0.000000 0.00 0.000 0.111111 0.00\n" - ] - } - ], - "source": [ - "from langsmith.client import Client\n", - "\n", - "from langchain_benchmarks.tool_usage import STANDARD_AGENT_EVALUATOR\n", - "\n", - "client = Client()\n", - "\n", - "test_run = client.run_on_dataset(\n", - " dataset_name=task.name,\n", - " llm_or_chain_factory=agent_factory.create,\n", - " evaluation=STANDARD_AGENT_EVALUATOR,\n", - " verbose=True,\n", - " tags=[\"gpt-3.5-turbo-16k\"],\n", - ")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": 
"python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.2" - } + "cells": [ + { + "cell_type": "markdown", + "id": "60bb467d-861d-4b07-a48d-8e5aa177c969", + "metadata": { + "tags": [] + }, + "source": [ + "# Typewriter: 26 Tools\n", + "\n", + "This is a variation of the typewriter task in which the agent has access to 26 parameterless tools.\n", + "\n", + "Each tool represents a letter of the alphabet (e.g., 'a', 'b', 'c').\n", + "\n", + "The agent can use each tool to \"print\" the corresponding letter on a piece of virtual paper.\n", + "\n", + "The objective for the agent is to \"print\" the user's input on the paper exactly." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "845c77a6-9da6-494c-973f-0ee1dac67b19", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "os.environ[\"LANGCHAIN_API_KEY\"] = \"sk-...\" # Your api key." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "b39159d0-9ea1-414f-a9d8-4a7b22b3d2cc", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain_benchmarks import clone_public_dataset, registry" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "1aef2b32-a5df-421f-8be3-a2ef27372ece", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Name Tool Usage - Typewriter (26 tools)
Type ToolUsageTask
Dataset ID 128af05e-aa00-4e3b-a958-d166dd450581
DescriptionEnvironment with 26 tools each tool represents a letter of the alphabet.\n", + "\n", + "The objective of this task is to evaluate the model's ability the use tools\n", + "for a simple repetition task.\n", + "\n", + "For example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\n", + "\n", + "The dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\n", + "\n", + "This is a variation of the typer writer task, where 26 parameterless tools are\n", + "given instead of a single tool that takes a letter as an argument.
" + ], + "text/plain": [ + "ToolUsageTask(name='Tool Usage - Typewriter (26 tools)', dataset_id='https://smith.langchain.com/public/128af05e-aa00-4e3b-a958-d166dd450581/d', description=\"Environment with 26 tools each tool represents a letter of the alphabet.\\n\\nThe objective of this task is to evaluate the model's ability the use tools\\nfor a simple repetition task.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\\nThis is a variation of the typer writer task, where 26 parameterless tools are\\ngiven instead of a single tool that takes a letter as an argument.\\n\", create_environment=, instructions=\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. Please invoke the functions without any arguments.\")" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "task = registry[\"Tool Usage - Typewriter (26 tools)\"]\n", + "task" + ] + }, + { + "cell_type": "markdown", + "id": "bc33a639-3caf-4314-8ea7-1c7c8b1d114d", + "metadata": {}, + "source": [ + "Clone the dataset associaetd with this task" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "70369f67-deb4-467a-801a-6d38c3d0460d", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset Tool Usage - Typewriter (26 tools) already exists. 
Skipping.\n", + "You can access the dataset at https://smith.langchain.com/o/e081f11e-fbd2-41b4-9fa8-5d76c76ef854/datasets/5051c0ae-16be-4afa-b914-84acbc5e9659.\n" + ] + } + ], + "source": [ + "clone_public_dataset(task.dataset_id, dataset_name=task.name)" + ] + }, + { + "cell_type": "markdown", + "id": "b462f7b8-fd42-4613-ab5f-5f3cbbc37d28", + "metadata": {}, + "source": [ + "Let's build an agent that we can use for evaluation." + ] + }, + { + "cell_type": "markdown", + "id": "6ce51f81-1b3a-4dda-a382-c2fed3013af1", + "metadata": {}, + "source": [ + "## The Environment\n", + "\n", + "The environment consists of 26 tools and a virtual paper.\n", + "\n", + "Each tool is responsible for printing a letter on the paper that corresponds to it." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "61535a75-24f6-4727-9549-f76c263e9153", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "env = task.create_environment()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "f35a0a1d-5a1e-4de1-8d8c-c7c9a264a6c7", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[StructuredTool(name='a', description='a() -> str - Run to Type the letter \"a\".', args_schema=, func=.func at 0x7f099cebd310>),\n", + " StructuredTool(name='b', description='b() -> str - Run to Type the letter \"b\".', args_schema=, func=.func at 0x7f097f56f940>),\n", + " StructuredTool(name='c', description='c() -> str - Run to Type the letter \"c\".', args_schema=, func=.func at 0x7f097f56ff70>),\n", + " StructuredTool(name='d', description='d() -> str - Run to Type the letter \"d\".', args_schema=, func=.func at 0x7f096421b040>),\n", + " StructuredTool(name='e', description='e() -> str - Run to Type the letter \"e\".', args_schema=, func=.func at 0x7f096421b1f0>)]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "env.tools[:5]" + ] + }, + { + "cell_type": "code", + 
"execution_count": 12, + "id": "5bea0190-39ec-4f30-9a00-90136bc6bf0b", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'OK'" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "env.tools[0].invoke({})" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "bf7444da-15a1-455a-b22e-639cbfff8432", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'OK'" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "env.tools[3].invoke({})" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "d12bd710-5c01-4539-a4b9-afbf03164923", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'ad'" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "env.read_state()" + ] + }, + { + "cell_type": "markdown", + "id": "f1d62a13-3771-460f-b131-4443f669ca3d", + "metadata": {}, + "source": [ + "## Agent" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "6142cf4e-862c-47a3-aa75-81d7d3231308", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'question': 'abc',\n", + " 'output': 'abc\\nabc',\n", + " 'intermediate_steps': [(AgentActionMessageLog(tool='a', tool_input={}, log='\\nInvoking: `a` with `{}`\\n\\n\\n', message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '', 'name': 'a'}})]),\n", + " 'OK'),\n", + " (AgentActionMessageLog(tool='b', tool_input={}, log='\\nInvoking: `b` with `{}`\\n\\n\\n', message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '', 'name': 'b'}})]),\n", + " 'OK'),\n", + " (AgentActionMessageLog(tool='c', tool_input={}, log='\\nInvoking: `c` with `{}`\\n\\n\\n', message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': 
'', 'name': 'c'}})]),\n", + " 'OK')],\n", + " 'state': 'abc'}" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from langchain_benchmarks.tool_usage import agents\n", + "\n", + "agent_factory = agents.OpenAIAgentFactory(task, model=\"gpt-3.5-turbo-16k\")\n", + "\n", + "# Let's test that our agent works\n", + "agent = agent_factory()\n", + "agent.invoke({\"question\": \"abc\"})" + ] + }, + { + "cell_type": "markdown", + "id": "3821e4b0-8e67-418a-840c-470fcde42df0", + "metadata": {}, + "source": [ + "## Eval\n", + "\n", + "Let's evaluate an agent now" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "fb32763c-79ab-426a-8fc6-bf8ebb0dd432", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "View the evaluation results for project 'test-mealy-ink-37' at:\n", + "https://smith.langchain.com/o/e081f11e-fbd2-41b4-9fa8-5d76c76ef854/projects/p/d5562dcb-7bea-432d-8e41-3fcf3f6f2247?eval=true\n", + "\n", + "View all tests for Dataset Tool Usage - Typewriter (26 tools) at:\n", + "https://smith.langchain.com/o/e081f11e-fbd2-41b4-9fa8-5d76c76ef854/datasets/5051c0ae-16be-4afa-b914-84acbc5e9659\n", + "[-----------> ] 5/20" + ] }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Chain failed for example c0ee0026-e11b-4036-b7f0-135ac9e82d66 with inputs {'question': 'horse'}\n", + "Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. 
(Please include the request ID 8707858df9212b40a8d4f22a0027d2a2 in your email.)', 'type': 'server_error', 'param': None, 'code': None}}\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[--------------> ] 6/20" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Chain failed for example 4ae1d1c0-4c34-4ef0-afd8-292be2e53b8d with inputs {'question': 'school'}\n", + "Error Type: BadRequestError, Message: Error code: 400 - {'error': {'message': \"'s()' does not match '^[a-zA-Z0-9_-]{1,64}$' - 'messages.2.function_call.name'\", 'type': 'invalid_request_error', 'param': None, 'code': None}}\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[-----------------------------> ] 12/20" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Chain failed for example e03000da-4c4b-4060-a798-0e71f3c3ff90 with inputs {'question': 'keyboard'}\n", + "Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. 
(Please include the request ID d20bfa7a39d9ee8c80e72070a6aafab9 in your email.)', 'type': 'server_error', 'param': None, 'code': None}}\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[------------------------------------------------->] 20/20\n", + " Eval quantiles:\n", + " 0.25 0.5 0.75 mean \\\n", + "Intermediate steps correctness 0.000000 0.000000 1.000000 0.294118 \n", + "# steps / # expected steps 1.000000 1.125000 2.142857 1.722598 \n", + "Correct Final State 0.000000 1.000000 1.000000 0.529412 \n", + "correctness 0.000000 0.000000 1.000000 0.470588 \n", + "execution_time 38.794961 38.794961 38.794961 38.794961 \n", + "\n", + " mode \n", + "Intermediate steps correctness 0.000000 \n", + "# steps / # expected steps 1.000000 \n", + "Correct Final State 1.000000 \n", + "correctness 0.000000 \n", + "execution_time 38.794961 \n" + ] + } + ], + "source": [ + "from langsmith.client import Client\n", + "\n", + "from langchain_benchmarks.tool_usage import STANDARD_AGENT_EVALUATOR\n", + "\n", + "client = Client()\n", + "\n", + "test_run = client.run_on_dataset(\n", + " dataset_name=task.name,\n", + " llm_or_chain_factory=agent_factory.create,\n", + " evaluation=STANDARD_AGENT_EVALUATOR,\n", + " verbose=True,\n", + " tags=[\"gpt-3.5-turbo-16k\"],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "641f534e-3ce2-436b-83a8-0289578546ff", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "df = test_run.to_dataframe()\n", + "df = pd.json_normalize(df.to_dict(orient=\"records\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "id": "4bc23900-46e5-450f-80c6-9d53eb4b12a7", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df[\"num_expected_steps\"] = df[\"reference.expected_steps\"].apply(len)\n", + "df[\"actual_number_of_steps\"] = (\n", + " df[\"output.intermediate_steps\"]\n", + " .apply(lambda x: None if not isinstance(x, list) 
else len(x))\n", + " .fillna(\"\")\n", + ")\n", + "df[\"output.Error\"].fillna(\"\", inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "985e18cc-03ad-47aa-a703-037cec97270a", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.5294117647058824" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[\"Correct Final State\"].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "id": "bfe6bebd-0a6d-4787-9441-fc51c0d2e7c3", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
input.questionoutput.statenum_expected_stepsactual_number_of_stepsCorrect Final Stateoutput.Error
0aa11.01.0
1aaaa22.01.0
2aaaaaaaaaaaaaaaaaa315.00.0
3aaaaaaaaaaaaaaaaaaa415.00.0
4dogdog34.01.0
5catcat34.01.0
6handhand44.01.0
7headhhhhhhhhhhhhhhh415.00.0
8househouse55.01.0
9horseNaN5NaNInternalServerError(\"Error code: 500 - {'error...
10schoolNaN6NaNBadRequestError('Error code: 400 - {\\'error\\':...
11churchchurchchchchch615.00.0
12teacherteacher77.01.0
13studentstudentstudent715.00.0
14computercomputer89.01.0
15keyboardNaN8NaNInternalServerError(\"Error code: 500 - {'error...
16university105.00.0
17dictionarydictionarystr1015.00.0
18information113.00.0
19communicationcommunication1314.01.0
\n", + "
" + ], + "text/plain": [ + " input.question output.state num_expected_steps actual_number_of_steps \\\n", + "0 a a 1 1.0 \n", + "1 aa aa 2 2.0 \n", + "2 aaa aaaaaaaaaaaaaaa 3 15.0 \n", + "3 aaaa aaaaaaaaaaaaaaa 4 15.0 \n", + "4 dog dog 3 4.0 \n", + "5 cat cat 3 4.0 \n", + "6 hand hand 4 4.0 \n", + "7 head hhhhhhhhhhhhhhh 4 15.0 \n", + "8 house house 5 5.0 \n", + "9 horse NaN 5 \n", + "10 school NaN 6 \n", + "11 church churchchchchch 6 15.0 \n", + "12 teacher teacher 7 7.0 \n", + "13 student studentstudent 7 15.0 \n", + "14 computer computer 8 9.0 \n", + "15 keyboard NaN 8 \n", + "16 university 10 5.0 \n", + "17 dictionary dictionarystr 10 15.0 \n", + "18 information 11 3.0 \n", + "19 communication communication 13 14.0 \n", + "\n", + " Correct Final State output.Error \n", + "0 1.0 \n", + "1 1.0 \n", + "2 0.0 \n", + "3 0.0 \n", + "4 1.0 \n", + "5 1.0 \n", + "6 1.0 \n", + "7 0.0 \n", + "8 1.0 \n", + "9 NaN InternalServerError(\"Error code: 500 - {'error... \n", + "10 NaN BadRequestError('Error code: 400 - {\\'error\\':... \n", + "11 0.0 \n", + "12 1.0 \n", + "13 0.0 \n", + "14 1.0 \n", + "15 NaN InternalServerError(\"Error code: 500 - {'error... 
\n", + "16 0.0 \n", + "17 0.0 \n", + "18 0.0 \n", + "19 1.0 " + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[\n", + " [\n", + " \"input.question\",\n", + " \"output.state\",\n", + " \"num_expected_steps\",\n", + " \"actual_number_of_steps\",\n", + " \"Correct Final State\",\n", + " \"output.Error\",\n", + " ]\n", + "]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/langchain_benchmarks/tool_usage/evaluators.py b/langchain_benchmarks/tool_usage/evaluators.py index 187ed605..95b284f3 100644 --- a/langchain_benchmarks/tool_usage/evaluators.py +++ b/langchain_benchmarks/tool_usage/evaluators.py @@ -5,11 +5,10 @@ * Agents must output "intermediate_steps" in their run outputs. * The dataset must have "expected_steps" in its outputs. 
""" -from typing import List, Optional, Union +from typing import Optional from langchain.evaluation import EvaluatorType from langchain.smith import RunEvalConfig -from langchain.smith.evaluation.config import EvalConfig from langsmith.evaluation.evaluator import ( EvaluationResult, EvaluationResults, @@ -17,7 +16,56 @@ ) from langsmith.schemas import Example, Run -from langchain_benchmarks.schema import ExtractionTask + +def compare_outputs(run_outputs: dict, example_outputs: dict) -> EvaluationResults: + """Compare the outputs of a run to the expected outputs.""" + intermediate_steps = run_outputs["intermediate_steps"] + # Since we are comparing to the tool names, we now need to get that + # Intermediate steps is a Tuple[AgentAction, Any] + # The first element is the action taken + # The second element is the observation from taking that action + trajectory = [action.tool for action, _ in intermediate_steps] + expected_trajectory = example_outputs["expected_steps"] + + order_matters = example_outputs.get("order_matters", True) + + if order_matters: + # If the order matters trajectory must be the same as expected trajectory + score = int(trajectory == expected_trajectory) + else: + # If order does not matter, then we compare the trajectories after sorting + # them. This will make sure that the number of times each tool is used + # is the same, but the order does not matter. + score = int(sorted(trajectory) == sorted(expected_trajectory)) + + # Just score it based on whether it is correct or not + step_fraction = len(trajectory) / len(expected_trajectory) + + # Add trajectory results + results = [ + EvaluationResult( + key="Intermediate steps correctness", + score=score, + ), + EvaluationResult( + key="# steps / # expected steps", + score=step_fraction, + ), + ] + + # Evaluate state score + # This will need to be evolved it's too simple. 
+ if "state" in run_outputs: + state = run_outputs["state"] + example_state = example_outputs["state"] + results.append( + EvaluationResult( + key="Correct Final State", + score=int(state == example_state), + ) + ) + + return {"results": results} class AgentTrajectoryEvaluator(RunEvaluator): @@ -26,46 +74,25 @@ class AgentTrajectoryEvaluator(RunEvaluator): def evaluate_run( self, run: Run, example: Optional[Example] = None ) -> EvaluationResults: + # The run is the run from the agent if run.outputs is None: raise ValueError("Run outputs cannot be None") - # This is the output of each run - intermediate_steps = run.outputs["intermediate_steps"] - # Since we are comparing to the tool names, we now need to get that - # Intermediate steps is a Tuple[AgentAction, Any] - # The first element is the action taken - # The second element is the observation from taking that action - trajectory = [action.tool for action, _ in intermediate_steps] - # This is what we uploaded to the dataset + + # The example is the example from the dataset if example is None: raise ValueError("Example cannot be None") - expected_trajectory = example.outputs["expected_steps"] - # Just score it based on whether it is correct or not - score = int(trajectory == expected_trajectory) - step_fraction = len(trajectory) / len(expected_trajectory) + if "intermediate_steps" not in run.outputs: + raise ValueError( + "Please make sure that your agent outputs 'intermediate_steps'" + ) - results = [ - EvaluationResult( - key="Intermediate steps correctness", - score=score, - ), - EvaluationResult( - key="# steps / # expected steps", - score=step_fraction, - ), - ] - - if "state" in run.outputs: - state = run.outputs["state"] - example_state = example.outputs["state"] - results.append( - EvaluationResult( - key="Correct Final State", - score=int(state == example_state), - ) + if "expected_steps" not in example.outputs: + raise ValueError( + "Please make sure that your dataset contains 'expected_steps'" ) - 
return {"results": results} + return compare_outputs(run.outputs, example.outputs) STANDARD_AGENT_EVALUATOR = RunEvalConfig( diff --git a/tests/unit_tests/tool_usage/test_evaluator.py b/tests/unit_tests/tool_usage/test_evaluator.py new file mode 100644 index 00000000..82a5ff62 --- /dev/null +++ b/tests/unit_tests/tool_usage/test_evaluator.py @@ -0,0 +1,115 @@ +"""Test the standard agent evaluator.""" + +import pytest +from langchain.schema import AgentAction + +from langchain_benchmarks.tool_usage.evaluators import compare_outputs + + +@pytest.mark.parametrize( + "run_outputs, example_outputs, expected_results", + [ + ( + { + "intermediate_steps": [ + ( + AgentAction(tool="action_1", tool_input={}, log=""), + "observation1", + ), + ( + AgentAction(tool="action_2", tool_input={}, log=""), + "observation1", + ), + ], + "state": "final_state", + }, + { + "expected_steps": ["action_1", "action_2"], + "state": "final_state", + }, + { + "Intermediate steps correctness": True, + "# steps / # expected steps": 1, + "Correct Final State": 1, + }, + ), + ( + { + "intermediate_steps": [ + ( + AgentAction(tool="action_1", tool_input={}, log=""), + "observation1", + ), + ( + AgentAction(tool="action_2", tool_input={}, log=""), + "observation1", + ), + ], + "state": "final_state", + }, + { + "expected_steps": ["cat", "was", "here"], + "state": "another_state", + }, + { + "Intermediate steps correctness": False, + "# steps / # expected steps": 2 / 3, + "Correct Final State": 0, + }, + ), + ( + { + "intermediate_steps": [ + ( + AgentAction(tool="action_2", tool_input={}, log=""), + "observation1", + ), + ( + AgentAction(tool="action_1", tool_input={}, log=""), + "observation1", + ), + ], + "state": "final_state", + }, + { + "expected_steps": ["action_1", "action_2"], + "order_matters": False, + "state": "different_state", + }, + { + "Intermediate steps correctness": True, + "# steps / # expected steps": 1.0, + "Correct Final State": 0, + }, + ), + # Without state + ( + { + 
"intermediate_steps": [ + ( + AgentAction(tool="action_2", tool_input={}, log=""), + "observation1", + ), + ( + AgentAction(tool="action_1", tool_input={}, log=""), + "observation1", + ), + ], + }, + { + "expected_steps": ["action_1", "action_2"], + "order_matters": False, + }, + { + "Intermediate steps correctness": True, + "# steps / # expected steps": 1.0, + }, + ), + ], +) +def test_compare_outputs(run_outputs, example_outputs, expected_results): + """Test compare outputs.""" + evaluation_results = compare_outputs(run_outputs, example_outputs) + assert { + result.key: result.score for result in evaluation_results["results"] + } == expected_results