From a585ad7e93afc51134ddbd81cfc3b18db1a62a03 Mon Sep 17 00:00:00 2001
From: Eugene Yurtsev <eyurtsev@gmail.com>
Date: Wed, 22 Nov 2023 14:33:26 -0500
Subject: [PATCH] Update standard agent evaluator + update notebooks (#78)

This updates the standard agent evaluator to use order_matters
---
 .../tool_usage/multiverse_math.ipynb          | 1067 +++++----
 .../tool_usage/relational_data.ipynb          | 1975 +++++++----------
 .../notebooks/tool_usage/typewriter_1.ipynb   | 1115 ++++++----
 .../notebooks/tool_usage/typewriter_26.ipynb  | 1043 ++++++---
 langchain_benchmarks/tool_usage/evaluators.py |   97 +-
 tests/unit_tests/tool_usage/test_evaluator.py |  115 +
 6 files changed, 2966 insertions(+), 2446 deletions(-)
 create mode 100644 tests/unit_tests/tool_usage/test_evaluator.py
diff --git a/docs/source/notebooks/tool_usage/multiverse_math.ipynb b/docs/source/notebooks/tool_usage/multiverse_math.ipynb
index 2843dc75..8bc81073 100644
--- a/docs/source/notebooks/tool_usage/multiverse_math.ipynb
+++ b/docs/source/notebooks/tool_usage/multiverse_math.ipynb
@@ -1,481 +1,588 @@
 {
-    "cells": [
-        {
-            "cell_type": "markdown",
-            "id": "60bb467d-861d-4b07-a48d-8e5aa177c969",
-            "metadata": {
-                "tags": []
-            },
-            "source": [
-                "# Multiverse Math\n",
-                "\n",
-                "\n",
-                "Let's see how to evaluate an agent's ability to use tools.\n",
-                "\n",
-                "    Solve basic math question using the provided tools.\n",
-                "\n",
-                "    Must use the provided tools to solve the math question.\n",
-                "\n",
-                "    To make sure that innate knowledge is not used, the math operations have been altered to yield different results than expected.\n",
-                "\n",
-                "    The modified operations should yield different results, but still retain appropriate properties. For example, the modified multiplication operation should still be commutative.\n",
-                "\n",
-                "    Please note that the modified operations are not guaranteed to even make sense in the real world since not all properties will be retained (e.g., distributive property)."
-            ]
-        },
-        {
-            "cell_type": "markdown",
-            "id": "03488ab1-31ed-41c2-8da2-46b02599b181",
-            "metadata": {},
-            "source": [
-                "For this code to work, please configure LangSmith environment variables with your credentials."
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": 1,
-            "id": "1615b8ff-688a-4447-8c4c-d64ad02818ed",
-            "metadata": {},
-            "outputs": [],
-            "source": [
-                "import os\n",
-                "\n",
-                "os.environ[\"LANGCHAIN_API_KEY\"] = \"sk-...\"  # Your LangSmith API key"
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": 18,
-            "id": "b39159d0-9ea1-414f-a9d8-4a7b22b3d2cc",
-            "metadata": {
-                "tags": []
-            },
-            "outputs": [],
-            "source": [
-                "from langchain_benchmarks import clone_public_dataset, registry"
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": 3,
-            "id": "1aef2b32-a5df-421f-8be3-a2ef27372ece",
-            "metadata": {
-                "tags": []
-            },
-            "outputs": [
-                {
-                    "data": {
-                        "text/html": [
-                            "<table>\n",
-                            "<tbody>\n",
-                            "<tr><td>Name       </td><td>Multiverse Math                                                          </td></tr>\n",
-                            "<tr><td>Type       </td><td>ToolUsageTask                                                            </td></tr>\n",
-                            "<tr><td>Dataset ID </td><td>https://smith.langchain.com/public/594f9f60-30a0-49bf-b075-f44beabf546a/d</td></tr>\n",
-                            "<tr><td>Description</td><td>An environment that contains a few basic math operations, but with altered results.\n",
-                            "\n",
-                            "For example, mu...                                                                          </td></tr>\n",
-                            "</tbody>\n",
-                            "</table>"
-                        ],
-                        "text/plain": [
-                            "ToolUsageTask(name='Multiverse Math', dataset_id='https://smith.langchain.com/public/594f9f60-30a0-49bf-b075-f44beabf546a/d', description='An environment that contains a few basic math operations, but with altered results.\\n\\nFor example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. The basic operations retain some basic properties, such as commutativity, associativity, and distributivity; however, the results are different than expected.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math.\\n', create_environment=<function get_environment at 0x7fae28d9f310>, instructions='You are requested to solve math questions in an alternate mathematical universe. The operations have been altered to yield different results than expected. Do not guess the answer or rely on your  innate knowledge of math. Use the provided tools to answer the question. While associativity and commutativity apply, distributivity does not. Answer the question using the fewest possible tools. Only include the numeric response without any clarifications.')"
-                        ]
-                    },
-                    "execution_count": 3,
-                    "metadata": {},
-                    "output_type": "execute_result"
-                }
-            ],
-            "source": [
-                "task = registry[\"Multiverse Math\"]\n",
-                "task"
-            ]
-        },
-        {
-            "cell_type": "markdown",
-            "id": "bc33a639-3caf-4314-8ea7-1c7c8b1d114d",
-            "metadata": {},
-            "source": [
-                "Clone the dataset associaetd with this task"
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": 6,
-            "id": "70369f67-deb4-467a-801a-6d38c3d0460d",
-            "metadata": {
-                "tags": []
-            },
-            "outputs": [
-                {
-                    "name": "stdout",
-                    "output_type": "stream",
-                    "text": [
-                        "Dataset Multiverse Math already exists. Skipping.\n",
-                        "You can access the dataset at https://smith.langchain.com/o/e081f11e-fbd2-41b4-9fa8-5d76c76ef854/datasets/ddca73f1-ceda-4562-8c49-7ee0a9df2a01.\n"
-                    ]
-                }
-            ],
-            "source": [
-                "clone_public_dataset(task.dataset_id, dataset_name=task.name)"
-            ]
-        },
-        {
-            "cell_type": "markdown",
-            "id": "b462f7b8-fd42-4613-ab5f-5f3cbbc37d28",
-            "metadata": {},
-            "source": [
-                "Let's build an agent that we can use for evaluation."
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": 10,
-            "id": "6142cf4e-862c-47a3-aa75-81d7d3231308",
-            "metadata": {
-                "tags": []
-            },
-            "outputs": [
-                {
-                    "data": {
-                        "text/plain": [
-                            "{'question': 'how much is 3 + 5',\n",
-                            " 'output': 'In this alternate mathematical universe, the result of adding 3 and 5 is 9.2.',\n",
-                            " 'intermediate_steps': [(AgentActionMessageLog(tool='add', tool_input={'a': 3, 'b': 5}, log=\"\\nInvoking: `add` with `{'a': 3, 'b': 5}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n  \"a\": 3,\\n  \"b\": 5\\n}', 'name': 'add'}})]),\n",
-                            "   9.2)]}"
-                        ]
-                    },
-                    "execution_count": 10,
-                    "metadata": {},
-                    "output_type": "execute_result"
-                }
-            ],
-            "source": [
-                "from langchain_benchmarks.tool_usage import agents\n",
-                "\n",
-                "agent_factory = agents.OpenAIAgentFactory(task, model=\"gpt-3.5-turbo-16k\")\n",
-                "\n",
-                "# Let's test that our agent works\n",
-                "agent = agent_factory.create()\n",
-                "agent.invoke({\"question\": \"how much is 3 + 5\"})"
-            ]
-        },
-        {
-            "cell_type": "markdown",
-            "id": "3821e4b0-8e67-418a-840c-470fcde42df0",
-            "metadata": {},
-            "source": [
-                "## Eval\n",
-                "\n",
-                "Let's evaluate an agent now"
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": 13,
-            "id": "fb32763c-79ab-426a-8fc6-bf8ebb0dd432",
-            "metadata": {
-                "tags": []
-            },
-            "outputs": [
-                {
-                    "name": "stdout",
-                    "output_type": "stream",
-                    "text": [
-                        "View the evaluation results for project 'test-excellent-potato-37' at:\n",
-                        "https://smith.langchain.com/o/e081f11e-fbd2-41b4-9fa8-5d76c76ef854/projects/p/e350cda0-4e1d-49eb-8483-574172d1c635?eval=true\n",
-                        "\n",
-                        "View all tests for Dataset Multiverse Math at:\n",
-                        "https://smith.langchain.com/o/e081f11e-fbd2-41b4-9fa8-5d76c76ef854/datasets/ddca73f1-ceda-4562-8c49-7ee0a9df2a01\n",
-                        "[------------------------------------------------->] 10/10\n",
-                        " Eval quantiles:\n",
-                        "                                    0.25       0.5      0.75      mean  \\\n",
-                        "Intermediate steps correctness   0.00000   0.00000   0.00000   0.10000   \n",
-                        "# steps / # expected steps       5.00000   7.50000   8.62500   7.75000   \n",
-                        "correctness                      0.00000   0.00000   0.00000   0.10000   \n",
-                        "execution_time                  38.76436  38.76436  38.76436  38.76436   \n",
-                        "\n",
-                        "                                    mode  \n",
-                        "Intermediate steps correctness   0.00000  \n",
-                        "# steps / # expected steps       5.00000  \n",
-                        "correctness                      0.00000  \n",
-                        "execution_time                  38.76436  \n"
-                    ]
-                }
-            ],
-            "source": [
-                "from langsmith.client import Client\n",
-                "\n",
-                "from langchain_benchmarks.tool_usage import STANDARD_AGENT_EVALUATOR\n",
-                "\n",
-                "client = Client()\n",
-                "\n",
-                "test_run = client.run_on_dataset(\n",
-                "    dataset_name=task.name,\n",
-                "    llm_or_chain_factory=agent_factory.create,\n",
-                "    evaluation=STANDARD_AGENT_EVALUATOR,\n",
-                "    verbose=True,\n",
-                "    tags=[\"gpt-3.5-turbo-16k\"],\n",
-                ")"
-            ]
-        },
-        {
-            "cell_type": "markdown",
-            "id": "1b039225-01cf-481a-87a6-4e880e9b1dcd",
-            "metadata": {},
-            "source": [
-                "# Inspect\n",
-                "\n",
-                "You can take a look at the underlying results."
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": 14,
-            "id": "6eb19db1-43b8-4866-a3d2-f211ba92ab8b",
-            "metadata": {
-                "tags": []
-            },
-            "outputs": [],
-            "source": [
-                "import pandas as pd\n",
-                "\n",
-                "df = test_run.to_dataframe()\n",
-                "df = pd.json_normalize(df.to_dict(orient=\"records\"))"
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": 15,
-            "id": "7ab5a8b9-a937-4537-b879-704284df4494",
-            "metadata": {
-                "tags": []
-            },
-            "outputs": [
-                {
-                    "data": {
-                        "text/plain": [
-                            "0.1"
-                        ]
-                    },
-                    "execution_count": 15,
-                    "metadata": {},
-                    "output_type": "execute_result"
-                }
-            ],
-            "source": [
-                "df[\"correctness\"].mean()"
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": 16,
-            "id": "ab7516ed-36b1-4c16-bf4a-cc49077460ad",
-            "metadata": {
-                "tags": []
-            },
-            "outputs": [],
-            "source": [
-                "df[\"num_expected_steps\"] = df[\"reference.expected_steps\"].apply(len)\n",
-                "df[\"actual_number_of_steps\"] = df[\"output.intermediate_steps\"].apply(len)"
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": 17,
-            "id": "50d7590d-20de-4768-ac90-adcdbfa70068",
-            "metadata": {
-                "tags": []
-            },
-            "outputs": [
-                {
-                    "data": {
-                        "text/html": [
-                            "<div>\n",
-                            "<style scoped>\n",
-                            "    .dataframe tbody tr th:only-of-type {\n",
-                            "        vertical-align: middle;\n",
-                            "    }\n",
-                            "\n",
-                            "    .dataframe tbody tr th {\n",
-                            "        vertical-align: top;\n",
-                            "    }\n",
-                            "\n",
-                            "    .dataframe thead th {\n",
-                            "        text-align: right;\n",
-                            "    }\n",
-                            "</style>\n",
-                            "<table border=\"1\" class=\"dataframe\">\n",
-                            "  <thead>\n",
-                            "    <tr style=\"text-align: right;\">\n",
-                            "      <th></th>\n",
-                            "      <th>Intermediate steps correctness</th>\n",
-                            "      <th># steps / # expected steps</th>\n",
-                            "      <th>correctness</th>\n",
-                            "      <th>execution_time</th>\n",
-                            "      <th>input.question</th>\n",
-                            "      <th>output.question</th>\n",
-                            "      <th>output.output</th>\n",
-                            "      <th>output.intermediate_steps</th>\n",
-                            "      <th>reference.reference</th>\n",
-                            "      <th>reference.expected_steps</th>\n",
-                            "      <th>num_expected_steps</th>\n",
-                            "      <th>actual_number_of_steps</th>\n",
-                            "    </tr>\n",
-                            "  </thead>\n",
-                            "  <tbody>\n",
-                            "    <tr>\n",
-                            "      <th>0</th>\n",
-                            "      <td>0</td>\n",
-                            "      <td>15.0</td>\n",
-                            "      <td>0</td>\n",
-                            "      <td>38.76436</td>\n",
-                            "      <td>Add 2 and 3</td>\n",
-                            "      <td>Add 2 and 3</td>\n",
-                            "      <td>Agent stopped due to iteration limit or time l...</td>\n",
-                            "      <td>[(tool='add' tool_input={'a': 2, 'b': 3} log=\"...</td>\n",
-                            "      <td>6.20</td>\n",
-                            "      <td>[add]</td>\n",
-                            "      <td>1</td>\n",
-                            "      <td>15</td>\n",
-                            "    </tr>\n",
-                            "    <tr>\n",
-                            "      <th>1</th>\n",
-                            "      <td>0</td>\n",
-                            "      <td>15.0</td>\n",
-                            "      <td>0</td>\n",
-                            "      <td>38.76436</td>\n",
-                            "      <td>Subtract 3 from 2</td>\n",
-                            "      <td>Subtract 3 from 2</td>\n",
-                            "      <td>Agent stopped due to iteration limit or time l...</td>\n",
-                            "      <td>[(tool='subtract' tool_input={'a': 2, 'b': 3} ...</td>\n",
-                            "      <td>-4.00</td>\n",
-                            "      <td>[subtract]</td>\n",
-                            "      <td>1</td>\n",
-                            "      <td>15</td>\n",
-                            "    </tr>\n",
-                            "    <tr>\n",
-                            "      <th>2</th>\n",
-                            "      <td>0</td>\n",
-                            "      <td>9.0</td>\n",
-                            "      <td>1</td>\n",
-                            "      <td>38.76436</td>\n",
-                            "      <td>What is -5 if evaluated using the negate funct...</td>\n",
-                            "      <td>What is -5 if evaluated using the negate funct...</td>\n",
-                            "      <td>-5.0\\n-5.0</td>\n",
-                            "      <td>[(tool='negate' tool_input={'a': -5} log=\"\\nIn...</td>\n",
-                            "      <td>-5.00</td>\n",
-                            "      <td>[negate]</td>\n",
-                            "      <td>1</td>\n",
-                            "      <td>9</td>\n",
-                            "    </tr>\n",
-                            "    <tr>\n",
-                            "      <th>3</th>\n",
-                            "      <td>1</td>\n",
-                            "      <td>1.0</td>\n",
-                            "      <td>0</td>\n",
-                            "      <td>38.76436</td>\n",
-                            "      <td>what is the result of 2 to the power of 3?</td>\n",
-                            "      <td>what is the result of 2 to the power of 3?</td>\n",
-                            "      <td>The result of 2 to the power of 3 is 32.</td>\n",
-                            "      <td>[(tool='power' tool_input={'a': 2, 'b': 3} log...</td>\n",
-                            "      <td>32.00</td>\n",
-                            "      <td>[power]</td>\n",
-                            "      <td>1</td>\n",
-                            "      <td>1</td>\n",
-                            "    </tr>\n",
-                            "    <tr>\n",
-                            "      <th>4</th>\n",
-                            "      <td>0</td>\n",
-                            "      <td>7.5</td>\n",
-                            "      <td>0</td>\n",
-                            "      <td>38.76436</td>\n",
-                            "      <td>I ate 1 apple and 2 oranges every day for 7 da...</td>\n",
-                            "      <td>I ate 1 apple and 2 oranges every day for 7 da...</td>\n",
-                            "      <td>Agent stopped due to iteration limit or time l...</td>\n",
-                            "      <td>[(tool='add' tool_input={'a': 1, 'b': 2} log=\"...</td>\n",
-                            "      <td>32.34</td>\n",
-                            "      <td>[multiply, add]</td>\n",
-                            "      <td>2</td>\n",
-                            "      <td>15</td>\n",
-                            "    </tr>\n",
-                            "  </tbody>\n",
-                            "</table>\n",
-                            "</div>"
-                        ],
-                        "text/plain": [
-                            "   Intermediate steps correctness  # steps / # expected steps  correctness  \\\n",
-                            "0                               0                        15.0            0   \n",
-                            "1                               0                        15.0            0   \n",
-                            "2                               0                         9.0            1   \n",
-                            "3                               1                         1.0            0   \n",
-                            "4                               0                         7.5            0   \n",
-                            "\n",
-                            "   execution_time                                     input.question  \\\n",
-                            "0        38.76436                                        Add 2 and 3   \n",
-                            "1        38.76436                                  Subtract 3 from 2   \n",
-                            "2        38.76436  What is -5 if evaluated using the negate funct...   \n",
-                            "3        38.76436         what is the result of 2 to the power of 3?   \n",
-                            "4        38.76436  I ate 1 apple and 2 oranges every day for 7 da...   \n",
-                            "\n",
-                            "                                     output.question  \\\n",
-                            "0                                        Add 2 and 3   \n",
-                            "1                                  Subtract 3 from 2   \n",
-                            "2  What is -5 if evaluated using the negate funct...   \n",
-                            "3         what is the result of 2 to the power of 3?   \n",
-                            "4  I ate 1 apple and 2 oranges every day for 7 da...   \n",
-                            "\n",
-                            "                                       output.output  \\\n",
-                            "0  Agent stopped due to iteration limit or time l...   \n",
-                            "1  Agent stopped due to iteration limit or time l...   \n",
-                            "2                                         -5.0\\n-5.0   \n",
-                            "3           The result of 2 to the power of 3 is 32.   \n",
-                            "4  Agent stopped due to iteration limit or time l...   \n",
-                            "\n",
-                            "                           output.intermediate_steps  reference.reference  \\\n",
-                            "0  [(tool='add' tool_input={'a': 2, 'b': 3} log=\"...                 6.20   \n",
-                            "1  [(tool='subtract' tool_input={'a': 2, 'b': 3} ...                -4.00   \n",
-                            "2  [(tool='negate' tool_input={'a': -5} log=\"\\nIn...                -5.00   \n",
-                            "3  [(tool='power' tool_input={'a': 2, 'b': 3} log...                32.00   \n",
-                            "4  [(tool='add' tool_input={'a': 1, 'b': 2} log=\"...                32.34   \n",
-                            "\n",
-                            "  reference.expected_steps  num_expected_steps  actual_number_of_steps  \n",
-                            "0                    [add]                   1                      15  \n",
-                            "1               [subtract]                   1                      15  \n",
-                            "2                 [negate]                   1                       9  \n",
-                            "3                  [power]                   1                       1  \n",
-                            "4          [multiply, add]                   2                      15  "
-                        ]
-                    },
-                    "execution_count": 17,
-                    "metadata": {},
-                    "output_type": "execute_result"
-                }
-            ],
-            "source": [
-                "df.head()"
-            ]
-        }
-    ],
-    "metadata": {
-        "kernelspec": {
-            "display_name": "Python 3 (ipykernel)",
-            "language": "python",
-            "name": "python3"
-        },
-        "language_info": {
-            "codemirror_mode": {
-                "name": "ipython",
-                "version": 3
-            },
-            "file_extension": ".py",
-            "mimetype": "text/x-python",
-            "name": "python",
-            "nbconvert_exporter": "python",
-            "pygments_lexer": "ipython3",
-            "version": "3.11.2"
-        }
-    },
-    "nbformat": 4,
-    "nbformat_minor": 5
-}
\ No newline at end of file
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "60bb467d-861d-4b07-a48d-8e5aa177c969",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "# Multiverse Math\n",
+    "\n",
+    "In this task, the agent is operating in an alternate universe in which the basic mathematical operations like addition and multiplication are different.\n",
+    "\n",
+    "The agent must use tools that allow it to carry out calculations in this universe.\n",
+    "\n",
+    "This task can help verify that an agent is able to ignore its own knowledge of math and instead correctly use information returned by the tools.\n",
+    "\n",
+    "The modified mathematical operations yield different results, but still retain some properties (e.g., the modified multiplication operation is still commutative).\n",
+    "\n",
+    "Please note that the modified operations are not guaranteed to even make sense in the real world since not all properties will be retained (e.g., distributive property)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "03488ab1-31ed-41c2-8da2-46b02599b181",
+   "metadata": {},
+   "source": [
+    "For this code to work, please configure LangSmith environment variables with your credentials."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "1615b8ff-688a-4447-8c4c-d64ad02818ed",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "os.environ[\"LANGCHAIN_API_KEY\"] = \"sk-...\"  # Your LangSmith API key"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "b39159d0-9ea1-414f-a9d8-4a7b22b3d2cc",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from langchain_benchmarks import clone_public_dataset, registry"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "1aef2b32-a5df-421f-8be3-a2ef27372ece",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<table>\n",
+       "<tbody>\n",
+       "<tr><td>Name       </td><td>Multiverse Math                                                                                                                                            </td></tr>\n",
+       "<tr><td>Type       </td><td>ToolUsageTask                                                                                                                                              </td></tr>\n",
+       "<tr><td>Dataset ID </td><td><a href=\"https://smith.langchain.com/public/594f9f60-30a0-49bf-b075-f44beabf546a/d\" target=\"_blank\" rel=\"noopener\">594f9f60-30a0-49bf-b075-f44beabf546a</a></td></tr>\n",
+       "<tr><td>Description</td><td>An environment that contains a few basic math operations, but with altered results.\n",
+       "\n",
+       "For example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. The basic operations retain some basic properties, such as commutativity, associativity, and distributivity; however, the results are different than expected.\n",
+       "\n",
+       "The objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math.                                                                                                                                                            </td></tr>\n",
+       "</tbody>\n",
+       "</table>"
+      ],
+      "text/plain": [
+       "ToolUsageTask(name='Multiverse Math', dataset_id='https://smith.langchain.com/public/594f9f60-30a0-49bf-b075-f44beabf546a/d', description='An environment that contains a few basic math operations, but with altered results.\\n\\nFor example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. The basic operations retain some basic properties, such as commutativity, associativity, and distributivity; however, the results are different than expected.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math.\\n', create_environment=<function get_environment at 0x7f938662a160>, instructions='You are requested to solve math questions in an alternate mathematical universe. The operations have been altered to yield different results than expected. Do not guess the answer or rely on your  innate knowledge of math. Use the provided tools to answer the question. While associativity and commutativity apply, distributivity does not. Answer the question using the fewest possible tools. Only include the numeric response without any clarifications.')"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "task = registry[\"Multiverse Math\"]\n",
+    "task"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "bc33a639-3caf-4314-8ea7-1c7c8b1d114d",
+   "metadata": {},
+   "source": [
+    "Clone the dataset associated with this task"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "70369f67-deb4-467a-801a-6d38c3d0460d",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Dataset Multiverse Math already exists. Skipping.\n",
+      "You can access the dataset at https://smith.langchain.com/o/e081f11e-fbd2-41b4-9fa8-5d76c76ef854/datasets/ddca73f1-ceda-4562-8c49-7ee0a9df2a01.\n"
+     ]
+    }
+   ],
+   "source": [
+    "clone_public_dataset(task.dataset_id, dataset_name=task.name)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cede4edd-884d-4330-a186-5058b712394b",
+   "metadata": {},
+   "source": [
+    "## The Environment\n",
+    "\n",
+    "Let's check the environment"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "e2439d0c-ccb9-4f5b-a127-548725025a98",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[StructuredTool(name='multiply', description='multiply(a: float, b: float) -> float - Multiply two numbers; a * b.', args_schema=<class 'pydantic.v1.main.multiplySchemaSchema'>, func=<function multiply at 0x7f938669fb80>),\n",
+       " StructuredTool(name='add', description='add(a: float, b: float) -> float - Add two numbers; a + b.', args_schema=<class 'pydantic.v1.main.addSchemaSchema'>, func=<function add at 0x7f938669fca0>),\n",
+       " StructuredTool(name='divide', description='divide(a: float, b: float) -> float - Divide two numbers; a / b.', args_schema=<class 'pydantic.v1.main.divideSchemaSchema'>, func=<function divide at 0x7f938669fc10>),\n",
+       " StructuredTool(name='subtract', description='subtract(a: float, b: float) -> float - Subtract two numbers; a - b.', args_schema=<class 'pydantic.v1.main.subtractSchemaSchema'>, func=<function subtract at 0x7f938669fe50>),\n",
+       " StructuredTool(name='power', description='power(a: float, b: float) -> float - Raise a number to a power; a ** b.', args_schema=<class 'pydantic.v1.main.powerSchemaSchema'>, func=<function power at 0x7f938669fee0>)]"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "env = task.create_environment()\n",
+    "env.tools[:5]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1941e187-55ee-4d38-b529-4744ea2474b0",
+   "metadata": {},
+   "source": [
+    "Multiplying 2 x 4 = 8.8!!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "f5a100bd-6e19-498f-8a36-393b5c19bcb9",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "8.8"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "env.tools[0].invoke({\"a\": 2, \"b\": 4})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "bc60ef11-6300-4a83-989e-ec5b7f196796",
+   "metadata": {},
+   "source": [
+    "The task instructions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "31afb08b-17b8-4866-86c1-ee24e804415c",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'You are requested to solve math questions in an alternate mathematical universe. The operations have been altered to yield different results than expected. Do not guess the answer or rely on your  innate knowledge of math. Use the provided tools to answer the question. While associativity and commutativity apply, distributivity does not. Answer the question using the fewest possible tools. Only include the numeric response without any clarifications.'"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "task.instructions"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "92d65770-6a4f-4029-beba-5fa9aeb18809",
+   "metadata": {},
+   "source": [
+    "## Agent"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b462f7b8-fd42-4613-ab5f-5f3cbbc37d28",
+   "metadata": {},
+   "source": [
+    "Let's build an agent that we can use for evaluation."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "6142cf4e-862c-47a3-aa75-81d7d3231308",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'question': 'how much is 3 + 5',\n",
+       " 'output': 'The result of 3 + 5 in this alternate mathematical universe is 9.2.',\n",
+       " 'intermediate_steps': [(AgentActionMessageLog(tool='add', tool_input={'a': 3, 'b': 5}, log=\"\\nInvoking: `add` with `{'a': 3, 'b': 5}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n  \"a\": 3,\\n  \"b\": 5\\n}', 'name': 'add'}})]),\n",
+       "   9.2)]}"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from langchain_benchmarks.tool_usage import agents\n",
+    "\n",
+    "agent_factory = agents.OpenAIAgentFactory(task, model=\"gpt-3.5-turbo-16k\")\n",
+    "\n",
+    "# Let's test that our agent works\n",
+    "agent = agent_factory.create()\n",
+    "agent.invoke({\"question\": \"how much is 3 + 5\"})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3821e4b0-8e67-418a-840c-470fcde42df0",
+   "metadata": {},
+   "source": [
+    "## Eval\n",
+    "\n",
+    "Let's evaluate an agent now"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "fb32763c-79ab-426a-8fc6-bf8ebb0dd432",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "View the evaluation results for project 'test-weary-wing-36' at:\n",
+      "https://smith.langchain.com/o/e081f11e-fbd2-41b4-9fa8-5d76c76ef854/projects/p/33124759-882e-4a5c-a121-736310a40a1f?eval=true\n",
+      "\n",
+      "View all tests for Dataset Multiverse Math at:\n",
+      "https://smith.langchain.com/o/e081f11e-fbd2-41b4-9fa8-5d76c76ef854/datasets/ddca73f1-ceda-4562-8c49-7ee0a9df2a01\n",
+      "[------------------------------------------------->] 10/10\n",
+      " Eval quantiles:\n",
+      "                                    0.25       0.5      0.75      mean  \\\n",
+      "Intermediate steps correctness  0.250000  1.000000  1.000000  0.700000   \n",
+      "# steps / # expected steps      1.000000  1.000000  1.000000  1.033333   \n",
+      "correctness                     0.000000  0.000000  1.000000  0.400000   \n",
+      "execution_time                  5.771554  5.771554  5.771554  5.771554   \n",
+      "\n",
+      "                                    mode  \n",
+      "Intermediate steps correctness  1.000000  \n",
+      "# steps / # expected steps      1.000000  \n",
+      "correctness                     0.000000  \n",
+      "execution_time                  5.771554  \n"
+     ]
+    }
+   ],
+   "source": [
+    "from langsmith.client import Client\n",
+    "\n",
+    "from langchain_benchmarks.tool_usage import STANDARD_AGENT_EVALUATOR\n",
+    "\n",
+    "client = Client()\n",
+    "\n",
+    "test_run = client.run_on_dataset(\n",
+    "    dataset_name=task.name,\n",
+    "    llm_or_chain_factory=agent_factory.create,\n",
+    "    evaluation=STANDARD_AGENT_EVALUATOR,\n",
+    "    verbose=True,\n",
+    "    tags=[\"gpt-3.5-turbo-16k\"],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1b039225-01cf-481a-87a6-4e880e9b1dcd",
+   "metadata": {},
+   "source": [
+    "## Inspect\n",
+    "\n",
+    "You can take a look at the underlying results."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "6eb19db1-43b8-4866-a3d2-f211ba92ab8b",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "df = test_run.to_dataframe()\n",
+    "df = pd.json_normalize(df.to_dict(orient=\"records\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "7ab5a8b9-a937-4537-b879-704284df4494",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.4"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df[\"correctness\"].mean()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "ab7516ed-36b1-4c16-bf4a-cc49077460ad",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "df[\"num_expected_steps\"] = df[\"reference.expected_steps\"].apply(len)\n",
+    "df[\"actual_number_of_steps\"] = df[\"output.intermediate_steps\"].apply(len)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "50d7590d-20de-4768-ac90-adcdbfa70068",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Intermediate steps correctness</th>\n",
+       "      <th># steps / # expected steps</th>\n",
+       "      <th>correctness</th>\n",
+       "      <th>execution_time</th>\n",
+       "      <th>input.question</th>\n",
+       "      <th>output.question</th>\n",
+       "      <th>output.output</th>\n",
+       "      <th>output.intermediate_steps</th>\n",
+       "      <th>reference.reference</th>\n",
+       "      <th>reference.expected_steps</th>\n",
+       "      <th>num_expected_steps</th>\n",
+       "      <th>actual_number_of_steps</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>5.771554</td>\n",
+       "      <td>Add 2 and 3</td>\n",
+       "      <td>Add 2 and 3</td>\n",
+       "      <td>The sum of 2 and 3 in this alternate mathemati...</td>\n",
+       "      <td>[(tool='add' tool_input={'a': 2, 'b': 3} log=\"...</td>\n",
+       "      <td>6.20</td>\n",
+       "      <td>[add]</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>5.771554</td>\n",
+       "      <td>Subtract 3 from 2</td>\n",
+       "      <td>Subtract 3 from 2</td>\n",
+       "      <td>The result of subtracting 3 from 2 in this alt...</td>\n",
+       "      <td>[(tool='subtract' tool_input={'a': 2, 'b': 3} ...</td>\n",
+       "      <td>-4.00</td>\n",
+       "      <td>[subtract]</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>5.771554</td>\n",
+       "      <td>What is -5 if evaluated using the negate funct...</td>\n",
+       "      <td>What is -5 if evaluated using the negate funct...</td>\n",
+       "      <td>The result of evaluating -5 using the negate f...</td>\n",
+       "      <td>[(tool='negate' tool_input={'a': -5} log=\"\\nIn...</td>\n",
+       "      <td>-5.00</td>\n",
+       "      <td>[negate]</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>5.771554</td>\n",
+       "      <td>what is the result of 2 to the power of 3?</td>\n",
+       "      <td>what is the result of 2 to the power of 3?</td>\n",
+       "      <td>The result of 2 to the power of 3 is 32.</td>\n",
+       "      <td>[(tool='power' tool_input={'a': 2, 'b': 3} log...</td>\n",
+       "      <td>32.00</td>\n",
+       "      <td>[power]</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>5.771554</td>\n",
+       "      <td>I ate 1 apple and 2 oranges every day for 7 da...</td>\n",
+       "      <td>I ate 1 apple and 2 oranges every day for 7 da...</td>\n",
+       "      <td>You ate a total of 32.34 fruits.</td>\n",
+       "      <td>[(tool='add' tool_input={'a': 1, 'b': 2} log=\"...</td>\n",
+       "      <td>32.34</td>\n",
+       "      <td>[multiply, add]</td>\n",
+       "      <td>2</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   Intermediate steps correctness  # steps / # expected steps  correctness  \\\n",
+       "0                               1                         1.0            1   \n",
+       "1                               1                         1.0            0   \n",
+       "2                               1                         1.0            1   \n",
+       "3                               1                         1.0            0   \n",
+       "4                               0                         1.0            0   \n",
+       "\n",
+       "   execution_time                                     input.question  \\\n",
+       "0        5.771554                                        Add 2 and 3   \n",
+       "1        5.771554                                  Subtract 3 from 2   \n",
+       "2        5.771554  What is -5 if evaluated using the negate funct...   \n",
+       "3        5.771554         what is the result of 2 to the power of 3?   \n",
+       "4        5.771554  I ate 1 apple and 2 oranges every day for 7 da...   \n",
+       "\n",
+       "                                     output.question  \\\n",
+       "0                                        Add 2 and 3   \n",
+       "1                                  Subtract 3 from 2   \n",
+       "2  What is -5 if evaluated using the negate funct...   \n",
+       "3         what is the result of 2 to the power of 3?   \n",
+       "4  I ate 1 apple and 2 oranges every day for 7 da...   \n",
+       "\n",
+       "                                       output.output  \\\n",
+       "0  The sum of 2 and 3 in this alternate mathemati...   \n",
+       "1  The result of subtracting 3 from 2 in this alt...   \n",
+       "2  The result of evaluating -5 using the negate f...   \n",
+       "3           The result of 2 to the power of 3 is 32.   \n",
+       "4                   You ate a total of 32.34 fruits.   \n",
+       "\n",
+       "                           output.intermediate_steps  reference.reference  \\\n",
+       "0  [(tool='add' tool_input={'a': 2, 'b': 3} log=\"...                 6.20   \n",
+       "1  [(tool='subtract' tool_input={'a': 2, 'b': 3} ...                -4.00   \n",
+       "2  [(tool='negate' tool_input={'a': -5} log=\"\\nIn...                -5.00   \n",
+       "3  [(tool='power' tool_input={'a': 2, 'b': 3} log...                32.00   \n",
+       "4  [(tool='add' tool_input={'a': 1, 'b': 2} log=\"...                32.34   \n",
+       "\n",
+       "  reference.expected_steps  num_expected_steps  actual_number_of_steps  \n",
+       "0                    [add]                   1                       1  \n",
+       "1               [subtract]                   1                       1  \n",
+       "2                 [negate]                   1                       1  \n",
+       "3                  [power]                   1                       1  \n",
+       "4          [multiply, add]                   2                       2  "
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.head()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/docs/source/notebooks/tool_usage/relational_data.ipynb b/docs/source/notebooks/tool_usage/relational_data.ipynb
index 1c233c70..a79cae7d 100644
--- a/docs/source/notebooks/tool_usage/relational_data.ipynb
+++ b/docs/source/notebooks/tool_usage/relational_data.ipynb
@@ -1,1165 +1,812 @@
 {
-    "cells": [
-        {
-            "cell_type": "markdown",
-            "id": "60bb467d-861d-4b07-a48d-8e5aa177c969",
-            "metadata": {},
-            "source": [
-                "# Relational Data \n",
-                "\n",
-                "\n",
-                "Let's see how to evaluate an agent's ability to use tools."
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": 1,
-            "id": "b39159d0-9ea1-414f-a9d8-4a7b22b3d2cc",
-            "metadata": {
-                "tags": []
-            },
-            "outputs": [],
-            "source": [
-                "from langchain_benchmarks import clone_public_dataset, registry"
-            ]
-        },
-        {
-            "cell_type": "markdown",
-            "id": "03488ab1-31ed-41c2-8da2-46b02599b181",
-            "metadata": {},
-            "source": [
-                "For this code to work, please configure LangSmith environment variables with your credentials."
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": 2,
-            "id": "60f22779-a948-4833-8e8c-ace9ef17f56f",
-            "metadata": {
-                "tags": []
-            },
-            "outputs": [
-                {
-                    "data": {
-                        "text/html": [
-                            "<table>\n",
-                            "<tbody>\n",
-                            "<tr><td>Name       </td><td>Tool Usage - Relational Data                                             </td></tr>\n",
-                            "<tr><td>Type       </td><td>ToolUsageTask                                                            </td></tr>\n",
-                            "<tr><td>Dataset ID </td><td>https://smith.langchain.com/public/1d89f4b3-5f73-48cf-a127-2fdeb22f6d84/d</td></tr>\n",
-                            "<tr><td>Description</td><td>Environment with fake data about users and their locations and favorite foods.\n",
-                            "\n",
-                            "The environment provides a set of tools that can be used to query the data.\n",
-                            "\n",
-                            "The objective of this task is to evaluate the ability to use the provided tools to answer questions about relational data.\n",
-                            "\n",
-                            "The dataset contains 21 examples of varying difficulty. The difficulty is measured by the number of tools that need to be used to answer the question.\n",
-                            "\n",
-                            "Each example is composed of a question, a reference answer, and information about the sequence in which tools should be used to answer the question.\n",
-                            "\n",
-                            "Success is measured by the ability to answer the question correctly, and efficiently.                                                                          </td></tr>\n",
-                            "</tbody>\n",
-                            "</table>"
-                        ],
-                        "text/plain": [
-                            "ToolUsageTask(name='Tool Usage - Relational Data', dataset_id='https://smith.langchain.com/public/1d89f4b3-5f73-48cf-a127-2fdeb22f6d84/d', description='Environment with fake data about users and their locations and favorite foods.\\n\\nThe environment provides a set of tools that can be used to query the data.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to answer questions about relational data.\\n\\nThe dataset contains 21 examples of varying difficulty. The difficulty is measured by the number of tools that need to be used to answer the question.\\n\\nEach example is composed of a question, a reference answer, and information about the sequence in which tools should be used to answer the question.\\n\\nSuccess is measured by the ability to answer the question correctly, and efficiently.\\n', create_environment=<function get_environment at 0x7f2dd967cca0>, instructions=\"Please answer the user's question by using the tools provided. Do not guess the answer. Keep in mind that entities like users,foods and locations have both a name and an ID, which are not the same.\")"
-                        ]
-                    },
-                    "execution_count": 2,
-                    "metadata": {},
-                    "output_type": "execute_result"
-                }
-            ],
-            "source": [
-                "task = registry[\"Tool Usage - Relational Data\"]\n",
-                "task"
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": 3,
-            "id": "49be36d2-343e-49df-8369-dd5bac405d5e",
-            "metadata": {
-                "tags": []
-            },
-            "outputs": [
-                {
-                    "name": "stdout",
-                    "output_type": "stream",
-                    "text": [
-                        "Environment with fake data about users and their locations and favorite foods.\n",
-                        "\n",
-                        "The environment provides a set of tools that can be used to query the data.\n",
-                        "\n",
-                        "The objective of this task is to evaluate the ability to use the provided tools to answer questions about relational data.\n",
-                        "\n",
-                        "The dataset contains 21 examples of varying difficulty. The difficulty is measured by the number of tools that need to be used to answer the question.\n",
-                        "\n",
-                        "Each example is composed of a question, a reference answer, and information about the sequence in which tools should be used to answer the question.\n",
-                        "\n",
-                        "Success is measured by the ability to answer the question correctly, and efficiently.\n"
-                    ]
-                }
-            ],
-            "source": [
-                "print(task.description)"
-            ]
-        },
-        {
-            "cell_type": "markdown",
-            "id": "bc33a639-3caf-4314-8ea7-1c7c8b1d114d",
-            "metadata": {},
-            "source": [
-                "Clone the dataset associaetd with this task"
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": 4,
-            "id": "70369f67-deb4-467a-801a-6d38c3d0460d",
-            "metadata": {
-                "tags": []
-            },
-            "outputs": [
-                {
-                    "name": "stdout",
-                    "output_type": "stream",
-                    "text": [
-                        "Dataset Tool Usage - Relational Data already exists. Skipping.\n",
-                        "You can access the dataset at https://smith.langchain.com/o/e081f11e-fbd2-41b4-9fa8-5d76c76ef854/datasets/69c0e0d0-91b5-4183-bed0-6628e76964dc.\n"
-                    ]
-                }
-            ],
-            "source": [
-                "clone_public_dataset(task.dataset_id, dataset_name=task.name)"
-            ]
-        },
-        {
-            "cell_type": "markdown",
-            "id": "b462f7b8-fd42-4613-ab5f-5f3cbbc37d28",
-            "metadata": {},
-            "source": [
-                "## Define an agent\n",
-                "\n",
-                "Let's build an agent that we can use for evaluation."
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": 5,
-            "id": "09469813-17b6-4456-a913-486a01a4b295",
-            "metadata": {
-                "tags": []
-            },
-            "outputs": [],
-            "source": [
-                "from langchain_benchmarks.tool_usage import agents"
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": 6,
-            "id": "0ae8c6be-899c-44a6-a89b-0fc04c2cb05c",
-            "metadata": {
-                "tags": []
-            },
-            "outputs": [],
-            "source": [
-                "agent_factory = agents.OpenAIAgentFactory(task, model=\"gpt-3.5-turbo-16k\")"
-            ]
-        },
-        {
-            "cell_type": "markdown",
-            "id": "87a64f76-65ae-4367-b43f-f2be3431e7af",
-            "metadata": {},
-            "source": [
-                "Let's test that our agent works"
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": 7,
-            "id": "612fb603-1401-426b-8a19-4453ad5b698a",
-            "metadata": {
-                "tags": []
-            },
-            "outputs": [],
-            "source": [
-                "agent = agent_factory()"
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": 8,
-            "id": "0e4896fa-3633-44a1-857f-80a263cf2e03",
-            "metadata": {
-                "tags": []
-            },
-            "outputs": [
-                {
-                    "data": {
-                        "text/plain": [
-                            "{'question': 'who is bob?',\n",
-                            " 'output': 'Bob is a user with the ID 21.',\n",
-                            " 'intermediate_steps': [(AgentActionMessageLog(tool='find_users_by_name', tool_input={'name': 'bob'}, log=\"\\nInvoking: `find_users_by_name` with `{'name': 'bob'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n  \"name\": \"bob\"\\n}', 'name': 'find_users_by_name'}})]),\n",
-                            "   [{'id': 21, 'name': 'Bob'},\n",
-                            "    {'id': 41, 'name': 'Donna'},\n",
-                            "    {'id': 1, 'name': 'Alice'},\n",
-                            "    {'id': 35, 'name': 'Charlie'},\n",
-                            "    {'id': 42, 'name': 'Eve'},\n",
-                            "    {'id': 43, 'name': 'Frank The Cat'}])]}"
-                        ]
-                    },
-                    "execution_count": 8,
-                    "metadata": {},
-                    "output_type": "execute_result"
-                }
-            ],
-            "source": [
-                "agent.invoke({\"question\": \"who is bob?\"})"
-            ]
-        },
-        {
-            "cell_type": "markdown",
-            "id": "3821e4b0-8e67-418a-840c-470fcde42df0",
-            "metadata": {},
-            "source": [
-                "## Eval\n",
-                "\n",
-                "Let's evaluate an agent now"
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": 9,
-            "id": "513042fe-2878-44f8-ae84-05b9d521c1de",
-            "metadata": {
-                "tags": []
-            },
-            "outputs": [],
-            "source": [
-                "from langsmith.client import Client\n",
-                "\n",
-                "from langchain_benchmarks.tool_usage import STANDARD_AGENT_EVALUATOR"
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": 10,
-            "id": "2bedd9d1-fc06-4066-9f89-b874ae818d82",
-            "metadata": {
-                "tags": []
-            },
-            "outputs": [],
-            "source": [
-                "client = Client()"
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": 11,
-            "id": "aab7514e-a6ef-4c21-b90f-d9cbefcf5af1",
-            "metadata": {
-                "tags": []
-            },
-            "outputs": [
-                {
-                    "name": "stdout",
-                    "output_type": "stream",
-                    "text": [
-                        "View the evaluation results for project 'test-warm-whip-57' at:\n",
-                        "https://smith.langchain.com/o/e081f11e-fbd2-41b4-9fa8-5d76c76ef854/projects/p/048077f0-52ca-4bae-8792-ec5e2a817d38?eval=true\n",
-                        "\n",
-                        "View all tests for Dataset Tool Usage - Relational Data at:\n",
-                        "https://smith.langchain.com/o/e081f11e-fbd2-41b4-9fa8-5d76c76ef854/datasets/69c0e0d0-91b5-4183-bed0-6628e76964dc\n",
-                        "[------------------------------------------------->] 21/21\n",
-                        " Eval quantiles:\n",
-                        "                                    0.25       0.5      0.75      mean  \\\n",
-                        "Intermediate steps correctness  0.000000  1.000000  1.000000  0.714286   \n",
-                        "# steps / # expected steps      1.000000  1.000000  1.000000  0.928571   \n",
-                        "correctness                     1.000000  1.000000  1.000000  0.809524   \n",
-                        "execution_time                  5.098939  5.098939  5.098939  5.098939   \n",
-                        "\n",
-                        "                                    mode  \n",
-                        "Intermediate steps correctness  1.000000  \n",
-                        "# steps / # expected steps      1.000000  \n",
-                        "correctness                     1.000000  \n",
-                        "execution_time                  5.098939  \n"
-                    ]
-                }
-            ],
-            "source": [
-                "test_run = client.run_on_dataset(\n",
-                "    dataset_name=task.name,\n",
-                "    llm_or_chain_factory=agent_factory.create,\n",
-                "    evaluation=STANDARD_AGENT_EVALUATOR,\n",
-                "    verbose=True,\n",
-                "    tags=[\"openai-functions\"],\n",
-                ")"
-            ]
-        },
-        {
-            "cell_type": "markdown",
-            "id": "1b039225-01cf-481a-87a6-4e880e9b1dcd",
-            "metadata": {},
-            "source": [
-                "# Inspect\n",
-                "\n",
-                "Here, we'll take a look at the underlying results a little bit."
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": 18,
-            "id": "6eb19db1-43b8-4866-a3d2-f211ba92ab8b",
-            "metadata": {
-                "tags": []
-            },
-            "outputs": [],
-            "source": [
-                "import pandas as pd\n",
-                "\n",
-                "df = test_run.to_dataframe()\n",
-                "df = pd.json_normalize(df.to_dict(orient=\"records\"))"
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": 19,
-            "id": "7ab5a8b9-a937-4537-b879-704284df4494",
-            "metadata": {
-                "tags": []
-            },
-            "outputs": [
-                {
-                    "data": {
-                        "text/plain": [
-                            "0.8095238095238095"
-                        ]
-                    },
-                    "execution_count": 19,
-                    "metadata": {},
-                    "output_type": "execute_result"
-                }
-            ],
-            "source": [
-                "df[\"correctness\"].mean()"
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": 20,
-            "id": "ab7516ed-36b1-4c16-bf4a-cc49077460ad",
-            "metadata": {
-                "tags": []
-            },
-            "outputs": [],
-            "source": [
-                "df[\"num_expected_steps\"] = df[\"reference.expected_steps\"].apply(len)\n",
-                "df[\"actual_number_of_steps\"] = df[\"output.intermediate_steps\"].apply(len)"
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": 21,
-            "id": "50d7590d-20de-4768-ac90-adcdbfa70068",
-            "metadata": {
-                "tags": []
-            },
-            "outputs": [
-                {
-                    "data": {
-                        "text/html": [
-                            "<div>\n",
-                            "<style scoped>\n",
-                            "    .dataframe tbody tr th:only-of-type {\n",
-                            "        vertical-align: middle;\n",
-                            "    }\n",
-                            "\n",
-                            "    .dataframe tbody tr th {\n",
-                            "        vertical-align: top;\n",
-                            "    }\n",
-                            "\n",
-                            "    .dataframe thead th {\n",
-                            "        text-align: right;\n",
-                            "    }\n",
-                            "</style>\n",
-                            "<table border=\"1\" class=\"dataframe\">\n",
-                            "  <thead>\n",
-                            "    <tr style=\"text-align: right;\">\n",
-                            "      <th></th>\n",
-                            "      <th>Intermediate steps correctness</th>\n",
-                            "      <th># steps / # expected steps</th>\n",
-                            "      <th>correctness</th>\n",
-                            "      <th>execution_time</th>\n",
-                            "      <th>input.question</th>\n",
-                            "      <th>output.question</th>\n",
-                            "      <th>output.output</th>\n",
-                            "      <th>output.intermediate_steps</th>\n",
-                            "      <th>reference.reference</th>\n",
-                            "      <th>reference.order_matters</th>\n",
-                            "      <th>reference.expected_steps</th>\n",
-                            "      <th>num_expected_steps</th>\n",
-                            "      <th>actual_number_of_steps</th>\n",
-                            "    </tr>\n",
-                            "  </thead>\n",
-                            "  <tbody>\n",
-                            "    <tr>\n",
-                            "      <th>0</th>\n",
-                            "      <td>0</td>\n",
-                            "      <td>1.0</td>\n",
-                            "      <td>1</td>\n",
-                            "      <td>5.098939</td>\n",
-                            "      <td>do bob and alice live in the same city?</td>\n",
-                            "      <td>do bob and alice live in the same city?</td>\n",
-                            "      <td>No, Bob and Alice do not live in the same city...</td>\n",
-                            "      <td>[(tool='find_users_by_name' tool_input={'name'...</td>\n",
-                            "      <td>no</td>\n",
-                            "      <td>False</td>\n",
-                            "      <td>[find_users_by_name, get_user_location, get_ci...</td>\n",
-                            "      <td>5</td>\n",
-                            "      <td>5</td>\n",
-                            "    </tr>\n",
-                            "    <tr>\n",
-                            "      <th>1</th>\n",
-                            "      <td>0</td>\n",
-                            "      <td>0.0</td>\n",
-                            "      <td>0</td>\n",
-                            "      <td>5.098939</td>\n",
-                            "      <td>Is it likely that Donna is outside with an umb...</td>\n",
-                            "      <td>Is it likely that Donna is outside with an umb...</td>\n",
-                            "      <td>I'm sorry, but I don't have access to real-tim...</td>\n",
-                            "      <td>[]</td>\n",
-                            "      <td>yes</td>\n",
-                            "      <td>False</td>\n",
-                            "      <td>[find_users_by_name, get_user_location, get_cu...</td>\n",
-                            "      <td>4</td>\n",
-                            "      <td>0</td>\n",
-                            "    </tr>\n",
-                            "    <tr>\n",
-                            "      <th>2</th>\n",
-                            "      <td>1</td>\n",
-                            "      <td>1.0</td>\n",
-                            "      <td>1</td>\n",
-                            "      <td>5.098939</td>\n",
-                            "      <td>do alice and charlie use the same email provider?</td>\n",
-                            "      <td>do alice and charlie use the same email provider?</td>\n",
-                            "      <td>No, Alice and Charlie do not use the same emai...</td>\n",
-                            "      <td>[(tool='find_users_by_name' tool_input={'name'...</td>\n",
-                            "      <td>no</td>\n",
-                            "      <td>True</td>\n",
-                            "      <td>[find_users_by_name, get_user_email, get_user_...</td>\n",
-                            "      <td>3</td>\n",
-                            "      <td>3</td>\n",
-                            "    </tr>\n",
-                            "    <tr>\n",
-                            "      <th>3</th>\n",
-                            "      <td>0</td>\n",
-                            "      <td>0.0</td>\n",
-                            "      <td>0</td>\n",
-                            "      <td>5.098939</td>\n",
-                            "      <td>Is it likely that Donna is awake right now?</td>\n",
-                            "      <td>Is it likely that Donna is awake right now?</td>\n",
-                            "      <td>I'm sorry, but I don't have access to informat...</td>\n",
-                            "      <td>[]</td>\n",
-                            "      <td>yes</td>\n",
-                            "      <td>True</td>\n",
-                            "      <td>[find_users_by_name, get_user_location, get_cu...</td>\n",
-                            "      <td>3</td>\n",
-                            "      <td>0</td>\n",
-                            "    </tr>\n",
-                            "    <tr>\n",
-                            "      <th>4</th>\n",
-                            "      <td>0</td>\n",
-                            "      <td>1.0</td>\n",
-                            "      <td>1</td>\n",
-                            "      <td>5.098939</td>\n",
-                            "      <td>Donna is about to go outside. Does she need an...</td>\n",
-                            "      <td>Donna is about to go outside. Does she need an...</td>\n",
-                            "      <td>Donna is currently in a location where it is r...</td>\n",
-                            "      <td>[(tool='find_users_by_name' tool_input={'name'...</td>\n",
-                            "      <td>yes</td>\n",
-                            "      <td>True</td>\n",
-                            "      <td>[find_users_by_name, get_user_location, get_cu...</td>\n",
-                            "      <td>3</td>\n",
-                            "      <td>3</td>\n",
-                            "    </tr>\n",
-                            "  </tbody>\n",
-                            "</table>\n",
-                            "</div>"
-                        ],
-                        "text/plain": [
-                            "   Intermediate steps correctness  # steps / # expected steps  correctness  \\\n",
-                            "0                               0                         1.0            1   \n",
-                            "1                               0                         0.0            0   \n",
-                            "2                               1                         1.0            1   \n",
-                            "3                               0                         0.0            0   \n",
-                            "4                               0                         1.0            1   \n",
-                            "\n",
-                            "   execution_time                                     input.question  \\\n",
-                            "0        5.098939            do bob and alice live in the same city?   \n",
-                            "1        5.098939  Is it likely that Donna is outside with an umb...   \n",
-                            "2        5.098939  do alice and charlie use the same email provider?   \n",
-                            "3        5.098939        Is it likely that Donna is awake right now?   \n",
-                            "4        5.098939  Donna is about to go outside. Does she need an...   \n",
-                            "\n",
-                            "                                     output.question  \\\n",
-                            "0            do bob and alice live in the same city?   \n",
-                            "1  Is it likely that Donna is outside with an umb...   \n",
-                            "2  do alice and charlie use the same email provider?   \n",
-                            "3        Is it likely that Donna is awake right now?   \n",
-                            "4  Donna is about to go outside. Does she need an...   \n",
-                            "\n",
-                            "                                       output.output  \\\n",
-                            "0  No, Bob and Alice do not live in the same city...   \n",
-                            "1  I'm sorry, but I don't have access to real-tim...   \n",
-                            "2  No, Alice and Charlie do not use the same emai...   \n",
-                            "3  I'm sorry, but I don't have access to informat...   \n",
-                            "4  Donna is currently in a location where it is r...   \n",
-                            "\n",
-                            "                           output.intermediate_steps reference.reference  \\\n",
-                            "0  [(tool='find_users_by_name' tool_input={'name'...                  no   \n",
-                            "1                                                 []                 yes   \n",
-                            "2  [(tool='find_users_by_name' tool_input={'name'...                  no   \n",
-                            "3                                                 []                 yes   \n",
-                            "4  [(tool='find_users_by_name' tool_input={'name'...                 yes   \n",
-                            "\n",
-                            "   reference.order_matters                           reference.expected_steps  \\\n",
-                            "0                    False  [find_users_by_name, get_user_location, get_ci...   \n",
-                            "1                    False  [find_users_by_name, get_user_location, get_cu...   \n",
-                            "2                     True  [find_users_by_name, get_user_email, get_user_...   \n",
-                            "3                     True  [find_users_by_name, get_user_location, get_cu...   \n",
-                            "4                     True  [find_users_by_name, get_user_location, get_cu...   \n",
-                            "\n",
-                            "   num_expected_steps  actual_number_of_steps  \n",
-                            "0                   5                       5  \n",
-                            "1                   4                       0  \n",
-                            "2                   3                       3  \n",
-                            "3                   3                       0  \n",
-                            "4                   3                       3  "
-                        ]
-                    },
-                    "execution_count": 21,
-                    "metadata": {},
-                    "output_type": "execute_result"
-                }
-            ],
-            "source": [
-                "df.head()"
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": 22,
-            "id": "ffab97b7-eda2-408d-b611-596b637e627a",
-            "metadata": {
-                "tags": []
-            },
-            "outputs": [],
-            "source": [
-                "df = df.sort_values(\"actual_number_of_steps\", ascending=False)"
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": 23,
-            "id": "20eb92f0-9373-4741-a851-b21c41f8c203",
-            "metadata": {
-                "tags": []
-            },
-            "outputs": [
-                {
-                    "data": {
-                        "text/html": [
-                            "<div>\n",
-                            "<style scoped>\n",
-                            "    .dataframe tbody tr th:only-of-type {\n",
-                            "        vertical-align: middle;\n",
-                            "    }\n",
-                            "\n",
-                            "    .dataframe tbody tr th {\n",
-                            "        vertical-align: top;\n",
-                            "    }\n",
-                            "\n",
-                            "    .dataframe thead th {\n",
-                            "        text-align: right;\n",
-                            "    }\n",
-                            "</style>\n",
-                            "<table border=\"1\" class=\"dataframe\">\n",
-                            "  <thead>\n",
-                            "    <tr style=\"text-align: right;\">\n",
-                            "      <th></th>\n",
-                            "      <th>Intermediate steps correctness</th>\n",
-                            "      <th># steps / # expected steps</th>\n",
-                            "      <th>correctness</th>\n",
-                            "      <th>execution_time</th>\n",
-                            "      <th>input.question</th>\n",
-                            "      <th>output.question</th>\n",
-                            "      <th>output.output</th>\n",
-                            "      <th>output.intermediate_steps</th>\n",
-                            "      <th>reference.reference</th>\n",
-                            "      <th>reference.order_matters</th>\n",
-                            "      <th>reference.expected_steps</th>\n",
-                            "      <th>num_expected_steps</th>\n",
-                            "      <th>actual_number_of_steps</th>\n",
-                            "    </tr>\n",
-                            "  </thead>\n",
-                            "  <tbody>\n",
-                            "    <tr>\n",
-                            "      <th>0</th>\n",
-                            "      <td>0</td>\n",
-                            "      <td>1.0</td>\n",
-                            "      <td>1</td>\n",
-                            "      <td>5.098939</td>\n",
-                            "      <td>do bob and alice live in the same city?</td>\n",
-                            "      <td>do bob and alice live in the same city?</td>\n",
-                            "      <td>No, Bob and Alice do not live in the same city...</td>\n",
-                            "      <td>[(tool='find_users_by_name' tool_input={'name'...</td>\n",
-                            "      <td>no</td>\n",
-                            "      <td>False</td>\n",
-                            "      <td>[find_users_by_name, get_user_location, get_ci...</td>\n",
-                            "      <td>5</td>\n",
-                            "      <td>5</td>\n",
-                            "    </tr>\n",
-                            "    <tr>\n",
-                            "      <th>2</th>\n",
-                            "      <td>1</td>\n",
-                            "      <td>1.0</td>\n",
-                            "      <td>1</td>\n",
-                            "      <td>5.098939</td>\n",
-                            "      <td>do alice and charlie use the same email provider?</td>\n",
-                            "      <td>do alice and charlie use the same email provider?</td>\n",
-                            "      <td>No, Alice and Charlie do not use the same emai...</td>\n",
-                            "      <td>[(tool='find_users_by_name' tool_input={'name'...</td>\n",
-                            "      <td>no</td>\n",
-                            "      <td>True</td>\n",
-                            "      <td>[find_users_by_name, get_user_email, get_user_...</td>\n",
-                            "      <td>3</td>\n",
-                            "      <td>3</td>\n",
-                            "    </tr>\n",
-                            "    <tr>\n",
-                            "      <th>4</th>\n",
-                            "      <td>0</td>\n",
-                            "      <td>1.0</td>\n",
-                            "      <td>1</td>\n",
-                            "      <td>5.098939</td>\n",
-                            "      <td>Donna is about to go outside. Does she need an...</td>\n",
-                            "      <td>Donna is about to go outside. Does she need an...</td>\n",
-                            "      <td>Donna is currently in a location where it is r...</td>\n",
-                            "      <td>[(tool='find_users_by_name' tool_input={'name'...</td>\n",
-                            "      <td>yes</td>\n",
-                            "      <td>True</td>\n",
-                            "      <td>[find_users_by_name, get_user_location, get_cu...</td>\n",
-                            "      <td>3</td>\n",
-                            "      <td>3</td>\n",
-                            "    </tr>\n",
-                            "    <tr>\n",
-                            "      <th>5</th>\n",
-                            "      <td>0</td>\n",
-                            "      <td>1.0</td>\n",
-                            "      <td>0</td>\n",
-                            "      <td>5.098939</td>\n",
-                            "      <td>whats the name of the city where bob lives?</td>\n",
-                            "      <td>whats the name of the city where bob lives?</td>\n",
-                            "      <td>The name of the city where Bob lives is New York.</td>\n",
-                            "      <td>[(tool='list_user_ids' tool_input={} log='\\nIn...</td>\n",
-                            "      <td>Los Angeles</td>\n",
-                            "      <td>True</td>\n",
-                            "      <td>[find_users_by_name, get_user_location, get_ci...</td>\n",
-                            "      <td>3</td>\n",
-                            "      <td>3</td>\n",
-                            "    </tr>\n",
-                            "    <tr>\n",
-                            "      <th>6</th>\n",
-                            "      <td>1</td>\n",
-                            "      <td>1.0</td>\n",
-                            "      <td>1</td>\n",
-                            "      <td>5.098939</td>\n",
-                            "      <td>what is the current users favorite color and n...</td>\n",
-                            "      <td>what is the current users favorite color and n...</td>\n",
-                            "      <td>The current user's favorite color is yellow an...</td>\n",
-                            "      <td>[(tool='get_current_user_id' tool_input={} log...</td>\n",
-                            "      <td>yellow and Charlie</td>\n",
-                            "      <td>True</td>\n",
-                            "      <td>[get_current_user_id, get_user_favorite_color,...</td>\n",
-                            "      <td>3</td>\n",
-                            "      <td>3</td>\n",
-                            "    </tr>\n",
-                            "    <tr>\n",
-                            "      <th>7</th>\n",
-                            "      <td>0</td>\n",
-                            "      <td>1.5</td>\n",
-                            "      <td>1</td>\n",
-                            "      <td>5.098939</td>\n",
-                            "      <td>Frank who is Even's friend is allergic to dair...</td>\n",
-                            "      <td>Frank who is Even's friend is allergic to dair...</td>\n",
-                            "      <td>Frank's favorite food is the salad, which cont...</td>\n",
-                            "      <td>[(tool='find_users_by_name' tool_input={'name'...</td>\n",
-                            "      <td>yes</td>\n",
-                            "      <td>True</td>\n",
-                            "      <td>[find_users_by_name, get_food_allergic_ingredi...</td>\n",
-                            "      <td>2</td>\n",
-                            "      <td>3</td>\n",
-                            "    </tr>\n",
-                            "    <tr>\n",
-                            "      <th>11</th>\n",
-                            "      <td>1</td>\n",
-                            "      <td>1.0</td>\n",
-                            "      <td>1</td>\n",
-                            "      <td>5.098939</td>\n",
-                            "      <td>list the allergens in chocolate</td>\n",
-                            "      <td>list the allergens in chocolate</td>\n",
-                            "      <td>The allergens in chocolate are milk and soy.</td>\n",
-                            "      <td>[(tool='find_foods_by_name' tool_input={'food'...</td>\n",
-                            "      <td>milk, soy</td>\n",
-                            "      <td>True</td>\n",
-                            "      <td>[find_foods_by_name, get_food_allergic_ingredi...</td>\n",
-                            "      <td>2</td>\n",
-                            "      <td>2</td>\n",
-                            "    </tr>\n",
-                            "    <tr>\n",
-                            "      <th>15</th>\n",
-                            "      <td>1</td>\n",
-                            "      <td>1.0</td>\n",
-                            "      <td>1</td>\n",
-                            "      <td>5.098939</td>\n",
-                            "      <td>what is alice's email address?</td>\n",
-                            "      <td>what is alice's email address?</td>\n",
-                            "      <td>Alice's email address is alice@gmail.com.</td>\n",
-                            "      <td>[(tool='find_users_by_name' tool_input={'name'...</td>\n",
-                            "      <td>alice@gmail.com</td>\n",
-                            "      <td>True</td>\n",
-                            "      <td>[find_users_by_name, get_user_email]</td>\n",
-                            "      <td>2</td>\n",
-                            "      <td>2</td>\n",
-                            "    </tr>\n",
-                            "    <tr>\n",
-                            "      <th>14</th>\n",
-                            "      <td>1</td>\n",
-                            "      <td>1.0</td>\n",
-                            "      <td>1</td>\n",
-                            "      <td>5.098939</td>\n",
-                            "      <td>find donna's favorite color</td>\n",
-                            "      <td>find donna's favorite color</td>\n",
-                            "      <td>Donna's favorite color is green.</td>\n",
-                            "      <td>[(tool='find_users_by_name' tool_input={'name'...</td>\n",
-                            "      <td>green</td>\n",
-                            "      <td>True</td>\n",
-                            "      <td>[find_users_by_name, get_user_favorite_color]</td>\n",
-                            "      <td>2</td>\n",
-                            "      <td>2</td>\n",
-                            "    </tr>\n",
-                            "    <tr>\n",
-                            "      <th>13</th>\n",
-                            "      <td>1</td>\n",
-                            "      <td>1.0</td>\n",
-                            "      <td>1</td>\n",
-                            "      <td>5.098939</td>\n",
-                            "      <td>weather in LA right now?</td>\n",
-                            "      <td>weather in LA right now?</td>\n",
-                            "      <td>The current weather in Los Angeles is sunny wi...</td>\n",
-                            "      <td>[(tool='find_locations_by_name' tool_input={'c...</td>\n",
-                            "      <td>Sunny, Temperature: 75°F</td>\n",
-                            "      <td>True</td>\n",
-                            "      <td>[find_locations_by_name, get_current_weather_f...</td>\n",
-                            "      <td>2</td>\n",
-                            "      <td>2</td>\n",
-                            "    </tr>\n",
-                            "    <tr>\n",
-                            "      <th>12</th>\n",
-                            "      <td>1</td>\n",
-                            "      <td>1.0</td>\n",
-                            "      <td>0</td>\n",
-                            "      <td>5.098939</td>\n",
-                            "      <td>time in chicago</td>\n",
-                            "      <td>time in chicago</td>\n",
-                            "      <td>The current time in Chicago is 11:15 AM.</td>\n",
-                            "      <td>[(tool='find_locations_by_name' tool_input={'c...</td>\n",
-                            "      <td>2023-11-14 11:15 AM</td>\n",
-                            "      <td>True</td>\n",
-                            "      <td>[find_locations_by_name, get_current_time_for_...</td>\n",
-                            "      <td>2</td>\n",
-                            "      <td>2</td>\n",
-                            "    </tr>\n",
-                            "    <tr>\n",
-                            "      <th>10</th>\n",
-                            "      <td>1</td>\n",
-                            "      <td>1.0</td>\n",
-                            "      <td>1</td>\n",
-                            "      <td>5.098939</td>\n",
-                            "      <td>If i eat a serving of pizza, how many calories...</td>\n",
-                            "      <td>If i eat a serving of pizza, how many calories...</td>\n",
-                            "      <td>If you eat a serving of pizza, you will consum...</td>\n",
-                            "      <td>[(tool='find_foods_by_name' tool_input={'food'...</td>\n",
-                            "      <td>285 calories</td>\n",
-                            "      <td>True</td>\n",
-                            "      <td>[find_foods_by_name, get_food_calories]</td>\n",
-                            "      <td>2</td>\n",
-                            "      <td>2</td>\n",
-                            "    </tr>\n",
-                            "    <tr>\n",
-                            "      <th>9</th>\n",
-                            "      <td>1</td>\n",
-                            "      <td>1.0</td>\n",
-                            "      <td>1</td>\n",
-                            "      <td>5.098939</td>\n",
-                            "      <td>what is the current users favorite color?</td>\n",
-                            "      <td>what is the current users favorite color?</td>\n",
-                            "      <td>The current user's favorite color is yellow.</td>\n",
-                            "      <td>[(tool='get_current_user_id' tool_input={} log...</td>\n",
-                            "      <td>yellow</td>\n",
-                            "      <td>True</td>\n",
-                            "      <td>[get_current_user_id, get_user_favorite_color]</td>\n",
-                            "      <td>2</td>\n",
-                            "      <td>2</td>\n",
-                            "    </tr>\n",
-                            "    <tr>\n",
-                            "      <th>8</th>\n",
-                            "      <td>1</td>\n",
-                            "      <td>1.0</td>\n",
-                            "      <td>1</td>\n",
-                            "      <td>5.098939</td>\n",
-                            "      <td>eve ate a serving of sushi, what allergens was...</td>\n",
-                            "      <td>eve ate a serving of sushi, what allergens was...</td>\n",
-                            "      <td>Eve was exposed to the allergens fish and soy ...</td>\n",
-                            "      <td>[(tool='find_foods_by_name' tool_input={'food'...</td>\n",
-                            "      <td>fish, soy</td>\n",
-                            "      <td>True</td>\n",
-                            "      <td>[find_foods_by_name, get_food_allergic_ingredi...</td>\n",
-                            "      <td>2</td>\n",
-                            "      <td>2</td>\n",
-                            "    </tr>\n",
-                            "    <tr>\n",
-                            "      <th>16</th>\n",
-                            "      <td>1</td>\n",
-                            "      <td>1.0</td>\n",
-                            "      <td>1</td>\n",
-                            "      <td>5.098939</td>\n",
-                            "      <td>How many users by the name of bob?</td>\n",
-                            "      <td>How many users by the name of bob?</td>\n",
-                            "      <td>There are 1 user(s) with the name \"Bob\".</td>\n",
-                            "      <td>[(tool='find_users_by_name' tool_input={'name'...</td>\n",
-                            "      <td>1</td>\n",
-                            "      <td>True</td>\n",
-                            "      <td>[find_users_by_name]</td>\n",
-                            "      <td>1</td>\n",
-                            "      <td>1</td>\n",
-                            "    </tr>\n",
-                            "    <tr>\n",
-                            "      <th>17</th>\n",
-                            "      <td>1</td>\n",
-                            "      <td>1.0</td>\n",
-                            "      <td>1</td>\n",
-                            "      <td>5.098939</td>\n",
-                            "      <td>get the current user id</td>\n",
-                            "      <td>get the current user id</td>\n",
-                            "      <td>The current user ID is 35.</td>\n",
-                            "      <td>[(tool='get_current_user_id' tool_input={} log...</td>\n",
-                            "      <td>35</td>\n",
-                            "      <td>True</td>\n",
-                            "      <td>[get_current_user_id]</td>\n",
-                            "      <td>1</td>\n",
-                            "      <td>1</td>\n",
-                            "    </tr>\n",
-                            "    <tr>\n",
-                            "      <th>18</th>\n",
-                            "      <td>1</td>\n",
-                            "      <td>1.0</td>\n",
-                            "      <td>1</td>\n",
-                            "      <td>5.098939</td>\n",
-                            "      <td>what is eve's user id?</td>\n",
-                            "      <td>what is eve's user id?</td>\n",
-                            "      <td>Eve's user ID is 42.</td>\n",
-                            "      <td>[(tool='find_users_by_name' tool_input={'name'...</td>\n",
-                            "      <td>42</td>\n",
-                            "      <td>True</td>\n",
-                            "      <td>[find_users_by_name]</td>\n",
-                            "      <td>1</td>\n",
-                            "      <td>1</td>\n",
-                            "    </tr>\n",
-                            "    <tr>\n",
-                            "      <th>19</th>\n",
-                            "      <td>1</td>\n",
-                            "      <td>1.0</td>\n",
-                            "      <td>1</td>\n",
-                            "      <td>5.098939</td>\n",
-                            "      <td>What is the name of food with id 6?</td>\n",
-                            "      <td>What is the name of food with id 6?</td>\n",
-                            "      <td>The name of the food with ID 6 is Pasta.</td>\n",
-                            "      <td>[(tool='get_food_name' tool_input={'food_id': ...</td>\n",
-                            "      <td>Pasta</td>\n",
-                            "      <td>True</td>\n",
-                            "      <td>[get_food_name]</td>\n",
-                            "      <td>1</td>\n",
-                            "      <td>1</td>\n",
-                            "    </tr>\n",
-                            "    <tr>\n",
-                            "      <th>20</th>\n",
-                            "      <td>1</td>\n",
-                            "      <td>1.0</td>\n",
-                            "      <td>1</td>\n",
-                            "      <td>5.098939</td>\n",
-                            "      <td>What is the city for location ID 1?</td>\n",
-                            "      <td>What is the city for location ID 1?</td>\n",
-                            "      <td>The city for location ID 1 is New York.</td>\n",
-                            "      <td>[(tool='get_city_for_location' tool_input={'lo...</td>\n",
-                            "      <td>New York</td>\n",
-                            "      <td>True</td>\n",
-                            "      <td>[get_city_for_location]</td>\n",
-                            "      <td>1</td>\n",
-                            "      <td>1</td>\n",
-                            "    </tr>\n",
-                            "    <tr>\n",
-                            "      <th>1</th>\n",
-                            "      <td>0</td>\n",
-                            "      <td>0.0</td>\n",
-                            "      <td>0</td>\n",
-                            "      <td>5.098939</td>\n",
-                            "      <td>Is it likely that Donna is outside with an umb...</td>\n",
-                            "      <td>Is it likely that Donna is outside with an umb...</td>\n",
-                            "      <td>I'm sorry, but I don't have access to real-tim...</td>\n",
-                            "      <td>[]</td>\n",
-                            "      <td>yes</td>\n",
-                            "      <td>False</td>\n",
-                            "      <td>[find_users_by_name, get_user_location, get_cu...</td>\n",
-                            "      <td>4</td>\n",
-                            "      <td>0</td>\n",
-                            "    </tr>\n",
-                            "    <tr>\n",
-                            "      <th>3</th>\n",
-                            "      <td>0</td>\n",
-                            "      <td>0.0</td>\n",
-                            "      <td>0</td>\n",
-                            "      <td>5.098939</td>\n",
-                            "      <td>Is it likely that Donna is awake right now?</td>\n",
-                            "      <td>Is it likely that Donna is awake right now?</td>\n",
-                            "      <td>I'm sorry, but I don't have access to informat...</td>\n",
-                            "      <td>[]</td>\n",
-                            "      <td>yes</td>\n",
-                            "      <td>True</td>\n",
-                            "      <td>[find_users_by_name, get_user_location, get_cu...</td>\n",
-                            "      <td>3</td>\n",
-                            "      <td>0</td>\n",
-                            "    </tr>\n",
-                            "  </tbody>\n",
-                            "</table>\n",
-                            "</div>"
-                        ],
-                        "text/plain": [
-                            "    Intermediate steps correctness  # steps / # expected steps  correctness  \\\n",
-                            "0                                0                         1.0            1   \n",
-                            "2                                1                         1.0            1   \n",
-                            "4                                0                         1.0            1   \n",
-                            "5                                0                         1.0            0   \n",
-                            "6                                1                         1.0            1   \n",
-                            "7                                0                         1.5            1   \n",
-                            "11                               1                         1.0            1   \n",
-                            "15                               1                         1.0            1   \n",
-                            "14                               1                         1.0            1   \n",
-                            "13                               1                         1.0            1   \n",
-                            "12                               1                         1.0            0   \n",
-                            "10                               1                         1.0            1   \n",
-                            "9                                1                         1.0            1   \n",
-                            "8                                1                         1.0            1   \n",
-                            "16                               1                         1.0            1   \n",
-                            "17                               1                         1.0            1   \n",
-                            "18                               1                         1.0            1   \n",
-                            "19                               1                         1.0            1   \n",
-                            "20                               1                         1.0            1   \n",
-                            "1                                0                         0.0            0   \n",
-                            "3                                0                         0.0            0   \n",
-                            "\n",
-                            "    execution_time                                     input.question  \\\n",
-                            "0         5.098939            do bob and alice live in the same city?   \n",
-                            "2         5.098939  do alice and charlie use the same email provider?   \n",
-                            "4         5.098939  Donna is about to go outside. Does she need an...   \n",
-                            "5         5.098939        whats the name of the city where bob lives?   \n",
-                            "6         5.098939  what is the current users favorite color and n...   \n",
-                            "7         5.098939  Frank who is Even's friend is allergic to dair...   \n",
-                            "11        5.098939                    list the allergens in chocolate   \n",
-                            "15        5.098939                     what is alice's email address?   \n",
-                            "14        5.098939                        find donna's favorite color   \n",
-                            "13        5.098939                           weather in LA right now?   \n",
-                            "12        5.098939                                    time in chicago   \n",
-                            "10        5.098939  If i eat a serving of pizza, how many calories...   \n",
-                            "9         5.098939          what is the current users favorite color?   \n",
-                            "8         5.098939  eve ate a serving of sushi, what allergens was...   \n",
-                            "16        5.098939                 How many users by the name of bob?   \n",
-                            "17        5.098939                            get the current user id   \n",
-                            "18        5.098939                             what is eve's user id?   \n",
-                            "19        5.098939                What is the name of food with id 6?   \n",
-                            "20        5.098939                What is the city for location ID 1?   \n",
-                            "1         5.098939  Is it likely that Donna is outside with an umb...   \n",
-                            "3         5.098939        Is it likely that Donna is awake right now?   \n",
-                            "\n",
-                            "                                      output.question  \\\n",
-                            "0             do bob and alice live in the same city?   \n",
-                            "2   do alice and charlie use the same email provider?   \n",
-                            "4   Donna is about to go outside. Does she need an...   \n",
-                            "5         whats the name of the city where bob lives?   \n",
-                            "6   what is the current users favorite color and n...   \n",
-                            "7   Frank who is Even's friend is allergic to dair...   \n",
-                            "11                    list the allergens in chocolate   \n",
-                            "15                     what is alice's email address?   \n",
-                            "14                        find donna's favorite color   \n",
-                            "13                           weather in LA right now?   \n",
-                            "12                                    time in chicago   \n",
-                            "10  If i eat a serving of pizza, how many calories...   \n",
-                            "9           what is the current users favorite color?   \n",
-                            "8   eve ate a serving of sushi, what allergens was...   \n",
-                            "16                 How many users by the name of bob?   \n",
-                            "17                            get the current user id   \n",
-                            "18                             what is eve's user id?   \n",
-                            "19                What is the name of food with id 6?   \n",
-                            "20                What is the city for location ID 1?   \n",
-                            "1   Is it likely that Donna is outside with an umb...   \n",
-                            "3         Is it likely that Donna is awake right now?   \n",
-                            "\n",
-                            "                                        output.output  \\\n",
-                            "0   No, Bob and Alice do not live in the same city...   \n",
-                            "2   No, Alice and Charlie do not use the same emai...   \n",
-                            "4   Donna is currently in a location where it is r...   \n",
-                            "5   The name of the city where Bob lives is New York.   \n",
-                            "6   The current user's favorite color is yellow an...   \n",
-                            "7   Frank's favorite food is the salad, which cont...   \n",
-                            "11       The allergens in chocolate are milk and soy.   \n",
-                            "15          Alice's email address is alice@gmail.com.   \n",
-                            "14                   Donna's favorite color is green.   \n",
-                            "13  The current weather in Los Angeles is sunny wi...   \n",
-                            "12           The current time in Chicago is 11:15 AM.   \n",
-                            "10  If you eat a serving of pizza, you will consum...   \n",
-                            "9        The current user's favorite color is yellow.   \n",
-                            "8   Eve was exposed to the allergens fish and soy ...   \n",
-                            "16           There are 1 user(s) with the name \"Bob\".   \n",
-                            "17                         The current user ID is 35.   \n",
-                            "18                               Eve's user ID is 42.   \n",
-                            "19           The name of the food with ID 6 is Pasta.   \n",
-                            "20            The city for location ID 1 is New York.   \n",
-                            "1   I'm sorry, but I don't have access to real-tim...   \n",
-                            "3   I'm sorry, but I don't have access to informat...   \n",
-                            "\n",
-                            "                            output.intermediate_steps  \\\n",
-                            "0   [(tool='find_users_by_name' tool_input={'name'...   \n",
-                            "2   [(tool='find_users_by_name' tool_input={'name'...   \n",
-                            "4   [(tool='find_users_by_name' tool_input={'name'...   \n",
-                            "5   [(tool='list_user_ids' tool_input={} log='\\nIn...   \n",
-                            "6   [(tool='get_current_user_id' tool_input={} log...   \n",
-                            "7   [(tool='find_users_by_name' tool_input={'name'...   \n",
-                            "11  [(tool='find_foods_by_name' tool_input={'food'...   \n",
-                            "15  [(tool='find_users_by_name' tool_input={'name'...   \n",
-                            "14  [(tool='find_users_by_name' tool_input={'name'...   \n",
-                            "13  [(tool='find_locations_by_name' tool_input={'c...   \n",
-                            "12  [(tool='find_locations_by_name' tool_input={'c...   \n",
-                            "10  [(tool='find_foods_by_name' tool_input={'food'...   \n",
-                            "9   [(tool='get_current_user_id' tool_input={} log...   \n",
-                            "8   [(tool='find_foods_by_name' tool_input={'food'...   \n",
-                            "16  [(tool='find_users_by_name' tool_input={'name'...   \n",
-                            "17  [(tool='get_current_user_id' tool_input={} log...   \n",
-                            "18  [(tool='find_users_by_name' tool_input={'name'...   \n",
-                            "19  [(tool='get_food_name' tool_input={'food_id': ...   \n",
-                            "20  [(tool='get_city_for_location' tool_input={'lo...   \n",
-                            "1                                                  []   \n",
-                            "3                                                  []   \n",
-                            "\n",
-                            "         reference.reference  reference.order_matters  \\\n",
-                            "0                         no                    False   \n",
-                            "2                         no                     True   \n",
-                            "4                        yes                     True   \n",
-                            "5                Los Angeles                     True   \n",
-                            "6         yellow and Charlie                     True   \n",
-                            "7                        yes                     True   \n",
-                            "11                 milk, soy                     True   \n",
-                            "15           alice@gmail.com                     True   \n",
-                            "14                     green                     True   \n",
-                            "13  Sunny, Temperature: 75°F                     True   \n",
-                            "12       2023-11-14 11:15 AM                     True   \n",
-                            "10              285 calories                     True   \n",
-                            "9                     yellow                     True   \n",
-                            "8                  fish, soy                     True   \n",
-                            "16                         1                     True   \n",
-                            "17                        35                     True   \n",
-                            "18                        42                     True   \n",
-                            "19                     Pasta                     True   \n",
-                            "20                  New York                     True   \n",
-                            "1                        yes                    False   \n",
-                            "3                        yes                     True   \n",
-                            "\n",
-                            "                             reference.expected_steps  num_expected_steps  \\\n",
-                            "0   [find_users_by_name, get_user_location, get_ci...                   5   \n",
-                            "2   [find_users_by_name, get_user_email, get_user_...                   3   \n",
-                            "4   [find_users_by_name, get_user_location, get_cu...                   3   \n",
-                            "5   [find_users_by_name, get_user_location, get_ci...                   3   \n",
-                            "6   [get_current_user_id, get_user_favorite_color,...                   3   \n",
-                            "7   [find_users_by_name, get_food_allergic_ingredi...                   2   \n",
-                            "11  [find_foods_by_name, get_food_allergic_ingredi...                   2   \n",
-                            "15               [find_users_by_name, get_user_email]                   2   \n",
-                            "14      [find_users_by_name, get_user_favorite_color]                   2   \n",
-                            "13  [find_locations_by_name, get_current_weather_f...                   2   \n",
-                            "12  [find_locations_by_name, get_current_time_for_...                   2   \n",
-                            "10            [find_foods_by_name, get_food_calories]                   2   \n",
-                            "9      [get_current_user_id, get_user_favorite_color]                   2   \n",
-                            "8   [find_foods_by_name, get_food_allergic_ingredi...                   2   \n",
-                            "16                               [find_users_by_name]                   1   \n",
-                            "17                              [get_current_user_id]                   1   \n",
-                            "18                               [find_users_by_name]                   1   \n",
-                            "19                                    [get_food_name]                   1   \n",
-                            "20                            [get_city_for_location]                   1   \n",
-                            "1   [find_users_by_name, get_user_location, get_cu...                   4   \n",
-                            "3   [find_users_by_name, get_user_location, get_cu...                   3   \n",
-                            "\n",
-                            "    actual_number_of_steps  \n",
-                            "0                        5  \n",
-                            "2                        3  \n",
-                            "4                        3  \n",
-                            "5                        3  \n",
-                            "6                        3  \n",
-                            "7                        3  \n",
-                            "11                       2  \n",
-                            "15                       2  \n",
-                            "14                       2  \n",
-                            "13                       2  \n",
-                            "12                       2  \n",
-                            "10                       2  \n",
-                            "9                        2  \n",
-                            "8                        2  \n",
-                            "16                       1  \n",
-                            "17                       1  \n",
-                            "18                       1  \n",
-                            "19                       1  \n",
-                            "20                       1  \n",
-                            "1                        0  \n",
-                            "3                        0  "
-                        ]
-                    },
-                    "execution_count": 23,
-                    "metadata": {},
-                    "output_type": "execute_result"
-                }
-            ],
-            "source": [
-                "df"
-            ]
-        }
-    ],
-    "metadata": {
-        "kernelspec": {
-            "display_name": "Python 3 (ipykernel)",
-            "language": "python",
-            "name": "python3"
-        },
-        "language_info": {
-            "codemirror_mode": {
-                "name": "ipython",
-                "version": 3
-            },
-            "file_extension": ".py",
-            "mimetype": "text/x-python",
-            "name": "python",
-            "nbconvert_exporter": "python",
-            "pygments_lexer": "ipython3",
-            "version": "3.11.2"
-        }
-    },
-    "nbformat": 4,
-    "nbformat_minor": 5
-}
\ No newline at end of file
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "60bb467d-861d-4b07-a48d-8e5aa177c969",
+   "metadata": {},
+   "source": [
+    "# Relational Data \n",
+    "\n",
+    "In this task, an agent is given access to a set of tools that can be used to make queries across 3 relational tables.\n",
+    "\n",
+    "The tables contain information about users, locations and foods. The agent must answer questions about the data using the provided tools.\n",
+    "\n",
+    "The underlying data looks like this:\n",
+    "\n",
+    "User data:\n",
+    "\n",
+    "```json\n",
+    "{\n",
+    "    \"id\": 1,\n",
+    "    \"name\": \"Alice\",\n",
+    "    \"email\": \"alice@gmail.com\",\n",
+    "    \"location\": 1,\n",
+    "    \"favorite_color\": \"red\",\n",
+    "    \"favorite_foods\": [1, 2, 3],\n",
+    "},\n",
+    "{\n",
+    "    \"id\": 21,\n",
+    "    \"name\": \"Bob\",\n",
+    "    \"email\": \"bob@hotmail.com\",\n",
+    "    \"location\": 2,\n",
+    "    \"favorite_color\": \"orange\",\n",
+    "    \"favorite_foods\": [4, 5, 6],\n",
+    "}\n",
+    "```\n",
+    "\n",
+    "Food data:\n",
+    "\n",
+    "```json\n",
+    "{\n",
+    "    \"id\": 1,\n",
+    "    \"name\": \"Pizza\",\n",
+    "    \"calories\": 285,  # Calories per serving\n",
+    "    \"allergic_ingredients\": [\"Gluten\", \"Dairy\"],\n",
+    "},\n",
+    "{\n",
+    "    \"id\": 2,\n",
+    "    \"name\": \"Chocolate\",\n",
+    "    \"calories\": 50,  # Calories per serving\n",
+    "    \"allergic_ingredients\": [\"Milk\", \"Soy\"],\n",
+    "},\n",
+    "```\n",
+    "\n",
+    "The tools allow looking up information based on ids (e.g., `get_user_email` takes a user id and returns the email),\n",
+    "and searching (e.g., `find_foods_by_name` takes a food name and returns a list of results)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "b39159d0-9ea1-414f-a9d8-4a7b22b3d2cc",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from langchain_benchmarks import clone_public_dataset, registry"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "03488ab1-31ed-41c2-8da2-46b02599b181",
+   "metadata": {},
+   "source": [
+    "For this code to work, please configure LangSmith environment variables with your credentials."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "60f22779-a948-4833-8e8c-ace9ef17f56f",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "task = registry[\"Tool Usage - Relational Data\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "bc33a639-3caf-4314-8ea7-1c7c8b1d114d",
+   "metadata": {},
+   "source": [
+    "Clone the dataset associated with this task"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "70369f67-deb4-467a-801a-6d38c3d0460d",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Dataset Tool Usage - Relational Data already exists. Skipping.\n",
+      "You can access the dataset at https://smith.langchain.com/o/e081f11e-fbd2-41b4-9fa8-5d76c76ef854/datasets/69c0e0d0-91b5-4183-bed0-6628e76964dc.\n"
+     ]
+    }
+   ],
+   "source": [
+    "clone_public_dataset(task.dataset_id, dataset_name=task.name)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "110bdafa-bdab-4194-90c9-46416d14b2f9",
+   "metadata": {},
+   "source": [
+    "## The Environment\n",
+    "\n",
+    "Let's check the environment"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "27b6b0fd-639d-43a7-a730-9acdc5b2f102",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[StructuredTool(name='get_user_name', description=\"get_user_name(user_id: int) -> str - Get the name of the user with the given user ID.\\n\\n        Args:\\n            user_id: The user's ID.\\n\\n        Returns:\\n            The user's name.\", args_schema=<class 'pydantic.v1.main.get_user_nameSchemaSchema'>, func=<function get_available_functions.<locals>.get_user_name at 0x7fc3d10904c0>),\n",
+       " StructuredTool(name='list_user_ids', description='list_user_ids() -> List[str] - List all the user IDs.', args_schema=<class 'pydantic.v1.main.list_user_idsSchemaSchema'>, func=<function get_available_functions.<locals>.list_user_ids at 0x7fc3d1090670>),\n",
+       " StructuredTool(name='find_users_by_name', description='find_users_by_name(name: str) -> List[langchain_benchmarks.tool_usage.tasks.relational_data.SearchHit] - Find users with the given name.\\n\\n        Args:\\n            name: The name to search for.\\n\\n        Returns:\\n            The list of matching users.', args_schema=<class 'pydantic.v1.main.find_users_by_nameSchemaSchema'>, func=<function get_available_functions.<locals>.find_users_by_name at 0x7fc3d1bc3700>),\n",
+       " StructuredTool(name='find_locations_by_name', description='find_locations_by_name(city: str) -> List[langchain_benchmarks.tool_usage.tasks.relational_data.SearchHit] - Find locations with the given city name.', args_schema=<class 'pydantic.v1.main.find_locations_by_nameSchemaSchema'>, func=<function get_available_functions.<locals>.find_locations_by_name at 0x7fc3d145a8b0>),\n",
+       " StructuredTool(name='find_foods_by_name', description='find_foods_by_name(food: str) -> List[langchain_benchmarks.tool_usage.tasks.relational_data.SearchHit] - Find foods with the given name.', args_schema=<class 'pydantic.v1.main.find_foods_by_nameSchemaSchema'>, func=<function get_available_functions.<locals>.find_foods_by_name at 0x7fc3d145adc0>)]"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "env = task.create_environment()\n",
+    "env.tools[:5]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "7f1c1242-449c-4536-863d-b62bf6d2dff1",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'Bob'"
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "env.tools[0].invoke({\"user_id\": 21})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "854e139b-a120-4012-bdf4-6394e0b1c42d",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[{'id': 2, 'city': 'Los Angeles'},\n",
+       " {'id': 1, 'city': 'New York'},\n",
+       " {'id': 3, 'city': 'Chicago'},\n",
+       " {'id': 4, 'city': 'Houston'},\n",
+       " {'id': 5, 'city': 'Miami'}]"
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "env.tools[3].invoke({\"city\": \"LA\"})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b462f7b8-fd42-4613-ab5f-5f3cbbc37d28",
+   "metadata": {},
+   "source": [
+    "## Define an agent\n",
+    "\n",
+    "Let's build an agent that we can use for evaluation."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "09469813-17b6-4456-a913-486a01a4b295",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from langchain_benchmarks.tool_usage import agents"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "0ae8c6be-899c-44a6-a89b-0fc04c2cb05c",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "agent_factory = agents.OpenAIAgentFactory(task, model=\"gpt-3.5-turbo-16k\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "87a64f76-65ae-4367-b43f-f2be3431e7af",
+   "metadata": {},
+   "source": [
+    "Let's test that our agent works"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "612fb603-1401-426b-8a19-4453ad5b698a",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "agent = agent_factory()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "0e4896fa-3633-44a1-857f-80a263cf2e03",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'question': 'who is bob?',\n",
+       " 'output': 'Bob is a user with the ID 21.',\n",
+       " 'intermediate_steps': [(AgentActionMessageLog(tool='find_users_by_name', tool_input={'name': 'bob'}, log=\"\\nInvoking: `find_users_by_name` with `{'name': 'bob'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n  \"name\": \"bob\"\\n}', 'name': 'find_users_by_name'}})]),\n",
+       "   [{'id': 21, 'name': 'Bob'},\n",
+       "    {'id': 41, 'name': 'Donna'},\n",
+       "    {'id': 1, 'name': 'Alice'},\n",
+       "    {'id': 35, 'name': 'Charlie'},\n",
+       "    {'id': 42, 'name': 'Eve'},\n",
+       "    {'id': 43, 'name': 'Frank The Cat'}])]}"
+      ]
+     },
+     "execution_count": 24,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "agent.invoke({\"question\": \"who is bob?\"})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3821e4b0-8e67-418a-840c-470fcde42df0",
+   "metadata": {},
+   "source": [
+    "## Eval\n",
+    "\n",
+    "Let's evaluate an agent now"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "513042fe-2878-44f8-ae84-05b9d521c1de",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from langsmith.client import Client\n",
+    "\n",
+    "from langchain_benchmarks.tool_usage import STANDARD_AGENT_EVALUATOR"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "2bedd9d1-fc06-4066-9f89-b874ae818d82",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "client = Client()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "aab7514e-a6ef-4c21-b90f-d9cbefcf5af1",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "View the evaluation results for project 'test-fixed-self-71' at:\n",
+      "https://smith.langchain.com/o/e081f11e-fbd2-41b4-9fa8-5d76c76ef854/projects/p/1c2f10b1-370d-4062-9397-bab8189e8b95?eval=true\n",
+      "\n",
+      "View all tests for Dataset Tool Usage - Relational Data at:\n",
+      "https://smith.langchain.com/o/e081f11e-fbd2-41b4-9fa8-5d76c76ef854/datasets/69c0e0d0-91b5-4183-bed0-6628e76964dc\n",
+      "[------------------------------------------------->] 21/21\n",
+      " Eval quantiles:\n",
+      "                                    0.25       0.5      0.75      mean  \\\n",
+      "Intermediate steps correctness  1.000000  1.000000  1.000000  0.761905   \n",
+      "# steps / # expected steps      1.000000  1.000000  1.000000  0.928571   \n",
+      "correctness                     1.000000  1.000000  1.000000  0.809524   \n",
+      "execution_time                  4.253613  4.253613  4.253613  4.253613   \n",
+      "\n",
+      "                                    mode  \n",
+      "Intermediate steps correctness  1.000000  \n",
+      "# steps / # expected steps      1.000000  \n",
+      "correctness                     1.000000  \n",
+      "execution_time                  4.253613  \n"
+     ]
+    }
+   ],
+   "source": [
+    "test_run = client.run_on_dataset(\n",
+    "    dataset_name=task.name,\n",
+    "    llm_or_chain_factory=agent_factory.create,\n",
+    "    evaluation=STANDARD_AGENT_EVALUATOR,\n",
+    "    verbose=True,\n",
+    "    tags=[\"openai-functions\"],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1b039225-01cf-481a-87a6-4e880e9b1dcd",
+   "metadata": {},
+   "source": [
+    "## Inspect\n",
+    "\n",
+    "Here, we'll take a closer look at the underlying results."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "e5bea796-c204-42a1-904b-216b964a8936",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.8095238095238095"
+      ]
+     },
+     "execution_count": 28,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "df = test_run.to_dataframe()\n",
+    "df = pd.json_normalize(df.to_dict(orient=\"records\"))\n",
+    "\n",
+    "df[\"correctness\"].mean()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "423292ca-1ca8-4753-b35b-0916d35802b9",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Intermediate steps correctness</th>\n",
+       "      <th># steps / # expected steps</th>\n",
+       "      <th>correctness</th>\n",
+       "      <th>execution_time</th>\n",
+       "      <th>input.question</th>\n",
+       "      <th>output.question</th>\n",
+       "      <th>output.output</th>\n",
+       "      <th>output.intermediate_steps</th>\n",
+       "      <th>reference.reference</th>\n",
+       "      <th>reference.order_matters</th>\n",
+       "      <th>reference.expected_steps</th>\n",
+       "      <th>num_expected_steps</th>\n",
+       "      <th>actual_number_of_steps</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>4.253613</td>\n",
+       "      <td>do bob and alice live in the same city?</td>\n",
+       "      <td>do bob and alice live in the same city?</td>\n",
+       "      <td>No, Bob and Alice do not live in the same city...</td>\n",
+       "      <td>[(tool='find_users_by_name' tool_input={'name'...</td>\n",
+       "      <td>no</td>\n",
+       "      <td>False</td>\n",
+       "      <td>[find_users_by_name, get_user_location, get_ci...</td>\n",
+       "      <td>5</td>\n",
+       "      <td>5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>4.253613</td>\n",
+       "      <td>Is it likely that Donna is outside with an umb...</td>\n",
+       "      <td>Is it likely that Donna is outside with an umb...</td>\n",
+       "      <td>I'm sorry, but I don't have access to real-tim...</td>\n",
+       "      <td>[]</td>\n",
+       "      <td>yes</td>\n",
+       "      <td>False</td>\n",
+       "      <td>[find_users_by_name, get_user_location, get_cu...</td>\n",
+       "      <td>4</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>4.253613</td>\n",
+       "      <td>do alice and charlie use the same email provider?</td>\n",
+       "      <td>do alice and charlie use the same email provider?</td>\n",
+       "      <td>No, Alice and Charlie do not use the same emai...</td>\n",
+       "      <td>[(tool='find_users_by_name' tool_input={'name'...</td>\n",
+       "      <td>no</td>\n",
+       "      <td>True</td>\n",
+       "      <td>[find_users_by_name, get_user_email, get_user_...</td>\n",
+       "      <td>3</td>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>4.253613</td>\n",
+       "      <td>Is it likely that Donna is awake right now?</td>\n",
+       "      <td>Is it likely that Donna is awake right now?</td>\n",
+       "      <td>I'm sorry, but I don't have access to informat...</td>\n",
+       "      <td>[]</td>\n",
+       "      <td>yes</td>\n",
+       "      <td>True</td>\n",
+       "      <td>[find_users_by_name, get_user_location, get_cu...</td>\n",
+       "      <td>3</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>4.253613</td>\n",
+       "      <td>Donna is about to go outside. Does she need an...</td>\n",
+       "      <td>Donna is about to go outside. Does she need an...</td>\n",
+       "      <td>Donna is at location 4 and the current weather...</td>\n",
+       "      <td>[(tool='find_users_by_name' tool_input={'name'...</td>\n",
+       "      <td>yes</td>\n",
+       "      <td>True</td>\n",
+       "      <td>[find_users_by_name, get_user_location, get_cu...</td>\n",
+       "      <td>3</td>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   Intermediate steps correctness  # steps / # expected steps  correctness  \\\n",
+       "0                               1                         1.0            1   \n",
+       "1                               0                         0.0            0   \n",
+       "2                               1                         1.0            1   \n",
+       "3                               0                         0.0            0   \n",
+       "4                               0                         1.0            1   \n",
+       "\n",
+       "   execution_time                                     input.question  \\\n",
+       "0        4.253613            do bob and alice live in the same city?   \n",
+       "1        4.253613  Is it likely that Donna is outside with an umb...   \n",
+       "2        4.253613  do alice and charlie use the same email provider?   \n",
+       "3        4.253613        Is it likely that Donna is awake right now?   \n",
+       "4        4.253613  Donna is about to go outside. Does she need an...   \n",
+       "\n",
+       "                                     output.question  \\\n",
+       "0            do bob and alice live in the same city?   \n",
+       "1  Is it likely that Donna is outside with an umb...   \n",
+       "2  do alice and charlie use the same email provider?   \n",
+       "3        Is it likely that Donna is awake right now?   \n",
+       "4  Donna is about to go outside. Does she need an...   \n",
+       "\n",
+       "                                       output.output  \\\n",
+       "0  No, Bob and Alice do not live in the same city...   \n",
+       "1  I'm sorry, but I don't have access to real-tim...   \n",
+       "2  No, Alice and Charlie do not use the same emai...   \n",
+       "3  I'm sorry, but I don't have access to informat...   \n",
+       "4  Donna is at location 4 and the current weather...   \n",
+       "\n",
+       "                           output.intermediate_steps reference.reference  \\\n",
+       "0  [(tool='find_users_by_name' tool_input={'name'...                  no   \n",
+       "1                                                 []                 yes   \n",
+       "2  [(tool='find_users_by_name' tool_input={'name'...                  no   \n",
+       "3                                                 []                 yes   \n",
+       "4  [(tool='find_users_by_name' tool_input={'name'...                 yes   \n",
+       "\n",
+       "   reference.order_matters                           reference.expected_steps  \\\n",
+       "0                    False  [find_users_by_name, get_user_location, get_ci...   \n",
+       "1                    False  [find_users_by_name, get_user_location, get_cu...   \n",
+       "2                     True  [find_users_by_name, get_user_email, get_user_...   \n",
+       "3                     True  [find_users_by_name, get_user_location, get_cu...   \n",
+       "4                     True  [find_users_by_name, get_user_location, get_cu...   \n",
+       "\n",
+       "   num_expected_steps  actual_number_of_steps  \n",
+       "0                   5                       5  \n",
+       "1                   4                       0  \n",
+       "2                   3                       3  \n",
+       "3                   3                       0  \n",
+       "4                   3                       3  "
+      ]
+     },
+     "execution_count": 29,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df[\"num_expected_steps\"] = df[\"reference.expected_steps\"].apply(len)\n",
+    "df[\"actual_number_of_steps\"] = df[\"output.intermediate_steps\"].apply(len)\n",
+    "\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "id": "ffab97b7-eda2-408d-b611-596b637e627a",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Intermediate steps correctness</th>\n",
+       "      <th># steps / # expected steps</th>\n",
+       "      <th>correctness</th>\n",
+       "      <th>execution_time</th>\n",
+       "      <th>input.question</th>\n",
+       "      <th>output.question</th>\n",
+       "      <th>output.output</th>\n",
+       "      <th>output.intermediate_steps</th>\n",
+       "      <th>reference.reference</th>\n",
+       "      <th>reference.order_matters</th>\n",
+       "      <th>reference.expected_steps</th>\n",
+       "      <th>num_expected_steps</th>\n",
+       "      <th>actual_number_of_steps</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>4.253613</td>\n",
+       "      <td>do bob and alice live in the same city?</td>\n",
+       "      <td>do bob and alice live in the same city?</td>\n",
+       "      <td>No, Bob and Alice do not live in the same city...</td>\n",
+       "      <td>[(tool='find_users_by_name' tool_input={'name'...</td>\n",
+       "      <td>no</td>\n",
+       "      <td>False</td>\n",
+       "      <td>[find_users_by_name, get_user_location, get_ci...</td>\n",
+       "      <td>5</td>\n",
+       "      <td>5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>4.253613</td>\n",
+       "      <td>do alice and charlie use the same email provider?</td>\n",
+       "      <td>do alice and charlie use the same email provider?</td>\n",
+       "      <td>No, Alice and Charlie do not use the same emai...</td>\n",
+       "      <td>[(tool='find_users_by_name' tool_input={'name'...</td>\n",
+       "      <td>no</td>\n",
+       "      <td>True</td>\n",
+       "      <td>[find_users_by_name, get_user_email, get_user_...</td>\n",
+       "      <td>3</td>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>4.253613</td>\n",
+       "      <td>Donna is about to go outside. Does she need an...</td>\n",
+       "      <td>Donna is about to go outside. Does she need an...</td>\n",
+       "      <td>Donna is at location 4 and the current weather...</td>\n",
+       "      <td>[(tool='find_users_by_name' tool_input={'name'...</td>\n",
+       "      <td>yes</td>\n",
+       "      <td>True</td>\n",
+       "      <td>[find_users_by_name, get_user_location, get_cu...</td>\n",
+       "      <td>3</td>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>4.253613</td>\n",
+       "      <td>whats the name of the city where bob lives?</td>\n",
+       "      <td>whats the name of the city where bob lives?</td>\n",
+       "      <td>The name of the city where Bob lives is New York.</td>\n",
+       "      <td>[(tool='list_user_ids' tool_input={} log='\\nIn...</td>\n",
+       "      <td>Los Angeles</td>\n",
+       "      <td>True</td>\n",
+       "      <td>[find_users_by_name, get_user_location, get_ci...</td>\n",
+       "      <td>3</td>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>4.253613</td>\n",
+       "      <td>what is the current users favorite color and n...</td>\n",
+       "      <td>what is the current users favorite color and n...</td>\n",
+       "      <td>The current user's favorite color is yellow an...</td>\n",
+       "      <td>[(tool='get_current_user_id' tool_input={} log...</td>\n",
+       "      <td>yellow and Charlie</td>\n",
+       "      <td>True</td>\n",
+       "      <td>[get_current_user_id, get_user_favorite_color,...</td>\n",
+       "      <td>3</td>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   Intermediate steps correctness  # steps / # expected steps  correctness  \\\n",
+       "0                               1                         1.0            1   \n",
+       "2                               1                         1.0            1   \n",
+       "4                               0                         1.0            1   \n",
+       "5                               0                         1.0            0   \n",
+       "6                               1                         1.0            1   \n",
+       "\n",
+       "   execution_time                                     input.question  \\\n",
+       "0        4.253613            do bob and alice live in the same city?   \n",
+       "2        4.253613  do alice and charlie use the same email provider?   \n",
+       "4        4.253613  Donna is about to go outside. Does she need an...   \n",
+       "5        4.253613        whats the name of the city where bob lives?   \n",
+       "6        4.253613  what is the current users favorite color and n...   \n",
+       "\n",
+       "                                     output.question  \\\n",
+       "0            do bob and alice live in the same city?   \n",
+       "2  do alice and charlie use the same email provider?   \n",
+       "4  Donna is about to go outside. Does she need an...   \n",
+       "5        whats the name of the city where bob lives?   \n",
+       "6  what is the current users favorite color and n...   \n",
+       "\n",
+       "                                       output.output  \\\n",
+       "0  No, Bob and Alice do not live in the same city...   \n",
+       "2  No, Alice and Charlie do not use the same emai...   \n",
+       "4  Donna is at location 4 and the current weather...   \n",
+       "5  The name of the city where Bob lives is New York.   \n",
+       "6  The current user's favorite color is yellow an...   \n",
+       "\n",
+       "                           output.intermediate_steps reference.reference  \\\n",
+       "0  [(tool='find_users_by_name' tool_input={'name'...                  no   \n",
+       "2  [(tool='find_users_by_name' tool_input={'name'...                  no   \n",
+       "4  [(tool='find_users_by_name' tool_input={'name'...                 yes   \n",
+       "5  [(tool='list_user_ids' tool_input={} log='\\nIn...         Los Angeles   \n",
+       "6  [(tool='get_current_user_id' tool_input={} log...  yellow and Charlie   \n",
+       "\n",
+       "   reference.order_matters                           reference.expected_steps  \\\n",
+       "0                    False  [find_users_by_name, get_user_location, get_ci...   \n",
+       "2                     True  [find_users_by_name, get_user_email, get_user_...   \n",
+       "4                     True  [find_users_by_name, get_user_location, get_cu...   \n",
+       "5                     True  [find_users_by_name, get_user_location, get_ci...   \n",
+       "6                     True  [get_current_user_id, get_user_favorite_color,...   \n",
+       "\n",
+       "   num_expected_steps  actual_number_of_steps  \n",
+       "0                   5                       5  \n",
+       "2                   3                       3  \n",
+       "4                   3                       3  \n",
+       "5                   3                       3  \n",
+       "6                   3                       3  "
+      ]
+     },
+     "execution_count": 32,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.sort_values(\"actual_number_of_steps\", ascending=False).head()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/docs/source/notebooks/tool_usage/typewriter_1.ipynb b/docs/source/notebooks/tool_usage/typewriter_1.ipynb
index b8a7effc..1105cac8 100644
--- a/docs/source/notebooks/tool_usage/typewriter_1.ipynb
+++ b/docs/source/notebooks/tool_usage/typewriter_1.ipynb
@@ -1,486 +1,631 @@
 {
-    "cells": [
-        {
-            "cell_type": "markdown",
-            "id": "60bb467d-861d-4b07-a48d-8e5aa177c969",
-            "metadata": {
-                "tags": []
-            },
-            "source": [
-                "# Typewriter: Single Tool\n",
-                "\n",
-                "\n",
-                "Let's see how to evaluate an agent's ability to use tools.\n",
-                "\n",
-                "    A task where the agent must type a given string one letter at a time.\n",
-                "\n",
-                "    In this variation of the task, the agent is given a single function,\n",
-                "    that takes a letter as an argument."
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": 1,
-            "id": "b39159d0-9ea1-414f-a9d8-4a7b22b3d2cc",
-            "metadata": {
-                "tags": []
-            },
-            "outputs": [],
-            "source": [
-                "from langchain_benchmarks import clone_public_dataset, registry"
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": 2,
-            "id": "1aef2b32-a5df-421f-8be3-a2ef27372ece",
-            "metadata": {
-                "tags": []
-            },
-            "outputs": [
-                {
-                    "data": {
-                        "text/html": [
-                            "<table>\n",
-                            "<tbody>\n",
-                            "<tr><td>Name       </td><td>Tool Usage - Typewriter (1 tool)                                                                                                                           </td></tr>\n",
-                            "<tr><td>Type       </td><td>ToolUsageTask                                                                                                                                              </td></tr>\n",
-                            "<tr><td>Dataset ID </td><td><a href=\"https://smith.langchain.com/public/59577193-8938-4ccf-92a7-e8a96bcf4f86/d\" target=\"_blank\" rel=\"noopener\">59577193-8938-4ccf-92a7-e8a96bcf4f86</a></td></tr>\n",
-                            "<tr><td>Description</td><td>Environment with a single tool that accepts a single letter as input, and prints it on a piece of virtual paper.\n",
-                            "\n",
-                            "The objective of this task is to evaluate the ability of the model to use the provided tools to repeat a given input string.\n",
-                            "\n",
-                            "For example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\n",
-                            "\n",
-                            "The dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.                                                                                                                                                            </td></tr>\n",
-                            "</tbody>\n",
-                            "</table>"
-                        ],
-                        "text/plain": [
-                            "ToolUsageTask(name='Tool Usage - Typewriter (1 tool)', dataset_id='https://smith.langchain.com/public/59577193-8938-4ccf-92a7-e8a96bcf4f86/d', description=\"Environment with a single tool that accepts a single letter as input, and prints it on a piece of virtual paper.\\n\\nThe objective of this task is to evaluate the ability of the model to use the provided tools to repeat a given input string.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\", create_environment=<function get_environment at 0x12438d760>, instructions=\"Repeat the given string using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must print the letters 'a', 'b', and 'c' one at a time and in that order. \")"
-                        ]
-                    },
-                    "execution_count": 2,
-                    "metadata": {},
-                    "output_type": "execute_result"
-                }
-            ],
-            "source": [
-                "task = registry[\"Tool Usage - Typewriter (1 tool)\"]\n",
-                "task"
-            ]
-        },
-        {
-            "cell_type": "markdown",
-            "id": "bc33a639-3caf-4314-8ea7-1c7c8b1d114d",
-            "metadata": {},
-            "source": [
-                "Clone the dataset associaetd with this task"
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": 3,
-            "id": "70369f67-deb4-467a-801a-6d38c3d0460d",
-            "metadata": {
-                "tags": []
-            },
-            "outputs": [
-                {
-                    "name": "stdout",
-                    "output_type": "stream",
-                    "text": [
-                        "Dataset Tool Usage - Typewriter (1 tool) already exists. Skipping.\n",
-                        "You can access the dataset at https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/82ca6840-cf23-4bb0-a9be-55237ebbe9d3.\n"
-                    ]
-                }
-            ],
-            "source": [
-                "clone_public_dataset(task.dataset_id, dataset_name=task.name)"
-            ]
-        },
-        {
-            "cell_type": "markdown",
-            "id": "b462f7b8-fd42-4613-ab5f-5f3cbbc37d28",
-            "metadata": {},
-            "source": [
-                "Let's build an agent that we can use for evaluation."
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": 4,
-            "id": "6142cf4e-862c-47a3-aa75-81d7d3231308",
-            "metadata": {
-                "tags": []
-            },
-            "outputs": [
-                {
-                    "data": {
-                        "text/plain": [
-                            "{'question': 'abc',\n",
-                            " 'output': 'a, b, c',\n",
-                            " 'intermediate_steps': [(AgentActionMessageLog(tool='type_letter', tool_input={'letter': 'a'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'a'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n  \"letter\": \"a\"\\n}', 'name': 'type_letter'}})]),\n",
-                            "   'OK'),\n",
-                            "  (AgentActionMessageLog(tool='type_letter', tool_input={'letter': 'b'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'b'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n  \"letter\": \"b\"\\n}', 'name': 'type_letter'}})]),\n",
-                            "   'OK'),\n",
-                            "  (AgentActionMessageLog(tool='type_letter', tool_input={'letter': 'c'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'c'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n  \"letter\": \"c\"\\n}', 'name': 'type_letter'}})]),\n",
-                            "   'OK')],\n",
-                            " 'state': 'abc'}"
-                        ]
-                    },
-                    "execution_count": 4,
-                    "metadata": {},
-                    "output_type": "execute_result"
-                }
-            ],
-            "source": [
-                "from langchain_benchmarks.tool_usage import agents\n",
-                "\n",
-                "agent_factory = agents.OpenAIAgentFactory(task, model=\"gpt-3.5-turbo-16k\")\n",
-                "\n",
-                "# Let's test that our agent works\n",
-                "agent = agent_factory.create()\n",
-                "agent.invoke({\"question\": \"abc\"})"
-            ]
-        },
-        {
-            "cell_type": "markdown",
-            "id": "3821e4b0-8e67-418a-840c-470fcde42df0",
-            "metadata": {},
-            "source": [
-                "## Eval\n",
-                "\n",
-                "Let's evaluate an agent now"
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": 5,
-            "id": "fb32763c-79ab-426a-8fc6-bf8ebb0dd432",
-            "metadata": {
-                "tags": []
-            },
-            "outputs": [
-                {
-                    "name": "stdout",
-                    "output_type": "stream",
-                    "text": [
-                        "View the evaluation results for project 'test-fresh-whip-11' at:\n",
-                        "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/c0c32118-d413-409f-ac01-088632c0e6ab?eval=true\n",
-                        "\n",
-                        "View all tests for Dataset Tool Usage - Typewriter (1 tool) at:\n",
-                        "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/82ca6840-cf23-4bb0-a9be-55237ebbe9d3\n",
-                        "[------------------------------------------------->] 20/20\n",
-                        " Eval quantiles:\n",
-                        "                                0.25  0.5  0.75  mean  mode\n",
-                        "Intermediate steps correctness  1.00  1.0   1.0  0.95   1.0\n",
-                        "# steps / # expected steps      1.00  1.0   1.0  1.70   1.0\n",
-                        "Correct Final State             1.00  1.0   1.0  0.95   1.0\n",
-                        "correctness                     0.75  1.0   1.0  0.75   1.0\n"
-                    ]
-                }
-            ],
-            "source": [
-                "from langsmith.client import Client\n",
-                "\n",
-                "from langchain_benchmarks.tool_usage import STANDARD_AGENT_EVALUATOR\n",
-                "\n",
-                "client = Client()\n",
-                "\n",
-                "test_run = client.run_on_dataset(\n",
-                "    dataset_name=task.name,\n",
-                "    llm_or_chain_factory=agent_factory.create,\n",
-                "    evaluation=STANDARD_AGENT_EVALUATOR,\n",
-                "    verbose=True,\n",
-                "    tags=[\"gpt-3.5-turbo-16k\"],\n",
-                ")"
-            ]
-        },
-        {
-            "cell_type": "markdown",
-            "id": "1b039225-01cf-481a-87a6-4e880e9b1dcd",
-            "metadata": {},
-            "source": [
-                "# Inspect\n",
-                "\n",
-                "You can take a look at the underlying results."
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": 12,
-            "id": "6eb19db1-43b8-4866-a3d2-f211ba92ab8b",
-            "metadata": {
-                "tags": []
-            },
-            "outputs": [],
-            "source": [
-                "import pandas as pd\n",
-                "\n",
-                "df = test_run.to_dataframe()\n",
-                "df = pd.json_normalize(df.to_dict(orient=\"records\"))"
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": 13,
-            "id": "7ab5a8b9-a937-4537-b879-704284df4494",
-            "metadata": {
-                "tags": []
-            },
-            "outputs": [
-                {
-                    "data": {
-                        "text/plain": [
-                            "0.75"
-                        ]
-                    },
-                    "execution_count": 13,
-                    "metadata": {},
-                    "output_type": "execute_result"
-                }
-            ],
-            "source": [
-                "df[\"correctness\"].mean()"
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": 14,
-            "id": "ab7516ed-36b1-4c16-bf4a-cc49077460ad",
-            "metadata": {
-                "tags": []
-            },
-            "outputs": [],
-            "source": [
-                "df[\"num_expected_steps\"] = df[\"reference.expected_steps\"].apply(len)\n",
-                "df[\"actual_number_of_steps\"] = df[\"output.intermediate_steps\"].apply(len)"
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": 15,
-            "id": "50d7590d-20de-4768-ac90-adcdbfa70068",
-            "metadata": {
-                "tags": []
-            },
-            "outputs": [
-                {
-                    "data": {
-                        "text/html": [
-                            "<div>\n",
-                            "<style scoped>\n",
-                            "    .dataframe tbody tr th:only-of-type {\n",
-                            "        vertical-align: middle;\n",
-                            "    }\n",
-                            "\n",
-                            "    .dataframe tbody tr th {\n",
-                            "        vertical-align: top;\n",
-                            "    }\n",
-                            "\n",
-                            "    .dataframe thead th {\n",
-                            "        text-align: right;\n",
-                            "    }\n",
-                            "</style>\n",
-                            "<table border=\"1\" class=\"dataframe\">\n",
-                            "  <thead>\n",
-                            "    <tr style=\"text-align: right;\">\n",
-                            "      <th></th>\n",
-                            "      <th>Intermediate steps correctness</th>\n",
-                            "      <th># steps / # expected steps</th>\n",
-                            "      <th>Correct Final State</th>\n",
-                            "      <th>correctness</th>\n",
-                            "      <th>input.question</th>\n",
-                            "      <th>output.question</th>\n",
-                            "      <th>output.output</th>\n",
-                            "      <th>output.intermediate_steps</th>\n",
-                            "      <th>output.state</th>\n",
-                            "      <th>reference.state</th>\n",
-                            "      <th>reference.reference</th>\n",
-                            "      <th>reference.expected_steps</th>\n",
-                            "      <th>num_expected_steps</th>\n",
-                            "      <th>actual_number_of_steps</th>\n",
-                            "    </tr>\n",
-                            "  </thead>\n",
-                            "  <tbody>\n",
-                            "    <tr>\n",
-                            "      <th>0</th>\n",
-                            "      <td>1</td>\n",
-                            "      <td>1.0</td>\n",
-                            "      <td>1</td>\n",
-                            "      <td>1</td>\n",
-                            "      <td>communication</td>\n",
-                            "      <td>communication</td>\n",
-                            "      <td>communication</td>\n",
-                            "      <td>[(tool='type_letter' tool_input={'letter': 'c'...</td>\n",
-                            "      <td>communication</td>\n",
-                            "      <td>communication</td>\n",
-                            "      <td>communication</td>\n",
-                            "      <td>[type_letter, type_letter, type_letter, type_l...</td>\n",
-                            "      <td>13</td>\n",
-                            "      <td>13</td>\n",
-                            "    </tr>\n",
-                            "    <tr>\n",
-                            "      <th>1</th>\n",
-                            "      <td>1</td>\n",
-                            "      <td>1.0</td>\n",
-                            "      <td>1</td>\n",
-                            "      <td>1</td>\n",
-                            "      <td>information</td>\n",
-                            "      <td>information</td>\n",
-                            "      <td>information</td>\n",
-                            "      <td>[(tool='type_letter' tool_input={'letter': 'i'...</td>\n",
-                            "      <td>information</td>\n",
-                            "      <td>information</td>\n",
-                            "      <td>information</td>\n",
-                            "      <td>[type_letter, type_letter, type_letter, type_l...</td>\n",
-                            "      <td>11</td>\n",
-                            "      <td>11</td>\n",
-                            "    </tr>\n",
-                            "    <tr>\n",
-                            "      <th>2</th>\n",
-                            "      <td>1</td>\n",
-                            "      <td>1.0</td>\n",
-                            "      <td>1</td>\n",
-                            "      <td>1</td>\n",
-                            "      <td>dictionary</td>\n",
-                            "      <td>dictionary</td>\n",
-                            "      <td>dictionary</td>\n",
-                            "      <td>[(tool='type_letter' tool_input={'letter': 'd'...</td>\n",
-                            "      <td>dictionary</td>\n",
-                            "      <td>dictionary</td>\n",
-                            "      <td>dictionary</td>\n",
-                            "      <td>[type_letter, type_letter, type_letter, type_l...</td>\n",
-                            "      <td>10</td>\n",
-                            "      <td>10</td>\n",
-                            "    </tr>\n",
-                            "    <tr>\n",
-                            "      <th>3</th>\n",
-                            "      <td>1</td>\n",
-                            "      <td>1.0</td>\n",
-                            "      <td>1</td>\n",
-                            "      <td>1</td>\n",
-                            "      <td>university</td>\n",
-                            "      <td>university</td>\n",
-                            "      <td>u\\nn\\ni\\nv\\ne\\nr\\ns\\ni\\nt\\ny</td>\n",
-                            "      <td>[(tool='type_letter' tool_input={'letter': 'u'...</td>\n",
-                            "      <td>university</td>\n",
-                            "      <td>university</td>\n",
-                            "      <td>university</td>\n",
-                            "      <td>[type_letter, type_letter, type_letter, type_l...</td>\n",
-                            "      <td>10</td>\n",
-                            "      <td>10</td>\n",
-                            "    </tr>\n",
-                            "    <tr>\n",
-                            "      <th>4</th>\n",
-                            "      <td>1</td>\n",
-                            "      <td>1.0</td>\n",
-                            "      <td>1</td>\n",
-                            "      <td>1</td>\n",
-                            "      <td>keyboard</td>\n",
-                            "      <td>keyboard</td>\n",
-                            "      <td>keyboard</td>\n",
-                            "      <td>[(tool='type_letter' tool_input={'letter': 'k'...</td>\n",
-                            "      <td>keyboard</td>\n",
-                            "      <td>keyboard</td>\n",
-                            "      <td>keyboard</td>\n",
-                            "      <td>[type_letter, type_letter, type_letter, type_l...</td>\n",
-                            "      <td>8</td>\n",
-                            "      <td>8</td>\n",
-                            "    </tr>\n",
-                            "  </tbody>\n",
-                            "</table>\n",
-                            "</div>"
-                        ],
-                        "text/plain": [
-                            "   Intermediate steps correctness  # steps / # expected steps  \\\n",
-                            "0                               1                         1.0   \n",
-                            "1                               1                         1.0   \n",
-                            "2                               1                         1.0   \n",
-                            "3                               1                         1.0   \n",
-                            "4                               1                         1.0   \n",
-                            "\n",
-                            "   Correct Final State  correctness input.question output.question  \\\n",
-                            "0                    1            1  communication   communication   \n",
-                            "1                    1            1    information     information   \n",
-                            "2                    1            1     dictionary      dictionary   \n",
-                            "3                    1            1     university      university   \n",
-                            "4                    1            1       keyboard        keyboard   \n",
-                            "\n",
-                            "                  output.output  \\\n",
-                            "0                 communication   \n",
-                            "1                   information   \n",
-                            "2                    dictionary   \n",
-                            "3  u\\nn\\ni\\nv\\ne\\nr\\ns\\ni\\nt\\ny   \n",
-                            "4                      keyboard   \n",
-                            "\n",
-                            "                           output.intermediate_steps   output.state  \\\n",
-                            "0  [(tool='type_letter' tool_input={'letter': 'c'...  communication   \n",
-                            "1  [(tool='type_letter' tool_input={'letter': 'i'...    information   \n",
-                            "2  [(tool='type_letter' tool_input={'letter': 'd'...     dictionary   \n",
-                            "3  [(tool='type_letter' tool_input={'letter': 'u'...     university   \n",
-                            "4  [(tool='type_letter' tool_input={'letter': 'k'...       keyboard   \n",
-                            "\n",
-                            "  reference.state reference.reference  \\\n",
-                            "0   communication       communication   \n",
-                            "1     information         information   \n",
-                            "2      dictionary          dictionary   \n",
-                            "3      university          university   \n",
-                            "4        keyboard            keyboard   \n",
-                            "\n",
-                            "                            reference.expected_steps  num_expected_steps  \\\n",
-                            "0  [type_letter, type_letter, type_letter, type_l...                  13   \n",
-                            "1  [type_letter, type_letter, type_letter, type_l...                  11   \n",
-                            "2  [type_letter, type_letter, type_letter, type_l...                  10   \n",
-                            "3  [type_letter, type_letter, type_letter, type_l...                  10   \n",
-                            "4  [type_letter, type_letter, type_letter, type_l...                   8   \n",
-                            "\n",
-                            "   actual_number_of_steps  \n",
-                            "0                      13  \n",
-                            "1                      11  \n",
-                            "2                      10  \n",
-                            "3                      10  \n",
-                            "4                       8  "
-                        ]
-                    },
-                    "execution_count": 15,
-                    "metadata": {},
-                    "output_type": "execute_result"
-                }
-            ],
-            "source": [
-                "df.head()"
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": null,
-            "id": "62bcf6c2-6449-4967-a4f4-2f3d90657a52",
-            "metadata": {},
-            "outputs": [],
-            "source": []
-        }
-    ],
-    "metadata": {
-        "kernelspec": {
-            "display_name": "Python 3 (ipykernel)",
-            "language": "python",
-            "name": "python3"
-        },
-        "language_info": {
-            "codemirror_mode": {
-                "name": "ipython",
-                "version": 3
-            },
-            "file_extension": ".py",
-            "mimetype": "text/x-python",
-            "name": "python",
-            "nbconvert_exporter": "python",
-            "pygments_lexer": "ipython3",
-            "version": "3.11.2"
-        }
-    },
-    "nbformat": 4,
-    "nbformat_minor": 5
-}
\ No newline at end of file
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "60bb467d-861d-4b07-a48d-8e5aa177c969",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "# Typewriter: Single Tool\n",
+    "\n",
+    "In this task, an agent is given access to a single tool called \"type_letter\".\n",
+    "This tool takes one argument called \"letter\" which is expected to be a character.\n",
+    "\n",
+    "The agent must repeat the input string from the user, printing one\n",
+    "character at a time on a piece of virtual paper.\n",
+    "\n",
+    "The agent is evaluated based on its ability to print the correct string using\n",
+    "the \"type_letter\" tool."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "b39159d0-9ea1-414f-a9d8-4a7b22b3d2cc",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from langchain_benchmarks import clone_public_dataset, registry"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "1aef2b32-a5df-421f-8be3-a2ef27372ece",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<table>\n",
+       "<tbody>\n",
+       "<tr><td>Name       </td><td>Tool Usage - Typewriter (1 tool)                                                                                                                           </td></tr>\n",
+       "<tr><td>Type       </td><td>ToolUsageTask                                                                                                                                              </td></tr>\n",
+       "<tr><td>Dataset ID </td><td><a href=\"https://smith.langchain.com/public/59577193-8938-4ccf-92a7-e8a96bcf4f86/d\" target=\"_blank\" rel=\"noopener\">59577193-8938-4ccf-92a7-e8a96bcf4f86</a></td></tr>\n",
+       "<tr><td>Description</td><td>Environment with a single tool that accepts a single letter as input, and prints it on a piece of virtual paper.\n",
+       "\n",
+       "The objective of this task is to evaluate the ability of the model to use the provided tools to repeat a given input string.\n",
+       "\n",
+       "For example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\n",
+       "\n",
+       "The dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.                                                                                                                                                            </td></tr>\n",
+       "</tbody>\n",
+       "</table>"
+      ],
+      "text/plain": [
+       "ToolUsageTask(name='Tool Usage - Typewriter (1 tool)', dataset_id='https://smith.langchain.com/public/59577193-8938-4ccf-92a7-e8a96bcf4f86/d', description=\"Environment with a single tool that accepts a single letter as input, and prints it on a piece of virtual paper.\\n\\nThe objective of this task is to evaluate the ability of the model to use the provided tools to repeat a given input string.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\", create_environment=<function get_environment at 0x7f53b5c5b430>, instructions=\"Repeat the given string using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must print the letters 'a', 'b', and 'c' one at a time and in that order. \")"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "task = registry[\"Tool Usage - Typewriter (1 tool)\"]\n",
+    "task"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "bc33a639-3caf-4314-8ea7-1c7c8b1d114d",
+   "metadata": {},
+   "source": [
+    "Clone the dataset associated with this task"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "70369f67-deb4-467a-801a-6d38c3d0460d",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Dataset Tool Usage - Typewriter (1 tool) already exists. Skipping.\n",
+      "You can access the dataset at https://smith.langchain.com/o/e081f11e-fbd2-41b4-9fa8-5d76c76ef854/datasets/25850d74-d4e0-41ac-81a1-dfc78a79660b.\n"
+     ]
+    }
+   ],
+   "source": [
+    "clone_public_dataset(task.dataset_id, dataset_name=task.name)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fc78a3e1-80da-4607-98c3-a99c2037e7ca",
+   "metadata": {},
+   "source": [
+    "## The Environment\n",
+    "\n",
+    "The environment consists of a single tool and a virtual paper.\n",
+    "\n",
+    "The tool accepts a single letter as an input and prints the letter on the virtual paper. If successful, the tool returns the output \"OK\".\n",
+    "\n",
+    "To determine what's written on the paper, one needs to read the environment state."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "64e538ae-5cf2-4cd5-a312-25ee6924e869",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "env = task.create_environment()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "5516a34b-1e9b-4f1e-9462-cfc4d5bc29f9",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[StructuredTool(name='type_letter', description='type_letter(letter: str) -> str - Print the given letter on the paper.', args_schema=<class 'pydantic.v1.main.type_letterSchemaSchema'>, func=<function create_typer.<locals>.type_letter at 0x7f538cc0e040>)]"
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "env.tools"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "80501e1a-f1f6-4b38-8637-894503029d86",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "tool = env.tools[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "3f352e32-fdb6-4d9e-b1c4-3d78b4f50646",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'OK'"
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "tool.invoke({\"letter\": \"a\"})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "ec9c2e68-b55e-4087-bc1a-c38f4cfd401b",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'OK'"
+      ]
+     },
+     "execution_count": 22,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "tool.invoke({\"letter\": \"b\"})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "2cc5b174-25a4-4d5a-8535-56ecea62ea81",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'ab'"
+      ]
+     },
+     "execution_count": 24,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "env.read_state()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cd13d120-1bf9-481c-9392-c15ebdd9d77f",
+   "metadata": {},
+   "source": [
+    "## Agent"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b462f7b8-fd42-4613-ab5f-5f3cbbc37d28",
+   "metadata": {},
+   "source": [
+    "Let's build an agent that we can use for evaluation."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "6142cf4e-862c-47a3-aa75-81d7d3231308",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'question': 'abc',\n",
+       " 'output': 'a, b, c',\n",
+       " 'intermediate_steps': [(AgentActionMessageLog(tool='type_letter', tool_input={'letter': 'a'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'a'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n  \"letter\": \"a\"\\n}', 'name': 'type_letter'}})]),\n",
+       "   'OK'),\n",
+       "  (AgentActionMessageLog(tool='type_letter', tool_input={'letter': 'b'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'b'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n  \"letter\": \"b\"\\n}', 'name': 'type_letter'}})]),\n",
+       "   'OK'),\n",
+       "  (AgentActionMessageLog(tool='type_letter', tool_input={'letter': 'c'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'c'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n  \"letter\": \"c\"\\n}', 'name': 'type_letter'}})]),\n",
+       "   'OK')],\n",
+       " 'state': 'abc'}"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from langchain_benchmarks.tool_usage import agents\n",
+    "\n",
+    "agent_factory = agents.OpenAIAgentFactory(task, model=\"gpt-3.5-turbo-16k\")\n",
+    "\n",
+    "# Let's test that our agent works\n",
+    "agent = agent_factory.create()\n",
+    "agent.invoke({\"question\": \"abc\"})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3821e4b0-8e67-418a-840c-470fcde42df0",
+   "metadata": {},
+   "source": [
+    "## Eval\n",
+    "\n",
+    "Let's evaluate an agent now"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "fb32763c-79ab-426a-8fc6-bf8ebb0dd432",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "View the evaluation results for project 'test-shiny-curve-39' at:\n",
+      "https://smith.langchain.com/o/e081f11e-fbd2-41b4-9fa8-5d76c76ef854/projects/p/c66bbd6e-cce5-461d-9287-97391bd2f668?eval=true\n",
+      "\n",
+      "View all tests for Dataset Tool Usage - Typewriter (1 tool) at:\n",
+      "https://smith.langchain.com/o/e081f11e-fbd2-41b4-9fa8-5d76c76ef854/datasets/25850d74-d4e0-41ac-81a1-dfc78a79660b\n",
+      "[------------------------------------------------->] 20/20\n",
+      " Eval quantiles:\n",
+      "                                     0.25        0.5       0.75       mean  \\\n",
+      "Intermediate steps correctness   1.000000   1.000000   1.000000   0.950000   \n",
+      "# steps / # expected steps       1.000000   1.000000   1.000000   1.700000   \n",
+      "Correct Final State              1.000000   1.000000   1.000000   0.950000   \n",
+      "correctness                      1.000000   1.000000   1.000000   0.800000   \n",
+      "execution_time                  34.058961  34.058961  34.058961  34.058961   \n",
+      "\n",
+      "                                     mode  \n",
+      "Intermediate steps correctness   1.000000  \n",
+      "# steps / # expected steps       1.000000  \n",
+      "Correct Final State              1.000000  \n",
+      "correctness                      1.000000  \n",
+      "execution_time                  34.058961  \n"
+     ]
+    }
+   ],
+   "source": [
+    "from langsmith.client import Client\n",
+    "\n",
+    "from langchain_benchmarks.tool_usage import STANDARD_AGENT_EVALUATOR\n",
+    "\n",
+    "client = Client()\n",
+    "\n",
+    "test_run = client.run_on_dataset(\n",
+    "    dataset_name=task.name,\n",
+    "    llm_or_chain_factory=agent_factory.create,\n",
+    "    evaluation=STANDARD_AGENT_EVALUATOR,\n",
+    "    verbose=True,\n",
+    "    tags=[\"gpt-3.5-turbo-16k\"],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1b039225-01cf-481a-87a6-4e880e9b1dcd",
+   "metadata": {},
+   "source": [
+    "# Inspect\n",
+    "\n",
+    "You can take a look at the underlying results."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "6eb19db1-43b8-4866-a3d2-f211ba92ab8b",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "df = test_run.to_dataframe()\n",
+    "df = pd.json_normalize(df.to_dict(orient=\"records\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "7ab5a8b9-a937-4537-b879-704284df4494",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.8"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df[\"correctness\"].mean()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "ab7516ed-36b1-4c16-bf4a-cc49077460ad",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "df[\"num_expected_steps\"] = df[\"reference.expected_steps\"].apply(len)\n",
+    "df[\"actual_number_of_steps\"] = df[\"output.intermediate_steps\"].apply(len)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "50d7590d-20de-4768-ac90-adcdbfa70068",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Intermediate steps correctness</th>\n",
+       "      <th># steps / # expected steps</th>\n",
+       "      <th>Correct Final State</th>\n",
+       "      <th>correctness</th>\n",
+       "      <th>execution_time</th>\n",
+       "      <th>input.question</th>\n",
+       "      <th>output.question</th>\n",
+       "      <th>output.output</th>\n",
+       "      <th>output.intermediate_steps</th>\n",
+       "      <th>output.state</th>\n",
+       "      <th>reference.state</th>\n",
+       "      <th>reference.reference</th>\n",
+       "      <th>reference.expected_steps</th>\n",
+       "      <th>num_expected_steps</th>\n",
+       "      <th>actual_number_of_steps</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0</td>\n",
+       "      <td>15.0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>34.058961</td>\n",
+       "      <td>a</td>\n",
+       "      <td>a</td>\n",
+       "      <td>Agent stopped due to iteration limit or time l...</td>\n",
+       "      <td>[(tool='type_letter' tool_input={'letter': 'a'...</td>\n",
+       "      <td>aaaaaaaaaaaaaaa</td>\n",
+       "      <td>a</td>\n",
+       "      <td>a</td>\n",
+       "      <td>[type_letter]</td>\n",
+       "      <td>1</td>\n",
+       "      <td>15</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>34.058961</td>\n",
+       "      <td>aa</td>\n",
+       "      <td>aa</td>\n",
+       "      <td>aa\\naa</td>\n",
+       "      <td>[(tool='type_letter' tool_input={'letter': 'a'...</td>\n",
+       "      <td>aa</td>\n",
+       "      <td>aa</td>\n",
+       "      <td>aa</td>\n",
+       "      <td>[type_letter, type_letter]</td>\n",
+       "      <td>2</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>34.058961</td>\n",
+       "      <td>aaa</td>\n",
+       "      <td>aaa</td>\n",
+       "      <td>a\\na</td>\n",
+       "      <td>[(tool='type_letter' tool_input={'letter': 'a'...</td>\n",
+       "      <td>aaa</td>\n",
+       "      <td>aaa</td>\n",
+       "      <td>aaa</td>\n",
+       "      <td>[type_letter, type_letter, type_letter]</td>\n",
+       "      <td>3</td>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>34.058961</td>\n",
+       "      <td>aaaa</td>\n",
+       "      <td>aaaa</td>\n",
+       "      <td>a\\na</td>\n",
+       "      <td>[(tool='type_letter' tool_input={'letter': 'a'...</td>\n",
+       "      <td>aaaa</td>\n",
+       "      <td>aaaa</td>\n",
+       "      <td>aaaa</td>\n",
+       "      <td>[type_letter, type_letter, type_letter, type_l...</td>\n",
+       "      <td>4</td>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>34.058961</td>\n",
+       "      <td>dog</td>\n",
+       "      <td>dog</td>\n",
+       "      <td>d\\no\\ng</td>\n",
+       "      <td>[(tool='type_letter' tool_input={'letter': 'd'...</td>\n",
+       "      <td>dog</td>\n",
+       "      <td>dog</td>\n",
+       "      <td>dog</td>\n",
+       "      <td>[type_letter, type_letter, type_letter]</td>\n",
+       "      <td>3</td>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   Intermediate steps correctness  # steps / # expected steps  \\\n",
+       "0                               0                        15.0   \n",
+       "1                               1                         1.0   \n",
+       "2                               1                         1.0   \n",
+       "3                               1                         1.0   \n",
+       "4                               1                         1.0   \n",
+       "\n",
+       "   Correct Final State  correctness  execution_time input.question  \\\n",
+       "0                    0            0       34.058961              a   \n",
+       "1                    1            1       34.058961             aa   \n",
+       "2                    1            0       34.058961            aaa   \n",
+       "3                    1            0       34.058961           aaaa   \n",
+       "4                    1            1       34.058961            dog   \n",
+       "\n",
+       "  output.question                                      output.output  \\\n",
+       "0               a  Agent stopped due to iteration limit or time l...   \n",
+       "1              aa                                             aa\\naa   \n",
+       "2             aaa                                               a\\na   \n",
+       "3            aaaa                                               a\\na   \n",
+       "4             dog                                            d\\no\\ng   \n",
+       "\n",
+       "                           output.intermediate_steps     output.state  \\\n",
+       "0  [(tool='type_letter' tool_input={'letter': 'a'...  aaaaaaaaaaaaaaa   \n",
+       "1  [(tool='type_letter' tool_input={'letter': 'a'...               aa   \n",
+       "2  [(tool='type_letter' tool_input={'letter': 'a'...              aaa   \n",
+       "3  [(tool='type_letter' tool_input={'letter': 'a'...             aaaa   \n",
+       "4  [(tool='type_letter' tool_input={'letter': 'd'...              dog   \n",
+       "\n",
+       "  reference.state reference.reference  \\\n",
+       "0               a                   a   \n",
+       "1              aa                  aa   \n",
+       "2             aaa                 aaa   \n",
+       "3            aaaa                aaaa   \n",
+       "4             dog                 dog   \n",
+       "\n",
+       "                            reference.expected_steps  num_expected_steps  \\\n",
+       "0                                      [type_letter]                   1   \n",
+       "1                         [type_letter, type_letter]                   2   \n",
+       "2            [type_letter, type_letter, type_letter]                   3   \n",
+       "3  [type_letter, type_letter, type_letter, type_l...                   4   \n",
+       "4            [type_letter, type_letter, type_letter]                   3   \n",
+       "\n",
+       "   actual_number_of_steps  \n",
+       "0                      15  \n",
+       "1                       2  \n",
+       "2                       3  \n",
+       "3                       4  \n",
+       "4                       3  "
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.head()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/docs/source/notebooks/tool_usage/typewriter_26.ipynb b/docs/source/notebooks/tool_usage/typewriter_26.ipynb
index 51a8632f..1c57b95d 100644
--- a/docs/source/notebooks/tool_usage/typewriter_26.ipynb
+++ b/docs/source/notebooks/tool_usage/typewriter_26.ipynb
@@ -1,284 +1,763 @@
 {
-    "cells": [
-        {
-            "cell_type": "markdown",
-            "id": "60bb467d-861d-4b07-a48d-8e5aa177c969",
-            "metadata": {
-                "tags": []
-            },
-            "source": [
-                "# Typewriter: 26 Tools\n",
-                "\n",
-                "\n",
-                "Let's see how to evaluate an agent's ability to use tools.\n",
-                "\n",
-                "    A task where the agent must type a given string one letter at a time.\n",
-                "\n",
-                "    In this variation of the task, the agent is given access to 26 parameterless functions,\n",
-                "    each representing a letter of the alphabet."
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": null,
-            "id": "845c77a6-9da6-494c-973f-0ee1dac67b19",
-            "metadata": {},
-            "outputs": [],
-            "source": [
-                "import os\n",
-                "\n",
-                "os.environ[\"LANGCHAIN_API_KEY\"] = \"sk-...\"  # Your api key."
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": 1,
-            "id": "b39159d0-9ea1-414f-a9d8-4a7b22b3d2cc",
-            "metadata": {
-                "tags": []
-            },
-            "outputs": [],
-            "source": [
-                "from langchain_benchmarks import clone_public_dataset, registry"
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": 2,
-            "id": "1aef2b32-a5df-421f-8be3-a2ef27372ece",
-            "metadata": {
-                "tags": []
-            },
-            "outputs": [
-                {
-                    "data": {
-                        "text/html": [
-                            "<table>\n",
-                            "<tbody>\n",
-                            "<tr><td>Name       </td><td>Tool Usage - Typewriter (26 tools)                                                                                                                         </td></tr>\n",
-                            "<tr><td>Type       </td><td>ToolUsageTask                                                                                                                                              </td></tr>\n",
-                            "<tr><td>Dataset ID </td><td><a href=\"https://smith.langchain.com/public/128af05e-aa00-4e3b-a958-d166dd450581/d\" target=\"_blank\" rel=\"noopener\">128af05e-aa00-4e3b-a958-d166dd450581</a></td></tr>\n",
-                            "<tr><td>Description</td><td>Environment with 26 tools each tool represents a letter of the alphabet.\n",
-                            "\n",
-                            "The objective of this task is to evaluate the model's ability the use tools\n",
-                            "for a simple repetition task.\n",
-                            "\n",
-                            "For example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\n",
-                            "\n",
-                            "The dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\n",
-                            "\n",
-                            "This is a variation of the typer writer task, where 26 parameterless tools are\n",
-                            "given instead of a single tool that takes a letter as an argument.                                                                                                                                                            </td></tr>\n",
-                            "</tbody>\n",
-                            "</table>"
-                        ],
-                        "text/plain": [
-                            "ToolUsageTask(name='Tool Usage - Typewriter (26 tools)', dataset_id='https://smith.langchain.com/public/128af05e-aa00-4e3b-a958-d166dd450581/d', description=\"Environment with 26 tools each tool represents a letter of the alphabet.\\n\\nThe objective of this task is to evaluate the model's ability the use tools\\nfor a simple repetition task.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\\nThis is a variation of the typer writer task, where 26 parameterless tools are\\ngiven instead of a single tool that takes a letter as an argument.\\n\", create_environment=<function get_environment at 0x12788dd00>, instructions=\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. Please invoke the functions without any arguments.\")"
-                        ]
-                    },
-                    "execution_count": 2,
-                    "metadata": {},
-                    "output_type": "execute_result"
-                }
-            ],
-            "source": [
-                "task = registry[\"Tool Usage - Typewriter (26 tools)\"]\n",
-                "task"
-            ]
-        },
-        {
-            "cell_type": "markdown",
-            "id": "bc33a639-3caf-4314-8ea7-1c7c8b1d114d",
-            "metadata": {},
-            "source": [
-                "Clone the dataset associaetd with this task"
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": 3,
-            "id": "70369f67-deb4-467a-801a-6d38c3d0460d",
-            "metadata": {
-                "tags": []
-            },
-            "outputs": [
-                {
-                    "name": "stdout",
-                    "output_type": "stream",
-                    "text": [
-                        "Dataset Tool Usage - Typewriter (26 tools) already exists. Skipping.\n",
-                        "You can access the dataset at https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/2f462c7a-f9b9-46e7-b96b-7469e965f478.\n"
-                    ]
-                }
-            ],
-            "source": [
-                "clone_public_dataset(task.dataset_id, dataset_name=task.name)"
-            ]
-        },
-        {
-            "cell_type": "markdown",
-            "id": "b462f7b8-fd42-4613-ab5f-5f3cbbc37d28",
-            "metadata": {},
-            "source": [
-                "Let's build an agent that we can use for evaluation."
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": 4,
-            "id": "61535a75-24f6-4727-9549-f76c263e9153",
-            "metadata": {
-                "tags": []
-            },
-            "outputs": [],
-            "source": [
-                "env = task.create_environment()"
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": 5,
-            "id": "6142cf4e-862c-47a3-aa75-81d7d3231308",
-            "metadata": {
-                "tags": []
-            },
-            "outputs": [
-                {
-                    "data": {
-                        "text/plain": [
-                            "{'question': 'foo',\n",
-                            " 'output': \"Could not parse tool input: {'arguments': '', 'name': 'f'} because the `arguments` is not valid JSON.\",\n",
-                            " 'intermediate_steps': [(AgentAction(tool='_Exception', tool_input='Invalid or incomplete response', log=\"Could not parse tool input: {'arguments': 'f', 'name': 'f'} because the `arguments` is not valid JSON.\"),\n",
-                            "   'Invalid or incomplete response'),\n",
-                            "  (AgentAction(tool='_Exception', tool_input='Invalid or incomplete response', log=\"Could not parse tool input: {'arguments': '', 'name': 'f'} because the `arguments` is not valid JSON.\"),\n",
-                            "   'Invalid or incomplete response'),\n",
-                            "  (AgentAction(tool='_Exception', tool_input='Invalid or incomplete response', log=\"Could not parse tool input: {'arguments': '', 'name': 'f'} because the `arguments` is not valid JSON.\"),\n",
-                            "   'Invalid or incomplete response'),\n",
-                            "  (AgentAction(tool='_Exception', tool_input='Invalid or incomplete response', log=\"Could not parse tool input: {'arguments': '', 'name': 'f'} because the `arguments` is not valid JSON.\"),\n",
-                            "   'Invalid or incomplete response'),\n",
-                            "  (AgentAction(tool='_Exception', tool_input='Invalid or incomplete response', log=\"Could not parse tool input: {'arguments': '', 'name': 'f'} because the `arguments` is not valid JSON.\"),\n",
-                            "   'Invalid or incomplete response')],\n",
-                            " 'state': ''}"
-                        ]
-                    },
-                    "execution_count": 5,
-                    "metadata": {},
-                    "output_type": "execute_result"
-                }
-            ],
-            "source": [
-                "from langchain_benchmarks.tool_usage import agents\n",
-                "\n",
-                "agent_factory = agents.OpenAIAgentFactory(task, model=\"gpt-3.5-turbo-16k\")\n",
-                "\n",
-                "# Let's test that our agent works\n",
-                "agent = agent_factory()\n",
-                "agent.invoke({\"question\": \"foo\"})"
-            ]
-        },
-        {
-            "cell_type": "markdown",
-            "id": "3821e4b0-8e67-418a-840c-470fcde42df0",
-            "metadata": {},
-            "source": [
-                "## Eval\n",
-                "\n",
-                "Let's evaluate an agent now"
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": 6,
-            "id": "fb32763c-79ab-426a-8fc6-bf8ebb0dd432",
-            "metadata": {
-                "tags": []
-            },
-            "outputs": [
-                {
-                    "name": "stdout",
-                    "output_type": "stream",
-                    "text": [
-                        "View the evaluation results for project 'test-notable-artist-76' at:\n",
-                        "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/5c828160-9f7f-4f01-84ea-05f8a498d031?eval=true\n",
-                        "\n",
-                        "View all tests for Dataset Tool Usage - Typewriter (26 tools) at:\n",
-                        "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/2f462c7a-f9b9-46e7-b96b-7469e965f478\n",
-                        "[>                                                 ] 0/20"
-                    ]
-                },
-                {
-                    "name": "stderr",
-                    "output_type": "stream",
-                    "text": [
-                        "Chain failed for example 2d4e99fc-8495-468e-8429-6c25a2d176f3 with inputs {'question': 'keyboard'}\n",
-                        "Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID b658bca90fb852f4d236fc368bc65bcc in your email.)', 'type': 'server_error', 'param': None, 'code': None}}\n"
-                    ]
-                },
-                {
-                    "name": "stdout",
-                    "output_type": "stream",
-                    "text": [
-                        "[------------------->                              ] 8/20"
-                    ]
-                },
-                {
-                    "name": "stderr",
-                    "output_type": "stream",
-                    "text": [
-                        "Chain failed for example 8af5bd36-fc11-4b23-9019-f642cfaf8a01 with inputs {'question': 'horse'}\n",
-                        "Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID 3c40664804cb6e8c84e0e8796dbc0a6d in your email.)', 'type': 'server_error', 'param': None, 'code': None}}\n"
-                    ]
-                },
-                {
-                    "name": "stdout",
-                    "output_type": "stream",
-                    "text": [
-                        "[------------------------------------------------->] 20/20\n",
-                        " Eval quantiles:\n",
-                        "                                    0.25   0.5   0.75      mean  mode\n",
-                        "Intermediate steps correctness  0.000000  0.00  0.000  0.000000  0.00\n",
-                        "# steps / # expected steps      0.703571  0.75  1.375  1.007551  0.75\n",
-                        "Correct Final State             0.000000  0.00  0.000  0.055556  0.00\n",
-                        "correctness                     0.000000  0.00  0.000  0.111111  0.00\n"
-                    ]
-                }
-            ],
-            "source": [
-                "from langsmith.client import Client\n",
-                "\n",
-                "from langchain_benchmarks.tool_usage import STANDARD_AGENT_EVALUATOR\n",
-                "\n",
-                "client = Client()\n",
-                "\n",
-                "test_run = client.run_on_dataset(\n",
-                "    dataset_name=task.name,\n",
-                "    llm_or_chain_factory=agent_factory.create,\n",
-                "    evaluation=STANDARD_AGENT_EVALUATOR,\n",
-                "    verbose=True,\n",
-                "    tags=[\"gpt-3.5-turbo-16k\"],\n",
-                ")"
-            ]
-        }
-    ],
-    "metadata": {
-        "kernelspec": {
-            "display_name": "Python 3 (ipykernel)",
-            "language": "python",
-            "name": "python3"
-        },
-        "language_info": {
-            "codemirror_mode": {
-                "name": "ipython",
-                "version": 3
-            },
-            "file_extension": ".py",
-            "mimetype": "text/x-python",
-            "name": "python",
-            "nbconvert_exporter": "python",
-            "pygments_lexer": "ipython3",
-            "version": "3.11.2"
-        }
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "60bb467d-861d-4b07-a48d-8e5aa177c969",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "# Typewriter: 26 Tools\n",
+    "\n",
+    "This is a variation of the typewriter task in which the agent has access to 26 parameterless tools.\n",
+    "\n",
+    "Each tool represents a letter of the alphabet (e.g., 'a', 'b', 'c').\n",
+    "\n",
+    "The agent can use each tool to \"print\" the corresponding letter on a piece of virtual paper.\n",
+    "\n",
+    "The objective for the agent is to \"print\" the user's input on the paper exactly."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "845c77a6-9da6-494c-973f-0ee1dac67b19",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "os.environ[\"LANGCHAIN_API_KEY\"] = \"sk-...\"  # Your api key."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "b39159d0-9ea1-414f-a9d8-4a7b22b3d2cc",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from langchain_benchmarks import clone_public_dataset, registry"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "1aef2b32-a5df-421f-8be3-a2ef27372ece",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<table>\n",
+       "<tbody>\n",
+       "<tr><td>Name       </td><td>Tool Usage - Typewriter (26 tools)                                                                                                                         </td></tr>\n",
+       "<tr><td>Type       </td><td>ToolUsageTask                                                                                                                                              </td></tr>\n",
+       "<tr><td>Dataset ID </td><td><a href=\"https://smith.langchain.com/public/128af05e-aa00-4e3b-a958-d166dd450581/d\" target=\"_blank\" rel=\"noopener\">128af05e-aa00-4e3b-a958-d166dd450581</a></td></tr>\n",
+       "<tr><td>Description</td><td>Environment with 26 tools each tool represents a letter of the alphabet.\n",
+       "\n",
+       "The objective of this task is to evaluate the model's ability the use tools\n",
+       "for a simple repetition task.\n",
+       "\n",
+       "For example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\n",
+       "\n",
+       "The dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\n",
+       "\n",
+       "This is a variation of the typer writer task, where 26 parameterless tools are\n",
+       "given instead of a single tool that takes a letter as an argument.                                                                                                                                                            </td></tr>\n",
+       "</tbody>\n",
+       "</table>"
+      ],
+      "text/plain": [
+       "ToolUsageTask(name='Tool Usage - Typewriter (26 tools)', dataset_id='https://smith.langchain.com/public/128af05e-aa00-4e3b-a958-d166dd450581/d', description=\"Environment with 26 tools each tool represents a letter of the alphabet.\\n\\nThe objective of this task is to evaluate the model's ability the use tools\\nfor a simple repetition task.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\\nThis is a variation of the typer writer task, where 26 parameterless tools are\\ngiven instead of a single tool that takes a letter as an argument.\\n\", create_environment=<function get_environment at 0x7f09670b7160>, instructions=\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. Please invoke the functions without any arguments.\")"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "task = registry[\"Tool Usage - Typewriter (26 tools)\"]\n",
+    "task"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "bc33a639-3caf-4314-8ea7-1c7c8b1d114d",
+   "metadata": {},
+   "source": [
+    "Clone the dataset associated with this task"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "70369f67-deb4-467a-801a-6d38c3d0460d",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Dataset Tool Usage - Typewriter (26 tools) already exists. Skipping.\n",
+      "You can access the dataset at https://smith.langchain.com/o/e081f11e-fbd2-41b4-9fa8-5d76c76ef854/datasets/5051c0ae-16be-4afa-b914-84acbc5e9659.\n"
+     ]
+    }
+   ],
+   "source": [
+    "clone_public_dataset(task.dataset_id, dataset_name=task.name)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b462f7b8-fd42-4613-ab5f-5f3cbbc37d28",
+   "metadata": {},
+   "source": [
+    "Let's build an agent that we can use for evaluation."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6ce51f81-1b3a-4dda-a382-c2fed3013af1",
+   "metadata": {},
+   "source": [
+    "## The Environment\n",
+    "\n",
+    "The environment consists of 26 tools and a virtual paper.\n",
+    "\n",
+    "Each tool is responsible for printing its corresponding letter on the paper."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "61535a75-24f6-4727-9549-f76c263e9153",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "env = task.create_environment()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "f35a0a1d-5a1e-4de1-8d8c-c7c9a264a6c7",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[StructuredTool(name='a', description='a() -> str - Run to Type the letter \"a\".', args_schema=<class 'pydantic.v1.main.aSchemaSchema'>, func=<function _create_typing_func.<locals>.func at 0x7f099cebd310>),\n",
+       " StructuredTool(name='b', description='b() -> str - Run to Type the letter \"b\".', args_schema=<class 'pydantic.v1.main.bSchemaSchema'>, func=<function _create_typing_func.<locals>.func at 0x7f097f56f940>),\n",
+       " StructuredTool(name='c', description='c() -> str - Run to Type the letter \"c\".', args_schema=<class 'pydantic.v1.main.cSchemaSchema'>, func=<function _create_typing_func.<locals>.func at 0x7f097f56ff70>),\n",
+       " StructuredTool(name='d', description='d() -> str - Run to Type the letter \"d\".', args_schema=<class 'pydantic.v1.main.dSchemaSchema'>, func=<function _create_typing_func.<locals>.func at 0x7f096421b040>),\n",
+       " StructuredTool(name='e', description='e() -> str - Run to Type the letter \"e\".', args_schema=<class 'pydantic.v1.main.eSchemaSchema'>, func=<function _create_typing_func.<locals>.func at 0x7f096421b1f0>)]"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "env.tools[:5]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "5bea0190-39ec-4f30-9a00-90136bc6bf0b",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'OK'"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "env.tools[0].invoke({})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "bf7444da-15a1-455a-b22e-639cbfff8432",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'OK'"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "env.tools[3].invoke({})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "d12bd710-5c01-4539-a4b9-afbf03164923",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'ad'"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "env.read_state()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f1d62a13-3771-460f-b131-4443f669ca3d",
+   "metadata": {},
+   "source": [
+    "## Agent"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "6142cf4e-862c-47a3-aa75-81d7d3231308",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'question': 'abc',\n",
+       " 'output': 'abc\\nabc',\n",
+       " 'intermediate_steps': [(AgentActionMessageLog(tool='a', tool_input={}, log='\\nInvoking: `a` with `{}`\\n\\n\\n', message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '', 'name': 'a'}})]),\n",
+       "   'OK'),\n",
+       "  (AgentActionMessageLog(tool='b', tool_input={}, log='\\nInvoking: `b` with `{}`\\n\\n\\n', message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '', 'name': 'b'}})]),\n",
+       "   'OK'),\n",
+       "  (AgentActionMessageLog(tool='c', tool_input={}, log='\\nInvoking: `c` with `{}`\\n\\n\\n', message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '', 'name': 'c'}})]),\n",
+       "   'OK')],\n",
+       " 'state': 'abc'}"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from langchain_benchmarks.tool_usage import agents\n",
+    "\n",
+    "agent_factory = agents.OpenAIAgentFactory(task, model=\"gpt-3.5-turbo-16k\")\n",
+    "\n",
+    "# Let's test that our agent works\n",
+    "agent = agent_factory()\n",
+    "agent.invoke({\"question\": \"abc\"})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3821e4b0-8e67-418a-840c-470fcde42df0",
+   "metadata": {},
+   "source": [
+    "## Eval\n",
+    "\n",
+    "Let's evaluate an agent now"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "fb32763c-79ab-426a-8fc6-bf8ebb0dd432",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "View the evaluation results for project 'test-mealy-ink-37' at:\n",
+      "https://smith.langchain.com/o/e081f11e-fbd2-41b4-9fa8-5d76c76ef854/projects/p/d5562dcb-7bea-432d-8e41-3fcf3f6f2247?eval=true\n",
+      "\n",
+      "View all tests for Dataset Tool Usage - Typewriter (26 tools) at:\n",
+      "https://smith.langchain.com/o/e081f11e-fbd2-41b4-9fa8-5d76c76ef854/datasets/5051c0ae-16be-4afa-b914-84acbc5e9659\n",
+      "[----------->                                      ] 5/20"
+     ]
     },
-    "nbformat": 4,
-    "nbformat_minor": 5
-}
\ No newline at end of file
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Chain failed for example c0ee0026-e11b-4036-b7f0-135ac9e82d66 with inputs {'question': 'horse'}\n",
+      "Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID 8707858df9212b40a8d4f22a0027d2a2 in your email.)', 'type': 'server_error', 'param': None, 'code': None}}\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[-------------->                                   ] 6/20"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Chain failed for example 4ae1d1c0-4c34-4ef0-afd8-292be2e53b8d with inputs {'question': 'school'}\n",
+      "Error Type: BadRequestError, Message: Error code: 400 - {'error': {'message': \"'s()' does not match '^[a-zA-Z0-9_-]{1,64}$' - 'messages.2.function_call.name'\", 'type': 'invalid_request_error', 'param': None, 'code': None}}\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[----------------------------->                    ] 12/20"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Chain failed for example e03000da-4c4b-4060-a798-0e71f3c3ff90 with inputs {'question': 'keyboard'}\n",
+      "Error Type: InternalServerError, Message: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID d20bfa7a39d9ee8c80e72070a6aafab9 in your email.)', 'type': 'server_error', 'param': None, 'code': None}}\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[------------------------------------------------->] 20/20\n",
+      " Eval quantiles:\n",
+      "                                     0.25        0.5       0.75       mean  \\\n",
+      "Intermediate steps correctness   0.000000   0.000000   1.000000   0.294118   \n",
+      "# steps / # expected steps       1.000000   1.125000   2.142857   1.722598   \n",
+      "Correct Final State              0.000000   1.000000   1.000000   0.529412   \n",
+      "correctness                      0.000000   0.000000   1.000000   0.470588   \n",
+      "execution_time                  38.794961  38.794961  38.794961  38.794961   \n",
+      "\n",
+      "                                     mode  \n",
+      "Intermediate steps correctness   0.000000  \n",
+      "# steps / # expected steps       1.000000  \n",
+      "Correct Final State              1.000000  \n",
+      "correctness                      0.000000  \n",
+      "execution_time                  38.794961  \n"
+     ]
+    }
+   ],
+   "source": [
+    "from langsmith.client import Client\n",
+    "\n",
+    "from langchain_benchmarks.tool_usage import STANDARD_AGENT_EVALUATOR\n",
+    "\n",
+    "client = Client()\n",
+    "\n",
+    "test_run = client.run_on_dataset(\n",
+    "    dataset_name=task.name,\n",
+    "    llm_or_chain_factory=agent_factory.create,\n",
+    "    evaluation=STANDARD_AGENT_EVALUATOR,\n",
+    "    verbose=True,\n",
+    "    tags=[\"gpt-3.5-turbo-16k\"],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "641f534e-3ce2-436b-83a8-0289578546ff",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "df = test_run.to_dataframe()\n",
+    "df = pd.json_normalize(df.to_dict(orient=\"records\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 69,
+   "id": "4bc23900-46e5-450f-80c6-9d53eb4b12a7",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "df[\"num_expected_steps\"] = df[\"reference.expected_steps\"].apply(len)\n",
+    "df[\"actual_number_of_steps\"] = (\n",
+    "    df[\"output.intermediate_steps\"]\n",
+    "    .apply(lambda x: None if not isinstance(x, list) else len(x))\n",
+    "    .fillna(\"\")\n",
+    ")\n",
+    "df[\"output.Error\"].fillna(\"\", inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 71,
+   "id": "985e18cc-03ad-47aa-a703-037cec97270a",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.5294117647058824"
+      ]
+     },
+     "execution_count": 71,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df[\"Correct Final State\"].mean()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 73,
+   "id": "bfe6bebd-0a6d-4787-9441-fc51c0d2e7c3",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>input.question</th>\n",
+       "      <th>output.state</th>\n",
+       "      <th>num_expected_steps</th>\n",
+       "      <th>actual_number_of_steps</th>\n",
+       "      <th>Correct Final State</th>\n",
+       "      <th>output.Error</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>a</td>\n",
+       "      <td>a</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>aa</td>\n",
+       "      <td>aa</td>\n",
+       "      <td>2</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>aaa</td>\n",
+       "      <td>aaaaaaaaaaaaaaa</td>\n",
+       "      <td>3</td>\n",
+       "      <td>15.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>aaaa</td>\n",
+       "      <td>aaaaaaaaaaaaaaa</td>\n",
+       "      <td>4</td>\n",
+       "      <td>15.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>dog</td>\n",
+       "      <td>dog</td>\n",
+       "      <td>3</td>\n",
+       "      <td>4.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>cat</td>\n",
+       "      <td>cat</td>\n",
+       "      <td>3</td>\n",
+       "      <td>4.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>hand</td>\n",
+       "      <td>hand</td>\n",
+       "      <td>4</td>\n",
+       "      <td>4.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>head</td>\n",
+       "      <td>hhhhhhhhhhhhhhh</td>\n",
+       "      <td>4</td>\n",
+       "      <td>15.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>house</td>\n",
+       "      <td>house</td>\n",
+       "      <td>5</td>\n",
+       "      <td>5.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>horse</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>5</td>\n",
+       "      <td></td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>InternalServerError(\"Error code: 500 - {'error...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10</th>\n",
+       "      <td>school</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>6</td>\n",
+       "      <td></td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>BadRequestError('Error code: 400 - {\\'error\\':...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11</th>\n",
+       "      <td>church</td>\n",
+       "      <td>churchchchchch</td>\n",
+       "      <td>6</td>\n",
+       "      <td>15.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>12</th>\n",
+       "      <td>teacher</td>\n",
+       "      <td>teacher</td>\n",
+       "      <td>7</td>\n",
+       "      <td>7.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>13</th>\n",
+       "      <td>student</td>\n",
+       "      <td>studentstudent</td>\n",
+       "      <td>7</td>\n",
+       "      <td>15.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>14</th>\n",
+       "      <td>computer</td>\n",
+       "      <td>computer</td>\n",
+       "      <td>8</td>\n",
+       "      <td>9.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>15</th>\n",
+       "      <td>keyboard</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>8</td>\n",
+       "      <td></td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>InternalServerError(\"Error code: 500 - {'error...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>16</th>\n",
+       "      <td>university</td>\n",
+       "      <td></td>\n",
+       "      <td>10</td>\n",
+       "      <td>5.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>17</th>\n",
+       "      <td>dictionary</td>\n",
+       "      <td>dictionarystr</td>\n",
+       "      <td>10</td>\n",
+       "      <td>15.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>18</th>\n",
+       "      <td>information</td>\n",
+       "      <td></td>\n",
+       "      <td>11</td>\n",
+       "      <td>3.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>19</th>\n",
+       "      <td>communication</td>\n",
+       "      <td>communication</td>\n",
+       "      <td>13</td>\n",
+       "      <td>14.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   input.question     output.state  num_expected_steps actual_number_of_steps  \\\n",
+       "0               a                a                   1                    1.0   \n",
+       "1              aa               aa                   2                    2.0   \n",
+       "2             aaa  aaaaaaaaaaaaaaa                   3                   15.0   \n",
+       "3            aaaa  aaaaaaaaaaaaaaa                   4                   15.0   \n",
+       "4             dog              dog                   3                    4.0   \n",
+       "5             cat              cat                   3                    4.0   \n",
+       "6            hand             hand                   4                    4.0   \n",
+       "7            head  hhhhhhhhhhhhhhh                   4                   15.0   \n",
+       "8           house            house                   5                    5.0   \n",
+       "9           horse              NaN                   5                          \n",
+       "10         school              NaN                   6                          \n",
+       "11         church   churchchchchch                   6                   15.0   \n",
+       "12        teacher          teacher                   7                    7.0   \n",
+       "13        student   studentstudent                   7                   15.0   \n",
+       "14       computer         computer                   8                    9.0   \n",
+       "15       keyboard              NaN                   8                          \n",
+       "16     university                                   10                    5.0   \n",
+       "17     dictionary    dictionarystr                  10                   15.0   \n",
+       "18    information                                   11                    3.0   \n",
+       "19  communication    communication                  13                   14.0   \n",
+       "\n",
+       "    Correct Final State                                       output.Error  \n",
+       "0                   1.0                                                     \n",
+       "1                   1.0                                                     \n",
+       "2                   0.0                                                     \n",
+       "3                   0.0                                                     \n",
+       "4                   1.0                                                     \n",
+       "5                   1.0                                                     \n",
+       "6                   1.0                                                     \n",
+       "7                   0.0                                                     \n",
+       "8                   1.0                                                     \n",
+       "9                   NaN  InternalServerError(\"Error code: 500 - {'error...  \n",
+       "10                  NaN  BadRequestError('Error code: 400 - {\\'error\\':...  \n",
+       "11                  0.0                                                     \n",
+       "12                  1.0                                                     \n",
+       "13                  0.0                                                     \n",
+       "14                  1.0                                                     \n",
+       "15                  NaN  InternalServerError(\"Error code: 500 - {'error...  \n",
+       "16                  0.0                                                     \n",
+       "17                  0.0                                                     \n",
+       "18                  0.0                                                     \n",
+       "19                  1.0                                                     "
+      ]
+     },
+     "execution_count": 73,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df[\n",
+    "    [\n",
+    "        \"input.question\",\n",
+    "        \"output.state\",\n",
+    "        \"num_expected_steps\",\n",
+    "        \"actual_number_of_steps\",\n",
+    "        \"Correct Final State\",\n",
+    "        \"output.Error\",\n",
+    "    ]\n",
+    "]"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/langchain_benchmarks/tool_usage/evaluators.py b/langchain_benchmarks/tool_usage/evaluators.py
index 187ed605..95b284f3 100644
--- a/langchain_benchmarks/tool_usage/evaluators.py
+++ b/langchain_benchmarks/tool_usage/evaluators.py
@@ -5,11 +5,10 @@
 * Agents must output "intermediate_steps" in their run outputs.
 * The dataset must have "expected_steps" in its outputs.
 """
-from typing import List, Optional, Union
+from typing import Optional
 
 from langchain.evaluation import EvaluatorType
 from langchain.smith import RunEvalConfig
-from langchain.smith.evaluation.config import EvalConfig
 from langsmith.evaluation.evaluator import (
     EvaluationResult,
     EvaluationResults,
@@ -17,7 +16,56 @@
 )
 from langsmith.schemas import Example, Run
 
-from langchain_benchmarks.schema import ExtractionTask
+
+def compare_outputs(run_outputs: dict, example_outputs: dict) -> EvaluationResults:
+    """Compare the outputs of a run to the expected outputs.
+
+    Args:
+        run_outputs: Agent outputs. Must contain "intermediate_steps", a list of
+            (action, observation) pairs; may contain "state".
+        example_outputs: Dataset outputs. Must contain "expected_steps"; may
+            contain "order_matters" (defaults to True) and "state".
+
+    Returns:
+        A dict whose "results" entry lists one EvaluationResult per metric.
+    """
+    # Each intermediate step is an (action, observation) pair; only the tool
+    # name of the action is compared against the expected steps.
+    trajectory = [action.tool for action, _ in run_outputs["intermediate_steps"]]
+    expected_trajectory = example_outputs["expected_steps"]
+
+    if example_outputs.get("order_matters", True):
+        # The tools must be invoked in exactly the expected order.
+        score = int(trajectory == expected_trajectory)
+    else:
+        # Sorting checks that each tool is used the same number of times
+        # while ignoring the order of the invocations.
+        score = int(sorted(trajectory) == sorted(expected_trajectory))
+
+    if expected_trajectory:
+        step_fraction = len(trajectory) / len(expected_trajectory)
+    else:
+        # No steps are expected, so the ratio is undefined. Instead of raising
+        # ZeroDivisionError, report 1.0 when no steps were taken either and the
+        # raw step count otherwise.
+        step_fraction = float(len(trajectory)) if trajectory else 1.0
+
+    results = [
+        EvaluationResult(key="Intermediate steps correctness", score=score),
+        EvaluationResult(key="# steps / # expected steps", score=step_fraction),
+    ]
+
+    # Only score the final state when both the run and the example provide one;
+    # a missing reference state used to raise KeyError.
+    if "state" in run_outputs and "state" in example_outputs:
+        results.append(
+            EvaluationResult(
+                key="Correct Final State",
+                score=int(run_outputs["state"] == example_outputs["state"]),
+            )
+        )
+
+    return {"results": results}
 
 
 class AgentTrajectoryEvaluator(RunEvaluator):
@@ -26,46 +74,25 @@ class AgentTrajectoryEvaluator(RunEvaluator):
     def evaluate_run(
         self, run: Run, example: Optional[Example] = None
     ) -> EvaluationResults:
+        """Score an agent run against its example; raise ValueError on missing data."""
         if run.outputs is None:
             raise ValueError("Run outputs cannot be None")
-        # This is the output of each run
-        intermediate_steps = run.outputs["intermediate_steps"]
-        # Since we are comparing to the tool names, we now need to get that
-        # Intermediate steps is a Tuple[AgentAction, Any]
-        # The first element is the action taken
-        # The second element is the observation from taking that action
-        trajectory = [action.tool for action, _ in intermediate_steps]
-        # This is what we uploaded to the dataset
+
+        # The dataset example provides the reference outputs to compare against
         if example is None:
             raise ValueError("Example cannot be None")
-        expected_trajectory = example.outputs["expected_steps"]
 
-        # Just score it based on whether it is correct or not
-        score = int(trajectory == expected_trajectory)
-        step_fraction = len(trajectory) / len(expected_trajectory)
+        if "intermediate_steps" not in run.outputs:
+            raise ValueError(
+                "Please make sure that your agent outputs 'intermediate_steps'"
+            )
 
-        results = [
-            EvaluationResult(
-                key="Intermediate steps correctness",
-                score=score,
-            ),
-            EvaluationResult(
-                key="# steps / # expected steps",
-                score=step_fraction,
-            ),
-        ]
-
-        if "state" in run.outputs:
-            state = run.outputs["state"]
-            example_state = example.outputs["state"]
-            results.append(
-                EvaluationResult(
-                    key="Correct Final State",
-                    score=int(state == example_state),
-                )
+        if "expected_steps" not in (example.outputs or {}):  # outputs may be None
+            raise ValueError(
+                "Please make sure that your dataset contains 'expected_steps'"
             )
 
-        return {"results": results}
+        return compare_outputs(run.outputs, example.outputs)
 
 
 STANDARD_AGENT_EVALUATOR = RunEvalConfig(
diff --git a/tests/unit_tests/tool_usage/test_evaluator.py b/tests/unit_tests/tool_usage/test_evaluator.py
new file mode 100644
index 00000000..82a5ff62
--- /dev/null
+++ b/tests/unit_tests/tool_usage/test_evaluator.py
@@ -0,0 +1,115 @@
+"""Test the standard agent evaluator."""
+
+import pytest
+from langchain.schema import AgentAction
+
+from langchain_benchmarks.tool_usage.evaluators import compare_outputs
+
+
+@pytest.mark.parametrize(
+    "run_outputs, example_outputs, expected_results",
+    [
+        (  # Ordered match with an identical final state
+            {
+                "intermediate_steps": [
+                    (
+                        AgentAction(tool="action_1", tool_input={}, log=""),
+                        "observation1",
+                    ),
+                    (
+                        AgentAction(tool="action_2", tool_input={}, log=""),
+                        "observation1",
+                    ),
+                ],
+                "state": "final_state",
+            },
+            {
+                "expected_steps": ["action_1", "action_2"],
+                "state": "final_state",
+            },
+            {
+                "Intermediate steps correctness": 1,
+                "# steps / # expected steps": 1,
+                "Correct Final State": 1,
+            },
+        ),
+        (  # Wrong tools, wrong step count and a different final state
+            {
+                "intermediate_steps": [
+                    (
+                        AgentAction(tool="action_1", tool_input={}, log=""),
+                        "observation1",
+                    ),
+                    (
+                        AgentAction(tool="action_2", tool_input={}, log=""),
+                        "observation1",
+                    ),
+                ],
+                "state": "final_state",
+            },
+            {
+                "expected_steps": ["cat", "was", "here"],
+                "state": "another_state",
+            },
+            {
+                "Intermediate steps correctness": 0,
+                "# steps / # expected steps": 2 / 3,
+                "Correct Final State": 0,
+            },
+        ),
+        (  # Order ignored: same tools in a different order; state differs
+            {
+                "intermediate_steps": [
+                    (
+                        AgentAction(tool="action_2", tool_input={}, log=""),
+                        "observation1",
+                    ),
+                    (
+                        AgentAction(tool="action_1", tool_input={}, log=""),
+                        "observation1",
+                    ),
+                ],
+                "state": "final_state",
+            },
+            {
+                "expected_steps": ["action_1", "action_2"],
+                "order_matters": False,
+                "state": "different_state",
+            },
+            {
+                "Intermediate steps correctness": 1,
+                "# steps / # expected steps": 1.0,
+                "Correct Final State": 0,
+            },
+        ),
+        # Order ignored and neither side reports a state: no state result
+        (
+            {
+                "intermediate_steps": [
+                    (
+                        AgentAction(tool="action_2", tool_input={}, log=""),
+                        "observation1",
+                    ),
+                    (
+                        AgentAction(tool="action_1", tool_input={}, log=""),
+                        "observation1",
+                    ),
+                ],
+            },
+            {
+                "expected_steps": ["action_1", "action_2"],
+                "order_matters": False,
+            },
+            {
+                "Intermediate steps correctness": 1,
+                "# steps / # expected steps": 1.0,
+            },
+        ),
+    ],
+)
+def test_compare_outputs(run_outputs, example_outputs, expected_results):
+    """Check the per-metric scores that compare_outputs reports for each case."""
+    results = compare_outputs(run_outputs, example_outputs)["results"]
+    # Compare as a {key: score} mapping so the order of the results is ignored.
+    scores_by_key = {result.key: result.score for result in results}
+    assert scores_by_key == expected_results

Name	Multiverse Math
Type	ToolUsageTask
Dataset ID	https://smith.langchain.com/public/594f9f60-30a0-49bf-b075-f44beabf546a/d
Description	An environment that contains a few basic math operations, but with altered results. For example, mu...
	Intermediate steps correctness	# steps / # expected steps	correctness	execution_time	input.question	output.question	output.output	output.intermediate_steps	reference.reference	reference.expected_steps	num_expected_steps	actual_number_of_steps
0	0	15.0	0	38.76436	Add 2 and 3	Add 2 and 3	Agent stopped due to iteration limit or time l...	[(tool='add' tool_input={'a': 2, 'b': 3} log=\"...	6.20	[add]	1	15
1	0	15.0	0	38.76436	Subtract 3 from 2	Subtract 3 from 2	Agent stopped due to iteration limit or time l...	[(tool='subtract' tool_input={'a': 2, 'b': 3} ...	-4.00	[subtract]	1	15
2	0	9.0	1	38.76436	What is -5 if evaluated using the negate funct...	What is -5 if evaluated using the negate funct...	-5.0\\n-5.0	[(tool='negate' tool_input={'a': -5} log=\"\\nIn...	-5.00	[negate]	1	9
3	1	1.0	0	38.76436	what is the result of 2 to the power of 3?	what is the result of 2 to the power of 3?	The result of 2 to the power of 3 is 32.	[(tool='power' tool_input={'a': 2, 'b': 3} log...	32.00	[power]	1	1
4	0	7.5	0	38.76436	I ate 1 apple and 2 oranges every day for 7 da...	I ate 1 apple and 2 oranges every day for 7 da...	Agent stopped due to iteration limit or time l...	[(tool='add' tool_input={'a': 1, 'b': 2} log=\"...	32.34	[multiply, add]	2	15
	Intermediate steps correctness	# steps / # expected steps	correctness	execution_time	input.question	output.question	output.output	output.intermediate_steps	reference.reference	reference.expected_steps	num_expected_steps	actual_number_of_steps
0	1	1.0	1	5.771554	Add 2 and 3	Add 2 and 3	The sum of 2 and 3 in this alternate mathemati...	[(tool='add' tool_input={'a': 2, 'b': 3} log=\"...	6.20	[add]	1	1
1	1	1.0	0	5.771554	Subtract 3 from 2	Subtract 3 from 2	The result of subtracting 3 from 2 in this alt...	[(tool='subtract' tool_input={'a': 2, 'b': 3} ...	-4.00	[subtract]	1	1
2	1	1.0	1	5.771554	What is -5 if evaluated using the negate funct...	What is -5 if evaluated using the negate funct...	The result of evaluating -5 using the negate f...	[(tool='negate' tool_input={'a': -5} log=\"\\nIn...	-5.00	[negate]	1	1
3	1	1.0	0	5.771554	what is the result of 2 to the power of 3?	what is the result of 2 to the power of 3?	The result of 2 to the power of 3 is 32.	[(tool='power' tool_input={'a': 2, 'b': 3} log...	32.00	[power]	1	1
4	0	1.0	0	5.771554	I ate 1 apple and 2 oranges every day for 7 da...	I ate 1 apple and 2 oranges every day for 7 da...	You ate a total of 32.34 fruits.	[(tool='add' tool_input={'a': 1, 'b': 2} log=\"...	32.34	[multiply, add]	2	2
Name	Tool Usage - Relational Data
Type	ToolUsageTask
Dataset ID	https://smith.langchain.com/public/1d89f4b3-5f73-48cf-a127-2fdeb22f6d84/d
Description	Environment with fake data about users and their locations and favorite foods. The environment provides a set of tools that can be used to query the data. The objective of this task is to evaluate the ability to use the provided tools to answer questions about relational data. The dataset contains 21 examples of varying difficulty. The difficulty is measured by the number of tools that need to be used to answer the question. Each example is composed of a question, a reference answer, and information about the sequence in which tools should be used to answer the question. Success is measured by the ability to answer the question correctly and efficiently.
	Intermediate steps correctness	# steps / # expected steps	correctness	execution_time	input.question	output.question	output.output	output.intermediate_steps	reference.reference	reference.order_matters	reference.expected_steps	num_expected_steps	actual_number_of_steps
0	0	1.0	1	5.098939	do bob and alice live in the same city?	do bob and alice live in the same city?	No, Bob and Alice do not live in the same city...	[(tool='find_users_by_name' tool_input={'name'...	no	False	[find_users_by_name, get_user_location, get_ci...	5	5
1	0	0.0	0	5.098939	Is it likely that Donna is outside with an umb...	Is it likely that Donna is outside with an umb...	I'm sorry, but I don't have access to real-tim...	[]	yes	False	[find_users_by_name, get_user_location, get_cu...	4	0
2	1	1.0	1	5.098939	do alice and charlie use the same email provider?	do alice and charlie use the same email provider?	No, Alice and Charlie do not use the same emai...	[(tool='find_users_by_name' tool_input={'name'...	no	True	[find_users_by_name, get_user_email, get_user_...	3	3
3	0	0.0	0	5.098939	Is it likely that Donna is awake right now?	Is it likely that Donna is awake right now?	I'm sorry, but I don't have access to informat...	[]	yes	True	[find_users_by_name, get_user_location, get_cu...	3	0
4	0	1.0	1	5.098939	Donna is about to go outside. Does she need an...	Donna is about to go outside. Does she need an...	Donna is currently in a location where it is r...	[(tool='find_users_by_name' tool_input={'name'...	yes	True	[find_users_by_name, get_user_location, get_cu...	3	3
	Intermediate steps correctness	# steps / # expected steps	correctness	execution_time	input.question	output.question	output.output	output.intermediate_steps	reference.reference	reference.order_matters	reference.expected_steps	num_expected_steps	actual_number_of_steps
0	1	1.0	1	4.253613	do bob and alice live in the same city?	do bob and alice live in the same city?	No, Bob and Alice do not live in the same city...	[(tool='find_users_by_name' tool_input={'name'...	no	False	[find_users_by_name, get_user_location, get_ci...	5	5
1	0	0.0	0	4.253613	Is it likely that Donna is outside with an umb...	Is it likely that Donna is outside with an umb...	I'm sorry, but I don't have access to real-tim...	[]	yes	False	[find_users_by_name, get_user_location, get_cu...	4	0
2	1	1.0	1	4.253613	do alice and charlie use the same email provider?	do alice and charlie use the same email provider?	No, Alice and Charlie do not use the same emai...	[(tool='find_users_by_name' tool_input={'name'...	no	True	[find_users_by_name, get_user_email, get_user_...	3	3
3	0	0.0	0	4.253613	Is it likely that Donna is awake right now?	Is it likely that Donna is awake right now?	I'm sorry, but I don't have access to informat...	[]	yes	True	[find_users_by_name, get_user_location, get_cu...	3	0
4	0	1.0	1	4.253613	Donna is about to go outside. Does she need an...	Donna is about to go outside. Does she need an...	Donna is at location 4 and the current weather...	[(tool='find_users_by_name' tool_input={'name'...	yes	True	[find_users_by_name, get_user_location, get_cu...	3	3
Name	Tool Usage - Typewriter (1 tool)
Type	ToolUsageTask
Dataset ID	59577193-8938-4ccf-92a7-e8a96bcf4f86
Description	Environment with a single tool that accepts a single letter as input, and prints it on a piece of virtual paper. The objective of this task is to evaluate the ability of the model to use the provided tools to repeat a given input string. For example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order. The dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.
	Intermediate steps correctness	# steps / # expected steps	Correct Final State	correctness	input.question	output.question	output.output	output.intermediate_steps	output.state	reference.state	reference.reference	reference.expected_steps	num_expected_steps	actual_number_of_steps
0	1	1.0	1	1	communication	communication	communication	[(tool='type_letter' tool_input={'letter': 'c'...	communication	communication	communication	[type_letter, type_letter, type_letter, type_l...	13	13
1	1	1.0	1	1	information	information	information	[(tool='type_letter' tool_input={'letter': 'i'...	information	information	information	[type_letter, type_letter, type_letter, type_l...	11	11
2	1	1.0	1	1	dictionary	dictionary	dictionary	[(tool='type_letter' tool_input={'letter': 'd'...	dictionary	dictionary	dictionary	[type_letter, type_letter, type_letter, type_l...	10	10
3	1	1.0	1	1	university	university	u\\nn\\ni\\nv\\ne\\nr\\ns\\ni\\nt\\ny	[(tool='type_letter' tool_input={'letter': 'u'...	university	university	university	[type_letter, type_letter, type_letter, type_l...	10	10
4	1	1.0	1	1	keyboard	keyboard	keyboard	[(tool='type_letter' tool_input={'letter': 'k'...	keyboard	keyboard	keyboard	[type_letter, type_letter, type_letter, type_l...	8	8
	Intermediate steps correctness	# steps / # expected steps	Correct Final State	correctness	execution_time	input.question	output.question	output.output	output.intermediate_steps	output.state	reference.state	reference.reference	reference.expected_steps	num_expected_steps	actual_number_of_steps
0	0	15.0	0	0	34.058961	a	a	Agent stopped due to iteration limit or time l...	[(tool='type_letter' tool_input={'letter': 'a'...	aaaaaaaaaaaaaaa	a	a	[type_letter]	1	15
1	1	1.0	1	1	34.058961	aa	aa	aa\\naa	[(tool='type_letter' tool_input={'letter': 'a'...	aa	aa	aa	[type_letter, type_letter]	2	2
2	1	1.0	1	0	34.058961	aaa	aaa	a\\na	[(tool='type_letter' tool_input={'letter': 'a'...	aaa	aaa	aaa	[type_letter, type_letter, type_letter]	3	3
3	1	1.0	1	0	34.058961	aaaa	aaaa	a\\na	[(tool='type_letter' tool_input={'letter': 'a'...	aaaa	aaaa	aaaa	[type_letter, type_letter, type_letter, type_l...	4	4
4	1	1.0	1	1	34.058961	dog	dog	d\\no\\ng	[(tool='type_letter' tool_input={'letter': 'd'...	dog	dog	dog	[type_letter, type_letter, type_letter]	3	3
Name	Tool Usage - Typewriter (26 tools)
Type	ToolUsageTask
Dataset ID	128af05e-aa00-4e3b-a958-d166dd450581
Description	Environment with 26 tools, where each tool represents a letter of the alphabet.\n", - "\n", - "The objective of this task is to evaluate the model's ability to use tools\n", - "for a simple repetition task.\n", - "\n", - "For example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\n", - "\n", - "The dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\n", - "\n", - "This is a variation of the typewriter task, where 26 parameterless tools are\n", - "given instead of a single tool that takes a letter as an argument.