From ca14a4ef3b81e96baa7e2201aed117fca74b11b1 Mon Sep 17 00:00:00 2001
From: William FH <13333726+hinthornw@users.noreply.github.com>
Date: Tue, 21 Nov 2023 19:26:19 -0800
Subject: [PATCH] Wfh/rerun notebooks (#69)

Add links to the notebooks
---
 docs/source/notebooks/datasets.ipynb          |   2 +-
 docs/source/notebooks/extraction/email.ipynb  |   4 +-
 docs/source/notebooks/getting_started.ipynb   |  42 +-
 .../retrieval/comparing_techniques.ipynb      |  43 +-
 .../retrieval/langchain_docs_qa.ipynb         | 392 ++++++++++++++----
 5 files changed, 376 insertions(+), 107 deletions(-)
diff --git a/docs/source/notebooks/datasets.ipynb b/docs/source/notebooks/datasets.ipynb
index 949971b6..f09cd528 100644
--- a/docs/source/notebooks/datasets.ipynb
+++ b/docs/source/notebooks/datasets.ipynb
@@ -195,7 +195,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.6"
+   "version": "3.11.2"
   }
  },
  "nbformat": 4,
diff --git a/docs/source/notebooks/extraction/email.ipynb b/docs/source/notebooks/extraction/email.ipynb
index f04da650..e4e9b4a8 100644
--- a/docs/source/notebooks/extraction/email.ipynb
+++ b/docs/source/notebooks/extraction/email.ipynb
@@ -30,8 +30,8 @@
     "import os\n",
     "\n",
     "# Get your API key from https://smith.langchain.com/settings\n",
-    "# os.environ[\"LANGCHAIN_API_KEY\"] = \"sk-...\"\n",
-    "# os.environ[\"OPENAI_API_KEY\"] = \"sk-...\""
+    "os.environ[\"LANGCHAIN_API_KEY\"] = \"sk-...\"\n",
+    "os.environ[\"OPENAI_API_KEY\"] = \"sk-...\""
    ]
   },
   {
diff --git a/docs/source/notebooks/getting_started.ipynb b/docs/source/notebooks/getting_started.ipynb
index 1e3912d7..a037432b 100644
--- a/docs/source/notebooks/getting_started.ipynb
+++ b/docs/source/notebooks/getting_started.ipynb
@@ -66,7 +66,7 @@
    },
    "outputs": [],
    "source": [
-    "# %pip install -U --quiet langchain_benchmarks langchain langsmith"
+    "%pip install -U --quiet langchain_benchmarks langchain langsmith"
    ]
   },
   {
@@ -81,7 +81,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 1,
    "id": "c516c725-c968-422b-aedf-e360d4f7774c",
    "metadata": {
     "tags": []
@@ -92,26 +92,28 @@
       "text/html": [
        "<table>\n",
        "<thead>\n",
-       "<tr><th>Name                            </th><th>Type          </th><th>Dataset ID                                                               </th><th>Description  </th></tr>\n",
+       "<tr><th>Name                              </th><th>Type          </th><th>Dataset ID                                                                                                                                                 </th><th>Description  </th></tr>\n",
        "</thead>\n",
        "<tbody>\n",
-       "<tr><td>Tool Usage - Typewriter (1 func)</td><td>ToolUsageTask </td><td>placeholder                                                              </td><td>Environment with a single function that accepts a single letter as input, and &quot;prints&quot; it on a piece of paper.\n",
+       "<tr><td>Tool Usage - Typewriter (1 tool)  </td><td>ToolUsageTask </td><td><a href=\"https://smith.langchain.com/public/59577193-8938-4ccf-92a7-e8a96bcf4f86/d\" target=\"_blank\" rel=\"noopener\">59577193-8938-4ccf-92a7-e8a96bcf4f86</a></td><td>Environment with a single tool that accepts a single letter as input, and prints it on a piece of virtual paper.\n",
        "\n",
-       "The objective of this task is to evaluate the ability to use the provided tools to repeat a given input string.\n",
+       "The objective of this task is to evaluate the ability of the model to use the provided tools to repeat a given input string.\n",
        "\n",
-       "For example, if the string is &#x27;abc&#x27;, the tools &#x27;a&#x27;, &#x27;b&#x27;, and &#x27;c&#x27; must be invoked in that order.\n",
+       "For example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\n",
        "\n",
        "The dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.              </td></tr>\n",
-       "<tr><td>Tool Usage - Typewriter         </td><td>ToolUsageTask </td><td>placeholder                                                              </td><td>Environment with 26 functions each representing a letter of the alphabet.\n",
+       "<tr><td>Tool Usage - Typewriter (26 tools)</td><td>ToolUsageTask </td><td><a href=\"https://smith.langchain.com/public/128af05e-aa00-4e3b-a958-d166dd450581/d\" target=\"_blank\" rel=\"noopener\">128af05e-aa00-4e3b-a958-d166dd450581</a></td><td>Environment with 26 tools, each representing a letter of the alphabet.\n",
        "\n",
-       "In this variation of the typewriter task, there are 26 parameterless functions, where each function represents a letter of the alphabet (instead of a single function that takes a letter as an argument).\n",
+       "The objective of this task is to evaluate the model's ability to use tools\n",
+       "for a simple repetition task.\n",
        "\n",
-       "The object is to evaluate the ability of use the functions to repeat the given string.\n",
+       "For example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\n",
        "\n",
-       "For example, if the string is &#x27;abc&#x27;, the tools &#x27;a&#x27;, &#x27;b&#x27;, and &#x27;c&#x27; must be invoked in that order.\n",
+       "The dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\n",
        "\n",
-       "The dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.              </td></tr>\n",
-       "<tr><td>Tool Usage - Relational Data    </td><td>ToolUsageTask </td><td>e95d45da-aaa3-44b3-ba2b-7c15ff6e46f5                                     </td><td>Environment with fake data about users and their locations and favorite foods.\n",
+       "This is a variation of the typewriter task, where 26 parameterless tools are\n",
+       "given instead of a single tool that takes a letter as an argument.              </td></tr>\n",
+       "<tr><td>Tool Usage - Relational Data      </td><td>ToolUsageTask </td><td><a href=\"https://smith.langchain.com/public/1d89f4b3-5f73-48cf-a127-2fdeb22f6d84/d\" target=\"_blank\" rel=\"noopener\">1d89f4b3-5f73-48cf-a127-2fdeb22f6d84</a></td><td>Environment with fake data about users and their locations and favorite foods.\n",
        "\n",
        "The environment provides a set of tools that can be used to query the data.\n",
        "\n",
@@ -122,25 +124,25 @@
        "Each example is composed of a question, a reference answer, and information about the sequence in which tools should be used to answer the question.\n",
        "\n",
        "Success is measured by the ability to answer the question correctly, and efficiently.              </td></tr>\n",
-       "<tr><td>Multiverse Math                 </td><td>ToolUsageTask </td><td>placeholder                                                              </td><td>An environment that contains a few basic math operations, but with altered results.\n",
+       "<tr><td>Multiverse Math                   </td><td>ToolUsageTask </td><td><a href=\"https://smith.langchain.com/public/594f9f60-30a0-49bf-b075-f44beabf546a/d\" target=\"_blank\" rel=\"noopener\">594f9f60-30a0-49bf-b075-f44beabf546a</a></td><td>An environment that contains a few basic math operations, but with altered results.\n",
        "\n",
        "For example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. The basic operations retain some basic properties, such as commutativity, associativity, and distributivity; however, the results are different than expected.\n",
        "\n",
        "The objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math.              </td></tr>\n",
-       "<tr><td>Email Extraction                </td><td>ExtractionTask</td><td>https://smith.langchain.com/public/36bdfe7d-3cd1-4b36-b957-d12d95810a2b/d</td><td>A dataset of 42 real emails deduped from a spam folder, with semantic HTML tags removed, as well as a script for initial extraction and formatting of other emails from an arbitrary .mbox file like the one exported by Gmail.\n",
+       "<tr><td>Email Extraction                  </td><td>ExtractionTask</td><td><a href=\"https://smith.langchain.com/public/a1742786-bde5-4f51-a1d8-e148e5251ddb/d\" target=\"_blank\" rel=\"noopener\">a1742786-bde5-4f51-a1d8-e148e5251ddb</a></td><td>A dataset of 42 real emails deduped from a spam folder, with semantic HTML tags removed, as well as a script for initial extraction and formatting of other emails from an arbitrary .mbox file like the one exported by Gmail.\n",
        "\n",
        "Some additional cleanup of the data was done by hand after the initial pass.\n",
        "\n",
        "See https://github.com/jacoblee93/oss-model-extraction-evals.              </td></tr>\n",
-       "<tr><td>LangChain Docs Q&amp;A              </td><td>RetrievalTask </td><td>452ccafc-18e1-4314-885b-edd735f17b9d                                     </td><td>Questions and answers based on a snapshot of the LangChain python docs.\n",
+       "<tr><td>LangChain Docs Q&A                </td><td>RetrievalTask </td><td><a href=\"https://smith.langchain.com/public/452ccafc-18e1-4314-885b-edd735f17b9d/d\" target=\"_blank\" rel=\"noopener\">452ccafc-18e1-4314-885b-edd735f17b9d</a></td><td>Questions and answers based on a snapshot of the LangChain python docs.\n",
        "\n",
        "The environment provides the documents and the retriever information.\n",
        "\n",
        "Each example is composed of a question and reference answer.\n",
        "\n",
        "Success is measured based on the accuracy of the answer relative to the reference answer.\n",
-       "We also measure the faithfulness of the model&#x27;s response relative to the retrieved documents (if any).              </td></tr>\n",
-       "<tr><td>Semi-structured Earnings        </td><td>RetrievalTask </td><td>c47d9617-ab99-4d6e-a6e6-92b8daf85a7d                                     </td><td>Questions and answers based on PDFs containing tables and charts.\n",
+       "We also measure the faithfulness of the model's response relative to the retrieved documents (if any).              </td></tr>\n",
+       "<tr><td>Semi-structured Reports           </td><td>RetrievalTask </td><td><a href=\"https://smith.langchain.com/public/c47d9617-ab99-4d6e-a6e6-92b8daf85a7d/d\" target=\"_blank\" rel=\"noopener\">c47d9617-ab99-4d6e-a6e6-92b8daf85a7d</a></td><td>Questions and answers based on PDFs containing tables and charts.\n",
        "\n",
        "The task provides the raw documents as well as factory methods to easily index them\n",
        "and create a retriever.\n",
@@ -148,15 +150,15 @@
        "Each example is composed of a question and reference answer.\n",
        "\n",
        "Success is measured based on the accuracy of the answer relative to the reference answer.\n",
-       "We also measure the faithfulness of the model&#x27;s response relative to the retrieved documents (if any).              </td></tr>\n",
+       "We also measure the faithfulness of the model's response relative to the retrieved documents (if any).              </td></tr>\n",
        "</tbody>\n",
        "</table>"
       ],
       "text/plain": [
-       "Registry(tasks=[ToolUsageTask(name='Tool Usage - Typewriter (1 func)', dataset_id='placeholder', description='Environment with a single function that accepts a single letter as input, and \"prints\" it on a piece of paper.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to repeat a given input string.\\n\\nFor example, if the string is \\'abc\\', the tools \\'a\\', \\'b\\', and \\'c\\' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n', create_environment=<function get_environment at 0x137749da0>, instructions=\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. Please invoke the function with a single letter at a time.\"), ToolUsageTask(name='Tool Usage - Typewriter', dataset_id='placeholder', description=\"Environment with 26 functions each representing a letter of the alphabet.\\n\\nIn this variation of the typewriter task, there are 26 parameterless functions, where each function represents a letter of the alphabet (instead of a single function that takes a letter as an argument).\\n\\nThe object is to evaluate the ability of use the functions to repeat the given string.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\", create_environment=<function get_environment at 0x13774a160>, instructions=\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. 
Please invoke the functions without any arguments.\"), ToolUsageTask(name='Tool Usage - Relational Data', dataset_id='e95d45da-aaa3-44b3-ba2b-7c15ff6e46f5', description='Environment with fake data about users and their locations and favorite foods.\\n\\nThe environment provides a set of tools that can be used to query the data.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to answer questions about relational data.\\n\\nThe dataset contains 21 examples of varying difficulty. The difficulty is measured by the number of tools that need to be used to answer the question.\\n\\nEach example is composed of a question, a reference answer, and information about the sequence in which tools should be used to answer the question.\\n\\nSuccess is measured by the ability to answer the question correctly, and efficiently.\\n', create_environment=<function get_environment at 0x1377498a0>, instructions=\"Please answer the user's question by using the tools provided. Do not guess the answer. Keep in mind that entities like users,foods and locations have both a name and an ID, which are not the same.\"), ToolUsageTask(name='Multiverse Math', dataset_id='placeholder', description='An environment that contains a few basic math operations, but with altered results.\\n\\nFor example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. The basic operations retain some basic properties, such as commutativity, associativity, and distributivity; however, the results are different than expected.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math.\\n', create_environment=<function get_environment at 0x137749260>, instructions='You are requested to solve math questions in an alternate mathematical universe. 
The rules of association, commutativity, and distributivity still apply, but the operations have been altered to yield different results than expected. Solve the given math questions using the provided tools. Do not guess the answer.'), ExtractionTask(name='Email Extraction', dataset_id='https://smith.langchain.com/public/36bdfe7d-3cd1-4b36-b957-d12d95810a2b/d', description='A dataset of 42 real emails deduped from a spam folder, with semantic HTML tags removed, as well as a script for initial extraction and formatting of other emails from an arbitrary .mbox file like the one exported by Gmail.\\n\\nSome additional cleanup of the data was done by hand after the initial pass.\\n\\nSee https://github.com/jacoblee93/oss-model-extraction-evals.\\n    ', schema=<class 'langchain_benchmarks.extraction.tasks.email_task.Email'>, instructions=ChatPromptTemplate(input_variables=['email'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='You are an expert researcher.')), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['email'], template='What can you tell me about the following email? Make sure to extract the question in the correct format. 
Here is the email:\\n ```\\n{email}\\n```'))])), RetrievalTask(name='LangChain Docs Q&A', dataset_id='452ccafc-18e1-4314-885b-edd735f17b9d', description=\"Questions and answers based on a snapshot of the LangChain python docs.\\n\\nThe environment provides the documents and the retriever information.\\n\\nEach example is composed of a question and reference answer.\\n\\nSuccess is measured based on the accuracy of the answer relative to the reference answer.\\nWe also measure the faithfulness of the model's response relative to the retrieved documents (if any).\\n\", retriever_factories={'basic': <function _chroma_retriever_factory at 0x137748900>, 'parent-doc': <function _chroma_parent_document_retriever_factory at 0x1377489a0>, 'hyde': <function _chroma_hyde_retriever_factory at 0x137748a40>}, architecture_factories={'conversational-retrieval-qa': <function default_response_chain at 0x133e11940>}, get_docs=<function load_cached_docs at 0x133e11260>), RetrievalTask(name='Semi-structured Earnings', dataset_id='c47d9617-ab99-4d6e-a6e6-92b8daf85a7d', description=\"Questions and answers based on PDFs containing tables and charts.\\n\\nThe task provides the raw documents as well as factory methods to easily index them\\nand create a retriever.\\n\\nEach example is composed of a question and reference answer.\\n\\nSuccess is measured based on the accuracy of the answer relative to the reference answer.\\nWe also measure the faithfulness of the model's response relative to the retrieved documents (if any).\\n\", retriever_factories={'basic': <function _chroma_retriever_factory at 0x137748fe0>, 'parent-doc': <function _chroma_parent_document_retriever_factory at 0x137749080>, 'hyde': <function _chroma_hyde_retriever_factory at 0x137749120>}, architecture_factories={}, get_docs=<function load_docs at 0x137748f40>)])"
+       "Registry(tasks=[ToolUsageTask(name='Tool Usage - Typewriter (1 tool)', dataset_id='https://smith.langchain.com/public/59577193-8938-4ccf-92a7-e8a96bcf4f86/d', description=\"Environment with a single tool that accepts a single letter as input, and prints it on a piece of virtual paper.\\n\\nThe objective of this task is to evaluate the ability of the model to use the provided tools to repeat a given input string.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\", create_environment=<function get_environment at 0x11a085440>, instructions=\"Repeat the given string using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must print the letters 'a', 'b', and 'c' one at a time and in that order. \"), ToolUsageTask(name='Tool Usage - Typewriter (26 tools)', dataset_id='https://smith.langchain.com/public/128af05e-aa00-4e3b-a958-d166dd450581/d', description=\"Environment with 26 tools each tool represents a letter of the alphabet.\\n\\nThe objective of this task is to evaluate the model's ability the use tools\\nfor a simple repetition task.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\\nThis is a variation of the typer writer task, where 26 parameterless tools are\\ngiven instead of a single tool that takes a letter as an argument.\\n\", create_environment=<function get_environment at 0x11a085940>, instructions=\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. 
Please invoke the functions without any arguments.\"), ToolUsageTask(name='Tool Usage - Relational Data', dataset_id='https://smith.langchain.com/public/1d89f4b3-5f73-48cf-a127-2fdeb22f6d84/d', description='Environment with fake data about users and their locations and favorite foods.\\n\\nThe environment provides a set of tools that can be used to query the data.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to answer questions about relational data.\\n\\nThe dataset contains 21 examples of varying difficulty. The difficulty is measured by the number of tools that need to be used to answer the question.\\n\\nEach example is composed of a question, a reference answer, and information about the sequence in which tools should be used to answer the question.\\n\\nSuccess is measured by the ability to answer the question correctly, and efficiently.\\n', create_environment=<function get_environment at 0x11a084f40>, instructions=\"Please answer the user's question by using the tools provided. Do not guess the answer. Keep in mind that entities like users,foods and locations have both a name and an ID, which are not the same.\"), ToolUsageTask(name='Multiverse Math', dataset_id='https://smith.langchain.com/public/594f9f60-30a0-49bf-b075-f44beabf546a/d', description='An environment that contains a few basic math operations, but with altered results.\\n\\nFor example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. The basic operations retain some basic properties, such as commutativity, associativity, and distributivity; however, the results are different than expected.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math.\\n', create_environment=<function get_environment at 0x11a084860>, instructions='You are requested to solve math questions in an alternate mathematical universe. 
The operations have been altered to yield different results than expected. Do not guess the answer or rely on your  innate knowledge of math. Use the provided tools to answer the question. While associativity and commutativity apply, distributivity does not. Answer the question using the fewest possible tools. Only include the numeric response without any clarifications.'), ExtractionTask(name='Email Extraction', dataset_id='https://smith.langchain.com/public/a1742786-bde5-4f51-a1d8-e148e5251ddb/d', description='A dataset of 42 real emails deduped from a spam folder, with semantic HTML tags removed, as well as a script for initial extraction and formatting of other emails from an arbitrary .mbox file like the one exported by Gmail.\\n\\nSome additional cleanup of the data was done by hand after the initial pass.\\n\\nSee https://github.com/jacoblee93/oss-model-extraction-evals.\\n    ', schema=<class 'langchain_benchmarks.extraction.tasks.email_task.Email'>, instructions=ChatPromptTemplate(input_variables=['input'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='You are an expert researcher.')), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], template='What can you tell me about the following email? Make sure to extract the question in the correct format. 
Here is the email:\\n ```\\n{input}\\n```'))])), RetrievalTask(name='LangChain Docs Q&A', dataset_id='https://smith.langchain.com/public/452ccafc-18e1-4314-885b-edd735f17b9d/d', description=\"Questions and answers based on a snapshot of the LangChain python docs.\\n\\nThe environment provides the documents and the retriever information.\\n\\nEach example is composed of a question and reference answer.\\n\\nSuccess is measured based on the accuracy of the answer relative to the reference answer.\\nWe also measure the faithfulness of the model's response relative to the retrieved documents (if any).\\n\", retriever_factories={'basic': <function _chroma_retriever_factory at 0x11a053a60>, 'parent-doc': <function _chroma_parent_document_retriever_factory at 0x11a053b00>, 'hyde': <function _chroma_hyde_retriever_factory at 0x11a053ba0>}, architecture_factories={'conversational-retrieval-qa': <function default_response_chain at 0x10f8334c0>}, get_docs=<function load_cached_docs at 0x10f833060>), RetrievalTask(name='Semi-structured Reports', dataset_id='https://smith.langchain.com/public/c47d9617-ab99-4d6e-a6e6-92b8daf85a7d/d', description=\"Questions and answers based on PDFs containing tables and charts.\\n\\nThe task provides the raw documents as well as factory methods to easily index them\\nand create a retriever.\\n\\nEach example is composed of a question and reference answer.\\n\\nSuccess is measured based on the accuracy of the answer relative to the reference answer.\\nWe also measure the faithfulness of the model's response relative to the retrieved documents (if any).\\n\", retriever_factories={'basic': <function _chroma_retriever_factory at 0x11a084220>, 'parent-doc': <function _chroma_parent_document_retriever_factory at 0x11a0842c0>, 'hyde': <function _chroma_hyde_retriever_factory at 0x11a084360>}, architecture_factories={}, get_docs=<function load_docs at 0x11a084180>)])"
       ]
      },
-     "execution_count": 4,
+     "execution_count": 1,
      "metadata": {},
      "output_type": "execute_result"
     }
diff --git a/docs/source/notebooks/retrieval/comparing_techniques.ipynb b/docs/source/notebooks/retrieval/comparing_techniques.ipynb
index 99d66546..19b644b5 100644
--- a/docs/source/notebooks/retrieval/comparing_techniques.ipynb
+++ b/docs/source/notebooks/retrieval/comparing_techniques.ipynb
@@ -85,7 +85,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "id": "b39159d0-9ea1-414f-a9d8-4a7b22b3d2cc",
    "metadata": {
     "tags": []
@@ -97,12 +97,49 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "id": "3644d211-382e-41aa-b282-21b01d28fc35",
    "metadata": {
     "tags": []
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<table>\n",
+       "<thead>\n",
+       "<tr><th>Name                   </th><th>Type         </th><th>Dataset ID                                                                                                                                                 </th><th>Description  </th></tr>\n",
+       "</thead>\n",
+       "<tbody>\n",
+       "<tr><td>LangChain Docs Q&A     </td><td>RetrievalTask</td><td><a href=\"https://smith.langchain.com/public/452ccafc-18e1-4314-885b-edd735f17b9d/d\" target=\"_blank\" rel=\"noopener\">452ccafc-18e1-4314-885b-edd735f17b9d</a></td><td>Questions and answers based on a snapshot of the LangChain python docs.\n",
+       "\n",
+       "The environment provides the documents and the retriever information.\n",
+       "\n",
+       "Each example is composed of a question and reference answer.\n",
+       "\n",
+       "Success is measured based on the accuracy of the answer relative to the reference answer.\n",
+       "We also measure the faithfulness of the model's response relative to the retrieved documents (if any).              </td></tr>\n",
+       "<tr><td>Semi-structured Reports</td><td>RetrievalTask</td><td><a href=\"https://smith.langchain.com/public/c47d9617-ab99-4d6e-a6e6-92b8daf85a7d/d\" target=\"_blank\" rel=\"noopener\">c47d9617-ab99-4d6e-a6e6-92b8daf85a7d</a></td><td>Questions and answers based on PDFs containing tables and charts.\n",
+       "\n",
+       "The task provides the raw documents as well as factory methods to easily index them\n",
+       "and create a retriever.\n",
+       "\n",
+       "Each example is composed of a question and reference answer.\n",
+       "\n",
+       "Success is measured based on the accuracy of the answer relative to the reference answer.\n",
+       "We also measure the faithfulness of the model's response relative to the retrieved documents (if any).              </td></tr>\n",
+       "</tbody>\n",
+       "</table>"
+      ],
+      "text/plain": [
+       "Registry(tasks=[RetrievalTask(name='LangChain Docs Q&A', dataset_id='https://smith.langchain.com/public/452ccafc-18e1-4314-885b-edd735f17b9d/d', description=\"Questions and answers based on a snapshot of the LangChain python docs.\\n\\nThe environment provides the documents and the retriever information.\\n\\nEach example is composed of a question and reference answer.\\n\\nSuccess is measured based on the accuracy of the answer relative to the reference answer.\\nWe also measure the faithfulness of the model's response relative to the retrieved documents (if any).\\n\", retriever_factories={'basic': <function _chroma_retriever_factory at 0x12aae2840>, 'parent-doc': <function _chroma_parent_document_retriever_factory at 0x12aae28e0>, 'hyde': <function _chroma_hyde_retriever_factory at 0x12aae2980>}, architecture_factories={'conversational-retrieval-qa': <function default_response_chain at 0x12a1be020>}, get_docs=<function load_cached_docs at 0x12a1bdb20>), RetrievalTask(name='Semi-structured Reports', dataset_id='https://smith.langchain.com/public/c47d9617-ab99-4d6e-a6e6-92b8daf85a7d/d', description=\"Questions and answers based on PDFs containing tables and charts.\\n\\nThe task provides the raw documents as well as factory methods to easily index them\\nand create a retriever.\\n\\nEach example is composed of a question and reference answer.\\n\\nSuccess is measured based on the accuracy of the answer relative to the reference answer.\\nWe also measure the faithfulness of the model's response relative to the retrieved documents (if any).\\n\", retriever_factories={'basic': <function _chroma_retriever_factory at 0x12aae3060>, 'parent-doc': <function _chroma_parent_document_retriever_factory at 0x12aae3100>, 'hyde': <function _chroma_hyde_retriever_factory at 0x12aae31a0>}, architecture_factories={}, get_docs=<function load_docs at 0x12aae2fc0>)])"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "registry.filter(Type=\"RetrievalTask\")"
    ]
diff --git a/docs/source/notebooks/retrieval/langchain_docs_qa.ipynb b/docs/source/notebooks/retrieval/langchain_docs_qa.ipynb
index fc7a5d22..6a7df06c 100644
--- a/docs/source/notebooks/retrieval/langchain_docs_qa.ipynb
+++ b/docs/source/notebooks/retrieval/langchain_docs_qa.ipynb
@@ -18,7 +18,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "id": "9f44b59b",
    "metadata": {
     "tags": []
@@ -39,7 +39,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "id": "62b518cf-99fb-44be-8acb-ee0a8ba62272",
    "metadata": {},
    "outputs": [],
@@ -49,7 +49,7 @@
     "os.environ[\"LANGCHAIN_ENDPOINT\"] = \"https://api.smith.langchain.com\"\n",
     "os.environ[\"LANGCHAIN_API_KEY\"] = \"sk-...\"  # Your API key\n",
     "\n",
-    "# # Silence warnings from HuggingFace\n",
+    "# Silence warnings from HuggingFace\n",
     "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\""
    ]
   },
@@ -65,7 +65,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
    "id": "b39159d0-9ea1-414f-a9d8-4a7b22b3d2cc",
    "metadata": {
     "tags": []
@@ -77,7 +77,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
    "id": "3644d211-382e-41aa-b282-21b01d28fc35",
    "metadata": {
     "tags": []
@@ -88,18 +88,18 @@
       "text/html": [
        "<table>\n",
        "<thead>\n",
-       "<tr><th>Name                    </th><th>Type         </th><th>Dataset ID                          </th><th>Description  </th></tr>\n",
+       "<tr><th>Name                   </th><th>Type         </th><th>Dataset ID                                                                                                                                                 </th><th>Description  </th></tr>\n",
        "</thead>\n",
        "<tbody>\n",
-       "<tr><td>LangChain Docs Q&amp;A      </td><td>RetrievalTask</td><td>452ccafc-18e1-4314-885b-edd735f17b9d</td><td>Questions and answers based on a snapshot of the LangChain python docs.\n",
+       "<tr><td>LangChain Docs Q&A     </td><td>RetrievalTask</td><td><a href=\"https://smith.langchain.com/public/452ccafc-18e1-4314-885b-edd735f17b9d/d\" target=\"_blank\" rel=\"noopener\">452ccafc-18e1-4314-885b-edd735f17b9d</a></td><td>Questions and answers based on a snapshot of the LangChain python docs.\n",
        "\n",
        "The environment provides the documents and the retriever information.\n",
        "\n",
        "Each example is composed of a question and reference answer.\n",
        "\n",
        "Success is measured based on the accuracy of the answer relative to the reference answer.\n",
-       "We also measure the faithfulness of the model&#x27;s response relative to the retrieved documents (if any).              </td></tr>\n",
-       "<tr><td>Semi-structured Earnings</td><td>RetrievalTask</td><td>c47d9617-ab99-4d6e-a6e6-92b8daf85a7d</td><td>Questions and answers based on PDFs containing tables and charts.\n",
+       "We also measure the faithfulness of the model's response relative to the retrieved documents (if any).              </td></tr>\n",
+       "<tr><td>Semi-structured Reports</td><td>RetrievalTask</td><td><a href=\"https://smith.langchain.com/public/c47d9617-ab99-4d6e-a6e6-92b8daf85a7d/d\" target=\"_blank\" rel=\"noopener\">c47d9617-ab99-4d6e-a6e6-92b8daf85a7d</a></td><td>Questions and answers based on PDFs containing tables and charts.\n",
        "\n",
        "The task provides the raw documents as well as factory methods to easily index them\n",
        "and create a retriever.\n",
@@ -107,15 +107,15 @@
        "Each example is composed of a question and reference answer.\n",
        "\n",
        "Success is measured based on the accuracy of the answer relative to the reference answer.\n",
-       "We also measure the faithfulness of the model&#x27;s response relative to the retrieved documents (if any).              </td></tr>\n",
+       "We also measure the faithfulness of the model's response relative to the retrieved documents (if any).              </td></tr>\n",
        "</tbody>\n",
        "</table>"
       ],
       "text/plain": [
-       "Registry(tasks=[RetrievalTask(name='LangChain Docs Q&A', dataset_id='452ccafc-18e1-4314-885b-edd735f17b9d', description=\"Questions and answers based on a snapshot of the LangChain python docs.\\n\\nThe environment provides the documents and the retriever information.\\n\\nEach example is composed of a question and reference answer.\\n\\nSuccess is measured based on the accuracy of the answer relative to the reference answer.\\nWe also measure the faithfulness of the model's response relative to the retrieved documents (if any).\\n\", retriever_factories={'basic': <function _chroma_retriever_factory at 0x126948e00>, 'parent-doc': <function _chroma_parent_document_retriever_factory at 0x126948ea0>, 'hyde': <function _chroma_hyde_retriever_factory at 0x126948f40>}, architecture_factories={'conversational-retrieval-qa': <function default_response_chain at 0x12600e0c0>}, get_docs=<function load_cached_docs at 0x102d17240>), RetrievalTask(name='Semi-structured Earnings', dataset_id='c47d9617-ab99-4d6e-a6e6-92b8daf85a7d', description=\"Questions and answers based on PDFs containing tables and charts.\\n\\nThe task provides the raw documents as well as factory methods to easily index them\\nand create a retriever.\\n\\nEach example is composed of a question and reference answer.\\n\\nSuccess is measured based on the accuracy of the answer relative to the reference answer.\\nWe also measure the faithfulness of the model's response relative to the retrieved documents (if any).\\n\", retriever_factories={'basic': <function _chroma_retriever_factory at 0x1269496c0>, 'parent-doc': <function _chroma_parent_document_retriever_factory at 0x126949760>, 'hyde': <function _chroma_hyde_retriever_factory at 0x126949800>}, architecture_factories={}, get_docs=<function load_docs at 0x126949620>)])"
+       "Registry(tasks=[RetrievalTask(name='LangChain Docs Q&A', dataset_id='https://smith.langchain.com/public/452ccafc-18e1-4314-885b-edd735f17b9d/d', description=\"Questions and answers based on a snapshot of the LangChain python docs.\\n\\nThe environment provides the documents and the retriever information.\\n\\nEach example is composed of a question and reference answer.\\n\\nSuccess is measured based on the accuracy of the answer relative to the reference answer.\\nWe also measure the faithfulness of the model's response relative to the retrieved documents (if any).\\n\", retriever_factories={'basic': <function _chroma_retriever_factory at 0x16a5802c0>, 'parent-doc': <function _chroma_parent_document_retriever_factory at 0x16a580360>, 'hyde': <function _chroma_hyde_retriever_factory at 0x16a580400>}, architecture_factories={'conversational-retrieval-qa': <function default_response_chain at 0x12fc37ce0>}, get_docs=<function load_cached_docs at 0x12fc377e0>), RetrievalTask(name='Semi-structured Reports', dataset_id='https://smith.langchain.com/public/c47d9617-ab99-4d6e-a6e6-92b8daf85a7d/d', description=\"Questions and answers based on PDFs containing tables and charts.\\n\\nThe task provides the raw documents as well as factory methods to easily index them\\nand create a retriever.\\n\\nEach example is composed of a question and reference answer.\\n\\nSuccess is measured based on the accuracy of the answer relative to the reference answer.\\nWe also measure the faithfulness of the model's response relative to the retrieved documents (if any).\\n\", retriever_factories={'basic': <function _chroma_retriever_factory at 0x16a580ae0>, 'parent-doc': <function _chroma_parent_document_retriever_factory at 0x16a580b80>, 'hyde': <function _chroma_hyde_retriever_factory at 0x16a580c20>}, architecture_factories={}, get_docs=<function load_docs at 0x16a580a40>)])"
       ]
      },
-     "execution_count": 3,
+     "execution_count": 4,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -127,7 +127,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
    "id": "671282f8-c455-4390-b018-e53bbd833093",
    "metadata": {
     "tags": []
@@ -138,9 +138,9 @@
       "text/html": [
        "<table>\n",
        "<tbody>\n",
-       "<tr><td>Name                  </td><td>LangChain Docs Q&amp;A                        </td></tr>\n",
-       "<tr><td>Type                  </td><td>RetrievalTask                             </td></tr>\n",
-       "<tr><td>Dataset ID            </td><td>452ccafc-18e1-4314-885b-edd735f17b9d      </td></tr>\n",
+       "<tr><td>Name                  </td><td>LangChain Docs Q&A                                                                                                                                         </td></tr>\n",
+       "<tr><td>Type                  </td><td>RetrievalTask                                                                                                                                              </td></tr>\n",
+       "<tr><td>Dataset ID            </td><td><a href=\"https://smith.langchain.com/public/452ccafc-18e1-4314-885b-edd735f17b9d/d\" target=\"_blank\" rel=\"noopener\">452ccafc-18e1-4314-885b-edd735f17b9d</a></td></tr>\n",
        "<tr><td>Description           </td><td>Questions and answers based on a snapshot of the LangChain python docs.\n",
        "\n",
        "The environment provides the documents and the retriever information.\n",
@@ -148,18 +148,18 @@
        "Each example is composed of a question and reference answer.\n",
        "\n",
        "Success is measured based on the accuracy of the answer relative to the reference answer.\n",
-       "We also measure the faithfulness of the model&#x27;s response relative to the retrieved documents (if any).                                           </td></tr>\n",
-       "<tr><td>Retriever Factories   </td><td>basic, parent-doc, hyde                   </td></tr>\n",
-       "<tr><td>Architecture Factories</td><td>conversational-retrieval-qa               </td></tr>\n",
-       "<tr><td>get_docs              </td><td>&lt;function load_cached_docs at 0x102d17240&gt;</td></tr>\n",
+       "We also measure the faithfulness of the model's response relative to the retrieved documents (if any).                                                                                                                                                            </td></tr>\n",
+       "<tr><td>Retriever Factories   </td><td>basic, parent-doc, hyde                                                                                                                                    </td></tr>\n",
+       "<tr><td>Architecture Factories</td><td>conversational-retrieval-qa                                                                                                                                </td></tr>\n",
+       "<tr><td>get_docs              </td><td><function load_cached_docs at 0x12fc377e0>                                                                                                                 </td></tr>\n",
        "</tbody>\n",
        "</table>"
       ],
       "text/plain": [
-       "RetrievalTask(name='LangChain Docs Q&A', dataset_id='452ccafc-18e1-4314-885b-edd735f17b9d', description=\"Questions and answers based on a snapshot of the LangChain python docs.\\n\\nThe environment provides the documents and the retriever information.\\n\\nEach example is composed of a question and reference answer.\\n\\nSuccess is measured based on the accuracy of the answer relative to the reference answer.\\nWe also measure the faithfulness of the model's response relative to the retrieved documents (if any).\\n\", retriever_factories={'basic': <function _chroma_retriever_factory at 0x126948e00>, 'parent-doc': <function _chroma_parent_document_retriever_factory at 0x126948ea0>, 'hyde': <function _chroma_hyde_retriever_factory at 0x126948f40>}, architecture_factories={'conversational-retrieval-qa': <function default_response_chain at 0x12600e0c0>}, get_docs=<function load_cached_docs at 0x102d17240>)"
+       "RetrievalTask(name='LangChain Docs Q&A', dataset_id='https://smith.langchain.com/public/452ccafc-18e1-4314-885b-edd735f17b9d/d', description=\"Questions and answers based on a snapshot of the LangChain python docs.\\n\\nThe environment provides the documents and the retriever information.\\n\\nEach example is composed of a question and reference answer.\\n\\nSuccess is measured based on the accuracy of the answer relative to the reference answer.\\nWe also measure the faithfulness of the model's response relative to the retrieved documents (if any).\\n\", retriever_factories={'basic': <function _chroma_retriever_factory at 0x16a5802c0>, 'parent-doc': <function _chroma_parent_document_retriever_factory at 0x16a580360>, 'hyde': <function _chroma_hyde_retriever_factory at 0x16a580400>}, architecture_factories={'conversational-retrieval-qa': <function default_response_chain at 0x12fc37ce0>}, get_docs=<function load_cached_docs at 0x12fc377e0>)"
       ]
      },
-     "execution_count": 4,
+     "execution_count": 5,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -171,7 +171,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 6,
    "id": "70369f67-deb4-467a-801a-6d38c3d0460d",
    "metadata": {
     "tags": []
@@ -192,7 +192,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 7,
    "id": "c58247f5-b9bd-4cc5-9632-78bc21bb10b4",
    "metadata": {
     "tags": []
@@ -201,7 +201,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "2c698bb93d4548fcacbb3c08990642c6",
+       "model_id": "4fc93abde64d43c99ffa0eb5374dca86",
        "version_major": 2,
        "version_minor": 0
       },
@@ -228,7 +228,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": 8,
    "id": "41e64350-63a7-4e7d-8e03-7dc459c444cc",
    "metadata": {
     "tags": []
@@ -290,7 +290,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 33,
+   "execution_count": 9,
    "id": "10a1fca9-d356-4cff-93a9-c4f63944e57d",
    "metadata": {
     "tags": []
@@ -299,10 +299,10 @@
     {
      "data": {
       "text/plain": [
-       "' Based on the LangChain documentation, the LangChain Expression Language (LCEL) is a declarative way to easily compose chains together. Key things to know about LCEL:\\n\\n- It was designed to support putting prototypes into production with no code changes, from simple \"prompt + LLM\" chains to complex chains with hundreds of steps. \\n\\n- It provides streaming support - when you build chains with LCEL you get the best possible time-to-first-token, streaming tokens from the LLM to output parsers incrementally.\\n\\n- Chains built with LCEL can be called synchronously (like in a notebook) or asynchronously (like in a production server), using the same code.\\n\\n- LCEL automatically runs steps in parallel when possible, minimizing latency. \\n\\n- It supports configuring retries and fallbacks to make chains more reliable.\\n\\n- You can access intermediate chain results before the final output is ready, which helps with debugging and user notifications.\\n\\n- LCEL chains get input and output validation schemas automatically.\\n\\n- All steps are logged to LangSmith for observability.\\n\\n- Chains authored in LCEL can be easily deployed with LangServe.\\n\\nSo in summary, the LangChain Expression Language is'"
+       "' Unfortunately, I do not have any documents to reference information about expression language. As an AI assistant without access to external information, I can only respond based on the content provided to me. If you could provide me with some documents that describe expression language, I would be happy to summarize or share information from those documents to answer your question. Please feel free to provide any additional context or documents that may allow me to assist further.'"
       ]
      },
-     "execution_count": 33,
+     "execution_count": 9,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -323,7 +323,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": 10,
    "id": "513042fe-2878-44f8-ae84-05b9d521c1de",
    "metadata": {
     "tags": []
@@ -337,7 +337,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 39,
+   "execution_count": 11,
    "id": "aab7514e-a6ef-4c21-b90f-d9cbefcf5af1",
    "metadata": {
     "tags": []
@@ -347,42 +347,51 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "View the evaluation results for project 'test-essential-pot-37' at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/d57047ac-d6e7-49c2-bd52-e2158e2ce56f?eval=true\n",
+      "View the evaluation results for project 'only-man-12' at:\n",
+      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/c915fa65-2be0-42ad-8038-9a9fe3a0a879?eval=true\n",
       "\n",
       "View all tests for Dataset LangChain Docs Q&A at:\n",
       "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/3f29798f-5939-4643-bd99-008ca66b72ed\n",
-      "[>                                                 ] 1/86"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
-      "To disable this warning, you can either:\n",
-      "\t- Avoid using `tokenizers` before the fork if possible\n",
-      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
-      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
-      "To disable this warning, you can either:\n",
-      "\t- Avoid using `tokenizers` before the fork if possible\n",
-      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
-      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
-      "To disable this warning, you can either:\n",
-      "\t- Avoid using `tokenizers` before the fork if possible\n",
-      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
       "[------------------------------------------------->] 86/86\n",
       " Eval quantiles:\n",
-      "                               0.25      0.5      0.75      mean      mode\n",
-      "embedding_cosine_distance  0.086465  0.12291  0.159078  0.128270  0.046088\n",
-      "score_string:accuracy      0.500000  0.70000  1.000000  0.641860  0.700000\n",
-      "faithfulness               0.700000  1.00000  1.000000  0.860465  1.000000\n"
+      "                                          inputs.question  \\\n",
+      "count                                                  86   \n",
+      "unique                                                 86   \n",
+      "top     in code, how can i add a system message at the...   \n",
+      "freq                                                    1   \n",
+      "mean                                                  NaN   \n",
+      "std                                                   NaN   \n",
+      "min                                                   NaN   \n",
+      "25%                                                   NaN   \n",
+      "50%                                                   NaN   \n",
+      "75%                                                   NaN   \n",
+      "max                                                   NaN   \n",
+      "\n",
+      "        feedback.embedding_cosine_distance  feedback.score_string:accuracy  \\\n",
+      "count                            86.000000                       86.000000   \n",
+      "unique                                 NaN                             NaN   \n",
+      "top                                    NaN                             NaN   \n",
+      "freq                                   NaN                             NaN   \n",
+      "mean                              0.190418                        0.177907   \n",
+      "std                               0.045291                        0.176503   \n",
+      "min                               0.074583                        0.100000   \n",
+      "25%                               0.154158                        0.100000   \n",
+      "50%                               0.190138                        0.100000   \n",
+      "75%                               0.222883                        0.100000   \n",
+      "max                               0.289047                        1.000000   \n",
+      "\n",
+      "        feedback.faithfulness error  execution_time  \n",
+      "count               82.000000     0       86.000000  \n",
+      "unique                    NaN     0             NaN  \n",
+      "top                       NaN   NaN             NaN  \n",
+      "freq                      NaN   NaN             NaN  \n",
+      "mean                 0.939024   NaN        9.605034  \n",
+      "std                  0.199231   NaN        3.323173  \n",
+      "min                  0.100000   NaN        4.748375  \n",
+      "25%                  1.000000   NaN        7.521995  \n",
+      "50%                  1.000000   NaN        8.637612  \n",
+      "75%                  1.000000   NaN       10.116563  \n",
+      "max                  1.000000   NaN       18.631366  \n"
      ]
     }
    ],
@@ -400,12 +409,191 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 12,
    "id": "e86578d5-be5c-4bcd-9dcb-35280eeed3f9",
    "metadata": {
     "tags": []
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>inputs.question</th>\n",
+       "      <th>feedback.embedding_cosine_distance</th>\n",
+       "      <th>feedback.score_string:accuracy</th>\n",
+       "      <th>feedback.faithfulness</th>\n",
+       "      <th>error</th>\n",
+       "      <th>execution_time</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>count</th>\n",
+       "      <td>86</td>\n",
+       "      <td>86.000000</td>\n",
+       "      <td>86.000000</td>\n",
+       "      <td>82.000000</td>\n",
+       "      <td>0</td>\n",
+       "      <td>86.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>unique</th>\n",
+       "      <td>86</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>top</th>\n",
+       "      <td>in code, how can i add a system message at the...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>freq</th>\n",
+       "      <td>1</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>mean</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0.190418</td>\n",
+       "      <td>0.177907</td>\n",
+       "      <td>0.939024</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>9.605034</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>std</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0.045291</td>\n",
+       "      <td>0.176503</td>\n",
+       "      <td>0.199231</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>3.323173</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>min</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0.074583</td>\n",
+       "      <td>0.100000</td>\n",
+       "      <td>0.100000</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>4.748375</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>25%</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0.154158</td>\n",
+       "      <td>0.100000</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>7.521995</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>50%</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0.190138</td>\n",
+       "      <td>0.100000</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>8.637612</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>75%</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0.222883</td>\n",
+       "      <td>0.100000</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>10.116563</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>max</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0.289047</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>18.631366</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                          inputs.question  \\\n",
+       "count                                                  86   \n",
+       "unique                                                 86   \n",
+       "top     in code, how can i add a system message at the...   \n",
+       "freq                                                    1   \n",
+       "mean                                                  NaN   \n",
+       "std                                                   NaN   \n",
+       "min                                                   NaN   \n",
+       "25%                                                   NaN   \n",
+       "50%                                                   NaN   \n",
+       "75%                                                   NaN   \n",
+       "max                                                   NaN   \n",
+       "\n",
+       "        feedback.embedding_cosine_distance  feedback.score_string:accuracy  \\\n",
+       "count                            86.000000                       86.000000   \n",
+       "unique                                 NaN                             NaN   \n",
+       "top                                    NaN                             NaN   \n",
+       "freq                                   NaN                             NaN   \n",
+       "mean                              0.190418                        0.177907   \n",
+       "std                               0.045291                        0.176503   \n",
+       "min                               0.074583                        0.100000   \n",
+       "25%                               0.154158                        0.100000   \n",
+       "50%                               0.190138                        0.100000   \n",
+       "75%                               0.222883                        0.100000   \n",
+       "max                               0.289047                        1.000000   \n",
+       "\n",
+       "        feedback.faithfulness error  execution_time  \n",
+       "count               82.000000     0       86.000000  \n",
+       "unique                    NaN     0             NaN  \n",
+       "top                       NaN   NaN             NaN  \n",
+       "freq                      NaN   NaN             NaN  \n",
+       "mean                 0.939024   NaN        9.605034  \n",
+       "std                  0.199231   NaN        3.323173  \n",
+       "min                  0.100000   NaN        4.748375  \n",
+       "25%                  1.000000   NaN        7.521995  \n",
+       "50%                  1.000000   NaN        8.637612  \n",
+       "75%                  1.000000   NaN       10.116563  \n",
+       "max                  1.000000   NaN       18.631366  "
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "test_run.get_aggregate_feedback()"
    ]
@@ -422,7 +610,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 13,
    "id": "f4d2e139-2653-4f7b-944b-91ef52f43d3e",
    "metadata": {
     "tags": []
@@ -435,7 +623,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 14,
    "id": "6e938e5b-c430-4ab1-ab7d-84c33f83bdc5",
    "metadata": {},
    "outputs": [],
@@ -445,29 +633,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 15,
    "id": "9f9be718-64f0-4706-9527-240a1cdb3ecb",
    "metadata": {
     "tags": []
    },
    "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
-      "To disable this warning, you can either:\n",
-      "\t- Avoid using `tokenizers` before the fork if possible\n",
-      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
-     ]
-    },
     {
      "data": {
       "text/plain": [
-       "' <b>\\n\\n- Expression language (LCEL) is a declarative way to easily compose chains together in LangChain. It was designed to support putting prototypes into production with no code changes, from simple \"prompt + LLM\" chains to complex chains with hundreds of steps [0].\\n\\n- Key features of LCEL include streaming support, asynchronous support, optimized parallel execution, configurable retries and fallbacks, access to intermediate results, input/output validation schemas, seamless integration with LangSmith tracing and LangServe deployment [0].\\n\\n- The LangChain cookbook contains examples of common tasks using LCEL like chaining prompts and LLMs, adding retrieval, querying databases, writing code, adding memory and moderation, etc [1].\\n\\n</b>\\n\\n[0] - https://langchain.org/docs/expression_language\\n[1] - https://langchain.org/docs/expression_language/cookbook'"
+       "\" <context>\\n\\nNo search results have been provided.\\n\\n</context>\\n\\nHmm, I'm not sure.\""
       ]
      },
-     "execution_count": 16,
+     "execution_count": 15,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -486,12 +664,64 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 34,
+   "execution_count": 16,
    "id": "e9c013e2-241a-4def-9aa6-ccb34273eeb9",
    "metadata": {
     "tags": []
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "View the evaluation results for project 'bold-increase-73' at:\n",
+      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/ec954070-30ee-47e3-acf7-698f3c70c20f?eval=true\n",
+      "\n",
+      "View all tests for Dataset LangChain Docs Q&A at:\n",
+      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/3f29798f-5939-4643-bd99-008ca66b72ed\n",
+      "[------------------------------------------------->] 86/86\n",
+      " Eval quantiles:\n",
+      "                                          inputs.question  \\\n",
+      "count                                                  86   \n",
+      "unique                                                 86   \n",
+      "top     in code, how can i add a system message at the...   \n",
+      "freq                                                    1   \n",
+      "mean                                                  NaN   \n",
+      "std                                                   NaN   \n",
+      "min                                                   NaN   \n",
+      "25%                                                   NaN   \n",
+      "50%                                                   NaN   \n",
+      "75%                                                   NaN   \n",
+      "max                                                   NaN   \n",
+      "\n",
+      "        feedback.embedding_cosine_distance  feedback.score_string:accuracy  \\\n",
+      "count                            86.000000                       86.000000   \n",
+      "unique                                 NaN                             NaN   \n",
+      "top                                    NaN                             NaN   \n",
+      "freq                                   NaN                             NaN   \n",
+      "mean                              0.194922                        0.202326   \n",
+      "std                               0.053206                        0.180833   \n",
+      "min                               0.071091                        0.100000   \n",
+      "25%                               0.152611                        0.100000   \n",
+      "50%                               0.191152                        0.100000   \n",
+      "75%                               0.223243                        0.300000   \n",
+      "max                               0.328942                        0.700000   \n",
+      "\n",
+      "        feedback.faithfulness error  execution_time  \n",
+      "count               85.000000     0       86.000000  \n",
+      "unique                    NaN     0             NaN  \n",
+      "top                       NaN   NaN             NaN  \n",
+      "freq                      NaN   NaN             NaN  \n",
+      "mean                 0.824706   NaN        9.743528  \n",
+      "std                  0.289048   NaN        4.399820  \n",
+      "min                  0.100000   NaN        4.208420  \n",
+      "25%                  0.700000   NaN        6.018769  \n",
+      "50%                  1.000000   NaN        8.086979  \n",
+      "75%                  1.000000   NaN       14.081097  \n",
+      "max                  1.000000   NaN       18.671288  \n"
+     ]
+    }
+   ],
    "source": [
     "from functools import partial\n",
     "\n",

Name	Type	Dataset ID	Description
Name	Type	Dataset ID	Description
Tool Usage - Typewriter (1 func)	ToolUsageTask	placeholder	Environment with a single function that accepts a single letter as input, and "prints" it on a piece of paper.\n", + "
Tool Usage - Typewriter (1 tool)	ToolUsageTask	59577193-8938-4ccf-92a7-e8a96bcf4f86	Environment with a single tool that accepts a single letter as input, and prints it on a piece of virtual paper.\n", "\n", - "The objective of this task is to evaluate the ability to use the provided tools to repeat a given input string.\n", + "The objective of this task is to evaluate the ability of the model to use the provided tools to repeat a given input string.\n", "\n", - "For example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\n", + "For example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\n", "\n", "The dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.
Tool Usage - Typewriter	ToolUsageTask	placeholder	Environment with 26 functions each representing a letter of the alphabet.\n", + "
Tool Usage - Typewriter (26 tools)	ToolUsageTask	128af05e-aa00-4e3b-a958-d166dd450581	Environment with 26 tools, where each tool represents a letter of the alphabet.\n", "\n", - "In this variation of the typewriter task, there are 26 parameterless functions, where each function represents a letter of the alphabet (instead of a single function that takes a letter as an argument).\n", + "The objective of this task is to evaluate the model's ability to use tools\n", + "for a simple repetition task.\n", "\n", - "The object is to evaluate the ability of use the functions to repeat the given string.\n", + "For example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\n", "\n", - "For example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\n", + "The dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\n", "\n", - "The dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.
Tool Usage - Relational Data	ToolUsageTask	e95d45da-aaa3-44b3-ba2b-7c15ff6e46f5	Environment with fake data about users and their locations and favorite foods.\n", + "This is a variation of the typewriter task, where 26 parameterless tools are\n", + "given instead of a single tool that takes a letter as an argument.
Tool Usage - Relational Data	ToolUsageTask	1d89f4b3-5f73-48cf-a127-2fdeb22f6d84	Environment with fake data about users and their locations and favorite foods.\n", "\n", "The environment provides a set of tools that can be used to query the data.\n", "\n", @@ -122,25 +124,25 @@ "Each example is composed of a question, a reference answer, and information about the sequence in which tools should be used to answer the question.\n", "\n", "Success is measured by the ability to answer the question correctly, and efficiently.
Multiverse Math	ToolUsageTask	placeholder	An environment that contains a few basic math operations, but with altered results.\n", + "
Multiverse Math	ToolUsageTask	594f9f60-30a0-49bf-b075-f44beabf546a	An environment that contains a few basic math operations, but with altered results.\n", "\n", "For example, multiplication of 53 will be re-interpreted as 53*1.1. The basic operations retain some basic properties, such as commutativity, associativity, and distributivity; however, the results are different than expected.\n", "\n", "The objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math.
Email Extraction	ExtractionTask	https://smith.langchain.com/public/36bdfe7d-3cd1-4b36-b957-d12d95810a2b/d	A dataset of 42 real emails deduped from a spam folder, with semantic HTML tags removed, as well as a script for initial extraction and formatting of other emails from an arbitrary .mbox file like the one exported by Gmail.\n", + "
Email Extraction	ExtractionTask	a1742786-bde5-4f51-a1d8-e148e5251ddb	A dataset of 42 real emails deduped from a spam folder, with semantic HTML tags removed, as well as a script for initial extraction and formatting of other emails from an arbitrary .mbox file like the one exported by Gmail.\n", "\n", "Some additional cleanup of the data was done by hand after the initial pass.\n", "\n", "See https://github.com/jacoblee93/oss-model-extraction-evals.
LangChain Docs Q&A	RetrievalTask	452ccafc-18e1-4314-885b-edd735f17b9d	Questions and answers based on a snapshot of the LangChain python docs.\n", + "
LangChain Docs Q&A	RetrievalTask	452ccafc-18e1-4314-885b-edd735f17b9d	Questions and answers based on a snapshot of the LangChain python docs.\n", "\n", "The environment provides the documents and the retriever information.\n", "\n", "Each example is composed of a question and reference answer.\n", "\n", "Success is measured based on the accuracy of the answer relative to the reference answer.\n", - "We also measure the faithfulness of the model's response relative to the retrieved documents (if any).
Semi-structured Earnings	RetrievalTask	c47d9617-ab99-4d6e-a6e6-92b8daf85a7d	Questions and answers based on PDFs containing tables and charts.\n", + "We also measure the faithfulness of the model's response relative to the retrieved documents (if any).
Semi-structured Reports	RetrievalTask	c47d9617-ab99-4d6e-a6e6-92b8daf85a7d	Questions and answers based on PDFs containing tables and charts.\n", "\n", "The task provides the raw documents as well as factory methods to easily index them\n", "and create a retriever.\n", @@ -148,15 +150,15 @@ "Each example is composed of a question and reference answer.\n", "\n", "Success is measured based on the accuracy of the answer relative to the reference answer.\n", - "We also measure the faithfulness of the model's response relative to the retrieved documents (if any).
	inputs.question	feedback.embedding_cosine_distance	feedback.score_string:accuracy	feedback.faithfulness	error	execution_time
count	86	86.000000	86.000000	82.000000	0	86.000000
unique	86	NaN	NaN	NaN	0	NaN
top	in code, how can i add a system message at the...	NaN	NaN	NaN	NaN	NaN
freq	1	NaN	NaN	NaN	NaN	NaN
mean	NaN	0.190418	0.177907	0.939024	NaN	9.605034
std	NaN	0.045291	0.176503	0.199231	NaN	3.323173
min	NaN	0.074583	0.100000	0.100000	NaN	4.748375
25%	NaN	0.154158	0.100000	1.000000	NaN	7.521995
50%	NaN	0.190138	0.100000	1.000000	NaN	8.637612
75%	NaN	0.222883	0.100000	1.000000	NaN	10.116563
max	NaN	0.289047	1.000000	1.000000	NaN	18.631366