From ca14a4ef3b81e96baa7e2201aed117fca74b11b1 Mon Sep 17 00:00:00 2001 From: William FH <13333726+hinthornw@users.noreply.github.com> Date: Tue, 21 Nov 2023 19:26:19 -0800 Subject: [PATCH] Wfh/rerun notebooks (#69) Add links to the notebooks --- docs/source/notebooks/datasets.ipynb | 2 +- docs/source/notebooks/extraction/email.ipynb | 4 +- docs/source/notebooks/getting_started.ipynb | 42 +- .../retrieval/comparing_techniques.ipynb | 43 +- .../retrieval/langchain_docs_qa.ipynb | 392 ++++++++++++++---- 5 files changed, 376 insertions(+), 107 deletions(-) diff --git a/docs/source/notebooks/datasets.ipynb b/docs/source/notebooks/datasets.ipynb index 949971b6..f09cd528 100644 --- a/docs/source/notebooks/datasets.ipynb +++ b/docs/source/notebooks/datasets.ipynb @@ -195,7 +195,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.6" + "version": "3.11.2" } }, "nbformat": 4, diff --git a/docs/source/notebooks/extraction/email.ipynb b/docs/source/notebooks/extraction/email.ipynb index f04da650..e4e9b4a8 100644 --- a/docs/source/notebooks/extraction/email.ipynb +++ b/docs/source/notebooks/extraction/email.ipynb @@ -30,8 +30,8 @@ "import os\n", "\n", "# Get your API key from https://smith.langchain.com/settings\n", - "# os.environ[\"LANGCHAIN_API_KEY\"] = \"sk-...\"\n", - "# os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"" + "os.environ[\"LANGCHAIN_API_KEY\"] = \"sk-...\"\n", + "os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"" ] }, { diff --git a/docs/source/notebooks/getting_started.ipynb b/docs/source/notebooks/getting_started.ipynb index 1e3912d7..a037432b 100644 --- a/docs/source/notebooks/getting_started.ipynb +++ b/docs/source/notebooks/getting_started.ipynb @@ -66,7 +66,7 @@ }, "outputs": [], "source": [ - "# %pip install -U --quiet langchain_benchmarks langchain langsmith" + "%pip install -U --quiet langchain_benchmarks langchain langsmith" ] }, { @@ -81,7 +81,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 1, "id": "c516c725-c968-422b-aedf-e360d4f7774c", "metadata": { "tags": [] @@ -92,26 +92,28 @@ "text/html": [ "\n", "\n", - "\n", + "\n", "\n", "\n", - "\n", - "\n", - "\n", + "\n", - "\n", - "\n", - "\n", - "\n", + "\n", + "We also measure the faithfulness of the model's response relative to the retrieved documents (if any). \n", "\n", "
Name Type Dataset ID Description
Name Type Dataset ID Description
Tool Usage - Typewriter (1 func)ToolUsageTask placeholder Environment with a single function that accepts a single letter as input, and "prints" it on a piece of paper.\n", + "
Tool Usage - Typewriter (1 tool) ToolUsageTask 59577193-8938-4ccf-92a7-e8a96bcf4f86Environment with a single tool that accepts a single letter as input, and prints it on a piece of virtual paper.\n", "\n", - "The objective of this task is to evaluate the ability to use the provided tools to repeat a given input string.\n", + "The objective of this task is to evaluate the ability of the model to use the provided tools to repeat a given input string.\n", "\n", - "For example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\n", + "For example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\n", "\n", "The dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.
Tool Usage - Typewriter ToolUsageTask placeholder Environment with 26 functions each representing a letter of the alphabet.\n", + "
Tool Usage - Typewriter (26 tools)ToolUsageTask 128af05e-aa00-4e3b-a958-d166dd450581Environment with 26 tools; each tool represents a letter of the alphabet.\n",
    "\n",
    "The objective of this task is to evaluate the model's ability to use tools\n",
    "for a simple repetition task.\n",
    "\n",
    "For example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\n",
    "\n",
    "The dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\n",
    "\n",
    "This is a variation of the typewriter task, where 26 parameterless tools are\n",
    "given instead of a single tool that takes a letter as an argument.
Tool Usage - Relational Data ToolUsageTask e95d45da-aaa3-44b3-ba2b-7c15ff6e46f5 Environment with fake data about users and their locations and favorite foods.\n", + "This is a variation of the typer writer task, where 26 parameterless tools are\n", + "given instead of a single tool that takes a letter as an argument.
Tool Usage - Relational Data ToolUsageTask 1d89f4b3-5f73-48cf-a127-2fdeb22f6d84Environment with fake data about users and their locations and favorite foods.\n", "\n", "The environment provides a set of tools that can be used to query the data.\n", "\n", @@ -122,25 +124,25 @@ "Each example is composed of a question, a reference answer, and information about the sequence in which tools should be used to answer the question.\n", "\n", "Success is measured by the ability to answer the question correctly, and efficiently.
Multiverse Math ToolUsageTask placeholder An environment that contains a few basic math operations, but with altered results.\n", + "
Multiverse Math ToolUsageTask 594f9f60-30a0-49bf-b075-f44beabf546aAn environment that contains a few basic math operations, but with altered results.\n", "\n", "For example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. The basic operations retain some basic properties, such as commutativity, associativity, and distributivity; however, the results are different than expected.\n", "\n", "The objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math.
Email Extraction ExtractionTaskhttps://smith.langchain.com/public/36bdfe7d-3cd1-4b36-b957-d12d95810a2b/dA dataset of 42 real emails deduped from a spam folder, with semantic HTML tags removed, as well as a script for initial extraction and formatting of other emails from an arbitrary .mbox file like the one exported by Gmail.\n", + "
Email Extraction ExtractionTaska1742786-bde5-4f51-a1d8-e148e5251ddbA dataset of 42 real emails deduped from a spam folder, with semantic HTML tags removed, as well as a script for initial extraction and formatting of other emails from an arbitrary .mbox file like the one exported by Gmail.\n", "\n", "Some additional cleanup of the data was done by hand after the initial pass.\n", "\n", "See https://github.com/jacoblee93/oss-model-extraction-evals.
LangChain Docs Q&A RetrievalTask 452ccafc-18e1-4314-885b-edd735f17b9d Questions and answers based on a snapshot of the LangChain python docs.\n", + "
LangChain Docs Q&A RetrievalTask 452ccafc-18e1-4314-885b-edd735f17b9dQuestions and answers based on a snapshot of the LangChain python docs.\n", "\n", "The environment provides the documents and the retriever information.\n", "\n", "Each example is composed of a question and reference answer.\n", "\n", "Success is measured based on the accuracy of the answer relative to the reference answer.\n", - "We also measure the faithfulness of the model's response relative to the retrieved documents (if any).
Semi-structured Earnings RetrievalTask c47d9617-ab99-4d6e-a6e6-92b8daf85a7d Questions and answers based on PDFs containing tables and charts.\n", + "We also measure the faithfulness of the model's response relative to the retrieved documents (if any).
Semi-structured Reports RetrievalTask c47d9617-ab99-4d6e-a6e6-92b8daf85a7dQuestions and answers based on PDFs containing tables and charts.\n", "\n", "The task provides the raw documents as well as factory methods to easily index them\n", "and create a retriever.\n", @@ -148,15 +150,15 @@ "Each example is composed of a question and reference answer.\n", "\n", "Success is measured based on the accuracy of the answer relative to the reference answer.\n", - "We also measure the faithfulness of the model's response relative to the retrieved documents (if any).
" ], "text/plain": [ - "Registry(tasks=[ToolUsageTask(name='Tool Usage - Typewriter (1 func)', dataset_id='placeholder', description='Environment with a single function that accepts a single letter as input, and \"prints\" it on a piece of paper.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to repeat a given input string.\\n\\nFor example, if the string is \\'abc\\', the tools \\'a\\', \\'b\\', and \\'c\\' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n', create_environment=, instructions=\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. Please invoke the function with a single letter at a time.\"), ToolUsageTask(name='Tool Usage - Typewriter', dataset_id='placeholder', description=\"Environment with 26 functions each representing a letter of the alphabet.\\n\\nIn this variation of the typewriter task, there are 26 parameterless functions, where each function represents a letter of the alphabet (instead of a single function that takes a letter as an argument).\\n\\nThe object is to evaluate the ability of use the functions to repeat the given string.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\", create_environment=, instructions=\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. Please invoke the functions without any arguments.\"), ToolUsageTask(name='Tool Usage - Relational Data', dataset_id='e95d45da-aaa3-44b3-ba2b-7c15ff6e46f5', description='Environment with fake data about users and their locations and favorite foods.\\n\\nThe environment provides a set of tools that can be used to query the data.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to answer questions about relational data.\\n\\nThe dataset contains 21 examples of varying difficulty. The difficulty is measured by the number of tools that need to be used to answer the question.\\n\\nEach example is composed of a question, a reference answer, and information about the sequence in which tools should be used to answer the question.\\n\\nSuccess is measured by the ability to answer the question correctly, and efficiently.\\n', create_environment=, instructions=\"Please answer the user's question by using the tools provided. Do not guess the answer. Keep in mind that entities like users,foods and locations have both a name and an ID, which are not the same.\"), ToolUsageTask(name='Multiverse Math', dataset_id='placeholder', description='An environment that contains a few basic math operations, but with altered results.\\n\\nFor example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. 
The basic operations retain some basic properties, such as commutativity, associativity, and distributivity; however, the results are different than expected.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math.\\n', create_environment=, instructions='You are requested to solve math questions in an alternate mathematical universe. The rules of association, commutativity, and distributivity still apply, but the operations have been altered to yield different results than expected. Solve the given math questions using the provided tools. Do not guess the answer.'), ExtractionTask(name='Email Extraction', dataset_id='https://smith.langchain.com/public/36bdfe7d-3cd1-4b36-b957-d12d95810a2b/d', description='A dataset of 42 real emails deduped from a spam folder, with semantic HTML tags removed, as well as a script for initial extraction and formatting of other emails from an arbitrary .mbox file like the one exported by Gmail.\\n\\nSome additional cleanup of the data was done by hand after the initial pass.\\n\\nSee https://github.com/jacoblee93/oss-model-extraction-evals.\\n ', schema=, instructions=ChatPromptTemplate(input_variables=['email'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='You are an expert researcher.')), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['email'], template='What can you tell me about the following email? Make sure to extract the question in the correct format. Here is the email:\\n ```\\n{email}\\n```'))])), RetrievalTask(name='LangChain Docs Q&A', dataset_id='452ccafc-18e1-4314-885b-edd735f17b9d', description=\"Questions and answers based on a snapshot of the LangChain python docs.\\n\\nThe environment provides the documents and the retriever information.\\n\\nEach example is composed of a question and reference answer.\\n\\nSuccess is measured based on the accuracy of the answer relative to the reference answer.\\nWe also measure the faithfulness of the model's response relative to the retrieved documents (if any).\\n\", retriever_factories={'basic': , 'parent-doc': , 'hyde': }, architecture_factories={'conversational-retrieval-qa': }, get_docs=), RetrievalTask(name='Semi-structured Earnings', dataset_id='c47d9617-ab99-4d6e-a6e6-92b8daf85a7d', description=\"Questions and answers based on PDFs containing tables and charts.\\n\\nThe task provides the raw documents as well as factory methods to easily index them\\nand create a retriever.\\n\\nEach example is composed of a question and reference answer.\\n\\nSuccess is measured based on the accuracy of the answer relative to the reference answer.\\nWe also measure the faithfulness of the model's response relative to the retrieved documents (if any).\\n\", retriever_factories={'basic': , 'parent-doc': , 'hyde': }, architecture_factories={}, get_docs=)])" + "Registry(tasks=[ToolUsageTask(name='Tool Usage - Typewriter (1 tool)', dataset_id='https://smith.langchain.com/public/59577193-8938-4ccf-92a7-e8a96bcf4f86/d', description=\"Environment with a single tool that accepts a single letter as input, and prints it on a piece of virtual paper.\\n\\nThe objective of this task is to evaluate the ability of the model to use the provided tools to repeat a given input string.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. 
The difficulty is measured by the length of the string.\\n\", create_environment=, instructions=\"Repeat the given string using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must print the letters 'a', 'b', and 'c' one at a time and in that order. \"), ToolUsageTask(name='Tool Usage - Typewriter (26 tools)', dataset_id='https://smith.langchain.com/public/128af05e-aa00-4e3b-a958-d166dd450581/d', description=\"Environment with 26 tools; each tool represents a letter of the alphabet.\\n\\nThe objective of this task is to evaluate the model's ability to use tools\\nfor a simple repetition task.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\\nThis is a variation of the typewriter task, where 26 parameterless tools are\\ngiven instead of a single tool that takes a letter as an argument.\\n\", create_environment=, instructions=\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. Please invoke the functions without any arguments.\"), ToolUsageTask(name='Tool Usage - Relational Data', dataset_id='https://smith.langchain.com/public/1d89f4b3-5f73-48cf-a127-2fdeb22f6d84/d', description='Environment with fake data about users and their locations and favorite foods.\\n\\nThe environment provides a set of tools that can be used to query the data.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to answer questions about relational data.\\n\\nThe dataset contains 21 examples of varying difficulty. The difficulty is measured by the number of tools that need to be used to answer the question.\\n\\nEach example is composed of a question, a reference answer, and information about the sequence in which tools should be used to answer the question.\\n\\nSuccess is measured by the ability to answer the question correctly, and efficiently.\\n', create_environment=, instructions=\"Please answer the user's question by using the tools provided. Do not guess the answer. Keep in mind that entities like users, foods and locations have both a name and an ID, which are not the same.\"), ToolUsageTask(name='Multiverse Math', dataset_id='https://smith.langchain.com/public/594f9f60-30a0-49bf-b075-f44beabf546a/d', description='An environment that contains a few basic math operations, but with altered results.\\n\\nFor example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. The basic operations retain some basic properties, such as commutativity, associativity, and distributivity; however, the results are different than expected.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math.\\n', create_environment=, instructions='You are requested to solve math questions in an alternate mathematical universe. The operations have been altered to yield different results than expected. Do not guess the answer or rely on your innate knowledge of math. Use the provided tools to answer the question. While associativity and commutativity apply, distributivity does not. Answer the question using the fewest possible tools. 
Only include the numeric response without any clarifications.'), ExtractionTask(name='Email Extraction', dataset_id='https://smith.langchain.com/public/a1742786-bde5-4f51-a1d8-e148e5251ddb/d', description='A dataset of 42 real emails deduped from a spam folder, with semantic HTML tags removed, as well as a script for initial extraction and formatting of other emails from an arbitrary .mbox file like the one exported by Gmail.\\n\\nSome additional cleanup of the data was done by hand after the initial pass.\\n\\nSee https://github.com/jacoblee93/oss-model-extraction-evals.\\n ', schema=, instructions=ChatPromptTemplate(input_variables=['input'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='You are an expert researcher.')), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], template='What can you tell me about the following email? Make sure to extract the question in the correct format. Here is the email:\\n ```\\n{input}\\n```'))])), RetrievalTask(name='LangChain Docs Q&A', dataset_id='https://smith.langchain.com/public/452ccafc-18e1-4314-885b-edd735f17b9d/d', description=\"Questions and answers based on a snapshot of the LangChain python docs.\\n\\nThe environment provides the documents and the retriever information.\\n\\nEach example is composed of a question and reference answer.\\n\\nSuccess is measured based on the accuracy of the answer relative to the reference answer.\\nWe also measure the faithfulness of the model's response relative to the retrieved documents (if any).\\n\", retriever_factories={'basic': , 'parent-doc': , 'hyde': }, architecture_factories={'conversational-retrieval-qa': }, get_docs=), RetrievalTask(name='Semi-structured Reports', dataset_id='https://smith.langchain.com/public/c47d9617-ab99-4d6e-a6e6-92b8daf85a7d/d', description=\"Questions and answers based on PDFs containing tables and charts.\\n\\nThe task provides the raw documents as well as factory methods to easily index them\\nand create a retriever.\\n\\nEach example is composed of a question and reference answer.\\n\\nSuccess is measured based on the accuracy of the answer relative to the reference answer.\\nWe also measure the faithfulness of the model's response relative to the retrieved documents (if any).\\n\", retriever_factories={'basic': , 'parent-doc': , 'hyde': }, architecture_factories={}, get_docs=)])" ] }, - "execution_count": 4, + "execution_count": 1, "metadata": {}, "output_type": "execute_result" } diff --git a/docs/source/notebooks/retrieval/comparing_techniques.ipynb b/docs/source/notebooks/retrieval/comparing_techniques.ipynb index 99d66546..19b644b5 100644 --- a/docs/source/notebooks/retrieval/comparing_techniques.ipynb +++ b/docs/source/notebooks/retrieval/comparing_techniques.ipynb @@ -85,7 +85,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "b39159d0-9ea1-414f-a9d8-4a7b22b3d2cc", "metadata": { "tags": [] @@ -97,12 +97,49 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "3644d211-382e-41aa-b282-21b01d28fc35", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Name Type Dataset ID Description
LangChain Docs Q&A RetrievalTask452ccafc-18e1-4314-885b-edd735f17b9dQuestions and answers based on a snapshot of the LangChain python docs.\n", + "\n", + "The environment provides the documents and the retriever information.\n", + "\n", + "Each example is composed of a question and reference answer.\n", + "\n", + "Success is measured based on the accuracy of the answer relative to the reference answer.\n", + "We also measure the faithfulness of the model's response relative to the retrieved documents (if any).
Semi-structured ReportsRetrievalTaskc47d9617-ab99-4d6e-a6e6-92b8daf85a7dQuestions and answers based on PDFs containing tables and charts.\n", + "\n", + "The task provides the raw documents as well as factory methods to easily index them\n", + "and create a retriever.\n", + "\n", + "Each example is composed of a question and reference answer.\n", + "\n", + "Success is measured based on the accuracy of the answer relative to the reference answer.\n", + "We also measure the faithfulness of the model's response relative to the retrieved documents (if any).
" + ], + "text/plain": [ + "Registry(tasks=[RetrievalTask(name='LangChain Docs Q&A', dataset_id='https://smith.langchain.com/public/452ccafc-18e1-4314-885b-edd735f17b9d/d', description=\"Questions and answers based on a snapshot of the LangChain python docs.\\n\\nThe environment provides the documents and the retriever information.\\n\\nEach example is composed of a question and reference answer.\\n\\nSuccess is measured based on the accuracy of the answer relative to the reference answer.\\nWe also measure the faithfulness of the model's response relative to the retrieved documents (if any).\\n\", retriever_factories={'basic': , 'parent-doc': , 'hyde': }, architecture_factories={'conversational-retrieval-qa': }, get_docs=), RetrievalTask(name='Semi-structured Reports', dataset_id='https://smith.langchain.com/public/c47d9617-ab99-4d6e-a6e6-92b8daf85a7d/d', description=\"Questions and answers based on PDFs containing tables and charts.\\n\\nThe task provides the raw documents as well as factory methods to easily index them\\nand create a retriever.\\n\\nEach example is composed of a question and reference answer.\\n\\nSuccess is measured based on the accuracy of the answer relative to the reference answer.\\nWe also measure the faithfulness of the model's response relative to the retrieved documents (if any).\\n\", retriever_factories={'basic': , 'parent-doc': , 'hyde': }, architecture_factories={}, get_docs=)])" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "registry.filter(Type=\"RetrievalTask\")" ] diff --git a/docs/source/notebooks/retrieval/langchain_docs_qa.ipynb b/docs/source/notebooks/retrieval/langchain_docs_qa.ipynb index fc7a5d22..6a7df06c 100644 --- a/docs/source/notebooks/retrieval/langchain_docs_qa.ipynb +++ b/docs/source/notebooks/retrieval/langchain_docs_qa.ipynb @@ -18,7 +18,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "9f44b59b", "metadata": { "tags": [] @@ -39,7 +39,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "62b518cf-99fb-44be-8acb-ee0a8ba62272", "metadata": {}, "outputs": [], @@ -49,7 +49,7 @@ "os.environ[\"LANGCHAIN_ENDPOINT\"] = \"https://api.smith.langchain.com\"\n", "os.environ[\"LANGCHAIN_API_KEY\"] = \"sk-...\" # Your API key\n", "\n", - "# # Silence warnings from HuggingFace\n", + "# Silence warnings from HuggingFace\n", "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"" ] }, @@ -65,7 +65,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "id": "b39159d0-9ea1-414f-a9d8-4a7b22b3d2cc", "metadata": { "tags": [] @@ -77,7 +77,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "id": "3644d211-382e-41aa-b282-21b01d28fc35", "metadata": { "tags": [] @@ -88,18 +88,18 @@ "text/html": [ "\n", "\n", - "\n", + "\n", "\n", "\n", - "\n", - "\n", + "\n", + "We also measure the faithfulness of the model's response relative to the retrieved documents (if any). \n", "\n", "
Name Type Dataset ID Description
Name Type Dataset ID Description
LangChain Docs Q&A RetrievalTask452ccafc-18e1-4314-885b-edd735f17b9dQuestions and answers based on a snapshot of the LangChain python docs.\n", + "
LangChain Docs Q&A RetrievalTask452ccafc-18e1-4314-885b-edd735f17b9dQuestions and answers based on a snapshot of the LangChain python docs.\n", "\n", "The environment provides the documents and the retriever information.\n", "\n", "Each example is composed of a question and reference answer.\n", "\n", "Success is measured based on the accuracy of the answer relative to the reference answer.\n", - "We also measure the faithfulness of the model's response relative to the retrieved documents (if any).
Semi-structured EarningsRetrievalTaskc47d9617-ab99-4d6e-a6e6-92b8daf85a7dQuestions and answers based on PDFs containing tables and charts.\n", + "We also measure the faithfulness of the model's response relative to the retrieved documents (if any).
Semi-structured ReportsRetrievalTaskc47d9617-ab99-4d6e-a6e6-92b8daf85a7dQuestions and answers based on PDFs containing tables and charts.\n", "\n", "The task provides the raw documents as well as factory methods to easily index them\n", "and create a retriever.\n", @@ -107,15 +107,15 @@ "Each example is composed of a question and reference answer.\n", "\n", "Success is measured based on the accuracy of the answer relative to the reference answer.\n", - "We also measure the faithfulness of the model's response relative to the retrieved documents (if any).
" ], "text/plain": [ - "Registry(tasks=[RetrievalTask(name='LangChain Docs Q&A', dataset_id='452ccafc-18e1-4314-885b-edd735f17b9d', description=\"Questions and answers based on a snapshot of the LangChain python docs.\\n\\nThe environment provides the documents and the retriever information.\\n\\nEach example is composed of a question and reference answer.\\n\\nSuccess is measured based on the accuracy of the answer relative to the reference answer.\\nWe also measure the faithfulness of the model's response relative to the retrieved documents (if any).\\n\", retriever_factories={'basic': , 'parent-doc': , 'hyde': }, architecture_factories={'conversational-retrieval-qa': }, get_docs=), RetrievalTask(name='Semi-structured Earnings', dataset_id='c47d9617-ab99-4d6e-a6e6-92b8daf85a7d', description=\"Questions and answers based on PDFs containing tables and charts.\\n\\nThe task provides the raw documents as well as factory methods to easily index them\\nand create a retriever.\\n\\nEach example is composed of a question and reference answer.\\n\\nSuccess is measured based on the accuracy of the answer relative to the reference answer.\\nWe also measure the faithfulness of the model's response relative to the retrieved documents (if any).\\n\", retriever_factories={'basic': , 'parent-doc': , 'hyde': }, architecture_factories={}, get_docs=)])" + "Registry(tasks=[RetrievalTask(name='LangChain Docs Q&A', dataset_id='https://smith.langchain.com/public/452ccafc-18e1-4314-885b-edd735f17b9d/d', description=\"Questions and answers based on a snapshot of the LangChain python docs.\\n\\nThe environment provides the documents and the retriever information.\\n\\nEach example is composed of a question and reference answer.\\n\\nSuccess is measured based on the accuracy of the answer relative to the reference answer.\\nWe also measure the faithfulness of the model's response relative to the retrieved documents (if any).\\n\", retriever_factories={'basic': , 'parent-doc': , 'hyde': }, architecture_factories={'conversational-retrieval-qa': }, get_docs=), RetrievalTask(name='Semi-structured Reports', dataset_id='https://smith.langchain.com/public/c47d9617-ab99-4d6e-a6e6-92b8daf85a7d/d', description=\"Questions and answers based on PDFs containing tables and charts.\\n\\nThe task provides the raw documents as well as factory methods to easily index them\\nand create a retriever.\\n\\nEach example is composed of a question and reference answer.\\n\\nSuccess is measured based on the accuracy of the answer relative to the reference answer.\\nWe also measure the faithfulness of the model's response relative to the retrieved documents (if any).\\n\", retriever_factories={'basic': , 'parent-doc': , 'hyde': }, architecture_factories={}, get_docs=)])" ] }, - "execution_count": 3, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -127,7 +127,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": "671282f8-c455-4390-b018-e53bbd833093", "metadata": { "tags": [] @@ -138,9 +138,9 @@ "text/html": [ "\n", "\n", - "\n", - "\n", - "\n", + "\n", + "\n", + "\n", "\n", - "\n", - "\n", - "\n", + "We also measure the faithfulness of the model's response relative to the retrieved documents (if any). \n", + "\n", + "\n", + "\n", "\n", "
Name LangChain Docs Q&A
Type RetrievalTask
Dataset ID 452ccafc-18e1-4314-885b-edd735f17b9d
Name LangChain Docs Q&A
Type RetrievalTask
Dataset ID 452ccafc-18e1-4314-885b-edd735f17b9d
Description Questions and answers based on a snapshot of the LangChain python docs.\n", "\n", "The environment provides the documents and the retriever information.\n", @@ -148,18 +148,18 @@ "Each example is composed of a question and reference answer.\n", "\n", "Success is measured based on the accuracy of the answer relative to the reference answer.\n", - "We also measure the faithfulness of the model's response relative to the retrieved documents (if any).
Retriever Factories basic, parent-doc, hyde
Architecture Factoriesconversational-retrieval-qa
get_docs <function load_cached_docs at 0x102d17240>
Retriever Factories basic, parent-doc, hyde
Architecture Factoriesconversational-retrieval-qa
get_docs
" ], "text/plain": [ - "RetrievalTask(name='LangChain Docs Q&A', dataset_id='452ccafc-18e1-4314-885b-edd735f17b9d', description=\"Questions and answers based on a snapshot of the LangChain python docs.\\n\\nThe environment provides the documents and the retriever information.\\n\\nEach example is composed of a question and reference answer.\\n\\nSuccess is measured based on the accuracy of the answer relative to the reference answer.\\nWe also measure the faithfulness of the model's response relative to the retrieved documents (if any).\\n\", retriever_factories={'basic': , 'parent-doc': , 'hyde': }, architecture_factories={'conversational-retrieval-qa': }, get_docs=)" + "RetrievalTask(name='LangChain Docs Q&A', dataset_id='https://smith.langchain.com/public/452ccafc-18e1-4314-885b-edd735f17b9d/d', description=\"Questions and answers based on a snapshot of the LangChain python docs.\\n\\nThe environment provides the documents and the retriever information.\\n\\nEach example is composed of a question and reference answer.\\n\\nSuccess is measured based on the accuracy of the answer relative to the reference answer.\\nWe also measure the faithfulness of the model's response relative to the retrieved documents (if any).\\n\", retriever_factories={'basic': , 'parent-doc': , 'hyde': }, architecture_factories={'conversational-retrieval-qa': }, get_docs=)" ] }, - "execution_count": 4, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -171,7 +171,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "id": "70369f67-deb4-467a-801a-6d38c3d0460d", "metadata": { "tags": [] @@ -192,7 +192,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "id": "c58247f5-b9bd-4cc5-9632-78bc21bb10b4", "metadata": { "tags": [] @@ -201,7 +201,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "2c698bb93d4548fcacbb3c08990642c6", + "model_id": "4fc93abde64d43c99ffa0eb5374dca86", "version_major": 2, "version_minor": 0 }, @@ -228,7 +228,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 8, "id": "41e64350-63a7-4e7d-8e03-7dc459c444cc", "metadata": { "tags": [] @@ -290,7 +290,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 9, "id": "10a1fca9-d356-4cff-93a9-c4f63944e57d", "metadata": { "tags": [] @@ -299,10 +299,10 @@ { "data": { "text/plain": [ - "' Based on the LangChain documentation, the LangChain Expression Language (LCEL) is a declarative way to easily compose chains together. Key things to know about LCEL:\\n\\n- It was designed to support putting prototypes into production with no code changes, from simple \"prompt + LLM\" chains to complex chains with hundreds of steps. \\n\\n- It provides streaming support - when you build chains with LCEL you get the best possible time-to-first-token, streaming tokens from the LLM to output parsers incrementally.\\n\\n- Chains built with LCEL can be called synchronously (like in a notebook) or asynchronously (like in a production server), using the same code.\\n\\n- LCEL automatically runs steps in parallel when possible, minimizing latency. 
\\n\\n- It supports configuring retries and fallbacks to make chains more reliable.\\n\\n- You can access intermediate chain results before the final output is ready, which helps with debugging and user notifications.\\n\\n- LCEL chains get input and output validation schemas automatically.\\n\\n- All steps are logged to LangSmith for observability.\\n\\n- Chains authored in LCEL can be easily deployed with LangServe.\\n\\nSo in summary, the LangChain Expression Language is'" + "' Unfortunately, I do not have any documents to reference information about expression language. As an AI assistant without access to external information, I can only respond based on the content provided to me. If you could provide me with some documents that describe expression language, I would be happy to summarize or share information from those documents to answer your question. Please feel free to provide any additional context or documents that may allow me to assist further.'" ] }, - "execution_count": 33, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -323,7 +323,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 10, "id": "513042fe-2878-44f8-ae84-05b9d521c1de", "metadata": { "tags": [] @@ -337,7 +337,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 11, "id": "aab7514e-a6ef-4c21-b90f-d9cbefcf5af1", "metadata": { "tags": [] @@ -347,42 +347,51 @@ "name": "stdout", "output_type": "stream", "text": [ - "View the evaluation results for project 'test-essential-pot-37' at:\n", - "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/d57047ac-d6e7-49c2-bd52-e2158e2ce56f?eval=true\n", + "View the evaluation results for project 'only-man-12' at:\n", + "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/c915fa65-2be0-42ad-8038-9a9fe3a0a879?eval=true\n", "\n", "View all tests for Dataset LangChain Docs Q&A at:\n", "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/3f29798f-5939-4643-bd99-008ca66b72ed\n", - "[> ] 1/86" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 
Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ "[------------------------------------------------->] 86/86\n", " Eval quantiles:\n", - " 0.25 0.5 0.75 mean mode\n", - "embedding_cosine_distance 0.086465 0.12291 0.159078 0.128270 0.046088\n", - "score_string:accuracy 0.500000 0.70000 1.000000 0.641860 0.700000\n", - "faithfulness 0.700000 1.00000 1.000000 0.860465 1.000000\n" + " inputs.question \\\n", + "count 86 \n", + "unique 86 \n", + "top in code, how can i add a system message at the... \n", + "freq 1 \n", + "mean NaN \n", + "std NaN \n", + "min NaN \n", + "25% NaN \n", + "50% NaN \n", + "75% NaN \n", + "max NaN \n", + "\n", + " feedback.embedding_cosine_distance feedback.score_string:accuracy \\\n", + "count 86.000000 86.000000 \n", + "unique NaN NaN \n", + "top NaN NaN \n", + "freq NaN NaN \n", + "mean 0.190418 0.177907 \n", + "std 0.045291 0.176503 \n", + "min 0.074583 0.100000 \n", + "25% 0.154158 0.100000 \n", + "50% 0.190138 0.100000 \n", + "75% 0.222883 0.100000 \n", + "max 0.289047 1.000000 \n", + "\n", + " feedback.faithfulness error execution_time \n", + "count 82.000000 0 86.000000 \n", + "unique NaN 0 NaN \n", + "top NaN NaN NaN \n", + "freq NaN NaN NaN \n", + "mean 0.939024 NaN 9.605034 \n", + "std 0.199231 NaN 3.323173 \n", + "min 0.100000 NaN 4.748375 \n", + "25% 1.000000 NaN 7.521995 \n", + "50% 1.000000 NaN 8.637612 \n", + "75% 1.000000 NaN 10.116563 \n", + "max 1.000000 NaN 18.631366 \n" ] } ], @@ -400,12 +409,191 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "e86578d5-be5c-4bcd-9dcb-35280eeed3f9", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
inputs.questionfeedback.embedding_cosine_distancefeedback.score_string:accuracyfeedback.faithfulnesserrorexecution_time
count8686.00000086.00000082.000000086.000000
unique86NaNNaNNaN0NaN
topin code, how can i add a system message at the...NaNNaNNaNNaNNaN
freq1NaNNaNNaNNaNNaN
meanNaN0.1904180.1779070.939024NaN9.605034
stdNaN0.0452910.1765030.199231NaN3.323173
minNaN0.0745830.1000000.100000NaN4.748375
25%NaN0.1541580.1000001.000000NaN7.521995
50%NaN0.1901380.1000001.000000NaN8.637612
75%NaN0.2228830.1000001.000000NaN10.116563
maxNaN0.2890471.0000001.000000NaN18.631366
\n", + "
" + ], + "text/plain": [ + " inputs.question \\\n", + "count 86 \n", + "unique 86 \n", + "top in code, how can i add a system message at the... \n", + "freq 1 \n", + "mean NaN \n", + "std NaN \n", + "min NaN \n", + "25% NaN \n", + "50% NaN \n", + "75% NaN \n", + "max NaN \n", + "\n", + " feedback.embedding_cosine_distance feedback.score_string:accuracy \\\n", + "count 86.000000 86.000000 \n", + "unique NaN NaN \n", + "top NaN NaN \n", + "freq NaN NaN \n", + "mean 0.190418 0.177907 \n", + "std 0.045291 0.176503 \n", + "min 0.074583 0.100000 \n", + "25% 0.154158 0.100000 \n", + "50% 0.190138 0.100000 \n", + "75% 0.222883 0.100000 \n", + "max 0.289047 1.000000 \n", + "\n", + " feedback.faithfulness error execution_time \n", + "count 82.000000 0 86.000000 \n", + "unique NaN 0 NaN \n", + "top NaN NaN NaN \n", + "freq NaN NaN NaN \n", + "mean 0.939024 NaN 9.605034 \n", + "std 0.199231 NaN 3.323173 \n", + "min 0.100000 NaN 4.748375 \n", + "25% 1.000000 NaN 7.521995 \n", + "50% 1.000000 NaN 8.637612 \n", + "75% 1.000000 NaN 10.116563 \n", + "max 1.000000 NaN 18.631366 " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "test_run.get_aggregate_feedback()" ] @@ -422,7 +610,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "id": "f4d2e139-2653-4f7b-944b-91ef52f43d3e", "metadata": { "tags": [] @@ -435,7 +623,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "id": "6e938e5b-c430-4ab1-ab7d-84c33f83bdc5", "metadata": {}, "outputs": [], @@ -445,29 +633,19 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 15, "id": "9f9be718-64f0-4706-9527-240a1cdb3ecb", "metadata": { "tags": [] }, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" - ] - }, { "data": { "text/plain": [ - "' \\n\\n- Expression language (LCEL) is a declarative way to easily compose chains together in LangChain. 
It was designed to support putting prototypes into production with no code changes, from simple \"prompt + LLM\" chains to complex chains with hundreds of steps [0].\\n\\n- Key features of LCEL include streaming support, asynchronous support, optimized parallel execution, configurable retries and fallbacks, access to intermediate results, input/output validation schemas, seamless integration with LangSmith tracing and LangServe deployment [0].\\n\\n- The LangChain cookbook contains examples of common tasks using LCEL like chaining prompts and LLMs, adding retrieval, querying databases, writing code, adding memory and moderation, etc [1].\\n\\n\\n\\n[0] - https://langchain.org/docs/expression_language\\n[1] - https://langchain.org/docs/expression_language/cookbook'" + "\" \\n\\nNo search results have been provided.\\n\\n\\n\\nHmm, I'm not sure.\"" ] }, - "execution_count": 16, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -486,12 +664,64 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 16, "id": "e9c013e2-241a-4def-9aa6-ccb34273eeb9", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "View the evaluation results for project 'bold-increase-73' at:\n", + "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/ec954070-30ee-47e3-acf7-698f3c70c20f?eval=true\n", + "\n", + "View all tests for Dataset LangChain Docs Q&A at:\n", + "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/3f29798f-5939-4643-bd99-008ca66b72ed\n", + "[------------------------------------------------->] 86/86\n", + " Eval quantiles:\n", + " inputs.question \\\n", + "count 86 \n", + "unique 86 \n", + "top in code, how can i add a system message at the... \n", + "freq 1 \n", + "mean NaN \n", + "std NaN \n", + "min NaN \n", + "25% NaN \n", + "50% NaN \n", + "75% NaN \n", + "max NaN \n", + "\n", + " feedback.embedding_cosine_distance feedback.score_string:accuracy \\\n", + "count 86.000000 86.000000 \n", + "unique NaN NaN \n", + "top NaN NaN \n", + "freq NaN NaN \n", + "mean 0.194922 0.202326 \n", + "std 0.053206 0.180833 \n", + "min 0.071091 0.100000 \n", + "25% 0.152611 0.100000 \n", + "50% 0.191152 0.100000 \n", + "75% 0.223243 0.300000 \n", + "max 0.328942 0.700000 \n", + "\n", + " feedback.faithfulness error execution_time \n", + "count 85.000000 0 86.000000 \n", + "unique NaN 0 NaN \n", + "top NaN NaN NaN \n", + "freq NaN NaN NaN \n", + "mean 0.824706 NaN 9.743528 \n", + "std 0.289048 NaN 4.399820 \n", + "min 0.100000 NaN 4.208420 \n", + "25% 0.700000 NaN 6.018769 \n", + "50% 1.000000 NaN 8.086979 \n", + "75% 1.000000 NaN 14.081097 \n", + "max 1.000000 NaN 18.671288 \n" + ] + } + ], "source": [ "from functools import partial\n", "\n",