diff --git a/docs/source/notebooks/extraction/chat_extraction.ipynb b/docs/source/notebooks/extraction/chat_extraction.ipynb
new file mode 100644
index 00000000..5dd795fd
--- /dev/null
+++ b/docs/source/notebooks/extraction/chat_extraction.ipynb
@@ -0,0 +1,3303 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "60bb467d-861d-4b07-a48d-8e5aa177c969",
+   "metadata": {},
+   "source": [
+    "# Chat Extraction\n",
+    "\n",
+    "This benchmark combines classification, summarization, and extraction into a single task. The model is\n",
+    "expected to output formatted JSON in the expected schema."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "47de0d20-d20b-44be-9e41-d2275f0866e8",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# %pip install -U --quiet langchain langchain_benchmarks\n",
+    "# %pip install -U openai rapidfuzz fireworks-ai anthropic"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "af75ce4b-f159-4917-9249-01ee88b1b8fc",
+   "metadata": {},
+   "source": [
+    "For this code to work, please configure LangSmith environment variables with your credentials,\n",
+    "in addition to your LLM providers' API keys."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "c401de19-814e-4bd7-bb9c-7ea6e217985c",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import getpass\n",
+    "import os\n",
+    "import uuid\n",
+    "\n",
+    "# Short random suffix used in project names to avoid conflicts\n",
+    "uid = uuid.uuid4().hex[:4]\n",
+    "\n",
+    "# Prompt for any credential that is not already set in the environment.\n",
+    "# Get your LangSmith API key from https://smith.langchain.com/settings\n",
+    "api_keys = [\n",
+    "    \"LANGCHAIN_API_KEY\",\n",
+    "    \"OPENAI_API_KEY\",\n",
+    "    \"ANTHROPIC_API_KEY\",\n",
+    "    \"FIREWORKS_API_KEY\",\n",
+    "]\n",
+    "for key in api_keys:\n",
+    "    if key in os.environ:\n",
+    "        continue\n",
+    "    os.environ[key] = getpass.getpass(f\"Enter your {key}: \")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "60f22779-a948-4833-8e8c-ace9ef17f56f",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Dataset Chat Extraction already exists. Skipping.\n",
+      "You can access the dataset at https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/08042749-504d-4509-9549-5f5c579115f6.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<table>\n",
+       "<tbody>\n",
+       "<tr><td>Name       </td><td>Chat Extraction                                                                                                                                            </td></tr>\n",
+       "<tr><td>Type       </td><td>ExtractionTask                                                                                                                                             </td></tr>\n",
+       "<tr><td>Dataset ID </td><td><a href=\"https://smith.langchain.com/public/00f4444c-9460-4a82-b87a-f50096f1cfef/d\" target=\"_blank\" rel=\"noopener\">00f4444c-9460-4a82-b87a-f50096f1cfef</a></td></tr>\n",
+       "<tr><td>Description</td><td>A dataset meant to test the ability of an LLM to extract and infer\n",
+       "structured information from a dialogue. The dialogue is between a user and a support\n",
+       "engineer. Outputs should be structured as a JSON object and test both the ability\n",
+       "of the LLM to correctly structure the information and its ability to perform simple \n",
+       "classification tasks.                                                                                                                                                            </td></tr>\n",
+       "</tbody>\n",
+       "</table>"
+      ],
+      "text/plain": [
+       "ExtractionTask(name='Chat Extraction', dataset_id='https://smith.langchain.com/public/00f4444c-9460-4a82-b87a-f50096f1cfef/d', description='A dataset meant to test the ability of an LLM to extract and infer\\nstructured information from a dialogue. The dialogue is between a user and a support\\nengineer. Outputs should be structured as a JSON object and test both the ability\\nof the LLM to correctly structure the information and its ability to perform simple \\nclassification tasks.', schema=<class 'langchain_benchmarks.extraction.tasks.chat_extraction.schema.GenerateTicket'>, instructions=ChatPromptTemplate(input_variables=['dialogue'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='You are a helpdesk assistant responsible with extracting information and generating tickets. Dialogues are between a user and a support engineer.')), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['dialogue'], template='Generate a ticket for the following question-response pair:\\n<Dialogue>\\n{dialogue}\\n</Dialogue>'))]))"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from langchain_benchmarks import clone_public_dataset, registry\n",
+    "\n",
+    "task = registry[\"Chat Extraction\"]\n",
+    "\n",
+    "# Clone the dataset to your tenant\n",
+    "clone_public_dataset(task.dataset_id, dataset_name=task.name)\n",
+    "\n",
+    "\n",
+    "task"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "86f1378a-9a62-477e-bdb8-a7fd10915b62",
+   "metadata": {},
+   "source": [
+    "#### Schema\n",
+    "\n",
+    "Each extraction task has an expected output schema defined in a Pydantic BaseModel object, which we can use to\n",
+    "get a JSON schema object."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "12e302e6-9b3d-42a4-b612-d672c591e8f0",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'title': 'GenerateTicket',\n",
+       " 'description': 'Generate a ticket containing all the extracted information.',\n",
+       " 'type': 'object',\n",
+       " 'properties': {'issue_summary': {'title': 'Issue Summary',\n",
+       "   'description': 'short (<10 word) summary of the issue or question',\n",
+       "   'type': 'string'},\n",
+       "  'question': {'title': 'Question',\n",
+       "   'description': 'Information inferred from the the question.',\n",
+       "   'allOf': [{'$ref': '#/definitions/QuestionCategorization'}]},\n",
+       "  'response': {'title': 'Response',\n",
+       "   'description': 'Information inferred from the the response.',\n",
+       "   'allOf': [{'$ref': '#/definitions/ResponseCategorization'}]}},\n",
+       " 'required': ['issue_summary', 'question', 'response'],\n",
+       " 'definitions': {'QuestionCategory': {'title': 'QuestionCategory',\n",
+       "   'description': 'An enumeration.',\n",
+       "   'enum': ['Implementation Issues',\n",
+       "    'Feature Requests',\n",
+       "    'Concept Explanations',\n",
+       "    'Code Optimization',\n",
+       "    'Security and Privacy Concerns',\n",
+       "    'Model Training and Fine-tuning',\n",
+       "    'Data Handling and Manipulation',\n",
+       "    'User Interaction Flow',\n",
+       "    'Technical Integration',\n",
+       "    'Error Handling and Logging',\n",
+       "    'Customization and Configuration',\n",
+       "    'External API and Data Source Integration',\n",
+       "    'Language and Localization',\n",
+       "    'Streaming and Real-time Processing',\n",
+       "    'Tool Development',\n",
+       "    'Function Calling',\n",
+       "    'LLM Integrations',\n",
+       "    'General Agent Question',\n",
+       "    'General Chit Chat',\n",
+       "    'Memory',\n",
+       "    'Debugging Help',\n",
+       "    'Application Design',\n",
+       "    'Prompt Templates',\n",
+       "    'Cost Tracking',\n",
+       "    'Other'],\n",
+       "   'type': 'string'},\n",
+       "  'Sentiment': {'title': 'Sentiment',\n",
+       "   'description': 'An enumeration.',\n",
+       "   'enum': ['Negative', 'Neutral', 'Positive'],\n",
+       "   'type': 'string'},\n",
+       "  'ProgrammingLanguage': {'title': 'ProgrammingLanguage',\n",
+       "   'description': 'An enumeration.',\n",
+       "   'enum': ['python', 'javascript', 'typescript', 'unknown', 'other'],\n",
+       "   'type': 'string'},\n",
+       "  'QuestionCategorization': {'title': 'QuestionCategorization',\n",
+       "   'type': 'object',\n",
+       "   'properties': {'question_category': {'$ref': '#/definitions/QuestionCategory'},\n",
+       "    'category_if_other': {'title': 'Category If Other',\n",
+       "     'description': \"question category if the category above is 'other'\",\n",
+       "     'type': 'string'},\n",
+       "    'is_off_topic': {'title': 'Is Off Topic',\n",
+       "     'description': 'If the input is general chit chat or does not pertain to technical inqueries about LangChain or building/debugging applications with LLMs/AI, it is off topic. For context, LangChain is a library and framework designed to assist in building applications with LLMs. Questions may also be about similar packages like LangServe, LangSmith, OpenAI, Anthropic, vectorstores, agents, etc.',\n",
+       "     'type': 'boolean'},\n",
+       "    'toxicity': {'title': 'Toxicity',\n",
+       "     'description': 'Whether or not the input question is toxic',\n",
+       "     'default': 0,\n",
+       "     'exclusiveMaximum': 6,\n",
+       "     'minimum': 0,\n",
+       "     'type': 'integer'},\n",
+       "    'sentiment': {'$ref': '#/definitions/Sentiment'},\n",
+       "    'programming_language': {'$ref': '#/definitions/ProgrammingLanguage'}},\n",
+       "   'required': ['question_category',\n",
+       "    'is_off_topic',\n",
+       "    'sentiment',\n",
+       "    'programming_language']},\n",
+       "  'ResponseType': {'title': 'ResponseType',\n",
+       "   'description': 'An enumeration.',\n",
+       "   'enum': ['resolve issue',\n",
+       "    'provide guidance',\n",
+       "    'request information',\n",
+       "    'give up',\n",
+       "    'none',\n",
+       "    'other'],\n",
+       "   'type': 'string'},\n",
+       "  'ResponseCategorization': {'title': 'ResponseCategorization',\n",
+       "   'type': 'object',\n",
+       "   'properties': {'response_type': {'$ref': '#/definitions/ResponseType'},\n",
+       "    'response_type_if_other': {'title': 'Response Type If Other',\n",
+       "     'type': 'string'},\n",
+       "    'confidence_level': {'title': 'Confidence Level',\n",
+       "     'description': 'The confidence of the assistant in its answer.',\n",
+       "     'exclusiveMaximum': 6,\n",
+       "     'minimum': 0,\n",
+       "     'type': 'integer'},\n",
+       "    'followup_actions': {'title': 'Followup Actions',\n",
+       "     'description': 'Actions the assistant recommended the user take.',\n",
+       "     'type': 'array',\n",
+       "     'items': {'type': 'string'}}},\n",
+       "   'required': ['response_type', 'confidence_level']}}}"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "task.schema.schema()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b462f7b8-fd42-4613-ab5f-5f3cbbc37d28",
+   "metadata": {},
+   "source": [
+    "## Define an extraction chain\n",
+    "\n",
+    "Let's build the extraction chain that we can use to get structured information from the dialogues."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "ade7077c-4602-4e5b-ad6d-3eb43cbd0247",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from langchain.chat_models import ChatOpenAI\n",
+    "from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser\n",
+    "\n",
+    "# Expose the task schema as the only function and request it by its title\n",
+    "function_name = task.schema.schema()[\"title\"]\n",
+    "llm = ChatOpenAI(model=\"gpt-4-1106-preview\", temperature=0).bind_functions(\n",
+    "    functions=[task.schema], function_call=function_name\n",
+    ")\n",
+    "\n",
+    "\n",
+    "def format_run(dialogue_input: dict):\n",
+    "    \"\"\"Render a question/answer record as the tagged dialogue text for the prompt.\n",
+    "\n",
+    "    Reads the ``question`` and ``answer`` keys of ``dialogue_input`` and returns\n",
+    "    a dict with a single ``dialogue`` key holding both, wrapped in XML-style tags.\n",
+    "    \"\"\"\n",
+    "    question, answer = dialogue_input[\"question\"], dialogue_input[\"answer\"]\n",
+    "    dialogue = (\n",
+    "        f\"<question>\\n{question}\\n</question>\\n\"\n",
+    "        f\"<assistant-response>\\n{answer}\\n</assistant-response>\"\n",
+    "    )\n",
+    "    return {\"dialogue\": dialogue}\n",
+    "\n",
+    "\n",
+    "output_parser = JsonOutputFunctionsParser()\n",
+    "# Stages run left to right: format input -> prompt -> model -> parse arguments\n",
+    "extraction_chain = (\n",
+    "    format_run\n",
+    "    | task.instructions\n",
+    "    | llm\n",
+    "    | output_parser\n",
+    "    # Nest the result under 'output' so every chain has the same shape for the evaluators\n",
+    "    | (lambda parsed: {\"output\": parsed})\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "f66ed218-e1db-49b5-bde3-40ebec961723",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'output': {'issue_summary': 'Running Llama 2 Locally',\n",
+       "  'question': {'question_category': 'Implementation Issues',\n",
+       "   'is_off_topic': False,\n",
+       "   'sentiment': 'Neutral',\n",
+       "   'programming_language': 'unknown'},\n",
+       "  'response': {'response_type': 'provide guidance', 'confidence_level': 1}}}"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "extraction_chain.invoke(\n",
+    "    {\"question\": \"how do i run llama 2 locally?\", \"answer\": \"Llama.cpp of course.\"}\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "87a64f76-65ae-4367-b43f-f2be3431e7af",
+   "metadata": {},
+   "source": [
+    "Now it's time to measure our chain's effectiveness!"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3821e4b0-8e67-418a-840c-470fcde42df0",
+   "metadata": {},
+   "source": [
+    "## Evaluate\n",
+    "\n",
+    "Let's evaluate the chain now."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "aab7514e-a6ef-4c21-b90f-d9cbefcf5af1",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "View the evaluation results for project 'gpt-4-1106-preview-af10' at:\n",
+      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/08042749-504d-4509-9549-5f5c579115f6/compare?selectedSessions=9ee3b369-1988-4db0-a2a9-ea6259c8e19c\n",
+      "\n",
+      "View all tests for Dataset Chat Extraction at:\n",
+      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/08042749-504d-4509-9549-5f5c579115f6\n",
+      "[------------------------------------------------->] 27/27"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<h3>Experiment Results:</h3>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>feedback.json_edit_distance</th>\n",
+       "      <th>feedback.json_schema</th>\n",
+       "      <th>feedback.toxicity_similarity</th>\n",
+       "      <th>feedback.sentiment_similarity</th>\n",
+       "      <th>feedback.confidence_level_similarity</th>\n",
+       "      <th>feedback.question_category</th>\n",
+       "      <th>feedback.off_topic_similarity</th>\n",
+       "      <th>feedback.programming_language_similarity</th>\n",
+       "      <th>error</th>\n",
+       "      <th>execution_time</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>count</th>\n",
+       "      <td>27.000000</td>\n",
+       "      <td>27.0</td>\n",
+       "      <td>27.0</td>\n",
+       "      <td>27.0</td>\n",
+       "      <td>27.000000</td>\n",
+       "      <td>27.000000</td>\n",
+       "      <td>27.000000</td>\n",
+       "      <td>27.000000</td>\n",
+       "      <td>0</td>\n",
+       "      <td>27.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>unique</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>top</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>freq</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>mean</th>\n",
+       "      <td>0.258825</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.962963</td>\n",
+       "      <td>0.592593</td>\n",
+       "      <td>0.888889</td>\n",
+       "      <td>0.592593</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>11.573060</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>std</th>\n",
+       "      <td>0.177651</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.079169</td>\n",
+       "      <td>0.500712</td>\n",
+       "      <td>0.320256</td>\n",
+       "      <td>0.500712</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>4.616704</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>min</th>\n",
+       "      <td>0.049430</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.800000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>6.609211</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>25%</th>\n",
+       "      <td>0.100351</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>8.454940</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>50%</th>\n",
+       "      <td>0.222621</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>10.141127</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>75%</th>\n",
+       "      <td>0.365307</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>13.332418</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>max</th>\n",
+       "      <td>0.595300</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>27.191173</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "        feedback.json_edit_distance  feedback.json_schema  \\\n",
+       "count                     27.000000                  27.0   \n",
+       "unique                          NaN                   NaN   \n",
+       "top                             NaN                   NaN   \n",
+       "freq                            NaN                   NaN   \n",
+       "mean                       0.258825                   1.0   \n",
+       "std                        0.177651                   0.0   \n",
+       "min                        0.049430                   1.0   \n",
+       "25%                        0.100351                   1.0   \n",
+       "50%                        0.222621                   1.0   \n",
+       "75%                        0.365307                   1.0   \n",
+       "max                        0.595300                   1.0   \n",
+       "\n",
+       "        feedback.toxicity_similarity  feedback.sentiment_similarity  \\\n",
+       "count                           27.0                           27.0   \n",
+       "unique                           NaN                            NaN   \n",
+       "top                              NaN                            NaN   \n",
+       "freq                             NaN                            NaN   \n",
+       "mean                             0.0                            1.0   \n",
+       "std                              0.0                            0.0   \n",
+       "min                              0.0                            1.0   \n",
+       "25%                              0.0                            1.0   \n",
+       "50%                              0.0                            1.0   \n",
+       "75%                              0.0                            1.0   \n",
+       "max                              0.0                            1.0   \n",
+       "\n",
+       "        feedback.confidence_level_similarity  feedback.question_category  \\\n",
+       "count                              27.000000                   27.000000   \n",
+       "unique                                   NaN                         NaN   \n",
+       "top                                      NaN                         NaN   \n",
+       "freq                                     NaN                         NaN   \n",
+       "mean                                0.962963                    0.592593   \n",
+       "std                                 0.079169                    0.500712   \n",
+       "min                                 0.800000                    0.000000   \n",
+       "25%                                 1.000000                    0.000000   \n",
+       "50%                                 1.000000                    1.000000   \n",
+       "75%                                 1.000000                    1.000000   \n",
+       "max                                 1.000000                    1.000000   \n",
+       "\n",
+       "        feedback.off_topic_similarity  \\\n",
+       "count                       27.000000   \n",
+       "unique                            NaN   \n",
+       "top                               NaN   \n",
+       "freq                              NaN   \n",
+       "mean                         0.888889   \n",
+       "std                          0.320256   \n",
+       "min                          0.000000   \n",
+       "25%                          1.000000   \n",
+       "50%                          1.000000   \n",
+       "75%                          1.000000   \n",
+       "max                          1.000000   \n",
+       "\n",
+       "        feedback.programming_language_similarity error  execution_time  \n",
+       "count                                  27.000000     0       27.000000  \n",
+       "unique                                       NaN     0             NaN  \n",
+       "top                                          NaN   NaN             NaN  \n",
+       "freq                                         NaN   NaN             NaN  \n",
+       "mean                                    0.592593   NaN       11.573060  \n",
+       "std                                     0.500712   NaN        4.616704  \n",
+       "min                                     0.000000   NaN        6.609211  \n",
+       "25%                                     0.000000   NaN        8.454940  \n",
+       "50%                                     1.000000   NaN       10.141127  \n",
+       "75%                                     1.000000   NaN       13.332418  \n",
+       "max                                     1.000000   NaN       27.191173  "
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "from langsmith.client import Client\n",
+    "\n",
+    "from langchain_benchmarks.extraction.tasks.chat_extraction import get_eval_config\n",
+    "\n",
+    "client = Client()\n",
+    "eval_config = get_eval_config()\n",
+    "\n",
+    "# Metadata recorded on the test project for this run\n",
+    "gpt4_project_metadata = {\n",
+    "    \"arch\": \"openai-functions\",\n",
+    "    \"model\": \"gpt-4-1106-preview\",\n",
+    "}\n",
+    "\n",
+    "test_run = client.run_on_dataset(\n",
+    "    dataset_name=task.name,\n",
+    "    llm_or_chain_factory=extraction_chain,\n",
+    "    evaluation=eval_config,\n",
+    "    verbose=True,\n",
+    "    project_name=f\"gpt-4-1106-preview-{uid}\",\n",
+    "    project_metadata=gpt4_project_metadata,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d9828990-f498-4d3f-9e51-76d72bf8f4e9",
+   "metadata": {},
+   "source": [
+    "## Compare to Claude-2\n",
+    "\n",
+    "Let's compare our results to Anthropic's Claude-2. We will mimic the function calling interface."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "1be9d1cb-b9b6-4d77-b0d5-63a6784626d6",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from typing import Any, Dict, List, Type\n",
+    "\n",
+    "from langchain.chat_models import ChatAnthropic\n",
+    "from langchain.output_parsers.xml import XMLOutputParser\n",
+    "from langchain.prompts import ChatPromptTemplate\n",
+    "from langchain.pydantic_v1 import BaseModel\n",
+    "\n",
+    "# The system message carries the JSON schema; the user message carries the\n",
+    "# dialogue and the root tag the reply should use.\n",
+    "system_template = (\n",
+    "    \"You are a data extraction bot tasked with extracting and inferring information from dialogues and generating tickets. Always respond \"\n",
+    "    \"only with XML based on the following JSON schema:\\n{schema}\"\n",
+    ")\n",
+    "user_template = (\n",
+    "    \"Generate a ticket from the following question-response pair:\\n\"\n",
+    "    \"<Dialogue>\\n{dialogue}\\n</Dialogue>\\n\"\n",
+    "    \"Remember, respond directly with this format:\\n\"\n",
+    "    \"<{function_call}>\\n...\\n</{function_call}>\"\n",
+    "    \"RESPOND ONLY IN XML THEN STOP.\"\n",
+    ")\n",
+    "claude_prompt = ChatPromptTemplate.from_messages(\n",
+    "    [(\"system\", system_template), (\"user\", user_template)]\n",
+    ")\n",
+    "\n",
+    "# Fill in the schema and root tag up front\n",
+    "schema_json = task.schema.schema_json()\n",
+    "schema_title = task.schema.schema()[\"title\"]\n",
+    "prompt = claude_prompt.partial(schema=schema_json, function_call=schema_title)\n",
+    "\n",
+    "claude = ChatAnthropic(model=\"claude-2\", temperature=0, max_tokens_to_sample=2048)\n",
+    "\n",
+    "\n",
+    "class MergeSchema:\n",
+    "    \"\"\"Fold XMLOutputParser output into a dict shaped like a Pydantic schema.\n",
+    "\n",
+    "    Nested output may arrive as a dict or as a list of dicts; this callable\n",
+    "    merges such lists into a single dict keyed by tag name and unwraps lists\n",
+    "    made only of ``{\"item\": ...}`` dicts into flat lists. Leaf values are\n",
+    "    returned as parsed (no type coercion is applied).\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self, schema: Type[BaseModel]):\n",
+    "        self.schema = schema\n",
+    "\n",
+    "    @property\n",
+    "    def _func_name(self) -> str:\n",
+    "        \"\"\"Class name of the schema, expected as the root key of the output.\"\"\"\n",
+    "        return self.schema.__name__\n",
+    "\n",
+    "    def _merge_schema(self, parsed_output: Any, schema: Type[BaseModel]):\n",
+    "        \"\"\"Recursively merge ``parsed_output`` into a dict guided by ``schema``.\n",
+    "\n",
+    "        Args:\n",
+    "            parsed_output: A dict, a list of dicts, or any other (leaf) value.\n",
+    "            schema: Model whose ``__fields__`` identify nested models.\n",
+    "\n",
+    "        Returns:\n",
+    "            The merged dict, or ``parsed_output`` unchanged when it is neither\n",
+    "            a dict nor a list.\n",
+    "        \"\"\"\n",
+    "        if isinstance(parsed_output, dict):\n",
+    "            items = parsed_output.items()\n",
+    "        elif isinstance(parsed_output, list):\n",
+    "            items = [(k, v) for item in parsed_output for k, v in item.items()]\n",
+    "        else:\n",
+    "            return parsed_output\n",
+    "\n",
+    "        merged_output = {}\n",
+    "        for key, value in items:\n",
+    "            # Default: keep the value as parsed; only list values of known\n",
+    "            # fields are reshaped below.\n",
+    "            result = value\n",
+    "            if key in schema.__fields__ and isinstance(value, list):\n",
+    "                field_type = schema.__fields__[key].type_\n",
+    "                # Check isinstance first: issubclass raises TypeError when the\n",
+    "                # field type is not a class (e.g. a typing construct).\n",
+    "                if isinstance(field_type, type) and issubclass(\n",
+    "                    field_type, (BaseModel, dict)\n",
+    "                ):\n",
+    "                    result = self._merge_schema(value, field_type)\n",
+    "                elif all(\n",
+    "                    isinstance(item, dict) and item.keys() == {\"item\"}\n",
+    "                    for item in value\n",
+    "                ):\n",
+    "                    # [{\"item\": a}, {\"item\": b}] -> [a, b]\n",
+    "                    result = [next(iter(item.values())) for item in value]\n",
+    "            if key in merged_output:\n",
+    "                # Repeated key: collect every value for it in a list\n",
+    "                if isinstance(merged_output[key], list):\n",
+    "                    merged_output[key].append(result)\n",
+    "                else:\n",
+    "                    merged_output[key] = [merged_output[key], result]\n",
+    "            else:\n",
+    "                merged_output[key] = result\n",
+    "\n",
+    "        return merged_output\n",
+    "\n",
+    "    def __call__(self, parsed_output: dict) -> Dict[str, Any]:\n",
+    "        \"\"\"Merge the subtree stored under the schema's name.\n",
+    "\n",
+    "        Returns ``parsed_output`` unchanged when it has no key matching the\n",
+    "        schema name.\n",
+    "        \"\"\"\n",
+    "        if self._func_name not in parsed_output:\n",
+    "            return parsed_output\n",
+    "        return {\n",
+    "            self._func_name: self._merge_schema(\n",
+    "                parsed_output[self._func_name], self.schema\n",
+    "            )\n",
+    "        }\n",
+    "\n",
+    "\n",
+    "def try_parse(llm_output, config):\n",
+    "    \"\"\"Parse the model's XML reply into the ticket dict, capturing any failure.\n",
+    "\n",
+    "    Returns ``{\"output\": <ticket>}`` on success. If parsing raises, the raw\n",
+    "    ``llm_output`` is returned under ``\"output\"`` together with the exception\n",
+    "    message under ``\"error\"`` instead of raising.\n",
+    "    \"\"\"\n",
+    "    try:\n",
+    "        output_chain = XMLOutputParser() | MergeSchema(task.schema)\n",
+    "        parsed = output_chain.invoke(llm_output, config)\n",
+    "        # Look up the root tag by schema name (the key MergeSchema emits) rather\n",
+    "        # than a hard-coded literal, then wrap as 'output' for the evaluators.\n",
+    "        return {\"output\": parsed.get(task.schema.__name__)}\n",
+    "    except Exception as e:\n",
+    "        # Deliberate best effort: keep the raw output and record the error\n",
+    "        return {\"output\": llm_output, \"error\": str(e)}\n",
+    "\n",
+    "\n",
+    "claude_extraction_chain = format_run | prompt | claude | try_parse"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "cea759e7-a51a-4abd-9869-f928bea80da2",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'output': {'issue_summary': 'How to run Llama locally',\n",
+       "  'question': {'question_category': 'Implementation Issues',\n",
+       "   'is_off_topic': 'false',\n",
+       "   'toxicity': '0',\n",
+       "   'sentiment': 'Neutral',\n",
+       "   'programming_language': 'unknown'},\n",
+       "  'response': {'response_type': 'provide guidance',\n",
+       "   'confidence_level': '3',\n",
+       "   'followup_actions': ['Ask clarifying questions about the specific issue',\n",
+       "    'Provide documentation or examples for running Llama locally']}}}"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "result = claude_extraction_chain.invoke(\n",
+    "    {\"question\": \"how do i run llama 2 locally?\", \"answer\": \"Llama.cpp of course.\"}\n",
+    ")\n",
+    "result"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "7723e6f4-b214-46a8-9286-93116fe893d8",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "View the evaluation results for project 'claude-2-json-schema-to-xml-af10' at:\n",
+      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/08042749-504d-4509-9549-5f5c579115f6/compare?selectedSessions=fb67ac1a-4e37-44ca-94bf-970eee89ee04\n",
+      "\n",
+      "View all tests for Dataset Chat Extraction at:\n",
+      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/08042749-504d-4509-9549-5f5c579115f6\n",
+      "[------------------------------------------------->] 27/27"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<h3>Experiment Results:</h3>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>feedback.json_edit_distance</th>\n",
+       "      <th>feedback.json_schema</th>\n",
+       "      <th>feedback.toxicity_similarity</th>\n",
+       "      <th>feedback.sentiment_similarity</th>\n",
+       "      <th>feedback.confidence_level_similarity</th>\n",
+       "      <th>feedback.question_category</th>\n",
+       "      <th>feedback.off_topic_similarity</th>\n",
+       "      <th>feedback.programming_language_similarity</th>\n",
+       "      <th>error</th>\n",
+       "      <th>execution_time</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>count</th>\n",
+       "      <td>27.000000</td>\n",
+       "      <td>27.000000</td>\n",
+       "      <td>27.0</td>\n",
+       "      <td>27.000000</td>\n",
+       "      <td>27.000000</td>\n",
+       "      <td>27.000000</td>\n",
+       "      <td>27.0</td>\n",
+       "      <td>27.000000</td>\n",
+       "      <td>0</td>\n",
+       "      <td>27.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>unique</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>top</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>freq</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>mean</th>\n",
+       "      <td>0.365055</td>\n",
+       "      <td>0.777778</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.944444</td>\n",
+       "      <td>0.970370</td>\n",
+       "      <td>0.481481</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.444444</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>11.570401</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>std</th>\n",
+       "      <td>0.108204</td>\n",
+       "      <td>0.423659</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.160128</td>\n",
+       "      <td>0.072403</td>\n",
+       "      <td>0.509175</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.506370</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1.648157</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>min</th>\n",
+       "      <td>0.105033</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.500000</td>\n",
+       "      <td>0.800000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>8.821772</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>25%</th>\n",
+       "      <td>0.298704</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>10.545821</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>50%</th>\n",
+       "      <td>0.393478</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>11.427731</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>75%</th>\n",
+       "      <td>0.444609</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>12.390761</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>max</th>\n",
+       "      <td>0.537678</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>17.776214</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "        feedback.json_edit_distance  feedback.json_schema  \\\n",
+       "count                     27.000000             27.000000   \n",
+       "unique                          NaN                   NaN   \n",
+       "top                             NaN                   NaN   \n",
+       "freq                            NaN                   NaN   \n",
+       "mean                       0.365055              0.777778   \n",
+       "std                        0.108204              0.423659   \n",
+       "min                        0.105033              0.000000   \n",
+       "25%                        0.298704              1.000000   \n",
+       "50%                        0.393478              1.000000   \n",
+       "75%                        0.444609              1.000000   \n",
+       "max                        0.537678              1.000000   \n",
+       "\n",
+       "        feedback.toxicity_similarity  feedback.sentiment_similarity  \\\n",
+       "count                           27.0                      27.000000   \n",
+       "unique                           NaN                            NaN   \n",
+       "top                              NaN                            NaN   \n",
+       "freq                             NaN                            NaN   \n",
+       "mean                             1.0                       0.944444   \n",
+       "std                              0.0                       0.160128   \n",
+       "min                              1.0                       0.500000   \n",
+       "25%                              1.0                       1.000000   \n",
+       "50%                              1.0                       1.000000   \n",
+       "75%                              1.0                       1.000000   \n",
+       "max                              1.0                       1.000000   \n",
+       "\n",
+       "        feedback.confidence_level_similarity  feedback.question_category  \\\n",
+       "count                              27.000000                   27.000000   \n",
+       "unique                                   NaN                         NaN   \n",
+       "top                                      NaN                         NaN   \n",
+       "freq                                     NaN                         NaN   \n",
+       "mean                                0.970370                    0.481481   \n",
+       "std                                 0.072403                    0.509175   \n",
+       "min                                 0.800000                    0.000000   \n",
+       "25%                                 1.000000                    0.000000   \n",
+       "50%                                 1.000000                    0.000000   \n",
+       "75%                                 1.000000                    1.000000   \n",
+       "max                                 1.000000                    1.000000   \n",
+       "\n",
+       "        feedback.off_topic_similarity  \\\n",
+       "count                            27.0   \n",
+       "unique                            NaN   \n",
+       "top                               NaN   \n",
+       "freq                              NaN   \n",
+       "mean                              0.0   \n",
+       "std                               0.0   \n",
+       "min                               0.0   \n",
+       "25%                               0.0   \n",
+       "50%                               0.0   \n",
+       "75%                               0.0   \n",
+       "max                               0.0   \n",
+       "\n",
+       "        feedback.programming_language_similarity error  execution_time  \n",
+       "count                                  27.000000     0       27.000000  \n",
+       "unique                                       NaN     0             NaN  \n",
+       "top                                          NaN   NaN             NaN  \n",
+       "freq                                         NaN   NaN             NaN  \n",
+       "mean                                    0.444444   NaN       11.570401  \n",
+       "std                                     0.506370   NaN        1.648157  \n",
+       "min                                     0.000000   NaN        8.821772  \n",
+       "25%                                     0.000000   NaN       10.545821  \n",
+       "50%                                     0.000000   NaN       11.427731  \n",
+       "75%                                     1.000000   NaN       12.390761  \n",
+       "max                                     1.000000   NaN       17.776214  "
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "claude_test_run = client.run_on_dataset(\n",
+    "    dataset_name=task.name,\n",
+    "    llm_or_chain_factory=claude_extraction_chain,\n",
+    "    evaluation=eval_config,\n",
+    "    verbose=True,\n",
+    "    project_name=f\"claude-2-json-schema-to-xml-{uid}\",\n",
+    "    project_metadata={\n",
+    "        \"arch\": \"claude-json-schema-xml-output\",\n",
+    "    },\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5d34455c-e9d3-4fb0-b8d7-a3ee4a4b6ae0",
+   "metadata": {},
+   "source": [
+    "So it looks like edit distance is pretty good, but the schema validation leaves something to be desired.\n",
+    "\n",
+    "We're defining the schema as JSON Schema but asking for XML output. Let's try describing the schema in XML as well, so the two formats match."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a9612d56-08a1-4f24-a961-af7f7916997d",
+   "metadata": {},
+   "source": [
+    "## Try with XSD Schema Definition\n",
+    "\n",
+    "In this variant, let's see if Claude performs better if we keep our structure consistent."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "b9914571-d3f2-4f48-bdbb-2dfcfb03f26d",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from typing import Any, Dict, List, Type\n",
+    "\n",
+    "from langchain.chat_models import ChatAnthropic\n",
+    "from langchain.output_parsers.xml import XMLOutputParser\n",
+    "from langchain.prompts import ChatPromptTemplate\n",
+    "from langchain.pydantic_v1 import BaseModel\n",
+    "\n",
+    "# This is the schema the model will populate\n",
+    "xsd = \"\"\"<xs:schema xmlns:xs=\"http://www.w3.org/2001/XMLSchema\">\n",
+    "\n",
+    "    <xs:simpleType name=\"QuestionCategory\">\n",
+    "        <xs:restriction base=\"xs:string\">\n",
+    "            <xs:enumeration value=\"Implementation Issues\"/>\n",
+    "            <xs:enumeration value=\"Feature Requests\"/>\n",
+    "            <xs:enumeration value=\"Concept Explanations\"/>\n",
+    "            <xs:enumeration value=\"Code Optimization\"/>\n",
+    "            <xs:enumeration value=\"Security and Privacy Concerns\"/>\n",
+    "            <xs:enumeration value=\"Model Training and Fine-tuning\"/>\n",
+    "            <xs:enumeration value=\"Data Handling and Manipulation\"/>\n",
+    "            <xs:enumeration value=\"User Interaction Flow\"/>\n",
+    "            <xs:enumeration value=\"Technical Integration\"/>\n",
+    "            <xs:enumeration value=\"Error Handling and Logging\"/>\n",
+    "            <xs:enumeration value=\"Customization and Configuration\"/>\n",
+    "            <xs:enumeration value=\"External API and Data Source Integration\"/>\n",
+    "            <xs:enumeration value=\"Language and Localization\"/>\n",
+    "            <xs:enumeration value=\"Streaming and Real-time Processing\"/>\n",
+    "            <xs:enumeration value=\"Tool Development\"/>\n",
+    "            <xs:enumeration value=\"Function Calling\"/>\n",
+    "            <xs:enumeration value=\"LLM Integrations\"/>\n",
+    "            <xs:enumeration value=\"General Agent Questions\"/>\n",
+    "            <xs:enumeration value=\"General Chit Chat\"/>\n",
+    "            <xs:enumeration value=\"Memory\"/>\n",
+    "            <xs:enumeration value=\"Debugging Help\"/>\n",
+    "            <xs:enumeration value=\"Application Design\"/>\n",
+    "            <xs:enumeration value=\"Prompt Templates\"/>\n",
+    "            <xs:enumeration value=\"Cost Tracking\"/>\n",
+    "            <xs:enumeration value=\"Other\"/>\n",
+    "        </xs:restriction>\n",
+    "    </xs:simpleType>\n",
+    "\n",
+    "    <xs:simpleType name=\"Sentiment\">\n",
+    "        <xs:restriction base=\"xs:string\">\n",
+    "            <xs:enumeration value=\"Negative\"/>\n",
+    "            <xs:enumeration value=\"Neutral\"/>\n",
+    "            <xs:enumeration value=\"Positive\"/>\n",
+    "        </xs:restriction>\n",
+    "    </xs:simpleType>\n",
+    "\n",
+    "    <xs:simpleType name=\"ProgrammingLanguage\">\n",
+    "        <xs:restriction base=\"xs:string\">\n",
+    "            <xs:enumeration value=\"python\"/>\n",
+    "            <xs:enumeration value=\"javascript\"/>\n",
+    "            <xs:enumeration value=\"typescript\"/>\n",
+    "            <xs:enumeration value=\"unknown\"/>\n",
+    "            <xs:enumeration value=\"other\"/>\n",
+    "        </xs:restriction>\n",
+    "    </xs:simpleType>\n",
+    "\n",
+    "    <xs:complexType name=\"QuestionCategorization\">\n",
+    "        <xs:sequence>\n",
+    "            <xs:element name=\"question_category\" type=\"QuestionCategory\"/>\n",
+    "            <xs:element name=\"category_if_other\" type=\"xs:string\" minOccurs=\"0\"/>\n",
+    "            <xs:element name=\"is_off_topic\" type=\"xs:boolean\"/>\n",
+    "            <xs:element name=\"toxicity\" type=\"xs:int\">\n",
+    "                <xs:minInclusive value=\"0\"/>\n",
+    "                <xs:maxInclusive value=\"5\"/>\n",
+    "            </xs:element>\n",
+    "            <xs:element name=\"sentiment\" type=\"Sentiment\"/>\n",
+    "            <xs:element name=\"programming_language\" type=\"ProgrammingLanguage\"/>\n",
+    "        </xs:sequence>\n",
+    "    </xs:complexType>\n",
+    "\n",
+    "    <xs:simpleType name=\"ResponseType\">\n",
+    "        <xs:restriction base=\"xs:string\">\n",
+    "            <xs:enumeration value=\"resolve issue\"/>\n",
+    "            <xs:enumeration value=\"provide guidance\"/>\n",
+    "            <xs:enumeration value=\"request information\"/>\n",
+    "            <xs:enumeration value=\"give up\"/>\n",
+    "            <xs:enumeration value=\"none\"/>\n",
+    "            <xs:enumeration value=\"other\"/>\n",
+    "        </xs:restriction>\n",
+    "    </xs:simpleType>\n",
+    "\n",
+    "    <xs:complexType name=\"ResponseCategorization\">\n",
+    "        <xs:sequence>\n",
+    "            <xs:element name=\"response_type\" type=\"ResponseType\"/>\n",
+    "            <xs:element name=\"response_type_if_other\" type=\"xs:string\" minOccurs=\"0\"/>\n",
+    "            <xs:element name=\"confidence_level\" type=\"xs:int\">\n",
+    "                <xs:minInclusive value=\"0\"/>\n",
+    "                <xs:maxInclusive value=\"5\"/>\n",
+    "            </xs:element>\n",
+    "            <xs:element name=\"followup_actions\" type=\"xs:string\" minOccurs=\"0\" maxOccurs=\"unbounded\"/>\n",
+    "        </xs:sequence>\n",
+    "    </xs:complexType>\n",
+    "\n",
+    "    <xs:complexType name=\"GenerateTicket\">\n",
+    "        <xs:sequence>\n",
+    "            <xs:element name=\"issue_summary\" type=\"xs:string\"/>\n",
+    "            <xs:element name=\"question\" type=\"QuestionCategorization\"/>\n",
+    "            <xs:element name=\"response\" type=\"ResponseCategorization\"/>\n",
+    "        </xs:sequence>\n",
+    "    </xs:complexType>\n",
+    "\n",
+    "</xs:schema>\"\"\"\n",
+    "\n",
+    "prompt = claude_prompt.partial(schema=xsd, function_call=task.schema.schema()[\"title\"])\n",
+    "\n",
+    "claude_extraction_chain = format_run | prompt | claude | try_parse"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "26dc6d70-b745-4fd3-9592-1a13a3f2751f",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'output': {'issue_summary': 'How to run Llama locally',\n",
+       "  'question': {'question_category': 'LLM Integrations',\n",
+       "   'is_off_topic': 'false',\n",
+       "   'toxicity': '0',\n",
+       "   'sentiment': 'Neutral',\n",
+       "   'programming_language': 'unknown'},\n",
+       "  'response': {'response_type': 'provide guidance',\n",
+       "   'confidence_level': '3',\n",
+       "   'followup_actions': ['Install Llama locally', 'Add Llama to path']}}}"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "result = claude_extraction_chain.invoke(\n",
+    "    {\n",
+    "        \"question\": \"how do i run llama 2 locally?\",\n",
+    "        \"answer\": \"Llama.cpp of course. Afterwords remember to install it, then add it to your path!\",\n",
+    "    }\n",
+    ")\n",
+    "result"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "f8d58656-108d-48d2-ba16-815fc9bdebcc",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "View the evaluation results for project 'claude-2-xsd-to-xml-af10' at:\n",
+      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/08042749-504d-4509-9549-5f5c579115f6/compare?selectedSessions=07edf03d-97b9-42a8-acde-9a6e9facb388\n",
+      "\n",
+      "View all tests for Dataset Chat Extraction at:\n",
+      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/08042749-504d-4509-9549-5f5c579115f6\n",
+      "[------------------------------------------------->] 27/27"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<h3>Experiment Results:</h3>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>feedback.json_edit_distance</th>\n",
+       "      <th>feedback.json_schema</th>\n",
+       "      <th>feedback.toxicity_similarity</th>\n",
+       "      <th>feedback.sentiment_similarity</th>\n",
+       "      <th>feedback.confidence_level_similarity</th>\n",
+       "      <th>feedback.question_category</th>\n",
+       "      <th>feedback.off_topic_similarity</th>\n",
+       "      <th>feedback.programming_language_similarity</th>\n",
+       "      <th>error</th>\n",
+       "      <th>execution_time</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>count</th>\n",
+       "      <td>27.000000</td>\n",
+       "      <td>27.000000</td>\n",
+       "      <td>27.0</td>\n",
+       "      <td>27.000000</td>\n",
+       "      <td>27.000000</td>\n",
+       "      <td>27.000000</td>\n",
+       "      <td>27.0</td>\n",
+       "      <td>27.000000</td>\n",
+       "      <td>0</td>\n",
+       "      <td>27.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>unique</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>top</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>freq</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>mean</th>\n",
+       "      <td>0.391835</td>\n",
+       "      <td>0.518519</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.907407</td>\n",
+       "      <td>0.970370</td>\n",
+       "      <td>0.370370</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.518519</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>10.930946</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>std</th>\n",
+       "      <td>0.097901</td>\n",
+       "      <td>0.509175</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.197924</td>\n",
+       "      <td>0.072403</td>\n",
+       "      <td>0.492103</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.509175</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1.594109</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>min</th>\n",
+       "      <td>0.116608</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.500000</td>\n",
+       "      <td>0.800000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>8.416739</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>25%</th>\n",
+       "      <td>0.348812</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>9.813120</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>50%</th>\n",
+       "      <td>0.379653</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>10.371725</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>75%</th>\n",
+       "      <td>0.425574</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>11.964592</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>max</th>\n",
+       "      <td>0.644007</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>14.291423</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "        feedback.json_edit_distance  feedback.json_schema  \\\n",
+       "count                     27.000000             27.000000   \n",
+       "unique                          NaN                   NaN   \n",
+       "top                             NaN                   NaN   \n",
+       "freq                            NaN                   NaN   \n",
+       "mean                       0.391835              0.518519   \n",
+       "std                        0.097901              0.509175   \n",
+       "min                        0.116608              0.000000   \n",
+       "25%                        0.348812              0.000000   \n",
+       "50%                        0.379653              1.000000   \n",
+       "75%                        0.425574              1.000000   \n",
+       "max                        0.644007              1.000000   \n",
+       "\n",
+       "        feedback.toxicity_similarity  feedback.sentiment_similarity  \\\n",
+       "count                           27.0                      27.000000   \n",
+       "unique                           NaN                            NaN   \n",
+       "top                              NaN                            NaN   \n",
+       "freq                             NaN                            NaN   \n",
+       "mean                             1.0                       0.907407   \n",
+       "std                              0.0                       0.197924   \n",
+       "min                              1.0                       0.500000   \n",
+       "25%                              1.0                       1.000000   \n",
+       "50%                              1.0                       1.000000   \n",
+       "75%                              1.0                       1.000000   \n",
+       "max                              1.0                       1.000000   \n",
+       "\n",
+       "        feedback.confidence_level_similarity  feedback.question_category  \\\n",
+       "count                              27.000000                   27.000000   \n",
+       "unique                                   NaN                         NaN   \n",
+       "top                                      NaN                         NaN   \n",
+       "freq                                     NaN                         NaN   \n",
+       "mean                                0.970370                    0.370370   \n",
+       "std                                 0.072403                    0.492103   \n",
+       "min                                 0.800000                    0.000000   \n",
+       "25%                                 1.000000                    0.000000   \n",
+       "50%                                 1.000000                    0.000000   \n",
+       "75%                                 1.000000                    1.000000   \n",
+       "max                                 1.000000                    1.000000   \n",
+       "\n",
+       "        feedback.off_topic_similarity  \\\n",
+       "count                            27.0   \n",
+       "unique                            NaN   \n",
+       "top                               NaN   \n",
+       "freq                              NaN   \n",
+       "mean                              0.0   \n",
+       "std                               0.0   \n",
+       "min                               0.0   \n",
+       "25%                               0.0   \n",
+       "50%                               0.0   \n",
+       "75%                               0.0   \n",
+       "max                               0.0   \n",
+       "\n",
+       "        feedback.programming_language_similarity error  execution_time  \n",
+       "count                                  27.000000     0       27.000000  \n",
+       "unique                                       NaN     0             NaN  \n",
+       "top                                          NaN   NaN             NaN  \n",
+       "freq                                         NaN   NaN             NaN  \n",
+       "mean                                    0.518519   NaN       10.930946  \n",
+       "std                                     0.509175   NaN        1.594109  \n",
+       "min                                     0.000000   NaN        8.416739  \n",
+       "25%                                     0.000000   NaN        9.813120  \n",
+       "50%                                     1.000000   NaN       10.371725  \n",
+       "75%                                     1.000000   NaN       11.964592  \n",
+       "max                                     1.000000   NaN       14.291423  "
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "claude_xsd_test_run = client.run_on_dataset(\n",
+    "    dataset_name=task.name,\n",
+    "    llm_or_chain_factory=claude_extraction_chain,\n",
+    "    evaluation=eval_config,\n",
+    "    verbose=True,\n",
+    "    project_name=f\"claude-2-xsd-to-xml-{uid}\",\n",
+    "    project_metadata={\n",
+    "        \"arch\": \"claude-xml\",\n",
+    "    },\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3df7ce82-73a7-4913-9569-1066d982b528",
+   "metadata": {},
+   "source": [
+    "The `json_schema` metric went down, which means that, counter-intuitively, the output is less friendly to our parser than before.\n",
+    "\n",
+    "\n",
+    "Let's try with an open-source model: `llama-v2-34b-code-instruct`."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "102df41d-2c93-4ffc-a09a-4198ea5b6acc",
+   "metadata": {},
+   "source": [
+    "## Try with Llama 2\n",
+    "\n",
+    "`llama-v2-34b-code-instruct` is an open-source model that is meant to be good at both code generation and other tasks.\n",
+    "Let's benchmark it."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "27cc37f1-2dc3-4d8e-a380-3c8296bf105a",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "\n",
+    "from langchain.chat_models import ChatFireworks\n",
+    "from langchain.output_parsers.json import parse_json_markdown\n",
+    "from langchain.schema.output_parser import StrOutputParser\n",
+    "\n",
+    "llama_prompt = ChatPromptTemplate.from_messages(\n",
+    "    [\n",
+    "        (\n",
+    "            \"system\",\n",
+    "            \"You are a data extraction bot tasked with extracting and inferring information from dialogues and generating tickets. Always respond \"\n",
+    "            \"only with json based on the following JSON schema:\\n{schema}\",\n",
+    "        ),\n",
+    "        (\n",
+    "            \"user\",\n",
+    "            \"Generate a ticket from the following question-response pair:\\n\"\n",
+    "            \"<Dialogue>\\n{dialogue}\\n</Dialogue>\\n\"\n",
+    "            \"Remember, respond directly with this format:\\n\"\n",
+    "            '{{\"{function_call}\": ...}}\\n'\n",
+    "            \"RESPOND ONLY IN JSON THEN STOP.\",\n",
+    "        ),\n",
+    "    ]\n",
+    ")\n",
+    "\n",
+    "prompt = llama_prompt.partial(\n",
+    "    schema=task.schema.schema_json(), function_call=task.schema.schema()[\"title\"]\n",
+    ")\n",
+    "\n",
+    "llm = ChatFireworks(\n",
+    "    model=\"accounts/fireworks/models/llama-v2-34b-code-instruct\",\n",
+    "    temperature=0,\n",
+    "    model_kwargs={\"max_tokens\": 4000},\n",
+    ")\n",
+    "\n",
+    "\n",
+    "def parse_output(ai_message):\n",
+    "    \"\"\"Turn the chat model's reply into the ``{\"output\": ...}`` dict the chain returns.\n",
+    "\n",
+    "    Args:\n",
+    "        ai_message: Chat message whose ``content`` attribute holds the reply text.\n",
+    "\n",
+    "    Returns:\n",
+    "        ``{\"output\": value}``, where ``value`` is the parsed JSON with a top-level\n",
+    "        ``\"GenerateTicket\"`` wrapper removed when present, or the raw reply text\n",
+    "        when the reply cannot be parsed as JSON (for example, when the model puts\n",
+    "        prose in front of the JSON object).\n",
+    "    \"\"\"\n",
+    "    content = ai_message.content\n",
+    "\n",
+    "    def lenient_loads(text):\n",
+    "        # strict=False accepts raw control characters (e.g. newlines) inside strings.\n",
+    "        return json.loads(text, strict=False)\n",
+    "\n",
+    "    try:\n",
+    "        parsed = parse_json_markdown(content, parser=lenient_loads)\n",
+    "    except json.JSONDecodeError:\n",
+    "        # Not parseable as JSON: hand the raw text to the evaluators unchanged.\n",
+    "        return {\"output\": content}\n",
+    "    # Only a JSON object can carry the wrapper key. Checking the type first avoids a\n",
+    "    # TypeError on scalar replies and an accidental substring match on string replies.\n",
+    "    if isinstance(parsed, dict) and \"GenerateTicket\" in parsed:\n",
+    "        return {\"output\": parsed[\"GenerateTicket\"]}\n",
+    "    return {\"output\": parsed}\n",
+    "\n",
+    "\n",
+    "fireworks_extraction_chain = format_run | prompt | llm | parse_output"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "266e2273-2fd7-42c2-986b-c08a07cbcc96",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'output': 'Here is the generated ticket:\\n{\"GenerateTicket\": {\"issue_summary\": \"Running Llama 2 locally\", \"question\": {\"QuestionCategorization\": {\"question_category\": \"Technical Integration\", \"is_off_topic\": false, \"sentiment\": \"Neutral\", \"programming_language\": \"unknown\"}}}}'}"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "result = fireworks_extraction_chain.invoke(\n",
+    "    {\"question\": \"how do i run llama 2 locally?\", \"answer\": \"Llama.cpp of course.\"}\n",
+    ")\n",
+    "result"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "9f4f4b39-d1b0-4f89-aa09-4fe261296dbc",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "View the evaluation results for project 'llama-v2-34b-code-instruct-af10' at:\n",
+      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/08042749-504d-4509-9549-5f5c579115f6/compare?selectedSessions=53631e16-bdb2-4d53-ae8f-10cde961614e\n",
+      "\n",
+      "View all tests for Dataset Chat Extraction at:\n",
+      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/08042749-504d-4509-9549-5f5c579115f6\n",
+      "[------------------------------------------------->] 27/27"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<h3>Experiment Results:</h3>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>feedback.json_edit_distance</th>\n",
+       "      <th>feedback.json_schema</th>\n",
+       "      <th>feedback.toxicity_similarity</th>\n",
+       "      <th>feedback.sentiment_similarity</th>\n",
+       "      <th>feedback.confidence_level_similarity</th>\n",
+       "      <th>feedback.question_category</th>\n",
+       "      <th>feedback.off_topic_similarity</th>\n",
+       "      <th>feedback.programming_language_similarity</th>\n",
+       "      <th>error</th>\n",
+       "      <th>execution_time</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>count</th>\n",
+       "      <td>23.000000</td>\n",
+       "      <td>27.000000</td>\n",
+       "      <td>27.000000</td>\n",
+       "      <td>27.000000</td>\n",
+       "      <td>27.000000</td>\n",
+       "      <td>27.000000</td>\n",
+       "      <td>27.000000</td>\n",
+       "      <td>27.000000</td>\n",
+       "      <td>0</td>\n",
+       "      <td>27.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>unique</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>top</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>freq</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>mean</th>\n",
+       "      <td>0.378524</td>\n",
+       "      <td>0.407407</td>\n",
+       "      <td>0.592593</td>\n",
+       "      <td>0.574074</td>\n",
+       "      <td>0.681481</td>\n",
+       "      <td>0.148148</td>\n",
+       "      <td>0.666667</td>\n",
+       "      <td>0.481481</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>4.310952</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>std</th>\n",
+       "      <td>0.127190</td>\n",
+       "      <td>0.500712</td>\n",
+       "      <td>0.500712</td>\n",
+       "      <td>0.359051</td>\n",
+       "      <td>0.389316</td>\n",
+       "      <td>0.362014</td>\n",
+       "      <td>0.480384</td>\n",
+       "      <td>0.509175</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0.591779</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>min</th>\n",
+       "      <td>0.089130</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>3.046112</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>25%</th>\n",
+       "      <td>0.309743</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.500000</td>\n",
+       "      <td>0.500000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>3.983488</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>50%</th>\n",
+       "      <td>0.352751</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>0.500000</td>\n",
+       "      <td>0.800000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>4.326160</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>75%</th>\n",
+       "      <td>0.468417</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>4.687441</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>max</th>\n",
+       "      <td>0.659091</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>5.713148</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "        feedback.json_edit_distance  feedback.json_schema  \\\n",
+       "count                     23.000000             27.000000   \n",
+       "unique                          NaN                   NaN   \n",
+       "top                             NaN                   NaN   \n",
+       "freq                            NaN                   NaN   \n",
+       "mean                       0.378524              0.407407   \n",
+       "std                        0.127190              0.500712   \n",
+       "min                        0.089130              0.000000   \n",
+       "25%                        0.309743              0.000000   \n",
+       "50%                        0.352751              0.000000   \n",
+       "75%                        0.468417              1.000000   \n",
+       "max                        0.659091              1.000000   \n",
+       "\n",
+       "        feedback.toxicity_similarity  feedback.sentiment_similarity  \\\n",
+       "count                      27.000000                      27.000000   \n",
+       "unique                           NaN                            NaN   \n",
+       "top                              NaN                            NaN   \n",
+       "freq                             NaN                            NaN   \n",
+       "mean                        0.592593                       0.574074   \n",
+       "std                         0.500712                       0.359051   \n",
+       "min                         0.000000                       0.000000   \n",
+       "25%                         0.000000                       0.500000   \n",
+       "50%                         1.000000                       0.500000   \n",
+       "75%                         1.000000                       1.000000   \n",
+       "max                         1.000000                       1.000000   \n",
+       "\n",
+       "        feedback.confidence_level_similarity  feedback.question_category  \\\n",
+       "count                              27.000000                   27.000000   \n",
+       "unique                                   NaN                         NaN   \n",
+       "top                                      NaN                         NaN   \n",
+       "freq                                     NaN                         NaN   \n",
+       "mean                                0.681481                    0.148148   \n",
+       "std                                 0.389316                    0.362014   \n",
+       "min                                 0.000000                    0.000000   \n",
+       "25%                                 0.500000                    0.000000   \n",
+       "50%                                 0.800000                    0.000000   \n",
+       "75%                                 1.000000                    0.000000   \n",
+       "max                                 1.000000                    1.000000   \n",
+       "\n",
+       "        feedback.off_topic_similarity  \\\n",
+       "count                       27.000000   \n",
+       "unique                            NaN   \n",
+       "top                               NaN   \n",
+       "freq                              NaN   \n",
+       "mean                         0.666667   \n",
+       "std                          0.480384   \n",
+       "min                          0.000000   \n",
+       "25%                          0.000000   \n",
+       "50%                          1.000000   \n",
+       "75%                          1.000000   \n",
+       "max                          1.000000   \n",
+       "\n",
+       "        feedback.programming_language_similarity error  execution_time  \n",
+       "count                                  27.000000     0       27.000000  \n",
+       "unique                                       NaN     0             NaN  \n",
+       "top                                          NaN   NaN             NaN  \n",
+       "freq                                         NaN   NaN             NaN  \n",
+       "mean                                    0.481481   NaN        4.310952  \n",
+       "std                                     0.509175   NaN        0.591779  \n",
+       "min                                     0.000000   NaN        3.046112  \n",
+       "25%                                     0.000000   NaN        3.983488  \n",
+       "50%                                     0.000000   NaN        4.326160  \n",
+       "75%                                     1.000000   NaN        4.687441  \n",
+       "max                                     1.000000   NaN        5.713148  "
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# Benchmark the Fireworks-hosted Llama chain on the same dataset and evaluators.\n",
+    "llama_v2_test_run = client.run_on_dataset(\n",
+    "    dataset_name=task.name,\n",
+    "    llm_or_chain_factory=fireworks_extraction_chain,\n",
+    "    evaluation=eval_config,\n",
+    "    verbose=True,\n",
+    "    project_name=f\"llama-v2-34b-code-instruct-{uid}\",\n",
+    "    # Tag the run with this chain's own architecture (plain JSON prompting) so it is\n",
+    "    # not grouped with the Claude XML runs when filtering projects by metadata.\n",
+    "    project_metadata={\"arch\": \"llama-json\", \"model\": \"llama-v2-34b-code-instruct\"},\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1b039225-01cf-481a-87a6-4e880e9b1dcd",
+   "metadata": {},
+   "source": [
+    "## Compare Results\n",
+    "\n",
+    "Here, we'll take a closer look at the underlying results. You can review them to compare the models' performance both in aggregate and on a per-example basis."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "6eb19db1-43b8-4866-a3d2-f211ba92ab8b",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "df = (\n",
+    "    test_run.to_dataframe()\n",
+    "    .join(claude_test_run.to_dataframe(), rsuffix=\"_claude\")\n",
+    "    .join(claude_xsd_test_run.to_dataframe(), rsuffix=\"_claude_xsd\")\n",
+    "    .join(llama_v2_test_run.to_dataframe(), rsuffix=\"_llama_v2\")\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "c292b4ed-8331-4068-82fa-7cea2725e24d",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>inputs.answer</th>\n",
+       "      <th>inputs.question</th>\n",
+       "      <th>outputs.output</th>\n",
+       "      <th>reference.output</th>\n",
+       "      <th>feedback.json_edit_distance</th>\n",
+       "      <th>feedback.json_schema</th>\n",
+       "      <th>feedback.toxicity_similarity</th>\n",
+       "      <th>feedback.sentiment_similarity</th>\n",
+       "      <th>feedback.confidence_level_similarity</th>\n",
+       "      <th>feedback.question_category</th>\n",
+       "      <th>...</th>\n",
+       "      <th>feedback.json_edit_distance_llama_v2</th>\n",
+       "      <th>feedback.json_schema_llama_v2</th>\n",
+       "      <th>feedback.toxicity_similarity_llama_v2</th>\n",
+       "      <th>feedback.sentiment_similarity_llama_v2</th>\n",
+       "      <th>feedback.confidence_level_similarity_llama_v2</th>\n",
+       "      <th>feedback.question_category_llama_v2</th>\n",
+       "      <th>feedback.off_topic_similarity_llama_v2</th>\n",
+       "      <th>feedback.programming_language_similarity_llama_v2</th>\n",
+       "      <th>error_llama_v2</th>\n",
+       "      <th>execution_time_llama_v2</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>23a81130-2ad9-46cf-ad27-46589bcea94a</th>\n",
+       "      <td>Pour joindre les deux outputs, vous pouvez uti...</td>\n",
+       "      <td>je travail sur python. je souhaite joindre ces...</td>\n",
+       "      <td>{'issue_summary': 'Joining two outputs in Pyth...</td>\n",
+       "      <td>{'question': {'toxicity': 0, 'sentiment': 'Neu...</td>\n",
+       "      <td>0.089219</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0.477690</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.5</td>\n",
+       "      <td>0.8</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>None</td>\n",
+       "      <td>4.100867</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>598316ec-f5e2-4b4d-83a8-36adb18e12fe</th>\n",
+       "      <td>Hmm, I'm not sure.</td>\n",
+       "      <td>example for dalle agent</td>\n",
+       "      <td>{'issue_summary': 'Example for DALL-E Agent', ...</td>\n",
+       "      <td>{'question': {'toxicity': 0, 'sentiment': 'Neu...</td>\n",
+       "      <td>0.174905</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.8</td>\n",
+       "      <td>0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0.346749</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.4</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>None</td>\n",
+       "      <td>3.653370</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>d1a1a2e8-6f4c-4325-8aaa-ea20e2449268</th>\n",
+       "      <td>To run Llama2 using pandas, you can follow the...</td>\n",
+       "      <td>how do I run llama2 using pandas</td>\n",
+       "      <td>{'issue_summary': 'Running Llama2 with Pandas'...</td>\n",
+       "      <td>{'question': {'toxicity': 0, 'sentiment': 'Neu...</td>\n",
+       "      <td>0.222621</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0.262118</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.5</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>None</td>\n",
+       "      <td>4.507702</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>140a4819-0046-469d-b4df-8e747ddae112</th>\n",
+       "      <td>To clear the conversation in ConversationalRet...</td>\n",
+       "      <td>if Im useing ConversationalRetrievalChain how ...</td>\n",
+       "      <td>{'issue_summary': 'Clearing Conversation in Co...</td>\n",
+       "      <td>{'question': {'toxicity': 0, 'sentiment': 'Neu...</td>\n",
+       "      <td>0.353261</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0.279330</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>None</td>\n",
+       "      <td>3.654116</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7b0a9dd9-68ce-41a1-9f9d-067d93175477</th>\n",
+       "      <td>To perform the task of creating an app that in...</td>\n",
+       "      <td>I want to create an app which:\\n- chats with u...</td>\n",
+       "      <td>{'issue_summary': 'Building an app with Langch...</td>\n",
+       "      <td>{'question': {'toxicity': 0, 'sentiment': 'Neu...</td>\n",
+       "      <td>0.562950</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.8</td>\n",
+       "      <td>1</td>\n",
+       "      <td>...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>None</td>\n",
+       "      <td>4.666831</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>5 rows × 56 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                                          inputs.answer  \\\n",
+       "23a81130-2ad9-46cf-ad27-46589bcea94a  Pour joindre les deux outputs, vous pouvez uti...   \n",
+       "598316ec-f5e2-4b4d-83a8-36adb18e12fe                                 Hmm, I'm not sure.   \n",
+       "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268  To run Llama2 using pandas, you can follow the...   \n",
+       "140a4819-0046-469d-b4df-8e747ddae112  To clear the conversation in ConversationalRet...   \n",
+       "7b0a9dd9-68ce-41a1-9f9d-067d93175477  To perform the task of creating an app that in...   \n",
+       "\n",
+       "                                                                        inputs.question  \\\n",
+       "23a81130-2ad9-46cf-ad27-46589bcea94a  je travail sur python. je souhaite joindre ces...   \n",
+       "598316ec-f5e2-4b4d-83a8-36adb18e12fe                            example for dalle agent   \n",
+       "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268                   how do I run llama2 using pandas   \n",
+       "140a4819-0046-469d-b4df-8e747ddae112  if Im useing ConversationalRetrievalChain how ...   \n",
+       "7b0a9dd9-68ce-41a1-9f9d-067d93175477  I want to create an app which:\\n- chats with u...   \n",
+       "\n",
+       "                                                                         outputs.output  \\\n",
+       "23a81130-2ad9-46cf-ad27-46589bcea94a  {'issue_summary': 'Joining two outputs in Pyth...   \n",
+       "598316ec-f5e2-4b4d-83a8-36adb18e12fe  {'issue_summary': 'Example for DALL-E Agent', ...   \n",
+       "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268  {'issue_summary': 'Running Llama2 with Pandas'...   \n",
+       "140a4819-0046-469d-b4df-8e747ddae112  {'issue_summary': 'Clearing Conversation in Co...   \n",
+       "7b0a9dd9-68ce-41a1-9f9d-067d93175477  {'issue_summary': 'Building an app with Langch...   \n",
+       "\n",
+       "                                                                       reference.output  \\\n",
+       "23a81130-2ad9-46cf-ad27-46589bcea94a  {'question': {'toxicity': 0, 'sentiment': 'Neu...   \n",
+       "598316ec-f5e2-4b4d-83a8-36adb18e12fe  {'question': {'toxicity': 0, 'sentiment': 'Neu...   \n",
+       "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268  {'question': {'toxicity': 0, 'sentiment': 'Neu...   \n",
+       "140a4819-0046-469d-b4df-8e747ddae112  {'question': {'toxicity': 0, 'sentiment': 'Neu...   \n",
+       "7b0a9dd9-68ce-41a1-9f9d-067d93175477  {'question': {'toxicity': 0, 'sentiment': 'Neu...   \n",
+       "\n",
+       "                                      feedback.json_edit_distance  \\\n",
+       "23a81130-2ad9-46cf-ad27-46589bcea94a                     0.089219   \n",
+       "598316ec-f5e2-4b4d-83a8-36adb18e12fe                     0.174905   \n",
+       "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268                     0.222621   \n",
+       "140a4819-0046-469d-b4df-8e747ddae112                     0.353261   \n",
+       "7b0a9dd9-68ce-41a1-9f9d-067d93175477                     0.562950   \n",
+       "\n",
+       "                                      feedback.json_schema  \\\n",
+       "23a81130-2ad9-46cf-ad27-46589bcea94a                     1   \n",
+       "598316ec-f5e2-4b4d-83a8-36adb18e12fe                     1   \n",
+       "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268                     1   \n",
+       "140a4819-0046-469d-b4df-8e747ddae112                     1   \n",
+       "7b0a9dd9-68ce-41a1-9f9d-067d93175477                     1   \n",
+       "\n",
+       "                                      feedback.toxicity_similarity  \\\n",
+       "23a81130-2ad9-46cf-ad27-46589bcea94a                             0   \n",
+       "598316ec-f5e2-4b4d-83a8-36adb18e12fe                             0   \n",
+       "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268                             0   \n",
+       "140a4819-0046-469d-b4df-8e747ddae112                             0   \n",
+       "7b0a9dd9-68ce-41a1-9f9d-067d93175477                             0   \n",
+       "\n",
+       "                                      feedback.sentiment_similarity  \\\n",
+       "23a81130-2ad9-46cf-ad27-46589bcea94a                            1.0   \n",
+       "598316ec-f5e2-4b4d-83a8-36adb18e12fe                            1.0   \n",
+       "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268                            1.0   \n",
+       "140a4819-0046-469d-b4df-8e747ddae112                            1.0   \n",
+       "7b0a9dd9-68ce-41a1-9f9d-067d93175477                            1.0   \n",
+       "\n",
+       "                                      feedback.confidence_level_similarity  \\\n",
+       "23a81130-2ad9-46cf-ad27-46589bcea94a                                   1.0   \n",
+       "598316ec-f5e2-4b4d-83a8-36adb18e12fe                                   0.8   \n",
+       "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268                                   1.0   \n",
+       "140a4819-0046-469d-b4df-8e747ddae112                                   1.0   \n",
+       "7b0a9dd9-68ce-41a1-9f9d-067d93175477                                   0.8   \n",
+       "\n",
+       "                                      feedback.question_category  ...  \\\n",
+       "23a81130-2ad9-46cf-ad27-46589bcea94a                           1  ...   \n",
+       "598316ec-f5e2-4b4d-83a8-36adb18e12fe                           0  ...   \n",
+       "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268                           0  ...   \n",
+       "140a4819-0046-469d-b4df-8e747ddae112                           0  ...   \n",
+       "7b0a9dd9-68ce-41a1-9f9d-067d93175477                           1  ...   \n",
+       "\n",
+       "                                      feedback.json_edit_distance_llama_v2  \\\n",
+       "23a81130-2ad9-46cf-ad27-46589bcea94a                              0.477690   \n",
+       "598316ec-f5e2-4b4d-83a8-36adb18e12fe                              0.346749   \n",
+       "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268                              0.262118   \n",
+       "140a4819-0046-469d-b4df-8e747ddae112                              0.279330   \n",
+       "7b0a9dd9-68ce-41a1-9f9d-067d93175477                                   NaN   \n",
+       "\n",
+       "                                      feedback.json_schema_llama_v2  \\\n",
+       "23a81130-2ad9-46cf-ad27-46589bcea94a                              1   \n",
+       "598316ec-f5e2-4b4d-83a8-36adb18e12fe                              1   \n",
+       "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268                              1   \n",
+       "140a4819-0046-469d-b4df-8e747ddae112                              1   \n",
+       "7b0a9dd9-68ce-41a1-9f9d-067d93175477                              0   \n",
+       "\n",
+       "                                     feedback.toxicity_similarity_llama_v2  \\\n",
+       "23a81130-2ad9-46cf-ad27-46589bcea94a                                   0.0   \n",
+       "598316ec-f5e2-4b4d-83a8-36adb18e12fe                                   1.0   \n",
+       "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268                                   1.0   \n",
+       "140a4819-0046-469d-b4df-8e747ddae112                                   1.0   \n",
+       "7b0a9dd9-68ce-41a1-9f9d-067d93175477                                   0.0   \n",
+       "\n",
+       "                                      feedback.sentiment_similarity_llama_v2  \\\n",
+       "23a81130-2ad9-46cf-ad27-46589bcea94a                                     0.5   \n",
+       "598316ec-f5e2-4b4d-83a8-36adb18e12fe                                     1.0   \n",
+       "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268                                     0.5   \n",
+       "140a4819-0046-469d-b4df-8e747ddae112                                     1.0   \n",
+       "7b0a9dd9-68ce-41a1-9f9d-067d93175477                                     0.0   \n",
+       "\n",
+       "                                     feedback.confidence_level_similarity_llama_v2  \\\n",
+       "23a81130-2ad9-46cf-ad27-46589bcea94a                                           0.8   \n",
+       "598316ec-f5e2-4b4d-83a8-36adb18e12fe                                           0.4   \n",
+       "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268                                           1.0   \n",
+       "140a4819-0046-469d-b4df-8e747ddae112                                           1.0   \n",
+       "7b0a9dd9-68ce-41a1-9f9d-067d93175477                                           0.0   \n",
+       "\n",
+       "                                     feedback.question_category_llama_v2  \\\n",
+       "23a81130-2ad9-46cf-ad27-46589bcea94a                                   0   \n",
+       "598316ec-f5e2-4b4d-83a8-36adb18e12fe                                   1   \n",
+       "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268                                   0   \n",
+       "140a4819-0046-469d-b4df-8e747ddae112                                   0   \n",
+       "7b0a9dd9-68ce-41a1-9f9d-067d93175477                                   0   \n",
+       "\n",
+       "                                     feedback.off_topic_similarity_llama_v2  \\\n",
+       "23a81130-2ad9-46cf-ad27-46589bcea94a                                      0   \n",
+       "598316ec-f5e2-4b4d-83a8-36adb18e12fe                                      1   \n",
+       "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268                                      1   \n",
+       "140a4819-0046-469d-b4df-8e747ddae112                                      1   \n",
+       "7b0a9dd9-68ce-41a1-9f9d-067d93175477                                      0   \n",
+       "\n",
+       "                                     feedback.programming_language_similarity_llama_v2  \\\n",
+       "23a81130-2ad9-46cf-ad27-46589bcea94a                                                 1   \n",
+       "598316ec-f5e2-4b4d-83a8-36adb18e12fe                                                 1   \n",
+       "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268                                                 1   \n",
+       "140a4819-0046-469d-b4df-8e747ddae112                                                 0   \n",
+       "7b0a9dd9-68ce-41a1-9f9d-067d93175477                                                 0   \n",
+       "\n",
+       "                                      error_llama_v2  execution_time_llama_v2  \n",
+       "23a81130-2ad9-46cf-ad27-46589bcea94a            None                 4.100867  \n",
+       "598316ec-f5e2-4b4d-83a8-36adb18e12fe            None                 3.653370  \n",
+       "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268            None                 4.507702  \n",
+       "140a4819-0046-469d-b4df-8e747ddae112            None                 3.654116  \n",
+       "7b0a9dd9-68ce-41a1-9f9d-067d93175477            None                 4.666831  \n",
+       "\n",
+       "[5 rows x 56 columns]"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.head(5)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "da665f3c-4ef6-474d-8ab5-284434060bec",
+   "metadata": {},
+   "source": [
+    "#### Here, we compare the aggregate metrics side-by-side"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "b5b936c2-d676-4931-bb13-ec06ab55d401",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Label every run's columns explicitly before joining. `join(..., rsuffix=...)`\n",
+    "# only renames columns whose names collide, so relying on it leaves the first\n",
+    "# joined frame (whose names no longer overlap the suffixed left frame) unlabeled.\n",
+    "df = (\n",
+    "    test_run.get_aggregate_feedback()\n",
+    "    .add_suffix(\".gpt-4\")\n",
+    "    .join(claude_test_run.get_aggregate_feedback().add_suffix(\".claude\"))\n",
+    "    .join(claude_xsd_test_run.get_aggregate_feedback().add_suffix(\".claude_xsd\"))\n",
+    "    .join(llama_v2_test_run.get_aggregate_feedback().add_suffix(\".llama_v2\"))\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "1a151781-9c69-43c3-84d7-5617ee0e7d63",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from IPython.display import HTML, display\n",
+    "\n",
+    "# A metric name is the first two dot-separated parts (\"feedback.<metric>\");\n",
+    "# whatever follows is a per-run suffix, which may be absent or contain dots.\n",
+    "# NOTE: assumes the metric keys themselves contain no dots.\n",
+    "feedback_columns = sorted(\n",
+    "    {\n",
+    "        \".\".join(col.split(\".\")[:2])\n",
+    "        for col in df.columns\n",
+    "        if col.startswith(\"feedback.\")\n",
+    "    }\n",
+    ")\n",
+    "\n",
+    "\n",
+    "def render_metric(df, metric):\n",
+    "    \"\"\"Display the \"mean\" and \"std\" rows of every column belonging to `metric`.\n",
+    "\n",
+    "    Args:\n",
+    "        df: DataFrame whose index labels include the statistics to show.\n",
+    "        metric: Column name without the run suffix, e.g. \"execution_time\".\n",
+    "    \"\"\"\n",
+    "    # Match the bare name or `<metric>.<suffix>` only, so a metric whose name is a\n",
+    "    # prefix of another one does not pull in the other metric's columns.\n",
+    "    sub_cols = [\n",
+    "        col for col in df.columns if col == metric or col.startswith(f\"{metric}.\")\n",
+    "    ]\n",
+    "    display(HTML(f\"<h3>{metric.split('.')[-1]}</h3>\"))\n",
+    "    # Select rows and columns in one .loc call instead of chained indexing.\n",
+    "    display(df.loc[df.index.isin([\"mean\", \"std\"]), sub_cols])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "97892d06-ac72-43fa-8e1e-ff33b284940d",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['feedback',\n",
+       " 'feedback.confidence_level_similarity',\n",
+       " 'feedback.json_edit_distance',\n",
+       " 'feedback.json_schema',\n",
+       " 'feedback.off_topic_similarity',\n",
+       " 'feedback.programming_language_similarity',\n",
+       " 'feedback.question_category',\n",
+       " 'feedback.sentiment_similarity',\n",
+       " 'feedback.toxicity_similarity']"
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "feedback_columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "090284d7-29b6-4ea7-b193-ebc159fae143",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<h3>execution_time</h3>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>execution_time.gpt-4</th>\n",
+       "      <th>execution_time</th>\n",
+       "      <th>execution_time.claude_xsd</th>\n",
+       "      <th>execution_time.llama_v2</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>mean</th>\n",
+       "      <td>11.573060</td>\n",
+       "      <td>11.570401</td>\n",
+       "      <td>10.930946</td>\n",
+       "      <td>4.310952</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>std</th>\n",
+       "      <td>4.616704</td>\n",
+       "      <td>1.648157</td>\n",
+       "      <td>1.594109</td>\n",
+       "      <td>0.591779</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "      execution_time.gpt-4  execution_time  execution_time.claude_xsd  \\\n",
+       "mean             11.573060       11.570401                  10.930946   \n",
+       "std               4.616704        1.648157                   1.594109   \n",
+       "\n",
+       "      execution_time.llama_v2  \n",
+       "mean                 4.310952  \n",
+       "std                  0.591779  "
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "render_metric(df, \"execution_time\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "8f4cf5f5-dd75-4318-9bf4-25b63fa1b895",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<h3>feedback</h3>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>feedback.json_edit_distance.gpt-4</th>\n",
+       "      <th>feedback.json_schema.gpt-4</th>\n",
+       "      <th>feedback.toxicity_similarity.gpt-4</th>\n",
+       "      <th>feedback.sentiment_similarity.gpt-4</th>\n",
+       "      <th>feedback.confidence_level_similarity.gpt-4</th>\n",
+       "      <th>feedback.question_category.gpt-4</th>\n",
+       "      <th>feedback.off_topic_similarity.gpt-4</th>\n",
+       "      <th>feedback.programming_language_similarity.gpt-4</th>\n",
+       "      <th>feedback.json_edit_distance</th>\n",
+       "      <th>feedback.json_schema</th>\n",
+       "      <th>...</th>\n",
+       "      <th>feedback.off_topic_similarity.claude_xsd</th>\n",
+       "      <th>feedback.programming_language_similarity.claude_xsd</th>\n",
+       "      <th>feedback.json_edit_distance.llama_v2</th>\n",
+       "      <th>feedback.json_schema.llama_v2</th>\n",
+       "      <th>feedback.toxicity_similarity.llama_v2</th>\n",
+       "      <th>feedback.sentiment_similarity.llama_v2</th>\n",
+       "      <th>feedback.confidence_level_similarity.llama_v2</th>\n",
+       "      <th>feedback.question_category.llama_v2</th>\n",
+       "      <th>feedback.off_topic_similarity.llama_v2</th>\n",
+       "      <th>feedback.programming_language_similarity.llama_v2</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>mean</th>\n",
+       "      <td>0.258825</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.962963</td>\n",
+       "      <td>0.592593</td>\n",
+       "      <td>0.888889</td>\n",
+       "      <td>0.592593</td>\n",
+       "      <td>0.365055</td>\n",
+       "      <td>0.777778</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.518519</td>\n",
+       "      <td>0.378524</td>\n",
+       "      <td>0.407407</td>\n",
+       "      <td>0.592593</td>\n",
+       "      <td>0.574074</td>\n",
+       "      <td>0.681481</td>\n",
+       "      <td>0.148148</td>\n",
+       "      <td>0.666667</td>\n",
+       "      <td>0.481481</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>std</th>\n",
+       "      <td>0.177651</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.079169</td>\n",
+       "      <td>0.500712</td>\n",
+       "      <td>0.320256</td>\n",
+       "      <td>0.500712</td>\n",
+       "      <td>0.108204</td>\n",
+       "      <td>0.423659</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.509175</td>\n",
+       "      <td>0.127190</td>\n",
+       "      <td>0.500712</td>\n",
+       "      <td>0.500712</td>\n",
+       "      <td>0.359051</td>\n",
+       "      <td>0.389316</td>\n",
+       "      <td>0.362014</td>\n",
+       "      <td>0.480384</td>\n",
+       "      <td>0.509175</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>2 rows × 32 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "      feedback.json_edit_distance.gpt-4  feedback.json_schema.gpt-4  \\\n",
+       "mean                           0.258825                         1.0   \n",
+       "std                            0.177651                         0.0   \n",
+       "\n",
+       "      feedback.toxicity_similarity.gpt-4  feedback.sentiment_similarity.gpt-4  \\\n",
+       "mean                                 0.0                                  1.0   \n",
+       "std                                  0.0                                  0.0   \n",
+       "\n",
+       "      feedback.confidence_level_similarity.gpt-4  \\\n",
+       "mean                                    0.962963   \n",
+       "std                                     0.079169   \n",
+       "\n",
+       "      feedback.question_category.gpt-4  feedback.off_topic_similarity.gpt-4  \\\n",
+       "mean                          0.592593                             0.888889   \n",
+       "std                           0.500712                             0.320256   \n",
+       "\n",
+       "      feedback.programming_language_similarity.gpt-4  \\\n",
+       "mean                                        0.592593   \n",
+       "std                                         0.500712   \n",
+       "\n",
+       "      feedback.json_edit_distance  feedback.json_schema  ...  \\\n",
+       "mean                     0.365055              0.777778  ...   \n",
+       "std                      0.108204              0.423659  ...   \n",
+       "\n",
+       "      feedback.off_topic_similarity.claude_xsd  \\\n",
+       "mean                                       0.0   \n",
+       "std                                        0.0   \n",
+       "\n",
+       "      feedback.programming_language_similarity.claude_xsd  \\\n",
+       "mean                                           0.518519     \n",
+       "std                                            0.509175     \n",
+       "\n",
+       "      feedback.json_edit_distance.llama_v2  feedback.json_schema.llama_v2  \\\n",
+       "mean                              0.378524                       0.407407   \n",
+       "std                               0.127190                       0.500712   \n",
+       "\n",
+       "      feedback.toxicity_similarity.llama_v2  \\\n",
+       "mean                               0.592593   \n",
+       "std                                0.500712   \n",
+       "\n",
+       "      feedback.sentiment_similarity.llama_v2  \\\n",
+       "mean                                0.574074   \n",
+       "std                                 0.359051   \n",
+       "\n",
+       "      feedback.confidence_level_similarity.llama_v2  \\\n",
+       "mean                                       0.681481   \n",
+       "std                                        0.389316   \n",
+       "\n",
+       "      feedback.question_category.llama_v2  \\\n",
+       "mean                             0.148148   \n",
+       "std                              0.362014   \n",
+       "\n",
+       "      feedback.off_topic_similarity.llama_v2  \\\n",
+       "mean                                0.666667   \n",
+       "std                                 0.480384   \n",
+       "\n",
+       "      feedback.programming_language_similarity.llama_v2  \n",
+       "mean                                           0.481481  \n",
+       "std                                            0.509175  \n",
+       "\n",
+       "[2 rows x 32 columns]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<h3>confidence_level_similarity</h3>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>feedback.confidence_level_similarity.gpt-4</th>\n",
+       "      <th>feedback.confidence_level_similarity</th>\n",
+       "      <th>feedback.confidence_level_similarity.claude_xsd</th>\n",
+       "      <th>feedback.confidence_level_similarity.llama_v2</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>mean</th>\n",
+       "      <td>0.962963</td>\n",
+       "      <td>0.970370</td>\n",
+       "      <td>0.970370</td>\n",
+       "      <td>0.681481</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>std</th>\n",
+       "      <td>0.079169</td>\n",
+       "      <td>0.072403</td>\n",
+       "      <td>0.072403</td>\n",
+       "      <td>0.389316</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "      feedback.confidence_level_similarity.gpt-4  \\\n",
+       "mean                                    0.962963   \n",
+       "std                                     0.079169   \n",
+       "\n",
+       "      feedback.confidence_level_similarity  \\\n",
+       "mean                              0.970370   \n",
+       "std                               0.072403   \n",
+       "\n",
+       "      feedback.confidence_level_similarity.claude_xsd  \\\n",
+       "mean                                         0.970370   \n",
+       "std                                          0.072403   \n",
+       "\n",
+       "      feedback.confidence_level_similarity.llama_v2  \n",
+       "mean                                       0.681481  \n",
+       "std                                        0.389316  "
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<h3>json_edit_distance</h3>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>feedback.json_edit_distance.gpt-4</th>\n",
+       "      <th>feedback.json_edit_distance</th>\n",
+       "      <th>feedback.json_edit_distance.claude_xsd</th>\n",
+       "      <th>feedback.json_edit_distance.llama_v2</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>mean</th>\n",
+       "      <td>0.258825</td>\n",
+       "      <td>0.365055</td>\n",
+       "      <td>0.391835</td>\n",
+       "      <td>0.378524</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>std</th>\n",
+       "      <td>0.177651</td>\n",
+       "      <td>0.108204</td>\n",
+       "      <td>0.097901</td>\n",
+       "      <td>0.127190</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "      feedback.json_edit_distance.gpt-4  feedback.json_edit_distance  \\\n",
+       "mean                           0.258825                     0.365055   \n",
+       "std                            0.177651                     0.108204   \n",
+       "\n",
+       "      feedback.json_edit_distance.claude_xsd  \\\n",
+       "mean                                0.391835   \n",
+       "std                                 0.097901   \n",
+       "\n",
+       "      feedback.json_edit_distance.llama_v2  \n",
+       "mean                              0.378524  \n",
+       "std                               0.127190  "
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<h3>json_schema</h3>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>feedback.json_schema.gpt-4</th>\n",
+       "      <th>feedback.json_schema</th>\n",
+       "      <th>feedback.json_schema.claude_xsd</th>\n",
+       "      <th>feedback.json_schema.llama_v2</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>mean</th>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.777778</td>\n",
+       "      <td>0.518519</td>\n",
+       "      <td>0.407407</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>std</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.423659</td>\n",
+       "      <td>0.509175</td>\n",
+       "      <td>0.500712</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "      feedback.json_schema.gpt-4  feedback.json_schema  \\\n",
+       "mean                         1.0              0.777778   \n",
+       "std                          0.0              0.423659   \n",
+       "\n",
+       "      feedback.json_schema.claude_xsd  feedback.json_schema.llama_v2  \n",
+       "mean                         0.518519                       0.407407  \n",
+       "std                          0.509175                       0.500712  "
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<h3>off_topic_similarity</h3>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>feedback.off_topic_similarity.gpt-4</th>\n",
+       "      <th>feedback.off_topic_similarity</th>\n",
+       "      <th>feedback.off_topic_similarity.claude_xsd</th>\n",
+       "      <th>feedback.off_topic_similarity.llama_v2</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>mean</th>\n",
+       "      <td>0.888889</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.666667</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>std</th>\n",
+       "      <td>0.320256</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.480384</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "      feedback.off_topic_similarity.gpt-4  feedback.off_topic_similarity  \\\n",
+       "mean                             0.888889                            0.0   \n",
+       "std                              0.320256                            0.0   \n",
+       "\n",
+       "      feedback.off_topic_similarity.claude_xsd  \\\n",
+       "mean                                       0.0   \n",
+       "std                                        0.0   \n",
+       "\n",
+       "      feedback.off_topic_similarity.llama_v2  \n",
+       "mean                                0.666667  \n",
+       "std                                 0.480384  "
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<h3>programming_language_similarity</h3>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>feedback.programming_language_similarity.gpt-4</th>\n",
+       "      <th>feedback.programming_language_similarity</th>\n",
+       "      <th>feedback.programming_language_similarity.claude_xsd</th>\n",
+       "      <th>feedback.programming_language_similarity.llama_v2</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>mean</th>\n",
+       "      <td>0.592593</td>\n",
+       "      <td>0.444444</td>\n",
+       "      <td>0.518519</td>\n",
+       "      <td>0.481481</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>std</th>\n",
+       "      <td>0.500712</td>\n",
+       "      <td>0.506370</td>\n",
+       "      <td>0.509175</td>\n",
+       "      <td>0.509175</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "      feedback.programming_language_similarity.gpt-4  \\\n",
+       "mean                                        0.592593   \n",
+       "std                                         0.500712   \n",
+       "\n",
+       "      feedback.programming_language_similarity  \\\n",
+       "mean                                  0.444444   \n",
+       "std                                   0.506370   \n",
+       "\n",
+       "      feedback.programming_language_similarity.claude_xsd  \\\n",
+       "mean                                           0.518519     \n",
+       "std                                            0.509175     \n",
+       "\n",
+       "      feedback.programming_language_similarity.llama_v2  \n",
+       "mean                                           0.481481  \n",
+       "std                                            0.509175  "
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<h3>question_category</h3>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>feedback.question_category.gpt-4</th>\n",
+       "      <th>feedback.question_category</th>\n",
+       "      <th>feedback.question_category.claude_xsd</th>\n",
+       "      <th>feedback.question_category.llama_v2</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>mean</th>\n",
+       "      <td>0.592593</td>\n",
+       "      <td>0.481481</td>\n",
+       "      <td>0.370370</td>\n",
+       "      <td>0.148148</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>std</th>\n",
+       "      <td>0.500712</td>\n",
+       "      <td>0.509175</td>\n",
+       "      <td>0.492103</td>\n",
+       "      <td>0.362014</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "      feedback.question_category.gpt-4  feedback.question_category  \\\n",
+       "mean                          0.592593                    0.481481   \n",
+       "std                           0.500712                    0.509175   \n",
+       "\n",
+       "      feedback.question_category.claude_xsd  \\\n",
+       "mean                               0.370370   \n",
+       "std                                0.492103   \n",
+       "\n",
+       "      feedback.question_category.llama_v2  \n",
+       "mean                             0.148148  \n",
+       "std                              0.362014  "
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<h3>sentiment_similarity</h3>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>feedback.sentiment_similarity.gpt-4</th>\n",
+       "      <th>feedback.sentiment_similarity</th>\n",
+       "      <th>feedback.sentiment_similarity.claude_xsd</th>\n",
+       "      <th>feedback.sentiment_similarity.llama_v2</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>mean</th>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.944444</td>\n",
+       "      <td>0.907407</td>\n",
+       "      <td>0.574074</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>std</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.160128</td>\n",
+       "      <td>0.197924</td>\n",
+       "      <td>0.359051</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "      feedback.sentiment_similarity.gpt-4  feedback.sentiment_similarity  \\\n",
+       "mean                                  1.0                       0.944444   \n",
+       "std                                   0.0                       0.160128   \n",
+       "\n",
+       "      feedback.sentiment_similarity.claude_xsd  \\\n",
+       "mean                                  0.907407   \n",
+       "std                                   0.197924   \n",
+       "\n",
+       "      feedback.sentiment_similarity.llama_v2  \n",
+       "mean                                0.574074  \n",
+       "std                                 0.359051  "
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<h3>toxicity_similarity</h3>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>feedback.toxicity_similarity.gpt-4</th>\n",
+       "      <th>feedback.toxicity_similarity</th>\n",
+       "      <th>feedback.toxicity_similarity.claude_xsd</th>\n",
+       "      <th>feedback.toxicity_similarity.llama_v2</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>mean</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.592593</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>std</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.500712</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "      feedback.toxicity_similarity.gpt-4  feedback.toxicity_similarity  \\\n",
+       "mean                                 0.0                           1.0   \n",
+       "std                                  0.0                           0.0   \n",
+       "\n",
+       "      feedback.toxicity_similarity.claude_xsd  \\\n",
+       "mean                                      1.0   \n",
+       "std                                       0.0   \n",
+       "\n",
+       "      feedback.toxicity_similarity.llama_v2  \n",
+       "mean                               0.592593  \n",
+       "std                                0.500712  "
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "for metric in feedback_columns:\n",
+    "    render_metric(df, metric)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/docs/source/notebooks/extraction/intro.ipynb b/docs/source/notebooks/extraction/intro.ipynb
index f03ae2a9..5fbb2010 100644
--- a/docs/source/notebooks/extraction/intro.ipynb
+++ b/docs/source/notebooks/extraction/intro.ipynb
@@ -14,7 +14,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 1,
    "id": "86912590-a90a-4351-8ab4-89192cdee1e7",
    "metadata": {},
    "outputs": [
@@ -26,19 +26,24 @@
        "<tr><th>Name            </th><th>Type          </th><th>Dataset ID                                                                                                                                                 </th><th>Description  </th></tr>\n",
        "</thead>\n",
        "<tbody>\n",
-       "<tr><td>Email Extraction</td><td>ExtractionTask</td><td><a href=\"https://smith.langchain.com/public/36bdfe7d-3cd1-4b36-b957-d12d95810a2b/d\" target=\"_blank\" rel=\"noopener\">36bdfe7d-3cd1-4b36-b957-d12d95810a2b</a></td><td>A dataset of 42 real emails deduped from a spam folder, with semantic HTML tags removed, as well as a script for initial extraction and formatting of other emails from an arbitrary .mbox file like the one exported by Gmail.\n",
+       "<tr><td>Email Extraction</td><td>ExtractionTask</td><td><a href=\"https://smith.langchain.com/public/a1742786-bde5-4f51-a1d8-e148e5251ddb/d\" target=\"_blank\" rel=\"noopener\">a1742786-bde5-4f51-a1d8-e148e5251ddb</a></td><td>A dataset of 42 real emails deduped from a spam folder, with semantic HTML tags removed, as well as a script for initial extraction and formatting of other emails from an arbitrary .mbox file like the one exported by Gmail.\n",
        "\n",
        "Some additional cleanup of the data was done by hand after the initial pass.\n",
        "\n",
        "See https://github.com/jacoblee93/oss-model-extraction-evals.              </td></tr>\n",
+       "<tr><td>Chat Extraction </td><td>ExtractionTask</td><td><a href=\"https://smith.langchain.com/public/00f4444c-9460-4a82-b87a-f50096f1cfef/d\" target=\"_blank\" rel=\"noopener\">00f4444c-9460-4a82-b87a-f50096f1cfef</a></td><td>A dataset meant to test the ability of an LLM to extract and infer\n",
+       "structured information from a dialogue. The dialogue is between a user and a support\n",
+       "engineer. Outputs should be structured as a JSON object and test both the ability\n",
+       "of the LLM to correctly structure the information and its ability to perform simple \n",
+       "classification tasks.              </td></tr>\n",
        "</tbody>\n",
        "</table>"
       ],
       "text/plain": [
-       "Registry(tasks=[ExtractionTask(name='Email Extraction', dataset_id='https://smith.langchain.com/public/36bdfe7d-3cd1-4b36-b957-d12d95810a2b/d', description='A dataset of 42 real emails deduped from a spam folder, with semantic HTML tags removed, as well as a script for initial extraction and formatting of other emails from an arbitrary .mbox file like the one exported by Gmail.\\n\\nSome additional cleanup of the data was done by hand after the initial pass.\\n\\nSee https://github.com/jacoblee93/oss-model-extraction-evals.\\n    ', schema=<class 'langchain_benchmarks.extraction.tasks.email_task.Email'>, instructions=ChatPromptTemplate(input_variables=['email'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='You are an expert researcher.')), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['email'], template='What can you tell me about the following email? Make sure to extract the question in the correct format. Here is the email:\\n ```\\n{email}\\n```'))]))])"
+       "Registry(tasks=[ExtractionTask(name='Email Extraction', dataset_id='https://smith.langchain.com/public/a1742786-bde5-4f51-a1d8-e148e5251ddb/d', description='A dataset of 42 real emails deduped from a spam folder, with semantic HTML tags removed, as well as a script for initial extraction and formatting of other emails from an arbitrary .mbox file like the one exported by Gmail.\\n\\nSome additional cleanup of the data was done by hand after the initial pass.\\n\\nSee https://github.com/jacoblee93/oss-model-extraction-evals.\\n    ', schema=<class 'langchain_benchmarks.extraction.tasks.email_task.Email'>, instructions=ChatPromptTemplate(input_variables=['input'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='You are an expert researcher.')), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], template='What can you tell me about the following email? Make sure to extract the question in the correct format. Here is the email:\\n ```\\n{input}\\n```'))])), ExtractionTask(name='Chat Extraction', dataset_id='https://smith.langchain.com/public/00f4444c-9460-4a82-b87a-f50096f1cfef/d', description='A dataset meant to test the ability of an LLM to extract and infer\\nstructured information from a dialogue. The dialogue is between a user and a support\\nengineer. Outputs should be structured as a JSON object and test both the ability\\nof the LLM to correctly structure the information and its ability to perform simple \\nclassification tasks.', schema=<class 'langchain_benchmarks.extraction.tasks.chat_extraction.schema.GenerateTicket'>, instructions=ChatPromptTemplate(input_variables=['dialogue'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='You are a helpdesk assistant responsible with extracting information and generating tickets. 
Dialogues are between a user and a support engineer.')), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['dialogue'], template='Generate a ticket for the following question-response pair:\\n<Dialogue>\\n{dialogue}\\n</Dialogue>'))]))])"
       ]
      },
-     "execution_count": 4,
+     "execution_count": 1,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -85,9 +90,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 2,
    "id": "9c7865bd-8251-4579-85a3-f9085d96f497",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "from langchain.chat_models import ChatOpenAI\n",
@@ -115,7 +122,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.6"
+   "version": "3.11.2"
   }
  },
  "nbformat": 4,
diff --git a/langchain_benchmarks/extraction/tasks/chat_extraction/__init__.py b/langchain_benchmarks/extraction/tasks/chat_extraction/__init__.py
new file mode 100644
index 00000000..5ed5f765
--- /dev/null
+++ b/langchain_benchmarks/extraction/tasks/chat_extraction/__init__.py
@@ -0,0 +1,41 @@
+from langchain.prompts import ChatPromptTemplate
+
+from langchain_benchmarks.extraction.tasks.chat_extraction.evaluators import (
+    get_eval_config,
+)
+from langchain_benchmarks.extraction.tasks.chat_extraction.schema import GenerateTicket
+from langchain_benchmarks.schema import ExtractionTask
+
+# This is a default prompt that works reasonably for OpenAI models.
+
+DEFAULT_CHAT_MODEL_PROMPT = ChatPromptTemplate.from_messages(
+    [
+        (
+            "system",
+            "You are a helpdesk assistant responsible with extracting information"
+            " and generating tickets. Dialogues are between a user and"
+            " a support engineer.",
+        ),
+        (
+            "user",
+            "Generate a ticket for the following question-response pair:\n"
+            "<Dialogue>\n{dialogue}\n</Dialogue>",
+        ),
+    ]
+)
+
+
+CHAT_EXTRACTION_TASK = ExtractionTask(
+    name="Chat Extraction",
+    dataset_id="https://smith.langchain.com/public/00f4444c-9460-4a82-b87a-f50096f1cfef/d",
+    schema=GenerateTicket,
+    description="""A dataset meant to test the ability of an LLM to extract and infer
+structured information from a dialogue. The dialogue is between a user and a support
+engineer. Outputs should be structured as a JSON object and test both the ability
+of the LLM to correctly structure the information and its ability to perform simple 
+classification tasks.""",
+    instructions=DEFAULT_CHAT_MODEL_PROMPT,
+)
+
+
+__all__ = ["CHAT_EXTRACTION_TASK", "get_eval_config"]
diff --git a/langchain_benchmarks/extraction/tasks/chat_extraction/evaluators.py b/langchain_benchmarks/extraction/tasks/chat_extraction/evaluators.py
new file mode 100644
index 00000000..5c509c56
--- /dev/null
+++ b/langchain_benchmarks/extraction/tasks/chat_extraction/evaluators.py
@@ -0,0 +1,180 @@
+from langchain.smith.evaluation.config import RunEvalConfig, SingleKeyEvalConfig
+from langsmith.evaluation.evaluator import (
+    EvaluationResult,
+    run_evaluator,
+)
+from langsmith.schemas import Example, Run
+
+from langchain_benchmarks.extraction.tasks.chat_extraction.schema import GenerateTicket
+
+
+@run_evaluator
+def json_schema(run: Run, example: Example) -> EvaluationResult:
+    """Evaluate the json schema of the generated ticket."""
+    score, comment = None, None
+    try:
+        GenerateTicket.parse_obj(run.outputs["output"])
+        score = 1
+    except Exception as e:
+        comment = repr(e)
+        score = 0
+
+    return EvaluationResult(
+        key="json_schema",
+        score=score,
+        comment=comment,
+    )
+
+
+@run_evaluator
+def evaluate_toxicity_similarity(run: Run, example: Example) -> EvaluationResult:
+    """Evaluate the toxicity of the generated ticket."""
+    gt = example.outputs["output"]["question"]["toxicity"]
+    score, comment = None, None
+    # Toxicity should be on a scale from 0 to 5
+    try:
+        pred = run.outputs["output"]["question"]["toxicity"]
+        score = 1 - abs(gt - float(pred)) / 5
+    except Exception as e:
+        comment = repr(e)
+        # Forgot to predict / mis-structured
+        score = 0
+    return EvaluationResult(
+        key="toxicity_similarity",
+        score=score,
+        comment=comment,
+    )
+
+
+@run_evaluator
+def evaluate_sentiment_similarity(run: Run, example: Example) -> EvaluationResult:
+    """Evaluate the sentiment of the generated ticket."""
+    gt = example.outputs["output"]["question"]["sentiment"]
+    ordinal_map = {
+        "negative": 0,
+        "neutral": 1,
+        "positive": 2,
+    }
+    gt_score = ordinal_map.get(str(gt).lower())
+    score, comment = None, None
+    # Sentiment is an enum, "Negative", "Neutral", "Positive"
+    try:
+        pred = run.outputs["output"]["question"]["sentiment"]
+        pred_score = ordinal_map.get(str(pred).lower())
+        score = 1 - (abs(gt_score - float(pred_score)) / 2)
+    except Exception as e:
+        comment = repr(e)
+        # Forgot to predict / mis-structured
+        score = 0
+    return EvaluationResult(
+        key="sentiment_similarity",
+        score=score,
+        comment=comment,
+    )
+
+
+@run_evaluator
+def evaluate_confidence_level_similarity(
+    run: Run, example: Example
+) -> EvaluationResult:
+    """Evaluate the confidence level of the generated ticket.
+    Scored as 1 - |gt - pred| / 5, since confidence is on a 0 to 5 scale."""
+    gt = example.outputs["output"]["response"]["confidence_level"]
+    score, comment = None, None
+    try:
+        pred = run.outputs["output"]["response"]["confidence_level"]
+        score = 1 - (abs(gt - float(pred)) / 5)
+    except Exception as e:
+        comment = repr(e)
+        score = 0
+    return EvaluationResult(
+        key="confidence_level_similarity",
+        score=score,
+        comment=comment,
+    )
+
+
+@run_evaluator
+def evaluate_question_category_similarity(
+    run: Run, example: Example
+) -> EvaluationResult:
+    """Evaluate the question category of the generated ticket.
+    Scored as an exact match: 1 if the categories are equal, otherwise 0."""
+    gt = example.outputs["output"]["question"]["question_category"]
+
+    score, comment = None, None
+    try:
+        pred = run.outputs["output"]["question"]["question_category"]
+        score = int(gt == pred)
+    except Exception as e:
+        comment = repr(e)
+        # Forgot to predict / mis-structured
+        score = 0
+    return EvaluationResult(
+        key="question_category",
+        score=score,
+        comment=comment,
+    )
+
+
+@run_evaluator
+def evaluate_off_topic(run: Run, example: Example) -> EvaluationResult:
+    """Evaluate the off topic of the generated ticket.
+    This is a binary T/F question."""
+    gt = example.outputs["output"]["question"]["is_off_topic"]
+    score, comment = None, None
+    try:
+        pred = run.outputs["output"]["question"].get("is_off_topic")
+        score = int(gt == pred)
+    except Exception as e:
+        comment = repr(e)
+        # Forgot to predict / mis-structured
+        score = 0
+    return EvaluationResult(
+        key="off_topic_similarity",
+        score=score,
+        comment=comment,
+    )
+
+
+@run_evaluator
+def evaluate_programming_language(run: Run, example: Example) -> EvaluationResult:
+    """Evaluate the programming language of the generated ticket.
+    Scored as an exact match: 1 if the languages are equal, otherwise 0."""
+    gt = example.outputs["output"]["question"]["programming_language"]
+    score, comment = None, None
+    try:
+        pred = run.outputs["output"]["question"]["programming_language"]
+        score = int(gt == pred)
+    except Exception as e:
+        comment = repr(e)
+        # Forgot to predict / mis-structured
+        score = 0
+    return EvaluationResult(
+        key="programming_language_similarity",
+        score=score,
+        comment=comment,
+    )
+
+
+def get_eval_config() -> RunEvalConfig:
+    """Get the evaluation configuration for the chat extraction task."""
+    return RunEvalConfig(
+        evaluators=[
+            # General aggregate score
+            SingleKeyEvalConfig(
+                # input key is ignored.
+                evaluator_type="json_edit_distance",
+                input_key="question",
+            )
+        ],
+        custom_evaluators=[
+            json_schema,
+            evaluate_toxicity_similarity,
+            evaluate_sentiment_similarity,
+            evaluate_confidence_level_similarity,
+            evaluate_question_category_similarity,
+            evaluate_off_topic,
+            evaluate_programming_language,
+        ],
+    )
diff --git a/langchain_benchmarks/extraction/tasks/chat_extraction/schema.py b/langchain_benchmarks/extraction/tasks/chat_extraction/schema.py
new file mode 100644
index 00000000..5d614035
--- /dev/null
+++ b/langchain_benchmarks/extraction/tasks/chat_extraction/schema.py
@@ -0,0 +1,99 @@
+from enum import Enum
+from typing import List, Optional
+
+from langchain.pydantic_v1 import BaseModel, Field
+
+
+class QuestionCategory(str, Enum):
+    IMPLEMENTATION_ISSUES = "Implementation Issues"  # about existing implementation
+    FEATURE_REQUESTS = "Feature Requests"
+    CONCEPT_EXPLANATIONS = "Concept Explanations"
+    CODE_OPTIMIZATION = "Code Optimization"
+    SECURITY_AND_PRIVACY_CONCERNS = "Security and Privacy Concerns"
+    MODEL_TRAINING_AND_FINE_TUNING = "Model Training and Fine-tuning"
+    DATA_HANDLING_AND_MANIPULATION = "Data Handling and Manipulation"
+    USER_INTERACTION_FLOW = "User Interaction Flow"
+    TECHNICAL_INTEGRATION = "Technical Integration"
+    ERROR_HANDLING_AND_LOGGING = "Error Handling and Logging"
+    CUSTOMIZATION_AND_CONFIGURATION = "Customization and Configuration"
+    EXTERNAL_API_AND_DATA_SOURCE_INTEGRATION = (
+        "External API and Data Source Integration"
+    )
+    LANGUAGE_AND_LOCALIZATION = "Language and Localization"
+    STREAMING_AND_REAL_TIME_PROCESSING = "Streaming and Real-time Processing"
+    TOOL_DEVELOPMENT = "Tool Development"
+    FUNCTION_CALLING = "Function Calling"
+    LLM_INTEGRATIONS = "LLM Integrations"
+    GENERAL_AGENT_QUESTIONS = "General Agent Question"
+    GENERAL_CHIT_CHAT = "General Chit Chat"
+    MEMORY = "Memory"
+    DEBUGGING_HELP = "Debugging Help"
+    APPLICATION_DESIGN = "Application Design"
+    PROMPT_TEMPLATES = "Prompt Templates"
+    COST_TRACKING = "Cost Tracking"
+    OTHER = "Other"
+
+
+class Sentiment(str, Enum):
+    NEGATIVE = "Negative"
+    NEUTRAL = "Neutral"
+    POSITIVE = "Positive"
+
+
+class ProgrammingLanguage(str, Enum):
+    PYTHON = "python"
+    JAVASCRIPT = "javascript"
+    TYPESCRIPT = "typescript"
+    UNKNOWN = "unknown"
+    OTHER = "other"
+
+
+class QuestionCategorization(BaseModel):
+    question_category: QuestionCategory
+    category_if_other: Optional[str] = Field(
+        default=None, description="question category if the category above is 'other'"
+    )
+    is_off_topic: bool = Field(
+        description="If the input is general chit chat or does not pertain to technical inqueries about LangChain or building/debugging applications with LLMs/AI, it is off topic. For context, LangChain is a library and framework designed"
+        " to assist in building applications with LLMs. Questions may also be about similar packages like LangServe, LangSmith, OpenAI, Anthropic, vectorstores, agents, etc."
+    )
+    toxicity: int = Field(
+        ge=0, lt=6, default=0, description="Whether or not the input question is toxic"
+    )
+    sentiment: Sentiment
+    programming_language: ProgrammingLanguage
+
+
+#  resolve the issue, provide guidance, or ask for more information
+class ResponseType(str, Enum):
+    RESOLVE_ISSUE = "resolve issue"
+    PROVIDE_GUIDANCE = "provide guidance"
+    REQUEST_INFORMATION = "request information"
+    GIVE_UP = "give up"
+    NONE = "none"
+    OTHER = "other"
+
+
+class ResponseCategorization(BaseModel):
+    response_type: ResponseType
+    response_type_if_other: Optional[str] = None
+    confidence_level: int = Field(
+        ge=0, lt=6, description="The confidence of the assistant in its answer."
+    )
+    followup_actions: Optional[List[str]] = Field(
+        description="Actions the assistant recommended the user take."
+    )
+
+
+class GenerateTicket(BaseModel):
+    """Generate a ticket containing all the extracted information."""
+
+    issue_summary: str = Field(
+        description="short (<10 word) summary of the issue or question"
+    )
+    question: QuestionCategorization = Field(
+        description="Information inferred from the the question."
+    )
+    response: ResponseCategorization = Field(
+        description="Information inferred from the the response."
+    )
diff --git a/langchain_benchmarks/registration.py b/langchain_benchmarks/registration.py
index af91a27c..26182fcc 100644
--- a/langchain_benchmarks/registration.py
+++ b/langchain_benchmarks/registration.py
@@ -1,6 +1,6 @@
 """Registry of environments for ease of access."""
 
-from langchain_benchmarks.extraction.tasks import email_task
+from langchain_benchmarks.extraction.tasks import chat_extraction, email_task
 from langchain_benchmarks.rag.tasks import (
     LANGCHAIN_DOCS_TASK,
     SEMI_STRUCTURED_REPORTS_TASK,
@@ -21,6 +21,7 @@
         relational_data.RELATIONAL_DATA_TASK,
         multiverse_math.MULTIVERSE_MATH,
         email_task.EMAIL_EXTRACTION_TASK,
+        chat_extraction.CHAT_EXTRACTION_TASK,
         LANGCHAIN_DOCS_TASK,
         SEMI_STRUCTURED_REPORTS_TASK,
     ]

Name	Chat Extraction
Type	ExtractionTask
Dataset ID	00f4444c-9460-4a82-b87a-f50096f1cfef
Description	A dataset meant to test the ability of an LLM to extract and infer\n", + "structured information from a dialogue. The dialogue is between a user and a support\n", + "engineer. Outputs should be structured as a JSON object and test both the ability\n", + "of the LLM to correctly structure the information and its ability to perform simple \n", + "classification tasks.
	feedback.json_edit_distance	feedback.json_schema	feedback.toxicity_similarity	feedback.sentiment_similarity	feedback.confidence_level_similarity	feedback.question_category	feedback.off_topic_similarity	feedback.programming_language_similarity	error	execution_time
count	27.000000	27.0	27.0	27.0	27.000000	27.000000	27.000000	27.000000	0	27.000000
unique	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0	NaN
top	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
freq	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
mean	0.258825	1.0	0.0	1.0	0.962963	0.592593	0.888889	0.592593	NaN	11.573060
std	0.177651	0.0	0.0	0.0	0.079169	0.500712	0.320256	0.500712	NaN	4.616704
min	0.049430	1.0	0.0	1.0	0.800000	0.000000	0.000000	0.000000	NaN	6.609211
25%	0.100351	1.0	0.0	1.0	1.000000	0.000000	1.000000	0.000000	NaN	8.454940
50%	0.222621	1.0	0.0	1.0	1.000000	1.000000	1.000000	1.000000	NaN	10.141127
75%	0.365307	1.0	0.0	1.0	1.000000	1.000000	1.000000	1.000000	NaN	13.332418
max	0.595300	1.0	0.0	1.0	1.000000	1.000000	1.000000	1.000000	NaN	27.191173
	feedback.json_edit_distance	feedback.json_schema	feedback.toxicity_similarity	feedback.sentiment_similarity	feedback.confidence_level_similarity	feedback.question_category	feedback.off_topic_similarity	feedback.programming_language_similarity	error	execution_time
count	23.000000	27.000000	27.000000	27.000000	27.000000	27.000000	27.000000	27.000000	0	27.000000
unique	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0	NaN
top	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
freq	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
mean	0.378524	0.407407	0.592593	0.574074	0.681481	0.148148	0.666667	0.481481	NaN	4.310952
std	0.127190	0.500712	0.500712	0.359051	0.389316	0.362014	0.480384	0.509175	NaN	0.591779
min	0.089130	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	NaN	3.046112
25%	0.309743	0.000000	0.000000	0.500000	0.500000	0.000000	0.000000	0.000000	NaN	3.983488
50%	0.352751	0.000000	1.000000	0.500000	0.800000	0.000000	1.000000	0.000000	NaN	4.326160
75%	0.468417	1.000000	1.000000	1.000000	1.000000	0.000000	1.000000	1.000000	NaN	4.687441
max	0.659091	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	NaN	5.713148
	inputs.answer	inputs.question	outputs.output	reference.output	feedback.json_edit_distance	feedback.json_schema	feedback.sentiment_similarity	feedback.confidence_level_similarity	feedback.question_category	...	feedback.json_edit_distance_llama_v2	feedback.json_schema_llama_v2	feedback.toxicity_similarity_llama_v2	feedback.sentiment_similarity_llama_v2	feedback.confidence_level_similarity_llama_v2	feedback.question_category_llama_v2	feedback.off_topic_similarity_llama_v2	feedback.programming_language_similarity_llama_v2	error_llama_v2	execution_time_llama_v2
23a81130-2ad9-46cf-ad27-46589bcea94a	Pour joindre les deux outputs, vous pouvez uti...	je travail sur python. je souhaite joindre ces...	{'issue_summary': 'Joining two outputs in Pyth...	{'question': {'toxicity': 0, 'sentiment': 'Neu...	0.089219	1	1.0	1.0	1	...	0.477690	1	0.0	0.5	0.8	0	0	1	None	4.100867
598316ec-f5e2-4b4d-83a8-36adb18e12fe	Hmm, I'm not sure.	example for dalle agent	{'issue_summary': 'Example for DALL-E Agent', ...	{'question': {'toxicity': 0, 'sentiment': 'Neu...	0.174905	1	1.0	0.8	0	...	0.346749	1	1.0	1.0	0.4	1	1	1	None	3.653370
d1a1a2e8-6f4c-4325-8aaa-ea20e2449268	To run Llama2 using pandas, you can follow the...	how do I run llama2 using pandas	{'issue_summary': 'Running Llama2 with Pandas'...	{'question': {'toxicity': 0, 'sentiment': 'Neu...	0.222621	1	1.0	1.0	0	...	0.262118	1	1.0	0.5	1.0	0	1	1	None	4.507702
140a4819-0046-469d-b4df-8e747ddae112	To clear the conversation in ConversationalRet...	if Im useing ConversationalRetrievalChain how ...	{'issue_summary': 'Clearing Conversation in Co...	{'question': {'toxicity': 0, 'sentiment': 'Neu...	0.353261	1	1.0	1.0	0	...	0.279330	1	1.0	1.0	1.0	0	1	0	None	3.654116
7b0a9dd9-68ce-41a1-9f9d-067d93175477	To perform the task of creating an app that in...	I want to create an app which:\\n- chats with u...	{'issue_summary': 'Building an app with Langch...	{'question': {'toxicity': 0, 'sentiment': 'Neu...	0.562950	1	1.0	0.8	1	...	NaN	0	0.0	0.0	0.0	0	0	0	None	4.666831
	execution_time.gpt-4	execution_time	execution_time.claude_xsd	execution_time.llama_v2
mean	11.573060	11.570401	10.930946	4.310952
std	4.616704	1.648157	1.594109	0.591779
	feedback.confidence_level_similarity.gpt-4	feedback.confidence_level_similarity	feedback.confidence_level_similarity.claude_xsd	feedback.confidence_level_similarity.llama_v2
mean	0.962963	0.970370	0.970370	0.681481
std	0.079169	0.072403	0.072403	0.389316
	feedback.json_edit_distance.gpt-4	feedback.json_edit_distance	feedback.json_edit_distance.claude_xsd	feedback.json_edit_distance.llama_v2
mean	0.258825	0.365055	0.391835	0.378524
std	0.177651	0.108204	0.097901	0.127190
	feedback.json_schema.gpt-4	feedback.json_schema	feedback.json_schema.claude_xsd	feedback.json_schema.llama_v2
mean	1.0	0.777778	0.518519	0.407407
std	0.0	0.423659	0.509175	0.500712
	feedback.programming_language_similarity.gpt-4	feedback.programming_language_similarity	feedback.programming_language_similarity.claude_xsd	feedback.programming_language_similarity.llama_v2
mean	0.592593	0.444444	0.518519	0.481481
std	0.500712	0.506370	0.509175	0.509175
	feedback.sentiment_similarity.gpt-4	feedback.sentiment_similarity	feedback.sentiment_similarity.claude_xsd	feedback.sentiment_similarity.llama_v2
mean	1.0	0.944444	0.907407	0.574074
std	0.0	0.160128	0.197924	0.359051