From 01ffffd04c9f0a10926730b9a820ba889ffddeff Mon Sep 17 00:00:00 2001 From: William FH <13333726+hinthornw@users.noreply.github.com> Date: Sun, 3 Dec 2023 18:28:51 -0800 Subject: [PATCH] Update Chat Extraction Notebook (#102) --- .../extraction/chat_extraction.ipynb | 674 +++++++++--------- docs/source/toc.segment | 1 + .../tasks/chat_extraction/schema.py | 2 +- pyproject.toml | 2 +- 4 files changed, 349 insertions(+), 330 deletions(-) diff --git a/docs/source/notebooks/extraction/chat_extraction.ipynb b/docs/source/notebooks/extraction/chat_extraction.ipynb index 5dd795fd..0fe317cc 100644 --- a/docs/source/notebooks/extraction/chat_extraction.ipynb +++ b/docs/source/notebooks/extraction/chat_extraction.ipynb @@ -348,8 +348,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "View the evaluation results for project 'gpt-4-1106-preview-af10' at:\n", - "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/08042749-504d-4509-9549-5f5c579115f6/compare?selectedSessions=9ee3b369-1988-4db0-a2a9-ea6259c8e19c\n", + "View the evaluation results for project 'gpt-4-1106-preview-5689' at:\n", + "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/08042749-504d-4509-9549-5f5c579115f6/compare?selectedSessions=0c022691-a7ac-4545-b2bc-58aab2d476e8\n", "\n", "View all tests for Dataset Chat Extraction at:\n", "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/08042749-504d-4509-9549-5f5c579115f6\n", @@ -456,29 +456,29 @@ " \n", " \n", " mean\n", - " 0.258825\n", + " 0.283000\n", " 1.0\n", " 0.0\n", " 1.0\n", - " 0.962963\n", - " 0.592593\n", + " 0.940741\n", + " 0.555556\n", " 0.888889\n", " 0.592593\n", " NaN\n", - " 11.573060\n", + " 6.949585\n", " \n", " \n", " std\n", - " 0.177651\n", + " 0.181282\n", " 0.0\n", " 0.0\n", " 0.0\n", - " 0.079169\n", - " 0.500712\n", + " 0.093064\n", + " 0.506370\n", " 0.320256\n", " 0.500712\n", " NaN\n", - " 4.616704\n", + " 1.639494\n", " \n", " \n", " min\n", @@ -491,24 +491,24 @@ " 0.000000\n", " 0.000000\n", " NaN\n", - " 6.609211\n", + " 4.248728\n", " \n", " \n", " 25%\n", - " 0.100351\n", + " 0.104149\n", " 1.0\n", " 0.0\n", " 1.0\n", - " 1.000000\n", + " 0.800000\n", " 0.000000\n", " 1.000000\n", " 0.000000\n", " NaN\n", - " 8.454940\n", + " 5.679244\n", " \n", " \n", " 50%\n", - " 0.222621\n", + " 0.336343\n", " 1.0\n", " 0.0\n", " 1.0\n", @@ -517,11 +517,11 @@ " 1.000000\n", " 1.000000\n", " NaN\n", - " 10.141127\n", + " 6.558088\n", " \n", " \n", " 75%\n", - " 0.365307\n", + " 0.378270\n", " 1.0\n", " 0.0\n", " 1.0\n", @@ -530,11 +530,11 @@ " 1.000000\n", " 1.000000\n", " NaN\n", - " 13.332418\n", + " 8.300396\n", " \n", " \n", " max\n", - " 0.595300\n", + " 0.594255\n", " 1.0\n", " 0.0\n", " 1.0\n", @@ -543,7 +543,7 @@ " 1.000000\n", " 1.000000\n", " NaN\n", - " 27.191173\n", + " 10.123084\n", " \n", " \n", "\n", @@ -555,13 +555,13 @@ "unique NaN NaN \n", "top NaN NaN \n", "freq NaN NaN \n", - "mean 0.258825 1.0 \n", - "std 0.177651 0.0 \n", + "mean 0.283000 1.0 \n", + "std 0.181282 0.0 \n", "min 0.049430 1.0 \n", - "25% 0.100351 1.0 \n", - "50% 0.222621 1.0 \n", - "75% 0.365307 1.0 \n", - "max 0.595300 1.0 \n", + "25% 0.104149 1.0 \n", + "50% 0.336343 1.0 \n", + "75% 0.378270 1.0 \n", + "max 0.594255 1.0 \n", "\n", " feedback.toxicity_similarity feedback.sentiment_similarity \\\n", "count 27.0 27.0 \n", @@ -581,10 +581,10 @@ "unique NaN NaN \n", "top NaN NaN \n", "freq NaN NaN \n", - "mean 0.962963 0.592593 \n", - "std 0.079169 0.500712 \n", + "mean 0.940741 0.555556 \n", + "std 0.093064 0.506370 \n", "min 0.800000 0.000000 \n", - "25% 1.000000 0.000000 \n", + "25% 0.800000 0.000000 \n", "50% 1.000000 1.000000 \n", "75% 1.000000 1.000000 \n", "max 1.000000 1.000000 \n", @@ -607,13 +607,13 @@ "unique NaN 0 NaN \n", "top NaN NaN NaN \n", "freq NaN NaN NaN \n", - "mean 0.592593 NaN 11.573060 \n", - "std 0.500712 NaN 4.616704 \n", - "min 0.000000 NaN 6.609211 \n", - "25% 0.000000 NaN 8.454940 \n", - "50% 1.000000 NaN 10.141127 \n", - "75% 1.000000 NaN 13.332418 \n", - "max 1.000000 NaN 27.191173 " + "mean 0.592593 NaN 6.949585 \n", + "std 0.500712 NaN 1.639494 \n", + "min 0.000000 NaN 4.248728 \n", + "25% 0.000000 NaN 5.679244 \n", + "50% 1.000000 NaN 6.558088 \n", + "75% 1.000000 NaN 8.300396 \n", + "max 1.000000 NaN 10.123084 " ] }, "metadata": {}, @@ -809,8 +809,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "View the evaluation results for project 'claude-2-json-schema-to-xml-af10' at:\n", - "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/08042749-504d-4509-9549-5f5c579115f6/compare?selectedSessions=fb67ac1a-4e37-44ca-94bf-970eee89ee04\n", + "View the evaluation results for project 'claude-2-json-schema-to-xml-5689' at:\n", + "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/08042749-504d-4509-9549-5f5c579115f6/compare?selectedSessions=3f590999-a9d1-48be-83dd-e84acb99a195\n", "\n", "View all tests for Dataset Chat Extraction at:\n", "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/08042749-504d-4509-9549-5f5c579115f6\n", @@ -917,29 +917,29 @@ " \n", " \n", " mean\n", - " 0.365055\n", + " 0.371950\n", " 0.777778\n", " 1.0\n", - " 0.944444\n", + " 0.925926\n", " 0.970370\n", " 0.481481\n", " 0.0\n", " 0.444444\n", " NaN\n", - " 11.570401\n", + " 10.556105\n", " \n", " \n", " std\n", - " 0.108204\n", + " 0.108628\n", " 0.423659\n", " 0.0\n", - " 0.160128\n", + " 0.181007\n", " 0.072403\n", " 0.509175\n", " 0.0\n", " 0.506370\n", " NaN\n", - " 1.648157\n", + " 1.790352\n", " \n", " \n", " min\n", @@ -952,11 +952,11 @@ " 0.0\n", " 0.000000\n", " NaN\n", - " 8.821772\n", + " 8.435542\n", " \n", " \n", " 25%\n", - " 0.298704\n", + " 0.312445\n", " 1.000000\n", " 1.0\n", " 1.000000\n", @@ -965,11 +965,11 @@ " 0.0\n", " 0.000000\n", " NaN\n", - " 10.545821\n", + " 9.077631\n", " \n", " \n", " 50%\n", - " 0.393478\n", + " 0.390000\n", " 1.000000\n", " 1.0\n", " 1.000000\n", @@ -978,11 +978,11 @@ " 0.0\n", " 0.000000\n", " NaN\n", - " 11.427731\n", + " 10.059124\n", " \n", " \n", " 75%\n", - " 0.444609\n", + " 0.462694\n", " 1.000000\n", " 1.0\n", " 1.000000\n", @@ -991,7 +991,7 @@ " 0.0\n", " 1.000000\n", " NaN\n", - " 12.390761\n", + " 11.795210\n", " \n", " \n", " max\n", @@ -1004,7 +1004,7 @@ " 0.0\n", " 1.000000\n", " NaN\n", - " 17.776214\n", + " 15.072743\n", " \n", " \n", "\n", @@ -1016,12 +1016,12 @@ "unique NaN NaN \n", "top NaN NaN \n", "freq NaN NaN \n", - "mean 0.365055 0.777778 \n", - "std 0.108204 0.423659 \n", + "mean 0.371950 0.777778 \n", + "std 0.108628 0.423659 \n", "min 0.105033 0.000000 \n", - "25% 0.298704 1.000000 \n", - "50% 0.393478 1.000000 \n", - "75% 0.444609 1.000000 \n", + "25% 0.312445 1.000000 \n", + "50% 0.390000 1.000000 \n", + "75% 0.462694 1.000000 \n", "max 0.537678 1.000000 \n", "\n", " feedback.toxicity_similarity feedback.sentiment_similarity \\\n", @@ -1029,8 +1029,8 @@ "unique NaN NaN \n", "top NaN NaN \n", "freq NaN NaN \n", - "mean 1.0 0.944444 \n", - "std 0.0 0.160128 \n", + "mean 1.0 0.925926 \n", + "std 0.0 0.181007 \n", "min 1.0 0.500000 \n", "25% 1.0 1.000000 \n", "50% 1.0 1.000000 \n", @@ -1068,13 +1068,13 @@ "unique NaN 0 NaN \n", "top NaN NaN NaN \n", "freq NaN NaN NaN \n", - "mean 0.444444 NaN 11.570401 \n", - "std 0.506370 NaN 1.648157 \n", - "min 0.000000 NaN 8.821772 \n", - "25% 0.000000 NaN 10.545821 \n", - "50% 0.000000 NaN 11.427731 \n", - "75% 1.000000 NaN 12.390761 \n", - "max 1.000000 NaN 17.776214 " + "mean 0.444444 NaN 10.556105 \n", + "std 0.506370 NaN 1.790352 \n", + "min 0.000000 NaN 8.435542 \n", + "25% 0.000000 NaN 9.077631 \n", + "50% 0.000000 NaN 10.059124 \n", + "75% 1.000000 NaN 11.795210 \n", + "max 1.000000 NaN 15.072743 " ] }, "metadata": {}, @@ -1282,8 +1282,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "View the evaluation results for project 'claude-2-xsd-to-xml-af10' at:\n", - "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/08042749-504d-4509-9549-5f5c579115f6/compare?selectedSessions=07edf03d-97b9-42a8-acde-9a6e9facb388\n", + "View the evaluation results for project 'claude-2-xsd-to-xml-5689' at:\n", + "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/08042749-504d-4509-9549-5f5c579115f6/compare?selectedSessions=dc7656d8-00ef-4048-9ce5-38ef72af593c\n", "\n", "View all tests for Dataset Chat Extraction at:\n", "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/08042749-504d-4509-9549-5f5c579115f6\n", @@ -1390,7 +1390,7 @@ " \n", " \n", " mean\n", - " 0.391835\n", + " 0.394232\n", " 0.518519\n", " 1.0\n", " 0.907407\n", @@ -1399,11 +1399,11 @@ " 0.0\n", " 0.518519\n", " NaN\n", - " 10.930946\n", + " 11.128319\n", " \n", " \n", " std\n", - " 0.097901\n", + " 0.117880\n", " 0.509175\n", " 0.0\n", " 0.197924\n", @@ -1412,7 +1412,7 @@ " 0.0\n", " 0.509175\n", " NaN\n", - " 1.594109\n", + " 4.845637\n", " \n", " \n", " min\n", @@ -1425,11 +1425,11 @@ " 0.0\n", " 0.000000\n", " NaN\n", - " 8.416739\n", + " 7.833285\n", " \n", " \n", " 25%\n", - " 0.348812\n", + " 0.332400\n", " 0.000000\n", " 1.0\n", " 1.000000\n", @@ -1438,11 +1438,11 @@ " 0.0\n", " 0.000000\n", " NaN\n", - " 9.813120\n", + " 8.888438\n", " \n", " \n", " 50%\n", - " 0.379653\n", + " 0.380435\n", " 1.000000\n", " 1.0\n", " 1.000000\n", @@ -1451,11 +1451,11 @@ " 0.0\n", " 1.000000\n", " NaN\n", - " 10.371725\n", + " 9.629613\n", " \n", " \n", " 75%\n", - " 0.425574\n", + " 0.456592\n", " 1.000000\n", " 1.0\n", " 1.000000\n", @@ -1464,7 +1464,7 @@ " 0.0\n", " 1.000000\n", " NaN\n", - " 11.964592\n", + " 11.143679\n", " \n", " \n", " max\n", @@ -1477,7 +1477,7 @@ " 0.0\n", " 1.000000\n", " NaN\n", - " 14.291423\n", + " 32.068304\n", " \n", " \n", "\n", @@ -1489,12 +1489,12 @@ "unique NaN NaN \n", "top NaN NaN \n", "freq NaN NaN \n", - "mean 0.391835 0.518519 \n", - "std 0.097901 0.509175 \n", + "mean 0.394232 0.518519 \n", + "std 0.117880 0.509175 \n", "min 0.116608 0.000000 \n", - "25% 0.348812 0.000000 \n", - "50% 0.379653 1.000000 \n", - "75% 0.425574 1.000000 \n", + "25% 0.332400 0.000000 \n", + "50% 0.380435 1.000000 \n", + "75% 0.456592 1.000000 \n", "max 0.644007 1.000000 \n", "\n", " feedback.toxicity_similarity feedback.sentiment_similarity \\\n", @@ -1541,13 +1541,13 @@ "unique NaN 0 NaN \n", "top NaN NaN NaN \n", "freq NaN NaN NaN \n", - "mean 0.518519 NaN 10.930946 \n", - "std 0.509175 NaN 1.594109 \n", - "min 0.000000 NaN 8.416739 \n", - "25% 0.000000 NaN 9.813120 \n", - "50% 1.000000 NaN 10.371725 \n", - "75% 1.000000 NaN 11.964592 \n", - "max 1.000000 NaN 14.291423 " + "mean 0.518519 NaN 11.128319 \n", + "std 0.509175 NaN 4.845637 \n", + "min 0.000000 NaN 7.833285 \n", + "25% 0.000000 NaN 8.888438 \n", + "50% 1.000000 NaN 9.629613 \n", + "75% 1.000000 NaN 11.143679 \n", + "max 1.000000 NaN 32.068304 " ] }, "metadata": {}, @@ -1659,7 +1659,15 @@ { "data": { "text/plain": [ - "{'output': 'Here is the generated ticket:\\n{\"GenerateTicket\": {\"issue_summary\": \"Running Llama 2 locally\", \"question\": {\"QuestionCategorization\": {\"question_category\": \"Technical Integration\", \"is_off_topic\": false, \"sentiment\": \"Neutral\", \"programming_language\": \"unknown\"}}}}'}" + "{'output': {'issue_summary': 'How to run Llama 2 locally',\n", + " 'question': {'question_category': 'Implementation Issues',\n", + " 'is_off_topic': False,\n", + " 'toxicity': 0,\n", + " 'sentiment': 'Neutral',\n", + " 'programming_language': 'cpp'},\n", + " 'response': {'response_type': 'Resolve Issue',\n", + " 'confidence_level': 5,\n", + " 'followup_actions': ['Please provide more information about the environment (OS, versions, etc.) and the specific issue you are experiencing.']}}}" ] }, "execution_count": 15, @@ -1686,8 +1694,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "View the evaluation results for project 'llama-v2-34b-code-instruct-af10' at:\n", - "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/08042749-504d-4509-9549-5f5c579115f6/compare?selectedSessions=53631e16-bdb2-4d53-ae8f-10cde961614e\n", + "View the evaluation results for project 'llama-v2-34b-code-instruct-5689' at:\n", + "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/08042749-504d-4509-9549-5f5c579115f6/compare?selectedSessions=dc2e0648-7e65-4d60-a149-15c24bca943b\n", "\n", "View all tests for Dataset Chat Extraction at:\n", "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/08042749-504d-4509-9549-5f5c579115f6\n", @@ -1742,7 +1750,7 @@ " \n", " \n", " count\n", - " 23.000000\n", + " 17.000000\n", " 27.000000\n", " 27.000000\n", " 27.000000\n", @@ -1794,33 +1802,33 @@ " \n", " \n", " mean\n", - " 0.378524\n", - " 0.407407\n", - " 0.592593\n", - " 0.574074\n", - " 0.681481\n", - " 0.148148\n", - " 0.666667\n", - " 0.481481\n", + " 0.399687\n", + " 0.333333\n", + " 0.444444\n", + " 0.444444\n", + " 0.540741\n", + " 0.074074\n", + " 0.518519\n", + " 0.222222\n", " NaN\n", - " 4.310952\n", + " 4.738518\n", " \n", " \n", " std\n", - " 0.127190\n", - " 0.500712\n", - " 0.500712\n", - " 0.359051\n", - " 0.389316\n", - " 0.362014\n", + " 0.097771\n", " 0.480384\n", + " 0.506370\n", + " 0.423659\n", + " 0.439632\n", + " 0.266880\n", " 0.509175\n", + " 0.423659\n", " NaN\n", - " 0.591779\n", + " 3.162978\n", " \n", " \n", " min\n", - " 0.089130\n", + " 0.197279\n", " 0.000000\n", " 0.000000\n", " 0.000000\n", @@ -1829,50 +1837,50 @@ " 0.000000\n", " 0.000000\n", " NaN\n", - " 3.046112\n", + " 3.224190\n", " \n", " \n", " 25%\n", - " 0.309743\n", + " 0.325069\n", + " 0.000000\n", + " 0.000000\n", " 0.000000\n", " 0.000000\n", - " 0.500000\n", - " 0.500000\n", " 0.000000\n", " 0.000000\n", " 0.000000\n", " NaN\n", - " 3.983488\n", + " 3.595067\n", " \n", " \n", " 50%\n", - " 0.352751\n", + " 0.413203\n", + " 0.000000\n", " 0.000000\n", - " 1.000000\n", " 0.500000\n", " 0.800000\n", " 0.000000\n", " 1.000000\n", " 0.000000\n", " NaN\n", - " 4.326160\n", + " 3.744033\n", " \n", " \n", " 75%\n", - " 0.468417\n", + " 0.471366\n", " 1.000000\n", " 1.000000\n", " 1.000000\n", " 1.000000\n", " 0.000000\n", " 1.000000\n", - " 1.000000\n", + " 0.000000\n", " NaN\n", - " 4.687441\n", + " 4.211040\n", " \n", " \n", " max\n", - " 0.659091\n", + " 0.552430\n", " 1.000000\n", " 1.000000\n", " 1.000000\n", @@ -1881,7 +1889,7 @@ " 1.000000\n", " 1.000000\n", " NaN\n", - " 5.713148\n", + " 18.660901\n", " \n", " \n", "\n", @@ -1889,28 +1897,28 @@ ], "text/plain": [ " feedback.json_edit_distance feedback.json_schema \\\n", - "count 23.000000 27.000000 \n", + "count 17.000000 27.000000 \n", "unique NaN NaN \n", "top NaN NaN \n", "freq NaN NaN \n", - "mean 0.378524 0.407407 \n", - "std 0.127190 0.500712 \n", - "min 0.089130 0.000000 \n", - "25% 0.309743 0.000000 \n", - "50% 0.352751 0.000000 \n", - "75% 0.468417 1.000000 \n", - "max 0.659091 1.000000 \n", + "mean 0.399687 0.333333 \n", + "std 0.097771 0.480384 \n", + "min 0.197279 0.000000 \n", + "25% 0.325069 0.000000 \n", + "50% 0.413203 0.000000 \n", + "75% 0.471366 1.000000 \n", + "max 0.552430 1.000000 \n", "\n", " feedback.toxicity_similarity feedback.sentiment_similarity \\\n", "count 27.000000 27.000000 \n", "unique NaN NaN \n", "top NaN NaN \n", "freq NaN NaN \n", - "mean 0.592593 0.574074 \n", - "std 0.500712 0.359051 \n", + "mean 0.444444 0.444444 \n", + "std 0.506370 0.423659 \n", "min 0.000000 0.000000 \n", - "25% 0.000000 0.500000 \n", - "50% 1.000000 0.500000 \n", + "25% 0.000000 0.000000 \n", + "50% 0.000000 0.500000 \n", "75% 1.000000 1.000000 \n", "max 1.000000 1.000000 \n", "\n", @@ -1919,10 +1927,10 @@ "unique NaN NaN \n", "top NaN NaN \n", "freq NaN NaN \n", - "mean 0.681481 0.148148 \n", - "std 0.389316 0.362014 \n", + "mean 0.540741 0.074074 \n", + "std 0.439632 0.266880 \n", "min 0.000000 0.000000 \n", - "25% 0.500000 0.000000 \n", + "25% 0.000000 0.000000 \n", "50% 0.800000 0.000000 \n", "75% 1.000000 0.000000 \n", "max 1.000000 1.000000 \n", @@ -1932,8 +1940,8 @@ "unique NaN \n", "top NaN \n", "freq NaN \n", - "mean 0.666667 \n", - "std 0.480384 \n", + "mean 0.518519 \n", + "std 0.509175 \n", "min 0.000000 \n", "25% 0.000000 \n", "50% 1.000000 \n", @@ -1945,13 +1953,13 @@ "unique NaN 0 NaN \n", "top NaN NaN NaN \n", "freq NaN NaN NaN \n", - "mean 0.481481 NaN 4.310952 \n", - "std 0.509175 NaN 0.591779 \n", - "min 0.000000 NaN 3.046112 \n", - "25% 0.000000 NaN 3.983488 \n", - "50% 0.000000 NaN 4.326160 \n", - "75% 1.000000 NaN 4.687441 \n", - "max 1.000000 NaN 5.713148 " + "mean 0.222222 NaN 4.738518 \n", + "std 0.423659 NaN 3.162978 \n", + "min 0.000000 NaN 3.224190 \n", + "25% 0.000000 NaN 3.595067 \n", + "50% 0.000000 NaN 3.744033 \n", + "75% 0.000000 NaN 4.211040 \n", + "max 1.000000 NaN 18.660901 " ] }, "metadata": {}, @@ -2064,7 +2072,7 @@ " 1.0\n", " 1\n", " ...\n", - " 0.477690\n", + " 0.552239\n", " 1\n", " 0.0\n", " 0.5\n", @@ -2073,7 +2081,7 @@ " 0\n", " 1\n", " None\n", - " 4.100867\n", + " 3.981128\n", " \n", " \n", " 598316ec-f5e2-4b4d-83a8-36adb18e12fe\n", @@ -2081,23 +2089,23 @@ " example for dalle agent\n", " {'issue_summary': 'Example for DALL-E Agent', ...\n", " {'question': {'toxicity': 0, 'sentiment': 'Neu...\n", - " 0.174905\n", + " 0.171103\n", " 1\n", " 0\n", " 1.0\n", " 0.8\n", " 0\n", " ...\n", - " 0.346749\n", - " 1\n", - " 1.0\n", - " 1.0\n", - " 0.4\n", - " 1\n", - " 1\n", - " 1\n", + " NaN\n", + " 0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0\n", + " 0\n", + " 0\n", " None\n", - " 3.653370\n", + " 10.942758\n", " \n", " \n", " d1a1a2e8-6f4c-4325-8aaa-ea20e2449268\n", @@ -2105,23 +2113,23 @@ " how do I run llama2 using pandas\n", " {'issue_summary': 'Running Llama2 with Pandas'...\n", " {'question': {'toxicity': 0, 'sentiment': 'Neu...\n", - " 0.222621\n", + " 0.594255\n", " 1\n", " 0\n", " 1.0\n", " 1.0\n", " 0\n", " ...\n", - " 0.262118\n", - " 1\n", - " 1.0\n", - " 0.5\n", - " 1.0\n", + " NaN\n", + " 0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0\n", + " 0\n", " 0\n", - " 1\n", - " 1\n", " None\n", - " 4.507702\n", + " 3.628600\n", " \n", " \n", " 140a4819-0046-469d-b4df-8e747ddae112\n", @@ -2136,16 +2144,16 @@ " 1.0\n", " 0\n", " ...\n", - " 0.279330\n", - " 1\n", - " 1.0\n", - " 1.0\n", + " 0.393643\n", + " 0\n", " 1.0\n", + " 0.5\n", + " 0.8\n", " 0\n", " 1\n", " 0\n", " None\n", - " 3.654116\n", + " 3.711707\n", " \n", " \n", " 7b0a9dd9-68ce-41a1-9f9d-067d93175477\n", @@ -2160,16 +2168,16 @@ " 0.8\n", " 1\n", " ...\n", - " NaN\n", - " 0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0\n", - " 0\n", + " 0.436747\n", + " 1\n", + " 1.0\n", + " 0.5\n", + " 1.0\n", " 0\n", + " 1\n", + " 1\n", " None\n", - " 4.666831\n", + " 4.410890\n", " \n", " \n", "\n", @@ -2207,8 +2215,8 @@ "\n", " feedback.json_edit_distance \\\n", "23a81130-2ad9-46cf-ad27-46589bcea94a 0.089219 \n", - "598316ec-f5e2-4b4d-83a8-36adb18e12fe 0.174905 \n", - "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268 0.222621 \n", + "598316ec-f5e2-4b4d-83a8-36adb18e12fe 0.171103 \n", + "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268 0.594255 \n", "140a4819-0046-469d-b4df-8e747ddae112 0.353261 \n", "7b0a9dd9-68ce-41a1-9f9d-067d93175477 0.562950 \n", "\n", @@ -2248,67 +2256,67 @@ "7b0a9dd9-68ce-41a1-9f9d-067d93175477 1 ... \n", "\n", " feedback.json_edit_distance_llama_v2 \\\n", - "23a81130-2ad9-46cf-ad27-46589bcea94a 0.477690 \n", - "598316ec-f5e2-4b4d-83a8-36adb18e12fe 0.346749 \n", - "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268 0.262118 \n", - "140a4819-0046-469d-b4df-8e747ddae112 0.279330 \n", - "7b0a9dd9-68ce-41a1-9f9d-067d93175477 NaN \n", + "23a81130-2ad9-46cf-ad27-46589bcea94a 0.552239 \n", + "598316ec-f5e2-4b4d-83a8-36adb18e12fe NaN \n", + "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268 NaN \n", + "140a4819-0046-469d-b4df-8e747ddae112 0.393643 \n", + "7b0a9dd9-68ce-41a1-9f9d-067d93175477 0.436747 \n", "\n", " feedback.json_schema_llama_v2 \\\n", "23a81130-2ad9-46cf-ad27-46589bcea94a 1 \n", - "598316ec-f5e2-4b4d-83a8-36adb18e12fe 1 \n", - "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268 1 \n", - "140a4819-0046-469d-b4df-8e747ddae112 1 \n", - "7b0a9dd9-68ce-41a1-9f9d-067d93175477 0 \n", + "598316ec-f5e2-4b4d-83a8-36adb18e12fe 0 \n", + "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268 0 \n", + "140a4819-0046-469d-b4df-8e747ddae112 0 \n", + "7b0a9dd9-68ce-41a1-9f9d-067d93175477 1 \n", "\n", " feedback.toxicity_similarity_llama_v2 \\\n", "23a81130-2ad9-46cf-ad27-46589bcea94a 0.0 \n", - "598316ec-f5e2-4b4d-83a8-36adb18e12fe 1.0 \n", - "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268 1.0 \n", + "598316ec-f5e2-4b4d-83a8-36adb18e12fe 0.0 \n", + "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268 0.0 \n", "140a4819-0046-469d-b4df-8e747ddae112 1.0 \n", - "7b0a9dd9-68ce-41a1-9f9d-067d93175477 0.0 \n", + "7b0a9dd9-68ce-41a1-9f9d-067d93175477 1.0 \n", "\n", " feedback.sentiment_similarity_llama_v2 \\\n", "23a81130-2ad9-46cf-ad27-46589bcea94a 0.5 \n", - "598316ec-f5e2-4b4d-83a8-36adb18e12fe 1.0 \n", - "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268 0.5 \n", - "140a4819-0046-469d-b4df-8e747ddae112 1.0 \n", - "7b0a9dd9-68ce-41a1-9f9d-067d93175477 0.0 \n", + "598316ec-f5e2-4b4d-83a8-36adb18e12fe 0.0 \n", + "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268 0.0 \n", + "140a4819-0046-469d-b4df-8e747ddae112 0.5 \n", + "7b0a9dd9-68ce-41a1-9f9d-067d93175477 0.5 \n", "\n", " feedback.confidence_level_similarity_llama_v2 \\\n", "23a81130-2ad9-46cf-ad27-46589bcea94a 0.8 \n", - "598316ec-f5e2-4b4d-83a8-36adb18e12fe 0.4 \n", - "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268 1.0 \n", - "140a4819-0046-469d-b4df-8e747ddae112 1.0 \n", - "7b0a9dd9-68ce-41a1-9f9d-067d93175477 0.0 \n", + "598316ec-f5e2-4b4d-83a8-36adb18e12fe 0.0 \n", + "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268 0.0 \n", + "140a4819-0046-469d-b4df-8e747ddae112 0.8 \n", + "7b0a9dd9-68ce-41a1-9f9d-067d93175477 1.0 \n", "\n", " feedback.question_category_llama_v2 \\\n", "23a81130-2ad9-46cf-ad27-46589bcea94a 0 \n", - "598316ec-f5e2-4b4d-83a8-36adb18e12fe 1 \n", + "598316ec-f5e2-4b4d-83a8-36adb18e12fe 0 \n", "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268 0 \n", "140a4819-0046-469d-b4df-8e747ddae112 0 \n", "7b0a9dd9-68ce-41a1-9f9d-067d93175477 0 \n", "\n", " feedback.off_topic_similarity_llama_v2 \\\n", "23a81130-2ad9-46cf-ad27-46589bcea94a 0 \n", - "598316ec-f5e2-4b4d-83a8-36adb18e12fe 1 \n", - "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268 1 \n", + "598316ec-f5e2-4b4d-83a8-36adb18e12fe 0 \n", + "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268 0 \n", "140a4819-0046-469d-b4df-8e747ddae112 1 \n", - "7b0a9dd9-68ce-41a1-9f9d-067d93175477 0 \n", + "7b0a9dd9-68ce-41a1-9f9d-067d93175477 1 \n", "\n", " feedback.programming_language_similarity_llama_v2 \\\n", "23a81130-2ad9-46cf-ad27-46589bcea94a 1 \n", - "598316ec-f5e2-4b4d-83a8-36adb18e12fe 1 \n", - "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268 1 \n", + "598316ec-f5e2-4b4d-83a8-36adb18e12fe 0 \n", + "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268 0 \n", "140a4819-0046-469d-b4df-8e747ddae112 0 \n", - "7b0a9dd9-68ce-41a1-9f9d-067d93175477 0 \n", + "7b0a9dd9-68ce-41a1-9f9d-067d93175477 1 \n", "\n", " error_llama_v2 execution_time_llama_v2 \n", - "23a81130-2ad9-46cf-ad27-46589bcea94a None 4.100867 \n", - "598316ec-f5e2-4b4d-83a8-36adb18e12fe None 3.653370 \n", - "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268 None 4.507702 \n", - "140a4819-0046-469d-b4df-8e747ddae112 None 3.654116 \n", - "7b0a9dd9-68ce-41a1-9f9d-067d93175477 None 4.666831 \n", + "23a81130-2ad9-46cf-ad27-46589bcea94a None 3.981128 \n", + "598316ec-f5e2-4b4d-83a8-36adb18e12fe None 10.942758 \n", + "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268 None 3.628600 \n", + "140a4819-0046-469d-b4df-8e747ddae112 None 3.711707 \n", + "7b0a9dd9-68ce-41a1-9f9d-067d93175477 None 4.410890 \n", "\n", "[5 rows x 56 columns]" ] @@ -2449,17 +2457,17 @@ " \n", " \n", " mean\n", - " 11.573060\n", - " 11.570401\n", - " 10.930946\n", - " 4.310952\n", + " 6.949585\n", + " 10.556105\n", + " 11.128319\n", + " 4.738518\n", " \n", " \n", " std\n", - " 4.616704\n", - " 1.648157\n", - " 1.594109\n", - " 0.591779\n", + " 1.639494\n", + " 1.790352\n", + " 4.845637\n", + " 3.162978\n", " \n", " \n", "\n", @@ -2467,12 +2475,12 @@ ], "text/plain": [ " execution_time.gpt-4 execution_time execution_time.claude_xsd \\\n", - "mean 11.573060 11.570401 10.930946 \n", - "std 4.616704 1.648157 1.594109 \n", + "mean 6.949585 10.556105 11.128319 \n", + "std 1.639494 1.790352 4.845637 \n", "\n", " execution_time.llama_v2 \n", - "mean 4.310952 \n", - "std 0.591779 " + "mean 4.738518 \n", + "std 3.162978 " ] }, "metadata": {}, @@ -2550,51 +2558,51 @@ " \n", " \n", " mean\n", - " 0.258825\n", + " 0.283000\n", " 1.0\n", " 0.0\n", " 1.0\n", - " 0.962963\n", - " 0.592593\n", + " 0.940741\n", + " 0.555556\n", " 0.888889\n", " 0.592593\n", - " 0.365055\n", + " 0.371950\n", " 0.777778\n", " ...\n", " 0.0\n", " 0.518519\n", - " 0.378524\n", - " 0.407407\n", - " 0.592593\n", - " 0.574074\n", - " 0.681481\n", - " 0.148148\n", - " 0.666667\n", - " 0.481481\n", + " 0.399687\n", + " 0.333333\n", + " 0.444444\n", + " 0.444444\n", + " 0.540741\n", + " 0.074074\n", + " 0.518519\n", + " 0.222222\n", " \n", " \n", " std\n", - " 0.177651\n", + " 0.181282\n", " 0.0\n", " 0.0\n", " 0.0\n", - " 0.079169\n", - " 0.500712\n", + " 0.093064\n", + " 0.506370\n", " 0.320256\n", " 0.500712\n", - " 0.108204\n", + " 0.108628\n", " 0.423659\n", " ...\n", " 0.0\n", " 0.509175\n", - " 0.127190\n", - " 0.500712\n", - " 0.500712\n", - " 0.359051\n", - " 0.389316\n", - " 0.362014\n", + " 0.097771\n", " 0.480384\n", + " 0.506370\n", + " 0.423659\n", + " 0.439632\n", + " 0.266880\n", " 0.509175\n", + " 0.423659\n", " \n", " \n", "\n", @@ -2603,28 +2611,28 @@ ], "text/plain": [ " feedback.json_edit_distance.gpt-4 feedback.json_schema.gpt-4 \\\n", - "mean 0.258825 1.0 \n", - "std 0.177651 0.0 \n", + "mean 0.283000 1.0 \n", + "std 0.181282 0.0 \n", "\n", " feedback.toxicity_similarity.gpt-4 feedback.sentiment_similarity.gpt-4 \\\n", "mean 0.0 1.0 \n", "std 0.0 0.0 \n", "\n", " feedback.confidence_level_similarity.gpt-4 \\\n", - "mean 0.962963 \n", - "std 0.079169 \n", + "mean 0.940741 \n", + "std 0.093064 \n", "\n", " feedback.question_category.gpt-4 feedback.off_topic_similarity.gpt-4 \\\n", - "mean 0.592593 0.888889 \n", - "std 0.500712 0.320256 \n", + "mean 0.555556 0.888889 \n", + "std 0.506370 0.320256 \n", "\n", " feedback.programming_language_similarity.gpt-4 \\\n", "mean 0.592593 \n", "std 0.500712 \n", "\n", " feedback.json_edit_distance feedback.json_schema ... \\\n", - "mean 0.365055 0.777778 ... \n", - "std 0.108204 0.423659 ... \n", + "mean 0.371950 0.777778 ... \n", + "std 0.108628 0.423659 ... \n", "\n", " feedback.off_topic_similarity.claude_xsd \\\n", "mean 0.0 \n", @@ -2635,32 +2643,32 @@ "std 0.509175 \n", "\n", " feedback.json_edit_distance.llama_v2 feedback.json_schema.llama_v2 \\\n", - "mean 0.378524 0.407407 \n", - "std 0.127190 0.500712 \n", + "mean 0.399687 0.333333 \n", + "std 0.097771 0.480384 \n", "\n", " feedback.toxicity_similarity.llama_v2 \\\n", - "mean 0.592593 \n", - "std 0.500712 \n", + "mean 0.444444 \n", + "std 0.506370 \n", "\n", " feedback.sentiment_similarity.llama_v2 \\\n", - "mean 0.574074 \n", - "std 0.359051 \n", + "mean 0.444444 \n", + "std 0.423659 \n", "\n", " feedback.confidence_level_similarity.llama_v2 \\\n", - "mean 0.681481 \n", - "std 0.389316 \n", + "mean 0.540741 \n", + "std 0.439632 \n", "\n", " feedback.question_category.llama_v2 \\\n", - "mean 0.148148 \n", - "std 0.362014 \n", + "mean 0.074074 \n", + "std 0.266880 \n", "\n", " feedback.off_topic_similarity.llama_v2 \\\n", - "mean 0.666667 \n", - "std 0.480384 \n", + "mean 0.518519 \n", + "std 0.509175 \n", "\n", " feedback.programming_language_similarity.llama_v2 \n", - "mean 0.481481 \n", - "std 0.509175 \n", + "mean 0.222222 \n", + "std 0.423659 \n", "\n", "[2 rows x 32 columns]" ] @@ -2710,17 +2718,17 @@ " \n", " \n", " mean\n", - " 0.962963\n", + " 0.940741\n", " 0.970370\n", " 0.970370\n", - " 0.681481\n", + " 0.540741\n", " \n", " \n", " std\n", - " 0.079169\n", + " 0.093064\n", " 0.072403\n", " 0.072403\n", - " 0.389316\n", + " 0.439632\n", " \n", " \n", "\n", @@ -2728,8 +2736,8 @@ ], "text/plain": [ " feedback.confidence_level_similarity.gpt-4 \\\n", - "mean 0.962963 \n", - "std 0.079169 \n", + "mean 0.940741 \n", + "std 0.093064 \n", "\n", " feedback.confidence_level_similarity \\\n", "mean 0.970370 \n", @@ -2740,8 +2748,8 @@ "std 0.072403 \n", "\n", " feedback.confidence_level_similarity.llama_v2 \n", - "mean 0.681481 \n", - "std 0.389316 " + "mean 0.540741 \n", + "std 0.439632 " ] }, "metadata": {}, @@ -2789,17 +2797,17 @@ " \n", " \n", " mean\n", - " 0.258825\n", - " 0.365055\n", - " 0.391835\n", - " 0.378524\n", + " 0.283000\n", + " 0.371950\n", + " 0.394232\n", + " 0.399687\n", " \n", " \n", " std\n", - " 0.177651\n", - " 0.108204\n", - " 0.097901\n", - " 0.127190\n", + " 0.181282\n", + " 0.108628\n", + " 0.117880\n", + " 0.097771\n", " \n", " \n", "\n", @@ -2807,16 +2815,16 @@ ], "text/plain": [ " feedback.json_edit_distance.gpt-4 feedback.json_edit_distance \\\n", - "mean 0.258825 0.365055 \n", - "std 0.177651 0.108204 \n", + "mean 0.283000 0.371950 \n", + "std 0.181282 0.108628 \n", "\n", " feedback.json_edit_distance.claude_xsd \\\n", - "mean 0.391835 \n", - "std 0.097901 \n", + "mean 0.394232 \n", + "std 0.117880 \n", "\n", " feedback.json_edit_distance.llama_v2 \n", - "mean 0.378524 \n", - "std 0.127190 " + "mean 0.399687 \n", + "std 0.097771 " ] }, "metadata": {}, @@ -2867,14 +2875,14 @@ " 1.0\n", " 0.777778\n", " 0.518519\n", - " 0.407407\n", + " 0.333333\n", " \n", " \n", " std\n", " 0.0\n", " 0.423659\n", " 0.509175\n", - " 0.500712\n", + " 0.480384\n", " \n", " \n", "\n", @@ -2886,8 +2894,8 @@ "std 0.0 0.423659 \n", "\n", " feedback.json_schema.claude_xsd feedback.json_schema.llama_v2 \n", - "mean 0.518519 0.407407 \n", - "std 0.509175 0.500712 " + "mean 0.518519 0.333333 \n", + "std 0.509175 0.480384 " ] }, "metadata": {}, @@ -2938,14 +2946,14 @@ " 0.888889\n", " 0.0\n", " 0.0\n", - " 0.666667\n", + " 0.518519\n", " \n", " \n", " std\n", " 0.320256\n", " 0.0\n", " 0.0\n", - " 0.480384\n", + " 0.509175\n", " \n", " \n", "\n", @@ -2961,8 +2969,8 @@ "std 0.0 \n", "\n", " feedback.off_topic_similarity.llama_v2 \n", - "mean 0.666667 \n", - "std 0.480384 " + "mean 0.518519 \n", + "std 0.509175 " ] }, "metadata": {}, @@ -3013,14 +3021,14 @@ " 0.592593\n", " 0.444444\n", " 0.518519\n", - " 0.481481\n", + " 0.222222\n", " \n", " \n", " std\n", " 0.500712\n", " 0.506370\n", " 0.509175\n", - " 0.509175\n", + " 0.423659\n", " \n", " \n", "\n", @@ -3040,8 +3048,8 @@ "std 0.509175 \n", "\n", " feedback.programming_language_similarity.llama_v2 \n", - "mean 0.481481 \n", - "std 0.509175 " + "mean 0.222222 \n", + "std 0.423659 " ] }, "metadata": {}, @@ -3089,17 +3097,17 @@ " \n", " \n", " mean\n", - " 0.592593\n", + " 0.555556\n", " 0.481481\n", " 0.370370\n", - " 0.148148\n", + " 0.074074\n", " \n", " \n", " std\n", - " 0.500712\n", + " 0.506370\n", " 0.509175\n", " 0.492103\n", - " 0.362014\n", + " 0.266880\n", " \n", " \n", "\n", @@ -3107,16 +3115,16 @@ ], "text/plain": [ " feedback.question_category.gpt-4 feedback.question_category \\\n", - "mean 0.592593 0.481481 \n", - "std 0.500712 0.509175 \n", + "mean 0.555556 0.481481 \n", + "std 0.506370 0.509175 \n", "\n", " feedback.question_category.claude_xsd \\\n", "mean 0.370370 \n", "std 0.492103 \n", "\n", " feedback.question_category.llama_v2 \n", - "mean 0.148148 \n", - "std 0.362014 " + "mean 0.074074 \n", + "std 0.266880 " ] }, "metadata": {}, @@ -3165,16 +3173,16 @@ " \n", " mean\n", " 1.0\n", - " 0.944444\n", + " 0.925926\n", " 0.907407\n", - " 0.574074\n", + " 0.444444\n", " \n", " \n", " std\n", " 0.0\n", - " 0.160128\n", + " 0.181007\n", " 0.197924\n", - " 0.359051\n", + " 0.423659\n", " \n", " \n", "\n", @@ -3182,16 +3190,16 @@ ], "text/plain": [ " feedback.sentiment_similarity.gpt-4 feedback.sentiment_similarity \\\n", - "mean 1.0 0.944444 \n", - "std 0.0 0.160128 \n", + "mean 1.0 0.925926 \n", + "std 0.0 0.181007 \n", "\n", " feedback.sentiment_similarity.claude_xsd \\\n", "mean 0.907407 \n", "std 0.197924 \n", "\n", " feedback.sentiment_similarity.llama_v2 \n", - "mean 0.574074 \n", - "std 0.359051 " + "mean 0.444444 \n", + "std 0.423659 " ] }, "metadata": {}, @@ -3242,14 +3250,14 @@ " 0.0\n", " 1.0\n", " 1.0\n", - " 0.592593\n", + " 0.444444\n", " \n", " \n", " std\n", " 0.0\n", " 0.0\n", " 0.0\n", - " 0.500712\n", + " 0.506370\n", " \n", " \n", "\n", @@ -3265,8 +3273,8 @@ "std 0.0 \n", "\n", " feedback.toxicity_similarity.llama_v2 \n", - "mean 0.592593 \n", - "std 0.500712 " + "mean 0.444444 \n", + "std 0.506370 " ] }, "metadata": {}, @@ -3277,6 +3285,16 @@ "for metric in feedback_columns:\n", " render_metric(df, metric)" ] + }, + { + "cell_type": "markdown", + "id": "d1641d5b-362d-4aae-9f42-ccb4726b8229", + "metadata": {}, + "source": [ + "## Next Steps\n", + "\n", + "Try it out yourself! You can see some additional experiments on Open Source models in [this repo](https://github.com/hinthornw/llama-extraction)." + ] } ], "metadata": { diff --git a/docs/source/toc.segment b/docs/source/toc.segment index 529bc8f2..889494b5 100644 --- a/docs/source/toc.segment +++ b/docs/source/toc.segment @@ -24,6 +24,7 @@ ./notebooks/extraction/intro ./notebooks/extraction/email +./notebooks/extraction/chat_extraction ``` ```{toctree} diff --git a/langchain_benchmarks/extraction/tasks/chat_extraction/schema.py b/langchain_benchmarks/extraction/tasks/chat_extraction/schema.py index 5d614035..ba64c3f7 100644 --- a/langchain_benchmarks/extraction/tasks/chat_extraction/schema.py +++ b/langchain_benchmarks/extraction/tasks/chat_extraction/schema.py @@ -58,7 +58,7 @@ class QuestionCategorization(BaseModel): " to assist in building applications with LLMs. Questions may also be about similar packages like LangServe, LangSmith, OpenAI, Anthropic, vectorstores, agents, etc." ) toxicity: int = Field( - ge=0, lt=6, default=0, description="Whether or not the input question is toxic" + ge=0, lt=6, description="Whether or not the input question is toxic" ) sentiment: Sentiment programming_language: ProgrammingLanguage diff --git a/pyproject.toml b/pyproject.toml index 5616c976..c4f53830 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "langchain-benchmarks" -version = "0.0.5" +version = "0.0.6" description = "🦜💪 Flex those feathers!" authors = ["LangChain AI"] license = "MIT"