diff --git a/docs/source/notebooks/extraction/chat_extraction.ipynb b/docs/source/notebooks/extraction/chat_extraction.ipynb
index 5dd795fd..0fe317cc 100644
--- a/docs/source/notebooks/extraction/chat_extraction.ipynb
+++ b/docs/source/notebooks/extraction/chat_extraction.ipynb
@@ -348,8 +348,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "View the evaluation results for project 'gpt-4-1106-preview-af10' at:\n",
- "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/08042749-504d-4509-9549-5f5c579115f6/compare?selectedSessions=9ee3b369-1988-4db0-a2a9-ea6259c8e19c\n",
+ "View the evaluation results for project 'gpt-4-1106-preview-5689' at:\n",
+ "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/08042749-504d-4509-9549-5f5c579115f6/compare?selectedSessions=0c022691-a7ac-4545-b2bc-58aab2d476e8\n",
"\n",
"View all tests for Dataset Chat Extraction at:\n",
"https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/08042749-504d-4509-9549-5f5c579115f6\n",
@@ -456,29 +456,29 @@
" \n",
"
\n",
" mean | \n",
- " 0.258825 | \n",
+ " 0.283000 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
- " 0.962963 | \n",
- " 0.592593 | \n",
+ " 0.940741 | \n",
+ " 0.555556 | \n",
" 0.888889 | \n",
" 0.592593 | \n",
" NaN | \n",
- " 11.573060 | \n",
+ " 6.949585 | \n",
"
\n",
" \n",
" std | \n",
- " 0.177651 | \n",
+ " 0.181282 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
- " 0.079169 | \n",
- " 0.500712 | \n",
+ " 0.093064 | \n",
+ " 0.506370 | \n",
" 0.320256 | \n",
" 0.500712 | \n",
" NaN | \n",
- " 4.616704 | \n",
+ " 1.639494 | \n",
"
\n",
" \n",
" min | \n",
@@ -491,24 +491,24 @@
" 0.000000 | \n",
" 0.000000 | \n",
" NaN | \n",
- " 6.609211 | \n",
+ " 4.248728 | \n",
"
\n",
" \n",
" 25% | \n",
- " 0.100351 | \n",
+ " 0.104149 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
- " 1.000000 | \n",
+ " 0.800000 | \n",
" 0.000000 | \n",
" 1.000000 | \n",
" 0.000000 | \n",
" NaN | \n",
- " 8.454940 | \n",
+ " 5.679244 | \n",
"
\n",
" \n",
" 50% | \n",
- " 0.222621 | \n",
+ " 0.336343 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
@@ -517,11 +517,11 @@
" 1.000000 | \n",
" 1.000000 | \n",
" NaN | \n",
- " 10.141127 | \n",
+ " 6.558088 | \n",
"
\n",
" \n",
" 75% | \n",
- " 0.365307 | \n",
+ " 0.378270 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
@@ -530,11 +530,11 @@
" 1.000000 | \n",
" 1.000000 | \n",
" NaN | \n",
- " 13.332418 | \n",
+ " 8.300396 | \n",
"
\n",
" \n",
" max | \n",
- " 0.595300 | \n",
+ " 0.594255 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
@@ -543,7 +543,7 @@
" 1.000000 | \n",
" 1.000000 | \n",
" NaN | \n",
- " 27.191173 | \n",
+ " 10.123084 | \n",
"
\n",
" \n",
"\n",
@@ -555,13 +555,13 @@
"unique NaN NaN \n",
"top NaN NaN \n",
"freq NaN NaN \n",
- "mean 0.258825 1.0 \n",
- "std 0.177651 0.0 \n",
+ "mean 0.283000 1.0 \n",
+ "std 0.181282 0.0 \n",
"min 0.049430 1.0 \n",
- "25% 0.100351 1.0 \n",
- "50% 0.222621 1.0 \n",
- "75% 0.365307 1.0 \n",
- "max 0.595300 1.0 \n",
+ "25% 0.104149 1.0 \n",
+ "50% 0.336343 1.0 \n",
+ "75% 0.378270 1.0 \n",
+ "max 0.594255 1.0 \n",
"\n",
" feedback.toxicity_similarity feedback.sentiment_similarity \\\n",
"count 27.0 27.0 \n",
@@ -581,10 +581,10 @@
"unique NaN NaN \n",
"top NaN NaN \n",
"freq NaN NaN \n",
- "mean 0.962963 0.592593 \n",
- "std 0.079169 0.500712 \n",
+ "mean 0.940741 0.555556 \n",
+ "std 0.093064 0.506370 \n",
"min 0.800000 0.000000 \n",
- "25% 1.000000 0.000000 \n",
+ "25% 0.800000 0.000000 \n",
"50% 1.000000 1.000000 \n",
"75% 1.000000 1.000000 \n",
"max 1.000000 1.000000 \n",
@@ -607,13 +607,13 @@
"unique NaN 0 NaN \n",
"top NaN NaN NaN \n",
"freq NaN NaN NaN \n",
- "mean 0.592593 NaN 11.573060 \n",
- "std 0.500712 NaN 4.616704 \n",
- "min 0.000000 NaN 6.609211 \n",
- "25% 0.000000 NaN 8.454940 \n",
- "50% 1.000000 NaN 10.141127 \n",
- "75% 1.000000 NaN 13.332418 \n",
- "max 1.000000 NaN 27.191173 "
+ "mean 0.592593 NaN 6.949585 \n",
+ "std 0.500712 NaN 1.639494 \n",
+ "min 0.000000 NaN 4.248728 \n",
+ "25% 0.000000 NaN 5.679244 \n",
+ "50% 1.000000 NaN 6.558088 \n",
+ "75% 1.000000 NaN 8.300396 \n",
+ "max 1.000000 NaN 10.123084 "
]
},
"metadata": {},
@@ -809,8 +809,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "View the evaluation results for project 'claude-2-json-schema-to-xml-af10' at:\n",
- "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/08042749-504d-4509-9549-5f5c579115f6/compare?selectedSessions=fb67ac1a-4e37-44ca-94bf-970eee89ee04\n",
+ "View the evaluation results for project 'claude-2-json-schema-to-xml-5689' at:\n",
+ "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/08042749-504d-4509-9549-5f5c579115f6/compare?selectedSessions=3f590999-a9d1-48be-83dd-e84acb99a195\n",
"\n",
"View all tests for Dataset Chat Extraction at:\n",
"https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/08042749-504d-4509-9549-5f5c579115f6\n",
@@ -917,29 +917,29 @@
" \n",
" \n",
" mean | \n",
- " 0.365055 | \n",
+ " 0.371950 | \n",
" 0.777778 | \n",
" 1.0 | \n",
- " 0.944444 | \n",
+ " 0.925926 | \n",
" 0.970370 | \n",
" 0.481481 | \n",
" 0.0 | \n",
" 0.444444 | \n",
" NaN | \n",
- " 11.570401 | \n",
+ " 10.556105 | \n",
"
\n",
" \n",
" std | \n",
- " 0.108204 | \n",
+ " 0.108628 | \n",
" 0.423659 | \n",
" 0.0 | \n",
- " 0.160128 | \n",
+ " 0.181007 | \n",
" 0.072403 | \n",
" 0.509175 | \n",
" 0.0 | \n",
" 0.506370 | \n",
" NaN | \n",
- " 1.648157 | \n",
+ " 1.790352 | \n",
"
\n",
" \n",
" min | \n",
@@ -952,11 +952,11 @@
" 0.0 | \n",
" 0.000000 | \n",
" NaN | \n",
- " 8.821772 | \n",
+ " 8.435542 | \n",
"
\n",
" \n",
" 25% | \n",
- " 0.298704 | \n",
+ " 0.312445 | \n",
" 1.000000 | \n",
" 1.0 | \n",
" 1.000000 | \n",
@@ -965,11 +965,11 @@
" 0.0 | \n",
" 0.000000 | \n",
" NaN | \n",
- " 10.545821 | \n",
+ " 9.077631 | \n",
"
\n",
" \n",
" 50% | \n",
- " 0.393478 | \n",
+ " 0.390000 | \n",
" 1.000000 | \n",
" 1.0 | \n",
" 1.000000 | \n",
@@ -978,11 +978,11 @@
" 0.0 | \n",
" 0.000000 | \n",
" NaN | \n",
- " 11.427731 | \n",
+ " 10.059124 | \n",
"
\n",
" \n",
" 75% | \n",
- " 0.444609 | \n",
+ " 0.462694 | \n",
" 1.000000 | \n",
" 1.0 | \n",
" 1.000000 | \n",
@@ -991,7 +991,7 @@
" 0.0 | \n",
" 1.000000 | \n",
" NaN | \n",
- " 12.390761 | \n",
+ " 11.795210 | \n",
"
\n",
" \n",
" max | \n",
@@ -1004,7 +1004,7 @@
" 0.0 | \n",
" 1.000000 | \n",
" NaN | \n",
- " 17.776214 | \n",
+ " 15.072743 | \n",
"
\n",
" \n",
"\n",
@@ -1016,12 +1016,12 @@
"unique NaN NaN \n",
"top NaN NaN \n",
"freq NaN NaN \n",
- "mean 0.365055 0.777778 \n",
- "std 0.108204 0.423659 \n",
+ "mean 0.371950 0.777778 \n",
+ "std 0.108628 0.423659 \n",
"min 0.105033 0.000000 \n",
- "25% 0.298704 1.000000 \n",
- "50% 0.393478 1.000000 \n",
- "75% 0.444609 1.000000 \n",
+ "25% 0.312445 1.000000 \n",
+ "50% 0.390000 1.000000 \n",
+ "75% 0.462694 1.000000 \n",
"max 0.537678 1.000000 \n",
"\n",
" feedback.toxicity_similarity feedback.sentiment_similarity \\\n",
@@ -1029,8 +1029,8 @@
"unique NaN NaN \n",
"top NaN NaN \n",
"freq NaN NaN \n",
- "mean 1.0 0.944444 \n",
- "std 0.0 0.160128 \n",
+ "mean 1.0 0.925926 \n",
+ "std 0.0 0.181007 \n",
"min 1.0 0.500000 \n",
"25% 1.0 1.000000 \n",
"50% 1.0 1.000000 \n",
@@ -1068,13 +1068,13 @@
"unique NaN 0 NaN \n",
"top NaN NaN NaN \n",
"freq NaN NaN NaN \n",
- "mean 0.444444 NaN 11.570401 \n",
- "std 0.506370 NaN 1.648157 \n",
- "min 0.000000 NaN 8.821772 \n",
- "25% 0.000000 NaN 10.545821 \n",
- "50% 0.000000 NaN 11.427731 \n",
- "75% 1.000000 NaN 12.390761 \n",
- "max 1.000000 NaN 17.776214 "
+ "mean 0.444444 NaN 10.556105 \n",
+ "std 0.506370 NaN 1.790352 \n",
+ "min 0.000000 NaN 8.435542 \n",
+ "25% 0.000000 NaN 9.077631 \n",
+ "50% 0.000000 NaN 10.059124 \n",
+ "75% 1.000000 NaN 11.795210 \n",
+ "max 1.000000 NaN 15.072743 "
]
},
"metadata": {},
@@ -1282,8 +1282,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "View the evaluation results for project 'claude-2-xsd-to-xml-af10' at:\n",
- "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/08042749-504d-4509-9549-5f5c579115f6/compare?selectedSessions=07edf03d-97b9-42a8-acde-9a6e9facb388\n",
+ "View the evaluation results for project 'claude-2-xsd-to-xml-5689' at:\n",
+ "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/08042749-504d-4509-9549-5f5c579115f6/compare?selectedSessions=dc7656d8-00ef-4048-9ce5-38ef72af593c\n",
"\n",
"View all tests for Dataset Chat Extraction at:\n",
"https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/08042749-504d-4509-9549-5f5c579115f6\n",
@@ -1390,7 +1390,7 @@
" \n",
" \n",
" mean | \n",
- " 0.391835 | \n",
+ " 0.394232 | \n",
" 0.518519 | \n",
" 1.0 | \n",
" 0.907407 | \n",
@@ -1399,11 +1399,11 @@
" 0.0 | \n",
" 0.518519 | \n",
" NaN | \n",
- " 10.930946 | \n",
+ " 11.128319 | \n",
"
\n",
" \n",
" std | \n",
- " 0.097901 | \n",
+ " 0.117880 | \n",
" 0.509175 | \n",
" 0.0 | \n",
" 0.197924 | \n",
@@ -1412,7 +1412,7 @@
" 0.0 | \n",
" 0.509175 | \n",
" NaN | \n",
- " 1.594109 | \n",
+ " 4.845637 | \n",
"
\n",
" \n",
" min | \n",
@@ -1425,11 +1425,11 @@
" 0.0 | \n",
" 0.000000 | \n",
" NaN | \n",
- " 8.416739 | \n",
+ " 7.833285 | \n",
"
\n",
" \n",
" 25% | \n",
- " 0.348812 | \n",
+ " 0.332400 | \n",
" 0.000000 | \n",
" 1.0 | \n",
" 1.000000 | \n",
@@ -1438,11 +1438,11 @@
" 0.0 | \n",
" 0.000000 | \n",
" NaN | \n",
- " 9.813120 | \n",
+ " 8.888438 | \n",
"
\n",
" \n",
" 50% | \n",
- " 0.379653 | \n",
+ " 0.380435 | \n",
" 1.000000 | \n",
" 1.0 | \n",
" 1.000000 | \n",
@@ -1451,11 +1451,11 @@
" 0.0 | \n",
" 1.000000 | \n",
" NaN | \n",
- " 10.371725 | \n",
+ " 9.629613 | \n",
"
\n",
" \n",
" 75% | \n",
- " 0.425574 | \n",
+ " 0.456592 | \n",
" 1.000000 | \n",
" 1.0 | \n",
" 1.000000 | \n",
@@ -1464,7 +1464,7 @@
" 0.0 | \n",
" 1.000000 | \n",
" NaN | \n",
- " 11.964592 | \n",
+ " 11.143679 | \n",
"
\n",
" \n",
" max | \n",
@@ -1477,7 +1477,7 @@
" 0.0 | \n",
" 1.000000 | \n",
" NaN | \n",
- " 14.291423 | \n",
+ " 32.068304 | \n",
"
\n",
" \n",
"\n",
@@ -1489,12 +1489,12 @@
"unique NaN NaN \n",
"top NaN NaN \n",
"freq NaN NaN \n",
- "mean 0.391835 0.518519 \n",
- "std 0.097901 0.509175 \n",
+ "mean 0.394232 0.518519 \n",
+ "std 0.117880 0.509175 \n",
"min 0.116608 0.000000 \n",
- "25% 0.348812 0.000000 \n",
- "50% 0.379653 1.000000 \n",
- "75% 0.425574 1.000000 \n",
+ "25% 0.332400 0.000000 \n",
+ "50% 0.380435 1.000000 \n",
+ "75% 0.456592 1.000000 \n",
"max 0.644007 1.000000 \n",
"\n",
" feedback.toxicity_similarity feedback.sentiment_similarity \\\n",
@@ -1541,13 +1541,13 @@
"unique NaN 0 NaN \n",
"top NaN NaN NaN \n",
"freq NaN NaN NaN \n",
- "mean 0.518519 NaN 10.930946 \n",
- "std 0.509175 NaN 1.594109 \n",
- "min 0.000000 NaN 8.416739 \n",
- "25% 0.000000 NaN 9.813120 \n",
- "50% 1.000000 NaN 10.371725 \n",
- "75% 1.000000 NaN 11.964592 \n",
- "max 1.000000 NaN 14.291423 "
+ "mean 0.518519 NaN 11.128319 \n",
+ "std 0.509175 NaN 4.845637 \n",
+ "min 0.000000 NaN 7.833285 \n",
+ "25% 0.000000 NaN 8.888438 \n",
+ "50% 1.000000 NaN 9.629613 \n",
+ "75% 1.000000 NaN 11.143679 \n",
+ "max 1.000000 NaN 32.068304 "
]
},
"metadata": {},
@@ -1659,7 +1659,15 @@
{
"data": {
"text/plain": [
- "{'output': 'Here is the generated ticket:\\n{\"GenerateTicket\": {\"issue_summary\": \"Running Llama 2 locally\", \"question\": {\"QuestionCategorization\": {\"question_category\": \"Technical Integration\", \"is_off_topic\": false, \"sentiment\": \"Neutral\", \"programming_language\": \"unknown\"}}}}'}"
+ "{'output': {'issue_summary': 'How to run Llama 2 locally',\n",
+ " 'question': {'question_category': 'Implementation Issues',\n",
+ " 'is_off_topic': False,\n",
+ " 'toxicity': 0,\n",
+ " 'sentiment': 'Neutral',\n",
+ " 'programming_language': 'cpp'},\n",
+ " 'response': {'response_type': 'Resolve Issue',\n",
+ " 'confidence_level': 5,\n",
+ " 'followup_actions': ['Please provide more information about the environment (OS, versions, etc.) and the specific issue you are experiencing.']}}}"
]
},
"execution_count": 15,
@@ -1686,8 +1694,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "View the evaluation results for project 'llama-v2-34b-code-instruct-af10' at:\n",
- "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/08042749-504d-4509-9549-5f5c579115f6/compare?selectedSessions=53631e16-bdb2-4d53-ae8f-10cde961614e\n",
+ "View the evaluation results for project 'llama-v2-34b-code-instruct-5689' at:\n",
+ "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/08042749-504d-4509-9549-5f5c579115f6/compare?selectedSessions=dc2e0648-7e65-4d60-a149-15c24bca943b\n",
"\n",
"View all tests for Dataset Chat Extraction at:\n",
"https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/08042749-504d-4509-9549-5f5c579115f6\n",
@@ -1742,7 +1750,7 @@
" \n",
" \n",
" count | \n",
- " 23.000000 | \n",
+ " 17.000000 | \n",
" 27.000000 | \n",
" 27.000000 | \n",
" 27.000000 | \n",
@@ -1794,33 +1802,33 @@
"
\n",
" \n",
" mean | \n",
- " 0.378524 | \n",
- " 0.407407 | \n",
- " 0.592593 | \n",
- " 0.574074 | \n",
- " 0.681481 | \n",
- " 0.148148 | \n",
- " 0.666667 | \n",
- " 0.481481 | \n",
+ " 0.399687 | \n",
+ " 0.333333 | \n",
+ " 0.444444 | \n",
+ " 0.444444 | \n",
+ " 0.540741 | \n",
+ " 0.074074 | \n",
+ " 0.518519 | \n",
+ " 0.222222 | \n",
" NaN | \n",
- " 4.310952 | \n",
+ " 4.738518 | \n",
"
\n",
" \n",
" std | \n",
- " 0.127190 | \n",
- " 0.500712 | \n",
- " 0.500712 | \n",
- " 0.359051 | \n",
- " 0.389316 | \n",
- " 0.362014 | \n",
+ " 0.097771 | \n",
" 0.480384 | \n",
+ " 0.506370 | \n",
+ " 0.423659 | \n",
+ " 0.439632 | \n",
+ " 0.266880 | \n",
" 0.509175 | \n",
+ " 0.423659 | \n",
" NaN | \n",
- " 0.591779 | \n",
+ " 3.162978 | \n",
"
\n",
" \n",
" min | \n",
- " 0.089130 | \n",
+ " 0.197279 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
@@ -1829,50 +1837,50 @@
" 0.000000 | \n",
" 0.000000 | \n",
" NaN | \n",
- " 3.046112 | \n",
+ " 3.224190 | \n",
"
\n",
" \n",
" 25% | \n",
- " 0.309743 | \n",
+ " 0.325069 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
- " 0.500000 | \n",
- " 0.500000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" NaN | \n",
- " 3.983488 | \n",
+ " 3.595067 | \n",
"
\n",
" \n",
" 50% | \n",
- " 0.352751 | \n",
+ " 0.413203 | \n",
+ " 0.000000 | \n",
" 0.000000 | \n",
- " 1.000000 | \n",
" 0.500000 | \n",
" 0.800000 | \n",
" 0.000000 | \n",
" 1.000000 | \n",
" 0.000000 | \n",
" NaN | \n",
- " 4.326160 | \n",
+ " 3.744033 | \n",
"
\n",
" \n",
" 75% | \n",
- " 0.468417 | \n",
+ " 0.471366 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 0.000000 | \n",
" 1.000000 | \n",
- " 1.000000 | \n",
+ " 0.000000 | \n",
" NaN | \n",
- " 4.687441 | \n",
+ " 4.211040 | \n",
"
\n",
" \n",
" max | \n",
- " 0.659091 | \n",
+ " 0.552430 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
@@ -1881,7 +1889,7 @@
" 1.000000 | \n",
" 1.000000 | \n",
" NaN | \n",
- " 5.713148 | \n",
+ " 18.660901 | \n",
"
\n",
" \n",
"\n",
@@ -1889,28 +1897,28 @@
],
"text/plain": [
" feedback.json_edit_distance feedback.json_schema \\\n",
- "count 23.000000 27.000000 \n",
+ "count 17.000000 27.000000 \n",
"unique NaN NaN \n",
"top NaN NaN \n",
"freq NaN NaN \n",
- "mean 0.378524 0.407407 \n",
- "std 0.127190 0.500712 \n",
- "min 0.089130 0.000000 \n",
- "25% 0.309743 0.000000 \n",
- "50% 0.352751 0.000000 \n",
- "75% 0.468417 1.000000 \n",
- "max 0.659091 1.000000 \n",
+ "mean 0.399687 0.333333 \n",
+ "std 0.097771 0.480384 \n",
+ "min 0.197279 0.000000 \n",
+ "25% 0.325069 0.000000 \n",
+ "50% 0.413203 0.000000 \n",
+ "75% 0.471366 1.000000 \n",
+ "max 0.552430 1.000000 \n",
"\n",
" feedback.toxicity_similarity feedback.sentiment_similarity \\\n",
"count 27.000000 27.000000 \n",
"unique NaN NaN \n",
"top NaN NaN \n",
"freq NaN NaN \n",
- "mean 0.592593 0.574074 \n",
- "std 0.500712 0.359051 \n",
+ "mean 0.444444 0.444444 \n",
+ "std 0.506370 0.423659 \n",
"min 0.000000 0.000000 \n",
- "25% 0.000000 0.500000 \n",
- "50% 1.000000 0.500000 \n",
+ "25% 0.000000 0.000000 \n",
+ "50% 0.000000 0.500000 \n",
"75% 1.000000 1.000000 \n",
"max 1.000000 1.000000 \n",
"\n",
@@ -1919,10 +1927,10 @@
"unique NaN NaN \n",
"top NaN NaN \n",
"freq NaN NaN \n",
- "mean 0.681481 0.148148 \n",
- "std 0.389316 0.362014 \n",
+ "mean 0.540741 0.074074 \n",
+ "std 0.439632 0.266880 \n",
"min 0.000000 0.000000 \n",
- "25% 0.500000 0.000000 \n",
+ "25% 0.000000 0.000000 \n",
"50% 0.800000 0.000000 \n",
"75% 1.000000 0.000000 \n",
"max 1.000000 1.000000 \n",
@@ -1932,8 +1940,8 @@
"unique NaN \n",
"top NaN \n",
"freq NaN \n",
- "mean 0.666667 \n",
- "std 0.480384 \n",
+ "mean 0.518519 \n",
+ "std 0.509175 \n",
"min 0.000000 \n",
"25% 0.000000 \n",
"50% 1.000000 \n",
@@ -1945,13 +1953,13 @@
"unique NaN 0 NaN \n",
"top NaN NaN NaN \n",
"freq NaN NaN NaN \n",
- "mean 0.481481 NaN 4.310952 \n",
- "std 0.509175 NaN 0.591779 \n",
- "min 0.000000 NaN 3.046112 \n",
- "25% 0.000000 NaN 3.983488 \n",
- "50% 0.000000 NaN 4.326160 \n",
- "75% 1.000000 NaN 4.687441 \n",
- "max 1.000000 NaN 5.713148 "
+ "mean 0.222222 NaN 4.738518 \n",
+ "std 0.423659 NaN 3.162978 \n",
+ "min 0.000000 NaN 3.224190 \n",
+ "25% 0.000000 NaN 3.595067 \n",
+ "50% 0.000000 NaN 3.744033 \n",
+ "75% 0.000000 NaN 4.211040 \n",
+ "max 1.000000 NaN 18.660901 "
]
},
"metadata": {},
@@ -2064,7 +2072,7 @@
" 1.0 | \n",
" 1 | \n",
" ... | \n",
- " 0.477690 | \n",
+ " 0.552239 | \n",
" 1 | \n",
" 0.0 | \n",
" 0.5 | \n",
@@ -2073,7 +2081,7 @@
" 0 | \n",
" 1 | \n",
" None | \n",
- " 4.100867 | \n",
+ " 3.981128 | \n",
" \n",
" \n",
" 598316ec-f5e2-4b4d-83a8-36adb18e12fe | \n",
@@ -2081,23 +2089,23 @@
" example for dalle agent | \n",
" {'issue_summary': 'Example for DALL-E Agent', ... | \n",
" {'question': {'toxicity': 0, 'sentiment': 'Neu... | \n",
- " 0.174905 | \n",
+ " 0.171103 | \n",
" 1 | \n",
" 0 | \n",
" 1.0 | \n",
" 0.8 | \n",
" 0 | \n",
" ... | \n",
- " 0.346749 | \n",
- " 1 | \n",
- " 1.0 | \n",
- " 1.0 | \n",
- " 0.4 | \n",
- " 1 | \n",
- " 1 | \n",
- " 1 | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
" None | \n",
- " 3.653370 | \n",
+ " 10.942758 | \n",
"
\n",
" \n",
" d1a1a2e8-6f4c-4325-8aaa-ea20e2449268 | \n",
@@ -2105,23 +2113,23 @@
" how do I run llama2 using pandas | \n",
" {'issue_summary': 'Running Llama2 with Pandas'... | \n",
" {'question': {'toxicity': 0, 'sentiment': 'Neu... | \n",
- " 0.222621 | \n",
+ " 0.594255 | \n",
" 1 | \n",
" 0 | \n",
" 1.0 | \n",
" 1.0 | \n",
" 0 | \n",
" ... | \n",
- " 0.262118 | \n",
- " 1 | \n",
- " 1.0 | \n",
- " 0.5 | \n",
- " 1.0 | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
" 0 | \n",
- " 1 | \n",
- " 1 | \n",
" None | \n",
- " 4.507702 | \n",
+ " 3.628600 | \n",
"
\n",
" \n",
" 140a4819-0046-469d-b4df-8e747ddae112 | \n",
@@ -2136,16 +2144,16 @@
" 1.0 | \n",
" 0 | \n",
" ... | \n",
- " 0.279330 | \n",
- " 1 | \n",
- " 1.0 | \n",
- " 1.0 | \n",
+ " 0.393643 | \n",
+ " 0 | \n",
" 1.0 | \n",
+ " 0.5 | \n",
+ " 0.8 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" None | \n",
- " 3.654116 | \n",
+ " 3.711707 | \n",
"
\n",
" \n",
" 7b0a9dd9-68ce-41a1-9f9d-067d93175477 | \n",
@@ -2160,16 +2168,16 @@
" 0.8 | \n",
" 1 | \n",
" ... | \n",
- " NaN | \n",
- " 0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " 0 | \n",
- " 0 | \n",
+ " 0.436747 | \n",
+ " 1 | \n",
+ " 1.0 | \n",
+ " 0.5 | \n",
+ " 1.0 | \n",
" 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
" None | \n",
- " 4.666831 | \n",
+ " 4.410890 | \n",
"
\n",
" \n",
"\n",
@@ -2207,8 +2215,8 @@
"\n",
" feedback.json_edit_distance \\\n",
"23a81130-2ad9-46cf-ad27-46589bcea94a 0.089219 \n",
- "598316ec-f5e2-4b4d-83a8-36adb18e12fe 0.174905 \n",
- "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268 0.222621 \n",
+ "598316ec-f5e2-4b4d-83a8-36adb18e12fe 0.171103 \n",
+ "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268 0.594255 \n",
"140a4819-0046-469d-b4df-8e747ddae112 0.353261 \n",
"7b0a9dd9-68ce-41a1-9f9d-067d93175477 0.562950 \n",
"\n",
@@ -2248,67 +2256,67 @@
"7b0a9dd9-68ce-41a1-9f9d-067d93175477 1 ... \n",
"\n",
" feedback.json_edit_distance_llama_v2 \\\n",
- "23a81130-2ad9-46cf-ad27-46589bcea94a 0.477690 \n",
- "598316ec-f5e2-4b4d-83a8-36adb18e12fe 0.346749 \n",
- "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268 0.262118 \n",
- "140a4819-0046-469d-b4df-8e747ddae112 0.279330 \n",
- "7b0a9dd9-68ce-41a1-9f9d-067d93175477 NaN \n",
+ "23a81130-2ad9-46cf-ad27-46589bcea94a 0.552239 \n",
+ "598316ec-f5e2-4b4d-83a8-36adb18e12fe NaN \n",
+ "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268 NaN \n",
+ "140a4819-0046-469d-b4df-8e747ddae112 0.393643 \n",
+ "7b0a9dd9-68ce-41a1-9f9d-067d93175477 0.436747 \n",
"\n",
" feedback.json_schema_llama_v2 \\\n",
"23a81130-2ad9-46cf-ad27-46589bcea94a 1 \n",
- "598316ec-f5e2-4b4d-83a8-36adb18e12fe 1 \n",
- "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268 1 \n",
- "140a4819-0046-469d-b4df-8e747ddae112 1 \n",
- "7b0a9dd9-68ce-41a1-9f9d-067d93175477 0 \n",
+ "598316ec-f5e2-4b4d-83a8-36adb18e12fe 0 \n",
+ "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268 0 \n",
+ "140a4819-0046-469d-b4df-8e747ddae112 0 \n",
+ "7b0a9dd9-68ce-41a1-9f9d-067d93175477 1 \n",
"\n",
" feedback.toxicity_similarity_llama_v2 \\\n",
"23a81130-2ad9-46cf-ad27-46589bcea94a 0.0 \n",
- "598316ec-f5e2-4b4d-83a8-36adb18e12fe 1.0 \n",
- "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268 1.0 \n",
+ "598316ec-f5e2-4b4d-83a8-36adb18e12fe 0.0 \n",
+ "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268 0.0 \n",
"140a4819-0046-469d-b4df-8e747ddae112 1.0 \n",
- "7b0a9dd9-68ce-41a1-9f9d-067d93175477 0.0 \n",
+ "7b0a9dd9-68ce-41a1-9f9d-067d93175477 1.0 \n",
"\n",
" feedback.sentiment_similarity_llama_v2 \\\n",
"23a81130-2ad9-46cf-ad27-46589bcea94a 0.5 \n",
- "598316ec-f5e2-4b4d-83a8-36adb18e12fe 1.0 \n",
- "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268 0.5 \n",
- "140a4819-0046-469d-b4df-8e747ddae112 1.0 \n",
- "7b0a9dd9-68ce-41a1-9f9d-067d93175477 0.0 \n",
+ "598316ec-f5e2-4b4d-83a8-36adb18e12fe 0.0 \n",
+ "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268 0.0 \n",
+ "140a4819-0046-469d-b4df-8e747ddae112 0.5 \n",
+ "7b0a9dd9-68ce-41a1-9f9d-067d93175477 0.5 \n",
"\n",
" feedback.confidence_level_similarity_llama_v2 \\\n",
"23a81130-2ad9-46cf-ad27-46589bcea94a 0.8 \n",
- "598316ec-f5e2-4b4d-83a8-36adb18e12fe 0.4 \n",
- "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268 1.0 \n",
- "140a4819-0046-469d-b4df-8e747ddae112 1.0 \n",
- "7b0a9dd9-68ce-41a1-9f9d-067d93175477 0.0 \n",
+ "598316ec-f5e2-4b4d-83a8-36adb18e12fe 0.0 \n",
+ "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268 0.0 \n",
+ "140a4819-0046-469d-b4df-8e747ddae112 0.8 \n",
+ "7b0a9dd9-68ce-41a1-9f9d-067d93175477 1.0 \n",
"\n",
" feedback.question_category_llama_v2 \\\n",
"23a81130-2ad9-46cf-ad27-46589bcea94a 0 \n",
- "598316ec-f5e2-4b4d-83a8-36adb18e12fe 1 \n",
+ "598316ec-f5e2-4b4d-83a8-36adb18e12fe 0 \n",
"d1a1a2e8-6f4c-4325-8aaa-ea20e2449268 0 \n",
"140a4819-0046-469d-b4df-8e747ddae112 0 \n",
"7b0a9dd9-68ce-41a1-9f9d-067d93175477 0 \n",
"\n",
" feedback.off_topic_similarity_llama_v2 \\\n",
"23a81130-2ad9-46cf-ad27-46589bcea94a 0 \n",
- "598316ec-f5e2-4b4d-83a8-36adb18e12fe 1 \n",
- "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268 1 \n",
+ "598316ec-f5e2-4b4d-83a8-36adb18e12fe 0 \n",
+ "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268 0 \n",
"140a4819-0046-469d-b4df-8e747ddae112 1 \n",
- "7b0a9dd9-68ce-41a1-9f9d-067d93175477 0 \n",
+ "7b0a9dd9-68ce-41a1-9f9d-067d93175477 1 \n",
"\n",
" feedback.programming_language_similarity_llama_v2 \\\n",
"23a81130-2ad9-46cf-ad27-46589bcea94a 1 \n",
- "598316ec-f5e2-4b4d-83a8-36adb18e12fe 1 \n",
- "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268 1 \n",
+ "598316ec-f5e2-4b4d-83a8-36adb18e12fe 0 \n",
+ "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268 0 \n",
"140a4819-0046-469d-b4df-8e747ddae112 0 \n",
- "7b0a9dd9-68ce-41a1-9f9d-067d93175477 0 \n",
+ "7b0a9dd9-68ce-41a1-9f9d-067d93175477 1 \n",
"\n",
" error_llama_v2 execution_time_llama_v2 \n",
- "23a81130-2ad9-46cf-ad27-46589bcea94a None 4.100867 \n",
- "598316ec-f5e2-4b4d-83a8-36adb18e12fe None 3.653370 \n",
- "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268 None 4.507702 \n",
- "140a4819-0046-469d-b4df-8e747ddae112 None 3.654116 \n",
- "7b0a9dd9-68ce-41a1-9f9d-067d93175477 None 4.666831 \n",
+ "23a81130-2ad9-46cf-ad27-46589bcea94a None 3.981128 \n",
+ "598316ec-f5e2-4b4d-83a8-36adb18e12fe None 10.942758 \n",
+ "d1a1a2e8-6f4c-4325-8aaa-ea20e2449268 None 3.628600 \n",
+ "140a4819-0046-469d-b4df-8e747ddae112 None 3.711707 \n",
+ "7b0a9dd9-68ce-41a1-9f9d-067d93175477 None 4.410890 \n",
"\n",
"[5 rows x 56 columns]"
]
@@ -2449,17 +2457,17 @@
" \n",
" \n",
" mean | \n",
- " 11.573060 | \n",
- " 11.570401 | \n",
- " 10.930946 | \n",
- " 4.310952 | \n",
+ " 6.949585 | \n",
+ " 10.556105 | \n",
+ " 11.128319 | \n",
+ " 4.738518 | \n",
"
\n",
" \n",
" std | \n",
- " 4.616704 | \n",
- " 1.648157 | \n",
- " 1.594109 | \n",
- " 0.591779 | \n",
+ " 1.639494 | \n",
+ " 1.790352 | \n",
+ " 4.845637 | \n",
+ " 3.162978 | \n",
"
\n",
" \n",
"\n",
@@ -2467,12 +2475,12 @@
],
"text/plain": [
" execution_time.gpt-4 execution_time execution_time.claude_xsd \\\n",
- "mean 11.573060 11.570401 10.930946 \n",
- "std 4.616704 1.648157 1.594109 \n",
+ "mean 6.949585 10.556105 11.128319 \n",
+ "std 1.639494 1.790352 4.845637 \n",
"\n",
" execution_time.llama_v2 \n",
- "mean 4.310952 \n",
- "std 0.591779 "
+ "mean 4.738518 \n",
+ "std 3.162978 "
]
},
"metadata": {},
@@ -2550,51 +2558,51 @@
" \n",
" \n",
" mean | \n",
- " 0.258825 | \n",
+ " 0.283000 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
- " 0.962963 | \n",
- " 0.592593 | \n",
+ " 0.940741 | \n",
+ " 0.555556 | \n",
" 0.888889 | \n",
" 0.592593 | \n",
- " 0.365055 | \n",
+ " 0.371950 | \n",
" 0.777778 | \n",
" ... | \n",
" 0.0 | \n",
" 0.518519 | \n",
- " 0.378524 | \n",
- " 0.407407 | \n",
- " 0.592593 | \n",
- " 0.574074 | \n",
- " 0.681481 | \n",
- " 0.148148 | \n",
- " 0.666667 | \n",
- " 0.481481 | \n",
+ " 0.399687 | \n",
+ " 0.333333 | \n",
+ " 0.444444 | \n",
+ " 0.444444 | \n",
+ " 0.540741 | \n",
+ " 0.074074 | \n",
+ " 0.518519 | \n",
+ " 0.222222 | \n",
"
\n",
" \n",
" std | \n",
- " 0.177651 | \n",
+ " 0.181282 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
- " 0.079169 | \n",
- " 0.500712 | \n",
+ " 0.093064 | \n",
+ " 0.506370 | \n",
" 0.320256 | \n",
" 0.500712 | \n",
- " 0.108204 | \n",
+ " 0.108628 | \n",
" 0.423659 | \n",
" ... | \n",
" 0.0 | \n",
" 0.509175 | \n",
- " 0.127190 | \n",
- " 0.500712 | \n",
- " 0.500712 | \n",
- " 0.359051 | \n",
- " 0.389316 | \n",
- " 0.362014 | \n",
+ " 0.097771 | \n",
" 0.480384 | \n",
+ " 0.506370 | \n",
+ " 0.423659 | \n",
+ " 0.439632 | \n",
+ " 0.266880 | \n",
" 0.509175 | \n",
+ " 0.423659 | \n",
"
\n",
" \n",
"\n",
@@ -2603,28 +2611,28 @@
],
"text/plain": [
" feedback.json_edit_distance.gpt-4 feedback.json_schema.gpt-4 \\\n",
- "mean 0.258825 1.0 \n",
- "std 0.177651 0.0 \n",
+ "mean 0.283000 1.0 \n",
+ "std 0.181282 0.0 \n",
"\n",
" feedback.toxicity_similarity.gpt-4 feedback.sentiment_similarity.gpt-4 \\\n",
"mean 0.0 1.0 \n",
"std 0.0 0.0 \n",
"\n",
" feedback.confidence_level_similarity.gpt-4 \\\n",
- "mean 0.962963 \n",
- "std 0.079169 \n",
+ "mean 0.940741 \n",
+ "std 0.093064 \n",
"\n",
" feedback.question_category.gpt-4 feedback.off_topic_similarity.gpt-4 \\\n",
- "mean 0.592593 0.888889 \n",
- "std 0.500712 0.320256 \n",
+ "mean 0.555556 0.888889 \n",
+ "std 0.506370 0.320256 \n",
"\n",
" feedback.programming_language_similarity.gpt-4 \\\n",
"mean 0.592593 \n",
"std 0.500712 \n",
"\n",
" feedback.json_edit_distance feedback.json_schema ... \\\n",
- "mean 0.365055 0.777778 ... \n",
- "std 0.108204 0.423659 ... \n",
+ "mean 0.371950 0.777778 ... \n",
+ "std 0.108628 0.423659 ... \n",
"\n",
" feedback.off_topic_similarity.claude_xsd \\\n",
"mean 0.0 \n",
@@ -2635,32 +2643,32 @@
"std 0.509175 \n",
"\n",
" feedback.json_edit_distance.llama_v2 feedback.json_schema.llama_v2 \\\n",
- "mean 0.378524 0.407407 \n",
- "std 0.127190 0.500712 \n",
+ "mean 0.399687 0.333333 \n",
+ "std 0.097771 0.480384 \n",
"\n",
" feedback.toxicity_similarity.llama_v2 \\\n",
- "mean 0.592593 \n",
- "std 0.500712 \n",
+ "mean 0.444444 \n",
+ "std 0.506370 \n",
"\n",
" feedback.sentiment_similarity.llama_v2 \\\n",
- "mean 0.574074 \n",
- "std 0.359051 \n",
+ "mean 0.444444 \n",
+ "std 0.423659 \n",
"\n",
" feedback.confidence_level_similarity.llama_v2 \\\n",
- "mean 0.681481 \n",
- "std 0.389316 \n",
+ "mean 0.540741 \n",
+ "std 0.439632 \n",
"\n",
" feedback.question_category.llama_v2 \\\n",
- "mean 0.148148 \n",
- "std 0.362014 \n",
+ "mean 0.074074 \n",
+ "std 0.266880 \n",
"\n",
" feedback.off_topic_similarity.llama_v2 \\\n",
- "mean 0.666667 \n",
- "std 0.480384 \n",
+ "mean 0.518519 \n",
+ "std 0.509175 \n",
"\n",
" feedback.programming_language_similarity.llama_v2 \n",
- "mean 0.481481 \n",
- "std 0.509175 \n",
+ "mean 0.222222 \n",
+ "std 0.423659 \n",
"\n",
"[2 rows x 32 columns]"
]
@@ -2710,17 +2718,17 @@
" \n",
" \n",
" mean | \n",
- " 0.962963 | \n",
+ " 0.940741 | \n",
" 0.970370 | \n",
" 0.970370 | \n",
- " 0.681481 | \n",
+ " 0.540741 | \n",
"
\n",
" \n",
" std | \n",
- " 0.079169 | \n",
+ " 0.093064 | \n",
" 0.072403 | \n",
" 0.072403 | \n",
- " 0.389316 | \n",
+ " 0.439632 | \n",
"
\n",
" \n",
"\n",
@@ -2728,8 +2736,8 @@
],
"text/plain": [
" feedback.confidence_level_similarity.gpt-4 \\\n",
- "mean 0.962963 \n",
- "std 0.079169 \n",
+ "mean 0.940741 \n",
+ "std 0.093064 \n",
"\n",
" feedback.confidence_level_similarity \\\n",
"mean 0.970370 \n",
@@ -2740,8 +2748,8 @@
"std 0.072403 \n",
"\n",
" feedback.confidence_level_similarity.llama_v2 \n",
- "mean 0.681481 \n",
- "std 0.389316 "
+ "mean 0.540741 \n",
+ "std 0.439632 "
]
},
"metadata": {},
@@ -2789,17 +2797,17 @@
" \n",
" \n",
" mean | \n",
- " 0.258825 | \n",
- " 0.365055 | \n",
- " 0.391835 | \n",
- " 0.378524 | \n",
+ " 0.283000 | \n",
+ " 0.371950 | \n",
+ " 0.394232 | \n",
+ " 0.399687 | \n",
"
\n",
" \n",
" std | \n",
- " 0.177651 | \n",
- " 0.108204 | \n",
- " 0.097901 | \n",
- " 0.127190 | \n",
+ " 0.181282 | \n",
+ " 0.108628 | \n",
+ " 0.117880 | \n",
+ " 0.097771 | \n",
"
\n",
" \n",
"\n",
@@ -2807,16 +2815,16 @@
],
"text/plain": [
" feedback.json_edit_distance.gpt-4 feedback.json_edit_distance \\\n",
- "mean 0.258825 0.365055 \n",
- "std 0.177651 0.108204 \n",
+ "mean 0.283000 0.371950 \n",
+ "std 0.181282 0.108628 \n",
"\n",
" feedback.json_edit_distance.claude_xsd \\\n",
- "mean 0.391835 \n",
- "std 0.097901 \n",
+ "mean 0.394232 \n",
+ "std 0.117880 \n",
"\n",
" feedback.json_edit_distance.llama_v2 \n",
- "mean 0.378524 \n",
- "std 0.127190 "
+ "mean 0.399687 \n",
+ "std 0.097771 "
]
},
"metadata": {},
@@ -2867,14 +2875,14 @@
" 1.0 | \n",
" 0.777778 | \n",
" 0.518519 | \n",
- " 0.407407 | \n",
+ " 0.333333 | \n",
" \n",
" \n",
" std | \n",
" 0.0 | \n",
" 0.423659 | \n",
" 0.509175 | \n",
- " 0.500712 | \n",
+ " 0.480384 | \n",
"
\n",
" \n",
"\n",
@@ -2886,8 +2894,8 @@
"std 0.0 0.423659 \n",
"\n",
" feedback.json_schema.claude_xsd feedback.json_schema.llama_v2 \n",
- "mean 0.518519 0.407407 \n",
- "std 0.509175 0.500712 "
+ "mean 0.518519 0.333333 \n",
+ "std 0.509175 0.480384 "
]
},
"metadata": {},
@@ -2938,14 +2946,14 @@
" 0.888889 | \n",
" 0.0 | \n",
" 0.0 | \n",
- " 0.666667 | \n",
+ " 0.518519 | \n",
" \n",
" \n",
" std | \n",
" 0.320256 | \n",
" 0.0 | \n",
" 0.0 | \n",
- " 0.480384 | \n",
+ " 0.509175 | \n",
"
\n",
" \n",
"\n",
@@ -2961,8 +2969,8 @@
"std 0.0 \n",
"\n",
" feedback.off_topic_similarity.llama_v2 \n",
- "mean 0.666667 \n",
- "std 0.480384 "
+ "mean 0.518519 \n",
+ "std 0.509175 "
]
},
"metadata": {},
@@ -3013,14 +3021,14 @@
" 0.592593 | \n",
" 0.444444 | \n",
" 0.518519 | \n",
- " 0.481481 | \n",
+ " 0.222222 | \n",
" \n",
" \n",
" std | \n",
" 0.500712 | \n",
" 0.506370 | \n",
" 0.509175 | \n",
- " 0.509175 | \n",
+ " 0.423659 | \n",
"
\n",
" \n",
"\n",
@@ -3040,8 +3048,8 @@
"std 0.509175 \n",
"\n",
" feedback.programming_language_similarity.llama_v2 \n",
- "mean 0.481481 \n",
- "std 0.509175 "
+ "mean 0.222222 \n",
+ "std 0.423659 "
]
},
"metadata": {},
@@ -3089,17 +3097,17 @@
" \n",
" \n",
" mean | \n",
- " 0.592593 | \n",
+ " 0.555556 | \n",
" 0.481481 | \n",
" 0.370370 | \n",
- " 0.148148 | \n",
+ " 0.074074 | \n",
"
\n",
" \n",
" std | \n",
- " 0.500712 | \n",
+ " 0.506370 | \n",
" 0.509175 | \n",
" 0.492103 | \n",
- " 0.362014 | \n",
+ " 0.266880 | \n",
"
\n",
" \n",
"\n",
@@ -3107,16 +3115,16 @@
],
"text/plain": [
" feedback.question_category.gpt-4 feedback.question_category \\\n",
- "mean 0.592593 0.481481 \n",
- "std 0.500712 0.509175 \n",
+ "mean 0.555556 0.481481 \n",
+ "std 0.506370 0.509175 \n",
"\n",
" feedback.question_category.claude_xsd \\\n",
"mean 0.370370 \n",
"std 0.492103 \n",
"\n",
" feedback.question_category.llama_v2 \n",
- "mean 0.148148 \n",
- "std 0.362014 "
+ "mean 0.074074 \n",
+ "std 0.266880 "
]
},
"metadata": {},
@@ -3165,16 +3173,16 @@
" \n",
" mean | \n",
" 1.0 | \n",
- " 0.944444 | \n",
+ " 0.925926 | \n",
" 0.907407 | \n",
- " 0.574074 | \n",
+ " 0.444444 | \n",
"
\n",
" \n",
" std | \n",
" 0.0 | \n",
- " 0.160128 | \n",
+ " 0.181007 | \n",
" 0.197924 | \n",
- " 0.359051 | \n",
+ " 0.423659 | \n",
"
\n",
" \n",
"\n",
@@ -3182,16 +3190,16 @@
],
"text/plain": [
" feedback.sentiment_similarity.gpt-4 feedback.sentiment_similarity \\\n",
- "mean 1.0 0.944444 \n",
- "std 0.0 0.160128 \n",
+ "mean 1.0 0.925926 \n",
+ "std 0.0 0.181007 \n",
"\n",
" feedback.sentiment_similarity.claude_xsd \\\n",
"mean 0.907407 \n",
"std 0.197924 \n",
"\n",
" feedback.sentiment_similarity.llama_v2 \n",
- "mean 0.574074 \n",
- "std 0.359051 "
+ "mean 0.444444 \n",
+ "std 0.423659 "
]
},
"metadata": {},
@@ -3242,14 +3250,14 @@
" 0.0 | \n",
" 1.0 | \n",
" 1.0 | \n",
- " 0.592593 | \n",
+ " 0.444444 | \n",
" \n",
" \n",
" std | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
- " 0.500712 | \n",
+ " 0.506370 | \n",
"
\n",
" \n",
"\n",
@@ -3265,8 +3273,8 @@
"std 0.0 \n",
"\n",
" feedback.toxicity_similarity.llama_v2 \n",
- "mean 0.592593 \n",
- "std 0.500712 "
+ "mean 0.444444 \n",
+ "std 0.506370 "
]
},
"metadata": {},
@@ -3277,6 +3285,16 @@
"for metric in feedback_columns:\n",
" render_metric(df, metric)"
]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d1641d5b-362d-4aae-9f42-ccb4726b8229",
+ "metadata": {},
+ "source": [
+ "## Next Steps\n",
+ "\n",
+ "Try it out yourself! You can find additional experiments with open-source models in [this repo](https://github.com/hinthornw/llama-extraction)."
+ ]
}
],
"metadata": {
diff --git a/docs/source/toc.segment b/docs/source/toc.segment
index 529bc8f2..889494b5 100644
--- a/docs/source/toc.segment
+++ b/docs/source/toc.segment
@@ -24,6 +24,7 @@
./notebooks/extraction/intro
./notebooks/extraction/email
+./notebooks/extraction/chat_extraction
```
```{toctree}
diff --git a/langchain_benchmarks/extraction/tasks/chat_extraction/schema.py b/langchain_benchmarks/extraction/tasks/chat_extraction/schema.py
index 5d614035..ba64c3f7 100644
--- a/langchain_benchmarks/extraction/tasks/chat_extraction/schema.py
+++ b/langchain_benchmarks/extraction/tasks/chat_extraction/schema.py
@@ -58,7 +58,7 @@ class QuestionCategorization(BaseModel):
" to assist in building applications with LLMs. Questions may also be about similar packages like LangServe, LangSmith, OpenAI, Anthropic, vectorstores, agents, etc."
)
toxicity: int = Field(
- ge=0, lt=6, default=0, description="Whether or not the input question is toxic"
+ ge=0, lt=6, description="Whether or not the input question is toxic"
)
sentiment: Sentiment
programming_language: ProgrammingLanguage
diff --git a/pyproject.toml b/pyproject.toml
index 5616c976..c4f53830 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "langchain-benchmarks"
-version = "0.0.5"
+version = "0.0.6"
description = "🦜💪 Flex those feathers!"
authors = ["LangChain AI"]
license = "MIT"