diff --git a/docs/source/notebooks/tool_usage/multiverse_math.ipynb b/docs/source/notebooks/tool_usage/multiverse_math.ipynb index 2843dc75..8bc81073 100644 --- a/docs/source/notebooks/tool_usage/multiverse_math.ipynb +++ b/docs/source/notebooks/tool_usage/multiverse_math.ipynb @@ -1,481 +1,588 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "60bb467d-861d-4b07-a48d-8e5aa177c969", - "metadata": { - "tags": [] - }, - "source": [ - "# Multiverse Math\n", - "\n", - "\n", - "Let's see how to evaluate an agent's ability to use tools.\n", - "\n", - " Solve basic math question using the provided tools.\n", - "\n", - " Must use the provided tools to solve the math question.\n", - "\n", - " To make sure that innate knowledge is not used, the math operations have been altered to yield different results than expected.\n", - "\n", - " The modified operations should yield different results, but still retain appropriate properties. For example, the modified multiplication operation should still be commutative.\n", - "\n", - " Please note that the modified operations are not guaranteed to even make sense in the real world since not all properties will be retained (e.g., distributive property)." - ] - }, - { - "cell_type": "markdown", - "id": "03488ab1-31ed-41c2-8da2-46b02599b181", - "metadata": {}, - "source": [ - "For this code to work, please configure LangSmith environment variables with your credentials." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "1615b8ff-688a-4447-8c4c-d64ad02818ed", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "os.environ[\"LANGCHAIN_API_KEY\"] = \"sk-...\" # Your LangSmith API key" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "b39159d0-9ea1-414f-a9d8-4a7b22b3d2cc", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langchain_benchmarks import clone_public_dataset, registry" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "1aef2b32-a5df-421f-8be3-a2ef27372ece", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
Name | Multiverse Math |
Type | ToolUsageTask |
Dataset ID | https://smith.langchain.com/public/594f9f60-30a0-49bf-b075-f44beabf546a/d |
Description | An environment that contains a few basic math operations, but with altered results.\n", - "\n", - "For example, mu... |
\n", - " | Intermediate steps correctness | \n", - "# steps / # expected steps | \n", - "correctness | \n", - "execution_time | \n", - "input.question | \n", - "output.question | \n", - "output.output | \n", - "output.intermediate_steps | \n", - "reference.reference | \n", - "reference.expected_steps | \n", - "num_expected_steps | \n", - "actual_number_of_steps | \n", - "
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", - "0 | \n", - "15.0 | \n", - "0 | \n", - "38.76436 | \n", - "Add 2 and 3 | \n", - "Add 2 and 3 | \n", - "Agent stopped due to iteration limit or time l... | \n", - "[(tool='add' tool_input={'a': 2, 'b': 3} log=\"... | \n", - "6.20 | \n", - "[add] | \n", - "1 | \n", - "15 | \n", - "
1 | \n", - "0 | \n", - "15.0 | \n", - "0 | \n", - "38.76436 | \n", - "Subtract 3 from 2 | \n", - "Subtract 3 from 2 | \n", - "Agent stopped due to iteration limit or time l... | \n", - "[(tool='subtract' tool_input={'a': 2, 'b': 3} ... | \n", - "-4.00 | \n", - "[subtract] | \n", - "1 | \n", - "15 | \n", - "
2 | \n", - "0 | \n", - "9.0 | \n", - "1 | \n", - "38.76436 | \n", - "What is -5 if evaluated using the negate funct... | \n", - "What is -5 if evaluated using the negate funct... | \n", - "-5.0\\n-5.0 | \n", - "[(tool='negate' tool_input={'a': -5} log=\"\\nIn... | \n", - "-5.00 | \n", - "[negate] | \n", - "1 | \n", - "9 | \n", - "
3 | \n", - "1 | \n", - "1.0 | \n", - "0 | \n", - "38.76436 | \n", - "what is the result of 2 to the power of 3? | \n", - "what is the result of 2 to the power of 3? | \n", - "The result of 2 to the power of 3 is 32. | \n", - "[(tool='power' tool_input={'a': 2, 'b': 3} log... | \n", - "32.00 | \n", - "[power] | \n", - "1 | \n", - "1 | \n", - "
4 | \n", - "0 | \n", - "7.5 | \n", - "0 | \n", - "38.76436 | \n", - "I ate 1 apple and 2 oranges every day for 7 da... | \n", - "I ate 1 apple and 2 oranges every day for 7 da... | \n", - "Agent stopped due to iteration limit or time l... | \n", - "[(tool='add' tool_input={'a': 1, 'b': 2} log=\"... | \n", - "32.34 | \n", - "[multiply, add] | \n", - "2 | \n", - "15 | \n", - "
Name | Multiverse Math |
Type | ToolUsageTask |
Dataset ID | 594f9f60-30a0-49bf-b075-f44beabf546a |
Description | An environment that contains a few basic math operations, but with altered results.\n", + "\n", + "For example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. The basic operations retain some basic properties, such as commutativity, associativity, and distributivity; however, the results are different than expected.\n", + "\n", + "The objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math. |
\n", + " | Intermediate steps correctness | \n", + "# steps / # expected steps | \n", + "correctness | \n", + "execution_time | \n", + "input.question | \n", + "output.question | \n", + "output.output | \n", + "output.intermediate_steps | \n", + "reference.reference | \n", + "reference.expected_steps | \n", + "num_expected_steps | \n", + "actual_number_of_steps | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "1 | \n", + "1.0 | \n", + "1 | \n", + "5.771554 | \n", + "Add 2 and 3 | \n", + "Add 2 and 3 | \n", + "The sum of 2 and 3 in this alternate mathemati... | \n", + "[(tool='add' tool_input={'a': 2, 'b': 3} log=\"... | \n", + "6.20 | \n", + "[add] | \n", + "1 | \n", + "1 | \n", + "
1 | \n", + "1 | \n", + "1.0 | \n", + "0 | \n", + "5.771554 | \n", + "Subtract 3 from 2 | \n", + "Subtract 3 from 2 | \n", + "The result of subtracting 3 from 2 in this alt... | \n", + "[(tool='subtract' tool_input={'a': 2, 'b': 3} ... | \n", + "-4.00 | \n", + "[subtract] | \n", + "1 | \n", + "1 | \n", + "
2 | \n", + "1 | \n", + "1.0 | \n", + "1 | \n", + "5.771554 | \n", + "What is -5 if evaluated using the negate funct... | \n", + "What is -5 if evaluated using the negate funct... | \n", + "The result of evaluating -5 using the negate f... | \n", + "[(tool='negate' tool_input={'a': -5} log=\"\\nIn... | \n", + "-5.00 | \n", + "[negate] | \n", + "1 | \n", + "1 | \n", + "
3 | \n", + "1 | \n", + "1.0 | \n", + "0 | \n", + "5.771554 | \n", + "what is the result of 2 to the power of 3? | \n", + "what is the result of 2 to the power of 3? | \n", + "The result of 2 to the power of 3 is 32. | \n", + "[(tool='power' tool_input={'a': 2, 'b': 3} log... | \n", + "32.00 | \n", + "[power] | \n", + "1 | \n", + "1 | \n", + "
4 | \n", + "0 | \n", + "1.0 | \n", + "0 | \n", + "5.771554 | \n", + "I ate 1 apple and 2 oranges every day for 7 da... | \n", + "I ate 1 apple and 2 oranges every day for 7 da... | \n", + "You ate a total of 32.34 fruits. | \n", + "[(tool='add' tool_input={'a': 1, 'b': 2} log=\"... | \n", + "32.34 | \n", + "[multiply, add] | \n", + "2 | \n", + "2 | \n", + "
Name | Tool Usage - Relational Data |
Type | ToolUsageTask |
Dataset ID | https://smith.langchain.com/public/1d89f4b3-5f73-48cf-a127-2fdeb22f6d84/d |
Description | Environment with fake data about users and their locations and favorite foods.\n", - "\n", - "The environment provides a set of tools that can be used to query the data.\n", - "\n", - "The objective of this task is to evaluate the ability to use the provided tools to answer questions about relational data.\n", - "\n", - "The dataset contains 21 examples of varying difficulty. The difficulty is measured by the number of tools that need to be used to answer the question.\n", - "\n", - "Each example is composed of a question, a reference answer, and information about the sequence in which tools should be used to answer the question.\n", - "\n", - "Success is measured by the ability to answer the question correctly, and efficiently. |
\n", - " | Intermediate steps correctness | \n", - "# steps / # expected steps | \n", - "correctness | \n", - "execution_time | \n", - "input.question | \n", - "output.question | \n", - "output.output | \n", - "output.intermediate_steps | \n", - "reference.reference | \n", - "reference.order_matters | \n", - "reference.expected_steps | \n", - "num_expected_steps | \n", - "actual_number_of_steps | \n", - "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", - "0 | \n", - "1.0 | \n", - "1 | \n", - "5.098939 | \n", - "do bob and alice live in the same city? | \n", - "do bob and alice live in the same city? | \n", - "No, Bob and Alice do not live in the same city... | \n", - "[(tool='find_users_by_name' tool_input={'name'... | \n", - "no | \n", - "False | \n", - "[find_users_by_name, get_user_location, get_ci... | \n", - "5 | \n", - "5 | \n", - "
1 | \n", - "0 | \n", - "0.0 | \n", - "0 | \n", - "5.098939 | \n", - "Is it likely that Donna is outside with an umb... | \n", - "Is it likely that Donna is outside with an umb... | \n", - "I'm sorry, but I don't have access to real-tim... | \n", - "[] | \n", - "yes | \n", - "False | \n", - "[find_users_by_name, get_user_location, get_cu... | \n", - "4 | \n", - "0 | \n", - "
2 | \n", - "1 | \n", - "1.0 | \n", - "1 | \n", - "5.098939 | \n", - "do alice and charlie use the same email provider? | \n", - "do alice and charlie use the same email provider? | \n", - "No, Alice and Charlie do not use the same emai... | \n", - "[(tool='find_users_by_name' tool_input={'name'... | \n", - "no | \n", - "True | \n", - "[find_users_by_name, get_user_email, get_user_... | \n", - "3 | \n", - "3 | \n", - "
3 | \n", - "0 | \n", - "0.0 | \n", - "0 | \n", - "5.098939 | \n", - "Is it likely that Donna is awake right now? | \n", - "Is it likely that Donna is awake right now? | \n", - "I'm sorry, but I don't have access to informat... | \n", - "[] | \n", - "yes | \n", - "True | \n", - "[find_users_by_name, get_user_location, get_cu... | \n", - "3 | \n", - "0 | \n", - "
4 | \n", - "0 | \n", - "1.0 | \n", - "1 | \n", - "5.098939 | \n", - "Donna is about to go outside. Does she need an... | \n", - "Donna is about to go outside. Does she need an... | \n", - "Donna is currently in a location where it is r... | \n", - "[(tool='find_users_by_name' tool_input={'name'... | \n", - "yes | \n", - "True | \n", - "[find_users_by_name, get_user_location, get_cu... | \n", - "3 | \n", - "3 | \n", - "
\n", - " | Intermediate steps correctness | \n", - "# steps / # expected steps | \n", - "correctness | \n", - "execution_time | \n", - "input.question | \n", - "output.question | \n", - "output.output | \n", - "output.intermediate_steps | \n", - "reference.reference | \n", - "reference.order_matters | \n", - "reference.expected_steps | \n", - "num_expected_steps | \n", - "actual_number_of_steps | \n", - "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", - "0 | \n", - "1.0 | \n", - "1 | \n", - "5.098939 | \n", - "do bob and alice live in the same city? | \n", - "do bob and alice live in the same city? | \n", - "No, Bob and Alice do not live in the same city... | \n", - "[(tool='find_users_by_name' tool_input={'name'... | \n", - "no | \n", - "False | \n", - "[find_users_by_name, get_user_location, get_ci... | \n", - "5 | \n", - "5 | \n", - "
2 | \n", - "1 | \n", - "1.0 | \n", - "1 | \n", - "5.098939 | \n", - "do alice and charlie use the same email provider? | \n", - "do alice and charlie use the same email provider? | \n", - "No, Alice and Charlie do not use the same emai... | \n", - "[(tool='find_users_by_name' tool_input={'name'... | \n", - "no | \n", - "True | \n", - "[find_users_by_name, get_user_email, get_user_... | \n", - "3 | \n", - "3 | \n", - "
4 | \n", - "0 | \n", - "1.0 | \n", - "1 | \n", - "5.098939 | \n", - "Donna is about to go outside. Does she need an... | \n", - "Donna is about to go outside. Does she need an... | \n", - "Donna is currently in a location where it is r... | \n", - "[(tool='find_users_by_name' tool_input={'name'... | \n", - "yes | \n", - "True | \n", - "[find_users_by_name, get_user_location, get_cu... | \n", - "3 | \n", - "3 | \n", - "
5 | \n", - "0 | \n", - "1.0 | \n", - "0 | \n", - "5.098939 | \n", - "whats the name of the city where bob lives? | \n", - "whats the name of the city where bob lives? | \n", - "The name of the city where Bob lives is New York. | \n", - "[(tool='list_user_ids' tool_input={} log='\\nIn... | \n", - "Los Angeles | \n", - "True | \n", - "[find_users_by_name, get_user_location, get_ci... | \n", - "3 | \n", - "3 | \n", - "
6 | \n", - "1 | \n", - "1.0 | \n", - "1 | \n", - "5.098939 | \n", - "what is the current users favorite color and n... | \n", - "what is the current users favorite color and n... | \n", - "The current user's favorite color is yellow an... | \n", - "[(tool='get_current_user_id' tool_input={} log... | \n", - "yellow and Charlie | \n", - "True | \n", - "[get_current_user_id, get_user_favorite_color,... | \n", - "3 | \n", - "3 | \n", - "
7 | \n", - "0 | \n", - "1.5 | \n", - "1 | \n", - "5.098939 | \n", - "Frank who is Even's friend is allergic to dair... | \n", - "Frank who is Even's friend is allergic to dair... | \n", - "Frank's favorite food is the salad, which cont... | \n", - "[(tool='find_users_by_name' tool_input={'name'... | \n", - "yes | \n", - "True | \n", - "[find_users_by_name, get_food_allergic_ingredi... | \n", - "2 | \n", - "3 | \n", - "
11 | \n", - "1 | \n", - "1.0 | \n", - "1 | \n", - "5.098939 | \n", - "list the allergens in chocolate | \n", - "list the allergens in chocolate | \n", - "The allergens in chocolate are milk and soy. | \n", - "[(tool='find_foods_by_name' tool_input={'food'... | \n", - "milk, soy | \n", - "True | \n", - "[find_foods_by_name, get_food_allergic_ingredi... | \n", - "2 | \n", - "2 | \n", - "
15 | \n", - "1 | \n", - "1.0 | \n", - "1 | \n", - "5.098939 | \n", - "what is alice's email address? | \n", - "what is alice's email address? | \n", - "Alice's email address is alice@gmail.com. | \n", - "[(tool='find_users_by_name' tool_input={'name'... | \n", - "alice@gmail.com | \n", - "True | \n", - "[find_users_by_name, get_user_email] | \n", - "2 | \n", - "2 | \n", - "
14 | \n", - "1 | \n", - "1.0 | \n", - "1 | \n", - "5.098939 | \n", - "find donna's favorite color | \n", - "find donna's favorite color | \n", - "Donna's favorite color is green. | \n", - "[(tool='find_users_by_name' tool_input={'name'... | \n", - "green | \n", - "True | \n", - "[find_users_by_name, get_user_favorite_color] | \n", - "2 | \n", - "2 | \n", - "
13 | \n", - "1 | \n", - "1.0 | \n", - "1 | \n", - "5.098939 | \n", - "weather in LA right now? | \n", - "weather in LA right now? | \n", - "The current weather in Los Angeles is sunny wi... | \n", - "[(tool='find_locations_by_name' tool_input={'c... | \n", - "Sunny, Temperature: 75°F | \n", - "True | \n", - "[find_locations_by_name, get_current_weather_f... | \n", - "2 | \n", - "2 | \n", - "
12 | \n", - "1 | \n", - "1.0 | \n", - "0 | \n", - "5.098939 | \n", - "time in chicago | \n", - "time in chicago | \n", - "The current time in Chicago is 11:15 AM. | \n", - "[(tool='find_locations_by_name' tool_input={'c... | \n", - "2023-11-14 11:15 AM | \n", - "True | \n", - "[find_locations_by_name, get_current_time_for_... | \n", - "2 | \n", - "2 | \n", - "
10 | \n", - "1 | \n", - "1.0 | \n", - "1 | \n", - "5.098939 | \n", - "If i eat a serving of pizza, how many calories... | \n", - "If i eat a serving of pizza, how many calories... | \n", - "If you eat a serving of pizza, you will consum... | \n", - "[(tool='find_foods_by_name' tool_input={'food'... | \n", - "285 calories | \n", - "True | \n", - "[find_foods_by_name, get_food_calories] | \n", - "2 | \n", - "2 | \n", - "
9 | \n", - "1 | \n", - "1.0 | \n", - "1 | \n", - "5.098939 | \n", - "what is the current users favorite color? | \n", - "what is the current users favorite color? | \n", - "The current user's favorite color is yellow. | \n", - "[(tool='get_current_user_id' tool_input={} log... | \n", - "yellow | \n", - "True | \n", - "[get_current_user_id, get_user_favorite_color] | \n", - "2 | \n", - "2 | \n", - "
8 | \n", - "1 | \n", - "1.0 | \n", - "1 | \n", - "5.098939 | \n", - "eve ate a serving of sushi, what allergens was... | \n", - "eve ate a serving of sushi, what allergens was... | \n", - "Eve was exposed to the allergens fish and soy ... | \n", - "[(tool='find_foods_by_name' tool_input={'food'... | \n", - "fish, soy | \n", - "True | \n", - "[find_foods_by_name, get_food_allergic_ingredi... | \n", - "2 | \n", - "2 | \n", - "
16 | \n", - "1 | \n", - "1.0 | \n", - "1 | \n", - "5.098939 | \n", - "How many users by the name of bob? | \n", - "How many users by the name of bob? | \n", - "There are 1 user(s) with the name \"Bob\". | \n", - "[(tool='find_users_by_name' tool_input={'name'... | \n", - "1 | \n", - "True | \n", - "[find_users_by_name] | \n", - "1 | \n", - "1 | \n", - "
17 | \n", - "1 | \n", - "1.0 | \n", - "1 | \n", - "5.098939 | \n", - "get the current user id | \n", - "get the current user id | \n", - "The current user ID is 35. | \n", - "[(tool='get_current_user_id' tool_input={} log... | \n", - "35 | \n", - "True | \n", - "[get_current_user_id] | \n", - "1 | \n", - "1 | \n", - "
18 | \n", - "1 | \n", - "1.0 | \n", - "1 | \n", - "5.098939 | \n", - "what is eve's user id? | \n", - "what is eve's user id? | \n", - "Eve's user ID is 42. | \n", - "[(tool='find_users_by_name' tool_input={'name'... | \n", - "42 | \n", - "True | \n", - "[find_users_by_name] | \n", - "1 | \n", - "1 | \n", - "
19 | \n", - "1 | \n", - "1.0 | \n", - "1 | \n", - "5.098939 | \n", - "What is the name of food with id 6? | \n", - "What is the name of food with id 6? | \n", - "The name of the food with ID 6 is Pasta. | \n", - "[(tool='get_food_name' tool_input={'food_id': ... | \n", - "Pasta | \n", - "True | \n", - "[get_food_name] | \n", - "1 | \n", - "1 | \n", - "
20 | \n", - "1 | \n", - "1.0 | \n", - "1 | \n", - "5.098939 | \n", - "What is the city for location ID 1? | \n", - "What is the city for location ID 1? | \n", - "The city for location ID 1 is New York. | \n", - "[(tool='get_city_for_location' tool_input={'lo... | \n", - "New York | \n", - "True | \n", - "[get_city_for_location] | \n", - "1 | \n", - "1 | \n", - "
1 | \n", - "0 | \n", - "0.0 | \n", - "0 | \n", - "5.098939 | \n", - "Is it likely that Donna is outside with an umb... | \n", - "Is it likely that Donna is outside with an umb... | \n", - "I'm sorry, but I don't have access to real-tim... | \n", - "[] | \n", - "yes | \n", - "False | \n", - "[find_users_by_name, get_user_location, get_cu... | \n", - "4 | \n", - "0 | \n", - "
3 | \n", - "0 | \n", - "0.0 | \n", - "0 | \n", - "5.098939 | \n", - "Is it likely that Donna is awake right now? | \n", - "Is it likely that Donna is awake right now? | \n", - "I'm sorry, but I don't have access to informat... | \n", - "[] | \n", - "yes | \n", - "True | \n", - "[find_users_by_name, get_user_location, get_cu... | \n", - "3 | \n", - "0 | \n", - "
\n", + " | Intermediate steps correctness | \n", + "# steps / # expected steps | \n", + "correctness | \n", + "execution_time | \n", + "input.question | \n", + "output.question | \n", + "output.output | \n", + "output.intermediate_steps | \n", + "reference.reference | \n", + "reference.order_matters | \n", + "reference.expected_steps | \n", + "num_expected_steps | \n", + "actual_number_of_steps | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "1 | \n", + "1.0 | \n", + "1 | \n", + "4.253613 | \n", + "do bob and alice live in the same city? | \n", + "do bob and alice live in the same city? | \n", + "No, Bob and Alice do not live in the same city... | \n", + "[(tool='find_users_by_name' tool_input={'name'... | \n", + "no | \n", + "False | \n", + "[find_users_by_name, get_user_location, get_ci... | \n", + "5 | \n", + "5 | \n", + "
1 | \n", + "0 | \n", + "0.0 | \n", + "0 | \n", + "4.253613 | \n", + "Is it likely that Donna is outside with an umb... | \n", + "Is it likely that Donna is outside with an umb... | \n", + "I'm sorry, but I don't have access to real-tim... | \n", + "[] | \n", + "yes | \n", + "False | \n", + "[find_users_by_name, get_user_location, get_cu... | \n", + "4 | \n", + "0 | \n", + "
2 | \n", + "1 | \n", + "1.0 | \n", + "1 | \n", + "4.253613 | \n", + "do alice and charlie use the same email provider? | \n", + "do alice and charlie use the same email provider? | \n", + "No, Alice and Charlie do not use the same emai... | \n", + "[(tool='find_users_by_name' tool_input={'name'... | \n", + "no | \n", + "True | \n", + "[find_users_by_name, get_user_email, get_user_... | \n", + "3 | \n", + "3 | \n", + "
3 | \n", + "0 | \n", + "0.0 | \n", + "0 | \n", + "4.253613 | \n", + "Is it likely that Donna is awake right now? | \n", + "Is it likely that Donna is awake right now? | \n", + "I'm sorry, but I don't have access to informat... | \n", + "[] | \n", + "yes | \n", + "True | \n", + "[find_users_by_name, get_user_location, get_cu... | \n", + "3 | \n", + "0 | \n", + "
4 | \n", + "0 | \n", + "1.0 | \n", + "1 | \n", + "4.253613 | \n", + "Donna is about to go outside. Does she need an... | \n", + "Donna is about to go outside. Does she need an... | \n", + "Donna is at location 4 and the current weather... | \n", + "[(tool='find_users_by_name' tool_input={'name'... | \n", + "yes | \n", + "True | \n", + "[find_users_by_name, get_user_location, get_cu... | \n", + "3 | \n", + "3 | \n", + "
\n", + " | Intermediate steps correctness | \n", + "# steps / # expected steps | \n", + "correctness | \n", + "execution_time | \n", + "input.question | \n", + "output.question | \n", + "output.output | \n", + "output.intermediate_steps | \n", + "reference.reference | \n", + "reference.order_matters | \n", + "reference.expected_steps | \n", + "num_expected_steps | \n", + "actual_number_of_steps | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "1 | \n", + "1.0 | \n", + "1 | \n", + "4.253613 | \n", + "do bob and alice live in the same city? | \n", + "do bob and alice live in the same city? | \n", + "No, Bob and Alice do not live in the same city... | \n", + "[(tool='find_users_by_name' tool_input={'name'... | \n", + "no | \n", + "False | \n", + "[find_users_by_name, get_user_location, get_ci... | \n", + "5 | \n", + "5 | \n", + "
2 | \n", + "1 | \n", + "1.0 | \n", + "1 | \n", + "4.253613 | \n", + "do alice and charlie use the same email provider? | \n", + "do alice and charlie use the same email provider? | \n", + "No, Alice and Charlie do not use the same emai... | \n", + "[(tool='find_users_by_name' tool_input={'name'... | \n", + "no | \n", + "True | \n", + "[find_users_by_name, get_user_email, get_user_... | \n", + "3 | \n", + "3 | \n", + "
4 | \n", + "0 | \n", + "1.0 | \n", + "1 | \n", + "4.253613 | \n", + "Donna is about to go outside. Does she need an... | \n", + "Donna is about to go outside. Does she need an... | \n", + "Donna is at location 4 and the current weather... | \n", + "[(tool='find_users_by_name' tool_input={'name'... | \n", + "yes | \n", + "True | \n", + "[find_users_by_name, get_user_location, get_cu... | \n", + "3 | \n", + "3 | \n", + "
5 | \n", + "0 | \n", + "1.0 | \n", + "0 | \n", + "4.253613 | \n", + "whats the name of the city where bob lives? | \n", + "whats the name of the city where bob lives? | \n", + "The name of the city where Bob lives is New York. | \n", + "[(tool='list_user_ids' tool_input={} log='\\nIn... | \n", + "Los Angeles | \n", + "True | \n", + "[find_users_by_name, get_user_location, get_ci... | \n", + "3 | \n", + "3 | \n", + "
6 | \n", + "1 | \n", + "1.0 | \n", + "1 | \n", + "4.253613 | \n", + "what is the current users favorite color and n... | \n", + "what is the current users favorite color and n... | \n", + "The current user's favorite color is yellow an... | \n", + "[(tool='get_current_user_id' tool_input={} log... | \n", + "yellow and Charlie | \n", + "True | \n", + "[get_current_user_id, get_user_favorite_color,... | \n", + "3 | \n", + "3 | \n", + "
Name | Tool Usage - Typewriter (1 tool) |
Type | ToolUsageTask |
Dataset ID | 59577193-8938-4ccf-92a7-e8a96bcf4f86 |
Description | Environment with a single tool that accepts a single letter as input, and prints it on a piece of virtual paper.\n", - "\n", - "The objective of this task is to evaluate the ability of the model to use the provided tools to repeat a given input string.\n", - "\n", - "For example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\n", - "\n", - "The dataset includes examples of varying difficulty. The difficulty is measured by the length of the string. |
\n", - " | Intermediate steps correctness | \n", - "# steps / # expected steps | \n", - "Correct Final State | \n", - "correctness | \n", - "input.question | \n", - "output.question | \n", - "output.output | \n", - "output.intermediate_steps | \n", - "output.state | \n", - "reference.state | \n", - "reference.reference | \n", - "reference.expected_steps | \n", - "num_expected_steps | \n", - "actual_number_of_steps | \n", - "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", - "1 | \n", - "1.0 | \n", - "1 | \n", - "1 | \n", - "communication | \n", - "communication | \n", - "communication | \n", - "[(tool='type_letter' tool_input={'letter': 'c'... | \n", - "communication | \n", - "communication | \n", - "communication | \n", - "[type_letter, type_letter, type_letter, type_l... | \n", - "13 | \n", - "13 | \n", - "
1 | \n", - "1 | \n", - "1.0 | \n", - "1 | \n", - "1 | \n", - "information | \n", - "information | \n", - "information | \n", - "[(tool='type_letter' tool_input={'letter': 'i'... | \n", - "information | \n", - "information | \n", - "information | \n", - "[type_letter, type_letter, type_letter, type_l... | \n", - "11 | \n", - "11 | \n", - "
2 | \n", - "1 | \n", - "1.0 | \n", - "1 | \n", - "1 | \n", - "dictionary | \n", - "dictionary | \n", - "dictionary | \n", - "[(tool='type_letter' tool_input={'letter': 'd'... | \n", - "dictionary | \n", - "dictionary | \n", - "dictionary | \n", - "[type_letter, type_letter, type_letter, type_l... | \n", - "10 | \n", - "10 | \n", - "
3 | \n", - "1 | \n", - "1.0 | \n", - "1 | \n", - "1 | \n", - "university | \n", - "university | \n", - "u\\nn\\ni\\nv\\ne\\nr\\ns\\ni\\nt\\ny | \n", - "[(tool='type_letter' tool_input={'letter': 'u'... | \n", - "university | \n", - "university | \n", - "university | \n", - "[type_letter, type_letter, type_letter, type_l... | \n", - "10 | \n", - "10 | \n", - "
4 | \n", - "1 | \n", - "1.0 | \n", - "1 | \n", - "1 | \n", - "keyboard | \n", - "keyboard | \n", - "keyboard | \n", - "[(tool='type_letter' tool_input={'letter': 'k'... | \n", - "keyboard | \n", - "keyboard | \n", - "keyboard | \n", - "[type_letter, type_letter, type_letter, type_l... | \n", - "8 | \n", - "8 | \n", - "
Name | Tool Usage - Typewriter (1 tool) |
Type | ToolUsageTask |
Dataset ID | 59577193-8938-4ccf-92a7-e8a96bcf4f86 |
Description | Environment with a single tool that accepts a single letter as input, and prints it on a piece of virtual paper.\n", + "\n", + "The objective of this task is to evaluate the ability of the model to use the provided tools to repeat a given input string.\n", + "\n", + "For example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\n", + "\n", + "The dataset includes examples of varying difficulty. The difficulty is measured by the length of the string. |
\n", + " | Intermediate steps correctness | \n", + "# steps / # expected steps | \n", + "Correct Final State | \n", + "correctness | \n", + "execution_time | \n", + "input.question | \n", + "output.question | \n", + "output.output | \n", + "output.intermediate_steps | \n", + "output.state | \n", + "reference.state | \n", + "reference.reference | \n", + "reference.expected_steps | \n", + "num_expected_steps | \n", + "actual_number_of_steps | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "0 | \n", + "15.0 | \n", + "0 | \n", + "0 | \n", + "34.058961 | \n", + "a | \n", + "a | \n", + "Agent stopped due to iteration limit or time l... | \n", + "[(tool='type_letter' tool_input={'letter': 'a'... | \n", + "aaaaaaaaaaaaaaa | \n", + "a | \n", + "a | \n", + "[type_letter] | \n", + "1 | \n", + "15 | \n", + "
1 | \n", + "1 | \n", + "1.0 | \n", + "1 | \n", + "1 | \n", + "34.058961 | \n", + "aa | \n", + "aa | \n", + "aa\\naa | \n", + "[(tool='type_letter' tool_input={'letter': 'a'... | \n", + "aa | \n", + "aa | \n", + "aa | \n", + "[type_letter, type_letter] | \n", + "2 | \n", + "2 | \n", + "
2 | \n", + "1 | \n", + "1.0 | \n", + "1 | \n", + "0 | \n", + "34.058961 | \n", + "aaa | \n", + "aaa | \n", + "a\\na | \n", + "[(tool='type_letter' tool_input={'letter': 'a'... | \n", + "aaa | \n", + "aaa | \n", + "aaa | \n", + "[type_letter, type_letter, type_letter] | \n", + "3 | \n", + "3 | \n", + "
3 | \n", + "1 | \n", + "1.0 | \n", + "1 | \n", + "0 | \n", + "34.058961 | \n", + "aaaa | \n", + "aaaa | \n", + "a\\na | \n", + "[(tool='type_letter' tool_input={'letter': 'a'... | \n", + "aaaa | \n", + "aaaa | \n", + "aaaa | \n", + "[type_letter, type_letter, type_letter, type_l... | \n", + "4 | \n", + "4 | \n", + "
4 | \n", + "1 | \n", + "1.0 | \n", + "1 | \n", + "1 | \n", + "34.058961 | \n", + "dog | \n", + "dog | \n", + "d\\no\\ng | \n", + "[(tool='type_letter' tool_input={'letter': 'd'... | \n", + "dog | \n", + "dog | \n", + "dog | \n", + "[type_letter, type_letter, type_letter] | \n", + "3 | \n", + "3 | \n", + "
Name | Tool Usage - Typewriter (26 tools) |
Type | ToolUsageTask |
Dataset ID | 128af05e-aa00-4e3b-a958-d166dd450581 |
Description | Environment with 26 tools each tool represents a letter of the alphabet.\n", - "\n", - "The objective of this task is to evaluate the model's ability the use tools\n", - "for a simple repetition task.\n", - "\n", - "For example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\n", - "\n", - "The dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\n", - "\n", - "This is a variation of the typer writer task, where 26 parameterless tools are\n", - "given instead of a single tool that takes a letter as an argument. |
Name | Tool Usage - Typewriter (26 tools) |
Type | ToolUsageTask |
Dataset ID | 128af05e-aa00-4e3b-a958-d166dd450581 |
Description | Environment with 26 tools each tool represents a letter of the alphabet.\n", + "\n", + "The objective of this task is to evaluate the model's ability the use tools\n", + "for a simple repetition task.\n", + "\n", + "For example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\n", + "\n", + "The dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\n", + "\n", + "This is a variation of the typer writer task, where 26 parameterless tools are\n", + "given instead of a single tool that takes a letter as an argument. |
\n", + " | input.question | \n", + "output.state | \n", + "num_expected_steps | \n", + "actual_number_of_steps | \n", + "Correct Final State | \n", + "output.Error | \n", + "
---|---|---|---|---|---|---|
0 | \n", + "a | \n", + "a | \n", + "1 | \n", + "1.0 | \n", + "1.0 | \n", + "\n", + " |
1 | \n", + "aa | \n", + "aa | \n", + "2 | \n", + "2.0 | \n", + "1.0 | \n", + "\n", + " |
2 | \n", + "aaa | \n", + "aaaaaaaaaaaaaaa | \n", + "3 | \n", + "15.0 | \n", + "0.0 | \n", + "\n", + " |
3 | \n", + "aaaa | \n", + "aaaaaaaaaaaaaaa | \n", + "4 | \n", + "15.0 | \n", + "0.0 | \n", + "\n", + " |
4 | \n", + "dog | \n", + "dog | \n", + "3 | \n", + "4.0 | \n", + "1.0 | \n", + "\n", + " |
5 | \n", + "cat | \n", + "cat | \n", + "3 | \n", + "4.0 | \n", + "1.0 | \n", + "\n", + " |
6 | \n", + "hand | \n", + "hand | \n", + "4 | \n", + "4.0 | \n", + "1.0 | \n", + "\n", + " |
7 | \n", + "head | \n", + "hhhhhhhhhhhhhhh | \n", + "4 | \n", + "15.0 | \n", + "0.0 | \n", + "\n", + " |
8 | \n", + "house | \n", + "house | \n", + "5 | \n", + "5.0 | \n", + "1.0 | \n", + "\n", + " |
9 | \n", + "horse | \n", + "NaN | \n", + "5 | \n", + "\n", + " | NaN | \n", + "InternalServerError(\"Error code: 500 - {'error... | \n", + "
10 | \n", + "school | \n", + "NaN | \n", + "6 | \n", + "\n", + " | NaN | \n", + "BadRequestError('Error code: 400 - {\\'error\\':... | \n", + "
11 | \n", + "church | \n", + "churchchchchch | \n", + "6 | \n", + "15.0 | \n", + "0.0 | \n", + "\n", + " |
12 | \n", + "teacher | \n", + "teacher | \n", + "7 | \n", + "7.0 | \n", + "1.0 | \n", + "\n", + " |
13 | \n", + "student | \n", + "studentstudent | \n", + "7 | \n", + "15.0 | \n", + "0.0 | \n", + "\n", + " |
14 | \n", + "computer | \n", + "computer | \n", + "8 | \n", + "9.0 | \n", + "1.0 | \n", + "\n", + " |
15 | \n", + "keyboard | \n", + "NaN | \n", + "8 | \n", + "\n", + " | NaN | \n", + "InternalServerError(\"Error code: 500 - {'error... | \n", + "
16 | \n", + "university | \n", + "\n", + " | 10 | \n", + "5.0 | \n", + "0.0 | \n", + "\n", + " |
17 | \n", + "dictionary | \n", + "dictionarystr | \n", + "10 | \n", + "15.0 | \n", + "0.0 | \n", + "\n", + " |
18 | \n", + "information | \n", + "\n", + " | 11 | \n", + "3.0 | \n", + "0.0 | \n", + "\n", + " |
19 | \n", + "communication | \n", + "communication | \n", + "13 | \n", + "14.0 | \n", + "1.0 | \n", + "\n", + " |