diff --git a/langchain_benchmarks/agents/README.md b/langchain_benchmarks/agents/README.md
deleted file mode 100644
index 3d88253c..00000000
--- a/langchain_benchmarks/agents/README.md
+++ /dev/null
@@ -1,19 +0,0 @@
-# Testing Agents
-
-This directory contains environments that can be used to test agent's ability
-to use tools and make decisions.
-
-## Environments
-
-Environments are named in the style of e[env_number]_[name].py.
-
-### e01_alpha
-
-* Consists of 3 relational tables of users, locations and foods.
-* Defines a set of tools that can be used these tables.
-* Agent should use the given tools to answer questions.
-
-## Running Evaluation
-
-Please refer to the following example to see how to set up and run evaluation
-for agents using [LangSmith](https://github.com/langchain-ai/langsmith-cookbook/blob/main/testing-examples/agent_steps/evaluating_agents.ipynb).
diff --git a/langchain_benchmarks/agents/__init__.py b/langchain_benchmarks/agents/__init__.py
deleted file mode 100644
index 6706ac8d..00000000
--- a/langchain_benchmarks/agents/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-"""Package for helping to evaluate agent runs."""
diff --git a/langchain_benchmarks/agents/environments/__init__.py b/langchain_benchmarks/agents/environments/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/langchain_benchmarks/agents/environments/e01_alpha.py b/langchain_benchmarks/agents/environments/e01_alpha.py
deleted file mode 100644
index 55d88100..00000000
--- a/langchain_benchmarks/agents/environments/e01_alpha.py
+++ /dev/null
@@ -1,401 +0,0 @@
-"""A simple environment for evaluating an agent.
-
-A simple environment to evaluate an agent's ability to use a set of given tools
-to reference questions.
-
-The environment contains fake data about users and their locations and favorite foods.
-
-The environment defines a set of tools that the agent can use to access the data.
-
-Agent performance should be evaluated solely based on the agent's ability to use
-the tools to reference questions.
-"""
-from typing import Callable, List, TypedDict
-
-from langchain.tools import BaseTool, tool
-
-USER_DATA = [
-    # IDs are not consecutive to prevent agents from guessing the ID
-    {
-        "id": 1,
-        "name": "Alice",
-        "email": "alice@gmail.com",
-        "location": 1,
-        "favorite_color": "red",
-        "favorite_foods": [1, 2, 3],
-    },
-    {
-        "id": 21,
-        "name": "Bob",
-        "email": "bob@hotmail.com",
-        "location": 2,
-        "favorite_color": "orange",
-        "favorite_foods": [4, 5, 6],
-    },
-    {
-        "id": 35,
-        "name": "Charlie",
-        "email": "charlie@yahoo.com",
-        "location": 3,
-        "favorite_color": "yellow",
-        "favorite_foods": [3, 7, 2],
-    },
-    {
-        "id": 41,
-        "name": "Donna",
-        "email": "donna@example.com",
-        "location": 4,
-        "favorite_color": "green",
-        "favorite_foods": [6, 1, 4],
-    },
-    {
-        "id": 42,
-        "name": "Eve",
-        "email": "eve@example.org",
-        "location": 5,
-        "favorite_color": "blue",
-        "favorite_foods": [5, 7, 4],
-    },
-    {
-        "id": 43,
-        "name": "Frank The Cat",
-        "email": "frank.the.cat@langchain.dev",
-        "location": 5,
-        "favorite_color": "yellow",
-        "favorite_foods": [3],
-    },
-]
-
-# Create a list of JSON data for locations with "current_weather" as a single string
-LOCATION_DATA = [
-    {
-        "id": 1,
-        "city": "New York",
-        "current_time": "2023-11-14 10:30 AM",
-        "current_weather": "Partly Cloudy, Temperature: 68°F",  # Example weather string
-    },
-    {
-        "id": 2,
-        "city": "Los Angeles",
-        "current_time": "2023-11-14 7:45 AM",
-        "current_weather": "Sunny, Temperature: 75°F",  # Example weather string
-    },
-    {
-        "id": 3,
-        "city": "Chicago",
-        "current_time": "2023-11-14 11:15 AM",
-        "current_weather": "Mostly Cloudy, Temperature: 60°F",  # Example weather string
-    },
-    {
-        "id": 4,
-        "city": "Houston",
-        "current_time": "2023-11-14 12:00 PM",
-        "current_weather": "Rainy, Temperature: 55°F",  # Example weather string
-    },
-    {
-        "id": 5,
-        "city": "Miami",
-        "current_time": "2023-11-14 1:20 PM",
-        "current_weather": "Partly Cloudy, Temperature: 80°F",  # Example weather string
-    },
-]
-
-FOOD_DATA = [
-    {
-        "id": 1,
-        "name": "Pizza",
-        "calories": 285,  # Calories per serving
-        "allergic_ingredients": ["Gluten", "Dairy"],
-    },
-    {
-        "id": 2,
-        "name": "Chocolate",
-        "calories": 50,  # Calories per serving
-        "allergic_ingredients": ["Milk", "Soy"],
-    },
-    {
-        "id": 3,
-        "name": "Sushi",
-        "calories": 300,  # Calories per serving
-        "allergic_ingredients": ["Fish", "Soy"],
-    },
-    {
-        "id": 4,
-        "name": "Burger",
-        "calories": 350,  # Calories per serving
-        "allergic_ingredients": ["Gluten", "Dairy"],
-    },
-    {
-        "id": 5,
-        "name": "Ice Cream",
-        "calories": 200,  # Calories per serving
-        "allergic_ingredients": ["Dairy"],
-    },
-    {
-        "id": 6,
-        "name": "Pasta",
-        "calories": 180,  # Calories per serving
-        "allergic_ingredients": ["Gluten"],
-    },
-    {
-        "id": 7,
-        "name": "Salad",
-        "calories": 50,  # Calories per serving
-        "allergic_ingredients": [],
-    },
-]
-
-
-class SearchHit(TypedDict, total=False):
-    """A search hit."""
-
-    id: str
-
-
-def _similarity_search(data: List[dict], query: str, key: str) -> List[SearchHit]:
-    """Return a list of data that matches the given query.
-
-    Similarity score is jaccard similarity based on the number of shared
-    characters between the query and the data.
-
-    Args:
-        data: The data to search.
-        query: The query to search for.
-        key: The key to search in.
-
-    Returns:
-        The list of matching data.
-    """
-
-    def _score_function(x: str) -> float:
-        """Calculate the similarity score between the query and the given string."""
-        return len(set(x) & set(query)) / len(set(x) | set(query))
-
-    re_ranked_data = sorted(data, key=lambda x: _score_function(x[key]), reverse=True)
-    return [{"id": d["id"], key: d[key]} for d in re_ranked_data]
-
-
-def _get_user(id: int) -> dict:
-    """Find the user with the given user ID.
-
-    Args:
-        id: The user's ID.
-
-    Returns:
-        The user's data.
-    """
-    for user in USER_DATA:
-        if user["id"] == id:
-            return user
-    raise ValueError(f"User ID {id} cannot be resolved")
-
-
-def _get_location(id: int) -> dict:
-    """Find the location with the given location ID.
-
-    Args:
-        id: The location's ID.
-
-    Returns:
-        The location's data.
-    """
-    for location in LOCATION_DATA:
-        if location["id"] == id:
-            return location
-    raise ValueError(f"Location ID {id} cannot be resolved")
-
-
-def _get_food(food_id: int) -> dict:
-    """Find the food with the given food ID.
-
-    Args:
-        food_id: The food's ID.
-
-    Returns:
-        The food's data.
-    """
-    for food in FOOD_DATA:
-        if food["id"] == food_id:
-            return food
-    raise ValueError(f"Food ID {food_id} cannot be resolved")
-
-
-def get_available_functions() -> List[Callable]:
-    """Get all the available functions."""
-
-    def get_user_name(user_id: int) -> str:
-        """Get the name of the user with the given user ID.
-
-        Args:
-            user_id: The user's ID.
-
-        Returns:
-            The user's name.
-        """
-        return _get_user(user_id)["name"]
-
-    def list_user_ids() -> List[str]:
-        """List all the user IDs."""
-        return [user["id"] for user in USER_DATA]
-
-    def find_users_by_name(name: str) -> List[SearchHit]:
-        """Find users with the given name.
-
-        Args:
-            name: The name to search for.
-
-        Returns:
-            The list of matching users.
-        """
-        return _similarity_search(USER_DATA, name, "name")
-
-    def find_locations_by_name(city: str) -> List[SearchHit]:
-        """Find locations with the given city name."""
-        return _similarity_search(LOCATION_DATA, city, "city")
-
-    def find_foods_by_name(food: str) -> List[SearchHit]:
-        """Find foods with the given name."""
-        return _similarity_search(FOOD_DATA, food, "name")
-
-    def get_user_email(user_id: int) -> str:
-        """Get the email of the user with the given user ID.
-
-        Args:
-            user_id: The user's ID.
-
-        Returns:
-            The user's email.
-        """
-        return _get_user(user_id)["email"]
-
-    def get_user_location(user_id: int) -> int:
-        """Get the location ID of the user with the given user ID.
-
-        Args:
-            user_id: The user's ID.
-
-        Returns:
-            The user's location ID.
-        """
-        return _get_user(user_id)["location"]
-
-    def get_user_favorite_color(user_id: int) -> str:
-        """Get the favorite color of the user with the given user ID.
-
-        Args:
-            user_id: The user's ID.
-
-        Returns:
-            The user's favorite color.
-        """
-        return _get_user(user_id)["favorite_color"]
-
-    def get_user_favorite_foods(user_id: int) -> List[int]:
-        """Get the list of favorite foods of the user with the given user ID.
-
-        Args:
-            user_id: The user's ID.
-
-        Returns:
-            The list of favorite foods.
-        """
-        return _get_user(user_id)["favorite_foods"]
-
-    def get_weather_at_location(location_id: int) -> str:
-        """Get the current weather at the location with the given location ID.
-
-        Args:
-            location_id: The location's ID.
-
-        Returns:
-            The current weather at the location.
-        """
-        return _get_location(location_id)["current_weather"]
-
-    def get_city_for_location(location_id: int) -> str:
-        """Get the city for the location with the given location ID.
-
-        Args:
-            location_id: The location's ID.
-
-        Returns:
-            The city name for the location.
-        """
-        return _get_location(location_id)["city"]
-
-    def get_current_time_for_location(location_id: int) -> str:
-        """Get the current time for the location with the given location ID.
-
-        Args:
-            location_id: The location's ID.
-
-        Returns:
-            The current time for the location.
-        """
-        return _get_location(location_id)["current_time"]
-
-    def get_current_weather_for_location(location_id: int) -> str:
-        """Get the current weather for the location with the given location ID.
-
-        Args:
-            location_id: The location's ID.
-
-        Returns:
-            The current weather for the location.
-        """
-        return _get_location(location_id)["current_weather"]
-
-    def get_food_name(food_id: int) -> str:
-        """Get the name of the food with the given food ID.
-
-        Args:
-            food_id: The food's ID.
-
-        Returns:
-            The name of the food.
-        """
-        return _get_food(food_id)["name"]
-
-    def get_food_calories(food_id: int) -> int:
-        """Get the calories per serving for the food with the given food ID.
-
-        Args:
-            food_id: The food's ID.
-
-        Returns:
-            The calories per serving of the food.
-        """
-        return _get_food(food_id)["calories"]
-
-    def get_food_allergic_ingredients(food_id: int) -> List[str]:
-        """Get the list of allergic ingredients for the food with the given food ID.
-
-        Args:
-            food_id: The food's ID.
-
-        Returns:
-            The list of allergic ingredients.
-        """
-        return _get_food(food_id)["allergic_ingredients"]
-
-    def get_current_user_id() -> int:
-        """Get the current user's ID.
-
-        Returns:
-            The current user's ID.
-        """
-        return 35
-
-    # Get all the functions defined in the scope of this function
-    functions = [f for f in locals().values() if callable(f)]
-    return functions
-
-
-def get_tools() -> List[BaseTool]:
-    """Get all the available tools."""
-    functions = get_available_functions()
-    return [tool(f) for f in functions]
-
-
-# ID of a dataset that contains the questions and references
-DATASET_ID = "9f73165c-d333-4d14-8f59-bd7eede5db08"  # ID of Agent Gym: E01 Alpha
diff --git a/langchain_benchmarks/agents/evaluators.py b/langchain_benchmarks/agents/evaluators.py
deleted file mode 100644
index c4b9a8be..00000000
--- a/langchain_benchmarks/agents/evaluators.py
+++ /dev/null
@@ -1,73 +0,0 @@
-"""Module contains standard evaluators for agents.
-
-Requirements:
-
-* Agents must output "intermediate_steps" in their run outputs.
-* The dataset must have "expected_steps" in its outputs.
-"""
-from typing import Optional
-
-from langchain.evaluation import EvaluatorType
-from langchain.smith import RunEvalConfig
-from langsmith.evaluation.evaluator import (
-    EvaluationResult,
-    EvaluationResults,
-    RunEvaluator,
-)
-from langsmith.schemas import Example, Run
-
-
-class AgentTrajectoryEvaluator(RunEvaluator):
-    """An evaluator that can be used in conjunction with a standard agent interface."""
-
-    def evaluate_run(
-        self, run: Run, example: Optional[Example] = None
-    ) -> EvaluationResults:
-        if run.outputs is None:
-            raise ValueError("Run outputs cannot be None")
-        # This is the output of each run
-        intermediate_steps = run.outputs["intermediate_steps"]
-        # Since we are comparing to the tool names, we now need to get that
-        # Intermediate steps is a Tuple[AgentAction, Any]
-        # The first element is the action taken
-        # The second element is the observation from taking that action
-        trajectory = [action.tool for action, _ in intermediate_steps]
-        # This is what we uploaded to the dataset
-        if example is None:
-            raise ValueError("Example cannot be None")
-        expected_trajectory = example.outputs["expected_steps"]
-
-        # Just score it based on whether it is correct or not
-        score = int(trajectory == expected_trajectory)
-        step_fraction = len(trajectory) / len(expected_trajectory)
-
-        return {
-            "results": [
-                EvaluationResult(
-                    key="Intermediate steps correctness",
-                    score=score,
-                ),
-                EvaluationResult(
-                    key="# steps / # expected steps",
-                    score=step_fraction,
-                ),
-            ]
-        }
-
-
-STANDARD_AGENT_EVALUATOR = RunEvalConfig(
-    # Evaluators can either be an evaluator type
-    # (e.g., "qa", "criteria", "embedding_distance", etc.) or a
-    # configuration for that evaluator
-    evaluators=[
-        # Measures whether a QA response is "Correct", based on a reference answer
-        # You can also select via the raw string "qa"
-        EvaluatorType.QA
-    ],
-    # You can add custom StringEvaluator or RunEvaluator objects
-    # here as well, which will automatically be
-    # applied to each prediction. Check out the docs for examples.
-    custom_evaluators=[AgentTrajectoryEvaluator()],
-    # We now need to specify this because we have multiple outputs in our dataset
-    reference_key="reference",
-)
diff --git a/tests/agents/__init__.py b/tests/agents/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/tests/agents/test_agents.py b/tests/agents/test_agents.py
deleted file mode 100644
index 2007707b..00000000
--- a/tests/agents/test_agents.py
+++ /dev/null
@@ -1,3 +0,0 @@
-def test_import_agents() -> None:
-    """Test that all agents can be imported"""
-    from langchain_benchmarks.agents import environments, evaluators  # noqa: F401
diff --git a/tests/unit_tests/agents/__init__.py b/tests/unit_tests/agents/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/tests/unit_tests/agents/test_agents.py b/tests/unit_tests/agents/test_agents.py
deleted file mode 100644
index 2007707b..00000000
--- a/tests/unit_tests/agents/test_agents.py
+++ /dev/null
@@ -1,3 +0,0 @@
-def test_import_agents() -> None:
-    """Test that all agents can be imported"""
-    from langchain_benchmarks.agents import environments, evaluators  # noqa: F401