diff --git a/langchain_benchmarks/agents/README.md b/langchain_benchmarks/agents/README.md
deleted file mode 100644
index 3d88253c..00000000
--- a/langchain_benchmarks/agents/README.md
+++ /dev/null
@@ -1,19 +0,0 @@
-# Testing Agents
-
-This directory contains environments that can be used to test agent's ability
-to use tools and make decisions.
-
-## Environments
-
-Environments are named in the style of e[env_number]_[name].py.
-
-### e01_alpha
-
-* Consists of 3 relational tables of users, locations and foods.
-* Defines a set of tools that can be used these tables.
-* Agent should use the given tools to answer questions.
-
-## Running Evaluation
-
-Please refer to the following example to see how to set up and run evaluation
-for agents using [LangSmith](https://github.com/langchain-ai/langsmith-cookbook/blob/main/testing-examples/agent_steps/evaluating_agents.ipynb).
diff --git a/langchain_benchmarks/agents/__init__.py b/langchain_benchmarks/agents/__init__.py
deleted file mode 100644
index 6706ac8d..00000000
--- a/langchain_benchmarks/agents/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-"""Package for helping to evaluate agent runs."""
diff --git a/langchain_benchmarks/agents/environments/__init__.py b/langchain_benchmarks/agents/environments/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/langchain_benchmarks/agents/environments/e01_alpha.py b/langchain_benchmarks/agents/environments/e01_alpha.py
deleted file mode 100644
index 55d88100..00000000
--- a/langchain_benchmarks/agents/environments/e01_alpha.py
+++ /dev/null
@@ -1,401 +0,0 @@
-"""A simple environment for evaluating an agent.
-
-A simple environment to evaluate an agent's ability to use a set of given tools
-to reference questions.
-
-The environment contains fake data about users and their locations and favorite foods.
-
-The environment defines a set of tools that the agent can use to access the data.
-
-Agent performance should be evaluated solely based on the agent's ability to use
-the tools to reference questions.
-"""
-from typing import Callable, List, TypedDict
-
-from langchain.tools import BaseTool, tool
-
-USER_DATA = [
-    # IDs are not consecutive to prevent agents from guessing the ID
-    {
-        "id": 1,
-        "name": "Alice",
-        "email": "alice@gmail.com",
-        "location": 1,
-        "favorite_color": "red",
-        "favorite_foods": [1, 2, 3],
-    },
-    {
-        "id": 21,
-        "name": "Bob",
-        "email": "bob@hotmail.com",
-        "location": 2,
-        "favorite_color": "orange",
-        "favorite_foods": [4, 5, 6],
-    },
-    {
-        "id": 35,
-        "name": "Charlie",
-        "email": "charlie@yahoo.com",
-        "location": 3,
-        "favorite_color": "yellow",
-        "favorite_foods": [3, 7, 2],
-    },
-    {
-        "id": 41,
-        "name": "Donna",
-        "email": "donna@example.com",
-        "location": 4,
-        "favorite_color": "green",
-        "favorite_foods": [6, 1, 4],
-    },
-    {
-        "id": 42,
-        "name": "Eve",
-        "email": "eve@example.org",
-        "location": 5,
-        "favorite_color": "blue",
-        "favorite_foods": [5, 7, 4],
-    },
-    {
-        "id": 43,
-        "name": "Frank The Cat",
-        "email": "frank.the.cat@langchain.dev",
-        "location": 5,
-        "favorite_color": "yellow",
-        "favorite_foods": [3],
-    },
-]
-
-# Create a list of JSON data for locations with "current_weather" as a single string
-LOCATION_DATA = [
-    {
-        "id": 1,
-        "city": "New York",
-        "current_time": "2023-11-14 10:30 AM",
-        "current_weather": "Partly Cloudy, Temperature: 68°F",  # Example weather string
-    },
-    {
-        "id": 2,
-        "city": "Los Angeles",
-        "current_time": "2023-11-14 7:45 AM",
-        "current_weather": "Sunny, Temperature: 75°F",  # Example weather string
-    },
-    {
-        "id": 3,
-        "city": "Chicago",
-        "current_time": "2023-11-14 11:15 AM",
-        "current_weather": "Mostly Cloudy, Temperature: 60°F",  # Example weather string
-    },
-    {
-        "id": 4,
-        "city": "Houston",
-        "current_time": "2023-11-14 12:00 PM",
-        "current_weather": "Rainy, Temperature: 55°F",  # Example weather string
-    },
-    {
-        "id": 5,
-        "city": "Miami",
-        "current_time": "2023-11-14 1:20 PM",
-        "current_weather": "Partly Cloudy, Temperature: 80°F",  # Example weather string
-    },
-]
-
-FOOD_DATA = [
-    {
-        "id": 1,
-        "name": "Pizza",
-        "calories": 285,  # Calories per serving
-        "allergic_ingredients": ["Gluten", "Dairy"],
-    },
-    {
-        "id": 2,
-        "name": "Chocolate",
-        "calories": 50,  # Calories per serving
-        "allergic_ingredients": ["Milk", "Soy"],
-    },
-    {
-        "id": 3,
-        "name": "Sushi",
-        "calories": 300,  # Calories per serving
-        "allergic_ingredients": ["Fish", "Soy"],
-    },
-    {
-        "id": 4,
-        "name": "Burger",
-        "calories": 350,  # Calories per serving
-        "allergic_ingredients": ["Gluten", "Dairy"],
-    },
-    {
-        "id": 5,
-        "name": "Ice Cream",
-        "calories": 200,  # Calories per serving
-        "allergic_ingredients": ["Dairy"],
-    },
-    {
-        "id": 6,
-        "name": "Pasta",
-        "calories": 180,  # Calories per serving
-        "allergic_ingredients": ["Gluten"],
-    },
-    {
-        "id": 7,
-        "name": "Salad",
-        "calories": 50,  # Calories per serving
-        "allergic_ingredients": [],
-    },
-]
-
-
-class SearchHit(TypedDict, total=False):
-    """A search hit."""
-
-    id: str
-
-
-def _similarity_search(data: List[dict], query: str, key: str) -> List[SearchHit]:
-    """Return a list of data that matches the given query.
-
-    Similarity score is jaccard similarity based on the number of shared
-    characters between the query and the data.
-
-    Args:
-        data: The data to search.
-        query: The query to search for.
-        key: The key to search in.
-
-    Returns:
-        The list of matching data.
-    """
-
-    def _score_function(x: str) -> float:
-        """Calculate the similarity score between the query and the given string."""
-        return len(set(x) & set(query)) / len(set(x) | set(query))
-
-    re_ranked_data = sorted(data, key=lambda x: _score_function(x[key]), reverse=True)
-    return [{"id": d["id"], key: d[key]} for d in re_ranked_data]
-
-
-def _get_user(id: int) -> dict:
-    """Find the user with the given user ID.
-
-    Args:
-        id: The user's ID.
-
-    Returns:
-        The user's data.
-    """
-    for user in USER_DATA:
-        if user["id"] == id:
-            return user
-    raise ValueError(f"User ID {id} cannot be resolved")
-
-
-def _get_location(id: int) -> dict:
-    """Find the location with the given location ID.
-
-    Args:
-        id: The location's ID.
-
-    Returns:
-        The location's data.
-    """
-    for location in LOCATION_DATA:
-        if location["id"] == id:
-            return location
-    raise ValueError(f"Location ID {id} cannot be resolved")
-
-
-def _get_food(food_id: int) -> dict:
-    """Find the food with the given food ID.
-
-    Args:
-        food_id: The food's ID.
-
-    Returns:
-        The food's data.
-    """
-    for food in FOOD_DATA:
-        if food["id"] == food_id:
-            return food
-    raise ValueError(f"Food ID {food_id} cannot be resolved")
-
-
-def get_available_functions() -> List[Callable]:
-    """Get all the available functions."""
-
-    def get_user_name(user_id: int) -> str:
-        """Get the name of the user with the given user ID.
-
-        Args:
-            user_id: The user's ID.
-
-        Returns:
-            The user's name.
-        """
-        return _get_user(user_id)["name"]
-
-    def list_user_ids() -> List[str]:
-        """List all the user IDs."""
-        return [user["id"] for user in USER_DATA]
-
-    def find_users_by_name(name: str) -> List[SearchHit]:
-        """Find users with the given name.
-
-        Args:
-            name: The name to search for.
-
-        Returns:
-            The list of matching users.
-        """
-        return _similarity_search(USER_DATA, name, "name")
-
-    def find_locations_by_name(city: str) -> List[SearchHit]:
-        """Find locations with the given city name."""
-        return _similarity_search(LOCATION_DATA, city, "city")
-
-    def find_foods_by_name(food: str) -> List[SearchHit]:
-        """Find foods with the given name."""
-        return _similarity_search(FOOD_DATA, food, "name")
-
-    def get_user_email(user_id: int) -> str:
-        """Get the email of the user with the given user ID.
-
-        Args:
-            user_id: The user's ID.
-
-        Returns:
-            The user's email.
-        """
-        return _get_user(user_id)["email"]
-
-    def get_user_location(user_id: int) -> int:
-        """Get the location ID of the user with the given user ID.
-
-        Args:
-            user_id: The user's ID.
-
-        Returns:
-            The user's location ID.
-        """
-        return _get_user(user_id)["location"]
-
-    def get_user_favorite_color(user_id: int) -> str:
-        """Get the favorite color of the user with the given user ID.
-
-        Args:
-            user_id: The user's ID.
-
-        Returns:
-            The user's favorite color.
-        """
-        return _get_user(user_id)["favorite_color"]
-
-    def get_user_favorite_foods(user_id: int) -> List[int]:
-        """Get the list of favorite foods of the user with the given user ID.
-
-        Args:
-            user_id: The user's ID.
-
-        Returns:
-            The list of favorite foods.
-        """
-        return _get_user(user_id)["favorite_foods"]
-
-    def get_weather_at_location(location_id: int) -> str:
-        """Get the current weather at the location with the given location ID.
-
-        Args:
-            location_id: The location's ID.
-
-        Returns:
-            The current weather at the location.
-        """
-        return _get_location(location_id)["current_weather"]
-
-    def get_city_for_location(location_id: int) -> str:
-        """Get the city for the location with the given location ID.
-
-        Args:
-            location_id: The location's ID.
-
-        Returns:
-            The city name for the location.
-        """
-        return _get_location(location_id)["city"]
-
-    def get_current_time_for_location(location_id: int) -> str:
-        """Get the current time for the location with the given location ID.
-
-        Args:
-            location_id: The location's ID.
-
-        Returns:
-            The current time for the location.
-        """
-        return _get_location(location_id)["current_time"]
-
-    def get_current_weather_for_location(location_id: int) -> str:
-        """Get the current weather for the location with the given location ID.
-
-        Args:
-            location_id: The location's ID.
-
-        Returns:
-            The current weather for the location.
-        """
-        return _get_location(location_id)["current_weather"]
-
-    def get_food_name(food_id: int) -> str:
-        """Get the name of the food with the given food ID.
-
-        Args:
-            food_id: The food's ID.
-
-        Returns:
-            The name of the food.
-        """
-        return _get_food(food_id)["name"]
-
-    def get_food_calories(food_id: int) -> int:
-        """Get the calories per serving for the food with the given food ID.
-
-        Args:
-            food_id: The food's ID.
-
-        Returns:
-            The calories per serving of the food.
-        """
-        return _get_food(food_id)["calories"]
-
-    def get_food_allergic_ingredients(food_id: int) -> List[str]:
-        """Get the list of allergic ingredients for the food with the given food ID.
-
-        Args:
-            food_id: The food's ID.
-
-        Returns:
-            The list of allergic ingredients.
-        """
-        return _get_food(food_id)["allergic_ingredients"]
-
-    def get_current_user_id() -> int:
-        """Get the current user's ID.
-
-        Returns:
-            The current user's ID.
-        """
-        return 35
-
-    # Get all the functions defined in the scope of this function
-    functions = [f for f in locals().values() if callable(f)]
-    return functions
-
-
-def get_tools() -> List[BaseTool]:
-    """Get all the available tools."""
-    functions = get_available_functions()
-    return [tool(f) for f in functions]
-
-
-# ID of a dataset that contains the questions and references
-DATASET_ID = "9f73165c-d333-4d14-8f59-bd7eede5db08"  # ID of Agent Gym: E01 Alpha
diff --git a/langchain_benchmarks/agents/evaluators.py b/langchain_benchmarks/agents/evaluators.py
deleted file mode 100644
index c4b9a8be..00000000
--- a/langchain_benchmarks/agents/evaluators.py
+++ /dev/null
@@ -1,73 +0,0 @@
-"""Module contains standard evaluators for agents.
-
-Requirements:
-
-* Agents must output "intermediate_steps" in their run outputs.
-* The dataset must have "expected_steps" in its outputs.
-"""
-from typing import Optional
-
-from langchain.evaluation import EvaluatorType
-from langchain.smith import RunEvalConfig
-from langsmith.evaluation.evaluator import (
-    EvaluationResult,
-    EvaluationResults,
-    RunEvaluator,
-)
-from langsmith.schemas import Example, Run
-
-
-class AgentTrajectoryEvaluator(RunEvaluator):
-    """An evaluator that can be used in conjunction with a standard agent interface."""
-
-    def evaluate_run(
-        self, run: Run, example: Optional[Example] = None
-    ) -> EvaluationResults:
-        if run.outputs is None:
-            raise ValueError("Run outputs cannot be None")
-        # This is the output of each run
-        intermediate_steps = run.outputs["intermediate_steps"]
-        # Since we are comparing to the tool names, we now need to get that
-        # Intermediate steps is a Tuple[AgentAction, Any]
-        # The first element is the action taken
-        # The second element is the observation from taking that action
-        trajectory = [action.tool for action, _ in intermediate_steps]
-        # This is what we uploaded to the dataset
-        if example is None:
-            raise ValueError("Example cannot be None")
-        expected_trajectory = example.outputs["expected_steps"]
-
-        # Just score it based on whether it is correct or not
-        score = int(trajectory == expected_trajectory)
-        step_fraction = len(trajectory) / len(expected_trajectory)
-
-        return {
-            "results": [
-                EvaluationResult(
-                    key="Intermediate steps correctness",
-                    score=score,
-                ),
-                EvaluationResult(
-                    key="# steps / # expected steps",
-                    score=step_fraction,
-                ),
-            ]
-        }
-
-
-STANDARD_AGENT_EVALUATOR = RunEvalConfig(
-    # Evaluators can either be an evaluator type
-    # (e.g., "qa", "criteria", "embedding_distance", etc.) or a
-    # configuration for that evaluator
-    evaluators=[
-        # Measures whether a QA response is "Correct", based on a reference answer
-        # You can also select via the raw string "qa"
-        EvaluatorType.QA
-    ],
-    # You can add custom StringEvaluator or RunEvaluator objects
-    # here as well, which will automatically be
-    # applied to each prediction. Check out the docs for examples.
-    custom_evaluators=[AgentTrajectoryEvaluator()],
-    # We now need to specify this because we have multiple outputs in our dataset
-    reference_key="reference",
-)
diff --git a/tests/agents/__init__.py b/tests/agents/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/tests/agents/test_agents.py b/tests/agents/test_agents.py
deleted file mode 100644
index 2007707b..00000000
--- a/tests/agents/test_agents.py
+++ /dev/null
@@ -1,3 +0,0 @@
-def test_import_agents() -> None:
-    """Test that all agents can be imported"""
-    from langchain_benchmarks.agents import environments, evaluators  # noqa: F401
diff --git a/tests/unit_tests/agents/__init__.py b/tests/unit_tests/agents/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/tests/unit_tests/agents/test_agents.py b/tests/unit_tests/agents/test_agents.py
deleted file mode 100644
index 2007707b..00000000
--- a/tests/unit_tests/agents/test_agents.py
+++ /dev/null
@@ -1,3 +0,0 @@
-def test_import_agents() -> None:
-    """Test that all agents can be imported"""
-    from langchain_benchmarks.agents import environments, evaluators  # noqa: F401