diff --git a/docs/source/notebooks/tool_usage/multiverse_math.ipynb b/docs/source/notebooks/tool_usage/multiverse_math.ipynb index 5ac4a18f..ba16627c 100644 --- a/docs/source/notebooks/tool_usage/multiverse_math.ipynb +++ b/docs/source/notebooks/tool_usage/multiverse_math.ipynb @@ -300,11 +300,11 @@ ], "source": [ "import uuid\n", + "\n", "from langsmith.client import Client\n", "\n", "from langchain_benchmarks.tool_usage import get_eval_config\n", "\n", - "\n", "experiment_uuid = uuid.uuid4().hex[:4]\n", "\n", "client = Client()\n", diff --git a/langchain_benchmarks/extraction/evaluators.py b/langchain_benchmarks/extraction/evaluators.py index 58cec0da..b47aa4d3 100644 --- a/langchain_benchmarks/extraction/evaluators.py +++ b/langchain_benchmarks/extraction/evaluators.py @@ -1,8 +1,18 @@ +from typing import Optional + +from langchain.chat_models import ChatOpenAI from langchain.chat_models.base import BaseChatModel from langchain.smith import RunEvalConfig -def get_eval_config(eval_llm: BaseChatModel) -> RunEvalConfig: +def get_eval_config(eval_llm: Optional[BaseChatModel] = None) -> RunEvalConfig: """Get the evaluation configuration for the email task.""" + eval_llm = eval_llm or ChatOpenAI( + model="gpt-4", + temperature=0, + model_kwargs={"seed": 42}, + max_retries=1, + request_timeout=60, + ) return RunEvalConfig( evaluators=[ diff --git a/langchain_benchmarks/extraction/implementations.py b/langchain_benchmarks/extraction/implementations.py index cafac1bd..b85c1456 100644 --- a/langchain_benchmarks/extraction/implementations.py +++ b/langchain_benchmarks/extraction/implementations.py @@ -61,7 +61,13 @@ def run_on_dataset( kwargs: Additional arguments to pass to the client. 
""" client = Client() - eval_llm = ChatOpenAI(model="gpt-4", temperature=0.0, model_kwargs={"seed": 42}) + eval_llm = ChatOpenAI( + model="gpt-4", + temperature=0.0, + model_kwargs={"seed": 42}, + max_retries=1, + request_timeout=60, + ) return client.run_on_dataset( dataset_name=task.name, llm_or_chain_factory=create_openai_function_based_extractor( diff --git a/langchain_benchmarks/rag/evaluators.py b/langchain_benchmarks/rag/evaluators.py index 14cc4cb1..5d5e6e14 100644 --- a/langchain_benchmarks/rag/evaluators.py +++ b/langchain_benchmarks/rag/evaluators.py @@ -84,10 +84,20 @@ def evaluate_run( def get_eval_config() -> RunEvalConfig: """Returns the evaluator for the environment.""" - eval_llm = ChatOpenAI(model="gpt-4", temperature=0.0, model_kwargs={"seed": 42}) + eval_llm = ChatOpenAI( + model="gpt-4", + temperature=0.0, + model_kwargs={"seed": 42}, + max_retries=1, + request_timeout=60, + ) # Use a longer-context LLM to check documents faithfulness_eval_llm = ChatOpenAI( - model="gpt-4-1106-preview", temperature=0.0, model_kwargs={"seed": 42} + model="gpt-4-1106-preview", + temperature=0.0, + model_kwargs={"seed": 42}, + max_retries=1, + request_timeout=60, ) return RunEvalConfig( diff --git a/langchain_benchmarks/tool_usage/evaluators.py b/langchain_benchmarks/tool_usage/evaluators.py index df3f7f23..647bd0e6 100644 --- a/langchain_benchmarks/tool_usage/evaluators.py +++ b/langchain_benchmarks/tool_usage/evaluators.py @@ -99,7 +99,13 @@ class AgentTrajectoryEvaluator(RunEvaluator): def __init__(self) -> None: """Initialize the evaluator.""" - eval_llm = ChatOpenAI(model="gpt-4", temperature=0, model_kwargs={"seed": 42}) + eval_llm = ChatOpenAI( + model="gpt-4", + temperature=0, + model_kwargs={"seed": 42}, + max_retries=1, + request_timeout=60, + ) self.qa_evaluator = load_evaluator(EvaluatorType.QA, llm=eval_llm) def evaluate_run(