diff --git a/eval_protocol/mcp/execution/manager.py b/eval_protocol/mcp/execution/manager.py
index 21b27eb6..ab461f8a 100644
--- a/eval_protocol/mcp/execution/manager.py
+++ b/eval_protocol/mcp/execution/manager.py
@@ -42,6 +42,7 @@ async def execute_rollouts(
         steps: int = 512,
         openai_format_log_file: Optional[str] = None,
         max_concurrent_rollouts: int = 8,
+        evaluation_rows: Optional[List[EvaluationRow]] = None,
     ) -> List[EvaluationRow]:
         """
         Execute general rollouts using tool calling interface with automatic record/playback.
@@ -135,9 +136,11 @@ async def _execute_with_semaphore(idx):
         # Add note about control plane separation
         logger.info(f"🎛️ Trajectories include control plane separation")

-        # Convert trajectories to unified EvaluationRow format
-        evaluation_rows = []
-        for trajectory in trajectories:
+        # Convert trajectories to unified EvaluationRow format. If no evaluation_rows are provided, create empty ones for backwards compatibility.
+        if evaluation_rows is None:
+            evaluation_rows = [EvaluationRow(messages=[], input_metadata=InputMetadata()) for _ in trajectories]
+
+        for idx, trajectory in enumerate(trajectories):
             # Handle multimodal content by extracting text from complex content structures
             messages = []
             for msg in trajectory.conversation_history:
@@ -155,26 +158,15 @@ async def _execute_with_semaphore(idx):
                 messages.append(Message.model_validate(msg_dict))

-            input_metadata = InputMetadata(
-                row_id=trajectory.session.dataset_row.id if trajectory.session.dataset_row else None,
-                dataset_info=asdict(trajectory.session.dataset_row) if trajectory.session.dataset_row else {},
-                completion_params=CompletionParams(
-                    model=policy.model_id,
-                    temperature=getattr(policy, "temperature", None),
-                    max_tokens=getattr(policy, "max_tokens", None),
-                    max_tool_calls=getattr(policy, "max_tools_per_turn", None),
-                ),
-                session_data={
-                    "timestamp": time.time(),
-                },
-            )
-            evaluation_row = EvaluationRow(
-                messages=messages,
-                tools=shared_tool_schema,
-                input_metadata=input_metadata,
-                usage=trajectory.usage,
+            evaluation_rows[idx].messages = messages
+            evaluation_rows[idx].tools = shared_tool_schema
+            evaluation_rows[idx].usage = trajectory.usage
+            evaluation_rows[idx].input_metadata.completion_params = CompletionParams(
+                model=policy.model_id,
+                temperature=getattr(policy, "temperature", None),
+                max_tokens=getattr(policy, "max_tokens", None),
+                max_tool_calls=getattr(policy, "max_tools_per_turn", None),
             )
-            evaluation_rows.append(evaluation_row)

         return evaluation_rows
diff --git a/eval_protocol/mcp/execution/policy.py b/eval_protocol/mcp/execution/policy.py
index b03c52be..06233c4b 100644
--- a/eval_protocol/mcp/execution/policy.py
+++ b/eval_protocol/mcp/execution/policy.py
@@ -64,6 +64,9 @@ def __init__(
         self.num_retries = num_retries
         self.retry_strategy = retry_strategy

+        # Store additional API parameters from kwargs
+        self.additional_params = kwargs
+
         # Only initialize LiteLLM in live mode (not in playback mode)
         if not self._is_playback:
             self._setup_litellm_caching(use_caching, cache_type, redis_url)
@@ -166,6 +169,14 @@ async def _make_llm_call(self, messages: List[Dict], tools: List[Dict]) -> Dict:
             "base_url": self.base_url,
         }

+        # Add additional parameters from kwargs (like reasoning_effort)
+        if self.additional_params:
+            request_params.update(self.additional_params)
+
+        # Tell LiteLLM to allow reasoning_effort if it's present
+        if "reasoning_effort" in self.additional_params:
+            request_params["allowed_openai_params"] = ["reasoning_effort"]
+
         # Add tools if provided
         if tools:
             request_params["tools"] = tools
diff --git a/eval_protocol/mcp_env.py b/eval_protocol/mcp_env.py
index 2a03e931..1d330994 100644
--- a/eval_protocol/mcp_env.py
+++ b/eval_protocol/mcp_env.py
@@ -40,6 +40,8 @@
 - Resources provide static/configuration data, tools provide dynamic actions
 """

+import asyncio
+
 # For legacy compatibility - import the facade functions
 import logging
 import random
@@ -47,11 +49,10 @@

 # Import all functionality from the new modular components
 from .mcp.execution.manager import ExecutionManager
-from .mcp.execution.policy import AnthropicPolicy, FireworksPolicy, LLMBasePolicy, OpenAIPolicy, LiteLLMPolicy
+from .mcp.execution.policy import AnthropicPolicy, FireworksPolicy, LiteLLMPolicy, LLMBasePolicy, OpenAIPolicy
 from .mcp.session.manager import GeneralMCPVectorEnv
 from .models import EvaluationRow
 from .types import DatasetRow, MCPSession, MCPToolCall
-import asyncio

 logger = logging.getLogger(__name__)

@@ -288,7 +289,7 @@ async def rollout(
     execution_manager = ExecutionManager()

     return await execution_manager.execute_rollouts(
-        envs, policy, steps, openai_format_log_file, max_concurrent_rollouts
+        envs, policy, steps, openai_format_log_file, max_concurrent_rollouts, evaluation_rows
     )

diff --git a/eval_protocol/pytest/default_mcp_gym_rollout_processor.py b/eval_protocol/pytest/default_mcp_gym_rollout_processor.py
index a803dd43..d7cba33d 100644
--- a/eval_protocol/pytest/default_mcp_gym_rollout_processor.py
+++ b/eval_protocol/pytest/default_mcp_gym_rollout_processor.py
@@ -1,8 +1,10 @@
 import asyncio
+import atexit
 import os
+import signal
+import socket
 import subprocess
 import time
-import socket
 from pathlib import Path
 from typing import List, Optional

@@ -10,9 +12,6 @@
 from eval_protocol.models import EvaluationRow, Message
 from eval_protocol.pytest.types import RolloutProcessorConfig

-import atexit
-import signal
-

 class MCPServerManager:
     """Manages MCP server lifecycle for testing."""
@@ -188,13 +187,16 @@ async def default_mcp_gym_rollout_processor(
     """
     Rollout processor for tau bench environments.

+    This processor starts an MCP server, creates tau bench environments, and runs rollouts using the eval_protocol framework, following the pattern from test_tau2_e2e.py.
+
     Args:
         rows: List of EvaluationRow objects containing messages and dataset info in input_metadata
         config: RolloutProcessorConfig with model and other parameters
+
     Returns:
         List of EvaluationRow objects with completed conversations
     """
@@ -207,6 +209,7 @@ async def default_mcp_gym_rollout_processor(
         model_id=config.model,
         temperature=config.input_params.get("temperature", 0.0),
         max_tokens=config.input_params.get("max_tokens", 4096),
+        reasoning_effort=config.input_params.get("reasoning_effort", None),
     )

     # Create MCP environments directly from evaluation_rows
@@ -218,7 +221,11 @@

     # Run rollout with environments and policy
     evaluation_rows = await ep.rollout(
-        envs, policy=policy, steps=config.steps, max_concurrent_rollouts=config.max_concurrent_rollouts
+        envs,
+        policy=policy,
+        evaluation_rows=rows,
+        steps=config.steps,
+        max_concurrent_rollouts=config.max_concurrent_rollouts,
     )

     return evaluation_rows
diff --git a/tests/pytest/test_tau_bench_airline.py b/tests/pytest/test_tau_bench_airline.py
index b003c12d..a0146f60 100644
--- a/tests/pytest/test_tau_bench_airline.py
+++ b/tests/pytest/test_tau_bench_airline.py
@@ -10,10 +10,9 @@
 from pathlib import Path
 from typing import Any, Dict, List

-from eval_protocol.models import EvaluateResult, EvaluationRow, Message, InputMetadata, CompletionParams
+from eval_protocol.models import CompletionParams, EvaluateResult, EvaluationRow, InputMetadata, Message
 from eval_protocol.pytest import evaluation_test
 from eval_protocol.pytest.default_mcp_gym_rollout_processor import default_mcp_gym_rollout_processor
-
 from vendor.tau2.data_model.message import (
     AssistantMessage,
     SystemMessage,
@@ -28,20 +27,21 @@
 from vendor.tau2.evaluator.evaluator_nl_assertions import NLAssertionsEvaluator
 from vendor.tau2.registry import registry

+
 def tau_bench_airline_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
     """
     Convert entries from airline dataset to EvaluationRow objects.
     """
     rows = []
     test_dir = Path(__file__).parent.parent.parent / "examples" / "tau2_mcp" / "tests"
-    
+
     # Load system prompt from file so we can change it in one place
     domain = data[0]["environment_context"]["domain"]
     prompt_file = test_dir / f"system_prompts/{domain}_agent_system_prompt.md"
-    
+
     with open(prompt_file, "r") as f:
         system_prompt = f.read().strip()
-    
+
     for row in data:
         eval_row = EvaluationRow(
             messages=[Message(role="system", content=system_prompt)],
@@ -52,47 +52,46 @@ def tau_bench_airline_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Eval
                     "user_simulation": row["user_simulation"],
                     "evaluation_criteria": row["evaluation_criteria"],
                     "user_prompt_template": row["user_prompt_template"],
-                }
+                },
             ),
         )
-    
+
         rows.append(eval_row)
-    
+
     return rows

+
 @evaluation_test(
     input_dataset=["tests/pytest/data/airline_dataset.jsonl"],
     dataset_adapter=tau_bench_airline_to_evaluation_row,
-    model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
-    rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
+    model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
+    rollout_input_params=[{"temperature": 0.8, "max_tokens": 4096, "reasoning_effort": "high"}],
     rollout_processor=default_mcp_gym_rollout_processor,
     threshold_of_success=0.4,
     num_runs=1,
     mode="pointwise",
-    max_concurrent_rollouts=32,
+    max_concurrent_rollouts=16,
     server_script_path="examples/tau2_mcp/server.py",
 )
 def test_tau_bench_airline_evaluation(row: EvaluationRow) -> EvaluationRow:
     """
     Test tau bench airline evaluation using the pytest framework.
-    
+
     This test now uses the tau_bench_airline_reward function which automatically
     extracts evaluation criteria from dataset entries. No wrapper needed!
-    
+
     Args:
-        input_dataset: List of EvaluationRow objects from tau bench airline dataset
-        input_params: Model parameters (temperature, max_tokens, etc.)
-        model: Model identifier
-    
+        row: EvaluationRow object from tau bench airline dataset after rollout
+
     Returns:
-        List of evaluated EvaluationRow objects with scores and feedback
+        EvaluationRow with tau2 evaluation results
     """
     messages = row.messages
-    
+
     # Get evaluation criteria and user_simulation from input_metadata.dataset_info
     dataset_info = row.input_metadata.dataset_info if row.input_metadata else {}
     evaluation_criteria = dataset_info.get("evaluation_criteria", {})
-    
+
     nl_assertions = evaluation_criteria.get("nl_assertions", [])
     communicate_info = evaluation_criteria.get("communicate_info", [])
     actions = evaluation_criteria.get("actions", [])
@@ -131,10 +130,8 @@ def test_tau_bench_airline_evaluation(row: EvaluationRow) -> EvaluationRow:
         communicate_info=communicate_info,
         actions=actions,
         reward_basis=[
-            RewardType.NL_ASSERTION,
             RewardType.DB,
             RewardType.COMMUNICATE,
-            RewardType.ACTION,
         ],
     )
@@ -230,4 +227,4 @@ def test_tau_bench_airline_evaluation(row: EvaluationRow) -> EvaluationRow:
         reason=reason,
         metrics={},
     )
-    return row
\ No newline at end of file
+    return row
diff --git a/tests/test_rollout_control_plane_integration.py b/tests/test_rollout_control_plane_integration.py
index 667b74cb..9be29b81 100644
--- a/tests/test_rollout_control_plane_integration.py
+++ b/tests/test_rollout_control_plane_integration.py
@@ -519,6 +519,7 @@ async def test_rollout_creates_envs_from_url(self):
             5,
             None,
             8,
+            None,
         )

         assert result == ["ok"]
diff --git a/uv.lock b/uv.lock
index d24abc24..134e2ce4 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1,5 +1,5 @@
 version = 1
-revision = 2
+revision = 3
 requires-python = ">=3.10"
 resolution-markers = [
     "python_full_version >= '3.13'",
@@ -1130,7 +1130,6 @@ dependencies = [
     { name = "deepdiff" },
     { name = "docstring-parser" },
     { name = "fastapi" },
-    { name = "fireworks-ai" },
     { name = "fsspec" },
     { name = "gymnasium" },
     { name = "httpx" },
@@ -1243,8 +1242,7 @@ requires-dist = [
     { name = "docstring-parser", specifier = ">=0.15" },
     { name = "e2b", marker = "extra == 'dev'" },
     { name = "fastapi", specifier = ">=0.116.1" },
-    { name = "fireworks-ai", specifier = ">=0.19.12" },
-    { name = "fireworks-ai", marker = "extra == 'fireworks'", specifier = ">=0.19.10" },
+    { name = "fireworks-ai", marker = "extra == 'fireworks'", specifier = ">=0.19.12" },
     { name = "flake8", marker = "extra == 'dev'", specifier = ">=3.9.2" },
     { name = "fsspec" },
     { name = "gymnasium", specifier = ">=0.29.0" },
diff --git a/vendor/tau2/evaluator/evaluator_nl_assertions.py b/vendor/tau2/evaluator/evaluator_nl_assertions.py
index 0bb56716..1840ac55 100644
--- a/vendor/tau2/evaluator/evaluator_nl_assertions.py
+++ b/vendor/tau2/evaluator/evaluator_nl_assertions.py
@@ -1,4 +1,7 @@
 import json
+from typing import List
+
+from pydantic import BaseModel

 from vendor.tau2.config import DEFAULT_LLM_NL_ASSERTIONS, DEFAULT_LLM_NL_ASSERTIONS_ARGS
 from vendor.tau2.data_model.message import Message, SystemMessage, UserMessage
@@ -7,6 +10,20 @@
 from vendor.tau2.utils.llm_utils import generate


+class NLAssertionResult(BaseModel):
+    """Individual NL assertion evaluation result."""
+
+    expectedOutcome: str
+    reasoning: str
+    metExpectation: bool
+
+
+class NLAssertionsResponse(BaseModel):
+    """Complete NL assertions evaluation response."""
+
+    results: List[NLAssertionResult]
+
+
 class NLAssertionsEvaluator:
     """
     Judge that evaluates whether a trajectory adheres to all the natural-language assertions.
@@ -37,9 +54,7 @@ def calculate_reward(
                 reward_breakdown={RewardType.NL_ASSERTION: 1.0},
             )

-        nl_assertions_checks = cls.evaluate_nl_assertions(
-            full_trajectory, nl_assertions
-        )
+        nl_assertions_checks = cls.evaluate_nl_assertions(full_trajectory, nl_assertions)

         # Calculate reward: 1 if all expectations are met, 0 otherwise
         all_expectations_met = all(result.met for result in nl_assertions_checks)
@@ -70,9 +85,7 @@ def evaluate_nl_assertions(
             - metExpectation: Boolean indicating if the assertion was met
             - reasoning: Explanation for the evaluation
         """
-        trajectory_str = "\n".join(
-            [f"{message.role}: {message.content}" for message in trajectory]
-        )
+        trajectory_str = "\n".join([f"{message.role}: {message.content}" for message in trajectory])
         # System prompt similar to the TypeScript implementation
         system_prompt = """
    TASK
@@ -86,7 +99,7 @@ def evaluate_nl_assertions(
    - `reasoning`: a short explanation for your classification
    - `metExpectation`: `true` if the agent satisfies the expected outcomes, `false` otherwise
    - `expectedOutcome`: repeat the expectation from the input that you are grading
-    
+
    Example response structure:
    {
      "results": [
@@ -102,7 +115,7 @@ def evaluate_nl_assertions(
         user_prompt = f"""
         conversation:
         {trajectory_str}
-        
+
         expectedOutcomes: {nl_assertions}
         """
@@ -115,8 +128,12 @@ def evaluate_nl_assertions(
         assistant_message = generate(
             model=DEFAULT_LLM_NL_ASSERTIONS,
             messages=messages,
-            **DEFAULT_LLM_NL_ASSERTIONS_ARGS,
-        )
+            temperature=0.0,
+            response_format={
+                "type": "json_schema",
+                "json_schema": {"name": "NLAssertionsResponse", "schema": NLAssertionsResponse.model_json_schema()},
+            },
+        )  # Adding constrained generation to ensure the response is a valid JSON object
         result_data = json.loads(assistant_message.content)
         return [
             NLAssertionCheck(
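Taken together, the changes above let a caller thread its own `EvaluationRow`s through a rollout and forward extra sampling kwargs such as `reasoning_effort` from the policy into the LiteLLM request. The sketch below is not part of the patch; it mirrors the call pattern in `default_mcp_gym_rollout_processor.py` under the assumptions that `eval_protocol.mcp_env` is used as `ep`, that the policy constructed there is `LiteLLMPolicy`, and that the helper name `run_airline_rollouts` and the `envs` argument are illustrative placeholders.

```python
# Sketch only, assuming eval_protocol.mcp_env (as ep) and LiteLLMPolicy;
# run_airline_rollouts and envs are hypothetical stand-ins.
from typing import List

import eval_protocol.mcp_env as ep
from eval_protocol.models import EvaluationRow


async def run_airline_rollouts(rows: List[EvaluationRow], envs) -> List[EvaluationRow]:
    # reasoning_effort is not a declared policy argument; it is captured via
    # **kwargs, stored as additional_params, merged into the LiteLLM request,
    # and whitelisted through allowed_openai_params=["reasoning_effort"].
    policy = ep.LiteLLMPolicy(
        model_id="fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
        temperature=0.8,
        max_tokens=4096,
        reasoning_effort="high",
    )

    # Passing evaluation_rows reuses the caller's rows (row_id, dataset_info,
    # seeded system messages) and fills in messages/tools/usage per trajectory;
    # omitting it falls back to freshly created empty rows, as before.
    return await ep.rollout(
        envs,
        policy=policy,
        evaluation_rows=rows,
        steps=512,
        max_concurrent_rollouts=16,
    )
```

Because the extra parameter travels through `**kwargs` into `additional_params`, any other provider-specific field would follow the same path, but only `reasoning_effort` is explicitly whitelisted for LiteLLM in this change.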