From a6adeb59629615ebbc5d7f94802c230af5978b43 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Tue, 5 Aug 2025 22:47:43 +0000 Subject: [PATCH 1/5] changed tests --- eval_protocol/mcp_env.py | 2 +- .../pytest/default_mcp_gym_rollout_processor.py | 1 + tests/pytest/test_tau_bench_airline.py | 14 +++++--------- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/eval_protocol/mcp_env.py b/eval_protocol/mcp_env.py index f52b1793..ae216d3e 100644 --- a/eval_protocol/mcp_env.py +++ b/eval_protocol/mcp_env.py @@ -272,7 +272,7 @@ async def rollout( execution_manager = ExecutionManager() return await execution_manager.execute_rollouts( - envs, policy, steps, openai_format_log_file, max_concurrent_rollouts + envs, policy, steps, openai_format_log_file, max_concurrent_rollouts, evaluation_rows ) diff --git a/eval_protocol/pytest/default_mcp_gym_rollout_processor.py b/eval_protocol/pytest/default_mcp_gym_rollout_processor.py index bc68d346..7097c685 100644 --- a/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +++ b/eval_protocol/pytest/default_mcp_gym_rollout_processor.py @@ -219,6 +219,7 @@ async def default_mcp_gym_rollout_processor(rows: List[EvaluationRow], config: R evaluation_rows = await ep.rollout( envs, policy=policy, + evaluation_rows=rows, steps=config.steps, max_concurrent_rollouts=config.max_concurrent_rollouts ) diff --git a/tests/pytest/test_tau_bench_airline.py b/tests/pytest/test_tau_bench_airline.py index b003c12d..091531b5 100644 --- a/tests/pytest/test_tau_bench_airline.py +++ b/tests/pytest/test_tau_bench_airline.py @@ -63,13 +63,13 @@ def tau_bench_airline_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Eval @evaluation_test( input_dataset=["tests/pytest/data/airline_dataset.jsonl"], dataset_adapter=tau_bench_airline_to_evaluation_row, - model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"], - rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}], + model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"], + rollout_input_params=[{"temperature": 0.8, "max_tokens": 4096}], rollout_processor=default_mcp_gym_rollout_processor, threshold_of_success=0.4, num_runs=1, mode="pointwise", - max_concurrent_rollouts=32, + max_concurrent_rollouts=16, server_script_path="examples/tau2_mcp/server.py", ) def test_tau_bench_airline_evaluation(row: EvaluationRow) -> EvaluationRow: @@ -80,12 +80,10 @@ def test_tau_bench_airline_evaluation(row: EvaluationRow) -> EvaluationRow: extracts evaluation criteria from dataset entries. No wrapper needed! Args: - input_dataset: List of EvaluationRow objects from tau bench airline dataset - input_params: Model parameters (temperature, max_tokens, etc.) - model: Model identifier + row: EvaluationRow object from tau bench airline dataset after rollout Returns: - List of evaluated EvaluationRow objects with scores and feedback + EvaluationRow with tau2 evaluation results """ messages = row.messages @@ -131,9 +129,7 @@ def test_tau_bench_airline_evaluation(row: EvaluationRow) -> EvaluationRow: communicate_info=communicate_info, actions=actions, reward_basis=[ - RewardType.NL_ASSERTION, RewardType.DB, - RewardType.COMMUNICATE, RewardType.ACTION, ], ) From 3415d435e76a0271467bfc047b5c443587a5e007 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Tue, 5 Aug 2025 22:48:30 +0000 Subject: [PATCH 2/5] more change --- eval_protocol/mcp/execution/manager.py | 36 ++++++++++---------------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/eval_protocol/mcp/execution/manager.py b/eval_protocol/mcp/execution/manager.py index 8f52d323..3bae8d28 100644 --- a/eval_protocol/mcp/execution/manager.py +++ b/eval_protocol/mcp/execution/manager.py @@ -77,6 +77,7 @@ async def execute_rollouts( steps: int = 512, openai_format_log_file: Optional[str] = None, max_concurrent_rollouts: int = 8, + evaluation_rows: Optional[List[EvaluationRow]] = None, ) -> List[EvaluationRow]: """ Execute general rollouts using tool calling interface with automatic record/playback. @@ -170,9 +171,11 @@ async def _execute_with_semaphore(idx): # Add note about control plane separation logger.info(f"🎛️ Trajectories include control plane separation") - # Convert trajectories to unified EvaluationRow format - evaluation_rows = [] - for trajectory in trajectories: + # Convert trajectories to unified EvaluationRow format. If no evaluation_rows are provided, create empty ones for backwards compatibility. + if evaluation_rows is None: + evaluation_rows = [EvaluationRow(messages=[], input_metadata=InputMetadata()) for _ in trajectories] + + for idx, trajectory in enumerate(trajectories): # Handle multimodal content by extracting text from complex content structures messages = [] for msg in trajectory.conversation_history: @@ -190,26 +193,15 @@ async def _execute_with_semaphore(idx): messages.append(Message.model_validate(msg_dict)) - input_metadata = InputMetadata( - row_id=trajectory.session.dataset_row.id if trajectory.session.dataset_row else None, - dataset_info=asdict(trajectory.session.dataset_row) if trajectory.session.dataset_row else {}, - completion_params=CompletionParams( - model=policy.model_id, - temperature=getattr(policy, "temperature", None), - max_tokens=getattr(policy, "max_tokens", None), - max_tool_calls=getattr(policy, "max_tools_per_turn", None), - ), - session_data={ - "timestamp": time.time(), - }, - ) - evaluation_row = EvaluationRow( - messages=messages, - tools=shared_tool_schema, - input_metadata=input_metadata, - usage=trajectory.usage, + evaluation_rows[idx].messages = messages + evaluation_rows[idx].tools = shared_tool_schema + evaluation_rows[idx].usage = trajectory.usage + evaluation_rows[idx].input_metadata.completion_params = CompletionParams( + model=policy.model_id, + temperature=getattr(policy, "temperature", None), + max_tokens=getattr(policy, "max_tokens", None), + max_tool_calls=getattr(policy, "max_tools_per_turn", None), ) - evaluation_rows.append(evaluation_row) return evaluation_rows From bf8dfa470878352eeaedcc30622967f1c022d7f5 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Thu, 7 Aug 2025 00:54:59 +0000 Subject: [PATCH 3/5] gpt-oss example e2e --- .../default_mcp_gym_rollout_processor.py | 45 ++++++++++--------- tests/pytest/test_tau_bench_airline.py | 33 +++++++------- .../tau2/evaluator/evaluator_nl_assertions.py | 37 ++++++++++----- 3 files changed, 67 insertions(+), 48 deletions(-) diff --git a/eval_protocol/pytest/default_mcp_gym_rollout_processor.py b/eval_protocol/pytest/default_mcp_gym_rollout_processor.py index 7097c685..6ea9064c 100644 --- a/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +++ b/eval_protocol/pytest/default_mcp_gym_rollout_processor.py @@ -1,8 +1,10 @@ import asyncio +import atexit import os +import signal +import socket import subprocess import time -import socket from pathlib import Path from typing import List, Optional @@ -10,9 +12,6 @@ from eval_protocol.models import EvaluationRow, Message from eval_protocol.pytest.types import RolloutProcessorConfig -import atexit -import signal - class MCPServerManager: """Manages MCP server lifecycle for testing.""" @@ -182,50 +181,52 @@ def __exit__(self, exc_type, exc_val, exc_tb): return False # Don't suppress exceptions - -async def default_mcp_gym_rollout_processor(rows: List[EvaluationRow], config: RolloutProcessorConfig) -> List[EvaluationRow]: +async def default_mcp_gym_rollout_processor( + rows: List[EvaluationRow], config: RolloutProcessorConfig +) -> List[EvaluationRow]: """ Rollout processor for tau bench environments. - + This processor starts an MCP server, creates tau bench environments, and runs rollouts using the eval_protocol framework, following the pattern from test_tau2_e2e.py. - + Args: rows: List of EvaluationRow objects containing messages and dataset info in input_metadata config: RolloutProcessorConfig with model and other parameters - + Returns: List of EvaluationRow objects with completed conversations """ server = MCPServerManager(config.server_script_path, port=9700) - + try: server.start() - + policy = ep.LiteLLMPolicy( model_id=config.model, - temperature=config.input_params.get('temperature', 0.0), - max_tokens=config.input_params.get('max_tokens', 4096), + temperature=config.input_params.get("temperature", 0.0), + max_tokens=config.input_params.get("max_tokens", 4096), + reasoning_effort=config.input_params.get("reasoning_effort", None), ) - + # Create MCP environments directly from evaluation_rows envs = ep.make( - 'http://localhost:9700/mcp/', + "http://localhost:9700/mcp/", evaluation_rows=rows, model_id=policy.model_id, ) - + # Run rollout with environments and policy evaluation_rows = await ep.rollout( - envs, - policy=policy, + envs, + policy=policy, evaluation_rows=rows, - steps=config.steps, - max_concurrent_rollouts=config.max_concurrent_rollouts + steps=config.steps, + max_concurrent_rollouts=config.max_concurrent_rollouts, ) - + return evaluation_rows - + finally: # Always clean up the server server.stop() diff --git a/tests/pytest/test_tau_bench_airline.py b/tests/pytest/test_tau_bench_airline.py index 091531b5..a0146f60 100644 --- a/tests/pytest/test_tau_bench_airline.py +++ b/tests/pytest/test_tau_bench_airline.py @@ -10,10 +10,9 @@ from pathlib import Path from typing import Any, Dict, List -from eval_protocol.models import EvaluateResult, EvaluationRow, Message, InputMetadata, CompletionParams +from eval_protocol.models import CompletionParams, EvaluateResult, EvaluationRow, InputMetadata, Message from eval_protocol.pytest import evaluation_test from eval_protocol.pytest.default_mcp_gym_rollout_processor import default_mcp_gym_rollout_processor - from vendor.tau2.data_model.message import ( AssistantMessage, SystemMessage, @@ -28,20 +27,21 @@ from vendor.tau2.evaluator.evaluator_nl_assertions import NLAssertionsEvaluator from vendor.tau2.registry import registry + def tau_bench_airline_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]: """ Convert entries from airline dataset to EvaluationRow objects. """ rows = [] test_dir = Path(__file__).parent.parent.parent / "examples" / "tau2_mcp" / "tests" - + # Load system prompt from file so we can change it in one place domain = data[0]["environment_context"]["domain"] prompt_file = test_dir / f"system_prompts/{domain}_agent_system_prompt.md" - + with open(prompt_file, "r") as f: system_prompt = f.read().strip() - + for row in data: eval_row = EvaluationRow( messages=[Message(role="system", content=system_prompt)], @@ -52,19 +52,20 @@ def tau_bench_airline_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Eval "user_simulation": row["user_simulation"], "evaluation_criteria": row["evaluation_criteria"], "user_prompt_template": row["user_prompt_template"], - } + }, ), ) - + rows.append(eval_row) - + return rows + @evaluation_test( input_dataset=["tests/pytest/data/airline_dataset.jsonl"], dataset_adapter=tau_bench_airline_to_evaluation_row, model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"], - rollout_input_params=[{"temperature": 0.8, "max_tokens": 4096}], + rollout_input_params=[{"temperature": 0.8, "max_tokens": 4096, "reasoning_effort": "high"}], rollout_processor=default_mcp_gym_rollout_processor, threshold_of_success=0.4, num_runs=1, @@ -75,22 +76,22 @@ def tau_bench_airline_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Eval def test_tau_bench_airline_evaluation(row: EvaluationRow) -> EvaluationRow: """ Test tau bench airline evaluation using the pytest framework. - + This test now uses the tau_bench_airline_reward function which automatically extracts evaluation criteria from dataset entries. No wrapper needed! - + Args: row: EvaluationRow object from tau bench airline dataset after rollout - + Returns: EvaluationRow with tau2 evaluation results """ messages = row.messages - + # Get evaluation criteria and user_simulation from input_metadata.dataset_info dataset_info = row.input_metadata.dataset_info if row.input_metadata else {} evaluation_criteria = dataset_info.get("evaluation_criteria", {}) - + nl_assertions = evaluation_criteria.get("nl_assertions", []) communicate_info = evaluation_criteria.get("communicate_info", []) actions = evaluation_criteria.get("actions", []) @@ -130,7 +131,7 @@ def test_tau_bench_airline_evaluation(row: EvaluationRow) -> EvaluationRow: actions=actions, reward_basis=[ RewardType.DB, - RewardType.ACTION, + RewardType.COMMUNICATE, ], ) @@ -226,4 +227,4 @@ def test_tau_bench_airline_evaluation(row: EvaluationRow) -> EvaluationRow: reason=reason, metrics={}, ) - return row \ No newline at end of file + return row diff --git a/vendor/tau2/evaluator/evaluator_nl_assertions.py b/vendor/tau2/evaluator/evaluator_nl_assertions.py index 0bb56716..1840ac55 100644 --- a/vendor/tau2/evaluator/evaluator_nl_assertions.py +++ b/vendor/tau2/evaluator/evaluator_nl_assertions.py @@ -1,4 +1,7 @@ import json +from typing import List + +from pydantic import BaseModel from vendor.tau2.config import DEFAULT_LLM_NL_ASSERTIONS, DEFAULT_LLM_NL_ASSERTIONS_ARGS from vendor.tau2.data_model.message import Message, SystemMessage, UserMessage @@ -7,6 +10,20 @@ from vendor.tau2.utils.llm_utils import generate +class NLAssertionResult(BaseModel): + """Individual NL assertion evaluation result.""" + + expectedOutcome: str + reasoning: str + metExpectation: bool + + +class NLAssertionsResponse(BaseModel): + """Complete NL assertions evaluation response.""" + + results: List[NLAssertionResult] + + class NLAssertionsEvaluator: """ Judge that evaluates whether a trajectory adheres to all the natural-language assertions. @@ -37,9 +54,7 @@ def calculate_reward( reward_breakdown={RewardType.NL_ASSERTION: 1.0}, ) - nl_assertions_checks = cls.evaluate_nl_assertions( - full_trajectory, nl_assertions - ) + nl_assertions_checks = cls.evaluate_nl_assertions(full_trajectory, nl_assertions) # Calculate reward: 1 if all expectations are met, 0 otherwise all_expectations_met = all(result.met for result in nl_assertions_checks) @@ -70,9 +85,7 @@ def evaluate_nl_assertions( - metExpectation: Boolean indicating if the assertion was met - reasoning: Explanation for the evaluation """ - trajectory_str = "\n".join( - [f"{message.role}: {message.content}" for message in trajectory] - ) + trajectory_str = "\n".join([f"{message.role}: {message.content}" for message in trajectory]) # System prompt similar to the TypeScript implementation system_prompt = """ TASK @@ -86,7 +99,7 @@ def evaluate_nl_assertions( - `reasoning`: a short explanation for your classification - `metExpectation`: `true` if the agent satisfies the expected outcomes, `false` otherwise - `expectedOutcome`: repeat the expectation from the input that you are grading - + Example response structure: { "results": [ @@ -102,7 +115,7 @@ def evaluate_nl_assertions( user_prompt = f""" conversation: {trajectory_str} - + expectedOutcomes: {nl_assertions} """ @@ -115,8 +128,12 @@ def evaluate_nl_assertions( assistant_message = generate( model=DEFAULT_LLM_NL_ASSERTIONS, messages=messages, - **DEFAULT_LLM_NL_ASSERTIONS_ARGS, - ) + temperature=0.0, + response_format={ + "type": "json_schema", + "json_schema": {"name": "NLAssertionsResponse", "schema": NLAssertionsResponse.model_json_schema()}, + }, + ) # Adding constrained generation to ensure the response is a valid JSON object result_data = json.loads(assistant_message.content) return [ NLAssertionCheck( From 83e2b78f5b9713d95b129dce2c6016dd04333ae8 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Thu, 7 Aug 2025 01:03:35 +0000 Subject: [PATCH 4/5] uv lock --- uv.lock | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/uv.lock b/uv.lock index d24abc24..134e2ce4 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 2 +revision = 3 requires-python = ">=3.10" resolution-markers = [ "python_full_version >= '3.13'", @@ -1130,7 +1130,6 @@ dependencies = [ { name = "deepdiff" }, { name = "docstring-parser" }, { name = "fastapi" }, - { name = "fireworks-ai" }, { name = "fsspec" }, { name = "gymnasium" }, { name = "httpx" }, @@ -1243,8 +1242,7 @@ requires-dist = [ { name = "docstring-parser", specifier = ">=0.15" }, { name = "e2b", marker = "extra == 'dev'" }, { name = "fastapi", specifier = ">=0.116.1" }, - { name = "fireworks-ai", specifier = ">=0.19.12" }, - { name = "fireworks-ai", marker = "extra == 'fireworks'", specifier = ">=0.19.10" }, + { name = "fireworks-ai", marker = "extra == 'fireworks'", specifier = ">=0.19.12" }, { name = "flake8", marker = "extra == 'dev'", specifier = ">=3.9.2" }, { name = "fsspec" }, { name = "gymnasium", specifier = ">=0.29.0" }, From 3666d1f94cf5692fb7a6b5fb3891ff6491d5e74c Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Wed, 6 Aug 2025 23:00:58 -0700 Subject: [PATCH 5/5] fix test --- tests/test_rollout_control_plane_integration.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_rollout_control_plane_integration.py b/tests/test_rollout_control_plane_integration.py index 667b74cb..9be29b81 100644 --- a/tests/test_rollout_control_plane_integration.py +++ b/tests/test_rollout_control_plane_integration.py @@ -519,6 +519,7 @@ async def test_rollout_creates_envs_from_url(self): 5, None, 8, + None, ) assert result == ["ok"]