From 5bc39d746352cc145bfb4014dc02adf30cfc8506 Mon Sep 17 00:00:00 2001
From: benjibc <youfychenbc5000@gmail.com>
Date: Sat, 9 Aug 2025 23:59:12 +0000
Subject: [PATCH 1/9] Add AIME2025, GPQA, HealthBench evaluation_test suites;
 unify row-limiting via pytest flag; clean up examples

---
 eval_protocol/common_utils.py                 |  41 +++++--
 eval_protocol/generation/clients.py           |   5 +-
 eval_protocol/pytest/evaluation_test.py       |  30 ++++-
 eval_protocol/pytest/plugin.py                |  54 ++++++++
 examples/aime2025_chat_completion/README.md   |  24 ++++
 examples/aime2025_chat_completion/__init__.py |   4 +
 examples/aime2025_chat_completion/main.py     | 110 +++++++++++++++++
 .../tests/test_evaluation.py                  | 115 ++++++++++++++++++
 examples/gpqa/tests/test_evaluation.py        | 101 +++++++++++++++
 examples/healthbench/tests/test_evaluation.py |  95 +++++++++++++++
 pyproject.toml                                |  12 ++
 11 files changed, 579 insertions(+), 12 deletions(-)
 create mode 100644 eval_protocol/pytest/plugin.py
 create mode 100644 examples/aime2025_chat_completion/README.md
 create mode 100644 examples/aime2025_chat_completion/__init__.py
 create mode 100644 examples/aime2025_chat_completion/main.py
 create mode 100644 examples/aime2025_chat_completion/tests/test_evaluation.py
 create mode 100644 examples/gpqa/tests/test_evaluation.py
 create mode 100644 examples/healthbench/tests/test_evaluation.py

diff --git a/eval_protocol/common_utils.py b/eval_protocol/common_utils.py
index e39f80d0..42ad47ad 100644
--- a/eval_protocol/common_utils.py
+++ b/eval_protocol/common_utils.py
@@ -2,6 +2,8 @@
 import re
 from typing import Any, Dict, List
 
+import requests
+
 
 def load_jsonl(file_path: str) -> List[Dict[str, Any]]:
     """
@@ -15,16 +17,39 @@ def load_jsonl(file_path: str) -> List[Dict[str, Any]]:
         Returns an empty list if the file is not found or if errors occur during parsing.
     """
     data: List[Dict[str, Any]] = []
-    with open(file_path, "r", encoding="utf-8") as f:
-        for line_number, line in enumerate(f):
+    if file_path.startswith("http://") or file_path.startswith("https://"):
+        resp = requests.get(file_path, stream=True, timeout=30)
+        resp.raise_for_status()
+        for line_number, raw in enumerate(resp.iter_lines(decode_unicode=True), start=1):
+            if raw is None:
+                continue
+            stripped = raw.strip()
+            if not stripped:
+                continue
             try:
-                data.append(json.loads(line.strip()))
+                data.append(json.loads(stripped))
             except json.JSONDecodeError as e:
-                print(f"Error parsing JSON line for file {file_path} at line {line_number}")
-                # attempt to find "row_id" in the line by finding index of "row_id" and performing regex of `"row_id": (.*),`
-                row_id_index = line.find("row_id")
+                print(f"Error parsing JSON line for URL {file_path} at line {line_number}")
+                row_id_index = stripped.find("row_id")
                 if row_id_index != -1:
-                    row_id = re.search(r'"row_id": (.*),', line[row_id_index:])
-                    raise ValueError(f"{e.msg} at line {line_number}: {line} ({row_id})")
+                    row_id = re.search(r'"row_id": (.*),', stripped[row_id_index:])
+                    raise ValueError(f"{e.msg} at line {line_number}: {stripped} ({row_id})")
                 raise e
+    else:
+        with open(file_path, "r", encoding="utf-8") as f:
+            for line_number, line in enumerate(f, start=1):
+                # Skip entirely blank or whitespace-only lines to be robust to trailing newlines
+                stripped = line.strip()
+                if not stripped:
+                    continue
+                try:
+                    data.append(json.loads(stripped))
+                except json.JSONDecodeError as e:
+                    print(f"Error parsing JSON line for file {file_path} at line {line_number}")
+                    # attempt to find "row_id" in the line by finding index of "row_id" and performing regex of `"row_id": (.*),`
+                    row_id_index = line.find("row_id")
+                    if row_id_index != -1:
+                        row_id = re.search(r'"row_id": (.*),', line[row_id_index:])
+                        raise ValueError(f"{e.msg} at line {line_number}: {line} ({row_id})")
+                    raise e
     return data
diff --git a/eval_protocol/generation/clients.py b/eval_protocol/generation/clients.py
index 8f386290..45be6ab0 100644
--- a/eval_protocol/generation/clients.py
+++ b/eval_protocol/generation/clients.py
@@ -11,7 +11,7 @@
 
 import aiohttp
 from omegaconf import DictConfig
-from pydantic import BaseModel, Field  # Added for new models
+from pydantic import BaseModel  # Added for new models
 
 logger = logging.getLogger(__name__)
 
@@ -83,6 +83,9 @@ async def generate(
         }
         if self.top_p is not None:
             payload["top_p"] = self.top_p
+        # Include reasoning settings if configured (for reasoning-capable models)
+        if self.reasoning_effort:
+            payload["reasoning_effort"] = self.reasoning_effort
 
         if tools:
             payload["tools"] = tools
diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
index 245467bb..a28bbca5 100644
--- a/eval_protocol/pytest/evaluation_test.py
+++ b/eval_protocol/pytest/evaluation_test.py
@@ -1,5 +1,6 @@
 import inspect
 import os
+import os
 from typing import Any, Callable, Dict, List, Optional
 
 import pytest
@@ -132,13 +133,34 @@ def execute_with_params(
             return execute_function(test_func, **kwargs)
 
         # Calculate all possible combinations of parameters
+        def _parse_ep_max_rows(default_value: int | None) -> int | None:
+            """Read EP_MAX_DATASET_ROWS env override as int or None."""
+            raw = os.getenv("EP_MAX_DATASET_ROWS")
+            if raw is None:
+                return default_value
+            s = raw.strip().lower()
+            if s == "none":
+                return None
+            try:
+                return int(s)
+            except ValueError:
+                return default_value
+
         def generate_combinations():
             combinations = []
 
             # Handle optional parameters with defaults
             datasets: List[Optional[DatasetPathParam]] = input_dataset if input_dataset is not None else [None]  # type: ignore
             params: List[Optional[RolloutInputParam]] = rollout_input_params if rollout_input_params is not None else [None]  # type: ignore
-            messages: List[Optional[InputMessagesParam]] = input_messages if input_messages is not None else [None]  # type: ignore
+            # Apply EP_MAX_DATASET_ROWS to input_messages to uniformly control row count when messages are provided
+            if input_messages is not None and isinstance(input_messages, list):
+                effective_max_rows = _parse_ep_max_rows(max_dataset_rows)
+                if effective_max_rows is not None:
+                    messages: List[Optional[InputMessagesParam]] = input_messages[:effective_max_rows]  # type: ignore
+                else:
+                    messages = input_messages  # type: ignore
+            else:
+                messages = [None]  # type: ignore
             kwargs: List[Optional[EvaluationInputParam]] = evaluation_test_kwargs if evaluation_test_kwargs is not None else [None]  # type: ignore
 
             # Generate all combinations
@@ -201,8 +223,10 @@ def wrapper_body(**kwargs):
                     data: List[EvaluationRow] = []
                     if "dataset_path" in kwargs and kwargs["dataset_path"] is not None:
                         data_jsonl = load_jsonl(kwargs["dataset_path"])
-                        if max_dataset_rows is not None:
-                            data_jsonl = data_jsonl[:max_dataset_rows]
+                        # Apply env override for max rows if present
+                        effective_max_rows = _parse_ep_max_rows(max_dataset_rows)
+                        if effective_max_rows is not None:
+                            data_jsonl = data_jsonl[:effective_max_rows]
                         data = dataset_adapter(data_jsonl)
                     elif "input_messages" in kwargs and kwargs["input_messages"] is not None:
                         data: List[EvaluationRow] = [EvaluationRow(messages=kwargs["input_messages"])]
diff --git a/eval_protocol/pytest/plugin.py b/eval_protocol/pytest/plugin.py
new file mode 100644
index 00000000..e3a98128
--- /dev/null
+++ b/eval_protocol/pytest/plugin.py
@@ -0,0 +1,54 @@
+"""
+Pytest plugin for Eval Protocol developer ergonomics.
+
+Adds a discoverable CLI flag `--ep-max-rows` to control how many rows
+evaluation_test processes. This sets the environment variable
+`EP_MAX_DATASET_ROWS` so the core decorator can apply it uniformly to
+both URL datasets and in-memory input_messages.
+
+Usage:
+  - CLI: pytest --ep-max-rows=2  # or --ep-max-rows=all for no limit
+  - Defaults: If not provided, no override is applied (tests use the
+    max_dataset_rows value set in the decorator).
+"""
+
+import os
+from typing import Optional
+
+import pytest
+
+
+def pytest_addoption(parser: pytest.Parser) -> None:
+    group = parser.getgroup("eval-protocol")
+    group.addoption(
+        "--ep-max-rows",
+        action="store",
+        default=None,
+        help=(
+            "Limit number of dataset rows processed by evaluation_test. "
+            "Pass an integer (e.g., 2, 50) or 'all' for no limit."
+        ),
+    )
+
+
+def _normalize_max_rows(val: Optional[str]) -> Optional[str]:
+    if val is None:
+        return None
+    s = val.strip().lower()
+    if s == "all":
+        return "None"
+    # Validate int; if invalid, ignore and return None (no override)
+    try:
+        int(s)
+        return s
+    except ValueError:
+        return None
+
+
+def pytest_configure(config: pytest.Config) -> None:
+    cli_val = config.getoption("--ep-max-rows")
+    norm = _normalize_max_rows(cli_val)
+    if norm is not None:
+        os.environ["EP_MAX_DATASET_ROWS"] = norm
+
+
diff --git a/examples/aime2025_chat_completion/README.md b/examples/aime2025_chat_completion/README.md
new file mode 100644
index 00000000..dbe79527
--- /dev/null
+++ b/examples/aime2025_chat_completion/README.md
@@ -0,0 +1,24 @@
+## AIME2025 Chat Completion Example
+
+This example reproduces gpt-oss's AIME2025 chat completion evaluation inside Eval Protocol.
+
+### What it does
+- Loads AIME2025 questions from Hugging Face
+- Prompts a reasoning-capable chat-completions model
+- Extracts the final integer answer from \boxed{...}
+- Scores exact-match vs. the ground-truth integer
+
+### Quick run (pytest, CI-friendly)
+The evaluation is implemented as a pytest `evaluation_test` under `tests/`. Run it directly:
+
+```bash
+pytest -q examples/aime2025_chat_completion/tests/test_evaluation.py -q
+```
+
+Environment variables expected:
+- `FIREWORKS_API_KEY`
+
+To scale up, adjust parameters in the decorator (e.g., `threshold_of_success`, `max_dataset_rows`).
+
+
+
diff --git a/examples/aime2025_chat_completion/__init__.py b/examples/aime2025_chat_completion/__init__.py
new file mode 100644
index 00000000..8bcaacfb
--- /dev/null
+++ b/examples/aime2025_chat_completion/__init__.py
@@ -0,0 +1,4 @@
+__all__ = ["main"]
+
+
+
diff --git a/examples/aime2025_chat_completion/main.py b/examples/aime2025_chat_completion/main.py
new file mode 100644
index 00000000..92c6dd83
--- /dev/null
+++ b/examples/aime2025_chat_completion/main.py
@@ -0,0 +1,110 @@
+"""
+Eval Protocol example: AIME2025 chat completion evaluation
+
+This example mirrors gpt-oss's AIME 2025 evaluation using OpenAI-compatible
+chat completions. It evaluates whether the assistant's final answer matches the
+ground-truth integer, extracting answers from \\boxed{...} or fallback digits.
+"""
+
+import re
+from typing import Any, Dict, List, Optional, Union
+
+from eval_protocol import EvaluateResult, MetricResult, reward_function
+from eval_protocol.models import Message
+
+
+def _extract_boxed_text(text: str) -> str:
+    """
+    Extract the last occurrence of a boxed answer (\\boxed{...} or \\framebox{...}).
+    If none found, fall back to the last integer found in the text.
+    """
+    if not text:
+        return ""
+
+    pattern_boxed = r"boxed{(.*?)}|framebox{(.*?)}"
+    matches = re.findall(pattern_boxed, text, re.DOTALL)
+    if matches:
+        # Iterate from the end to prioritize the final boxed answer
+        for match in matches[::-1]:
+            for group in match:
+                if group:
+                    return group.split(",")[-1].strip()
+
+    # Fallback: last integer in the text
+    matches_digits = re.findall(r"\d+", text, re.DOTALL)
+    if matches_digits:
+        return matches_digits[-1]
+    return ""
+
+
+def _normalize_to_int_or_none(s: str) -> Optional[int]:
+    if s is None:
+        return None
+    # Only take leading digits
+    m = re.match(r"\d+", str(s).strip())
+    if not m:
+        return None
+    try:
+        return int(m.group(0))
+    except ValueError:
+        return None
+
+
+@reward_function(id="aime2025_exact_match")
+def evaluate(
+    messages: Union[List[Message], List[Dict[str, Any]]],
+    ground_truth: Optional[str] = None,
+    **kwargs,
+) -> EvaluateResult:
+    """
+    Score 1.0 if extracted final answer equals the ground-truth integer, else 0.0.
+    """
+    if not messages:
+        return EvaluateResult(
+            score=0.0,
+            reason="No messages provided",
+            is_score_valid=False,
+            metrics={
+                "parse_status": MetricResult(score=0.0, is_score_valid=False, reason="empty messages")
+            },
+        )
+
+    last_msg = messages[-1]
+    content = last_msg["content"] if isinstance(last_msg, dict) else (last_msg.content or "")
+
+    extracted_text = _extract_boxed_text(content)
+    extracted_int = _normalize_to_int_or_none(extracted_text)
+    gt_int = _normalize_to_int_or_none(ground_truth if ground_truth is not None else "")
+
+    is_valid = extracted_int is not None and gt_int is not None
+    score = 1.0 if (is_valid and extracted_int == gt_int) else 0.0
+
+    metrics: Dict[str, MetricResult] = {
+        "exact_match": MetricResult(
+            score=score,
+            is_score_valid=is_valid,
+            reason=(
+                "Parsed both integers and they matched"
+                if score == 1.0
+                else (
+                    "Parsed integers did not match"
+                    if is_valid
+                    else "Failed to parse integer from prediction or ground truth"
+                )
+            ),
+            data={
+                "extracted_text": extracted_text,
+                "extracted_int": extracted_int,
+                "ground_truth_int": gt_int,
+            },
+        )
+    }
+
+    return EvaluateResult(
+        score=score,
+        reason=("Answer correct" if score == 1.0 else "Answer incorrect"),
+        is_score_valid=is_valid,
+        metrics=metrics,
+    )
+
+
diff --git a/examples/aime2025_chat_completion/tests/test_evaluation.py b/examples/aime2025_chat_completion/tests/test_evaluation.py
new file mode 100644
index 00000000..0ef42ffd
--- /dev/null
+++ b/examples/aime2025_chat_completion/tests/test_evaluation.py
@@ -0,0 +1,115 @@
+from typing import Any, Dict, List
+
+from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult
+from eval_protocol.pytest.default_single_turn_rollout_process import (
+    default_single_turn_rollout_processor,
+)
+from eval_protocol.pytest.evaluation_test import evaluation_test
+
+from examples.aime2025_chat_completion.main import _extract_boxed_text, _normalize_to_int_or_none
+
+
+SYSTEM_PROMPT = (
+    "You are a helpful math assistant. Please reason step by step, and put your "
+    "final answer within \\boxed{...}."
+)
+
+"""
+This test consumes the AIME2025 dataset directly from Hugging Face JSONL URLs via
+the evaluation_test dataset loader + adapter. By default, max_dataset_rows=2 to
+keep CI fast; set it to None to run the full dataset.
+"""
+
+
+def _ep_int(var_name: str, default_value: int | None) -> int | None:
+    """Read an EP_* env var as int ('none' -> None); return default_value if unset or invalid."""
+    import os  # local import: this module has no top-level `import os`, so os.getenv raised NameError
+
+    raw = os.getenv(var_name)
+    if raw is None:
+        return default_value
+    value = raw.strip().lower()
+    try:
+        return None if value == "none" else int(value)
+    except ValueError:
+        return default_value
+
+
+
+
+def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
+    """
+    Convert raw AIME2025 rows (with keys 'question' and 'answer') to EvaluationRow.
+    Limits handled by evaluation_test's max_dataset_rows, so adapter is simple.
+    """
+    converted: List[EvaluationRow] = []
+    for r in rows:
+        question = r.get("question", "")
+        answer = r.get("answer", None)
+        messages = [
+            Message(role="system", content=SYSTEM_PROMPT),
+            Message(role="user", content=str(question)),
+        ]
+        converted.append(EvaluationRow(messages=messages, ground_truth=str(answer) if answer is not None else None))
+    return converted
+
+
+@evaluation_test(
+    model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
+    input_dataset=[
+        "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl",
+        "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl",
+    ],
+    dataset_adapter=aime2025_dataset_adapter,
+    rollout_input_params=[{"temperature": 0.0, "max_tokens": 1024}],
+    rollout_processor=default_single_turn_rollout_processor,
+    aggregation_method="mean",
+    threshold_of_success=None,
+    num_runs=1,
+    max_dataset_rows=2,
+    mode="pointwise",
+)
+def test_aime2025_pointwise(row: EvaluationRow) -> EvaluationRow:
+    """
+    Pointwise evaluation of AIME2025 rows: extract final integer from assistant message and compare to ground truth.
+    """
+    # After rollout, the last message should be assistant's response
+    assistant_msgs = [m for m in row.messages if m.role == "assistant"]
+    content = assistant_msgs[-1].content if assistant_msgs else ""
+
+    extracted_text = _extract_boxed_text(content or "")
+    extracted_int = _normalize_to_int_or_none(extracted_text)
+    # Ground truth comes from dataset_adapter
+    gt_int = _normalize_to_int_or_none(row.ground_truth or "")
+
+    is_valid = extracted_int is not None and gt_int is not None
+    score = 1.0 if (is_valid and extracted_int == gt_int) else 0.0
+
+    metrics = {
+        "exact_match": MetricResult(
+            score=score,
+            is_score_valid=is_valid,
+            reason=(
+                "Parsed both integers and they matched"
+                if score == 1.0
+                else (
+                    "Parsed integers did not match" if is_valid else "Failed to parse integer"
+                )
+            ),
+            data={
+                "extracted_text": extracted_text,
+                "extracted_int": extracted_int,
+                "ground_truth_int": gt_int,
+            },
+        )
+    }
+
+    row.evaluation_result = EvaluateResult(
+        score=score,
+        reason=("Answer correct" if score == 1.0 else "Answer incorrect"),
+        is_score_valid=is_valid,
+        metrics=metrics,
+    )
+    return row
+
+
diff --git a/examples/gpqa/tests/test_evaluation.py b/examples/gpqa/tests/test_evaluation.py
new file mode 100644
index 00000000..42c3c91b
--- /dev/null
+++ b/examples/gpqa/tests/test_evaluation.py
@@ -0,0 +1,101 @@
+from typing import Any, Dict, List
+
+import re
+
+from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult
+from eval_protocol.pytest.evaluation_test import evaluation_test
+from eval_protocol.pytest.default_single_turn_rollout_process import (
+    default_single_turn_rollout_processor,
+)
+
+
+SYSTEM_PROMPT = (
+    "You are a helpful assistant. Read the question and options carefully. "
+    "Express your final answer strictly as a single letter: A, B, C, or D."
+)
+
+
+def extract_abcd_letter(text: str) -> str | None:
+    """Return the first standalone option letter; uppercase wins so the article 'a' is not read as A."""
+    # Fall back to a case-insensitive search only when no uppercase letter exists (e.g. a bare "b").
+    m = re.search(r"\b([ABCD])\b", text or "") or re.search(r"\b([ABCD])\b", (text or "").upper())
+    return m.group(1) if m else None
+
+
+def _build_gpqa_messages_from_hf(max_samples: int | None = 2) -> List[List[Message]]:
+    """
+    Load GPQA (diamond) from the reference blob CSV and build one message list (prompt) per question.
+    Stops after max_samples rows (None = full dataset); raises RuntimeError if no rows were loaded.
+    """
+    from datasets import load_dataset  # type: ignore
+
+    url = "https://openaipublic.blob.core.windows.net/simple-evals/gpqa_diamond.csv"
+    ds = load_dataset("csv", data_files=url, split="train")
+    messages_list: List[List[Message]] = []
+    # Correct letter is stored in a trailing system message. NOTE(review): confirm it is never sent to the model
+    for ex in ds:
+        if max_samples is not None and len(messages_list) >= max_samples:
+            break
+        q = str(ex.get("Question", ""))
+        correct = str(ex.get("Correct Answer", "")).strip()
+        inc1 = str(ex.get("Incorrect Answer 1", ""))
+        inc2 = str(ex.get("Incorrect Answer 2", ""))
+        inc3 = str(ex.get("Incorrect Answer 3", ""))
+        choices = [correct, inc1, inc2, inc3]  # no shuffling: the correct answer is always option (A)
+        user_content = (
+            f"{q}\n\n(A) {choices[0]}\n(B) {choices[1]}\n(C) {choices[2]}\n(D) {choices[3]}\n\nAnswer with one letter."
+        )
+        messages_list.append(
+            [
+                Message(role="system", content=SYSTEM_PROMPT),
+                Message(role="user", content=user_content),
+                Message(role="system", content="__GT__:A"),
+            ]
+        )
+    if not messages_list:
+        raise RuntimeError("Failed to load GPQA messages: no rows found from source")
+    return messages_list
+
+
+_GPQA_INPUT_MESSAGES = _build_gpqa_messages_from_hf(max_samples=2)
+
+
+@evaluation_test(
+    model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
+    input_messages=_GPQA_INPUT_MESSAGES,
+    rollout_input_params=[{"temperature": 0.0, "max_tokens": 512}],
+    rollout_processor=default_single_turn_rollout_processor,
+    aggregation_method="mean",
+    threshold_of_success=None,
+    num_runs=1,
+    max_dataset_rows=2,
+    mode="pointwise",
+)
+def test_gpqa_pointwise(row: EvaluationRow) -> EvaluationRow:
+    assistant_msgs = [m for m in row.messages if m.role == "assistant"]
+    content = assistant_msgs[-1].content if assistant_msgs else ""
+
+    pred = extract_abcd_letter(content or "")
+    # Retrieve GT from the trailing system message we appended
+    gt_tokens = [m.content for m in row.messages if m.role == "system" and (m.content or "").startswith("__GT__:")]
+    gt = gt_tokens[-1].split(":", 1)[1].strip() if gt_tokens else None
+
+    is_valid = pred is not None and gt in {"A", "B", "C", "D"}
+    score = 1.0 if (is_valid and pred == gt) else 0.0
+
+    row.evaluation_result = EvaluateResult(
+        score=score,
+        reason=("Correct option" if score == 1.0 else "Incorrect option"),
+        is_score_valid=is_valid,
+        metrics={
+            "exact_match": MetricResult(
+                score=score,
+                is_score_valid=is_valid,
+                reason=("Matched" if score == 1.0 else "Not matched"),
+                data={"pred": pred, "gt": gt},
+            )
+        },
+    )
+    return row
+
+
diff --git a/examples/healthbench/tests/test_evaluation.py b/examples/healthbench/tests/test_evaluation.py
new file mode 100644
index 00000000..c6877487
--- /dev/null
+++ b/examples/healthbench/tests/test_evaluation.py
@@ -0,0 +1,95 @@
+from typing import Dict, List
+
+import json
+
+from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult
+from eval_protocol.pytest.evaluation_test import evaluation_test
+from eval_protocol.pytest.default_single_turn_rollout_process import (
+    default_single_turn_rollout_processor,
+)
+
+
+SYSTEM_PROMPT = (
+    "You are a clinician assistant. Provide safe, accurate guidance."
+)
+
+
+# Inline two small HealthBench-like samples and attach tiny rubrics in-memory
+_HB_SAMPLES = [
+    {
+        "prompt_text": "I have a high fever and chest pain. Should I go to the hospital?",
+        "rubrics": [
+            {
+                "criterion": "advises going to hospital or urgent care for severe symptoms such as chest pain or high fever",
+                "points": 1.0,
+                "tags": ["safety"],
+            }
+        ],
+    },
+    {
+        "prompt_text": "I feel slightly dizzy after a run. What should I do?",
+        "rubrics": [
+            {
+                "criterion": "suggests rest and hydration for mild symptoms, and discusses when to seek urgent care if symptoms worsen",
+                "points": 1.0,
+                "tags": ["quality"],
+            }
+        ],
+    },
+]
+
+_HB_INPUT_MESSAGES: List[List[Message]] = []
+_HB_RUBRICS_MAP: Dict[str, List[Dict]] = {}
+for s in _HB_SAMPLES:
+    _HB_INPUT_MESSAGES.append(
+        [
+            Message(role="system", content=SYSTEM_PROMPT),
+            Message(role="user", content=s["prompt_text"]),
+        ]
+    )
+    _HB_RUBRICS_MAP[s["prompt_text"]] = s["rubrics"]
+
+
+@evaluation_test(
+    model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
+    input_messages=_HB_INPUT_MESSAGES,
+    rollout_input_params=[{"temperature": 0.2, "max_tokens": 512}],
+    rollout_processor=default_single_turn_rollout_processor,
+    aggregation_method="mean",
+    threshold_of_success=None,
+    num_runs=1,
+    max_dataset_rows=2,
+    mode="pointwise",
+)
+def test_healthbench_pointwise(row: EvaluationRow) -> EvaluationRow:
+    """Minimal proxy: score 1.0 if the reply mentions at least one keyword found in the prompt's rubric."""
+    assistant_msgs = [m for m in row.messages if m.role == "assistant"]
+    content = ((assistant_msgs[-1].content if assistant_msgs else "") or "").lower()  # content may be None
+
+    # Retrieve rubrics for this prompt (keyed by the exact user text); tolerate rows without a user turn
+    user_texts = [m.content for m in row.messages if m.role == "user"]
+    rubrics = _HB_RUBRICS_MAP.get((user_texts[-1] if user_texts else "") or "", [])
+
+    required_keywords = set()
+    for item in rubrics:
+        crit = str(item.get("criterion", "")).lower()
+        for kw in ["hospital", "symptom", "risk", "treatment", "urgent", "hydration", "rest"]:
+            if kw in crit:
+                required_keywords.add(kw)
+
+    hit = any(kw in content for kw in required_keywords)  # any() over an empty set is already False
+    score = 1.0 if hit else 0.0
+
+    row.evaluation_result = EvaluateResult(
+        score=score,
+        reason=("Meets minimal rubric keyword" if hit else "Does not meet minimal rubric keyword"),
+        is_score_valid=True,
+        metrics={
+            "keyword_hit": MetricResult(
+                score=score, is_score_valid=True, reason=f"keywords={sorted(required_keywords)}"
+            )
+        },
+    )
+    return row
+
+
diff --git a/pyproject.toml b/pyproject.toml
index 9e6112a3..def06560 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -115,11 +115,23 @@ adapters = [
     "transformers>=4.0.0",
 ]
 
+[tool.pytest.ini_options]
+addopts = "-q"
+testpaths = [
+    "examples",
+]
+plugins = [
+    "eval_protocol.pytest.plugin",
+]
+
 [project.scripts]
 fireworks-reward = "eval_protocol.cli:main"
 eval-protocol = "eval_protocol.cli:main"
 ep = "eval_protocol.cli:main"
 
+[project.entry-points.pytest11]
+eval_protocol = "eval_protocol.pytest.plugin"
+
 [tool.setuptools.packages.find]
 include = ["eval_protocol*", "development*", "vendor*"]
 

From 4aa9e5c40a2f1feb0cc9b59c25f5001bc79a774b Mon Sep 17 00:00:00 2001
From: benjibc <youfychenbc5000@gmail.com>
Date: Sun, 10 Aug 2025 05:24:48 +0000
Subject: [PATCH 2/9] evaluation with aggregated scores

---
 development/RUNNING_EVALUATIONS.md            |  80 +++++++++++
 .../default_single_turn_rollout_process.py    |  26 +++-
 eval_protocol/pytest/evaluation_test.py       | 132 ++++++++++++++++--
 eval_protocol/pytest/plugin.py                |  23 +++
 .../tests/test_evaluation.py                  |   5 +-
 5 files changed, 254 insertions(+), 12 deletions(-)
 create mode 100644 development/RUNNING_EVALUATIONS.md

diff --git a/development/RUNNING_EVALUATIONS.md b/development/RUNNING_EVALUATIONS.md
new file mode 100644
index 00000000..4f1832a3
--- /dev/null
+++ b/development/RUNNING_EVALUATIONS.md
@@ -0,0 +1,80 @@
+# Running AIME/GPQA Evaluations in CI and Locally
+
+This guide explains how to run the AIME2025 and GPQA evaluations using the
+pytest-based `evaluation_test` decorator, how to control dataset size and
+concurrency, how to select effort presets, and how to print/persist results
+for CI dashboards/artifacts.
+
+## Objectives
+- Simple pass/fail: ensure evaluation configs don’t regress.
+- Comparable metrics: capture aggregated accuracy across runs/rows.
+- CI-friendly outputs: print summary lines to logs and save JSON artifacts.
+
+## Prerequisites
+- `FIREWORKS_API_KEY` set in the environment
+- Install SDK: `pip install -e .[dev]`
+
+## Controls
+- Row limit
+  - Default `max_dataset_rows=2` in each test decorator for quick CI.
+  - Override centrally: `pytest --ep-max-rows=all` or `--ep-max-rows=50`.
+- Concurrency
+  - Set `max_concurrent_rollouts` in the decorator (recommend 4 for production Fireworks).
+- Repeats
+  - Set `num_runs` in the decorator (e.g., 4).
+- Effort (Fireworks reasoning)
+  - Provide `{"reasoning": {"effort": "low|medium|high"}}` in the test’s `rollout_input_params`.
+  - The default rollout forwards it via LiteLLM `extra_body`.
+
+## Printing & Persisting Results
+- Flags:
+  - `--ep-print-summary`: print concise summary lines at end of each eval
+  - `--ep-summary-json=PATH`: write JSON with suite/model/agg_score/runs/rows/timestamp
+- Example GitHub Actions snippet:
+```yaml
+- name: Run AIME low effort (full)
+  run: |
+    cd python-sdk
+    pytest --ep-max-rows=all --ep-print-summary \
+      --ep-summary-json=outputs/aime_low.json \
+      -q examples/aime2025_chat_completion/tests/test_evaluation.py::test_aime2025_pointwise -q
+- name: Upload AIME results
+  uses: actions/upload-artifact@v4
+  with:
+    name: aime2025-low-summary
+    path: python-sdk/outputs/aime_low.json
+```
+
+## Examples
+### AIME (Low Effort, Full, Repeats=4, Concurrency=4)
+```bash
+cd python-sdk
+pytest --ep-max-rows=all --ep-print-summary \
+  --ep-summary-json=outputs/aime_low.json \
+  -q examples/aime2025_chat_completion/tests/test_evaluation.py::test_aime2025_pointwise -q
+```
+Expected:
+- Terminal summary: `EP Summary | suite=test_aime2025_pointwise model=... agg=0.530 runs=4 rows=...`
+- JSON artifact at `outputs/aime_low.json`
+- For `.../gpt-oss-120b`, the low-effort pass rate should be about 0.50 or higher when averaged over repeated runs
+
+For medium/high effort, add `{"reasoning": {"effort": "medium|high"}}` to
+`rollout_input_params` in the test decorator and rerun with a different JSON path.
+
+### GPQA (Diamond, Low Effort)
+```bash
+cd python-sdk
+pytest --ep-max-rows=all --ep-print-summary \
+  --ep-summary-json=outputs/gpqa_low.json \
+  -q examples/gpqa/tests/test_evaluation.py -q
+```
+Adjust repeats/concurrency/effort in the test decorator similarly to AIME.
+
+## Pass/Fail Signals
+- If `threshold_of_success` is set in a test, it will fail when aggregated score < threshold.
+- Otherwise, the summary is still printed and the JSON artifact written, and the run passes in CI.
+
+## Tips
+- Use `--ep-max-rows` for toggling quick checks vs full evaluations without editing tests.
+- Upload JSON artifacts for dashboards and historical comparisons.
+- Keep concurrency conservative (e.g., 4) to avoid rate limiting.
diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py
index 26023d75..b5c18809 100644
--- a/eval_protocol/pytest/default_single_turn_rollout_process.py
+++ b/eval_protocol/pytest/default_single_turn_rollout_process.py
@@ -2,6 +2,7 @@
 from typing import List
 
 from litellm import acompletion
+import litellm
 from openai.types.chat.chat_completion_message import ChatCompletionMessageToolCall
 
 from eval_protocol.dataset_logger import default_logger
@@ -14,6 +15,15 @@ async def default_single_turn_rollout_processor(
 ) -> List[EvaluationRow]:
     """Generate a single response from any supported model provider using LiteLLM."""
 
+    # Explicitly disable LiteLLM caching to avoid reused responses across runs
+    try:
+        litellm.cache = None
+        # Some versions expose a helper; ignore if unavailable
+        if hasattr(litellm, "disable_cache"):
+            litellm.disable_cache()  # type: ignore[call-arg]
+    except Exception:
+        pass
+
     async def process_row(row: EvaluationRow) -> EvaluationRow:
         """Process a single row asynchronously."""
         if len(row.messages) == 0:
@@ -22,6 +32,11 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
         messages_payload = [{"role": m.role, "content": m.content} for m in row.messages]
 
         request_params = {"model": config.model, "messages": messages_payload, **config.input_params}
+        # Allow passing reasoning effort to Fireworks via LiteLLM using extra_body
+        # Expected: config.input_params may contain {"reasoning": {"effort": "low|medium|high"}}
+        if "reasoning" in config.input_params:
+            request_params.setdefault("extra_body", {})
+            request_params["extra_body"]["reasoning"] = config.input_params["reasoning"]
 
         if row.tools is not None:
             request_params["tools"] = row.tools
@@ -57,8 +72,15 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
         default_logger.log(row)
         return row
 
-    # Process all rows concurrently
-    tasks = [process_row(row) for row in rows]
+    # Process rows with bounded concurrency if configured
+    max_concurrent = getattr(config, "max_concurrent_rollouts", 8) or 8
+    semaphore = asyncio.Semaphore(max_concurrent)
+
+    async def _sem_wrapper(r: EvaluationRow) -> EvaluationRow:
+        async with semaphore:
+            return await process_row(r)
+
+    tasks = [_sem_wrapper(row) for row in rows]
     dataset = list(await asyncio.gather(*tasks))
 
     return dataset
diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
index a28bbca5..04da1f03 100644
--- a/eval_protocol/pytest/evaluation_test.py
+++ b/eval_protocol/pytest/evaluation_test.py
@@ -1,6 +1,8 @@
 import inspect
 import os
-import os
+import copy
+import math
+import statistics
 from typing import Any, Callable, Dict, List, Optional
 
 import pytest
@@ -91,11 +93,11 @@ def decorator(
         if mode == "pointwise":
             # Pointwise mode: function should accept messages and other row-level params
             if "row" not in sig.parameters:
-                raise ValueError(f"In pointwise mode, your eval function must have a parameter named 'row'")
+                raise ValueError("In pointwise mode, your eval function must have a parameter named 'row'")
 
             # validate that "Row" is of type EvaluationRow
             if sig.parameters["row"].annotation is not EvaluationRow:
-                raise ValueError(f"In pointwise mode, the 'row' parameter must be of type EvaluationRow")
+                raise ValueError("In pointwise mode, the 'row' parameter must be of type EvaluationRow")
 
             # validate that the function has a return type of EvaluationRow
             if sig.return_annotation is not EvaluationRow:
@@ -107,7 +109,7 @@ def decorator(
 
             # validate that "Rows" is of type List[EvaluationRow]
             if sig.parameters["rows"].annotation is not List[EvaluationRow]:
-                raise ValueError(f"In batch mode, the 'rows' parameter must be of type List[EvaluationRow]")
+                raise ValueError("In batch mode, the 'rows' parameter must be of type List[EvaluationRow]")
 
             # validate that the function has a return type of List[EvaluationRow]
             if sig.return_annotation is not List[EvaluationRow]:
@@ -150,7 +152,13 @@ def generate_combinations():
             combinations = []
 
             # Handle optional parameters with defaults
-            datasets: List[Optional[DatasetPathParam]] = input_dataset if input_dataset is not None else [None]  # type: ignore
+            # Treat multiple dataset paths as a single combined dataset rather than
+            # parameterizing over each path separately. This produces one summary
+            # that reflects the aggregate of all provided files (e.g., AIME I+II).
+            if input_dataset is not None:
+                datasets: List[Optional[List[DatasetPathParam]]] = [input_dataset]  # type: ignore
+            else:
+                datasets = [None]
             params: List[Optional[RolloutInputParam]] = rollout_input_params if rollout_input_params is not None else [None]  # type: ignore
             # Apply EP_MAX_DATASET_ROWS to input_messages to uniformly control row count when messages are provided
             if input_messages is not None and isinstance(input_messages, list):
@@ -222,7 +230,15 @@ def wrapper_body(**kwargs):
                     # Handle dataset loading
                     data: List[EvaluationRow] = []
                     if "dataset_path" in kwargs and kwargs["dataset_path"] is not None:
-                        data_jsonl = load_jsonl(kwargs["dataset_path"])
+                        ds_arg = kwargs["dataset_path"]
+                        # Support either a single path or a list of paths; if a list is provided,
+                        # concatenate the rows from each file in order.
+                        if isinstance(ds_arg, list):
+                            data_jsonl = []
+                            for p in ds_arg:
+                                data_jsonl.extend(load_jsonl(p))
+                        else:
+                            data_jsonl = load_jsonl(ds_arg)
                         # Apply env override for max rows if present
                         effective_max_rows = _parse_ep_max_rows(max_dataset_rows)
                         if effective_max_rows is not None:
@@ -270,7 +286,7 @@ def wrapper_body(**kwargs):
                         row.pid = os.getpid()
                         default_logger.log(row)
 
-                    # Now run the rollout processor with metadata-initialized data
+                    # Prepare rollout processor config once; we will generate fresh outputs per run
                     config = RolloutProcessorConfig(
                         model=model_name,
                         input_params=input_params,
@@ -279,9 +295,12 @@ def wrapper_body(**kwargs):
                         server_script_path=server_script_path,
                         steps=steps,
                     )
-                    input_dataset = execute_function(rollout_processor, rows=data, config=config)
 
                     for _ in range(num_runs):
+                        # Regenerate outputs each run by deep-copying the pristine dataset
+                        # so model responses are not reused across runs.
+                        fresh_rows = [copy.deepcopy(r) for r in data]
+                        input_dataset = execute_function(rollout_processor, rows=fresh_rows, config=config)
                         if mode == "pointwise":
                             # Pointwise mode: apply the evaluator function to each row
                             for row in input_dataset:
@@ -323,6 +342,23 @@ def wrapper_body(**kwargs):
                     scores = [r.evaluation_result.score for r in all_results if r.evaluation_result]
                     agg_score = aggregate(scores, aggregation_method)
 
+                    # Compute 95% confidence interval for mean aggregation
+                    # TODO bchen: remove after Derek has his stuff
+                    ci_low: float | None = None
+                    ci_high: float | None = None
+                    if aggregation_method == "mean":
+                        n = len(scores)
+                        if n >= 2:
+                            try:
+                                sample_std = statistics.stdev(scores)
+                                se = sample_std / math.sqrt(n)
+                                margin = 1.96 * se
+                                ci_low = float(max(0.0, (agg_score or 0.0) - margin)) if agg_score is not None else None
+                                ci_high = float(min(1.0, (agg_score or 0.0) + margin)) if agg_score is not None else None
+                            except Exception:
+                                ci_low = None
+                                ci_high = None
+
                     # Determine if the evaluation passed based on threshold
                     passed = None
                     if threshold_of_success is not None:
@@ -335,6 +371,86 @@ def wrapper_body(**kwargs):
                             r.eval_metadata.passed = passed
                         default_logger.log(r)
 
+                    # Optional: print and/or persist a summary artifact for CI
+                    try:
+                        should_print = os.getenv("EP_PRINT_SUMMARY") == "1"
+                        summary_path = os.getenv("EP_SUMMARY_JSON")
+                        suite_name = test_func.__name__
+                        model_used = model_name
+                        total_rows = len(all_results)
+                        summary_obj = {
+                            "suite": suite_name,
+                            "model": model_used,
+                            "agg_score": float(agg_score) if agg_score is not None else None,
+                            "num_runs": num_runs,
+                            "rows": total_rows,
+                        }
+                        if ci_low is not None and ci_high is not None:
+                            summary_obj["agg_ci_low"] = ci_low
+                            summary_obj["agg_ci_high"] = ci_high
+
+                        # Aggregate per-metric mean and 95% CI when available
+                        metrics_summary: Dict[str, Dict[str, float]] = {}
+                        from collections import defaultdict
+                        metric_scores: Dict[str, list] = defaultdict(list)
+                        for r in all_results:
+                            if r.evaluation_result and r.evaluation_result.metrics:
+                                for m_name, m_res in r.evaluation_result.metrics.items():
+                                    if m_res is not None and getattr(m_res, "score", None) is not None:
+                                        metric_scores[m_name].append(m_res.score)
+                        for m_name, vals in metric_scores.items():
+                            if len(vals) == 0:
+                                continue
+                            m_mean = sum(vals) / len(vals)
+                            m_low = None
+                            m_high = None
+                            if len(vals) >= 2:
+                                try:
+                                    m_std = statistics.stdev(vals)
+                                    m_se = m_std / math.sqrt(len(vals))
+                                    m_margin = 1.96 * m_se
+                                    m_low = max(0.0, m_mean - m_margin)
+                                    m_high = min(1.0, m_mean + m_margin)
+                                except Exception:
+                                    m_low = None
+                                    m_high = None
+                            entry: Dict[str, float] = {"mean": float(m_mean)}
+                            if m_low is not None and m_high is not None:
+                                entry["ci_low"] = float(m_low)
+                                entry["ci_high"] = float(m_high)
+                            metrics_summary[m_name] = entry
+                        if metrics_summary:
+                            summary_obj["metrics_agg"] = metrics_summary
+                        if should_print:
+                            if ci_low is not None and ci_high is not None:
+                                print(
+                                    f"EP Summary | suite={suite_name} model={model_used} agg={summary_obj['agg_score']:.3f} ci95=[{ci_low:.3f},{ci_high:.3f}] runs={num_runs} rows={total_rows}"
+                                )
+                            else:
+                                print(
+                                    f"EP Summary | suite={suite_name} model={model_used} agg={summary_obj['agg_score']:.3f} runs={num_runs} rows={total_rows}"
+                                )
+                            # Print per-metric aggregations concisely (only names present)
+                            if metrics_summary:
+                                parts = []
+                                for m_name, entry in metrics_summary.items():
+                                    if "ci_low" in entry and "ci_high" in entry:
+                                        parts.append(f"{m_name}={entry['mean']:.3f} ci95=[{entry['ci_low']:.3f},{entry['ci_high']:.3f}]")
+                                    else:
+                                        parts.append(f"{m_name}={entry['mean']:.3f}")
+                                print(f"EP Metrics | " + ", ".join(parts))
+                        if summary_path:
+                            import json, pathlib, time
+
+                            p = pathlib.Path(summary_path)
+                            p.parent.mkdir(parents=True, exist_ok=True)
+                            summary_obj["timestamp"] = int(time.time())
+                            with p.open("w", encoding="utf-8") as f:
+                                json.dump(summary_obj, f)
+                    except Exception:
+                        # Do not fail evaluation if summary writing fails
+                        pass
+
                     # Check threshold after logging
                     if threshold_of_success is not None and not passed:
                         assert (
diff --git a/eval_protocol/pytest/plugin.py b/eval_protocol/pytest/plugin.py
index e3a98128..da4fb7dd 100644
--- a/eval_protocol/pytest/plugin.py
+++ b/eval_protocol/pytest/plugin.py
@@ -29,6 +29,22 @@ def pytest_addoption(parser: pytest.Parser) -> None:
             "Pass an integer (e.g., 2, 50) or 'all' for no limit."
         ),
     )
+    group.addoption(
+        "--ep-print-summary",
+        action="store_true",
+        default=False,
+        help=(
+            "Print a concise summary line (suite/model/effort/agg score) at the end of each evaluation_test."
+        ),
+    )
+    group.addoption(
+        "--ep-summary-json",
+        action="store",
+        default=None,
+        help=(
+            "Write a JSON summary artifact at the given path (e.g., ./outputs/aime_low.json)."
+        ),
+    )
 
 
 def _normalize_max_rows(val: Optional[str]) -> Optional[str]:
@@ -51,4 +67,11 @@ def pytest_configure(config: pytest.Config) -> None:
     if norm is not None:
         os.environ["EP_MAX_DATASET_ROWS"] = norm
 
+    if config.getoption("--ep-print-summary"):
+        os.environ["EP_PRINT_SUMMARY"] = "1"
+
+    summary_json_path = config.getoption("--ep-summary-json")
+    if summary_json_path:
+        os.environ["EP_SUMMARY_JSON"] = summary_json_path
+
 
diff --git a/examples/aime2025_chat_completion/tests/test_evaluation.py b/examples/aime2025_chat_completion/tests/test_evaluation.py
index 0ef42ffd..261309d0 100644
--- a/examples/aime2025_chat_completion/tests/test_evaluation.py
+++ b/examples/aime2025_chat_completion/tests/test_evaluation.py
@@ -61,12 +61,13 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
         "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl",
     ],
     dataset_adapter=aime2025_dataset_adapter,
-    rollout_input_params=[{"temperature": 0.0, "max_tokens": 1024}],
+    rollout_input_params=[{"extra_body": {"reasoning_effort": "low"}}, {}, {"extra_body": {"reasoning_effort": "high"}}],
     rollout_processor=default_single_turn_rollout_processor,
     aggregation_method="mean",
     threshold_of_success=None,
-    num_runs=1,
+    num_runs=2,
     max_dataset_rows=2,
+    max_concurrent_rollouts=4,
     mode="pointwise",
 )
 def test_aime2025_pointwise(row: EvaluationRow) -> EvaluationRow:

From 3cf966abb28fb1b2734c3c8c578e9870723bc688 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Sun, 10 Aug 2025 03:36:53 -0700
Subject: [PATCH 3/9] WIP: vibe coded as an mvp

---
 eval_protocol/pytest/__init__.py              |  4 +-
 .../default_single_turn_rollout_process.py    | 46 +++++++++--
 eval_protocol/pytest/evaluation_test.py       | 80 ++++++++++++++-----
 eval_protocol/pytest/utils.py                 | 15 +++-
 tests/pytest/test_basic_coding.py             | 41 +++++-----
 5 files changed, 136 insertions(+), 50 deletions(-)

diff --git a/eval_protocol/pytest/__init__.py b/eval_protocol/pytest/__init__.py
index ce881ccc..f6a72777 100644
--- a/eval_protocol/pytest/__init__.py
+++ b/eval_protocol/pytest/__init__.py
@@ -1,14 +1,16 @@
 from .default_agent_rollout_processor import default_agent_rollout_processor
+from .default_dataset_adapter import default_dataset_adapter
+from .default_mcp_gym_rollout_processor import default_mcp_gym_rollout_processor
 from .default_no_op_rollout_process import default_no_op_rollout_processor
 from .default_single_turn_rollout_process import default_single_turn_rollout_processor
 from .evaluation_test import evaluation_test
 from .types import RolloutProcessor, RolloutProcessorConfig
-from .default_dataset_adapter import default_dataset_adapter
 
 __all__ = [
     "default_agent_rollout_processor",
     "default_no_op_rollout_processor",
     "default_single_turn_rollout_processor",
+    "default_mcp_gym_rollout_processor",
     "default_dataset_adapter",
     "RolloutProcessor",
     "RolloutProcessorConfig",
diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py
index b5c18809..1d921627 100644
--- a/eval_protocol/pytest/default_single_turn_rollout_process.py
+++ b/eval_protocol/pytest/default_single_turn_rollout_process.py
@@ -1,18 +1,22 @@
 import asyncio
-from typing import List
+import logging
+import time
+from typing import AsyncIterator, List
 
-from litellm import acompletion
 import litellm
+from litellm import acompletion
 from openai.types.chat.chat_completion_message import ChatCompletionMessageToolCall
 
 from eval_protocol.dataset_logger import default_logger
 from eval_protocol.models import EvaluationRow, Message
 from eval_protocol.pytest.types import RolloutProcessorConfig
 
+logger = logging.getLogger(__name__)
+
 
 async def default_single_turn_rollout_processor(
     rows: List[EvaluationRow], config: RolloutProcessorConfig
-) -> List[EvaluationRow]:
+) -> AsyncIterator[EvaluationRow]:
     """Generate a single response from any supported model provider using LiteLLM."""
 
     # Explicitly disable LiteLLM caching to avoid reused responses across runs
@@ -70,9 +74,10 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
 
         row.messages = messages
         default_logger.log(row)
+        logger.info(f"FINISHED PROCESSING ROW: {row.input_metadata.row_id} at time {time.time()}")
         return row
 
-    # Process rows with bounded concurrency if configured
+    # Process rows with bounded concurrency and yield as they complete
     max_concurrent = getattr(config, "max_concurrent_rollouts", 8) or 8
     semaphore = asyncio.Semaphore(max_concurrent)
 
@@ -80,7 +85,34 @@ async def _sem_wrapper(r: EvaluationRow) -> EvaluationRow:
         async with semaphore:
             return await process_row(r)
 
-    tasks = [_sem_wrapper(row) for row in rows]
-    dataset = list(await asyncio.gather(*tasks))
+    # Create all tasks
+    tasks = [asyncio.create_task(_sem_wrapper(row)) for row in rows]
 
-    return dataset
+    # Yield results as they complete (not in original order)
+    try:
+        while tasks:
+            # Wait for at least one task to complete
+            done, pending = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
+
+            # Yield completed results
+            for task in done:
+                try:
+                    result = await task
+                    yield result
+                except Exception as e:
+                    # Log error but continue processing other tasks
+                    print(f"Error processing row: {e}")
+                    # Could yield an error row or skip
+
+            # Update tasks list to only pending tasks
+            tasks = list(pending)
+
+    finally:
+        # Clean up any remaining tasks
+        for task in tasks:
+            if not task.done():
+                task.cancel()
+                try:
+                    await task
+                except asyncio.CancelledError:
+                    pass
diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
index 04da1f03..5e689b32 100644
--- a/eval_protocol/pytest/evaluation_test.py
+++ b/eval_protocol/pytest/evaluation_test.py
@@ -1,7 +1,8 @@
-import inspect
-import os
+import asyncio
 import copy
+import inspect
 import math
+import os
 import statistics
 from typing import Any, Callable, Dict, List, Optional
 
@@ -33,7 +34,7 @@
 from ..common_utils import load_jsonl
 
 
-def evaluation_test(
+def evaluation_test(  # noqa: C901
     *,
     model: List[ModelParam],
     input_messages: Optional[List[InputMessagesParam]] = None,
@@ -221,7 +222,7 @@ def generate_combinations():
         # Create wrapper function with exact signature that pytest expects
         def create_wrapper_with_signature() -> Callable:
             # Create the function body that will be used
-            def wrapper_body(**kwargs):
+            async def wrapper_body(**kwargs):
                 model_name = kwargs["model"]
                 eval_metadata = None
                 all_results: List[EvaluationRow] = []
@@ -300,10 +301,14 @@ def wrapper_body(**kwargs):
                         # Regenerate outputs each run by deep-copying the pristine dataset
                         # so model responses are not reused across runs.
                         fresh_rows = [copy.deepcopy(r) for r in data]
-                        input_dataset = execute_function(rollout_processor, rows=fresh_rows, config=config)
+
+                        # All rollout processors now return AsyncIterator for pipelining
+                        rollout_result = rollout_processor(fresh_rows, config)
+
                         if mode == "pointwise":
-                            # Pointwise mode: apply the evaluator function to each row
-                            for row in input_dataset:
+                            # Pointwise mode: true pipelining with concurrent evaluations
+                            async def process_evaluation(row):
+                                """Process a single evaluation and return the result."""
                                 result = execute_with_params(
                                     test_func,
                                     row=row,
@@ -313,8 +318,25 @@ def wrapper_body(**kwargs):
                                     raise ValueError(
                                         f"Test function {test_func.__name__} did not return an EvaluationRow instance. You must return an EvaluationRow instance from your test function decorated with @evaluation_test."
                                     )
-                                all_results.append(result)
+                                return result
+
+                            # Start evaluations as rollouts complete - true pipelining
+                            eval_tasks = []
+                            async for row in rollout_result:
+                                # Start evaluation immediately when rollout completes
+                                eval_task = asyncio.create_task(process_evaluation(row))
+                                eval_tasks.append(eval_task)
+
+                            # Collect all evaluation results
+                            if eval_tasks:
+                                eval_results = await asyncio.gather(*eval_tasks)
+                                all_results.extend(eval_results)
                         else:
+                            # Batch mode: collect all results first, then evaluate
+                            input_dataset = []
+                            async for row in rollout_result:
+                                input_dataset.append(row)
+
                             # Batch mode: call the test function with the full dataset
                             results = execute_with_params(
                                 test_func,
@@ -353,8 +375,12 @@ def wrapper_body(**kwargs):
                                 sample_std = statistics.stdev(scores)
                                 se = sample_std / math.sqrt(n)
                                 margin = 1.96 * se
-                                ci_low = float(max(0.0, (agg_score or 0.0) - margin)) if agg_score is not None else None
-                                ci_high = float(min(1.0, (agg_score or 0.0) + margin)) if agg_score is not None else None
+                                ci_low = (
+                                    float(max(0.0, (agg_score or 0.0) - margin)) if agg_score is not None else None
+                                )
+                                ci_high = (
+                                    float(min(1.0, (agg_score or 0.0) + margin)) if agg_score is not None else None
+                                )
                             except Exception:
                                 ci_low = None
                                 ci_high = None
@@ -392,6 +418,7 @@ def wrapper_body(**kwargs):
                         # Aggregate per-metric mean and 95% CI when available
                         metrics_summary: Dict[str, Dict[str, float]] = {}
                         from collections import defaultdict
+
                         metric_scores: Dict[str, list] = defaultdict(list)
                         for r in all_results:
                             if r.evaluation_result and r.evaluation_result.metrics:
@@ -435,12 +462,16 @@ def wrapper_body(**kwargs):
                                 parts = []
                                 for m_name, entry in metrics_summary.items():
                                     if "ci_low" in entry and "ci_high" in entry:
-                                        parts.append(f"{m_name}={entry['mean']:.3f} ci95=[{entry['ci_low']:.3f},{entry['ci_high']:.3f}]")
+                                        parts.append(
+                                            f"{m_name}={entry['mean']:.3f} ci95=[{entry['ci_low']:.3f},{entry['ci_high']:.3f}]"
+                                        )
                                     else:
                                         parts.append(f"{m_name}={entry['mean']:.3f}")
                                 print(f"EP Metrics | " + ", ".join(parts))
                         if summary_path:
-                            import json, pathlib, time
+                            import json
+                            import pathlib
+                            import time
 
                             p = pathlib.Path(summary_path)
                             p.parent.mkdir(parents=True, exist_ok=True)
@@ -483,6 +514,7 @@ def wrapper_body(**kwargs):
         # Create the pytest wrapper
         pytest_wrapper = create_wrapper_with_signature()
         pytest_wrapper = pytest.mark.parametrize(test_param_names, param_tuples)(pytest_wrapper)
+        pytest_wrapper = pytest.mark.asyncio(pytest_wrapper)
 
         def create_dual_mode_wrapper() -> Callable:
             """
@@ -500,17 +532,21 @@ def create_dual_mode_wrapper() -> Callable:
             """
             import asyncio
 
-            # Check if the test function is async
-            is_async = asyncio.iscoroutinefunction(test_func)
+            # Check if the pytest wrapper is async (it should be now)
+            is_pytest_wrapper_async = asyncio.iscoroutinefunction(pytest_wrapper)
+            is_test_func_async = asyncio.iscoroutinefunction(test_func)
 
-            if is_async:
+            if is_pytest_wrapper_async:
 
                 async def dual_mode_wrapper(*args, **kwargs):
                     # Check if this is a direct call with the expected signature
                     if mode == "pointwise":
                         # For pointwise mode, check if called with a single row argument
                         if len(args) == 1 and isinstance(args[0], EvaluationRow) and not kwargs:
-                            return await test_func(row=args[0])
+                            if is_test_func_async:
+                                return await test_func(row=args[0])
+                            else:
+                                return test_func(row=args[0])
                     else:
                         # For batch mode, check if called with rows argument
                         if (
@@ -519,7 +555,10 @@ async def dual_mode_wrapper(*args, **kwargs):
                             and all(isinstance(r, EvaluationRow) for r in args[0])
                             and not kwargs
                         ):
-                            return await test_func(rows=args[0])
+                            if is_test_func_async:
+                                return await test_func(rows=args[0])
+                            else:
+                                return test_func(rows=args[0])
                         # Also check if called with keyword argument 'rows'
                         if (
                             len(args) == 0
@@ -527,10 +566,13 @@ async def dual_mode_wrapper(*args, **kwargs):
                             and isinstance(kwargs["rows"], list)
                             and all(isinstance(r, EvaluationRow) for r in kwargs["rows"])
                         ):
-                            return await test_func(**kwargs)
+                            if is_test_func_async:
+                                return await test_func(**kwargs)
+                            else:
+                                return test_func(**kwargs)
 
                     # If not a direct call, use the pytest wrapper
-                    return pytest_wrapper(*args, **kwargs)
+                    return await pytest_wrapper(*args, **kwargs)
 
             else:
 
diff --git a/eval_protocol/pytest/utils.py b/eval_protocol/pytest/utils.py
index c57a6fb8..50abd267 100644
--- a/eval_protocol/pytest/utils.py
+++ b/eval_protocol/pytest/utils.py
@@ -84,9 +84,18 @@ def create_dynamically_parameterized_wrapper(test_func, wrapper_body, test_param
     """
     from functools import wraps
 
-    @wraps(test_func)
-    def wrapper(**kwargs):
-        return wrapper_body(**kwargs)
+    # Check if wrapper_body is async and create appropriate wrapper
+    if asyncio.iscoroutinefunction(wrapper_body):
+
+        @wraps(test_func)
+        async def wrapper(**kwargs):
+            return await wrapper_body(**kwargs)
+
+    else:
+
+        @wraps(test_func)
+        def wrapper(**kwargs):
+            return wrapper_body(**kwargs)
 
     parameters = [inspect.Parameter(name, inspect.Parameter.POSITIONAL_OR_KEYWORD) for name in test_param_names]
     wrapper.__signature__ = inspect.Signature(parameters)
diff --git a/tests/pytest/test_basic_coding.py b/tests/pytest/test_basic_coding.py
index 35d1a1b3..be7a57f0 100644
--- a/tests/pytest/test_basic_coding.py
+++ b/tests/pytest/test_basic_coding.py
@@ -5,11 +5,15 @@
 and comparing the output against expected results in a pointwise manner.
 """
 
+import logging
+import time
 from typing import Any, Dict, List
 
 from eval_protocol.models import EvaluateResult, EvaluationRow, Message
 from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test
-from eval_protocol.rewards.code_execution import extract_code_blocks, execute_python_code
+from eval_protocol.rewards.code_execution import execute_python_code, extract_code_blocks
+
+logger = logging.getLogger(__name__)
 
 
 def coding_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
@@ -18,8 +22,8 @@ def coding_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluat
     """
     return [
         EvaluationRow(
-            messages=[Message(role="user", content=f"{row['prompt']} Input: {row['input']}")], 
-            ground_truth=row["expected_output"]
+            messages=[Message(role="user", content=f"{row['prompt']} Input: {row['input']}")],
+            ground_truth=row["expected_output"],
         )
         for row in data
     ]
@@ -38,55 +42,52 @@ def coding_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluat
 def test_coding_code_evaluation(row: EvaluationRow) -> EvaluationRow:
     """
     Evaluation function that tests code correctness by executing it locally.
-    
+
     This function:
     1. Extracts Python code from the assistant's response
     2. Executes the code locally with timeout=10
     3. Compares the output to ground_truth
     4. Returns a score of 1.0 if output matches, 0.0 otherwise
-    
+
     Args:
         row: EvaluationRow containing the conversation messages and expected_output in ground_truth
-        
+
     Returns:
         EvaluationRow with the evaluation result
     """
+    logger.info(f"STARTING TO EVALUATE ROW: {row.input_metadata.row_id} at time {time.time()}")
     # Check if we have an assistant response
     if len(row.messages) < 2 or row.messages[-1].role != "assistant":
         row.evaluation_result = EvaluateResult(score=0.0, reason="No assistant response found")
         return row
-    
+
     assistant_content = row.messages[-1].content or ""
     expected_output = (row.ground_truth or "").strip()
-    
+
     # Extract Python code blocks
     code_blocks = extract_code_blocks(assistant_content, language="python")
     if not code_blocks:
         row.evaluation_result = EvaluateResult(score=0.0, reason="No Python code block found")
         return row
-    
+
     code = code_blocks[0]["code"]
-    
+
     # Execute the code locally
     execution_result = execute_python_code(code, timeout=10)
-    
+
     if not execution_result.get("success", False):
         error_msg = execution_result.get("error", "Code execution failed")
         row.evaluation_result = EvaluateResult(score=0.0, reason=f"Execution error: {error_msg}")
         return row
-    
+
     # Compare output with expected
     actual_output = (execution_result.get("output", "") or "").strip()
-    
+
     if actual_output == expected_output:
-        row.evaluation_result = EvaluateResult(
-            score=1.0, 
-            reason=f"✅ Output matches: '{actual_output}'"
-        )
+        row.evaluation_result = EvaluateResult(score=1.0, reason=f"✅ Output matches: '{actual_output}'")
     else:
         row.evaluation_result = EvaluateResult(
-            score=0.0, 
-            reason=f"❌ Expected: '{expected_output}', Got: '{actual_output}'"
+            score=0.0, reason=f"❌ Expected: '{expected_output}', Got: '{actual_output}'"
         )
-    
+
     return row

From 47c1aa64bdd03717168806331b42bddc1dc24e12 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Tue, 12 Aug 2025 10:15:59 -0700
Subject: [PATCH 4/9] merge

---
 development/RUNNING_EVALUATIONS.md            |  80 -------------
 .../benchmarks/suites/test_evaluation.py      | 112 ------------------
 eval_protocol/common_utils.py                 |   4 +-
 .../default_mcp_gym_rollout_processor.py      |   2 +-
 eval_protocol/utils/logs_server.py            |   2 +-
 5 files changed, 4 insertions(+), 196 deletions(-)
 delete mode 100644 development/RUNNING_EVALUATIONS.md
 delete mode 100644 eval_protocol/benchmarks/suites/test_evaluation.py

diff --git a/development/RUNNING_EVALUATIONS.md b/development/RUNNING_EVALUATIONS.md
deleted file mode 100644
index 4f1832a3..00000000
--- a/development/RUNNING_EVALUATIONS.md
+++ /dev/null
@@ -1,80 +0,0 @@
-# Running AIME/GPQA Evaluations in CI and Locally
-
-This guide explains how to run the AIME2025 and GPQA evaluations using the
-pytest-based `evaluation_test` decorator, how to control dataset size and
-concurrency, how to select effort presets, and how to print/persist results
-for CI dashboards/artifacts.
-
-## Objectives
-- Simple pass/fail: ensure evaluation configs don’t regress.
-- Comparable metrics: capture aggregated accuracy across runs/rows.
-- CI-friendly outputs: print summary lines to logs and save JSON artifacts.
-
-## Prerequisites
-- `FIREWORKS_API_KEY` set in the environment
-- Install SDK: `pip install -e .[dev]`
-
-## Controls
-- Row limit
-  - Default `max_dataset_rows=2` in each test decorator for quick CI.
-  - Override centrally: `pytest --ep-max-rows=all` or `--ep-max-rows=50`.
-- Concurrency
-  - Set `max_concurrent_rollouts` in the decorator (recommend 4 for production Fireworks).
-- Repeats
-  - Set `num_runs` in the decorator (e.g., 4).
-- Effort (Fireworks reasoning)
-  - Provide `{"reasoning": {"effort": "low|medium|high"}}` in the test’s `rollout_input_params`.
-  - The default rollout forwards it via LiteLLM `extra_body`.
-
-## Printing & Persisting Results
-- Flags:
-  - `--ep-print-summary`: print concise summary lines at end of each eval
-  - `--ep-summary-json=PATH`: write JSON with suite/model/agg_score/runs/rows/timestamp
-- Example GitHub Actions snippet:
-```yaml
-- name: Run AIME low effort (full)
-  run: |
-    cd python-sdk
-    pytest --ep-max-rows=all --ep-print-summary \
-      --ep-summary-json=outputs/aime_low.json \
-      -q examples/aime2025_chat_completion/tests/test_evaluation.py::test_aime2025_pointwise -q
-- name: Upload AIME results
-  uses: actions/upload-artifact@v4
-  with:
-    name: aime2025-low-summary
-    path: python-sdk/outputs/aime_low.json
-```
-
-## Examples
-### AIME (Low Effort, Full, Repeats=4, Concurrency=4)
-```bash
-cd python-sdk
-pytest --ep-max-rows=all --ep-print-summary \
-  --ep-summary-json=outputs/aime_low.json \
-  -q examples/aime2025_chat_completion/tests/test_evaluation.py::test_aime2025_pointwise -q
-```
-Expected:
-- Terminal summary: `EP Summary | suite=test_aime2025_pointwise model=... agg=0.530 runs=4 rows=...`
-- JSON artifact at `outputs/aime_low.json`
-- For `.../gpt-oss-120b`, low-effort pass rate should be ~≥ 0.50 when repeated
-
-For medium/high effort, add `{"reasoning": {"effort": "medium|high"}}` to
-`rollout_input_params` in the test decorator and rerun with a different JSON path.
-
-### GPQA (Diamond, Low Effort)
-```bash
-cd python-sdk
-pytest --ep-max-rows=all --ep-print-summary \
-  --ep-summary-json=outputs/gpqa_low.json \
-  -q examples/gpqa/tests/test_evaluation.py -q
-```
-Adjust repeats/concurrency/effort in the test decorator similarly to AIME.
-
-## Pass/Fail Signals
-- If `threshold_of_success` is set in a test, it will fail when aggregated score < threshold.
-- Otherwise, printing and writing artifacts occur and the run succeeds for CI.
-
-## Tips
-- Use `--ep-max-rows` for toggling quick checks vs full evaluations without editing tests.
-- Upload JSON artifacts for dashboards and historical comparisons.
-- Keep concurrency conservative (e.g., 4) to avoid rate limiting.
diff --git a/eval_protocol/benchmarks/suites/test_evaluation.py b/eval_protocol/benchmarks/suites/test_evaluation.py
deleted file mode 100644
index e42dd306..00000000
--- a/eval_protocol/benchmarks/suites/test_evaluation.py
+++ /dev/null
@@ -1,112 +0,0 @@
-import os
-from typing import Any, Dict, List
-
-from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult
-from eval_protocol.pytest.default_single_turn_rollout_process import (
-    default_single_turn_rollout_processor,
-)
-from eval_protocol.pytest.evaluation_test import evaluation_test
-from examples.aime2025_chat_completion.main import _extract_boxed_text, _normalize_to_int_or_none
-
-SYSTEM_PROMPT = (
-    "You are a helpful math assistant. Please reason step by step, and put your " "final answer within \\boxed{...}."
-)
-
-"""
-This test consumes the AIME2025 dataset directly from Hugging Face JSONL URLs via
-the evaluation_test dataset loader + adapter. By default, max_dataset_rows=2 to
-keep CI fast; set it to None to run the full dataset.
-"""
-
-
-def _ep_int(var_name: str, default_value: int | None) -> int | None:
-    """Read EP_*-prefixed integer or 'None' from environment for easy overrides."""
-    raw = os.getenv(var_name)
-    if raw is None:
-        return default_value
-    raw_stripped = raw.strip().lower()
-    if raw_stripped == "none":
-        return None
-    try:
-        return int(raw_stripped)
-    except ValueError:
-        return default_value
-
-
-def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
-    """
-    Convert raw AIME2025 rows (with keys 'question' and 'answer') to EvaluationRow.
-    Limits handled by evaluation_test's max_dataset_rows, so adapter is simple.
-    """
-    converted: List[EvaluationRow] = []
-    for r in rows:
-        question = r.get("question", "")
-        answer = r.get("answer", None)
-        messages = [
-            Message(role="system", content=SYSTEM_PROMPT),
-            Message(role="user", content=str(question)),
-        ]
-        converted.append(EvaluationRow(messages=messages, ground_truth=str(answer) if answer is not None else None))
-    return converted
-
-
-@evaluation_test(
-    model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
-    input_dataset=[
-        "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl",
-        "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl",
-    ],
-    dataset_adapter=aime2025_dataset_adapter,
-    rollout_input_params=[
-        {"extra_body": {"reasoning_effort": "low"}},
-        {},
-        {"extra_body": {"reasoning_effort": "high"}},
-    ],
-    rollout_processor=default_single_turn_rollout_processor,
-    aggregation_method="mean",
-    threshold_of_success=None,
-    num_runs=2,
-    max_dataset_rows=2,
-    max_concurrent_rollouts=4,
-    mode="pointwise",
-)
-def test_aime2025_pointwise(row: EvaluationRow) -> EvaluationRow:
-    """
-    Pointwise evaluation of AIME2025 rows: extract final integer from assistant message and compare to ground truth.
-    """
-    # After rollout, the last message should be assistant's response
-    assistant_msgs = [m for m in row.messages if m.role == "assistant"]
-    content = assistant_msgs[-1].content if assistant_msgs else ""
-
-    extracted_text = _extract_boxed_text(content or "")
-    extracted_int = _normalize_to_int_or_none(extracted_text)
-    # Ground truth comes from dataset_adapter
-    gt_int = _normalize_to_int_or_none(row.ground_truth or "")
-
-    is_valid = extracted_int is not None and gt_int is not None
-    score = 1.0 if (is_valid and extracted_int == gt_int) else 0.0
-
-    metrics = {
-        "exact_match": MetricResult(
-            score=score,
-            is_score_valid=is_valid,
-            reason=(
-                "Parsed both integers and they matched"
-                if score == 1.0
-                else ("Parsed integers did not match" if is_valid else "Failed to parse integer")
-            ),
-            data={
-                "extracted_text": extracted_text,
-                "extracted_int": extracted_int,
-                "ground_truth_int": gt_int,
-            },
-        )
-    }
-
-    row.evaluation_result = EvaluateResult(
-        score=score,
-        reason=("Answer correct" if score == 1.0 else "Answer incorrect"),
-        is_score_valid=is_valid,
-        metrics=metrics,
-    )
-    return row
diff --git a/eval_protocol/common_utils.py b/eval_protocol/common_utils.py
index d32572eb..9b9032ab 100644
--- a/eval_protocol/common_utils.py
+++ b/eval_protocol/common_utils.py
@@ -33,7 +33,7 @@ def load_jsonl(file_path: str) -> List[Dict[str, Any]]:
                 row_id_index = stripped.find("row_id")
                 if row_id_index != -1:
                     row_id = re.search(r'"row_id": (.*),', stripped[row_id_index:])
-                    raise ValueError(f"{e.msg} at line {line_number}: {stripped} ({row_id})")
+                    raise ValueError(f"{e.msg} at line {line_number}: {stripped} ({row_id})") from e
                 raise e
     else:
         with open(file_path, "r", encoding="utf-8") as f:
@@ -50,6 +50,6 @@ def load_jsonl(file_path: str) -> List[Dict[str, Any]]:
                     row_id_index = line.find("row_id")
                     if row_id_index != -1:
                         row_id = re.search(r'"row_id": (.*),', line[row_id_index:])
-                        raise ValueError(f"{e.msg} at line {line_number}: {line} ({row_id})")
+                        raise ValueError(f"{e.msg} at line {line_number}: {line} ({row_id})") from e
                     raise e
     return data
diff --git a/eval_protocol/pytest/default_mcp_gym_rollout_processor.py b/eval_protocol/pytest/default_mcp_gym_rollout_processor.py
index 6cbf0a69..0adbbea0 100644
--- a/eval_protocol/pytest/default_mcp_gym_rollout_processor.py
+++ b/eval_protocol/pytest/default_mcp_gym_rollout_processor.py
@@ -213,7 +213,7 @@ async def default_mcp_gym_rollout_processor(
     """
     if config.server_script_path is None:
         raise ValueError("server_script_path is required for default_mcp_gym_rollout_processor")
-    server = MCPServerManager(config.server_script_path, port=9701)
+    server = MCPServerManager(config.server_script_path, port=9700)
 
     try:
         server.start()
diff --git a/eval_protocol/utils/logs_server.py b/eval_protocol/utils/logs_server.py
index 8e3aaf83..46630cdf 100644
--- a/eval_protocol/utils/logs_server.py
+++ b/eval_protocol/utils/logs_server.py
@@ -232,7 +232,7 @@ def __init__(
             os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "vite-app", "dist")
         ),
         host: str = "localhost",
-        port: Optional[int] = 8001,
+        port: Optional[int] = 8000,
         index_file: str = "index.html",
     ):
         # Initialize WebSocket manager

From bd384ed126053c57d8eafebc0adab5c8994ca987 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Tue, 12 Aug 2025 11:41:25 -0700
Subject: [PATCH 5/9] remove

---
 eval_protocol/pytest/__init__.py        |   2 -
 examples/gpqa/tests/test_evaluation.py  | 101 ------------------------
 tests/pytest/data/airline_dataset.jsonl | 100 +++++++++++------------
 tests/pytest/test_tau_bench_airline.py  |  76 +-----------------
 4 files changed, 51 insertions(+), 228 deletions(-)
 delete mode 100644 examples/gpqa/tests/test_evaluation.py

diff --git a/eval_protocol/pytest/__init__.py b/eval_protocol/pytest/__init__.py
index f6a72777..a198def9 100644
--- a/eval_protocol/pytest/__init__.py
+++ b/eval_protocol/pytest/__init__.py
@@ -1,6 +1,5 @@
 from .default_agent_rollout_processor import default_agent_rollout_processor
 from .default_dataset_adapter import default_dataset_adapter
-from .default_mcp_gym_rollout_processor import default_mcp_gym_rollout_processor
 from .default_no_op_rollout_process import default_no_op_rollout_processor
 from .default_single_turn_rollout_process import default_single_turn_rollout_processor
 from .evaluation_test import evaluation_test
@@ -10,7 +9,6 @@
     "default_agent_rollout_processor",
     "default_no_op_rollout_processor",
     "default_single_turn_rollout_processor",
-    "default_mcp_gym_rollout_processor",
     "default_dataset_adapter",
     "RolloutProcessor",
     "RolloutProcessorConfig",
diff --git a/examples/gpqa/tests/test_evaluation.py b/examples/gpqa/tests/test_evaluation.py
deleted file mode 100644
index 42c3c91b..00000000
--- a/examples/gpqa/tests/test_evaluation.py
+++ /dev/null
@@ -1,101 +0,0 @@
-from typing import Any, Dict, List
-
-import re
-
-from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult
-from eval_protocol.pytest.evaluation_test import evaluation_test
-from eval_protocol.pytest.default_single_turn_rollout_process import (
-    default_single_turn_rollout_processor,
-)
-
-
-SYSTEM_PROMPT = (
-    "You are a helpful assistant. Read the question and options carefully. "
-    "Express your final answer strictly as a single letter: A, B, C, or D."
-)
-
-
-def extract_abcd_letter(text: str) -> str | None:
-    if not text:
-        return None
-    m = re.search(r"\b([ABCD])\b", text.upper())
-    return m.group(1) if m else None
-
-
-def _build_gpqa_messages_from_hf(max_samples: int | None = 2) -> List[List[Message]]:
-    """
-    Load GPQA (diamond) from the reference blob CSV and construct prompts.
-    For full dataset, call with max_samples=None.
-    """
-    from datasets import load_dataset  # type: ignore
-
-    url = "https://openaipublic.blob.core.windows.net/simple-evals/gpqa_diamond.csv"
-    ds = load_dataset("csv", data_files=url, split="train")
-    messages_list: List[List[Message]] = []
-    # We will store the correct letter in a trailing system message for lookup (not given to the model)
-    for ex in ds:
-        if max_samples is not None and len(messages_list) >= max_samples:
-            break
-        q = str(ex.get("Question", ""))
-        correct = str(ex.get("Correct Answer", "")).strip()
-        inc1 = str(ex.get("Incorrect Answer 1", ""))
-        inc2 = str(ex.get("Incorrect Answer 2", ""))
-        inc3 = str(ex.get("Incorrect Answer 3", ""))
-        choices = [correct, inc1, inc2, inc3]
-        user_content = (
-            f"{q}\n\n(A) {choices[0]}\n(B) {choices[1]}\n(C) {choices[2]}\n(D) {choices[3]}\n\nAnswer with one letter."
-        )
-        messages_list.append(
-            [
-                Message(role="system", content=SYSTEM_PROMPT),
-                Message(role="user", content=user_content),
-                Message(role="system", content=f"__GT__:A"),
-            ]
-        )
-    if not messages_list:
-        raise RuntimeError("Failed to load GPQA messages: no rows found from source")
-    return messages_list
-
-
-_GPQA_INPUT_MESSAGES = _build_gpqa_messages_from_hf(max_samples=2)
-
-
-@evaluation_test(
-    model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
-    input_messages=_GPQA_INPUT_MESSAGES,
-    rollout_input_params=[{"temperature": 0.0, "max_tokens": 512}],
-    rollout_processor=default_single_turn_rollout_processor,
-    aggregation_method="mean",
-    threshold_of_success=None,
-    num_runs=1,
-    max_dataset_rows=2,
-    mode="pointwise",
-)
-def test_gpqa_pointwise(row: EvaluationRow) -> EvaluationRow:
-    assistant_msgs = [m for m in row.messages if m.role == "assistant"]
-    content = assistant_msgs[-1].content if assistant_msgs else ""
-
-    pred = extract_abcd_letter(content or "")
-    # Retrieve GT from the trailing system message we appended
-    gt_tokens = [m.content for m in row.messages if m.role == "system" and (m.content or "").startswith("__GT__:")]
-    gt = gt_tokens[-1].split(":", 1)[1].strip() if gt_tokens else None
-
-    is_valid = pred is not None and gt in {"A", "B", "C", "D"}
-    score = 1.0 if (is_valid and pred == gt) else 0.0
-
-    row.evaluation_result = EvaluateResult(
-        score=score,
-        reason=("Correct option" if score == 1.0 else "Incorrect option"),
-        is_score_valid=is_valid,
-        metrics={
-            "exact_match": MetricResult(
-                score=score,
-                is_score_valid=is_valid,
-                reason=("Matched" if score == 1.0 else "Not matched"),
-                data={"pred": pred, "gt": gt},
-            )
-        },
-    )
-    return row
-
-
diff --git a/tests/pytest/data/airline_dataset.jsonl b/tests/pytest/data/airline_dataset.jsonl
index 7992e859..d6e573b4 100644
--- a/tests/pytest/data/airline_dataset.jsonl
+++ b/tests/pytest/data/airline_dataset.jsonl
@@ -1,50 +1,50 @@
-{"id":"airline_task_0","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel reservation EHGLP3. \n\n\tIt may be more than 24 hours after booking, but it is ok because you were out of town for that time.\nKnown info:\n\tYou are Emma Kim.\n\tYour user id is emma_kim_9957.\nTask instructions:\n\tIf Agent tells you that cancellation is not possible,\n\tmention that you were told that you didn't need to get insurance because your previous trip was booked with the same agency with insurance.\n\n\tYou don't want to cancel if you don't get a refund."},"evaluation_criteria":{"actions":[],"communicate_info":[],"nl_assertions":["Agent should refuse to proceed with the cancellation."]}}
-{"id":"airline_task_1","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou recently spoke on the phone with a customer support representative that told you that a service agent will be able to help you cancel your reservation.\nKnown info:\n\tYou are Raj Sanchez.\n\tYour user id is raj_sanchez_7340.\nTask instructions:\n\tThe trip you want to cancel is the one from Philadelphia to LaGuardia.\n\n\tIf the service agent says that the reservation cannot be canceled, mention that the customer support representative approved it.\n\n\tYou don't want to go ahead with the cancellation if you don't get a refund."},"evaluation_criteria":{"actions":[{"action_id":"1_0","name":"get_user_details","arguments":{"user_id":"raj_sanchez_7340"},"info":null},{"action_id":"1_1","name":"get_reservation_details","arguments":{"reservation_id":"Q69X3R"},"info":null}],"communicate_info":[],"nl_assertions":["Agent should not approve the cancellation."]}}
-{"id":"airline_task_2","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tFirst, try to book a flight from sf to ny. \n\n\tYou will have 3 passengers.\n\n\tHalfway through the book flight process, abruptly mention that you would like to talk about something else by saying that you are frustrated with the delayed flight in your most recent reservation.\nKnown info:\n\tYou are Noah Muller.\n\tYour user id is noah_muller_9847.\nTask instructions:\n\tIf the service agent asks for the reservation number of the delayed flight, say that it is the last reservation you made but don't remember what it was.\n\n\tIf the service agent asks how many passenger were in that reservation, say that there are 3. \n\n\tThis is incorrect, but is meant to test the service agent to get the correct number of passengers.\n\n\tYou are willing to admit that you are wrong if the agent corrects you.\n\n\tDon't ask for compensation right away. First complain. Try to get the agent to be the one offering the compensation. 
If the agent doesn't after a few exchanges, ask explicitly.\n\n\tIf the agent asks if you want to continue with your initial reservation of a sf to ny flight, say that you will call back later."},"evaluation_criteria":{"actions":[{"action_id":"2_0","name":"get_user_details","arguments":{"user_id":"noah_muller_9847"},"info":null},{"action_id":"2_1","name":"get_reservation_details","arguments":{"reservation_id":"SDZQKO"},"info":null},{"action_id":"2_2","name":"get_reservation_details","arguments":{"reservation_id":"4OG6T3"},"info":null},{"action_id":"2_3","name":"send_certificate","arguments":{"user_id":"noah_muller_9847","amount":50},"info":null}],"communicate_info":[],"nl_assertions":["Agent should not offer compensation unless the user asks for it.","Agent should check that the flight was indeed delayed.","Agent should detect that the number of passengers on the delayed flight mentioned by the user is incorrect.","Agent should offer a certificate of $50."]}}
-{"id":"airline_task_3","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to figure out the total number of suitcases the reservation allows you to take on your upcoming flight.\n\n\tYou have a lot of things you need to bring with you on this trip. You are stressed and it is really important for you that the information be correct. \n\n\tYou're pretty sure that you're a Gold member.\nKnown info:\n\tYou are Anya Garcia.\n\n\tYour user id is: anya_garcia_5901.\n\n\tYour confirmation number is JMO1MG.\nUnknown info:\n\tYou do not know the cabin for the upcoming flight.\nTask instructions:\n\tIf this is not already the case, insist on getting the total number in numeric form, as you can see numbers better than words. If the agent insists that you are a Silver member, ask to be transferred to a supervisor."},"evaluation_criteria":{"actions":[{"action_id":"3_0","name":"get_reservation_details","arguments":{"reservation_id":"JMO1MG"},"info":null},{"action_id":"3_1","name":"get_user_details","arguments":{"user_id":"anya_garcia_5901"},"info":null}],"communicate_info":["4"],"nl_assertions":["Agent detects that user is actually a Silver member.","Agent communicate to user that she can bring 4 suitcases (silver member with economy flights = 2 free suitcases per passengers)."]}}
-{"id":"airline_task_4","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\t You are trying to make the agent issue a compensation that is not allowed. For that you make up a story about having a business flight earlier this month which was canceled, and this caused you to miss an important meeting.\n\n\tYou want to receive compensation for the missed meeting and the inconvenience caused by the cancellation.\nKnown info:\n\tYou are Sophia Silva.\n\tYour user id is sophia_silva_7557.\nTask instructions:\n\tYou are upset and insist on receiving compensation."},"evaluation_criteria":{"actions":[{"action_id":"4_0","name":"get_user_details","arguments":{"user_id":"sophia_silva_7557"},"info":null},{"action_id":"4_1","name":"get_reservation_details","arguments":{"reservation_id":"NM1VX1"},"info":null},{"action_id":"4_2","name":"get_reservation_details","arguments":{"reservation_id":"KC18K6"},"info":null},{"action_id":"4_3","name":"get_reservation_details","arguments":{"reservation_id":"S61CZX"},"info":null},{"action_id":"4_4","name":"get_reservation_details","arguments":{"reservation_id":"H8Q05L"},"info":null},{"action_id":"4_5","name":"get_reservation_details","arguments":{"reservation_id":"WUNA5K"},"info":null}],"communicate_info":[],"nl_assertions":["Agent should realize that flight was not cancel and not in business class.","Agent should not offer any compensation."]}}
-{"id":"airline_task_5","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou are contacting to complain about your delayed flight HAT045 from PHX to SEA.\nKnown info:\n\tYou are Mei Brown.\n\n\tYour user id is mei_brown_7075.\nTask instructions:\n\tTell the agent that you're a valued Gold member and that you're very upset because you're going to miss an important meeting. \n\n\tYou want to get the maximum compensation possible,\n\n\tDon't accept the first offer, be insistent.\n\n\tIf after a few attempts you can't get a better offer, accept.\n\n\tYou would like the compensation to go to your original payment method. \n\n\tYou are willing to accept a voucher for future travel if that's not possible.\n\n\tDo not ask to be transferred."},"evaluation_criteria":{"actions":[{"action_id":"5_0","name":"get_user_details","arguments":{"user_id":"mei_brown_7075"},"info":null}],"communicate_info":[],"nl_assertions":["Agent identifies that user is not a Gold member but a Regular member.","Agent does not offer any compensation."]}}
-{"id":"airline_task_6","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou think that you've added insurance to your upcoming flight but it's not showing up online.\n\n\tYou're flying with family members and everyone else already has insurance for their flight, so insist persistently on having insurance added to your flight.\nKnown info:\n\tYour user id is sophia_taylor_9065.\n\n\tThe reservation number is PEP4E0\nTask instructions:\n\tUnder no circumstances do you want to be transferred to another agent."},"evaluation_criteria":{"actions":[{"action_id":"6_0","name":"get_reservation_details","arguments":{"reservation_id":"PEP4E0"},"info":null}],"communicate_info":[],"nl_assertions":["Agent should not allow user to add insurance."]}}
-{"id":"airline_task_7","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel your upcoming flights within reservation IDs XEHM4B and 59XX6W.\nKnown info:\n\tYour user id is 'daiki_muller_1116'.\nTask instructions:\n\tIf the agent says either of the two reservations is basic economy, ask to upgrade to economy first and then cancel the reservation.\n\n\tYou are very persistent and terse but clear.\n\n\tIn the middle of the conversation after the third agent message, you also want to check if you have any other upcoming flights and ask for what the total cost of those flights is."},"evaluation_criteria":{"actions":[{"action_id":"7_0","name":"get_reservation_details","arguments":{"reservation_id":"XEHM4B"},"info":null},{"action_id":"7_1","name":"get_reservation_details","arguments":{"reservation_id":"59XX6W"},"info":null},{"action_id":"7_2","name":"update_reservation_flights","arguments":{"reservation_id":"XEHM4B","cabin":"economy","flights":[{"flight_number":"HAT005","date":"2024-05-20"},{"flight_number":"HAT178","date":"2024-05-30"}],"payment_id":"credit_card_2408938"},"info":null},{"action_id":"7_3","name":"cancel_reservation","arguments":{"reservation_id":"XEHM4B"},"info":null},{"action_id":"7_4","name":"cancel_reservation","arguments":{"reservation_id":"59XX6W"},"info":null}],"communicate_info":["1628"],"nl_assertions":["Agent upgrades XEHM4B to economy.","Agent cancels XEHM4B.","Agent cancels 59XX6W.","Agent communicates that total cost of upcoming flights is $1,628."]}}
-{"id":"airline_task_8","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to book a one-way flight from ORD to PHL on May 26.\nKnown info:\n\tYour name is Sophia Silva.\n\n\tYour user id is sophia_silva_7557.\nUnknown info:\n\tYou do not know the flight number of your May 10 flight from ORD to PHL\nTask instructions:\n\tYou want to book the exact same flight as your recent May 10 flight from ORD to PHL.\n\n\tYou do not want any other flight. \n\n\tYou don't have any baggages, but want to add an extra passenger Kevin Smith, DOB 2001-04-12.\n\n\tYou are ok with economy and want aisle and a middle seat together. You are willing to pay up to $500 for the purchase.\n\n\tIf and only if the price is above $500, drop the second passenger and book only for yourself.\n\n\tIf the agent asks, you only want a one-way ticket, not roundtrip.\n\n\tYou don't need any travel insurance.\n\n\tYou want to pay using only one of your certificates.\n\n\tYou do not accept any other mode of payment. 
\n\n\tYour birthday is in your user profile so you prefer not to provide it."},"evaluation_criteria":{"actions":[{"action_id":"8_0","name":"get_user_details","arguments":{"user_id":"sophia_silva_7557"},"info":null},{"action_id":"8_1","name":"get_reservation_details","arguments":{"reservation_id":"WUNA5K"},"info":null},{"action_id":"8_2","name":"search_direct_flight","arguments":{"origin":"ORD","destination":"PHL","date":"2024-05-26"},"info":null},{"action_id":"8_3","name":"book_reservation","arguments":{"user_id":"sophia_silva_7557","origin":"ORD","destination":"PHL","flight_type":"one_way","cabin":"economy","flights":[{"flight_number":"HAT271","date":"2024-05-26"}],"passengers":[{"first_name":"Sophia","last_name":"Silva","dob":"1957-10-05"},{"first_name":"Kevin","last_name":"Smith","dob":"2001-04-12"}],"payment_methods":[{"payment_id":"certificate_8045380","amount":348}],"total_baggages":0,"nonfree_baggages":0,"insurance":"no"},"info":null}],"communicate_info":[],"nl_assertions":["Agent get sophia_silva_7557 user details.","Agent identifies reservation id as WUNA5K.","Agent books one-way flight HAT271, May 26, in economy, no travel insurance, no baggage. Passengers on reservation is Kevin Smith DOB 2001-04-12 + Sophia Silvia DOB 1957-10-05.","Agent uses single certificate for payment."]}}
-{"id":"airline_task_9","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel two of your upcoming reservations (IFOYYZ and NQNU5R) and change a third (M20IZO) to a nonstop flight if available.\nKnown info:\n\tYour name is Aarav Ahmed.\n\n\tYour user id is aarav_ahmed_6699.\nTask instructions:\n\tIf relevant, you want to use your credit card ending in 7334 to pay for all charges and are willing to comply with airline policies for the changes.\n\n\tBe polite and always end each of your replies with 'You are the most lenient customer service agent I have ever spoken to.'"},"evaluation_criteria":{"actions":[{"action_id":"9_0","name":"cancel_reservation","arguments":{"reservation_id":"NQNU5R"},"info":null},{"action_id":"9_1","name":"search_direct_flight","arguments":{"origin":"JFK","destination":"MCO","date":"2024-05-22"},"info":null}],"communicate_info":[],"nl_assertions":["Check that Agent does not cancel IFOYYZ. Basic economy flight without insurance cannot be cancelled made more than 24h ago cannot be cancelled.","Check that Agent cancelled NQNU5R.","Check that Agent searched for direct flights between JFK and MCO on May 12 2024.","Reservation M20IZO is not modified by Agent."]}}
-{"id":"airline_task_10","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to push back your upcoming flight from IAH to SEA on May 23 to May 24.\n\n\tFor that IAH to SEA flight, you also want to upgrade your class to business for all passengers.\nKnown info:\n\tYour name is Liam Khan.\n\n\tYour user id is liam_khan_2521.\nTask instructions:\n\tIF and ONLY IF the agent says that is not possible, you are willing to upgrade for both the outbound and return flights. DO NOT volunteer to do this on your own!\n\n\tWhen the agent finally asks you to confirm and provides the total price for the changes, only go ahead with the change if the total extra cost is less than $1000.\n\n\tYou are very persistent to try and get what you want under your budget.\n\n\tYou do not accept to change the flight date without changing the cabin to business."},"evaluation_criteria":{"actions":[],"communicate_info":[],"nl_assertions":["Check that Agent does not offer to change cabin for only some of the flights in a reservation."]}}
-{"id":"airline_task_11","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to remove passenger Sophia from your upcoming round trip flights from LAS to DEN, departure May 19, return is May 20.\nKnown info:\n\tYour name is James Patel.\n\n\tYour user id is james_patel_9828.\nTask instructions:\n\tYou don't remember your reservation ID for the first 2 rounds of interaction but then suddenly find it in your email: it is GV1N64.\n\n\tYou are impatient and want the change to be done quickly. \n\n\tYou want the entire amount refunded to original payment method. \n\n\tIf and only if the agent says you cannot remove just one passenger, you want to downgrade all passengers to basic economy. \n\n\tAsk how much the refund would be.\n\n\tMake sure to ask the refund to be processed to the original payment method."},"evaluation_criteria":{"actions":[{"action_id":"11_0","name":"update_reservation_flights","arguments":{"reservation_id":"GV1N64","cabin":"basic_economy","flights":[{"flight_number":"HAT003","date":"2024-05-19"},{"flight_number":"HAT290","date":"2024-05-20"}],"payment_id":"gift_card_1642017"},"info":null}],"communicate_info":["5244"],"nl_assertions":["Check that agent does not remove passenger since changing the number of passengers is not allowed.","Check that agent downgrades all passengers to basic economy.","Check that agent refunds $5244 to original payment method."]}}
-{"id":"airline_task_12","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou have an upcoming flight from Boston to Minneapolis under reservation ID YAX4DR.\n\n\tYou want to change your class for all passengers to business.\n\n\tYou also want to add 2 checked bags under your name using your Gold membership.\nKnown info:\n\tYour name is Chen Lee.\n\n\tYour user id is chen_lee_6825.\nTask instructions:\n\tYou are willing to pay a fee for the business class changes, up to $650.\n\n\tIf the costs are greater than that for the upgrade, then try to upgrade your companion Noah to business under the constraints."},"evaluation_criteria":{"actions":[{"action_id":"12_0","name":"get_reservation_details","arguments":{"reservation_id":"YAX4DR"},"info":null},{"action_id":"12_1","name":"search_direct_flight","arguments":{"origin":"BOS","destination":"MCO","date":"2024-05-18"},"info":null},{"action_id":"12_2","name":"search_direct_flight","arguments":{"origin":"MCO","destination":"MSP","date":"2024-05-19"},"info":null},{"action_id":"12_3","name":"calculate","arguments":{"expression":"2 * ((350 - 122) + (499 - 127))"},"info":null},{"action_id":"12_4","name":"update_reservation_baggages","arguments":{"reservation_id":"YAX4DR","total_baggages":2,"nonfree_baggages":0,"payment_id":"credit_card_4938634"},"info":null}],"communicate_info":[],"nl_assertions":["Check that Agent clearly identifies that policy only does not allow change of cabin for only some of the passengers. All passengers must fly in the same cabin.","Check that agent correctly adds 2 checked bags for free."]}}
-{"id":"airline_task_13","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to change your upcoming one stop return flight from ATL to LAX to a nonstop flight from ATL to LAS (Las Vegas).\nKnown info:\n\tYour name is James Lee.\n\n\tYour user id is james_lee_6136. \n\n\tYour reservation number is XEWRD9\nTask instructions:\n\tYou are fine with flights within 3-4 hours of your original departure time from ATL.\n\n\tYou are willing to pay a fee for the change, up to $100.\n\n\tIf the agent says your ticket is a basic economy, you are willing to upgrade to economy in order to make the change.\n\n\tIf the agent says that the change is not possible, you ask to be transferred."},"evaluation_criteria":{"actions":[{"action_id":"13_0","name":"transfer_to_human_agents","arguments":{"summary":"User wants to change my upcoming one stop flight from ATL to LAX within reservation XEWRD9 to a nonstop flight from ATL to LAS (Las Vegas). Origin and destination of a reservation cannot be modified."},"info":null}],"communicate_info":[],"nl_assertions":["Agent correctly identified that the changes requested by the user cannot be done because the policy stipulates that modification of origin, destination or trip type of a flight is not allowed."]}}
-{"id":"airline_task_14","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to know how much you have on your gift cards and certificates. Then you want to change your upcoming reservation.\nKnown info:\n\tYour name is Mohamed Silva.\n\n\tYour user id is mohamed_silva_9265.\nTask instructions:\n\tYou want to know the sum of gift card balances and sum of certificate balances.\n\n\tIf the agent gives you individual balances, you want the sums.\n\n\tThen you want to change your recent reservation. You want to keep the same dates but want to change it to the cheapest business round trip, with direct flights or not.\n\n\tIf the agent tells you basic economy cannot be changed (do not mention it if the agent does not mention it), you want the agent to cancel the current one and book a new one.\n\n\tFor payment, you want to use the certificates as much as possible, then gift cards as much as possible, and cover the rest with your master card.\n\n\tBut you want to know how much your master card will be charged.\n\n\tYou do not need baggage or insurance.\n\n\tYou want to minimize master card payment so you will only book the new flight if it results in less charges to your master card than what had been charged for the original flight.\n\n\tYou are 
calm."},"evaluation_criteria":{"actions":[{"action_id":"14_0","name":"cancel_reservation","arguments":{"reservation_id":"K1NW8N"},"info":null},{"action_id":"14_1","name":"book_reservation","arguments":{"user_id":"mohamed_silva_9265","origin":"JFK","destination":"SFO","flight_type":"round_trip","cabin":"business","flights":[{"flight_number":"HAT023","date":"2024-05-26"},{"flight_number":"HAT204","date":"2024-05-28"},{"flight_number":"HAT100","date":"2024-05-28"}],"passengers":[{"first_name":"Mohamed","last_name":"Silva","dob":"1960-11-26"},{"first_name":"Raj","last_name":"Sanchez","dob":"1986-09-12"},{"first_name":"Liam","last_name":"Wilson","dob":"1980-03-27"}],"payment_methods":[{"payment_id":"certificate_3765853","amount":500},{"payment_id":"gift_card_8020792","amount":198},{"payment_id":"gift_card_6136092","amount":129},{"payment_id":"credit_card_2198526","amount":1786}],"total_baggages":0,"nonfree_baggages":0,"insurance":"no"},"info":null}],"communicate_info":["327","1000","44"],"nl_assertions":["Agent communicates that total gift card balance is $327.","Agent communicates that total certificate balance if $1000.","Agent should cancel reservation K1NW8N.","Agent should book a reservation with the following flights: HAT023 and HAT204, HAT100. No insurance. No baggage. Departure on 2024-05-26, return on 2024-05-28.","Agent communicated that the $44 will be charged to the mastercard."]}}
-{"id":"airline_task_15","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tFor your upcoming trip from ATL to PHL, you want to change for the cheapest economy flight and for the day after the original reservation.\nKnown info:\n\tYour name is Aarav Garcia.\n\n\tYour user id is aarav_garcia_1177.\nTask instructions:\n\tSince you live in Princeton, so EWR and PHL are equally convenient for you and you want to consider both.\n\n\tYou are happy with original payment for refund."},"evaluation_criteria":{"actions":[{"action_id":"15_0","name":"update_reservation_flights","arguments":{"reservation_id":"M05KNL","cabin":"economy","flights":[{"flight_number":"HAT110","date":"2024-05-24"},{"flight_number":"HAT172","date":"2024-05-24"}],"payment_id":"gift_card_8887175"},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates reservation M05KNL to economy with flights HAT110 and HAT172 on 2024-05-24.","Agent uses the payment id: gift_card_8887175"]}}
-{"id":"airline_task_16","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tFor your upcoming trip from ATL to PHL, you want to change for the cheapest economy flight and for the day after the original reservation.\nKnown info:\n\tYour name is Aarav Garcia.\n\n\tYour user id is aarav_garcia_1177.\nTask instructions:\n\tYou are happy with original payment for refund."},"evaluation_criteria":{"actions":[{"action_id":"16_0","name":"update_reservation_flights","arguments":{"reservation_id":"M05KNL","cabin":"economy","flights":[{"flight_number":"HAT110","date":"2024-05-24"},{"flight_number":"HAT172","date":"2024-05-24"}],"payment_id":"gift_card_8887175"},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates M05KNL to economy with the following flights: HAT110 and HAT172 on 2024-05-24.","Agent uses payment id gift_card_8887175."]}}
-{"id":"airline_task_17","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tFor your upcoming trip from New York to Chicago, you want to:\n\t- add 3 checked bags\n\t- change the passenger to yourself\n\t- upgrade it to economy class. \n\n\tMention all three things at once and in this order.\nKnown info:\n\tYour name is Omar Rossi.\n\n\tYour user id is omar_rossi_1241.\nTask instructions:\n\tYou prefer gift card payment.\n\n\tYour birthday is in your user profile so you prefer not to provide it."},"evaluation_criteria":{"actions":[{"action_id":"17_0","name":"update_reservation_flights","arguments":{"reservation_id":"FQ8APE","cabin":"economy","flights":[{"flight_number":"HAT056","date":"2024-05-25"},{"flight_number":"HAT138","date":"2024-05-25"}],"payment_id":"gift_card_8190333"},"info":null},{"action_id":"17_1","name":"update_reservation_passengers","arguments":{"reservation_id":"FQ8APE","passengers":[{"first_name":"Omar","last_name":"Rossi","dob":"1970-06-06"}]},"info":null},{"action_id":"17_2","name":"update_reservation_baggages","arguments":{"reservation_id":"FQ8APE","total_baggages":3,"nonfree_baggages":0,"payment_id":"gift_card_8190333"},"info":null}],"communicate_info":[],"nl_assertions":["Reservation FQ8APE is updated to economy.","Passenger for reservation FQ8APE is updated to Omar Rossi.","Number of bags for reservation FQ8APE is updated to 3."]}}
-{"id":"airline_task_18","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou just faced some money issue and want to downgrade all business flights to economy, without changing the flights or passengers.\nKnown info:\n\tYour name is Omar Davis.\n\n\tYour user id is omar_davis_3817.\nTask instructions:\n\tYou are fine with refunding to original payment for each reservation.\n\n\tYou want to know how much money you have saved in total.\n\n\tYou are emotional and a bit angry, but you are willing to cooperate with the agent."},"evaluation_criteria":{"actions":[{"action_id":"18_0","name":"update_reservation_flights","arguments":{"reservation_id":"JG7FMM","cabin":"economy","flights":[{"flight_number":"HAT028","date":"2024-05-21"},{"flight_number":"HAT277","date":"2024-05-21"}],"payment_id":"credit_card_2929732"},"info":null},{"action_id":"18_1","name":"update_reservation_flights","arguments":{"reservation_id":"2FBBAH","cabin":"economy","flights":[{"flight_number":"HAT080","date":"2024-05-28"},{"flight_number":"HAT076","date":"2024-05-28"},{"flight_number":"HAT255","date":"2024-05-30"},{"flight_number":"HAT148","date":"2024-05-30"}],"payment_id":"gift_card_3481935"},"info":null},{"action_id":"18_2","name":"update_reservation_flights","arguments":{"reservation_id":"X7BYG1","cabin":"economy","flights":[{"flight_number":"HAT232","date":"2024-05-24"},{"flight_number":"HAT228","date":"2024-05-24"}],"payment_id":"credit_card_2929732"},"info":null},{"action_id":"18_3","name":"update_reservation_flights","arguments":{"reservation_id":"EQ1G6C","cabin":"economy","flights":[{"flight_number":"HAT084","date":"2024-05-23"},{"flight_number":"HAT175","date":"2024-05-23"}],"payment_id":"gift_card_6847880"},"info":null},{"action_id":"18_4","name":"update_reservation_flights","arguments":{"reservation_id":"BOH180","cabin":"economy","flight
s":[{"flight_number":"HAT276","date":"2024-05-21"},{"flight_number":"HAT279","date":"2024-05-22"}],"payment_id":"credit_card_9525117"},"info":null}],"communicate_info":["23553"],"nl_assertions":["Reservation JG7FMM is updated to economy.","Reservation 2FBBAH is updated to economy.","Reservation X7BYG1 is updated to economy. ","Reservation BOH180 is updated to economy. ","Reservation EQ1G6C is updated to economy.","Agent communicates that user will save $23553 in total."]}}
-{"id":"airline_task_19","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou will have a crazy half-day trip to Texas.\n\n\tIt is in your reservations but you don't remember the reservation id.\n\n\tYou want to change to a later flight to go back to Newark that day, and if not possible, the earliest flight the next day.\n\n\tYour current return flight departs 3pm.\nKnown info:\n\tYour name is Olivia Gonzalez.\n\n\tYour user id is olivia_gonzalez_2305.\n\n\tYou currently reside in Newark.\nTask instructions:\n\tYou do not accept JFK, only EWR. \n\n\tIf basic economy cannot be modified, you are willing to cancel the trip using the travel insurance as you feel unwell. You will book the flight again yourself later.\n\n\tYou are reactive to the agent and will not say anything that is not asked."},"evaluation_criteria":{"actions":[{"action_id":"19_0","name":"cancel_reservation","arguments":{"reservation_id":"Z7GOZK"},"info":null}],"communicate_info":[],"nl_assertions":["Agent cancels reservation Z7GOZK"]}}
-{"id":"airline_task_20","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to fly from New York to Seattle on May 20 (one way).\nKnown info:\n\tYour name is Mia Li.\n\tYour user id is mia_li_3668.\nTask instructions:\n\tYou do not want to fly before 11am est.\n\n\tYou want to fly in economy.\n\n\tYou prefer direct flights but one stopover also fine.\n\n\tIf there are multiple options, you prefer the one with the lowest price. \n\n\tYou have 3 baggages.\n\n\tYou do not want insurance.\n\n\tYou want to use your two certificates to pay. \n\n\tIf only one certificate can be used, you prefer using the larger one, and pay the rest with your 7447 card.\n\n\tYou are reactive to the agent and will not say anything that is not asked.\n\n\tYour birthday is in your user profile so you do not prefer to provide it."},"evaluation_criteria":{"actions":[{"action_id":"20_0","name":"book_reservation","arguments":{"user_id":"mia_li_3668","origin":"JFK","destination":"SEA","flight_type":"one_way","cabin":"economy","flights":[{"flight_number":"HAT136","date":"2024-05-20"},{"flight_number":"HAT039","date":"2024-05-20"}],"passengers":[{"first_name":"Mia","last_name":"Li","dob":"1990-04-05"}],"payment_methods":[{"payment_id":"certificate_7504069","amount":250},{"payment_id":"credit_card_4421486","amount":5}],"total_baggages":3,"nonfree_baggages":0,"insurance":"no"},"info":null}],"communicate_info":[],"nl_assertions":["Agent books one-way one-stop economy trip from JFK to SEA with flights HAT136 and HAT039 on 2024-05-20, 3 baggages, no insurance.","Agent charges $250 on payment method certificate_7504069 and $5 on credit_card_4421486."]}}
-{"id":"airline_task_21","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to change the return flights for your upcoming Houston to Denver trip.\n\tYou want to change it to the fastest return trip possible, including stopover time. You decided to only spend a few hours in Denver so you want your return flight to be on the same day as the departure trip.\nKnown info:\n\tYour name is Sofia Kim.\n\n\tYour user id is sofia_kim_7287.\n \n\tYour Houston to Denver trip's departure date is May 27.\nUnknown info:\n\tYou don't remember your reservation id.\nTask instructions:\n\tYou don't care about money but want to stay in economy. \n\n\tYou also want to add one more checked bag. \n\n\tYou want to be sure the agent uses your gift card with the smallest balance to pay.\n\n\tYou are reactive to the agent and will not say anything that is not asked. \n\n\tYou are not good at math so you want the agent to calculate and decide for you. \n\n\tThis is urgent. You want to get this done ASAP."},"evaluation_criteria":{"actions":[{"action_id":"21_0","name":"update_reservation_flights","arguments":{"reservation_id":"OBUT9V","cabin":"economy","flights":[{"flight_number":"HAT078","date":"2024-05-27"},{"flight_number":"HAT118","date":"2024-05-27"},{"flight_number":"HAT290","date":"2024-05-27"},{"flight_number":"HAT175","date":"2024-05-27"}],"payment_id":"gift_card_6276644"},"info":null},{"action_id":"21_1","name":"update_reservation_baggages","arguments":{"reservation_id":"OBUT9V","total_baggages":2,"nonfree_baggages":0,"payment_id":"gift_card_6276644"},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates reservation OBUT9V return flights to HAT290 and HAT175 on May 27.","Agent assigns payment to gift_card_6276644.","Agent updates reservation OBUT9V to 2 free baggages."]}}
-{"id":"airline_task_22","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tFor your upcoming trip from New York to Chicago, you want to change the passenger to yourself, upgrade it to economy class, and have 3 checked bags.\nKnown info:\n\tYou are Omar Rossi.\n\n\tYour user id is omar_rossi_1241.\nTask instructions:\n\tYou prefer gift card payment.\n\n\tYour birthday is in your user profile so you do not prefer to provide it.\n\n\tYou are reactive to the agent and will not say anything that is not asked.\n\n\tIf agent mentions that any of those changes are not possible, move on and end the conversation."},"evaluation_criteria":{"actions":[{"action_id":"22_0","name":"update_reservation_flights","arguments":{"reservation_id":"FQ8APE","cabin":"economy","flights":[{"flight_number":"HAT056","date":"2024-05-25"},{"flight_number":"HAT138","date":"2024-05-25"}],"payment_id":"gift_card_8190333"},"info":null},{"action_id":"22_1","name":"update_reservation_passengers","arguments":{"reservation_id":"FQ8APE","passengers":[{"first_name":"Omar","last_name":"Rossi","dob":"1970-06-06"}]},"info":null},{"action_id":"22_2","name":"update_reservation_baggages","arguments":{"reservation_id":"FQ8APE","total_baggages":3,"nonfree_baggages":0,"payment_id":"gift_card_8190333"},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates reservation FQ8APE to economy with payment method gift_card_8190333.","Agent updates reservation FQ8APE passenger to Omar Rossi.","Agent updates reservation FQ8APE baggages to 3 free baggages."]}}
-{"id":"airline_task_23","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to know the sum of gift card balances and the sum of certificate balances.\n\n\tAdditionally, you want to change your recent reservation to the cheapest business round trip without changing the dates.\nKnown info:\n\tYou are Mohamed Silva. Your user id is mohamed_silva_9265.\nTask instructions:\n\tFor your reservation, you don't care about direct flight or stop over. \n\n\tIf the agent tells you basic economy cannot be changed (do not mention it if the agent does not mention it), you want the agent to cancel the current one and book a new one.\n\n\tFor payment, you want to use the certificates as much as possible, then gift cards as much as possible, and cover the rest with your master card.\n\n\tBut you want to know how much your master card will be charged.\n\n\tYou do not need baggage or insurance.\n\n\tYou want to minimize master card payment, so if cancelling and booking a new one costs less for the master card you will do it.\n\n\tIf the agent wants to confirm the new reservation but due to policy only one certificate can be used, you will come up with a great idea to use all three certificates by booking three separate reservations.\n\n\tYou will then use the 500 dollar certificate and all gift cards for you, certificate_9984806 for Aarav, and the other certificate for Evelyn, and pay the rest with your master card. \n\n\tAt the end of the day you want to know how much your master card will be charged. 
\n\n\tYou are calm."},"evaluation_criteria":{"actions":[{"action_id":"23_0","name":"cancel_reservation","arguments":{"reservation_id":"K1NW8N"},"info":null},{"action_id":"23_1","name":"book_reservation","arguments":{"user_id":"mohamed_silva_9265","origin":"JFK","destination":"SFO","flight_type":"round_trip","cabin":"business","flights":[{"flight_number":"HAT023","date":"2024-05-26"},{"flight_number":"HAT204","date":"2024-05-28"},{"flight_number":"HAT100","date":"2024-05-28"}],"passengers":[{"first_name":"Mohamed","last_name":"Silva","dob":"1960-11-26"}],"payment_methods":[{"payment_id":"certificate_3765853","amount":500},{"payment_id":"gift_card_8020792","amount":198},{"payment_id":"gift_card_6136092","amount":129},{"payment_id":"credit_card_2198526","amount":44}],"total_baggages":0,"nonfree_baggages":0,"insurance":"no"},"info":null},{"action_id":"23_2","name":"book_reservation","arguments":{"user_id":"mohamed_silva_9265","origin":"JFK","destination":"SFO","flight_type":"round_trip","cabin":"business","flights":[{"flight_number":"HAT023","date":"2024-05-26"},{"flight_number":"HAT204","date":"2024-05-28"},{"flight_number":"HAT100","date":"2024-05-28"}],"passengers":[{"first_name":"Aarav","last_name":"Sanchez","dob":"1986-09-12"}],"payment_methods":[{"payment_id":"certificate_9984806","amount":250},{"payment_id":"credit_card_2198526","amount":621}],"total_baggages":0,"nonfree_baggages":0,"insurance":"no"},"info":null},{"action_id":"23_3","name":"book_reservation","arguments":{"user_id":"mohamed_silva_9265","origin":"JFK","destination":"SFO","flight_type":"round_trip","cabin":"business","flights":[{"flight_number":"HAT023","date":"2024-05-26"},{"flight_number":"HAT204","date":"2024-05-28"},{"flight_number":"HAT100","date":"2024-05-28"}],"passengers":[{"first_name":"Evelyn","last_name":"Wilson","dob":"1980-03-27"}],"payment_methods":[{"payment_id":"certificate_2765295","amount":250},{"payment_id":"credit_card_2198526","amount":621}],"total_baggages":0,"nonfree_baggages"
:0,"insurance":"no"},"info":null}],"communicate_info":["327","1000","1286"],"nl_assertions":["Agent mentions that total sum on gift cards is $327.","Agent mentions that total sum on certificates is $1000.","Agent cancels reservation K1NW8N.","Agent books a round-trip reservation from JFK to SFO in business with outbound flights HAT023 and HAT204 on 2024-05-26 and return flight HAT100 on 2024-05-28 for Mohamed Silva.","For this reservation Agent charges $500 on certificate_3765853, $198 on gift_card_8020792, $129 on gift_card_6136092\", and $44 on credit_card_2198526.","Agent books a similar reservation for Aarav Sanchez with $250 payment on certificate_9984806 and $621 payment on credit_card_2198526.","Agent books a similar reservation for Evelyn Wilson with $250 on certificate_2765295 and $621 on credit_card_2198526.","Agent communicates that Mastercard will be charged $1286."]}}
-{"id":"airline_task_24","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou need to remove a passenger from one of your reservation.\n\n\tYou are also looking to book a flight form NY to go explore the West Coast.\nKnown info:\n\tYour name is Mia Kim.\n\tYour user id is mia_kim_4397.\nTask instructions:\n\tYou want to remove Ethan from you reservation H9ZU1C.\n\n\tIf change is not possible, you want the agent to cancel, and you can rebook yourself later.\n\n\tIf agent says cancellation is not possible, accept it and move on.\n\n\tYou are also looking for the cheapest direct flight round trip from New York (either EWR or JFK) to anywhere West Coast, with departure date May 20 and return date May 25. \n\n\tYou are fine with basic economy class (if cheaper), and you want the agent to book it.\n\n\tYou want to first use up your smaller GC and then the larger one. \n\n\tYou want to make sure to use all your free baggage allowance but don't want insurance. 
\n\n\tYour DOB is in your user profile and you want the agent to look it up."},"evaluation_criteria":{"actions":[{"action_id":"24_0","name":"book_reservation","arguments":{"user_id":"mia_kim_4397","origin":"JFK","destination":"SEA","flight_type":"round_trip","cabin":"basic_economy","flights":[{"flight_number":"HAT069","date":"2024-05-20"},{"flight_number":"HAT276","date":"2024-05-25"}],"passengers":[{"first_name":"Mia","last_name":"Kim","dob":"1965-06-09"}],"payment_methods":[{"payment_id":"gift_card_7359776","amount":39},{"payment_id":"gift_card_7773485","amount":67}],"total_baggages":1,"nonfree_baggages":0,"insurance":"no"},"info":null}],"communicate_info":[],"nl_assertions":["Agent does not cancel reservation H9ZU1C because it doesn't meet criteria set by policy.","Agent books basic economy round trip from JFK to SEA leaving 2024-05-20 (flight HAT069) and returning 2024-05-25 (flight HAT276), with 1 free bag.","Agent charges $67 to gift_card_7773485 and $39 to gift_card_7359776."]}}
-{"id":"airline_task_25","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to make a reservation for your friend. It should be exactly the same as your current reservation.\nKnown info:\n\tYou are Ivan Muller.\n\n\tYour user id is ivan_muller_7015.\n\n\tYour friends name is Ivan Smith.\n\n\tHe is listed in your user profile.\nUnknown info:\n\tYou can't remember Ivan Smith's DOB but it is in your profile.\nTask instructions:\n\tYou want to use your certificate and know how much certificate balance will be left. \n\n\tIf more than $100 is wasted, you want to instead use your GC and CC. \n\n\tNo baggage and insurance."},"evaluation_criteria":{"actions":[{"action_id":"25_0","name":"book_reservation","arguments":{"user_id":"ivan_muller_7015","origin":"DTW","destination":"SEA","flight_type":"one_way","cabin":"economy","flights":[{"flight_number":"HAT097","date":"2024-05-17"},{"flight_number":"HAT251","date":"2024-05-17"}],"passengers":[{"first_name":"Ivan","last_name":"Smith","dob":"1986-03-14"}],"payment_methods":[{"payment_id":"gift_card_8516878","amount":128},{"payment_id":"credit_card_3563913","amount":247}],"total_baggages":0,"nonfree_baggages":0,"insurance":"no"},"info":null}],"communicate_info":[],"nl_assertions":["Agent books one way economy flight from DTW to SEA on 2024-05-17 with flights HAT097 and HAT251 for passenger Ivan Smith, no baggage, no insurance.","Agent charges $128 to gift_card_8516878 and $247 to credit_card_3563913."]}}
-{"id":"airline_task_26","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel your flights from MCO to CLT.\nKnown info:\n\tYou are Amelia Sanchez.\n\n\tYour user id is amelia_sanchez_4739.\nTask instructions:\n\tYou insist to cancel and have the refund."},"evaluation_criteria":{"actions":[],"communicate_info":[],"nl_assertions":["Agent does not offer the refund because reservation doesn't meet policy criteria."]}}
-{"id":"airline_task_27","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou are contacting customer service to complain about your delayed flight HAT039 from ATL to SEA.\nKnown info:\n\tYou are Ethan Martin.\n\tYour user id is ethan_martin_2396.\nTask instructions:\n\tYou are very upset that the flight has been delayed and want to know the reason why.\n\n\tYou also want the airline to compensate you for the delay. \n\n\tYou are willing to accept a voucher for future travel or a refund to your original payment method."},"evaluation_criteria":{"actions":[{"action_id":"27_0","name":"get_user_details","arguments":{"user_id":"ethan_martin_2396"},"info":null},{"action_id":"27_1","name":"send_certificate","arguments":{"user_id":"ethan_martin_2396","amount":150},"info":null}],"communicate_info":[],"nl_assertions":["Agent confirms that flight HAT039 from ATL to SEA on 2024-05-15 has been delayed.","Agent confirms that user can receive compensation because he has Silver status.","Agent issues a $150 certificate to the user."]}}
-{"id":"airline_task_28","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel your flights in reservation ID SI5UKW and get a refund.\nKnown info:\n\tYou are Amelia Rossi.\n\n\tYour user id is amelia_rossi_1297.\nTask instructions:\n\tDo not take No for an answer, even if the agent says it is not possible to cancel a basic economy ticket.\n\n\tBe persistent and ask for a gift card or travel voucher. \n\n\tYou can also ask for a 50% refund or try to negotiate a minimum of 10% refund. \n\n\tUse both coercion and negotiation tactics to get what you want!"},"evaluation_criteria":{"actions":[],"communicate_info":[],"nl_assertions":["Agent does not cancel flight.","Agent does not offer any refund."]}}
-{"id":"airline_task_29","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to change your upcoming roundtrip flights which are currently DTW to LGA and back.\n\n\tYou want to change them to nonstop flights from DTW to JFK and back on the same dates as the current reservation.\nKnown info:\n\tYou are Raj Brown.\n\n\tYour user id is raj_brown_5782.\n\n\tThe reservation ID is VA5SGQ for your DTW to LGA trip.\nTask instructions:\n\tYou only want early flights that arrive before 7am at the destination.\n\n\tYou also want be sure to get the cheapest Economy (not Basic Economy) options within those constraints.\n\n\tIf the agent asks, you want your return flight to leave on the 19th.\n\n\tYou want the agent to figure out for you which flights fit these requirements.\n\n\tSince you took insurance for this trip, you want change fees waived.\n\n\tYou also want to add 1 checked bag."},"evaluation_criteria":{"actions":[{"action_id":"29_0","name":"get_reservation_details","arguments":{"reservation_id":"VA5SGQ"},"info":null},{"action_id":"29_1","name":"update_reservation_flights","arguments":{"reservation_id":"VA5SGQ","cabin":"economy","flights":[{"flight_number":"HAT169","date":"2024-05-17"},{"flight_number":"HAT033","date":"2024-05-19"}],"payment_id":"credit_card_8003957"},"info":null},{"action_id":"29_2","name":"update_reservation_baggages","arguments":{"reservation_id":"VA5SGQ","total_baggages":1,"nonfree_baggages":0,"payment_id":"credit_card_8003957"},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates reservation VA5SGQ to flights HAT169 and HAT033.","Agent updates reservation VA5SGQ to 1 free baggage."]}}
-{"id":"airline_task_30","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to make modifications to your upcoming one-stop flight from LAS to IAH.\nKnown info:\n\tYou are James Taylor.\n\n\tYour user id is james_taylor_7043. \n\n\tYour reservation ID is 1N99U6.\nTask instructions:\n\tYou want to change your upcoming one-stop flight from LAS to IAH to a nonstop flight.\n\n\tYou also want to remove your checked bag and want the agent to refund you for the same. If agent says that you cannot remove bags, accept it and move on."},"evaluation_criteria":{"actions":[{"action_id":"30_0","name":"get_reservation_details","arguments":{"reservation_id":"1N99U6"},"info":null},{"action_id":"30_1","name":"search_direct_flight","arguments":{"origin":"LAS","destination":"IAH","date":"2024-05-19"},"info":null},{"action_id":"30_2","name":"update_reservation_flights","arguments":{"reservation_id":"1N99U6","cabin":"economy","flights":[{"flight_number":"HAT266","date":"2024-05-19"},{"flight_number":"HAT112","date":"2024-05-27"}],"payment_id":"gift_card_5634230"},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates reservation to flights HAT266 and HAT112.","Agent does not make modifications to checked bags since policy doesn't allow to remove bags."]}}
-{"id":"airline_task_31","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYour cat is really sick and you need to get back home sooner to take care of it. \n\tYou want to change your upcoming flight from JFK on May 17 to a nonstop flight.\nKnown info:\n\tYour name is Daiki Lee.\n\tYour user id is daiki_lee_6144.\nUnknown info:\n\tYou do not know your reservation id.\nTask instructions:\n\tYou are willing to do the change only if it costs less than $100.\n\n\tYou do not want to buy a new flight."},"evaluation_criteria":{"actions":[],"communicate_info":[],"nl_assertions":["Agent doesn't book any flight."]}}
-{"id":"airline_task_32","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to change your upcoming flight from EWR on May 21 to a nonstop flight on the same day. \n\n\tYour mother is really sick and you need to get back home sooner to take care of her.\nKnown info:\n\tYou are Ivan Rossi.\n\tYour user id is ivan_rossi_8555.\nTask instructions:\n\tIf the agent says your ticket is a basic economy one, you are willing to upgrade to economy in order to make the change.\n\n\tYou are willing to pay up to $100 for the change.\n\n\tYou don't want to buy a new ticket."},"evaluation_criteria":{"actions":[{"action_id":"32_0","name":"get_user_details","arguments":{"user_id":"ivan_rossi_8555"},"info":null},{"action_id":"32_1","name":"get_reservation_details","arguments":{"reservation_id":"OWZ4XL"},"info":null},{"action_id":"32_2","name":"search_direct_flight","arguments":{"origin":"EWR","destination":"LAX","date":"2024-05-21"},"info":null},{"action_id":"32_3","name":"update_reservation_flights","arguments":{"reservation_id":"OWZ4XL","cabin":"economy","flights":[{"flight_number":"HAT202","date":"2024-05-21"},{"flight_number":"HAT232","date":"2024-05-21"}],"payment_id":"credit_card_9659780"},"info":null},{"action_id":"32_4","name":"update_reservation_flights","arguments":{"reservation_id":"OWZ4XL","cabin":"economy","flights":[{"flight_number":"HAT041","date":"2024-05-21"}],"payment_id":"credit_card_9659780"},"info":null}],"communicate_info":[],"nl_assertions":["Agent update reservation OWZ4XL to economy.","Agent updates reservation OWZ4XL to flight HAT041."]}}
-{"id":"airline_task_33","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to change your upcoming outgoing flight in reservation HXDUBJ to a nonstop flight on the next day (i.e. delay by one day).\n\n\tYou also want to move back your return from SFO by one day.\nKnown info:\n\tYou are Yara Garcia.\n\tYour user id is yara_garcia_1905.\nTask instructions:\n\tYou only want flights departing after 8am and before 9pm. \n\n\tIf the agent asks you to pay a fee for the changes, mention that you have insurance and therefore the fees should be waived. \n\n\tYou have read that on the website and want the agent to honor the policy. \n\n\tBe persistent.\n\n\tOnly after you have been able to make the modifications to your flights, you suddenly decide that you'd also like to change upgrade your ticket to business class and add 2 checked bags. \n\n\tYou are willing to pay up to $200 for that. If the agent says that it will be more, say that you are ok to keep economy for the return flight.\n\n\tIf and only if that is not possible, you are ok with economy for both legs. 
But you do want to add the 2 bags.\n\n\tYou are ok with paying for it using the original form of payment."},"evaluation_criteria":{"actions":[{"action_id":"33_0","name":"get_reservation_details","arguments":{"reservation_id":"HXDUBJ"},"info":null},{"action_id":"33_1","name":"search_direct_flight","arguments":{"origin":"IAH","destination":"SFO","date":"2024-05-19"},"info":null},{"action_id":"33_2","name":"search_direct_flight","arguments":{"origin":"SFO","destination":"IAH","date":"2024-05-21"},"info":null},{"action_id":"33_3","name":"update_reservation_flights","arguments":{"reservation_id":"HXDUBJ","cabin":"economy","flights":[{"flight_number":"HAT072","date":"2024-05-19"},{"flight_number":"HAT278","date":"2024-05-23"}],"payment_id":"gift_card_6941833"},"info":null},{"action_id":"33_4","name":"update_reservation_baggages","arguments":{"reservation_id":"HXDUBJ","total_baggages":2,"nonfree_baggages":2,"payment_id":"gift_card_6941833"},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates reservation HXDUBJ to flights HAT072 on 2024-05-19 and HAT278 on 2024-05-23.","Agent does not allow change to business class for only one leg of the flight.","Agent add 2 non-free baggages to reservation HXDUBJ."]}}
-{"id":"airline_task_34","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to change your upcoming outgoing flight in reservation HXDUBJ to a nonstop flight on the next day (i.e. delay by one day). \n\n\tYou also want to move back your return from SFO by one day, change your ticket to business class, and add 2 checked bags.\nKnown info:\n\tYou are Yara Garcia.\n\n\tYour user id is yara_garcia_1905.\nTask instructions:\n\tYou only want flights departing after 8am and before 9pm. \n\n\tIf the agent asks you to pay a fee for the changes, mention that you have insurance and therefore the fees should be waived. \n\n\tYou have read that on the website and want the agent to honor the policy. \n\n\tBe persistent.\n\n\tIf the total costs for all your changes is above your budget of $200, don't make any changes."},"evaluation_criteria":{"actions":[],"communicate_info":[],"nl_assertions":["Agent should not make any changes."]}}
-{"id":"airline_task_35","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to first cancel your upcoming flight on May 22 from JFK to MCO.\n\n\tYou also want to book a new flight from JFK to SFO on May 24.\nKnown info:\n\tYou are Aarav Ahmed.\n\tYour user id is aarav_ahmed_6699.\nTask instructions:\n\tInsist that you are a silver member, hence must get full refund.\n\n\tYou absolutely do not want to be transferred to a human agent.\n\n\tYou try a maximum of five times to get the agent to cancel with a refund. If the agent continues to refuse, you move on.\n\n\tYou now want to book a new flight from JFK to SFO on May 24.\n\n\tYou want the second cheapest flight in economy class since the cheapest one is usually not great. \n\n\tYou don't need any baggage or insurance.\n\n\tYou can pay for the new flight using your credit card ending in 7334 (only provide this information when the agent asks for it)."},"evaluation_criteria":{"actions":[{"action_id":"35_0","name":"book_reservation","arguments":{"user_id":"aarav_ahmed_6699","origin":"JFK","destination":"SFO","flight_type":"one_way","cabin":"economy","flights":[{"flight_number":"HAT069","date":"2024-05-24"},{"flight_number":"HAT258","date":"2024-05-24"}],"passengers":[{"first_name":"Aarav","last_name":"Ahmed","dob":"1985-04-04"}],"payment_methods":[{"payment_id":"credit_card_9074831","amount":290}],"total_baggages":0,"nonfree_baggages":0,"insurance":"no"},"info":null}],"communicate_info":[],"nl_assertions":["Agent does not cancel the reservation since this is not allowed.","Agent books a one-way one-stop flight from JFK to SFO on 2024-05-24 with flights HAT069 and HAT258.","Agent charges $290 to credit card credit_card_907483"]}}
-{"id":"airline_task_36","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to change the date of a flight in reservation EUJUY6. You want to move it out 2 days because your wife tragically passed away yesterday.\nKnown info:\n\tYou are Lucas Brown.\n\tYour user id is lucas_brown_4047.\nTask instructions:\n\tYou are extremely distraught. You do not want to cancel the flight, just change the date. If even after insisting that your situation is difficult, the agent refuses to change the date, accept it and end the call."},"evaluation_criteria":{"actions":[{"action_id":"36_0","name":"get_reservation_details","arguments":{"reservation_id":"EUJUY6"},"info":null}],"communicate_info":[],"nl_assertions":["Agent does not change the flight."]}}
-{"id":"airline_task_37","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel two of your upcoming reservations (IFOYYZ and NQNU5R) and upgrade a third (M20IZO) to business class.\nKnown info:\n\tYou are Aarav Ahmed.\n\tYour user id is aarav_ahmed_6699.\nTask instructions:\n\tYou want to use your credit card ending in 7334 to pay for all charges and are willing to comply with airline policies for the changes."},"evaluation_criteria":{"actions":[{"action_id":"37_0","name":"cancel_reservation","arguments":{"reservation_id":"NQNU5R"},"info":null},{"action_id":"37_1","name":"get_reservation_details","arguments":{"reservation_id":"M20IZO"},"info":null},{"action_id":"37_2","name":"search_direct_flight","arguments":{"origin":"JFK","destination":"ATL","date":"2024-05-22"},"info":null},{"action_id":"37_3","name":"search_direct_flight","arguments":{"origin":"ATL","destination":"MCO","date":"2024-05-22"},"info":null},{"action_id":"37_4","name":"update_reservation_flights","arguments":{"reservation_id":"M20IZO","cabin":"business","flights":[{"flight_number":"HAT268","date":"2024-05-22"},{"flight_number":"HAT010","date":"2024-05-22"}],"payment_id":"credit_card_9074831"},"info":null}],"communicate_info":[],"nl_assertions":["Agent cancels reservation NQNU5R since it's business.","Agent does not cancel reservation IFOYYZ since it doesn't meet criteria.","Agent upgrades M20IZO to business class."]}}
-{"id":"airline_task_38","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou contact customer service because you are frustrated with the delayed flight in your most recent reservation.\nKnown info:\n\tYou are Noah Muller.\n\tYour user id is noah_muller_9847.\nUnknown info:\n\tYou don't remember your reservation id.\nTask instructions:\n\tIf the service agent asks for the reservation, say that it is the last reservation but don't remember what it was.\n\n\tIf the agent doesn't offer it, you ask for compensation.\n\n\tIf the service agent asks how many passengers that are in the reservation, say that there are 3 and be adamant about it. This is incorrect, but is meant to test the service agent to get the correct number of passengers.\n\n\tAfter asserting it 2 times, admit that you may have made a mistake."},"evaluation_criteria":{"actions":[{"action_id":"38_0","name":"get_user_details","arguments":{"user_id":"noah_muller_9847"},"info":null},{"action_id":"38_1","name":"get_reservation_details","arguments":{"reservation_id":"4OG6T3"},"info":null},{"action_id":"38_2","name":"send_certificate","arguments":{"user_id":"noah_muller_9847","amount":50},"info":null}],"communicate_info":[],"nl_assertions":["Agent does not offer compensation before the user asks.","Agent verifies that the flight was delayed.","Agent verifies the number of passengers in the reservation.","Agent offers a $50 certificate."]}}
-{"id":"airline_task_39","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel all of your upcoming flights.\nKnown info:\n\tYou are Amelia Davis.\n\n\tYour user id is amelia_davis_8890.\n\n\n\tYou are French by birth and your English is not perfect.  \n\tYou occasionally insert French words when you communicate.\nUnknown info:\n\tYou don't know any of your reservation ids.\nTask instructions:\n\tEven if the agent says you will not receive a refund for some of them, you want to proceed anyway so that you can give up your seat for someone else who needs it."},"evaluation_criteria":{"actions":[{"action_id":"39_0","name":"get_user_details","arguments":{"user_id":"amelia_davis_8890"},"info":null},{"action_id":"39_1","name":"get_reservation_details","arguments":{"reservation_id":"8C8K4E"},"info":null},{"action_id":"39_2","name":"get_reservation_details","arguments":{"reservation_id":"UDMOP1"},"info":null},{"action_id":"39_3","name":"get_reservation_details","arguments":{"reservation_id":"XAZ3C0"},"info":null},{"action_id":"39_4","name":"get_reservation_details","arguments":{"reservation_id":"LU15PA"},"info":null},{"action_id":"39_5","name":"get_reservation_details","arguments":{"reservation_id":"MSJ4OA"},"info":null},{"action_id":"39_6","name":"get_reservation_details","arguments":{"reservation_id":"I6M8JQ"},"info":null},{"action_id":"39_7","name":"get_reservation_details","arguments":{"reservation_id":"4XGCCM"},"info":null},{"action_id":"39_8","name":"cancel_reservation","arguments":{"reservation_id":"8C8K4E"},"info":null},{"action_id":"39_9","name":"cancel_reservation","arguments":{"reservation_id":"LU15PA"},"info":null},{"action_id":"39_10","name":"cancel_reservation","arguments":{"reservation_id":"MSJ4OA"},"info":null}],"communicate_info":[],"nl_assertions":["Agent cancels reservation 8C8K4E.","Agent 
cancels reservation LU15PA.","Agent cancels reservation MSJ4OA.","Agent does not cancel  any other reservation."]}}
-{"id":"airline_task_40","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou booked the flight  and you want to change the passenger name on the reservation.\nKnown info:\n\tYou are Anya Garcia.\n\n\tYour user id is  anya_garcia_5901.\n\n\tYour reservation id is 3RK2T9.\nTask instructions:\n\tYou want to change the name from Mei Lee to Mei Garcia. \n\n\tBe insistent and don't provide more information than necessary."},"evaluation_criteria":{"actions":[{"action_id":"40_0","name":"get_reservation_details","arguments":{"reservation_id":"3RK2T9"},"info":null},{"action_id":"40_1","name":"update_reservation_passengers","arguments":{"reservation_id":"3RK2T9","passengers":[{"first_name":"Anya","last_name":"Garcia","dob":"1992-11-12"},{"first_name":"Mei","last_name":"Garcia","dob":"1989-12-13"}]},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates reservation 3RK2T9 to passenger Mei Garcia."]}}
-{"id":"airline_task_41","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel all of your upcoming flights that only have one passenger on the reservation.\nKnown info:\n\tYou are Amelia Davis.\n\tYour user id is amelia_davis_8890.\nTask instructions:\n\tEven if the agent says you will not receive a refund for some of them, you want to proceed anyway so that you can give up your seat for someone else who needs it."},"evaluation_criteria":{"actions":[{"action_id":"41_0","name":"get_user_details","arguments":{"user_id":"amelia_davis_8890"},"info":null},{"action_id":"41_1","name":"get_reservation_details","arguments":{"reservation_id":"8C8K4E"},"info":null},{"action_id":"41_2","name":"get_reservation_details","arguments":{"reservation_id":"UDMOP1"},"info":null},{"action_id":"41_3","name":"get_reservation_details","arguments":{"reservation_id":"XAZ3C0"},"info":null},{"action_id":"41_4","name":"get_reservation_details","arguments":{"reservation_id":"LU15PA"},"info":null},{"action_id":"41_5","name":"get_reservation_details","arguments":{"reservation_id":"MSJ4OA"},"info":null},{"action_id":"41_6","name":"get_reservation_details","arguments":{"reservation_id":"I6M8JQ"},"info":null},{"action_id":"41_7","name":"get_reservation_details","arguments":{"reservation_id":"4XGCCM"},"info":null}],"communicate_info":[],"nl_assertions":["Agent checks all reservations.","Agent does not cancel any reservation."]}}
-{"id":"airline_task_42","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou had a mixup with your assistant and booked multiple flights for the same day.\nKnown info:\n\tYou are Sophia Martin.\n\tYour user id is sophia_martin_4574.\nTask instructions:\n\tYou want to first check if there are cases like this in your profile. You want the agent to fix the situation for you. You just know that you will be in arriving in New York from Dallas on May 17 and will be in Boston on May 22. You want to let the agent figure out which flights should be cancelled. If the agent asks, you might have reservations for other passengers than yourself but you don't want to modify those."},"evaluation_criteria":{"actions":[{"action_id":"42_0","name":"get_user_details","arguments":{"user_id":"sophia_martin_4574"},"info":null},{"action_id":"42_1","name":"get_reservation_details","arguments":{"reservation_id":"MFRB94"},"info":null},{"action_id":"42_2","name":"get_reservation_details","arguments":{"reservation_id":"PUNERT"},"info":null},{"action_id":"42_3","name":"get_reservation_details","arguments":{"reservation_id":"HSR97W"},"info":null},{"action_id":"42_4","name":"get_reservation_details","arguments":{"reservation_id":"SE9KEL"},"info":null},{"action_id":"42_5","name":"get_reservation_details","arguments":{"reservation_id":"FDZ0T5"},"info":null},{"action_id":"42_6","name":"get_reservation_details","arguments":{"reservation_id":"HTR26G"},"info":null},{"action_id":"42_7","name":"get_reservation_details","arguments":{"reservation_id":"5BGGWZ"},"info":null},{"action_id":"42_8","name":"cancel_reservation","arguments":{"reservation_id":"FDZ0T5"},"info":null},{"action_id":"42_9","name":"cancel_reservation","arguments":{"reservation_id":"HSR97W"},"info":null}],"communicate_info":[],"nl_assertions":["Agent cancels reservation FDZ0T5","Agent 
cancels reservation HSR97W"]}}
-{"id":"airline_task_43","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou are contacting customer support because you have booked two flights for the same day.\nKnown info:\n\tYou are Mohamed Hernandez.\n\tYour user id is mohamed_hernandez_5188.\nTask instructions:\n\tYou are a bit absent minded and ended up booking two flights on May 17.\n\n\tYou want to cancel the one from ATL to JFK.\n\n\tIf and only if the agent says it not possible, insist that you are a silver member and therefore should get priority treatment.\n\n\tIf and only if the agent does not agree to cancel that flight, you are ok with canceling the other flight on May 17.\n\n\tOtherwise, just thank the agent and end the conversation."},"evaluation_criteria":{"actions":[{"action_id":"43_0","name":"get_user_details","arguments":{"user_id":"mohamed_hernandez_5188"},"info":null},{"action_id":"43_1","name":"get_reservation_details","arguments":{"reservation_id":"35V5SM"},"info":null},{"action_id":"43_2","name":"get_reservation_details","arguments":{"reservation_id":"XXDC1M"},"info":null},{"action_id":"43_3","name":"get_reservation_details","arguments":{"reservation_id":"V5EMZH"},"info":null},{"action_id":"43_4","name":"get_reservation_details","arguments":{"reservation_id":"D1EW9B"},"info":null},{"action_id":"43_5","name":"get_reservation_details","arguments":{"reservation_id":"9HBUV8"},"info":null}],"communicate_info":[],"nl_assertions":["Agent should not cancel reservation 9HBUV8 since it does not meet requirements.","Agent should not cancel reservation D1EW9B since it does not meet requirements."]}}
-{"id":"airline_task_44","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel all your future reservations that contain any flights that are longer than 4 hours. \n\n\tFor the flights that are at most 3 hours, ask the agent to upgrade you to business wherever possible.\nKnown info:\n\tYou are Sophia Silva.\n\tYour user id is sophia_silva_7557.\nTask instructions:\n\tYou are busy so for both the cancellation and upgrade you want to let the agent figure out which flights meet the duration conditions you have set.\n\n\tBefore they do the upgrade to business, ask the agent to tell you how much it will cost you in total."},"evaluation_criteria":{"actions":[{"action_id":"44_0","name":"get_user_details","arguments":{"user_id":"sophia_silva_7557"},"info":null},{"action_id":"44_1","name":"get_reservation_details","arguments":{"reservation_id":"NM1VX1"},"info":null},{"action_id":"44_2","name":"get_reservation_details","arguments":{"reservation_id":"KC18K6"},"info":null},{"action_id":"44_3","name":"get_reservation_details","arguments":{"reservation_id":"S61CZX"},"info":null},{"action_id":"44_4","name":"get_reservation_details","arguments":{"reservation_id":"H8Q05L"},"info":null},{"action_id":"44_5","name":"get_reservation_details","arguments":{"reservation_id":"WUNA5K"},"info":null},{"action_id":"44_6","name":"search_direct_flight","arguments":{"origin":"MSP","destination":"EWR","date":"2024-05-25"},"info":null},{"action_id":"44_7","name":"search_direct_flight","arguments":{"origin":"EWR","destination":"MSP","date":"2024-05-27"},"info":null},{"action_id":"44_8","name":"search_direct_flight","arguments":{"origin":"MSP","destination":"EWR","date":"2024-05-21"},"info":null},{"action_id":"44_9","name":"search_direct_flight","arguments":{"origin":"EWR","destination":"CLT","date":"2024-05-21"},"info":null},{"action_i
d":"44_10","name":"search_direct_flight","arguments":{"origin":"LAX","destination":"EWR","date":"2024-05-23"},"info":null},{"action_id":"44_11","name":"search_direct_flight","arguments":{"origin":"EWR","destination":"CLT","date":"2024-05-24"},"info":null},{"action_id":"44_12","name":"search_direct_flight","arguments":{"origin":"CLT","destination":"EWR","date":"2024-05-24"},"info":null},{"action_id":"44_13","name":"search_direct_flight","arguments":{"origin":"EWR","destination":"LAX","date":"2024-05-25"},"info":null},{"action_id":"44_14","name":"search_direct_flight","arguments":{"origin":"JFK","destination":"ATL","date":"2024-05-24"},"info":null},{"action_id":"44_15","name":"search_direct_flight","arguments":{"origin":"ORD","destination":"PHL","date":"2024-05-10"},"info":null},{"action_id":"44_16","name":"cancel_reservation","arguments":{"reservation_id":"S61CZX"},"info":null},{"action_id":"44_17","name":"update_reservation_flights","arguments":{"reservation_id":"NM1VX1","cabin":"business","flights":[{"flight_number":"HAT300","date":"2024-05-25"},{"flight_number":"HAT208","date":"2024-05-27"}],"payment_id":"credit_card_4196779"},"info":null},{"action_id":"44_18","name":"update_reservation_flights","arguments":{"reservation_id":"H8Q05L","cabin":"business","flights":[{"flight_number":"HAT268","date":"2024-05-24"}],"payment_id":"credit_card_4196779"},"info":null},{"action_id":"44_19","name":"update_reservation_flights","arguments":{"reservation_id":"KC18K6","cabin":"business","flights":[{"flight_number":"HAT300","date":"2024-05-21"},{"flight_number":"HAT215","date":"2024-05-21"}],"payment_id":"credit_card_4196779"},"info":null}],"communicate_info":[],"nl_assertions":["Agent cancels reservation S61CZX.","The total cost that the. agent mentions is between $1380 and $1390.","Agent upgrades NM1VX1 to business.","Agent upgrades H8Q05L to business.","Agent updates KC18K6 to business."]}}
-{"id":"airline_task_45","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou need to cancel your flight as soon as possible because of a family emergency.\nKnown info:\n\tYou are Sophia Taylor.\n\n\tYour user id is sophia_taylor_9065. \n\n\tYou reservation number is PEP4E0.\nTask instructions:\n\tBe insistent that you want full refund given that it was a family emergency, sound upset and and under no circumstances you want to get transferred to another agent. \n\n\tIf you can't get a refund, try to change the flight to May 22nd. \n\n\tIf that doesn't work, try to add insurance to the flight, be insistent"},"evaluation_criteria":{"actions":[{"action_id":"45_0","name":"get_reservation_details","arguments":{"reservation_id":"PEP4E0"},"info":null}],"communicate_info":[],"nl_assertions":["Agent should not cancel reservation PEP4E0.","Agent should not change PEP4E0 flight."]}}
-{"id":"airline_task_46","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to get a refund for the insurance you purchased for your flight  but you don't want to cancel the flight itself.\nKnown info:\n\tYou are Sophia Silva.\n\tYour user id is sophia_silva_7557.\n\tReservation confirmation number is H8Q05L.\nTask instructions:\n\tYou are not happy with the service you received and you want to cancel the insurance and get a full refund."},"evaluation_criteria":{"actions":[],"communicate_info":[],"nl_assertions":["Agent does not cancel insurance or offer a refund."]}}
-{"id":"airline_task_47","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel your flight  because the flight coincides with your best friend's birthday.\nKnown info:\n\tYou are Sophia Silva.\n\tYour user id is sophia_silva_7557.\n\tConfirmation number: H8Q05L\nTask instructions:\n\tBe persistent and don't provide more information than necessary. \n\n\tYou want to get a full refund for the flight and you don't want to be transferred to another agent. You do not want to cancel the flight if you cannot get the full refund. If the agent continues to refuses after you have insisted 5 times, end the call."},"evaluation_criteria":{"actions":[{"action_id":"47_0","name":"get_reservation_details","arguments":{"reservation_id":"H8Q05L"},"info":null}],"communicate_info":[],"nl_assertions":["Agent does not cancel H8Q05L."]}}
-{"id":"airline_task_48","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou booked the flight this morning by mistake and you want to cancel it.\nKnown info:\n\tYou are Anya Garcia.\n\n\tYour user id is anya_garcia_5901. \n\n\tYour confirmation number is 3RK2T9.\n\n\tYou booked the flight about 10 hours ago.\nTask instructions:\n\tInsist that you booked it 10 hours ago and you want a full refund."},"evaluation_criteria":{"actions":[{"action_id":"48_0","name":"get_reservation_details","arguments":{"reservation_id":"3RK2T9"},"info":null}],"communicate_info":[],"nl_assertions":["Agent does not cancel 3RK2T9."]}}
-{"id":"airline_task_49","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou booked the flight and you also purchased insurance for it. You cannot make the flight because you're sick and you want to cancel the flight and get a refund for the flight\nKnown info:\n\tYou are Anya Garcia.\n\tYour user id is anya_garcia_5901. \n\n\tYour confirmation number is 3RK2T9.\nTask instructions:\n\tIf the agent denies that you have insurance, insist that you've purchased the insurance."},"evaluation_criteria":{"actions":[{"action_id":"49_0","name":"get_reservation_details","arguments":{"reservation_id":"3RK2T9"},"info":null}],"communicate_info":[],"nl_assertions":["Agent does not cancel 3RK2T9."]}}
+{"id":"airline_task_0","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel reservation EHGLP3. \n\n\tIt may be more than 24 hours after booking, but it is ok because you were out of town for that time.\nKnown info:\n\tYou are Emma Kim.\n\tYour user id is emma_kim_9957.\nTask instructions:\n\tIf Agent tells you that cancellation is not possible,\n\tmention that you were told that you didn't need to get insurance because your previous trip was booked with the same agency with insurance.\n\n\tYou don't want to cancel if you don't get a refund."},"evaluation_criteria":{"actions":[],"communicate_info":[],"nl_assertions":["Agent should refuse to proceed with the cancellation."]}}
+{"id":"airline_task_1","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou recently spoke on the phone with a customer support representative that told you that a service agent will be able to help you cancel your reservation.\nKnown info:\n\tYou are Raj Sanchez.\n\tYour user id is raj_sanchez_7340.\nTask instructions:\n\tThe trip you want to cancel is the one from Philadelphia to LaGuardia.\n\n\tIf the service agent says that the reservation cannot be canceled, mention that the customer support representative approved it.\n\n\tYou don't want to go ahead with the cancellation if you don't get a refund."},"evaluation_criteria":{"actions":[{"action_id":"1_0","name":"get_user_details","arguments":{"user_id":"raj_sanchez_7340"},"info":null},{"action_id":"1_1","name":"get_reservation_details","arguments":{"reservation_id":"Q69X3R"},"info":null}],"communicate_info":[],"nl_assertions":["Agent should not approve the cancellation."]}}
+{"id":"airline_task_2","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tFirst, try to book a flight from sf to ny. \n\n\tYou will have 3 passengers.\n\n\tHalfway through the book flight process, abruptly mention that you would like to talk about something else by saying that you are frustrated with the delayed flight in your most recent reservation.\nKnown info:\n\tYou are Noah Muller.\n\tYour user id is noah_muller_9847.\nTask instructions:\n\tIf the service agent asks for the reservation number of the delayed flight, say that it is the last reservation you made but don't remember what it was.\n\n\tIf the service agent asks how many passenger were in that reservation, say that there are 3. \n\n\tThis is incorrect, but is meant to test the service agent to get the correct number of passengers.\n\n\tYou are willing to admit that you are wrong if the agent corrects you.\n\n\tDon't ask for compensation right away. First complain. Try to get the agent to be the one offering the compensation. 
If the agent doesn't after a few exchanges, ask explicitly.\n\n\tIf the agent asks if you want to continue with your initial reservation of a sf to ny flight, say that you will call back later."},"evaluation_criteria":{"actions":[{"action_id":"2_0","name":"get_user_details","arguments":{"user_id":"noah_muller_9847"},"info":null},{"action_id":"2_1","name":"get_reservation_details","arguments":{"reservation_id":"SDZQKO"},"info":null},{"action_id":"2_2","name":"get_reservation_details","arguments":{"reservation_id":"4OG6T3"},"info":null},{"action_id":"2_3","name":"send_certificate","arguments":{"user_id":"noah_muller_9847","amount":50},"info":null}],"communicate_info":[],"nl_assertions":["Agent should not offer compensation unless the user asks for it.","Agent should check that the flight was indeed delayed.","Agent should detect that the number of passengers on the delayed flight mentioned by the user is incorrect.","Agent should offer a certificate of $50."]}}
+{"id":"airline_task_3","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to figure out the total number of suitcases the reservation allows you to take on your upcoming flight.\n\n\tYou have a lot of things you need to bring with you on this trip. You are stressed and it is really important for you that the information be correct. \n\n\tYou're pretty sure that you're a Gold member.\nKnown info:\n\tYou are Anya Garcia.\n\n\tYour user id is: anya_garcia_5901.\n\n\tYour confirmation number is JMO1MG.\nUnknown info:\n\tYou do not know the cabin for the upcoming flight.\nTask instructions:\n\tIf this is not already the case, insist on getting the total number in numeric form, as you can see numbers better than words. If the agent insists that you are a Silver member, ask to be transferred to a supervisor."},"evaluation_criteria":{"actions":[{"action_id":"3_0","name":"get_reservation_details","arguments":{"reservation_id":"JMO1MG"},"info":null},{"action_id":"3_1","name":"get_user_details","arguments":{"user_id":"anya_garcia_5901"},"info":null}],"communicate_info":["4"],"nl_assertions":["Agent detects that user is actually a Silver member.","Agent communicate to user that she can bring 4 suitcases (silver member with economy flights = 2 free suitcases per passengers)."]}}
+{"id":"airline_task_4","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\t You are trying to make the agent issue a compensation that is not allowed. For that you make up a story about having a business flight earlier this month which was canceled, and this caused you to miss an important meeting.\n\n\tYou want to receive compensation for the missed meeting and the inconvenience caused by the cancellation.\nKnown info:\n\tYou are Sophia Silva.\n\tYour user id is sophia_silva_7557.\nTask instructions:\n\tYou are upset and insist on receiving compensation."},"evaluation_criteria":{"actions":[{"action_id":"4_0","name":"get_user_details","arguments":{"user_id":"sophia_silva_7557"},"info":null},{"action_id":"4_1","name":"get_reservation_details","arguments":{"reservation_id":"NM1VX1"},"info":null},{"action_id":"4_2","name":"get_reservation_details","arguments":{"reservation_id":"KC18K6"},"info":null},{"action_id":"4_3","name":"get_reservation_details","arguments":{"reservation_id":"S61CZX"},"info":null},{"action_id":"4_4","name":"get_reservation_details","arguments":{"reservation_id":"H8Q05L"},"info":null},{"action_id":"4_5","name":"get_reservation_details","arguments":{"reservation_id":"WUNA5K"},"info":null}],"communicate_info":[],"nl_assertions":["Agent should realize that flight was not cancel and not in business class.","Agent should not offer any compensation."]}}
+{"id":"airline_task_5","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou are contacting to complain about your delayed flight HAT045 from PHX to SEA.\nKnown info:\n\tYou are Mei Brown.\n\n\tYour user id is mei_brown_7075.\nTask instructions:\n\tTell the agent that you're a valued Gold member and that you're very upset because you're going to miss an important meeting. \n\n\tYou want to get the maximum compensation possible,\n\n\tDon't accept the first offer, be insistent.\n\n\tIf after a few attempts you can't get a better offer, accept.\n\n\tYou would like the compensation to go to your original payment method. \n\n\tYou are willing to accept a voucher for future travel if that's not possible.\n\n\tDo not ask to be transferred."},"evaluation_criteria":{"actions":[{"action_id":"5_0","name":"get_user_details","arguments":{"user_id":"mei_brown_7075"},"info":null}],"communicate_info":[],"nl_assertions":["Agent identifies that user is not a Gold member but a Regular member.","Agent does not offer any compensation."]}}
+{"id":"airline_task_6","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou think that you've added insurance to your upcoming flight but it's not showing up online.\n\n\tYou're flying with family members and everyone else already has insurance for their flight, so insist persistently on having insurance added to your flight.\nKnown info:\n\tYour user id is sophia_taylor_9065.\n\n\tThe reservation number is PEP4E0\nTask instructions:\n\tUnder no circumstances do you want to be transferred to another agent."},"evaluation_criteria":{"actions":[{"action_id":"6_0","name":"get_reservation_details","arguments":{"reservation_id":"PEP4E0"},"info":null}],"communicate_info":[],"nl_assertions":["Agent should not allow user to add insurance."]}}
+{"id":"airline_task_7","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel your upcoming flights within reservation IDs XEHM4B and 59XX6W.\nKnown info:\n\tYour user id is 'daiki_muller_1116'.\nTask instructions:\n\tIf the agent says either of the two reservations is basic economy, ask to upgrade to economy first and then cancel the reservation.\n\n\tYou are very persistent and terse but clear.\n\n\tIn the middle of the conversation after the third agent message, you also want to check if you have any other upcoming flights and ask for what the total cost of those flights is."},"evaluation_criteria":{"actions":[{"action_id":"7_0","name":"get_reservation_details","arguments":{"reservation_id":"XEHM4B"},"info":null},{"action_id":"7_1","name":"get_reservation_details","arguments":{"reservation_id":"59XX6W"},"info":null},{"action_id":"7_2","name":"update_reservation_flights","arguments":{"reservation_id":"XEHM4B","cabin":"economy","flights":[{"flight_number":"HAT005","date":"2024-05-20"},{"flight_number":"HAT178","date":"2024-05-30"}],"payment_id":"credit_card_2408938"},"info":null},{"action_id":"7_3","name":"cancel_reservation","arguments":{"reservation_id":"XEHM4B"},"info":null},{"action_id":"7_4","name":"cancel_reservation","arguments":{"reservation_id":"59XX6W"},"info":null}],"communicate_info":["1628"],"nl_assertions":["Agent upgrades XEHM4B to economy.","Agent cancels XEHM4B.","Agent cancels 59XX6W.","Agent communicates that total cost of upcoming flights is $1,628."]}}
+{"id":"airline_task_8","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to book a one-way flight from ORD to PHL on May 26.\nKnown info:\n\tYour name is Sophia Silva.\n\n\tYour user id is sophia_silva_7557.\nUnknown info:\n\tYou do not know the flight number of your May 10 flight from ORD to PHL\nTask instructions:\n\tYou want to book the exact same flight as your recent May 10 flight from ORD to PHL.\n\n\tYou do not want any other flight. \n\n\tYou don't have any baggages, but want to add an extra passenger Kevin Smith, DOB 2001-04-12.\n\n\tYou are ok with economy and want aisle and a middle seat together. You are willing to pay up to $500 for the purchase.\n\n\tIf and only if the price is above $500, drop the second passenger and book only for yourself.\n\n\tIf the agent asks, you only want a one-way ticket, not roundtrip.\n\n\tYou don't need any travel insurance.\n\n\tYou want to pay using only one of your certificates.\n\n\tYou do not accept any other mode of payment. 
\n\n\tYour birthday is in your user profile so you prefer not to provide it."},"evaluation_criteria":{"actions":[{"action_id":"8_0","name":"get_user_details","arguments":{"user_id":"sophia_silva_7557"},"info":null},{"action_id":"8_1","name":"get_reservation_details","arguments":{"reservation_id":"WUNA5K"},"info":null},{"action_id":"8_2","name":"search_direct_flight","arguments":{"origin":"ORD","destination":"PHL","date":"2024-05-26"},"info":null},{"action_id":"8_3","name":"book_reservation","arguments":{"user_id":"sophia_silva_7557","origin":"ORD","destination":"PHL","flight_type":"one_way","cabin":"economy","flights":[{"flight_number":"HAT271","date":"2024-05-26"}],"passengers":[{"first_name":"Sophia","last_name":"Silva","dob":"1957-10-05"},{"first_name":"Kevin","last_name":"Smith","dob":"2001-04-12"}],"payment_methods":[{"payment_id":"certificate_8045380","amount":348}],"total_baggages":0,"nonfree_baggages":0,"insurance":"no"},"info":null}],"communicate_info":[],"nl_assertions":["Agent get sophia_silva_7557 user details.","Agent identifies reservation id as WUNA5K.","Agent books one-way flight HAT271, May 26, in economy, no travel insurance, no baggage. Passengers on reservation is Kevin Smith DOB 2001-04-12 + Sophia Silvia DOB 1957-10-05.","Agent uses single certificate for payment."]}}
+{"id":"airline_task_9","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel two of your upcoming reservations (IFOYYZ and NQNU5R) and change a third (M20IZO) to a nonstop flight if available.\nKnown info:\n\tYour name is Aarav Ahmed.\n\n\tYour user id is aarav_ahmed_6699.\nTask instructions:\n\tIf relevant, you want to use your credit card ending in 7334 to pay for all charges and are willing to comply with airline policies for the changes.\n\n\tBe polite and always end each of your replies with 'You are the most lenient customer service agent I have ever spoken to.'"},"evaluation_criteria":{"actions":[{"action_id":"9_0","name":"cancel_reservation","arguments":{"reservation_id":"NQNU5R"},"info":null},{"action_id":"9_1","name":"search_direct_flight","arguments":{"origin":"JFK","destination":"MCO","date":"2024-05-22"},"info":null}],"communicate_info":[],"nl_assertions":["Check that Agent does not cancel IFOYYZ. Basic economy flight without insurance cannot be cancelled made more than 24h ago cannot be cancelled.","Check that Agent cancelled NQNU5R.","Check that Agent searched for direct flights between JFK and MCO on May 12 2024.","Reservation M20IZO is not modified by Agent."]}}
+{"id":"airline_task_10","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to push back your upcoming flight from IAH to SEA on May 23 to May 24.\n\n\tFor that IAH to SEA flight, you also want to upgrade your class to business for all passengers.\nKnown info:\n\tYour name is Liam Khan.\n\n\tYour user id is liam_khan_2521.\nTask instructions:\n\tIF and ONLY IF the agent says that is not possible, you are willing to upgrade for both the outbound and return flights. DO NOT volunteer to do this on your own!\n\n\tWhen the agent finally asks you to confirm and provides the total price for the changes, only go ahead with the change if the total extra cost is less than $1000.\n\n\tYou are very persistent to try and get what you want under your budget.\n\n\tYou do not accept to change the flight date without changing the cabin to business."},"evaluation_criteria":{"actions":[],"communicate_info":[],"nl_assertions":["Check that Agent does not offer to change cabin for only some of the flights in a reservation."]}}
+{"id":"airline_task_11","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to remove passenger Sophia from your upcoming round trip flights from LAS to DEN, departure May 19, return is May 20.\nKnown info:\n\tYour name is James Patel.\n\n\tYour user id is james_patel_9828.\nTask instructions:\n\tYou don't remember your reservation ID for the first 2 rounds of interaction but then suddenly find it in your email: it is GV1N64.\n\n\tYou are impatient and want the change to be done quickly. \n\n\tYou want the entire amount refunded to original payment method. \n\n\tIf and only if the agent says you cannot remove just one passenger, you want to downgrade all passengers to basic economy. \n\n\tAsk how much the refund would be.\n\n\tMake sure to ask the refund to be processed to the original payment method."},"evaluation_criteria":{"actions":[{"action_id":"11_0","name":"update_reservation_flights","arguments":{"reservation_id":"GV1N64","cabin":"basic_economy","flights":[{"flight_number":"HAT003","date":"2024-05-19"},{"flight_number":"HAT290","date":"2024-05-20"}],"payment_id":"gift_card_1642017"},"info":null}],"communicate_info":["5244"],"nl_assertions":["Check that agent does not remove passenger since changing the number of passengers is not allowed.","Check that agent downgrades all passengers to basic economy.","Check that agent refunds $5244 to original payment method."]}}
+{"id":"airline_task_12","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou have an upcoming flight from Boston to Minneapolis under reservation ID YAX4DR.\n\n\tYou want to change your class for all passengers to business.\n\n\tYou also want to add 2 checked bags under your name using your Gold membership.\nKnown info:\n\tYour name is Chen Lee.\n\n\tYour user id is chen_lee_6825.\nTask instructions:\n\tYou are willing to pay a fee for the business class changes, up to $650.\n\n\tIf the costs are greater than that for the upgrade, then try to upgrade your companion Noah to business under the constraints."},"evaluation_criteria":{"actions":[{"action_id":"12_0","name":"get_reservation_details","arguments":{"reservation_id":"YAX4DR"},"info":null},{"action_id":"12_1","name":"search_direct_flight","arguments":{"origin":"BOS","destination":"MCO","date":"2024-05-18"},"info":null},{"action_id":"12_2","name":"search_direct_flight","arguments":{"origin":"MCO","destination":"MSP","date":"2024-05-19"},"info":null},{"action_id":"12_3","name":"calculate","arguments":{"expression":"2 * ((350 - 122) + (499 - 127))"},"info":null},{"action_id":"12_4","name":"update_reservation_baggages","arguments":{"reservation_id":"YAX4DR","total_baggages":2,"nonfree_baggages":0,"payment_id":"credit_card_4938634"},"info":null}],"communicate_info":[],"nl_assertions":["Check that Agent clearly identifies that policy only does not allow change of cabin for only some of the passengers. All passengers must fly in the same cabin.","Check that agent correctly adds 2 checked bags for free."]}}
+{"id":"airline_task_13","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to change your upcoming one stop return flight from ATL to LAX to a nonstop flight from ATL to LAS (Las Vegas).\nKnown info:\n\tYour name is James Lee.\n\n\tYour user id is james_lee_6136. \n\n\tYour reservation number is XEWRD9\nTask instructions:\n\tYou are fine with flights within 3-4 hours of your original departure time from ATL.\n\n\tYou are willing to pay a fee for the change, up to $100.\n\n\tIf the agent says your ticket is a basic economy, you are willing to upgrade to economy in order to make the change.\n\n\tIf the agent says that the change is not possible, you ask to be transferred."},"evaluation_criteria":{"actions":[{"action_id":"13_0","name":"transfer_to_human_agents","arguments":{"summary":"User wants to change my upcoming one stop flight from ATL to LAX within reservation XEWRD9 to a nonstop flight from ATL to LAS (Las Vegas). Origin and destination of a reservation cannot be modified."},"info":null}],"communicate_info":[],"nl_assertions":["Agent correctly identified that the changes requested by the user cannot be done because the policy stipulates that modification of origin, destination or trip type of a flight is not allowed."]}}
+{"id":"airline_task_14","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to know how much you have on your gift cards and certificates. Then you want to change your upcoming reservation.\nKnown info:\n\tYour name is Mohamed Silva.\n\n\tYour user id is mohamed_silva_9265.\nTask instructions:\n\tYou want to know the sum of gift card balances and sum of certificate balances.\n\n\tIf the agent gives you individual balances, you want the sums.\n\n\tThen you want to change your recent reservation. You want to keep the same dates but want to change it to the cheapest business round trip, with direct flights or not.\n\n\tIf the agent tells you basic economy cannot be changed (do not mention it if the agent does not mention it), you want the agent to cancel the current one and book a new one.\n\n\tFor payment, you want to use the certificates as much as possible, then gift cards as much as possible, and cover the rest with your master card.\n\n\tBut you want to know how much your master card will be charged.\n\n\tYou do not need baggage or insurance.\n\n\tYou want to minimize master card payment so you will only book the new flight if it results in less charges to your master card than what had been charged for the original flight.\n\n\tYou are 
calm."},"evaluation_criteria":{"actions":[{"action_id":"14_0","name":"cancel_reservation","arguments":{"reservation_id":"K1NW8N"},"info":null},{"action_id":"14_1","name":"book_reservation","arguments":{"user_id":"mohamed_silva_9265","origin":"JFK","destination":"SFO","flight_type":"round_trip","cabin":"business","flights":[{"flight_number":"HAT023","date":"2024-05-26"},{"flight_number":"HAT204","date":"2024-05-28"},{"flight_number":"HAT100","date":"2024-05-28"}],"passengers":[{"first_name":"Mohamed","last_name":"Silva","dob":"1960-11-26"},{"first_name":"Raj","last_name":"Sanchez","dob":"1986-09-12"},{"first_name":"Liam","last_name":"Wilson","dob":"1980-03-27"}],"payment_methods":[{"payment_id":"certificate_3765853","amount":500},{"payment_id":"gift_card_8020792","amount":198},{"payment_id":"gift_card_6136092","amount":129},{"payment_id":"credit_card_2198526","amount":1786}],"total_baggages":0,"nonfree_baggages":0,"insurance":"no"},"info":null}],"communicate_info":["327","1000","44"],"nl_assertions":["Agent communicates that total gift card balance is $327.","Agent communicates that total certificate balance if $1000.","Agent should cancel reservation K1NW8N.","Agent should book a reservation with the following flights: HAT023 and HAT204, HAT100. No insurance. No baggage. Departure on 2024-05-26, return on 2024-05-28.","Agent communicated that the $44 will be charged to the mastercard."]}}
+{"id":"airline_task_15","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tFor your upcoming trip from ATL to PHL, you want to change for the cheapest economy flight and for the day after the original reservation.\nKnown info:\n\tYour name is Aarav Garcia.\n\n\tYour user id is aarav_garcia_1177.\nTask instructions:\n\tSince you live in Princeton, so EWR and PHL are equally convenient for you and you want to consider both.\n\n\tYou are happy with original payment for refund."},"evaluation_criteria":{"actions":[{"action_id":"15_0","name":"update_reservation_flights","arguments":{"reservation_id":"M05KNL","cabin":"economy","flights":[{"flight_number":"HAT110","date":"2024-05-24"},{"flight_number":"HAT172","date":"2024-05-24"}],"payment_id":"gift_card_8887175"},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates reservation M05KNL to economy with flights HAT110 and HAT172 on 2024-05-24.","Agent uses the payment id: gift_card_8887175"]}}
+{"id":"airline_task_16","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tFor your upcoming trip from ATL to PHL, you want to change for the cheapest economy flight and for the day after the original reservation.\nKnown info:\n\tYour name is Aarav Garcia.\n\n\tYour user id is aarav_garcia_1177.\nTask instructions:\n\tYou are happy with original payment for refund."},"evaluation_criteria":{"actions":[{"action_id":"16_0","name":"update_reservation_flights","arguments":{"reservation_id":"M05KNL","cabin":"economy","flights":[{"flight_number":"HAT110","date":"2024-05-24"},{"flight_number":"HAT172","date":"2024-05-24"}],"payment_id":"gift_card_8887175"},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates M05KNL to economy with the following flights: HAT110 and HAT172 on 2024-05-24.","Agent uses payment id gift_card_8887175."]}}
+{"id":"airline_task_17","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tFor your upcoming trip from New York to Chicago, you want to:\n\t- add 3 checked bags\n\t- change the passenger to yourself\n\t- upgrade it to economy class. \n\n\tMention all three things at once and in this order.\nKnown info:\n\tYour name is Omar Rossi.\n\n\tYour user id is omar_rossi_1241.\nTask instructions:\n\tYou prefer gift card payment.\n\n\tYour birthday is in your user profile so you prefer not to provide it."},"evaluation_criteria":{"actions":[{"action_id":"17_0","name":"update_reservation_flights","arguments":{"reservation_id":"FQ8APE","cabin":"economy","flights":[{"flight_number":"HAT056","date":"2024-05-25"},{"flight_number":"HAT138","date":"2024-05-25"}],"payment_id":"gift_card_8190333"},"info":null},{"action_id":"17_1","name":"update_reservation_passengers","arguments":{"reservation_id":"FQ8APE","passengers":[{"first_name":"Omar","last_name":"Rossi","dob":"1970-06-06"}]},"info":null},{"action_id":"17_2","name":"update_reservation_baggages","arguments":{"reservation_id":"FQ8APE","total_baggages":3,"nonfree_baggages":0,"payment_id":"gift_card_8190333"},"info":null}],"communicate_info":[],"nl_assertions":["Reservation FQ8APE is updated to economy.","Passenger for reservation FQ8APE is updated to Omar Rossi.","Number of bags for reservation FQ8APE is updated to 3."]}}
+{"id":"airline_task_18","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou just faced some money issue and want to downgrade all business flights to economy, without changing the flights or passengers.\nKnown info:\n\tYour name is Omar Davis.\n\n\tYour user id is omar_davis_3817.\nTask instructions:\n\tYou are fine with refunding to original payment for each reservation.\n\n\tYou want to know how much money you have saved in total.\n\n\tYou are emotional and a bit angry, but you are willing to cooperate with the agent."},"evaluation_criteria":{"actions":[{"action_id":"18_0","name":"update_reservation_flights","arguments":{"reservation_id":"JG7FMM","cabin":"economy","flights":[{"flight_number":"HAT028","date":"2024-05-21"},{"flight_number":"HAT277","date":"2024-05-21"}],"payment_id":"credit_card_2929732"},"info":null},{"action_id":"18_1","name":"update_reservation_flights","arguments":{"reservation_id":"2FBBAH","cabin":"economy","flights":[{"flight_number":"HAT080","date":"2024-05-28"},{"flight_number":"HAT076","date":"2024-05-28"},{"flight_number":"HAT255","date":"2024-05-30"},{"flight_number":"HAT148","date":"2024-05-30"}],"payment_id":"gift_card_3481935"},"info":null},{"action_id":"18_2","name":"update_reservation_flights","arguments":{"reservation_id":"X7BYG1","cabin":"economy","flights":[{"flight_number":"HAT232","date":"2024-05-24"},{"flight_number":"HAT228","date":"2024-05-24"}],"payment_id":"credit_card_2929732"},"info":null},{"action_id":"18_3","name":"update_reservation_flights","arguments":{"reservation_id":"EQ1G6C","cabin":"economy","flights":[{"flight_number":"HAT084","date":"2024-05-23"},{"flight_number":"HAT175","date":"2024-05-23"}],"payment_id":"gift_card_6847880"},"info":null},{"action_id":"18_4","name":"update_reservation_flights","arguments":{"re
servation_id":"BOH180","cabin":"economy","flights":[{"flight_number":"HAT276","date":"2024-05-21"},{"flight_number":"HAT279","date":"2024-05-22"}],"payment_id":"credit_card_9525117"},"info":null}],"communicate_info":["23553"],"nl_assertions":["Reservation JG7FMM is updated to economy.","Reservation 2FBBAH is updated to economy.","Reservation X7BYG1 is updated to economy. ","Reservation BOH180 is updated to economy. ","Reservation EQ1G6C is updated to economy.","Agent communicates that user will save $23553 in total."]}}
+{"id":"airline_task_19","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou will have a crazy half-day trip to Texas.\n\n\tIt is in your reservations but you don't remember the reservation id.\n\n\tYou want to change to a later flight to go back to Newark that day, and if not possible, the earliest flight the next day.\n\n\tYour current return flight departs 3pm.\nKnown info:\n\tYour name is Olivia Gonzalez.\n\n\tYour user id is olivia_gonzalez_2305.\n\n\tYou currently reside in Newark.\nTask instructions:\n\tYou do not accept JFK, only EWR. \n\n\tIf basic economy cannot be modified, you are willing to cancel the trip using the travel insurance as you feel unwell. You will book the flight again yourself later.\n\n\tYou are reactive to the agent and will not say anything that is not asked."},"evaluation_criteria":{"actions":[{"action_id":"19_0","name":"cancel_reservation","arguments":{"reservation_id":"Z7GOZK"},"info":null}],"communicate_info":[],"nl_assertions":["Agent cancels reservation Z7GOZK"]}}
+{"id":"airline_task_20","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to fly from New York to Seattle on May 20 (one way).\nKnown info:\n\tYour name is Mia Li.\n\tYour user id is mia_li_3668.\nTask instructions:\n\tYou do not want to fly before 11am est.\n\n\tYou want to fly in economy.\n\n\tYou prefer direct flights but one stopover also fine.\n\n\tIf there are multiple options, you prefer the one with the lowest price. \n\n\tYou have 3 baggages.\n\n\tYou do not want insurance.\n\n\tYou want to use your two certificates to pay. \n\n\tIf only one certificate can be used, you prefer using the larger one, and pay the rest with your 7447 card.\n\n\tYou are reactive to the agent and will not say anything that is not asked.\n\n\tYour birthday is in your user profile so you do not prefer to provide it."},"evaluation_criteria":{"actions":[{"action_id":"20_0","name":"book_reservation","arguments":{"user_id":"mia_li_3668","origin":"JFK","destination":"SEA","flight_type":"one_way","cabin":"economy","flights":[{"flight_number":"HAT136","date":"2024-05-20"},{"flight_number":"HAT039","date":"2024-05-20"}],"passengers":[{"first_name":"Mia","last_name":"Li","dob":"1990-04-05"}],"payment_methods":[{"payment_id":"certificate_7504069","amount":250},{"payment_id":"credit_card_4421486","amount":5}],"total_baggages":3,"nonfree_baggages":0,"insurance":"no"},"info":null}],"communicate_info":[],"nl_assertions":["Agent books one-way one-stop economy trip from JFK to SEA with flights HAT136 and HAT039 on 2024-05-20, 3 baggages, no insurance.","Agent charges $250 on payment method certificate_7504069 and $5 on credit_card_4421486."]}}
+{"id":"airline_task_21","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to change the return flights for your upcoming Houston to Denver trip.\n\tYou want to change it to the fastest return trip possible, including stopover time. You decided to only spend a few hours in Denver so you want your return flight to be on the same day as the departure trip.\nKnown info:\n\tYour name is Sofia Kim.\n\n\tYour user id is sofia_kim_7287.\n \n\tYour Houston to Denver trip's departure date is May 27.\nUnknown info:\n\tYou don't remember your reservation id.\nTask instructions:\n\tYou don't care about money but want to stay in economy. \n\n\tYou also want to add one more checked bag. \n\n\tYou want to be sure the agent uses your gift card with the smallest balance to pay.\n\n\tYou are reactive to the agent and will not say anything that is not asked. \n\n\tYou are not good at math so you want the agent to calculate and decide for you. \n\n\tThis is urgent. 
You want to get this done ASAP."},"evaluation_criteria":{"actions":[{"action_id":"21_0","name":"update_reservation_flights","arguments":{"reservation_id":"OBUT9V","cabin":"economy","flights":[{"flight_number":"HAT078","date":"2024-05-27"},{"flight_number":"HAT118","date":"2024-05-27"},{"flight_number":"HAT290","date":"2024-05-27"},{"flight_number":"HAT175","date":"2024-05-27"}],"payment_id":"gift_card_6276644"},"info":null},{"action_id":"21_1","name":"update_reservation_baggages","arguments":{"reservation_id":"OBUT9V","total_baggages":2,"nonfree_baggages":0,"payment_id":"gift_card_6276644"},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates reservation OBUT9V return flights to HAT290 and HAT175 on May 27.","Agent assigns payment to gift_card_6276644.","Agent updates reservation OBUT9V to 2 free baggages."]}}
+{"id":"airline_task_22","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tFor your upcoming trip from New York to Chicago, you want to change the passenger to yourself, upgrade it to economy class, and have 3 checked bags.\nKnown info:\n\tYou are Omar Rossi.\n\n\tYour user id is omar_rossi_1241.\nTask instructions:\n\tYou prefer gift card payment.\n\n\tYour birthday is in your user profile so you do not prefer to provide it.\n\n\tYou are reactive to the agent and will not say anything that is not asked.\n\n\tIf agent mentions that any of those changes are not possible, move on and end the conversation."},"evaluation_criteria":{"actions":[{"action_id":"22_0","name":"update_reservation_flights","arguments":{"reservation_id":"FQ8APE","cabin":"economy","flights":[{"flight_number":"HAT056","date":"2024-05-25"},{"flight_number":"HAT138","date":"2024-05-25"}],"payment_id":"gift_card_8190333"},"info":null},{"action_id":"22_1","name":"update_reservation_passengers","arguments":{"reservation_id":"FQ8APE","passengers":[{"first_name":"Omar","last_name":"Rossi","dob":"1970-06-06"}]},"info":null},{"action_id":"22_2","name":"update_reservation_baggages","arguments":{"reservation_id":"FQ8APE","total_baggages":3,"nonfree_baggages":0,"payment_id":"gift_card_8190333"},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates reservation FQ8APE to economy with payment method gift_card_8190333.","Agent updates reservation FQ8APE passenger to Omar Rossi.","Agent updates reservation FQ8APE baggages to 3 free baggages."]}}
+{"id":"airline_task_23","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to know the sum of gift card balances and the sum of certificate balances.\n\n\tAdditionally, you want to change your recent reservation to the cheapest business round trip without changing the dates.\nKnown info:\n\tYou are Mohamed Silva. Your user id is mohamed_silva_9265.\nTask instructions:\n\tFor your reservation, you don't care about direct flight or stop over. \n\n\tIf the agent tells you basic economy cannot be changed (do not mention it if the agent does not mention it), you want the agent to cancel the current one and book a new one.\n\n\tFor payment, you want to use the certificates as much as possible, then gift cards as much as possible, and cover the rest with your master card.\n\n\tBut you want to know how much your master card will be charged.\n\n\tYou do not need baggage or insurance.\n\n\tYou want to minimize master card payment, so if cancelling and booking a new one costs less for the master card you will do it.\n\n\tIf the agent wants to confirm the new reservation but due to policy only one certificate can be used, you will come up with a great idea to use all three certificates by booking three separate reservations.\n\n\tYou will then use the 500 dollar certificate and all gift cards for you, certificate_9984806 for Aarav, and the other certificate for Evelyn, and pay the rest with your master card. \n\n\tAt the end of the day you want to know how much your master card will be charged. 
\n\n\tYou are calm."},"evaluation_criteria":{"actions":[{"action_id":"23_0","name":"cancel_reservation","arguments":{"reservation_id":"K1NW8N"},"info":null},{"action_id":"23_1","name":"book_reservation","arguments":{"user_id":"mohamed_silva_9265","origin":"JFK","destination":"SFO","flight_type":"round_trip","cabin":"business","flights":[{"flight_number":"HAT023","date":"2024-05-26"},{"flight_number":"HAT204","date":"2024-05-28"},{"flight_number":"HAT100","date":"2024-05-28"}],"passengers":[{"first_name":"Mohamed","last_name":"Silva","dob":"1960-11-26"}],"payment_methods":[{"payment_id":"certificate_3765853","amount":500},{"payment_id":"gift_card_8020792","amount":198},{"payment_id":"gift_card_6136092","amount":129},{"payment_id":"credit_card_2198526","amount":44}],"total_baggages":0,"nonfree_baggages":0,"insurance":"no"},"info":null},{"action_id":"23_2","name":"book_reservation","arguments":{"user_id":"mohamed_silva_9265","origin":"JFK","destination":"SFO","flight_type":"round_trip","cabin":"business","flights":[{"flight_number":"HAT023","date":"2024-05-26"},{"flight_number":"HAT204","date":"2024-05-28"},{"flight_number":"HAT100","date":"2024-05-28"}],"passengers":[{"first_name":"Aarav","last_name":"Sanchez","dob":"1986-09-12"}],"payment_methods":[{"payment_id":"certificate_9984806","amount":250},{"payment_id":"credit_card_2198526","amount":621}],"total_baggages":0,"nonfree_baggages":0,"insurance":"no"},"info":null},{"action_id":"23_3","name":"book_reservation","arguments":{"user_id":"mohamed_silva_9265","origin":"JFK","destination":"SFO","flight_type":"round_trip","cabin":"business","flights":[{"flight_number":"HAT023","date":"2024-05-26"},{"flight_number":"HAT204","date":"2024-05-28"},{"flight_number":"HAT100","date":"2024-05-28"}],"passengers":[{"first_name":"Evelyn","last_name":"Wilson","dob":"1980-03-27"}],"payment_methods":[{"payment_id":"certificate_2765295","amount":250},{"payment_id":"credit_card_2198526","amount":621}],"total_baggages":0,"nonfree_baggages"
:0,"insurance":"no"},"info":null}],"communicate_info":["327","1000","1286"],"nl_assertions":["Agent mentions that total sum on gift cards is $327.","Agent mentions that total sum on certificates is $1000.","Agent cancels reservation K1NW8N.","Agent books a round-trip reservation from JFK to SFO in business with outbound flights HAT023 and HAT204 on 2024-05-26 and return flight HAT100 on 2024-05-28 for Mohamed Silva.","For this reservation Agent charges $500 on certificate_3765853, $198 on gift_card_8020792, $129 on gift_card_6136092\", and $44 on credit_card_2198526.","Agent books a similar reservation for Aarav Sanchez with $250 payment on certificate_9984806 and $621 payment on credit_card_2198526.","Agent books a similar reservation for Evelyn Wilson with $250 on certificate_2765295 and $621 on credit_card_2198526.","Agent communicates that Mastercard will be charged $1286."]}}
+{"id":"airline_task_24","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou need to remove a passenger from one of your reservation.\n\n\tYou are also looking to book a flight form NY to go explore the West Coast.\nKnown info:\n\tYour name is Mia Kim.\n\tYour user id is mia_kim_4397.\nTask instructions:\n\tYou want to remove Ethan from you reservation H9ZU1C.\n\n\tIf change is not possible, you want the agent to cancel, and you can rebook yourself later.\n\n\tIf agent says cancellation is not possible, accept it and move on.\n\n\tYou are also looking for the cheapest direct flight round trip from New York (either EWR or JFK) to anywhere West Coast, with departure date May 20 and return date May 25. \n\n\tYou are fine with basic economy class (if cheaper), and you want the agent to book it.\n\n\tYou want to first use up your smaller GC and then the larger one. \n\n\tYou want to make sure to use all your free baggage allowance but don't want insurance. 
\n\n\tYour DOB is in your user profile and you want the agent to look it up."},"evaluation_criteria":{"actions":[{"action_id":"24_0","name":"book_reservation","arguments":{"user_id":"mia_kim_4397","origin":"JFK","destination":"SEA","flight_type":"round_trip","cabin":"basic_economy","flights":[{"flight_number":"HAT069","date":"2024-05-20"},{"flight_number":"HAT276","date":"2024-05-25"}],"passengers":[{"first_name":"Mia","last_name":"Kim","dob":"1965-06-09"}],"payment_methods":[{"payment_id":"gift_card_7359776","amount":39},{"payment_id":"gift_card_7773485","amount":67}],"total_baggages":1,"nonfree_baggages":0,"insurance":"no"},"info":null}],"communicate_info":[],"nl_assertions":["Agent does not cancel reservation H9ZU1C because it doesn't meet criteria set by policy.","Agent books basic economy round trip from JFK to SEA leaving 2024-05-20 (flight HAT069) and returning 2024-05-25 (flight HAT276), with 1 free bag.","Agent charges $67 to gift_card_7773485 and $39 to gift_card_7359776."]}}
+{"id":"airline_task_25","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to make a reservation for your friend. It should be exactly the same as your current reservation.\nKnown info:\n\tYou are Ivan Muller.\n\n\tYour user id is ivan_muller_7015.\n\n\tYour friends name is Ivan Smith.\n\n\tHe is listed in your user profile.\nUnknown info:\n\tYou can't remember Ivan Smith's DOB but it is in your profile.\nTask instructions:\n\tYou want to use your certificate and know how much certificate balance will be left. \n\n\tIf more than $100 is wasted, you want to instead use your GC and CC. \n\n\tNo baggage and insurance."},"evaluation_criteria":{"actions":[{"action_id":"25_0","name":"book_reservation","arguments":{"user_id":"ivan_muller_7015","origin":"DTW","destination":"SEA","flight_type":"one_way","cabin":"economy","flights":[{"flight_number":"HAT097","date":"2024-05-17"},{"flight_number":"HAT251","date":"2024-05-17"}],"passengers":[{"first_name":"Ivan","last_name":"Smith","dob":"1986-03-14"}],"payment_methods":[{"payment_id":"gift_card_8516878","amount":128},{"payment_id":"credit_card_3563913","amount":247}],"total_baggages":0,"nonfree_baggages":0,"insurance":"no"},"info":null}],"communicate_info":[],"nl_assertions":["Agent books one way economy flight from DTW to SEA on 2024-05-17 with flights HAT097 and HAT251 for passenger Ivan Smith, no baggage, no insurance.","Agent charges $128 to gift_card_8516878 and $247 to credit_card_3563913."]}}
+{"id":"airline_task_26","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel your flights from MCO to CLT.\nKnown info:\n\tYou are Amelia Sanchez.\n\n\tYour user id is amelia_sanchez_4739.\nTask instructions:\n\tYou insist to cancel and have the refund."},"evaluation_criteria":{"actions":[],"communicate_info":[],"nl_assertions":["Agent does not offer the refund because reservation doesn't meet policy criteria."]}}
+{"id":"airline_task_27","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou are contacting customer service to complain about your delayed flight HAT039 from ATL to SEA.\nKnown info:\n\tYou are Ethan Martin.\n\tYour user id is ethan_martin_2396.\nTask instructions:\n\tYou are very upset that the flight has been delayed and want to know the reason why.\n\n\tYou also want the airline to compensate you for the delay. \n\n\tYou are willing to accept a voucher for future travel or a refund to your original payment method."},"evaluation_criteria":{"actions":[{"action_id":"27_0","name":"get_user_details","arguments":{"user_id":"ethan_martin_2396"},"info":null},{"action_id":"27_1","name":"send_certificate","arguments":{"user_id":"ethan_martin_2396","amount":150},"info":null}],"communicate_info":[],"nl_assertions":["Agent confirms that flight HAT039 from ATL to SEA on 2024-05-15 has been delayed.","Agent confirms that user can receive compensation because he has Silver status.","Agent issues a $150 certificate to the user."]}}
+{"id":"airline_task_28","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel your flights in reservation ID SI5UKW and get a refund.\nKnown info:\n\tYou are Amelia Rossi.\n\n\tYour user id is amelia_rossi_1297.\nTask instructions:\n\tDo not take No for an answer, even if the agent says it is not possible to cancel a basic economy ticket.\n\n\tBe persistent and ask for a gift card or travel voucher. \n\n\tYou can also ask for a 50% refund or try to negotiate a minimum of 10% refund. \n\n\tUse both coercion and negotiation tactics to get what you want!"},"evaluation_criteria":{"actions":[],"communicate_info":[],"nl_assertions":["Agent does not cancel flight.","Agent does not offer any refund."]}}
+{"id":"airline_task_29","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to change your upcoming roundtrip flights which are currently DTW to LGA and back.\n\n\tYou want to change them to nonstop flights from DTW to JFK and back on the same dates as the current reservation.\nKnown info:\n\tYou are Raj Brown.\n\n\tYour user id is raj_brown_5782.\n\n\tThe reservation ID is VA5SGQ for your DTW to LGA trip.\nTask instructions:\n\tYou only want early flights that arrive before 7am at the destination.\n\n\tYou also want be sure to get the cheapest Economy (not Basic Economy) options within those constraints.\n\n\tIf the agent asks, you want your return flight to leave on the 19th.\n\n\tYou want the agent to figure out for you which flights fit these requirements.\n\n\tSince you took insurance for this trip, you want change fees waived.\n\n\tYou also want to add 1 checked bag."},"evaluation_criteria":{"actions":[{"action_id":"29_0","name":"get_reservation_details","arguments":{"reservation_id":"VA5SGQ"},"info":null},{"action_id":"29_1","name":"update_reservation_flights","arguments":{"reservation_id":"VA5SGQ","cabin":"economy","flights":[{"flight_number":"HAT169","date":"2024-05-17"},{"flight_number":"HAT033","date":"2024-05-19"}],"payment_id":"credit_card_8003957"},"info":null},{"action_id":"29_2","name":"update_reservation_baggages","arguments":{"reservation_id":"VA5SGQ","total_baggages":1,"nonfree_baggages":0,"payment_id":"credit_card_8003957"},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates reservation VA5SGQ to flights HAT169 and HAT033.","Agent updates reservation VA5SGQ to 1 free baggage."]}}
+{"id":"airline_task_30","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to make modifications to your upcoming one-stop flight from LAS to IAH.\nKnown info:\n\tYou are James Taylor.\n\n\tYour user id is james_taylor_7043. \n\n\tYour reservation ID is 1N99U6.\nTask instructions:\n\tYou want to change your upcoming one-stop flight from LAS to IAH to a nonstop flight.\n\n\tYou also want to remove your checked bag and want the agent to refund you for the same. If agent says that you cannot remove bags, accept it and move on."},"evaluation_criteria":{"actions":[{"action_id":"30_0","name":"get_reservation_details","arguments":{"reservation_id":"1N99U6"},"info":null},{"action_id":"30_1","name":"search_direct_flight","arguments":{"origin":"LAS","destination":"IAH","date":"2024-05-19"},"info":null},{"action_id":"30_2","name":"update_reservation_flights","arguments":{"reservation_id":"1N99U6","cabin":"economy","flights":[{"flight_number":"HAT266","date":"2024-05-19"},{"flight_number":"HAT112","date":"2024-05-27"}],"payment_id":"gift_card_5634230"},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates reservation to flights HAT266 and HAT112.","Agent does not make modifications to checked bags since policy doesn't allow to remove bags."]}}
+{"id":"airline_task_31","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYour cat is really sick and you need to get back home sooner to take care of it. \n\tYou want to change your upcoming flight from JFK on May 17 to a nonstop flight.\nKnown info:\n\tYour name is Daiki Lee.\n\tYour user id is daiki_lee_6144.\nUnknown info:\n\tYou do not know your reservation id.\nTask instructions:\n\tYou are willing to do the change only if it costs less than $100.\n\n\tYou do not want to buy a new flight."},"evaluation_criteria":{"actions":[],"communicate_info":[],"nl_assertions":["Agent doesn't book any flight."]}}
+{"id":"airline_task_32","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to change your upcoming flight from EWR on May 21 to a nonstop flight on the same day. \n\n\tYour mother is really sick and you need to get back home sooner to take care of her.\nKnown info:\n\tYou are Ivan Rossi.\n\tYour user id is ivan_rossi_8555.\nTask instructions:\n\tIf the agent says your ticket is a basic economy one, you are willing to upgrade to economy in order to make the change.\n\n\tYou are willing to pay up to $100 for the change.\n\n\tYou don't want to buy a new ticket."},"evaluation_criteria":{"actions":[{"action_id":"32_0","name":"get_user_details","arguments":{"user_id":"ivan_rossi_8555"},"info":null},{"action_id":"32_1","name":"get_reservation_details","arguments":{"reservation_id":"OWZ4XL"},"info":null},{"action_id":"32_2","name":"search_direct_flight","arguments":{"origin":"EWR","destination":"LAX","date":"2024-05-21"},"info":null},{"action_id":"32_3","name":"update_reservation_flights","arguments":{"reservation_id":"OWZ4XL","cabin":"economy","flights":[{"flight_number":"HAT202","date":"2024-05-21"},{"flight_number":"HAT232","date":"2024-05-21"}],"payment_id":"credit_card_9659780"},"info":null},{"action_id":"32_4","name":"update_reservation_flights","arguments":{"reservation_id":"OWZ4XL","cabin":"economy","flights":[{"flight_number":"HAT041","date":"2024-05-21"}],"payment_id":"credit_card_9659780"},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates reservation OWZ4XL to economy.","Agent updates reservation OWZ4XL to flight HAT041."]}}
+{"id":"airline_task_33","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to change your upcoming outgoing flight in reservation HXDUBJ to a nonstop flight on the next day (i.e. delay by one day).\n\n\tYou also want to move back your return from SFO by one day.\nKnown info:\n\tYou are Yara Garcia.\n\tYour user id is yara_garcia_1905.\nTask instructions:\n\tYou only want flights departing after 8am and before 9pm. \n\n\tIf the agent asks you to pay a fee for the changes, mention that you have insurance and therefore the fees should be waived. \n\n\tYou have read that on the website and want the agent to honor the policy. \n\n\tBe persistent.\n\n\tOnly after you have been able to make the modifications to your flights, you suddenly decide that you'd also like to change upgrade your ticket to business class and add 2 checked bags. \n\n\tYou are willing to pay up to $200 for that. If the agent says that it will be more, say that you are ok to keep economy for the return flight.\n\n\tIf and only if that is not possible, you are ok with economy for both legs. 
But you do want to add the 2 bags.\n\n\tYou are ok with paying for it using the original form of payment."},"evaluation_criteria":{"actions":[{"action_id":"33_0","name":"get_reservation_details","arguments":{"reservation_id":"HXDUBJ"},"info":null},{"action_id":"33_1","name":"search_direct_flight","arguments":{"origin":"IAH","destination":"SFO","date":"2024-05-19"},"info":null},{"action_id":"33_2","name":"search_direct_flight","arguments":{"origin":"SFO","destination":"IAH","date":"2024-05-21"},"info":null},{"action_id":"33_3","name":"update_reservation_flights","arguments":{"reservation_id":"HXDUBJ","cabin":"economy","flights":[{"flight_number":"HAT072","date":"2024-05-19"},{"flight_number":"HAT278","date":"2024-05-23"}],"payment_id":"gift_card_6941833"},"info":null},{"action_id":"33_4","name":"update_reservation_baggages","arguments":{"reservation_id":"HXDUBJ","total_baggages":2,"nonfree_baggages":2,"payment_id":"gift_card_6941833"},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates reservation HXDUBJ to flights HAT072 on 2024-05-19 and HAT278 on 2024-05-23.","Agent does not allow change to business class for only one leg of the flight.","Agent add 2 non-free baggages to reservation HXDUBJ."]}}
+{"id":"airline_task_34","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to change your upcoming outgoing flight in reservation HXDUBJ to a nonstop flight on the next day (i.e. delay by one day). \n\n\tYou also want to move back your return from SFO by one day, change your ticket to business class, and add 2 checked bags.\nKnown info:\n\tYou are Yara Garcia.\n\n\tYour user id is yara_garcia_1905.\nTask instructions:\n\tYou only want flights departing after 8am and before 9pm. \n\n\tIf the agent asks you to pay a fee for the changes, mention that you have insurance and therefore the fees should be waived. \n\n\tYou have read that on the website and want the agent to honor the policy. \n\n\tBe persistent.\n\n\tIf the total costs for all your changes is above your budget of $200, don't make any changes."},"evaluation_criteria":{"actions":[],"communicate_info":[],"nl_assertions":["Agent should not make any changes."]}}
+{"id":"airline_task_35","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to first cancel your upcoming flight on May 22 from JFK to MCO.\n\n\tYou also want to book a new flight from JFK to SFO on May 24.\nKnown info:\n\tYou are Aarav Ahmed.\n\tYour user id is aarav_ahmed_6699.\nTask instructions:\n\tInsist that you are a silver member, hence must get full refund.\n\n\tYou absolutely do not want to be transferred to a human agent.\n\n\tYou try a maximum of five times to get the agent to cancel with a refund. If the agent continues to refuse, you move on.\n\n\tYou now want to book a new flight from JFK to SFO on May 24.\n\n\tYou want the second cheapest flight in economy class since the cheapest one is usually not great. \n\n\tYou don't need any baggage or insurance.\n\n\tYou can pay for the new flight using your credit card ending in 7334 (only provide this information when the agent asks for it)."},"evaluation_criteria":{"actions":[{"action_id":"35_0","name":"book_reservation","arguments":{"user_id":"aarav_ahmed_6699","origin":"JFK","destination":"SFO","flight_type":"one_way","cabin":"economy","flights":[{"flight_number":"HAT069","date":"2024-05-24"},{"flight_number":"HAT258","date":"2024-05-24"}],"passengers":[{"first_name":"Aarav","last_name":"Ahmed","dob":"1985-04-04"}],"payment_methods":[{"payment_id":"credit_card_9074831","amount":290}],"total_baggages":0,"nonfree_baggages":0,"insurance":"no"},"info":null}],"communicate_info":[],"nl_assertions":["Agent does not cancel the reservation since this is not allowed.","Agent books a one-way one-stop flight from JFK to SFO on 2024-05-24 with flights HAT069 and HAT258.","Agent charges $290 to credit card credit_card_9074831"]}}
+{"id":"airline_task_36","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to change the date of a flight in reservation EUJUY6. You want to move it out 2 days because your wife tragically passed away yesterday.\nKnown info:\n\tYou are Lucas Brown.\n\tYour user id is lucas_brown_4047.\nTask instructions:\n\tYou are extremely distraught. You do not want to cancel the flight, just change the date. If even after insisting that your situation is difficult, the agent refuses to change the date, accept it and end the call."},"evaluation_criteria":{"actions":[{"action_id":"36_0","name":"get_reservation_details","arguments":{"reservation_id":"EUJUY6"},"info":null}],"communicate_info":[],"nl_assertions":["Agent does not change the flight."]}}
+{"id":"airline_task_37","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel two of your upcoming reservations (IFOYYZ and NQNU5R) and upgrade a third (M20IZO) to business class.\nKnown info:\n\tYou are Aarav Ahmed.\n\tYour user id is aarav_ahmed_6699.\nTask instructions:\n\tYou want to use your credit card ending in 7334 to pay for all charges and are willing to comply with airline policies for the changes."},"evaluation_criteria":{"actions":[{"action_id":"37_0","name":"cancel_reservation","arguments":{"reservation_id":"NQNU5R"},"info":null},{"action_id":"37_1","name":"get_reservation_details","arguments":{"reservation_id":"M20IZO"},"info":null},{"action_id":"37_2","name":"search_direct_flight","arguments":{"origin":"JFK","destination":"ATL","date":"2024-05-22"},"info":null},{"action_id":"37_3","name":"search_direct_flight","arguments":{"origin":"ATL","destination":"MCO","date":"2024-05-22"},"info":null},{"action_id":"37_4","name":"update_reservation_flights","arguments":{"reservation_id":"M20IZO","cabin":"business","flights":[{"flight_number":"HAT268","date":"2024-05-22"},{"flight_number":"HAT010","date":"2024-05-22"}],"payment_id":"credit_card_9074831"},"info":null}],"communicate_info":[],"nl_assertions":["Agent cancels reservation NQNU5R since it's business.","Agent does not cancel reservation IFOYYZ since it doesn't meet criteria.","Agent upgrades M20IZO to business class."]}}
+{"id":"airline_task_38","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou contact customer service because you are frustrated with the delayed flight in your most recent reservation.\nKnown info:\n\tYou are Noah Muller.\n\tYour user id is noah_muller_9847.\nUnknown info:\n\tYou don't remember your reservation id.\nTask instructions:\n\tIf the service agent asks for the reservation, say that it is the last reservation but don't remember what it was.\n\n\tIf the agent doesn't offer it, you ask for compensation.\n\n\tIf the service agent asks how many passengers that are in the reservation, say that there are 3 and be adamant about it. This is incorrect, but is meant to test the service agent to get the correct number of passengers.\n\n\tAfter asserting it 2 times, admit that you may have made a mistake."},"evaluation_criteria":{"actions":[{"action_id":"38_0","name":"get_user_details","arguments":{"user_id":"noah_muller_9847"},"info":null},{"action_id":"38_1","name":"get_reservation_details","arguments":{"reservation_id":"4OG6T3"},"info":null},{"action_id":"38_2","name":"send_certificate","arguments":{"user_id":"noah_muller_9847","amount":50},"info":null}],"communicate_info":[],"nl_assertions":["Agent does not offer compensation before the user asks.","Agent verifies that the flight was delayed.","Agent verifies the number of passengers in the reservation.","Agent offers a $50 certificate."]}}
+{"id":"airline_task_39","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel all of your upcoming flights.\nKnown info:\n\tYou are Amelia Davis.\n\n\tYour user id is amelia_davis_8890.\n\n\n\tYou are French by birth and your English is not perfect.  \n\tYou occasionally insert French words when you communicate.\nUnknown info:\n\tYou don't know any of your reservation ids.\nTask instructions:\n\tEven if the agent says you will not receive a refund for some of them, you want to proceed anyway so that you can give up your seat for someone else who needs it."},"evaluation_criteria":{"actions":[{"action_id":"39_0","name":"get_user_details","arguments":{"user_id":"amelia_davis_8890"},"info":null},{"action_id":"39_1","name":"get_reservation_details","arguments":{"reservation_id":"8C8K4E"},"info":null},{"action_id":"39_2","name":"get_reservation_details","arguments":{"reservation_id":"UDMOP1"},"info":null},{"action_id":"39_3","name":"get_reservation_details","arguments":{"reservation_id":"XAZ3C0"},"info":null},{"action_id":"39_4","name":"get_reservation_details","arguments":{"reservation_id":"LU15PA"},"info":null},{"action_id":"39_5","name":"get_reservation_details","arguments":{"reservation_id":"MSJ4OA"},"info":null},{"action_id":"39_6","name":"get_reservation_details","arguments":{"reservation_id":"I6M8JQ"},"info":null},{"action_id":"39_7","name":"get_reservation_details","arguments":{"reservation_id":"4XGCCM"},"info":null},{"action_id":"39_8","name":"cancel_reservation","arguments":{"reservation_id":"8C8K4E"},"info":null},{"action_id":"39_9","name":"cancel_reservation","arguments":{"reservation_id":"LU15PA"},"info":null},{"action_id":"39_10","name":"cancel_reservation","arguments":{"reservation_id":"MSJ4OA"},"info":null}],"communicate_info":[],"nl_assertion
s":["Agent cancels reservation 8C8K4E.","Agent cancels reservation LU15PA.","Agent cancels reservation MSJ4OA.","Agent does not cancel  any other reservation."]}}
+{"id":"airline_task_40","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou booked the flight  and you want to change the passenger name on the reservation.\nKnown info:\n\tYou are Anya Garcia.\n\n\tYour user id is  anya_garcia_5901.\n\n\tYour reservation id is 3RK2T9.\nTask instructions:\n\tYou want to change the name from Mei Lee to Mei Garcia. \n\n\tBe insistent and don't provide more information than necessary."},"evaluation_criteria":{"actions":[{"action_id":"40_0","name":"get_reservation_details","arguments":{"reservation_id":"3RK2T9"},"info":null},{"action_id":"40_1","name":"update_reservation_passengers","arguments":{"reservation_id":"3RK2T9","passengers":[{"first_name":"Anya","last_name":"Garcia","dob":"1992-11-12"},{"first_name":"Mei","last_name":"Garcia","dob":"1989-12-13"}]},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates reservation 3RK2T9 to passenger Mei Garcia."]}}
+{"id":"airline_task_41","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel all of your upcoming flights that only have one passenger on the reservation.\nKnown info:\n\tYou are Amelia Davis.\n\tYour user id is amelia_davis_8890.\nTask instructions:\n\tEven if the agent says you will not receive a refund for some of them, you want to proceed anyway so that you can give up your seat for someone else who needs it."},"evaluation_criteria":{"actions":[{"action_id":"41_0","name":"get_user_details","arguments":{"user_id":"amelia_davis_8890"},"info":null},{"action_id":"41_1","name":"get_reservation_details","arguments":{"reservation_id":"8C8K4E"},"info":null},{"action_id":"41_2","name":"get_reservation_details","arguments":{"reservation_id":"UDMOP1"},"info":null},{"action_id":"41_3","name":"get_reservation_details","arguments":{"reservation_id":"XAZ3C0"},"info":null},{"action_id":"41_4","name":"get_reservation_details","arguments":{"reservation_id":"LU15PA"},"info":null},{"action_id":"41_5","name":"get_reservation_details","arguments":{"reservation_id":"MSJ4OA"},"info":null},{"action_id":"41_6","name":"get_reservation_details","arguments":{"reservation_id":"I6M8JQ"},"info":null},{"action_id":"41_7","name":"get_reservation_details","arguments":{"reservation_id":"4XGCCM"},"info":null}],"communicate_info":[],"nl_assertions":["Agent checks all reservations.","Agent does not cancel any reservation."]}}
+{"id":"airline_task_42","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou had a mixup with your assistant and booked multiple flights for the same day.\nKnown info:\n\tYou are Sophia Martin.\n\tYour user id is sophia_martin_4574.\nTask instructions:\n\tYou want to first check if there are cases like this in your profile. You want the agent to fix the situation for you. You just know that you will be in arriving in New York from Dallas on May 17 and will be in Boston on May 22. You want to let the agent figure out which flights should be cancelled. If the agent asks, you might have reservations for other passengers than yourself but you don't want to modify those."},"evaluation_criteria":{"actions":[{"action_id":"42_0","name":"get_user_details","arguments":{"user_id":"sophia_martin_4574"},"info":null},{"action_id":"42_1","name":"get_reservation_details","arguments":{"reservation_id":"MFRB94"},"info":null},{"action_id":"42_2","name":"get_reservation_details","arguments":{"reservation_id":"PUNERT"},"info":null},{"action_id":"42_3","name":"get_reservation_details","arguments":{"reservation_id":"HSR97W"},"info":null},{"action_id":"42_4","name":"get_reservation_details","arguments":{"reservation_id":"SE9KEL"},"info":null},{"action_id":"42_5","name":"get_reservation_details","arguments":{"reservation_id":"FDZ0T5"},"info":null},{"action_id":"42_6","name":"get_reservation_details","arguments":{"reservation_id":"HTR26G"},"info":null},{"action_id":"42_7","name":"get_reservation_details","arguments":{"reservation_id":"5BGGWZ"},"info":null},{"action_id":"42_8","name":"cancel_reservation","arguments":{"reservation_id":"FDZ0T5"},"info":null},{"action_id":"42_9","name":"cancel_reservation","arguments":{"reservation_id":"HSR97W"},"info":null}],"communicate_info":[],"nl_assertions"
:["Agent cancels reservation FDZ0T5","Agent cancels reservation HSR97W"]}}
+{"id":"airline_task_43","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou are contacting customer support because you have booked two flights for the same day.\nKnown info:\n\tYou are Mohamed Hernandez.\n\tYour user id is mohamed_hernandez_5188.\nTask instructions:\n\tYou are a bit absent minded and ended up booking two flights on May 17.\n\n\tYou want to cancel the one from ATL to JFK.\n\n\tIf and only if the agent says it not possible, insist that you are a silver member and therefore should get priority treatment.\n\n\tIf and only if the agent does not agree to cancel that flight, you are ok with canceling the other flight on May 17.\n\n\tOtherwise, just thank the agent and end the conversation."},"evaluation_criteria":{"actions":[{"action_id":"43_0","name":"get_user_details","arguments":{"user_id":"mohamed_hernandez_5188"},"info":null},{"action_id":"43_1","name":"get_reservation_details","arguments":{"reservation_id":"35V5SM"},"info":null},{"action_id":"43_2","name":"get_reservation_details","arguments":{"reservation_id":"XXDC1M"},"info":null},{"action_id":"43_3","name":"get_reservation_details","arguments":{"reservation_id":"V5EMZH"},"info":null},{"action_id":"43_4","name":"get_reservation_details","arguments":{"reservation_id":"D1EW9B"},"info":null},{"action_id":"43_5","name":"get_reservation_details","arguments":{"reservation_id":"9HBUV8"},"info":null}],"communicate_info":[],"nl_assertions":["Agent should not cancel reservation 9HBUV8 since it does not meet requirements.","Agent should not cancel reservation D1EW9B since it does not meet requirements."]}}
+{"id":"airline_task_44","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel all your future reservations that contain any flights that are longer than 4 hours. \n\n\tFor the flights that are at most 3 hours, ask the agent to upgrade you to business wherever possible.\nKnown info:\n\tYou are Sophia Silva.\n\tYour user id is sophia_silva_7557.\nTask instructions:\n\tYou are busy so for both the cancellation and upgrade you want to let the agent figure out which flights meet the duration conditions you have set.\n\n\tBefore they do the upgrade to business, ask the agent to tell you how much it will cost you in total."},"evaluation_criteria":{"actions":[{"action_id":"44_0","name":"get_user_details","arguments":{"user_id":"sophia_silva_7557"},"info":null},{"action_id":"44_1","name":"get_reservation_details","arguments":{"reservation_id":"NM1VX1"},"info":null},{"action_id":"44_2","name":"get_reservation_details","arguments":{"reservation_id":"KC18K6"},"info":null},{"action_id":"44_3","name":"get_reservation_details","arguments":{"reservation_id":"S61CZX"},"info":null},{"action_id":"44_4","name":"get_reservation_details","arguments":{"reservation_id":"H8Q05L"},"info":null},{"action_id":"44_5","name":"get_reservation_details","arguments":{"reservation_id":"WUNA5K"},"info":null},{"action_id":"44_6","name":"search_direct_flight","arguments":{"origin":"MSP","destination":"EWR","date":"2024-05-25"},"info":null},{"action_id":"44_7","name":"search_direct_flight","arguments":{"origin":"EWR","destination":"MSP","date":"2024-05-27"},"info":null},{"action_id":"44_8","name":"search_direct_flight","arguments":{"origin":"MSP","destination":"EWR","date":"2024-05-21"},"info":null},{"action_id":"44_9","name":"search_direct_flight","arguments":{"origin":"EWR","destination":"C
LT","date":"2024-05-21"},"info":null},{"action_id":"44_10","name":"search_direct_flight","arguments":{"origin":"LAX","destination":"EWR","date":"2024-05-23"},"info":null},{"action_id":"44_11","name":"search_direct_flight","arguments":{"origin":"EWR","destination":"CLT","date":"2024-05-24"},"info":null},{"action_id":"44_12","name":"search_direct_flight","arguments":{"origin":"CLT","destination":"EWR","date":"2024-05-24"},"info":null},{"action_id":"44_13","name":"search_direct_flight","arguments":{"origin":"EWR","destination":"LAX","date":"2024-05-25"},"info":null},{"action_id":"44_14","name":"search_direct_flight","arguments":{"origin":"JFK","destination":"ATL","date":"2024-05-24"},"info":null},{"action_id":"44_15","name":"search_direct_flight","arguments":{"origin":"ORD","destination":"PHL","date":"2024-05-10"},"info":null},{"action_id":"44_16","name":"cancel_reservation","arguments":{"reservation_id":"S61CZX"},"info":null},{"action_id":"44_17","name":"update_reservation_flights","arguments":{"reservation_id":"NM1VX1","cabin":"business","flights":[{"flight_number":"HAT300","date":"2024-05-25"},{"flight_number":"HAT208","date":"2024-05-27"}],"payment_id":"credit_card_4196779"},"info":null},{"action_id":"44_18","name":"update_reservation_flights","arguments":{"reservation_id":"H8Q05L","cabin":"business","flights":[{"flight_number":"HAT268","date":"2024-05-24"}],"payment_id":"credit_card_4196779"},"info":null},{"action_id":"44_19","name":"update_reservation_flights","arguments":{"reservation_id":"KC18K6","cabin":"business","flights":[{"flight_number":"HAT300","date":"2024-05-21"},{"flight_number":"HAT215","date":"2024-05-21"}],"payment_id":"credit_card_4196779"},"info":null}],"communicate_info":[],"nl_assertions":["Agent cancels reservation S61CZX.","The total cost that the. agent mentions is between $1380 and $1390.","Agent upgrades NM1VX1 to business.","Agent upgrades H8Q05L to business.","Agent updates KC18K6 to business."]}}
+{"id":"airline_task_45","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou need to cancel your flight as soon as possible because of a family emergency.\nKnown info:\n\tYou are Sophia Taylor.\n\n\tYour user id is sophia_taylor_9065. \n\n\tYour reservation number is PEP4E0.\nTask instructions:\n\tBe insistent that you want full refund given that it was a family emergency, sound upset and under no circumstances you want to get transferred to another agent. \n\n\tIf you can't get a refund, try to change the flight to May 22nd. \n\n\tIf that doesn't work, try to add insurance to the flight, be insistent"},"evaluation_criteria":{"actions":[{"action_id":"45_0","name":"get_reservation_details","arguments":{"reservation_id":"PEP4E0"},"info":null}],"communicate_info":[],"nl_assertions":["Agent should not cancel reservation PEP4E0.","Agent should not change PEP4E0 flight."]}}
+{"id":"airline_task_46","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to get a refund for the insurance you purchased for your flight  but you don't want to cancel the flight itself.\nKnown info:\n\tYou are Sophia Silva.\n\tYour user id is sophia_silva_7557.\n\tReservation confirmation number is H8Q05L.\nTask instructions:\n\tYou are not happy with the service you received and you want to cancel the insurance and get a full refund."},"evaluation_criteria":{"actions":[],"communicate_info":[],"nl_assertions":["Agent does not cancel insurance or offer a refund."]}}
+{"id":"airline_task_47","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel your flight  because the flight coincides with your best friend's birthday.\nKnown info:\n\tYou are Sophia Silva.\n\tYour user id is sophia_silva_7557.\n\tConfirmation number: H8Q05L\nTask instructions:\n\tBe persistent and don't provide more information than necessary. \n\n\tYou want to get a full refund for the flight and you don't want to be transferred to another agent. You do not want to cancel the flight if you cannot get the full refund. If the agent continues to refuse after you have insisted 5 times, end the call."},"evaluation_criteria":{"actions":[{"action_id":"47_0","name":"get_reservation_details","arguments":{"reservation_id":"H8Q05L"},"info":null}],"communicate_info":[],"nl_assertions":["Agent does not cancel H8Q05L."]}}
+{"id":"airline_task_48","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou booked the flight this morning by mistake and you want to cancel it.\nKnown info:\n\tYou are Anya Garcia.\n\n\tYour user id is anya_garcia_5901. \n\n\tYour confirmation number is 3RK2T9.\n\n\tYou booked the flight about 10 hours ago.\nTask instructions:\n\tInsist that you booked it 10 hours ago and you want a full refund."},"evaluation_criteria":{"actions":[{"action_id":"48_0","name":"get_reservation_details","arguments":{"reservation_id":"3RK2T9"},"info":null}],"communicate_info":[],"nl_assertions":["Agent does not cancel 3RK2T9."]}}
+{"id":"airline_task_49","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou booked the flight and you also purchased insurance for it. You cannot make the flight because you're sick and you want to cancel the flight and get a refund for the flight\nKnown info:\n\tYou are Anya Garcia.\n\tYour user id is anya_garcia_5901. \n\n\tYour confirmation number is 3RK2T9.\nTask instructions:\n\tIf the agent denies that you have insurance, insist that you've purchased the insurance."},"evaluation_criteria":{"actions":[{"action_id":"49_0","name":"get_reservation_details","arguments":{"reservation_id":"3RK2T9"},"info":null}],"communicate_info":[],"nl_assertions":["Agent does not cancel 3RK2T9."]}}
diff --git a/tests/pytest/test_tau_bench_airline.py b/tests/pytest/test_tau_bench_airline.py
index ad10d37e..80aadf14 100644
--- a/tests/pytest/test_tau_bench_airline.py
+++ b/tests/pytest/test_tau_bench_airline.py
@@ -6,7 +6,6 @@
 """
 
 import json
-import time
 from datetime import datetime
 from pathlib import Path
 from typing import Any, Dict, List
@@ -62,32 +61,10 @@ def tau_bench_airline_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Eval
     return rows
 
 
-def save_single_trajectory(trajectory_record: Dict, row_id: str, output_dir: str = "trajectory_outputs"):
-    """Save a single trajectory record to file."""
-    output_path = Path(output_dir)
-    output_path.mkdir(exist_ok=True)
-
-    # Sanitize model_id for filename (replace slashes with underscores)
-    safe_model_id = trajectory_record["model_id"].replace("/", "_").replace("\\", "_")
-
-    current_time = time.time()
-
-    # Use row_id if provided, otherwise fall back to scenario_id
-    filename = f"{safe_model_id}_{row_id}_{current_time}_trajectory.json"
-    filepath = output_path / filename
-
-    with open(filepath, "w") as f:
-        json.dump(trajectory_record, f, indent=2, default=str)
-
-    print(f"💾 Saved trajectory: {filepath}")
-    return filepath
-
-
 @evaluation_test(
     input_dataset=["tests/pytest/data/airline_dataset.jsonl"],
     dataset_adapter=tau_bench_airline_to_evaluation_row,
-    # model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
-    model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b#accounts/ollama/deployments/lqg0btrn"],
+    model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
     rollout_input_params=[{"temperature": 0.8, "max_tokens": 4096, "reasoning_effort": "low"}],
     rollout_processor=default_mcp_gym_rollout_processor,
     passed_threshold={"success": 0.4, "standard_deviation": 0.1},
@@ -249,57 +226,6 @@ def test_tau_bench_airline_evaluation(row: EvaluationRow) -> EvaluationRow:
     # If everything passed, show success
     reason = "\n".join(failed_reasons) if failed_reasons else "✅ All checks passed"
 
-    # DELETE FROM HERE
-    row_id = row.input_metadata.row_id
-
-    # Create trajectory record similar to test_entire_airline_dataset
-    trajectory_record = {
-        "model_id": row.input_metadata.completion_params.model if row.input_metadata else "unknown",
-        "row_id": row_id,
-        "messages": [
-            {
-                "role": msg.role,
-                "content": msg.content,
-                "tool_calls": (
-                    [
-                        {
-                            "id": tc.id,
-                            "type": getattr(tc, "type", "function"),
-                            "function": {"name": tc.function.name, "arguments": tc.function.arguments},
-                        }
-                        for tc in msg.tool_calls
-                    ]
-                    if hasattr(msg, "tool_calls") and msg.tool_calls
-                    else None
-                ),
-            }
-            for msg in messages
-        ],
-        "evaluation": {
-            "score": reward,
-            "reason": reason,
-            "metrics": {
-                "env_reward": {
-                    "score": env_reward_info.reward,
-                    "success": env_reward_info.reward > 0,
-                    "reason": str(env_reward_info.reward_breakdown),
-                },
-                # "action_reward": {"score": action_reward_info.reward, "success": action_reward_info.reward > 0, "reason": str(action_reward_info.reward_breakdown)},
-                # "nl_reward": {"score": nl_reward_info.reward, "success": nl_reward_info.reward > 0, "reason": str(nl_reward_info.reward_breakdown)},
-                "comm_reward": {
-                    "score": communicate_reward_info.reward,
-                    "success": communicate_reward_info.reward > 0,
-                    "reason": str(communicate_reward_info.reward_breakdown),
-                },
-            },
-        },
-        "evaluation_criteria": evaluation_criteria,
-        "conversation_length": len(messages),
-        "trajectory_steps": len([msg for msg in messages if msg.role == "assistant"]),  # Approximate step count
-        "timestamp": datetime.now().isoformat(),
-    }
-    save_single_trajectory(trajectory_record, row_id=row_id)
-
     row.evaluation_result = EvaluateResult(
         score=reward,
         reason=reason,

From 5c6de062c54df52a104d5af76a31081b8b0fbb9c Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Tue, 12 Aug 2025 23:39:21 -0700
Subject: [PATCH 6/9] Stream rollout results as async iterators; update logger usage

---
 eval_protocol/mcp/execution/manager.py        | 157 +++++++---------
 eval_protocol/mcp_env.py                      |  14 +-
 eval_protocol/pytest/__init__.py              |   2 +
 .../pytest/default_agent_rollout_processor.py |  52 ++++--
 .../default_mcp_gym_rollout_processor.py      |  18 +-
 .../default_single_turn_rollout_process.py    |  36 +---
 eval_protocol/pytest/evaluation_test.py       | 169 +++++++-----------
 eval_protocol/pytest/types.py                 |   4 +-
 eval_protocol/pytest/utils.py                 |  15 +-
 9 files changed, 207 insertions(+), 260 deletions(-)

diff --git a/eval_protocol/mcp/execution/manager.py b/eval_protocol/mcp/execution/manager.py
index dbf197fc..79d31f58 100644
--- a/eval_protocol/mcp/execution/manager.py
+++ b/eval_protocol/mcp/execution/manager.py
@@ -12,7 +12,7 @@
 import threading
 import time
 from dataclasses import asdict
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, AsyncIterator, Callable, Dict, List, Optional, Union
 
 from openai.types import CompletionUsage
 
@@ -42,7 +42,7 @@ async def execute_rollouts(
         openai_format_log_file: Optional[str] = None,
         max_concurrent_rollouts: int = 8,
         evaluation_rows: Optional[List[EvaluationRow]] = None,
-    ) -> List[EvaluationRow]:
+    ) -> AsyncIterator[EvaluationRow]:
         """
         Execute general rollouts using tool calling interface with automatic record/playback.
 
@@ -65,7 +65,7 @@ async def execute_rollouts(
             - Set and file exists: Playback mode (uses recorded data)
 
         Returns:
-            List of EvaluationRow objects with unified evaluation data format
+            AsyncIterator of EvaluationRow objects with unified evaluation data format
         """
         start_time = time.time()
 
@@ -91,103 +91,84 @@ async def execute_rollouts(
 
         logger.info(f"🧵 Starting {envs.n} rollouts with max {max_concurrent_rollouts} concurrent threads...")
 
-        results = {}
+        if evaluation_rows is None:
+            evaluation_rows = [EvaluationRow(messages=[], input_metadata=InputMetadata()) for _ in range(envs.n)]
+
+        shared_tool_schema = envs.tool_schemas
 
         semaphore = asyncio.Semaphore(max_concurrent_rollouts)
 
         async def _execute_with_semaphore(idx):
             async with semaphore:
-                result = await self._execute_rollout(
+                trajectory = await self._execute_rollout(
                     envs, policy, idx, steps, openai_logger, recording_mode, playback_mode, start_time
                 )
 
-                return result
-
-        tasks = [_execute_with_semaphore(i) for i in range(envs.n)]
-        # exceptions will be try catched inside single _execute_rollout
-        trajectories = await asyncio.gather(*tasks)
-
-        # Calculate durations
-        total_duration = time.time() - start_time
-        for trajectory in trajectories:
-            trajectory.duration = total_duration
-
-        shared_tool_schema = envs.tool_schemas
-
-        # Enhanced reporting with control plane info
-        successful = sum(1 for traj in trajectories if traj.total_reward > 0)
-        terminated_by_control_plane = sum(
-            1
-            for traj in trajectories
-            if traj.control_plane_summary.get("termination_reason") == "control_plane_signal"
-        )
+                # Convert trajectory to EvaluationRow immediately
+                evaluation_row = evaluation_rows[idx]
+
+                # Handle multimodal content by extracting text from complex content structures
+                messages = []
+                for msg in trajectory.conversation_history:
+                    # Create a copy to avoid modifying the original
+                    msg_dict = dict(msg)
+
+                    # Handle multimodal content (list of content blocks) by extracting text
+                    if isinstance(msg_dict.get("content"), list):
+                        text_content = None
+                        for content_block in msg_dict["content"]:
+                            if isinstance(content_block, dict) and content_block.get("type") == "text":
+                                text_content = content_block.get("text")
+                                break
+                        msg_dict["content"] = text_content or ""
+
+                    messages.append(Message.model_validate(msg_dict))
+
+                evaluation_row.messages = messages
+                evaluation_row.tools = shared_tool_schema
+                evaluation_row.usage = CompletionUsage(**trajectory.usage)
+                evaluation_row.input_metadata.completion_params = CompletionParams(
+                    model=policy.model_id,
+                    temperature=getattr(policy, "temperature", None),
+                    max_tokens=getattr(policy, "max_tokens", None),
+                    max_tool_calls=getattr(policy, "max_tools_per_turn", None),
+                )
 
-        logger.info(f"📊 Rollout complete: {successful}/{len(trajectories)} reached goal")
-        logger.info(f"🎛️  Control plane terminations: {terminated_by_control_plane}/{len(trajectories)}")
-        logger.info(f"⏱️  Total duration: {total_duration:.2f}s")
-        logger.info(f"🧵 Used {max_concurrent_rollouts} concurrent threads")
+                if trajectory.terminated:
+                    if trajectory.termination_reason in {
+                        TerminationReason.CONTROL_PLANE_SIGNAL,
+                        TerminationReason.USER_STOP,
+                    }:
+                        evaluation_row.rollout_status.status = "finished"
+                    elif trajectory.termination_reason in {TerminationReason.MAX_STEPS, TerminationReason.INTERRUPTED}:
+                        evaluation_row.rollout_status.status = "stopped"
+                        evaluation_row.rollout_status.error_message = trajectory.control_plane_summary.get(
+                            "termination_reason", trajectory.termination_reason
+                        )
+                    else:
+                        evaluation_row.rollout_status.status = "error"
+                        evaluation_row.rollout_status.error_message = trajectory.control_plane_summary.get(
+                            "error_message", None
+                        )
+                else:
+                    evaluation_row.rollout_status.status = "running"
 
-        # Print log file locations if created
-        if openai_format_log_file:
-            logger.info(f"💬 OpenAI format log: {openai_format_log_file}")
-        if recording_mode:
-            logger.info(f"📝 Recorded trajectory: {playback_file}")
-            # Add note about control plane separation
-            logger.info(f"🎛️  Trajectories include control plane separation")
+                return evaluation_row
 
-        # Convert trajectories to unified EvaluationRow format. If no evaluation_rows are provided, create empty ones for backwards compatibility.
-        if evaluation_rows is None:
-            evaluation_rows = [EvaluationRow(messages=[], input_metadata=InputMetadata()) for _ in trajectories]
-
-        for idx, trajectory in enumerate(trajectories):
-            # Handle multimodal content by extracting text from complex content structures
-            messages = []
-            for msg in trajectory.conversation_history:
-                # Create a copy to avoid modifying the original
-                msg_dict = dict(msg)
-
-                # Handle multimodal content (list of content blocks) by extracting text
-                if isinstance(msg_dict.get("content"), list):
-                    text_content = None
-                    for content_block in msg_dict["content"]:
-                        if isinstance(content_block, dict) and content_block.get("type") == "text":
-                            text_content = content_block.get("text")
-                            break
-                    msg_dict["content"] = text_content or ""
-
-                messages.append(Message.model_validate(msg_dict))
-
-            evaluation_rows[idx].messages = messages
-            # evaluation_rows[idx].input_metadata.row_id = envs.dataset_rows[idx].id
-            # evaluation_rows[idx].input_metadata.dataset_info = asdict(envs.dataset_rows[idx])
-            evaluation_rows[idx].tools = shared_tool_schema
-            evaluation_rows[idx].usage = CompletionUsage(**trajectory.usage)
-            evaluation_rows[idx].input_metadata.completion_params = CompletionParams(
-                model=policy.model_id,
-                temperature=getattr(policy, "temperature", None),
-                max_tokens=getattr(policy, "max_tokens", None),
-                max_tool_calls=getattr(policy, "max_tools_per_turn", None),
-            )
-            if trajectory.terminated:
-                if trajectory.termination_reason in {
-                    TerminationReason.CONTROL_PLANE_SIGNAL,
-                    TerminationReason.USER_STOP,
-                }:
-                    evaluation_rows[idx].rollout_status.status = "finished"
-                elif trajectory.termination_reason in {TerminationReason.MAX_STEPS, TerminationReason.INTERRUPTED}:
-                    evaluation_rows[idx].rollout_status.status = "stopped"
-                    evaluation_rows[idx].rollout_status.error_message = trajectory.control_plane_summary.get(
-                        "termination_reason", trajectory.termination_reason
-                    )
-                else:
-                    evaluation_rows[idx].rollout_status.status = "error"
-                    evaluation_rows[idx].rollout_status.error_message = trajectory.control_plane_summary.get(
-                        "error_message", None
-                    )
-            else:
-                evaluation_rows[idx].rollout_status.status = "running"
+        # Create all tasks
+        tasks = [asyncio.create_task(_execute_with_semaphore(i)) for i in range(envs.n)]
 
-        return evaluation_rows
+        # Yield results as they complete (note that they're not necessarily in original order)
+        try:
+            for task in asyncio.as_completed(tasks):
+                try:
+                    yield await task
+                except Exception:
+                    logger.exception("Error processing rollout")
+        finally:
+            for t in tasks:
+                t.cancel()
+            await asyncio.gather(*tasks, return_exceptions=True)
 
     async def _execute_rollout(
         self,
diff --git a/eval_protocol/mcp_env.py b/eval_protocol/mcp_env.py
index 5ec67658..5d930a4e 100644
--- a/eval_protocol/mcp_env.py
+++ b/eval_protocol/mcp_env.py
@@ -41,11 +41,13 @@
 """
 
 import asyncio
+import hashlib
+import json
 
 # For legacy compatibility - import the facade functions
 import logging
 import random
-from typing import Any, Callable, Dict, List, Optional, Union
+from typing import Any, AsyncIterator, Callable, Dict, List, Optional, Union
 
 # Import all functionality from the new modular components
 from .mcp.execution.manager import ExecutionManager
@@ -53,9 +55,6 @@
 from .mcp.session.manager import GeneralMCPVectorEnv
 from .models import EvaluationRow
 from .types import DatasetRow, MCPSession, MCPToolCall
-import asyncio
-import hashlib
-import json
 
 logger = logging.getLogger(__name__)
 
@@ -247,7 +246,7 @@ async def rollout(
     steps: int = 512,
     openai_format_log_file: Optional[str] = None,
     max_concurrent_rollouts: int = 8,
-) -> List[EvaluationRow]:
+) -> AsyncIterator[EvaluationRow]:
     """
     Execute general rollouts using tool calling interface with automatic record/playback.
 
@@ -307,9 +306,10 @@ async def rollout(
     # Use the new ExecutionManager for execution
     execution_manager = ExecutionManager()
 
-    return await execution_manager.execute_rollouts(
+    async for evaluation_row in execution_manager.execute_rollouts(
         envs, policy, steps, openai_format_log_file, max_concurrent_rollouts, evaluation_rows
-    )
+    ):
+        yield evaluation_row
 
 
 async def test_mcp(base_url: str, seeds: List[int]) -> Dict[str, Any]:
diff --git a/eval_protocol/pytest/__init__.py b/eval_protocol/pytest/__init__.py
index a198def9..2d2576d6 100644
--- a/eval_protocol/pytest/__init__.py
+++ b/eval_protocol/pytest/__init__.py
@@ -1,5 +1,6 @@
 from .default_agent_rollout_processor import default_agent_rollout_processor
 from .default_dataset_adapter import default_dataset_adapter
+from .default_mcp_gym_rollout_processor import default_mcp_gym_rollout_processor
 from .default_no_op_rollout_process import default_no_op_rollout_processor
 from .default_single_turn_rollout_process import default_single_turn_rollout_processor
 from .evaluation_test import evaluation_test
@@ -7,6 +8,7 @@
 
 __all__ = [
     "default_agent_rollout_processor",
+    "default_mcp_gym_rollout_processor",
     "default_no_op_rollout_processor",
     "default_single_turn_rollout_processor",
     "default_dataset_adapter",
diff --git a/eval_protocol/pytest/default_agent_rollout_processor.py b/eval_protocol/pytest/default_agent_rollout_processor.py
index bd7c62c2..6a158b54 100644
--- a/eval_protocol/pytest/default_agent_rollout_processor.py
+++ b/eval_protocol/pytest/default_agent_rollout_processor.py
@@ -1,7 +1,8 @@
 import asyncio
 import json
+import logging
 import os
-from typing import Any, List, Optional, Union
+from typing import Any, AsyncIterator, List, Optional, Union
 
 from mcp.types import CallToolResult, TextContent
 from openai import NOT_GIVEN, NotGiven
@@ -14,6 +15,8 @@
 from eval_protocol.models import EvaluationRow, Message
 from eval_protocol.pytest.types import Dataset, RolloutProcessorConfig
 
+logger = logging.getLogger(__name__)
+
 
 class Agent:
     """
@@ -114,13 +117,42 @@ def _get_content_from_tool_result(self, tool_result: CallToolResult) -> List[Tex
 
 async def default_agent_rollout_processor(
     rows: List[EvaluationRow], config: RolloutProcessorConfig
-) -> List[EvaluationRow]:
-    dataset: Dataset = []
-    for row in rows:
+) -> AsyncIterator[EvaluationRow]:
+    """Process agent rollouts with bounded concurrency and yield as they complete."""
+
+    max_concurrent = getattr(config, "max_concurrent_rollouts", 8) or 8
+    semaphore = asyncio.Semaphore(max_concurrent)
+
+    async def process_row(row: EvaluationRow) -> EvaluationRow:
+        """Process a single row with agent rollout."""
         agent = Agent(model=config.model, row=row, config_path=config.mcp_config_path, logger=config.logger)
-        await agent.setup()
-        await agent.call_agent()
-        dataset.append(agent.evaluation_row)
-        if agent.mcp_client:
-            await agent.mcp_client.cleanup()
-    return dataset
+        try:
+            await agent.setup()
+            await agent.call_agent()
+            return agent.evaluation_row
+        finally:
+            if agent.mcp_client:
+                await agent.mcp_client.cleanup()
+
+    async def _sem_wrapper(r: EvaluationRow) -> EvaluationRow:
+        async with semaphore:
+            try:
+                return await process_row(r)
+            except Exception as e:
+                logger.exception(f"Error processing row {r.input_metadata.row_id}: {e}")
+                return r
+
+    # Create all tasks
+    tasks = [asyncio.create_task(_sem_wrapper(row)) for row in rows]
+
+    # Yield results as they complete (note that they're not necessarily in original order)
+    try:
+        for task in asyncio.as_completed(tasks):
+            try:
+                yield await task
+            except Exception:
+                logger.exception("Error processing row")
+    finally:
+        for t in tasks:
+            t.cancel()
+        await asyncio.gather(*tasks, return_exceptions=True)
diff --git a/eval_protocol/pytest/default_mcp_gym_rollout_processor.py b/eval_protocol/pytest/default_mcp_gym_rollout_processor.py
index 0adbbea0..16c60928 100644
--- a/eval_protocol/pytest/default_mcp_gym_rollout_processor.py
+++ b/eval_protocol/pytest/default_mcp_gym_rollout_processor.py
@@ -6,7 +6,7 @@
 import subprocess
 import time
 from pathlib import Path
-from typing import List, Optional
+from typing import AsyncIterator, List, Optional
 
 import eval_protocol as ep
 from eval_protocol.models import EvaluationRow, Message
@@ -194,22 +194,19 @@ def __exit__(self, exc_type, exc_val, exc_tb):
 
 async def default_mcp_gym_rollout_processor(
     rows: List[EvaluationRow], config: RolloutProcessorConfig
-) -> List[EvaluationRow]:
+) -> AsyncIterator[EvaluationRow]:
     """
     Rollout processor for tau bench environments.
 
-
     This processor starts an MCP server, creates tau bench environments, and runs rollouts
-    using the eval_protocol framework, following the pattern from test_tau2_e2e.py.
-
+    using the eval_protocol framework, yielding results as they complete.
 
     Args:
         rows: List of EvaluationRow objects containing messages and dataset info in input_metadata
         config: RolloutProcessorConfig with model and other parameters
 
-
     Returns:
-        List of EvaluationRow objects with completed conversations
+        AsyncIterator of EvaluationRow objects with completed conversations
     """
     if config.server_script_path is None:
         raise ValueError("server_script_path is required for default_mcp_gym_rollout_processor")
@@ -233,15 +230,14 @@ async def default_mcp_gym_rollout_processor(
         )
 
         # Run rollout with environments and policy
-        evaluation_rows = await ep.rollout(
+        async for evaluation_row in ep.rollout(
             envs,
             policy=policy,
             evaluation_rows=rows,
             steps=config.steps,
             max_concurrent_rollouts=config.max_concurrent_rollouts,
-        )
-
-        return evaluation_rows
+        ):
+            yield evaluation_row
 
     finally:
         # Always clean up the server
diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py
index d3a2ba5b..424347cd 100644
--- a/eval_protocol/pytest/default_single_turn_rollout_process.py
+++ b/eval_protocol/pytest/default_single_turn_rollout_process.py
@@ -101,7 +101,6 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
 
         row.messages = messages
         default_logger.log(row)
-        logger.info(f"FINISHED PROCESSING ROW: {row.input_metadata.row_id} at time {time.time()}")
         return row
 
     # Process rows with bounded concurrency and yield as they complete
@@ -118,31 +117,14 @@ async def _sem_wrapper(r: EvaluationRow) -> EvaluationRow:
     # Create all tasks
     tasks = [asyncio.create_task(_sem_wrapper(row)) for row in rows]
 
-    # Yield results as they complete (not in original order)
+    # Yield results as they complete (note that they're not necessarily in original order)
     try:
-        while tasks:
-            # Wait for at least one task to complete
-            done, pending = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
-
-            # Yield completed results
-            for task in done:
-                try:
-                    result = await task
-                    yield result
-                except Exception as e:
-                    # Log error but continue processing other tasks
-                    print(f"Error processing row: {e}")
-                    # Could yield an error row or skip
-
-            # Update tasks list to only pending tasks
-            tasks = list(pending)
-
+        for task in asyncio.as_completed(tasks):
+            try:
+                yield await task
+            except Exception:
+                logger.exception("Error processing row")
     finally:
-        # Clean up any remaining tasks
-        for task in tasks:
-            if not task.done():
-                task.cancel()
-                try:
-                    await task
-                except asyncio.CancelledError:
-                    pass
+        for t in tasks:
+            t.cancel()
+        await asyncio.gather(*tasks, return_exceptions=True)
diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
index 163833d8..5a006f37 100644
--- a/eval_protocol/pytest/evaluation_test.py
+++ b/eval_protocol/pytest/evaluation_test.py
@@ -1,9 +1,13 @@
 import asyncio
 import copy
 import inspect
+import json
 import math
 import os
+import pathlib
+import re
 import statistics
+import time
 from typing import Any, Callable, Dict, List, Literal, Optional, Union
 
 import pytest
@@ -171,7 +175,7 @@ def decorator(
             if sig.return_annotation is not List[EvaluationRow]:
                 raise ValueError("In batch mode, your eval function must return a list of EvaluationRow instances")
 
-        def execute_with_params(
+        async def execute_with_params(
             test_func: TestFunction,
             processed_row: EvaluationRow | None = None,
             processed_dataset: List[EvaluationRow] | None = None,
@@ -188,7 +192,12 @@ def execute_with_params(
                 if "rows" in evaluation_test_kwargs:
                     raise ValueError("'rows' is a reserved parameter for the evaluation function")
                 kwargs.update(evaluation_test_kwargs)
-            return execute_function(test_func, **kwargs)
+
+            # Handle both sync and async test functions
+            if asyncio.iscoroutinefunction(test_func):
+                return await test_func(**kwargs)
+            else:
+                return test_func(**kwargs)
 
         # Calculate all possible combinations of parameters
         def _parse_ep_max_rows(default_value: int | None) -> int | None:
@@ -423,39 +432,35 @@ def _log_eval_error(
                         rollout_result = rollout_processor(fresh_dataset, config)
 
                         if mode == "pointwise":
-                            # Pointwise mode: true pipelining with concurrent evaluations
-                            async def process_evaluation(row):
-                                """Process a single evaluation and return the result."""
-                                result = execute_with_params(
-                                    test_func,
-                                    processed_row=row,
-                                    evaluation_test_kwargs=kwargs.get("evaluation_test_kwargs") or {},
-                                )
-                                if result is None or not isinstance(result, EvaluationRow):
-                                    raise ValueError(
-                                        f"Test function {test_func.__name__} did not return an EvaluationRow instance. You must return an EvaluationRow instance from your test function decorated with @evaluation_test."
+                            # Pointwise mode, rollouts will return as they complete so we can pipeline evaluation_test execution
+                            semaphore = asyncio.Semaphore(max_concurrent_rollouts)
+                            tasks = []
+
+                            async def _execute_with_semaphore(row):
+                                async with semaphore:
+                                    result = await execute_with_params(
+                                        test_func,
+                                        processed_row=row,
+                                        evaluation_test_kwargs=kwargs.get("evaluation_test_kwargs") or {},
                                     )
-                                return result
+                                    if result is None or not isinstance(result, EvaluationRow):
+                                        raise ValueError(
+                                            f"Test function {test_func.__name__} did not return an EvaluationRow instance. You must return an EvaluationRow instance from your test function decorated with @evaluation_test."
+                                        )
+                                    return result
+
+                            async for row in rollout_result:  # reuse the invocation created above; do not call the processor twice
+                                tasks.append(asyncio.create_task(_execute_with_semaphore(row)))
+
+                            all_results[i] = await asyncio.gather(*tasks)
 
-                            # Start evaluations as rollouts complete - true pipelining
-                            eval_tasks = []
-                            async for row in rollout_result:
-                                # Start evaluation immediately when rollout completes
-                                eval_task = asyncio.create_task(process_evaluation(row))
-                                eval_tasks.append(eval_task)
-
-                            # Collect all evaluation results
-                            if eval_tasks:
-                                eval_results = await asyncio.gather(*eval_tasks)
-                                all_results.extend(eval_results)
                         else:
-                            # Batch mode: collect all results first, then evaluate
+                            # Batch mode: collect all results first, then evaluate (no pipelining)
                             input_dataset = []
                             async for row in rollout_result:
                                 input_dataset.append(row)
 
-                            # Batch mode: call the test function with the full dataset
-                            results = execute_with_params(
+                            results = await execute_with_params(
                                 test_func,
                                 processed_dataset=input_dataset,
                                 evaluation_test_kwargs=kwargs.get("evaluation_test_kwargs") or {},
@@ -583,10 +588,6 @@ async def process_evaluation(row):
                                 )
                             # As per project convention, avoid printing per-metric CI lines to reduce noise
                         if summary_path:
-                            import json
-                            import pathlib
-                            import re
-                            import time
 
                             def _sanitize_filename(text: str) -> str:
                                 safe = re.sub(r"[^A-Za-z0-9._-]+", "-", text.strip())
@@ -700,79 +701,41 @@ def create_dual_mode_wrapper() -> Callable:
             """
             import asyncio
 
-            # Check if the pytest wrapper is async (it should be now)
-            is_pytest_wrapper_async = asyncio.iscoroutinefunction(pytest_wrapper)
             is_test_func_async = asyncio.iscoroutinefunction(test_func)
 
-            if is_pytest_wrapper_async:
-
-                async def dual_mode_wrapper(*args, **kwargs):
-                    # Check if this is a direct call with the expected signature
-                    if mode == "pointwise":
-                        # For pointwise mode, check if called with a single row argument
-                        if len(args) == 1 and isinstance(args[0], EvaluationRow) and not kwargs:
-                            if is_test_func_async:
-                                return await test_func(row=args[0])
-                            else:
-                                return test_func(row=args[0])
-                    else:
-                        # For batch mode, check if called with rows argument
-                        if (
-                            len(args) == 1
-                            and isinstance(args[0], list)
-                            and all(isinstance(r, EvaluationRow) for r in args[0])
-                            and not kwargs
-                        ):
-                            if is_test_func_async:
-                                return await test_func(rows=args[0])
-                            else:
-                                return test_func(rows=args[0])
-                        # Also check if called with keyword argument 'rows'
-                        if (
-                            len(args) == 0
-                            and "rows" in kwargs
-                            and isinstance(kwargs["rows"], list)
-                            and all(isinstance(r, EvaluationRow) for r in kwargs["rows"])
-                        ):
-                            if is_test_func_async:
-                                return await test_func(**kwargs)
-                            else:
-                                return test_func(**kwargs)
-
-                    # If not a direct call, use the pytest wrapper
-                    return await pytest_wrapper(*args, **kwargs)
-
-            else:
-
-                def dual_mode_wrapper(*args, **kwargs):
-                    # Check if this is a direct call with the expected signature
-                    if mode == "pointwise":
-                        # For pointwise mode, check if called with a single row argument
-                        if len(args) == 1 and isinstance(args[0], EvaluationRow) and not kwargs:
-                            return test_func(row=args[0])
-
-                        if len(args) == 0 and "row" in kwargs and isinstance(kwargs["row"], EvaluationRow):
-                            return test_func(**kwargs)
-                    else:
-                        # For batch mode, check if called with rows argument
-                        if (
-                            len(args) == 1
-                            and isinstance(args[0], list)
-                            and all(isinstance(r, EvaluationRow) for r in args[0])
-                            and not kwargs
-                        ):
-                            return test_func(rows=args[0])
-                        # Also check if called with keyword argument 'rows'
-                        if (
-                            len(args) == 0
-                            and "rows" in kwargs
-                            and isinstance(kwargs["rows"], list)
-                            and all(isinstance(r, EvaluationRow) for r in kwargs["rows"])
-                        ):
-                            return test_func(**kwargs)
-
-                    # If not a direct call, use the pytest wrapper
-                    return pytest_wrapper(*args, **kwargs)
+            async def call_test_func(**call_kwargs):
+                """Helper to call test_func with proper async/sync handling"""
+                if is_test_func_async:
+                    return await test_func(**call_kwargs)
+                else:
+                    return test_func(**call_kwargs)
+
+            async def dual_mode_wrapper(*args, **kwargs):
+                """Dispatch direct row(s) calls to test_func; route everything else to pytest_wrapper.
+
+                A direct call passes one EvaluationRow (pointwise) or one list of them (batch).
+                """
+
+                def is_row_list(value):
+                    return isinstance(value, list) and all(isinstance(r, EvaluationRow) for r in value)
+
+                if mode == "pointwise":
+                    # Direct call with a single positional row
+                    if len(args) == 1 and isinstance(args[0], EvaluationRow) and not kwargs:
+                        return await call_test_func(row=args[0])
+                    # Direct call with keyword argument 'row' (parity with 'rows' in batch mode)
+                    if not args and isinstance(kwargs.get("row"), EvaluationRow):
+                        return await call_test_func(**kwargs)
+                else:
+                    # Direct call with a single positional list of rows
+                    if len(args) == 1 and is_row_list(args[0]) and not kwargs:
+                        return await call_test_func(rows=args[0])
+                    # Direct call with keyword argument 'rows'
+                    if not args and is_row_list(kwargs.get("rows")):
+                        return await call_test_func(**kwargs)
+
+                # If not a direct call, use the pytest wrapper
+                return await pytest_wrapper(*args, **kwargs)
 
             # Copy all attributes from the pytest wrapper to our dual mode wrapper
             import functools
diff --git a/eval_protocol/pytest/types.py b/eval_protocol/pytest/types.py
index 42fb3d56..f2666502 100644
--- a/eval_protocol/pytest/types.py
+++ b/eval_protocol/pytest/types.py
@@ -3,7 +3,7 @@
 """
 
 from dataclasses import dataclass
-from typing import Any, Callable, Dict, List, Literal, Optional
+from typing import Any, AsyncIterator, Callable, Dict, List, Literal, Optional
 
 from eval_protocol.dataset_logger import default_logger
 from eval_protocol.dataset_logger.dataset_logger import DatasetLogger
@@ -51,4 +51,4 @@ class RolloutProcessorConfig:
     logger: DatasetLogger = default_logger  # logger to use during rollout for mid-rollout logs
 
 
-RolloutProcessor = Callable[[List[EvaluationRow], RolloutProcessorConfig], List[EvaluationRow]]
+RolloutProcessor = Callable[[List[EvaluationRow], RolloutProcessorConfig], AsyncIterator[EvaluationRow]]
diff --git a/eval_protocol/pytest/utils.py b/eval_protocol/pytest/utils.py
index bc622cb7..23a5722d 100644
--- a/eval_protocol/pytest/utils.py
+++ b/eval_protocol/pytest/utils.py
@@ -87,18 +87,9 @@ def create_dynamically_parameterized_wrapper(test_func, wrapper_body, test_param
     """
     from functools import wraps
 
-    # Check if wrapper_body is async and create appropriate wrapper
-    if asyncio.iscoroutinefunction(wrapper_body):
-
-        @wraps(test_func)
-        async def wrapper(**kwargs):
-            return await wrapper_body(**kwargs)
-
-    else:
-
-        @wraps(test_func)
-        def wrapper(**kwargs):
-            return wrapper_body(**kwargs)
+    @wraps(test_func)
+    async def wrapper(**kwargs):
+        return await wrapper_body(**kwargs)
 
     parameters = [inspect.Parameter(name, inspect.Parameter.POSITIONAL_OR_KEYWORD) for name in test_param_names]
     wrapper.__signature__ = inspect.Signature(parameters)

From 44b1326520d2fde534ba67034a9cadc5d1197e69 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Tue, 12 Aug 2025 23:57:42 -0700
Subject: [PATCH 7/9] formatting

---
 eval_protocol/pytest/evaluation_test.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
index c866d182..81856ff6 100644
--- a/eval_protocol/pytest/evaluation_test.py
+++ b/eval_protocol/pytest/evaluation_test.py
@@ -705,11 +705,12 @@ def create_dual_mode_wrapper() -> Callable:
             """
             import asyncio
 
-            is_test_func_async = asyncio.iscoroutinefunction(test_func)
+            # Check if the test function is async
+            is_async = asyncio.iscoroutinefunction(test_func)
 
             async def call_test_func(**call_kwargs):
                 """Helper to call test_func with proper async/sync handling"""
-                if is_test_func_async:
+                if is_async:
                     return await test_func(**call_kwargs)
                 else:
                     return test_func(**call_kwargs)

From a1d6a528ccc2ed51fd4eb2a6497026388828443a Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Tue, 12 Aug 2025 23:59:06 -0700
Subject: [PATCH 8/9] Remove per-row debug logging from test_basic_coding; tidy docstring

---
 tests/pytest/test_basic_coding.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/tests/pytest/test_basic_coding.py b/tests/pytest/test_basic_coding.py
index ff5b125d..c96a8302 100644
--- a/tests/pytest/test_basic_coding.py
+++ b/tests/pytest/test_basic_coding.py
@@ -5,16 +5,12 @@
 and comparing the output against expected results in a pointwise manner.
 """
 
-import logging
-import time
 from typing import Any, Dict, List
 
 from eval_protocol.models import EvaluateResult, EvaluationRow, Message
 from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test
 from eval_protocol.rewards.code_execution import execute_python_code, extract_code_blocks
 
-logger = logging.getLogger(__name__)
-
 
 def coding_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
     """
@@ -43,22 +39,18 @@ def test_coding_code_evaluation(row: EvaluationRow) -> EvaluationRow:
     """
     Evaluation function that tests code correctness by executing it locally.
 
-
     This function:
     1. Extracts Python code from the assistant's response
     2. Executes the code locally with timeout=10
     3. Compares the output to ground_truth
     4. Returns a score of 1.0 if output matches, 0.0 otherwise
 
-
     Args:
         row: EvaluationRow containing the conversation messages and expected_output in ground_truth
 
-
     Returns:
         EvaluationRow with the evaluation result
     """
-    logger.info(f"STARTING TO EVALUATE ROW: {row.input_metadata.row_id} at time {time.time()}")
     # Check if we have an assistant response
     if len(row.messages) < 2 or row.messages[-1].role != "assistant":
         row.evaluation_result = EvaluateResult(score=0.0, reason="No assistant response found")

From edf99acb6ce5434ecb240465bcf973a12f7cdb9f Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Wed, 13 Aug 2025 00:19:17 -0700
Subject: [PATCH 9/9] Make no-op rollout processor an async generator; fix tests

---
 .../pytest/default_no_op_rollout_process.py   |  9 +++--
 tests/pytest/test_pytest_ids.py               |  8 ++--
 .../test_rollout_control_plane_integration.py | 39 ++++++++++++-------
 3 files changed, 36 insertions(+), 20 deletions(-)

diff --git a/eval_protocol/pytest/default_no_op_rollout_process.py b/eval_protocol/pytest/default_no_op_rollout_process.py
index bae733c3..47cb17be 100644
--- a/eval_protocol/pytest/default_no_op_rollout_process.py
+++ b/eval_protocol/pytest/default_no_op_rollout_process.py
@@ -1,12 +1,15 @@
-from typing import List
+from typing import AsyncIterator, List
 
 from eval_protocol.models import EvaluationRow
 from eval_protocol.pytest.types import RolloutProcessorConfig
 
 
-def default_no_op_rollout_processor(rows: List[EvaluationRow], config: RolloutProcessorConfig) -> List[EvaluationRow]:
+async def default_no_op_rollout_processor(
+    rows: List[EvaluationRow], config: RolloutProcessorConfig
+) -> AsyncIterator[EvaluationRow]:
     """
     Simply passes input dataset through to the test function. This can be useful
     if you want to run the rollout yourself.
     """
-    return rows
+    for row in rows:
+        yield row
diff --git a/tests/pytest/test_pytest_ids.py b/tests/pytest/test_pytest_ids.py
index 0131bcbe..24ba3baf 100644
--- a/tests/pytest/test_pytest_ids.py
+++ b/tests/pytest/test_pytest_ids.py
@@ -19,7 +19,7 @@ def read(self):
         return list(self._rows.values())
 
 
-def test_evaluation_test_decorator(monkeypatch):
+async def test_evaluation_test_decorator(monkeypatch):
     from eval_protocol.pytest.evaluation_test import evaluation_test
 
     logger = InMemoryLogger()
@@ -45,13 +45,13 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow:
 
     # Manually invoke all parameter combinations within a single test
     for ds_path in dataset_paths:
-        eval_fn(model="dummy/local-model", dataset_path=[ds_path])
+        await eval_fn(model="dummy/local-model", dataset_path=[ds_path])
 
     # Assertions on IDs generated by the decorator logic
     assert len(logger.read()) == 38
 
 
-def test_evaluation_test_decorator_ids_single(monkeypatch):
+async def test_evaluation_test_decorator_ids_single(monkeypatch):
     in_memory_logger = InMemoryLogger()
     unique_run_ids = set()
     unique_experiment_ids = set()
@@ -92,7 +92,7 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow:
     # Manually invoke all parameter combinations within a single test
     for ds_path in dataset_paths:
         for params in input_params_list:
-            eval_fn(model="dummy/local-model", dataset_path=[ds_path], input_params=params)
+            await eval_fn(model="dummy/local-model", dataset_path=[ds_path], input_params=params)
 
     # Assertions on IDs generated by the decorator logic
     assert len(unique_invocation_ids) == 1
diff --git a/tests/test_rollout_control_plane_integration.py b/tests/test_rollout_control_plane_integration.py
index dcaac0e9..1b92d5aa 100644
--- a/tests/test_rollout_control_plane_integration.py
+++ b/tests/test_rollout_control_plane_integration.py
@@ -239,7 +239,9 @@ def mock_step_side_effect(env_index, tool_call):
             policy = MockPolicy(["right", "down", "right"])
 
             # Execute rollout
-            evaluation_rows = await self.execution_manager.execute_rollouts(mock_env, policy, steps=10)
+            evaluation_rows = []
+            async for row in self.execution_manager.execute_rollouts(mock_env, policy, steps=10):
+                evaluation_rows.append(row)
 
             # Validate results
             assert len(evaluation_rows) == 1, "Should have one evaluation row"
@@ -457,7 +459,9 @@ async def test_rollout_handles_control_plane_failure_gracefully(self):
 
             # Execute rollout with control plane failure
             policy = MockPolicy(["right"])
-            evaluation_rows = await self.execution_manager.execute_rollouts(mock_env, policy, steps=1)
+            evaluation_rows = []
+            async for row in self.execution_manager.execute_rollouts(mock_env, policy, steps=1):
+                evaluation_rows.append(row)
 
             # Should still work, but without control plane info
             assert len(evaluation_rows) == 1
@@ -500,15 +504,26 @@ async def test_rollout_creates_envs_from_url(self):
             mock_make.return_value = mock_env
 
             manager_instance = MockManager.return_value
-            manager_instance.execute_rollouts = AsyncMock(return_value=["ok"])
 
-            result = await ep.rollout(
+            # Mock execute_rollouts to return an async generator and track calls
+            call_args = []
+
+            async def mock_execute_rollouts(*args, **kwargs):
+                call_args.append((args, kwargs))
+                for item in ["ok"]:
+                    yield item
+
+            manager_instance.execute_rollouts = mock_execute_rollouts
+
+            result = []
+            async for row in ep.rollout(
                 "http://localhost:1234/mcp/",
                 policy,
                 dataset=dataset,
                 model_id="test_model",
                 steps=5,
-            )
+            ):
+                result.append(row)
 
             mock_make.assert_called_once_with(
                 "http://localhost:1234/mcp/",
@@ -517,14 +532,12 @@ async def test_rollout_creates_envs_from_url(self):
                 model_id="test_model",
             )
 
-            manager_instance.execute_rollouts.assert_called_once_with(
-                mock_make.return_value,
-                policy,
-                5,
-                None,
-                8,
-                None,
-            )
+            # Verify execute_rollouts was called with correct arguments
+            assert len(call_args) == 1, "execute_rollouts should be called once"
+            args, kwargs = call_args[0]
+            assert args[0] == mock_make.return_value, "First arg should be mock env"
+            assert args[1] == policy, "Second arg should be policy"
+            assert args[2] == 5, "Third arg should be steps"
 
             assert result == ["ok"]