diff --git a/eval_protocol/mcp/execution/manager.py b/eval_protocol/mcp/execution/manager.py
index f9604e1e..dbf197fc 100644
--- a/eval_protocol/mcp/execution/manager.py
+++ b/eval_protocol/mcp/execution/manager.py
@@ -158,8 +158,8 @@ async def _execute_with_semaphore(idx):
                 messages.append(Message.model_validate(msg_dict))
 
             evaluation_rows[idx].messages = messages
-            evaluation_rows[idx].input_metadata.row_id = envs.dataset_rows[idx].id
-            evaluation_rows[idx].input_metadata.dataset_info = asdict(envs.dataset_rows[idx])
+            # evaluation_rows[idx].input_metadata.row_id = envs.dataset_rows[idx].id
+            # evaluation_rows[idx].input_metadata.dataset_info = asdict(envs.dataset_rows[idx])
             evaluation_rows[idx].tools = shared_tool_schema
             evaluation_rows[idx].usage = CompletionUsage(**trajectory.usage)
             evaluation_rows[idx].input_metadata.completion_params = CompletionParams(
@@ -482,11 +482,11 @@ async def _execute_rollout(
                 trajectory.control_plane_summary.update({"error_message": f"{failure_reason}"})
             try:
                 await envs.connection_manager.reset_session(session)
-            except:
+            except:  # noqa: E722
                 logger.error(f"Error resetting session {session.session_id}")
             try:
                 await envs.connection_manager.close_session(session)
-            except:
+            except:  # noqa: E722
                 logger.error(f"Error closing session {session.session_id}")
         return trajectory
 
diff --git a/eval_protocol/models.py b/eval_protocol/models.py
index 02573038..a70fddab 100644
--- a/eval_protocol/models.py
+++ b/eval_protocol/models.py
@@ -202,6 +202,21 @@ class InputMetadata(BaseModel):
     )
 
 
+class EvaluationThreshold(BaseModel):
+    """Pass/fail threshold configuration for evaluation tests.
+
+    success: required minimum success rate, a fraction constrained to [0.0, 1.0].
+    standard_deviation: optional maximum standard deviation, also constrained to [0.0, 1.0]; None leaves it unset.
+    """
+
+    success: float = Field(
+        ..., description="Minimum success rate threshold (fraction of total score, 0.0 to 1.0)", ge=0.0, le=1.0
+    )
+    standard_deviation: Optional[float] = Field(
+        None, description="Maximum standard deviation threshold (fraction of total score, 0.0 to 1.0)", ge=0.0, le=1.0
+    )
+
+
 class EvalMetadata(BaseModel):
     """Metadata about the evaluation that was run."""
 
@@ -216,7 +231,9 @@ class EvalMetadata(BaseModel):
     )
     num_runs: int = Field(..., description="Number of times the evaluation was repeated")
     aggregation_method: str = Field(..., description="Method used to aggregate scores across runs")
-    threshold_of_success: Optional[float] = Field(None, description="Threshold score for test success")
+    passed_threshold: Optional[EvaluationThreshold] = Field(
+        None, description="Threshold configuration for test success"
+    )
     passed: Optional[bool] = Field(None, description="Whether the evaluation passed based on the threshold")
 
 
diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
index fe0c8cf5..3869312d 100644
--- a/eval_protocol/pytest/evaluation_test.py
+++ b/eval_protocol/pytest/evaluation_test.py
@@ -3,14 +3,21 @@
 import math
 import os
 import statistics
-from typing import Any, Callable, Dict, List, Literal, Optional
+from typing import Any, Callable, Dict, List, Literal, Optional, Union
 
 import pytest
 
 from eval_protocol.dataset_logger import default_logger
 from eval_protocol.dataset_logger.dataset_logger import DatasetLogger
 from eval_protocol.human_id import generate_id
-from eval_protocol.models import CompletionParams, EvalMetadata, EvaluationRow, InputMetadata, Message
+from eval_protocol.models import (
+    CompletionParams,
+    EvalMetadata,
+    EvaluationRow,
+    EvaluationThreshold,
+    InputMetadata,
+    Message,
+)
 from eval_protocol.pytest.default_dataset_adapter import default_dataset_adapter
 from eval_protocol.pytest.default_no_op_rollout_process import default_no_op_rollout_processor
 from eval_protocol.pytest.types import (
@@ -47,7 +54,7 @@ def evaluation_test(  # noqa: C901
     rollout_processor: RolloutProcessor = default_no_op_rollout_processor,
     evaluation_test_kwargs: Optional[List[EvaluationInputParam]] = None,
     aggregation_method: AggregationMethod = "mean",
-    threshold_of_success: Optional[float] = None,
+    passed_threshold: Optional[Union[EvaluationThreshold, float]] = None,
     num_runs: int = 1,
     max_dataset_rows: Optional[int] = None,
     mcp_config_path: Optional[str] = None,
@@ -108,8 +115,8 @@ def evaluation_test(  # noqa: C901
         rollout_processor: Function used to perform the rollout.
         evaluation_test_kwargs: Kwargs for the evaluation function.
         aggregation_method: How to aggregate scores across rows.
-        threshold_of_success: If set, fail the test if the aggregated score is
-            below this threshold.
+        passed_threshold: Threshold configuration for test success.
+            The aggregated score must be >= success; if set, the std dev of per-run scores must be <= standard_deviation.
         num_runs: Number of times to repeat the rollout and evaluations.
         max_dataset_rows: Limit dataset to the first N rows.
         mcp_config_path: Path to MCP config file that follows MCPMultiClientConfiguration schema
@@ -127,6 +134,14 @@ def evaluation_test(  # noqa: C901
     def decorator(
         test_func: TestFunction,
     ):
+        if passed_threshold is None:
+            threshold = None
+        elif isinstance(passed_threshold, (int, float)):
+            # A bare number (including an int such as 0 or 1) is shorthand for a success-only threshold.
+            threshold = EvaluationThreshold(success=passed_threshold)
+        else:
+            threshold = EvaluationThreshold.model_validate(passed_threshold)  # accepts a dict or an instance
+
         sig = inspect.signature(test_func)
 
         # For pointwise/rowwise mode, we expect a different signature
@@ -285,7 +300,7 @@ def create_wrapper_with_signature() -> Callable:
             def wrapper_body(**kwargs):
                 model_name = kwargs["model"]
                 eval_metadata = None
-                all_results: List[EvaluationRow] = []
+                all_results: List[List[EvaluationRow]] = [[] for _ in range(num_runs)]
 
                 cohort_id = generate_id()
 
@@ -346,7 +361,7 @@ def _log_eval_error(
                         status="running",
                         num_runs=num_runs,
                         aggregation_method=aggregation_method,
-                        threshold_of_success=threshold_of_success,
+                        passed_threshold=threshold,
                         passed=None,
                     )
 
@@ -386,11 +401,11 @@ def _log_eval_error(
                         logger=active_logger,
                     )
 
-                    for _ in range(num_runs):
+                    for i in range(num_runs):
                         # Regenerate outputs each run by deep-copying the pristine dataset
                         # so model responses are not reused across runs.
                         run_id = generate_id()
-                        fresh_dataset = [copy.deepcopy(r) for r in data]
+                        fresh_dataset = [r.model_copy(deep=True) for r in data]
 
                         # apply new run_id to fresh_dataset
                         for row in fresh_dataset:
@@ -418,7 +433,7 @@ def _log_eval_error(
                                     raise ValueError(
                                         f"Test function {test_func.__name__} did not return an EvaluationRow instance. You must return an EvaluationRow instance from your test function decorated with @evaluation_test."
                                     )
-                                all_results.append(result)
+                                all_results[i].append(result)
                         else:
                             # Batch mode: call the test function with the full dataset
                             results = execute_with_params(
@@ -442,17 +457,21 @@ def _log_eval_error(
                                 raise ValueError(
                                     f"Test function {test_func.__name__} returned a list containing non-EvaluationRow instances. You must return a list of EvaluationRow instances from your test function decorated with @evaluation_test."
                                 )
-                            all_results.extend(results)
+                            all_results[i] = results
 
-                    scores = [r.evaluation_result.score for r in all_results if r.evaluation_result]
+                    scores = [
+                        sum([r.evaluation_result.score for r in result if r.evaluation_result]) / len(result)
+                        for result in all_results
+                    ]
                     agg_score = aggregate(scores, aggregation_method)
+                    score_std = statistics.stdev(scores) if len(scores) > 1 else 0.0
 
                     # Compute 95% confidence interval for the fixed-set mean μ (by-question, using repeats)
                     ci_low: float | None = None
                     ci_high: float | None = None
                     if aggregation_method == "mean":
                         try:
-                            result_ci = compute_fixed_set_mu_ci(all_results)
+                            result_ci = compute_fixed_set_mu_ci([item for sublist in all_results for item in sublist])
                             mu_ci_low, mu_ci_high = result_ci[1], result_ci[2]
                             if mu_ci_low is not None and mu_ci_high is not None:
                                 ci_low = float(mu_ci_low)
@@ -464,15 +483,24 @@ def _log_eval_error(
 
                     # Determine if the evaluation passed based on threshold
                     passed = None
-                    if threshold_of_success is not None:
-                        passed = agg_score >= threshold_of_success
+
+                    if threshold is not None:
+                        success_passed, std_passed = True, True
+
+                        success_passed = agg_score >= threshold.success
+
+                        if threshold.standard_deviation is not None:
+                            std_passed = score_std <= threshold.standard_deviation
+
+                        passed = success_passed and std_passed
 
                     # Update eval metadata status and passed field for all results
-                    for r in all_results:
-                        if r.eval_metadata is not None:
-                            r.eval_metadata.status = "finished"
-                            r.eval_metadata.passed = passed
-                        active_logger.log(r)
+                    for result in all_results:
+                        for r in result:
+                            if r.eval_metadata is not None:
+                                r.eval_metadata.status = "finished"
+                                r.eval_metadata.passed = passed
+                            active_logger.log(r)  # log every row, through the logger configured for this test
 
                     # Optional: print and/or persist a summary artifact for CI
                     try:
@@ -480,7 +508,7 @@ def _log_eval_error(
                         summary_path = os.getenv("EP_SUMMARY_JSON")
                         suite_name = test_func.__name__
                         model_used = model_name
-                        total_rows = len(all_results)
+                        total_rows = len([item for sublist in all_results for item in sublist])
                         summary_obj = {
                             "suite": suite_name,
                             "model": model_used,
@@ -497,7 +525,7 @@ def _log_eval_error(
                         from collections import defaultdict
 
                         metric_scores: Dict[str, list] = defaultdict(list)
-                        for r in all_results:
+                        for r in [item for sublist in all_results for item in sublist]:
                             if r.evaluation_result and r.evaluation_result.metrics:
                                 for m_name, m_res in r.evaluation_result.metrics.items():
                                     if m_res is not None and getattr(m_res, "score", None) is not None:
@@ -614,10 +642,14 @@ def _extract_effort_tag(params: dict) -> str | None:
                     #     pass
 
                     # Check threshold after logging
-                    if threshold_of_success is not None and not passed:
+                    if threshold is not None and not passed:
                         assert (
-                            agg_score >= threshold_of_success
-                        ), f"Aggregated score {agg_score:.3f} below threshold {threshold_of_success}"
+                            agg_score >= threshold.success
+                        ), f"Aggregated score {agg_score:.3f} below threshold {threshold.success}"
+                        if threshold.standard_deviation is not None:
+                            assert (
+                                score_std <= threshold.standard_deviation
+                            ), f"Standard deviation {score_std:.3f} above threshold {threshold.standard_deviation}"
 
                 except AssertionError:
                     _log_eval_error("finished", data if "data" in locals() else None, passed=False)
diff --git a/tests/pytest/test_apps_coding.py b/tests/pytest/test_apps_coding.py
index 4780388a..f9c84695 100644
--- a/tests/pytest/test_apps_coding.py
+++ b/tests/pytest/test_apps_coding.py
@@ -18,10 +18,7 @@ def apps_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluatio
     Convert entries from APPS dataset to EvaluationRow objects.
     """
     return [
-        EvaluationRow(
-            messages=[Message(role="user", content=row["question"])],
-            ground_truth=row["input_output"]
-        )
+        EvaluationRow(messages=[Message(role="user", content=row["question"])], ground_truth=row["input_output"])
         for row in data
     ]
 
@@ -31,7 +28,7 @@ def apps_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluatio
     dataset_adapter=apps_dataset_to_evaluation_row,
     model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
     rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
-    threshold_of_success=0.33,
+    passed_threshold=0.33,
     rollout_processor=default_single_turn_rollout_processor,
     num_runs=1,
     mode="pointwise",
@@ -42,7 +39,7 @@ def test_apps_code_evaluation(row: EvaluationRow) -> EvaluationRow:
 
     Args:
         row: EvaluationRow containing the conversation messages and ground_truth as JSON string
-    
+
     Returns:
         EvaluationRow with the evaluation result
     """
@@ -51,8 +48,8 @@ def test_apps_code_evaluation(row: EvaluationRow) -> EvaluationRow:
         messages=row.messages,
         ground_truth=row.ground_truth,
     )
-    
+
     # Set the evaluation result on the row
     row.evaluation_result = result
-    
-    return row 
\ No newline at end of file
+
+    return row
diff --git a/tests/pytest/test_basic_coding.py b/tests/pytest/test_basic_coding.py
index 35d1a1b3..c96a8302 100644
--- a/tests/pytest/test_basic_coding.py
+++ b/tests/pytest/test_basic_coding.py
@@ -9,7 +9,7 @@
 
 from eval_protocol.models import EvaluateResult, EvaluationRow, Message
 from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test
-from eval_protocol.rewards.code_execution import extract_code_blocks, execute_python_code
+from eval_protocol.rewards.code_execution import execute_python_code, extract_code_blocks
 
 
 def coding_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
@@ -18,8 +18,8 @@ def coding_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluat
     """
     return [
         EvaluationRow(
-            messages=[Message(role="user", content=f"{row['prompt']} Input: {row['input']}")], 
-            ground_truth=row["expected_output"]
+            messages=[Message(role="user", content=f"{row['prompt']} Input: {row['input']}")],
+            ground_truth=row["expected_output"],
         )
         for row in data
     ]
@@ -30,7 +30,7 @@ def coding_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluat
     dataset_adapter=coding_dataset_to_evaluation_row,
     model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
     rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
-    threshold_of_success=0.8,
+    passed_threshold=0.8,
     rollout_processor=default_single_turn_rollout_processor,
     num_runs=1,
     mode="pointwise",
@@ -38,16 +38,16 @@ def coding_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluat
 def test_coding_code_evaluation(row: EvaluationRow) -> EvaluationRow:
     """
     Evaluation function that tests code correctness by executing it locally.
-    
+
     This function:
     1. Extracts Python code from the assistant's response
     2. Executes the code locally with timeout=10
     3. Compares the output to ground_truth
     4. Returns a score of 1.0 if output matches, 0.0 otherwise
-    
+
     Args:
         row: EvaluationRow containing the conversation messages and expected_output in ground_truth
-        
+
     Returns:
         EvaluationRow with the evaluation result
     """
@@ -55,38 +55,34 @@ def test_coding_code_evaluation(row: EvaluationRow) -> EvaluationRow:
     if len(row.messages) < 2 or row.messages[-1].role != "assistant":
         row.evaluation_result = EvaluateResult(score=0.0, reason="No assistant response found")
         return row
-    
+
     assistant_content = row.messages[-1].content or ""
     expected_output = (row.ground_truth or "").strip()
-    
+
     # Extract Python code blocks
     code_blocks = extract_code_blocks(assistant_content, language="python")
     if not code_blocks:
         row.evaluation_result = EvaluateResult(score=0.0, reason="No Python code block found")
         return row
-    
+
     code = code_blocks[0]["code"]
-    
+
     # Execute the code locally
     execution_result = execute_python_code(code, timeout=10)
-    
+
     if not execution_result.get("success", False):
         error_msg = execution_result.get("error", "Code execution failed")
         row.evaluation_result = EvaluateResult(score=0.0, reason=f"Execution error: {error_msg}")
         return row
-    
+
     # Compare output with expected
     actual_output = (execution_result.get("output", "") or "").strip()
-    
+
     if actual_output == expected_output:
-        row.evaluation_result = EvaluateResult(
-            score=1.0, 
-            reason=f"✅ Output matches: '{actual_output}'"
-        )
+        row.evaluation_result = EvaluateResult(score=1.0, reason=f"✅ Output matches: '{actual_output}'")
     else:
         row.evaluation_result = EvaluateResult(
-            score=0.0, 
-            reason=f"❌ Expected: '{expected_output}', Got: '{actual_output}'"
+            score=0.0, reason=f"❌ Expected: '{expected_output}', Got: '{actual_output}'"
         )
-    
+
     return row
diff --git a/tests/pytest/test_frozen_lake.py b/tests/pytest/test_frozen_lake.py
index 76551920..74d5e317 100644
--- a/tests/pytest/test_frozen_lake.py
+++ b/tests/pytest/test_frozen_lake.py
@@ -7,7 +7,7 @@
 
 from typing import Any, Dict, List
 
-from eval_protocol.models import EvaluateResult, EvaluationRow, Message, InputMetadata, CompletionParams, MetricResult
+from eval_protocol.models import CompletionParams, EvaluateResult, EvaluationRow, InputMetadata, Message, MetricResult
 from eval_protocol.pytest import evaluation_test
 from eval_protocol.pytest.default_mcp_gym_rollout_processor import default_mcp_gym_rollout_processor
 
@@ -41,7 +41,7 @@ def frozen_lake_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluation
     model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
     rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
     rollout_processor=default_mcp_gym_rollout_processor,
-    threshold_of_success=0.66,
+    passed_threshold=0.66,
     num_runs=1,
     max_concurrent_rollouts=3,
     mode="pointwise",
diff --git a/tests/pytest/test_hallucination.py b/tests/pytest/test_hallucination.py
index b396e12c..54779f09 100644
--- a/tests/pytest/test_hallucination.py
+++ b/tests/pytest/test_hallucination.py
@@ -35,7 +35,7 @@ def hallucination_dataset_adapter(data: List[Dict[str, Any]]) -> List[Evaluation
     model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
     rollout_input_params=[{"temperature": 0.0, "max_tokens": 512}],
     rollout_processor=default_single_turn_rollout_processor,
-    threshold_of_success=0.33,
+    passed_threshold=0.33,
     num_runs=1,
     mode="pointwise",
 )
diff --git a/tests/pytest/test_lunar_lander.py b/tests/pytest/test_lunar_lander.py
index 896adc49..ab4dad69 100644
--- a/tests/pytest/test_lunar_lander.py
+++ b/tests/pytest/test_lunar_lander.py
@@ -7,7 +7,7 @@
 
 from typing import Any, Dict, List
 
-from eval_protocol.models import EvaluateResult, EvaluationRow, Message, InputMetadata, CompletionParams
+from eval_protocol.models import CompletionParams, EvaluateResult, EvaluationRow, InputMetadata, Message
 from eval_protocol.pytest import evaluation_test
 from eval_protocol.pytest.default_mcp_gym_rollout_processor import default_mcp_gym_rollout_processor
 
@@ -17,7 +17,7 @@ def lunar_lander_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluatio
     Convert entries from lunar lander dataset to EvaluationRow objects.
     """
     rows = []
-    
+
     for row in data:
         eval_row = EvaluationRow(
             messages=[Message(role="system", content=row["system_prompt"])],
@@ -26,12 +26,12 @@ def lunar_lander_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluatio
                 dataset_info={
                     "environment_context": row["environment_context"],
                     "user_prompt_template": row["user_prompt_template"],
-                }
-            )
+                },
+            ),
         )
-        
+
         rows.append(eval_row)
-    
+
     return rows
 
 
@@ -41,7 +41,7 @@ def lunar_lander_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluatio
     model=["gpt-4.1"],
     rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
     rollout_processor=default_mcp_gym_rollout_processor,
-    threshold_of_success=0.0,
+    passed_threshold=0.0,
     num_runs=1,
     mode="pointwise",
     max_concurrent_rollouts=3,
@@ -51,24 +51,28 @@ def lunar_lander_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluatio
 def test_lunar_lander_evaluation(row: EvaluationRow) -> EvaluationRow:
     """
     Test lunar lander evaluation using the pytest framework.
-    
+
     This test evaluates how well the model can control the lunar lander to achieve
     a successful landing by checking the final reward and termination status.
-    
+
     Args:
         row: EvaluationRow object from lunar lander dataset
-        
+
     Returns:
         EvaluationRow object with evaluation results
     """
     score = row.get_total_reward()
 
     evaluation_score = 1.0 if score >= 200 else 0.0
-    reason = f"✅ Successful landing with reward {score:.2f}" if score >= 200 else f"❌ Failed landing with reward {score:.2f}"
+    reason = (
+        f"✅ Successful landing with reward {score:.2f}"
+        if score >= 200
+        else f"❌ Failed landing with reward {score:.2f}"
+    )
 
     row.evaluation_result = EvaluateResult(
         score=evaluation_score,
         reason=reason,
     )
-    
-    return row 
\ No newline at end of file
+
+    return row
diff --git a/tests/pytest/test_markdown_highlighting.py b/tests/pytest/test_markdown_highlighting.py
index 4dcaacf2..89302163 100644
--- a/tests/pytest/test_markdown_highlighting.py
+++ b/tests/pytest/test_markdown_highlighting.py
@@ -30,7 +30,7 @@ def markdown_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evalu
     dataset_adapter=markdown_dataset_to_evaluation_row,
     model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
     rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
-    threshold_of_success=0.5,
+    passed_threshold=0.5,
     rollout_processor=default_single_turn_rollout_processor,
     num_runs=1,
     mode="pointwise",
diff --git a/tests/pytest/test_pytest_math_example.py b/tests/pytest/test_pytest_math_example.py
index 05b0022c..afe74a4e 100644
--- a/tests/pytest/test_pytest_math_example.py
+++ b/tests/pytest/test_pytest_math_example.py
@@ -11,7 +11,7 @@
     model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
     rollout_input_params=[{"temperature": 0.0}],
     max_dataset_rows=5,
-    threshold_of_success=0.0,
+    passed_threshold=0.0,
     rollout_processor=default_single_turn_rollout_processor,
     mode="pointwise",
     evaluation_test_kwargs=[
diff --git a/tests/pytest/test_pytest_math_format_length.py b/tests/pytest/test_pytest_math_format_length.py
index fbc59efc..e51b062f 100644
--- a/tests/pytest/test_pytest_math_format_length.py
+++ b/tests/pytest/test_pytest_math_format_length.py
@@ -14,7 +14,7 @@
     model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
     rollout_input_params=[{"temperature": 0.0}],
     max_dataset_rows=5,
-    threshold_of_success=0.0,
+    passed_threshold=0.0,
     rollout_processor=default_single_turn_rollout_processor,
     mode="pointwise",
     evaluation_test_kwargs=[
diff --git a/tests/pytest/test_pytest_word_count_example.py b/tests/pytest/test_pytest_word_count_example.py
index 51062a1f..b0c4850d 100644
--- a/tests/pytest/test_pytest_word_count_example.py
+++ b/tests/pytest/test_pytest_word_count_example.py
@@ -11,7 +11,7 @@
     model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
     rollout_input_params=[{"temperature": 0.0}],
     max_dataset_rows=5,
-    threshold_of_success=0.3,  # Reasonable threshold for word count evaluation
+    passed_threshold=0.3,  # Reasonable threshold for word count evaluation
     rollout_processor=default_single_turn_rollout_processor,
     mode="pointwise",  # Use pointwise mode for elegant row-by-row evaluation
 )
diff --git a/tests/pytest/test_tau_bench_airline.py b/tests/pytest/test_tau_bench_airline.py
index 5bb025b6..80aadf14 100644
--- a/tests/pytest/test_tau_bench_airline.py
+++ b/tests/pytest/test_tau_bench_airline.py
@@ -65,10 +65,10 @@ def tau_bench_airline_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Eval
     input_dataset=["tests/pytest/data/airline_dataset.jsonl"],
     dataset_adapter=tau_bench_airline_to_evaluation_row,
     model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
-    rollout_input_params=[{"temperature": 0.8, "max_tokens": 4096, "reasoning_effort": "high"}],
+    rollout_input_params=[{"temperature": 0.8, "max_tokens": 4096, "reasoning_effort": "low"}],
     rollout_processor=default_mcp_gym_rollout_processor,
-    threshold_of_success=0.4,
-    num_runs=1,
+    passed_threshold={"success": 0.4, "standard_deviation": 0.1},
+    num_runs=8,
     mode="pointwise",
     max_concurrent_rollouts=50,
     server_script_path="examples/tau2_mcp/server.py",