Azure · placerda · Jun 8, 2026 · Jun 8, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,14 @@ This format follows [Keep a Changelog](https://keepachangelog.com/) and adheres
 ## [0.3.11] - 2026-06-08
 
 ### Fixed
+- **Local AI-assisted evaluators now support reasoning-model graders.** When
+  the `AZURE_OPENAI_DEPLOYMENT` name starts with `gpt-5`, `o1`, `o3`, or `o4`
+  (case-insensitive), AgentOps flags the Azure AI Evaluation evaluator as a reasoning
+  model so the SDK sends `max_completion_tokens` instead of the unsupported `max_tokens`.
+- **`agentops eval run` no longer hides interactive azd prompts while appearing
+  to hang.** The azd backend now runs `azd ai agent eval run` and the follow-up
+  `show` command with the global `--no-prompt` flag and stdin redirected to the null
+  device, so any missing authentication/configuration fails visibly instead of waiting indefinitely.
 - **`agentops eval init` now bootstraps the minimal azd prompt-agent context.**
   For Foundry prompt-agent configs, the command creates missing `azure.yaml` and
   `src/<agent>/agent.yaml` files, enriches the active `.azure/<env>/.env` with

diff --git a/docs/doctor-checks.md b/docs/doctor-checks.md
@@ -90,7 +90,7 @@ stopping the whole run.
 | `opex.stale_evaluation` | warning / critical | `results_history` | programmatic | latest run older than `stale_after_days` (critical at 2×) |
 | `opex.flaky_metric.<metric>` | warning | `results_history` | programmatic | coefficient of variation across last N runs > `flaky_cv_threshold` |
 | `opex.no_token_telemetry` | warning | `azure_monitor` | programmatic | `request_count > 0` but `gen_ai.usage.input_tokens + output_tokens == 0` |
-| `opex.max_tokens_undefined` | warning | `workspace_files` | programmatic | no `max_tokens:` declared in any `agentops.yaml` / bundle YAML that configures a model |
+| `opex.max_tokens_undefined` | warning | `workspace_files` | programmatic | no `max_tokens:` or `max_completion_tokens:` declared in any `agentops.yaml` / bundle YAML that configures a model |
 | `opex.llm.bundle_coverage` | info / warning | `workspace_files` | llm-judged | judge compares bundle YAML against agent description and flags missing built-ins |
 | `opex.spec_conformance.spec_missing` | warning | `workspace_files` | programmatic | spec-driven setup detected (`.specify/`, `AGENTS.md`, or Copilot instructions) but no readable spec body, so Doctor cannot verify bundles / datasets / tasks against intended agent behavior |
 | `opex.spec_conformance.tasks_stale` | warning | `workspace_files` | programmatic | unchecked task-list items in the spec have remained open past `stale_after_days`, which suggests the implementation plan may be stale or the task list was not maintained |

diff --git a/src/agentops/agent/checks/catalog.py b/src/agentops/agent/checks/catalog.py
@@ -507,10 +507,10 @@ def is_llm_judged(self) -> bool:
     CheckSpec(
         id="opex.max_tokens_undefined",
         category=Category.OPERATIONAL_EXCELLENCE,
-        title="`max_tokens` is not set on model / evaluator configuration",
+        title="Output token limit is not set on model / evaluator configuration",
         summary=(
-            "Unbounded `max_tokens` invites long, expensive responses "
-            "and unpredictable latency."
+            "Missing `max_tokens` / `max_completion_tokens` limits invite "
+            "long, expensive responses and unpredictable latency."
         ),
         severities=(Severity.WARNING,),
         requires=("workspace",),

diff --git a/src/agentops/agent/checks/opex_workspace.py b/src/agentops/agent/checks/opex_workspace.py
@@ -560,7 +560,7 @@ def _safe_load_yaml(path: Path) -> Optional[dict]:
 
 
 def _check_max_tokens_limit(workspace: Path) -> List[Finding]:
-    """AI.26 — every model deployment / call should set a ``max_tokens`` limit.
+    """AI.26 — every model deployment / call should set an output token limit.
 
     Without an upper bound, a runaway prompt or a malicious user can
     drive the bill arbitrarily high. We look in two places:
@@ -572,7 +572,7 @@ def _check_max_tokens_limit(workspace: Path) -> List[Finding]:
     The check is permissive: it fires only when at least one file
     explicitly looks like it configures a model (has ``model:``,
     ``deployment:``, or an ``evaluators:`` list) **and** none of the
-    candidate files declares ``max_tokens``. That avoids false
+    candidate files declares ``max_tokens`` or ``max_completion_tokens``. That avoids false
     positives on bare workspaces / agent-only configs.
     """
     candidates: List[Path] = []
@@ -586,18 +586,19 @@ def _check_max_tokens_limit(workspace: Path) -> List[Finding]:
         return []
 
     looks_model_shaped = False
-    files_with_max_tokens: List[str] = []
-    files_without_max_tokens: List[str] = []
+    files_with_token_limit: List[str] = []
+    files_without_token_limit: List[str] = []
 
     for path in candidates:
         try:
             text = path.read_text(encoding="utf-8")
         except OSError:
             continue
-        # Cheap, format-agnostic detection: matches `max_tokens: <n>`
-        # at any nesting level in any of the candidate YAMLs.
-        if re.search(r"(?m)^\s*max_tokens\s*:", text):
-            files_with_max_tokens.append(str(path.relative_to(workspace)).replace("\\", "/"))
+        # Cheap, format-agnostic detection: matches output token limits at
+        # any nesting level in any of the candidate YAMLs. Reasoning models
+        # require `max_completion_tokens`; older chat models use `max_tokens`.
+        if re.search(r"(?m)^\s*(max_tokens|max_completion_tokens)\s*:", text):
+            files_with_token_limit.append(str(path.relative_to(workspace)).replace("\\", "/"))
             looks_model_shaped = True
             continue
         # Only count files that actually look like they configure a model.
@@ -606,40 +607,39 @@ def _check_max_tokens_limit(workspace: Path) -> List[Finding]:
             text,
         ):
             looks_model_shaped = True
-            files_without_max_tokens.append(
+            files_without_token_limit.append(
                 str(path.relative_to(workspace)).replace("\\", "/")
             )
 
     if not looks_model_shaped:
         return []
-    if files_with_max_tokens and not files_without_max_tokens:
+    if files_with_token_limit and not files_without_token_limit:
         return []
-    if not files_without_max_tokens:
+    if not files_without_token_limit:
         return []
 
     return [
         Finding(
             id="opex.max_tokens_undefined",
             severity=Severity.WARNING,
             category=Category.OPERATIONAL_EXCELLENCE,
-            title="`max_tokens` is not set on model / evaluator configuration",
+            title="Output token limit is not set on model / evaluator configuration",
             summary=(
                 "Found model / evaluator YAML files that do not declare "
-                "a `max_tokens:` ceiling. Without an upper bound a single "
+                "a `max_tokens:` or `max_completion_tokens:` ceiling. Without an upper bound a single "
                 "runaway completion or a malicious prompt can drive token "
                 "spend arbitrarily high."
             ),
             recommendation=(
-                "Add a `max_tokens:` field next to each `model:` / "
-                "`deployment:` block (and inside `model_config:` for "
-                "AI-assisted evaluators). Pick a value just above your "
-                "longest legitimate response so legitimate traffic isn't "
-                "truncated."
+                "Add a `max_tokens:` field for chat models, or "
+                "`max_completion_tokens:` for reasoning models such as "
+                "`gpt-5` and `o` series deployments. Pick a value just above "
+                "your longest legitimate response so legitimate traffic isn't truncated."
             ),
             source=SOURCE_NAME,
             evidence={
-                "files_without_max_tokens": files_without_max_tokens[:10],
-                "files_with_max_tokens": files_with_max_tokens[:10],
+                "files_without_token_limit": files_without_token_limit[:10],
+                "files_with_token_limit": files_with_token_limit[:10],
             },
         )
     ]

diff --git a/src/agentops/pipeline/azd_runner.py b/src/agentops/pipeline/azd_runner.py
@@ -107,6 +107,7 @@ def run_azd_eval(
 
     command = [
         "azd",
+        "--no-prompt",
         "ai",
         "agent",
         "eval",
@@ -142,6 +143,7 @@ def run_azd_eval(
             show = _run_command(
                 [
                     "azd",
+                    "--no-prompt",
                     "ai",
                     "agent",
                     "eval",
@@ -327,6 +329,7 @@ def _run_command(
             encoding="utf-8",
             errors="replace",
             capture_output=True,
+            stdin=subprocess.DEVNULL,
             timeout=timeout_seconds,
             check=False,
         )

diff --git a/src/agentops/pipeline/runtime.py b/src/agentops/pipeline/runtime.py
@@ -64,7 +64,10 @@ def _credential() -> Any:
     return DefaultAzureCredential(exclude_developer_cli_credential=True, process_timeout=30)
 
 
-def _model_config() -> Dict[str, str]:
+_REASONING_MODEL_PREFIXES = ("gpt-5", "o1", "o3", "o4")
+
+
+def _model_config() -> Dict[str, Any]:
     from agentops.utils.azure_endpoints import (
         derive_openai_endpoint_from_project,
         normalize_azure_openai_endpoint,
@@ -110,14 +113,23 @@ def _model_config() -> Dict[str, str]:
             "Missing environment variables: " + ", ".join(missing) + "." + hint
         )
 
-    config: Dict[str, str] = {
-        "azure_endpoint": endpoint,  # type: ignore[dict-item]
-        "azure_deployment": deployment,  # type: ignore[dict-item]
+    config: Dict[str, Any] = {
+        "azure_endpoint": endpoint,
+        "azure_deployment": deployment,
         "api_version": api_version,
     }
     return config
 
 
+def _is_reasoning_model_deployment(deployment: Optional[str]) -> bool:
+    """Return whether an evaluator deployment needs reasoning-model parameters."""
+
+    if not deployment:
+        return False
+    normalized = deployment.strip().lower()
+    return any(normalized.startswith(prefix) for prefix in _REASONING_MODEL_PREFIXES)
+
+
 def _project_endpoint() -> str:
     endpoint = os.getenv("AZURE_AI_FOUNDRY_PROJECT_ENDPOINT")
     if not endpoint:
@@ -152,7 +164,10 @@ def load_evaluator(preset: EvaluatorPreset) -> EvaluatorRuntime:
 
     init_kwargs: Dict[str, Any] = {}
     if preset.class_name in _AI_ASSISTED:
-        init_kwargs["model_config"] = _model_config()
+        model_config = _model_config()
+        init_kwargs["model_config"] = model_config
+        if _is_reasoning_model_deployment(model_config.get("azure_deployment")):
+            init_kwargs["is_reasoning_model"] = True
     if preset.class_name in _SAFETY:
         init_kwargs["azure_ai_project"] = _project_endpoint()
         init_kwargs["credential"] = _credential()

diff --git a/tests/unit/test_agent_checks_opex_workspace.py b/tests/unit/test_agent_checks_opex_workspace.py
@@ -412,7 +412,7 @@ def test_workflow_sha_pinning_skips_local_actions(workspace: Path) -> None:
 
 
 # ---------------------------------------------------------------------------
-# AI.26 max_tokens limit (opex.max_tokens_undefined)
+# AI.26 output token limit (opex.max_tokens_undefined)
 # ---------------------------------------------------------------------------
 
 
@@ -429,7 +429,7 @@ def test_max_tokens_undefined_fires_when_bundle_lacks_max_tokens(tmp_path: Path)
     findings = run_opex_workspace_check(tmp_path)
     f = next((f for f in findings if f.id == "opex.max_tokens_undefined"), None)
     assert f is not None
-    assert "default.yaml" in f.evidence["files_without_max_tokens"][0]
+    assert "default.yaml" in f.evidence["files_without_token_limit"][0]
 
 
 def test_max_tokens_undefined_silent_when_every_file_declares_it(tmp_path: Path) -> None:
@@ -447,6 +447,19 @@ def test_max_tokens_undefined_silent_when_every_file_declares_it(tmp_path: Path)
     assert not any(f.id == "opex.max_tokens_undefined" for f in findings)
 
 
+def test_max_tokens_undefined_silent_when_reasoning_file_declares_completion_limit(
+    tmp_path: Path,
+) -> None:
+    (tmp_path / "agentops.yaml").write_text(
+        "version: 1\nagent: my-agent:2\nmodel: gpt-5\nmax_completion_tokens: 800\n",
+        encoding="utf-8",
+    )
+
+    findings = run_opex_workspace_check(tmp_path)
+
+    assert not any(f.id == "opex.max_tokens_undefined" for f in findings)
+
+
 def test_max_tokens_undefined_silent_when_no_model_shaped_files(tmp_path: Path) -> None:
     # Empty workspace - no model / evaluator / deployment keys anywhere.
     (tmp_path / "agentops.yaml").write_text(

diff --git a/tests/unit/test_azd_runner.py b/tests/unit/test_azd_runner.py
@@ -52,11 +52,12 @@ def test_run_azd_eval_reads_show_out_file_when_run_outputs_text(
     _write_recipe(recipe_path)
     calls: list[list[str]] = []
 
-    def fake_run(command, *, cwd, text, encoding, errors, capture_output, timeout, check):
+    def fake_run(command, *, cwd, text, encoding, errors, capture_output, stdin, timeout, check):
         calls.append(command)
         assert encoding == "utf-8"
         assert errors == "replace"
-        if command[:5] == ["azd", "ai", "agent", "eval", "run"]:
+        assert stdin is azd_runner.subprocess.DEVNULL
+        if command[:6] == ["azd", "--no-prompt", "ai", "agent", "eval", "run"]:
             return mock.Mock(
                 returncode=0,
                 stdout=(
@@ -66,7 +67,7 @@ def fake_run(command, *, cwd, text, encoding, errors, capture_output, timeout, c
                 ),
                 stderr="",
             )
-        if command[:5] == ["azd", "ai", "agent", "eval", "show"]:
+        if command[:6] == ["azd", "--no-prompt", "ai", "agent", "eval", "show"]:
             out_file = Path(command[command.index("--out-file") + 1])
             out_file.write_text(
                 json.dumps(
@@ -98,7 +99,8 @@ def fake_run(command, *, cwd, text, encoding, errors, capture_output, timeout, c
     assert result.run_id == "evalrun_456"
     assert result.payload["eval_id"] == "eval_123"
     assert azd_runner._extract_numeric_metrics(result.payload) == {"coherence": 1.0}
-    assert calls[1][:6] == ["azd", "ai", "agent", "eval", "show", "eval_123"]
+    assert calls[0][:6] == ["azd", "--no-prompt", "ai", "agent", "eval", "run"]
+    assert calls[1][:7] == ["azd", "--no-prompt", "ai", "agent", "eval", "show", "eval_123"]
 
 
 def test_run_command_with_progress_emits_heartbeat(

diff --git a/tests/unit/test_runtime_model_config.py b/tests/unit/test_runtime_model_config.py
@@ -12,9 +12,13 @@
 
 from __future__ import annotations
 
+from types import SimpleNamespace
+
 import pytest
 
-from agentops.pipeline.runtime import _model_config
+from agentops.core.evaluators import EvaluatorPreset
+from agentops.pipeline import runtime
+from agentops.pipeline.runtime import _is_reasoning_model_deployment, _model_config
 
 
 @pytest.fixture(autouse=True)
@@ -98,3 +102,71 @@ def test_api_version_override(monkeypatch: pytest.MonkeyPatch) -> None:
     cfg = _model_config()
 
     assert cfg["api_version"] == "2024-12-01-preview"
+
+
+@pytest.mark.parametrize("deployment", ["gpt-5", "gpt-5.4-mini", "o1-preview", "o3-mini", "o4-mini"])
+def test_reasoning_model_deployment_detection(deployment: str) -> None:
+    assert _is_reasoning_model_deployment(deployment) is True
+
+
+@pytest.mark.parametrize("deployment", ["gpt-4o-mini", "gpt-4.1", "my-chat-grader", ""])
+def test_non_reasoning_model_deployment_detection(deployment: str) -> None:
+    assert _is_reasoning_model_deployment(deployment) is False
+
+
+def test_ai_assisted_evaluator_marks_reasoning_deployments(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    monkeypatch.setenv("AZURE_OPENAI_ENDPOINT", "https://x.openai.azure.com")
+    monkeypatch.setenv("AZURE_OPENAI_DEPLOYMENT", "gpt-5.4-mini")
+    captured: dict[str, object] = {}
+
+    class FakeCoherenceEvaluator:
+        def __init__(self, **kwargs):
+            captured.update(kwargs)
+
+    monkeypatch.setattr(
+        runtime.importlib,
+        "import_module",
+        lambda _name: SimpleNamespace(CoherenceEvaluator=FakeCoherenceEvaluator),
+    )
+
+    preset = EvaluatorPreset(
+        name="coherence",
+        class_name="CoherenceEvaluator",
+        score_key="coherence",
+        input_mapping={},
+    )
+    runtime.load_evaluator(preset)
+
+    assert captured["model_config"]["azure_deployment"] == "gpt-5.4-mini"
+    assert captured["is_reasoning_model"] is True
+
+
+def test_ai_assisted_evaluator_leaves_chat_deployments_default(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    monkeypatch.setenv("AZURE_OPENAI_ENDPOINT", "https://x.openai.azure.com")
+    monkeypatch.setenv("AZURE_OPENAI_DEPLOYMENT", "gpt-4o-mini")
+    captured: dict[str, object] = {}
+
+    class FakeCoherenceEvaluator:
+        def __init__(self, **kwargs):
+            captured.update(kwargs)
+
+    monkeypatch.setattr(
+        runtime.importlib,
+        "import_module",
+        lambda _name: SimpleNamespace(CoherenceEvaluator=FakeCoherenceEvaluator),
+    )
+
+    preset = EvaluatorPreset(
+        name="coherence",
+        class_name="CoherenceEvaluator",
+        score_key="coherence",
+        input_mapping={},
+    )
+    runtime.load_evaluator(preset)
+
+    assert captured["model_config"]["azure_deployment"] == "gpt-4o-mini"
+    assert "is_reasoning_model" not in captured