diff --git a/CHANGELOG.md b/CHANGELOG.md index ce225fc2..38d0b81d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,14 @@ This format follows [Keep a Changelog](https://keepachangelog.com/) and adheres ## [0.3.11] - 2026-06-08 ### Fixed +- **Local AI-assisted evaluators now support reasoning-model graders.** When + `AZURE_OPENAI_DEPLOYMENT` points at `gpt-5*`, `o1*`, `o3*`, or `o4*`, + AgentOps marks the Azure AI Evaluation evaluator model as reasoning-capable so + the SDK sends `max_completion_tokens` instead of the unsupported `max_tokens`. +- **`agentops eval run` no longer hides interactive azd prompts while appearing + to hang.** The azd backend now runs `azd ai agent eval run` and the follow-up + `show` command with `--no-prompt` and a closed stdin, so any missing + authentication/configuration fails visibly instead of waiting indefinitely. - **`agentops eval init` now bootstraps the minimal azd prompt-agent context.** For Foundry prompt-agent configs, the command creates missing `azure.yaml` and `src//agent.yaml` files, enriches the active `.azure//.env` with diff --git a/docs/doctor-checks.md b/docs/doctor-checks.md index b1a168a0..16d85734 100644 --- a/docs/doctor-checks.md +++ b/docs/doctor-checks.md @@ -90,7 +90,7 @@ stopping the whole run. | `opex.stale_evaluation` | warning / critical | `results_history` | programmatic | latest run older than `stale_after_days` (critical at 2×) | | `opex.flaky_metric.` | warning | `results_history` | programmatic | coefficient of variation across last N runs > `flaky_cv_threshold` | | `opex.no_token_telemetry` | warning | `azure_monitor` | programmatic | `request_count > 0` but `gen_ai.usage.input_tokens + output_tokens == 0` | -| `opex.max_tokens_undefined` | warning | `workspace_files` | programmatic | no `max_tokens:` declared in any `agentops.yaml` / bundle YAML that configures a model | +| `opex.max_tokens_undefined` | warning | `workspace_files` | programmatic | no `max_tokens:` or `max_completion_tokens:` declared in any `agentops.yaml` / bundle YAML that configures a model | | `opex.llm.bundle_coverage` | info / warning | `workspace_files` | llm-judged | judge compares bundle YAML against agent description and flags missing built-ins | | `opex.spec_conformance.spec_missing` | warning | `workspace_files` | programmatic | spec-driven setup detected (`.specify/`, `AGENTS.md`, or Copilot instructions) but no readable spec body, so Doctor cannot verify bundles / datasets / tasks against intended agent behavior | | `opex.spec_conformance.tasks_stale` | warning | `workspace_files` | programmatic | unchecked task-list items in the spec have remained open past `stale_after_days`, which suggests the implementation plan may be stale or the task list was not maintained | diff --git a/src/agentops/agent/checks/catalog.py b/src/agentops/agent/checks/catalog.py index 731126c9..c85902d5 100644 --- a/src/agentops/agent/checks/catalog.py +++ b/src/agentops/agent/checks/catalog.py @@ -507,10 +507,10 @@ def is_llm_judged(self) -> bool: CheckSpec( id="opex.max_tokens_undefined", category=Category.OPERATIONAL_EXCELLENCE, - title="`max_tokens` is not set on model / evaluator configuration", + title="Output token limit is not set on model / evaluator configuration", summary=( - "Unbounded `max_tokens` invites long, expensive responses " - "and unpredictable latency." + "Missing `max_tokens` / `max_completion_tokens` limits invite " + "long, expensive responses and unpredictable latency." ), severities=(Severity.WARNING,), requires=("workspace",), diff --git a/src/agentops/agent/checks/opex_workspace.py b/src/agentops/agent/checks/opex_workspace.py index 78abac6b..22162111 100644 --- a/src/agentops/agent/checks/opex_workspace.py +++ b/src/agentops/agent/checks/opex_workspace.py @@ -560,7 +560,7 @@ def _safe_load_yaml(path: Path) -> Optional[dict]: def _check_max_tokens_limit(workspace: Path) -> List[Finding]: - """AI.26 — every model deployment / call should set a ``max_tokens`` limit. + """AI.26 — every model deployment / call should set an output token limit. Without an upper bound, a runaway prompt or a malicious user can drive the bill arbitrarily high. We look in two places: @@ -572,7 +572,7 @@ def _check_max_tokens_limit(workspace: Path) -> List[Finding]: The check is permissive: it fires only when at least one file explicitly looks like it configures a model (has ``model:``, ``deployment:``, or an ``evaluators:`` list) **and** none of the - candidate files declares ``max_tokens``. That avoids false + candidate files declares ``max_tokens`` or ``max_completion_tokens``. That avoids false positives on bare workspaces / agent-only configs. """ candidates: List[Path] = [] @@ -586,18 +586,19 @@ def _check_max_tokens_limit(workspace: Path) -> List[Finding]: return [] looks_model_shaped = False - files_with_max_tokens: List[str] = [] - files_without_max_tokens: List[str] = [] + files_with_token_limit: List[str] = [] + files_without_token_limit: List[str] = [] for path in candidates: try: text = path.read_text(encoding="utf-8") except OSError: continue - # Cheap, format-agnostic detection: matches `max_tokens: ` - # at any nesting level in any of the candidate YAMLs. - if re.search(r"(?m)^\s*max_tokens\s*:", text): - files_with_max_tokens.append(str(path.relative_to(workspace)).replace("\\", "/")) + # Cheap, format-agnostic detection: matches output token limits at + # any nesting level in any of the candidate YAMLs. Reasoning models + # require `max_completion_tokens`; older chat models use `max_tokens`. + if re.search(r"(?m)^\s*(max_tokens|max_completion_tokens)\s*:", text): + files_with_token_limit.append(str(path.relative_to(workspace)).replace("\\", "/")) looks_model_shaped = True continue # Only count files that actually look like they configure a model. @@ -606,15 +607,15 @@ def _check_max_tokens_limit(workspace: Path) -> List[Finding]: text, ): looks_model_shaped = True - files_without_max_tokens.append( + files_without_token_limit.append( str(path.relative_to(workspace)).replace("\\", "/") ) if not looks_model_shaped: return [] - if files_with_max_tokens and not files_without_max_tokens: + if files_with_token_limit and not files_without_token_limit: return [] - if not files_without_max_tokens: + if not files_without_token_limit: return [] return [ @@ -622,24 +623,23 @@ def _check_max_tokens_limit(workspace: Path) -> List[Finding]: id="opex.max_tokens_undefined", severity=Severity.WARNING, category=Category.OPERATIONAL_EXCELLENCE, - title="`max_tokens` is not set on model / evaluator configuration", + title="Output token limit is not set on model / evaluator configuration", summary=( "Found model / evaluator YAML files that do not declare " - "a `max_tokens:` ceiling. Without an upper bound a single " + "a `max_tokens:` or `max_completion_tokens:` ceiling. Without an upper bound a single " "runaway completion or a malicious prompt can drive token " "spend arbitrarily high." ), recommendation=( - "Add a `max_tokens:` field next to each `model:` / " - "`deployment:` block (and inside `model_config:` for " - "AI-assisted evaluators). Pick a value just above your " - "longest legitimate response so legitimate traffic isn't " - "truncated." + "Add a `max_tokens:` field for chat models, or " + "`max_completion_tokens:` for reasoning models such as " + "`gpt-5` and `o` series deployments. Pick a value just above " + "your longest legitimate response so legitimate traffic isn't truncated." ), source=SOURCE_NAME, evidence={ - "files_without_max_tokens": files_without_max_tokens[:10], - "files_with_max_tokens": files_with_max_tokens[:10], + "files_without_token_limit": files_without_token_limit[:10], + "files_with_token_limit": files_with_token_limit[:10], }, ) ] diff --git a/src/agentops/pipeline/azd_runner.py b/src/agentops/pipeline/azd_runner.py index c1e98545..b8f34b82 100644 --- a/src/agentops/pipeline/azd_runner.py +++ b/src/agentops/pipeline/azd_runner.py @@ -107,6 +107,7 @@ def run_azd_eval( command = [ "azd", + "--no-prompt", "ai", "agent", "eval", @@ -142,6 +143,7 @@ def run_azd_eval( show = _run_command( [ "azd", + "--no-prompt", "ai", "agent", "eval", @@ -327,6 +329,7 @@ def _run_command( encoding="utf-8", errors="replace", capture_output=True, + stdin=subprocess.DEVNULL, timeout=timeout_seconds, check=False, ) diff --git a/src/agentops/pipeline/runtime.py b/src/agentops/pipeline/runtime.py index c3cb675d..b22e4dbe 100644 --- a/src/agentops/pipeline/runtime.py +++ b/src/agentops/pipeline/runtime.py @@ -64,7 +64,10 @@ def _credential() -> Any: return DefaultAzureCredential(exclude_developer_cli_credential=True, process_timeout=30) -def _model_config() -> Dict[str, str]: +_REASONING_MODEL_PREFIXES = ("gpt-5", "o1", "o3", "o4") + + +def _model_config() -> Dict[str, Any]: from agentops.utils.azure_endpoints import ( derive_openai_endpoint_from_project, normalize_azure_openai_endpoint, @@ -110,14 +113,23 @@ def _model_config() -> Dict[str, str]: "Missing environment variables: " + ", ".join(missing) + "." + hint ) - config: Dict[str, str] = { - "azure_endpoint": endpoint, # type: ignore[dict-item] - "azure_deployment": deployment, # type: ignore[dict-item] + config: Dict[str, Any] = { + "azure_endpoint": endpoint, + "azure_deployment": deployment, "api_version": api_version, } return config +def _is_reasoning_model_deployment(deployment: Optional[str]) -> bool: + """Return whether an evaluator deployment needs reasoning-model parameters.""" + + if not deployment: + return False + normalized = deployment.strip().lower() + return any(normalized.startswith(prefix) for prefix in _REASONING_MODEL_PREFIXES) + + def _project_endpoint() -> str: endpoint = os.getenv("AZURE_AI_FOUNDRY_PROJECT_ENDPOINT") if not endpoint: @@ -152,7 +164,10 @@ def load_evaluator(preset: EvaluatorPreset) -> EvaluatorRuntime: init_kwargs: Dict[str, Any] = {} if preset.class_name in _AI_ASSISTED: - init_kwargs["model_config"] = _model_config() + model_config = _model_config() + init_kwargs["model_config"] = model_config + if _is_reasoning_model_deployment(model_config.get("azure_deployment")): + init_kwargs["is_reasoning_model"] = True if preset.class_name in _SAFETY: init_kwargs["azure_ai_project"] = _project_endpoint() init_kwargs["credential"] = _credential() diff --git a/tests/unit/test_agent_checks_opex_workspace.py b/tests/unit/test_agent_checks_opex_workspace.py index 0aba4086..adc300c5 100644 --- a/tests/unit/test_agent_checks_opex_workspace.py +++ b/tests/unit/test_agent_checks_opex_workspace.py @@ -412,7 +412,7 @@ def test_workflow_sha_pinning_skips_local_actions(workspace: Path) -> None: # --------------------------------------------------------------------------- -# AI.26 max_tokens limit (opex.max_tokens_undefined) +# AI.26 output token limit (opex.max_tokens_undefined) # --------------------------------------------------------------------------- @@ -429,7 +429,7 @@ def test_max_tokens_undefined_fires_when_bundle_lacks_max_tokens(tmp_path: Path) findings = run_opex_workspace_check(tmp_path) f = next((f for f in findings if f.id == "opex.max_tokens_undefined"), None) assert f is not None - assert "default.yaml" in f.evidence["files_without_max_tokens"][0] + assert "default.yaml" in f.evidence["files_without_token_limit"][0] def test_max_tokens_undefined_silent_when_every_file_declares_it(tmp_path: Path) -> None: @@ -447,6 +447,19 @@ def test_max_tokens_undefined_silent_when_every_file_declares_it(tmp_path: Path) assert not any(f.id == "opex.max_tokens_undefined" for f in findings) +def test_max_tokens_undefined_silent_when_reasoning_file_declares_completion_limit( + tmp_path: Path, +) -> None: + (tmp_path / "agentops.yaml").write_text( + "version: 1\nagent: my-agent:2\nmodel: gpt-5\nmax_completion_tokens: 800\n", + encoding="utf-8", + ) + + findings = run_opex_workspace_check(tmp_path) + + assert not any(f.id == "opex.max_tokens_undefined" for f in findings) + + def test_max_tokens_undefined_silent_when_no_model_shaped_files(tmp_path: Path) -> None: # Empty workspace - no model / evaluator / deployment keys anywhere. (tmp_path / "agentops.yaml").write_text( diff --git a/tests/unit/test_azd_runner.py b/tests/unit/test_azd_runner.py index a8c3edc5..a14a166d 100644 --- a/tests/unit/test_azd_runner.py +++ b/tests/unit/test_azd_runner.py @@ -52,11 +52,12 @@ def test_run_azd_eval_reads_show_out_file_when_run_outputs_text( _write_recipe(recipe_path) calls: list[list[str]] = [] - def fake_run(command, *, cwd, text, encoding, errors, capture_output, timeout, check): + def fake_run(command, *, cwd, text, encoding, errors, capture_output, stdin, timeout, check): calls.append(command) assert encoding == "utf-8" assert errors == "replace" - if command[:5] == ["azd", "ai", "agent", "eval", "run"]: + assert stdin is azd_runner.subprocess.DEVNULL + if command[:6] == ["azd", "--no-prompt", "ai", "agent", "eval", "run"]: return mock.Mock( returncode=0, stdout=( @@ -66,7 +67,7 @@ def fake_run(command, *, cwd, text, encoding, errors, capture_output, timeout, c ), stderr="", ) - if command[:5] == ["azd", "ai", "agent", "eval", "show"]: + if command[:6] == ["azd", "--no-prompt", "ai", "agent", "eval", "show"]: out_file = Path(command[command.index("--out-file") + 1]) out_file.write_text( json.dumps( @@ -98,7 +99,8 @@ def fake_run(command, *, cwd, text, encoding, errors, capture_output, timeout, c assert result.run_id == "evalrun_456" assert result.payload["eval_id"] == "eval_123" assert azd_runner._extract_numeric_metrics(result.payload) == {"coherence": 1.0} - assert calls[1][:6] == ["azd", "ai", "agent", "eval", "show", "eval_123"] + assert calls[0][:6] == ["azd", "--no-prompt", "ai", "agent", "eval", "run"] + assert calls[1][:7] == ["azd", "--no-prompt", "ai", "agent", "eval", "show", "eval_123"] def test_run_command_with_progress_emits_heartbeat( diff --git a/tests/unit/test_runtime_model_config.py b/tests/unit/test_runtime_model_config.py index dfc15022..e21c28da 100644 --- a/tests/unit/test_runtime_model_config.py +++ b/tests/unit/test_runtime_model_config.py @@ -12,9 +12,13 @@ from __future__ import annotations +from types import SimpleNamespace + import pytest -from agentops.pipeline.runtime import _model_config +from agentops.core.evaluators import EvaluatorPreset +from agentops.pipeline import runtime +from agentops.pipeline.runtime import _is_reasoning_model_deployment, _model_config @pytest.fixture(autouse=True) @@ -98,3 +102,71 @@ def test_api_version_override(monkeypatch: pytest.MonkeyPatch) -> None: cfg = _model_config() assert cfg["api_version"] == "2024-12-01-preview" + + +@pytest.mark.parametrize("deployment", ["gpt-5", "gpt-5.4-mini", "o1-preview", "o3-mini", "o4-mini"]) +def test_reasoning_model_deployment_detection(deployment: str) -> None: + assert _is_reasoning_model_deployment(deployment) is True + + +@pytest.mark.parametrize("deployment", ["gpt-4o-mini", "gpt-4.1", "my-chat-grader", ""]) +def test_non_reasoning_model_deployment_detection(deployment: str) -> None: + assert _is_reasoning_model_deployment(deployment) is False + + +def test_ai_assisted_evaluator_marks_reasoning_deployments( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setenv("AZURE_OPENAI_ENDPOINT", "https://x.openai.azure.com") + monkeypatch.setenv("AZURE_OPENAI_DEPLOYMENT", "gpt-5.4-mini") + captured: dict[str, object] = {} + + class FakeCoherenceEvaluator: + def __init__(self, **kwargs): + captured.update(kwargs) + + monkeypatch.setattr( + runtime.importlib, + "import_module", + lambda _name: SimpleNamespace(CoherenceEvaluator=FakeCoherenceEvaluator), + ) + + preset = EvaluatorPreset( + name="coherence", + class_name="CoherenceEvaluator", + score_key="coherence", + input_mapping={}, + ) + runtime.load_evaluator(preset) + + assert captured["model_config"]["azure_deployment"] == "gpt-5.4-mini" + assert captured["is_reasoning_model"] is True + + +def test_ai_assisted_evaluator_leaves_chat_deployments_default( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setenv("AZURE_OPENAI_ENDPOINT", "https://x.openai.azure.com") + monkeypatch.setenv("AZURE_OPENAI_DEPLOYMENT", "gpt-4o-mini") + captured: dict[str, object] = {} + + class FakeCoherenceEvaluator: + def __init__(self, **kwargs): + captured.update(kwargs) + + monkeypatch.setattr( + runtime.importlib, + "import_module", + lambda _name: SimpleNamespace(CoherenceEvaluator=FakeCoherenceEvaluator), + ) + + preset = EvaluatorPreset( + name="coherence", + class_name="CoherenceEvaluator", + score_key="coherence", + input_mapping={}, + ) + runtime.load_evaluator(preset) + + assert captured["model_config"]["azure_deployment"] == "gpt-4o-mini" + assert "is_reasoning_model" not in captured