Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,14 @@ This format follows [Keep a Changelog](https://keepachangelog.com/) and adheres
## [0.3.11] - 2026-06-08

### Fixed
- **Local AI-assisted evaluators now support reasoning-model graders.** When
`AZURE_OPENAI_DEPLOYMENT` points at `gpt-5*`, `o1*`, `o3*`, or `o4*`,
AgentOps marks the Azure AI Evaluation evaluator model as reasoning-capable so
the SDK sends `max_completion_tokens` instead of the unsupported `max_tokens`.
- **`agentops eval run` no longer hides interactive azd prompts while appearing
to hang.** The azd backend now runs `azd ai agent eval run` and the follow-up
`show` command with `--no-prompt` and a closed stdin, so any missing
authentication/configuration fails visibly instead of waiting indefinitely.
- **`agentops eval init` now bootstraps the minimal azd prompt-agent context.**
For Foundry prompt-agent configs, the command creates missing `azure.yaml` and
`src/<agent>/agent.yaml` files, enriches the active `.azure/<env>/.env` with
Expand Down
2 changes: 1 addition & 1 deletion docs/doctor-checks.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ stopping the whole run.
| `opex.stale_evaluation` | warning / critical | `results_history` | programmatic | latest run older than `stale_after_days` (critical at 2×) |
| `opex.flaky_metric.<metric>` | warning | `results_history` | programmatic | coefficient of variation across last N runs > `flaky_cv_threshold` |
| `opex.no_token_telemetry` | warning | `azure_monitor` | programmatic | `request_count > 0` but `gen_ai.usage.input_tokens + output_tokens == 0` |
| `opex.max_tokens_undefined` | warning | `workspace_files` | programmatic | no `max_tokens:` declared in any `agentops.yaml` / bundle YAML that configures a model |
| `opex.max_tokens_undefined` | warning | `workspace_files` | programmatic | no `max_tokens:` or `max_completion_tokens:` declared in any `agentops.yaml` / bundle YAML that configures a model |
| `opex.llm.bundle_coverage` | info / warning | `workspace_files` | llm-judged | judge compares bundle YAML against agent description and flags missing built-ins |
| `opex.spec_conformance.spec_missing` | warning | `workspace_files` | programmatic | spec-driven setup detected (`.specify/`, `AGENTS.md`, or Copilot instructions) but no readable spec body, so Doctor cannot verify bundles / datasets / tasks against intended agent behavior |
| `opex.spec_conformance.tasks_stale` | warning | `workspace_files` | programmatic | unchecked task-list items in the spec have remained open past `stale_after_days`, which suggests the implementation plan may be stale or the task list was not maintained |
Expand Down
6 changes: 3 additions & 3 deletions src/agentops/agent/checks/catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -507,10 +507,10 @@ def is_llm_judged(self) -> bool:
CheckSpec(
id="opex.max_tokens_undefined",
category=Category.OPERATIONAL_EXCELLENCE,
title="`max_tokens` is not set on model / evaluator configuration",
title="Output token limit is not set on model / evaluator configuration",
summary=(
"Unbounded `max_tokens` invites long, expensive responses "
"and unpredictable latency."
"Missing `max_tokens` / `max_completion_tokens` limits invite "
"long, expensive responses and unpredictable latency."
),
severities=(Severity.WARNING,),
requires=("workspace",),
Expand Down
40 changes: 20 additions & 20 deletions src/agentops/agent/checks/opex_workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -560,7 +560,7 @@ def _safe_load_yaml(path: Path) -> Optional[dict]:


def _check_max_tokens_limit(workspace: Path) -> List[Finding]:
"""AI.26 — every model deployment / call should set a ``max_tokens`` limit.
"""AI.26 — every model deployment / call should set an output token limit.

Without an upper bound, a runaway prompt or a malicious user can
drive the bill arbitrarily high. We look in two places:
Expand All @@ -572,7 +572,7 @@ def _check_max_tokens_limit(workspace: Path) -> List[Finding]:
The check is permissive: it fires only when at least one file
explicitly looks like it configures a model (has ``model:``,
``deployment:``, or an ``evaluators:`` list) **and** none of the
candidate files declares ``max_tokens``. That avoids false
candidate files declares ``max_tokens`` or ``max_completion_tokens``. That avoids false
positives on bare workspaces / agent-only configs.
"""
candidates: List[Path] = []
Expand All @@ -586,18 +586,19 @@ def _check_max_tokens_limit(workspace: Path) -> List[Finding]:
return []

looks_model_shaped = False
files_with_max_tokens: List[str] = []
files_without_max_tokens: List[str] = []
files_with_token_limit: List[str] = []
files_without_token_limit: List[str] = []

for path in candidates:
try:
text = path.read_text(encoding="utf-8")
except OSError:
continue
# Cheap, format-agnostic detection: matches `max_tokens: <n>`
# at any nesting level in any of the candidate YAMLs.
if re.search(r"(?m)^\s*max_tokens\s*:", text):
files_with_max_tokens.append(str(path.relative_to(workspace)).replace("\\", "/"))
# Cheap, format-agnostic detection: matches output token limits at
# any nesting level in any of the candidate YAMLs. Reasoning models
# require `max_completion_tokens`; older chat models use `max_tokens`.
if re.search(r"(?m)^\s*(max_tokens|max_completion_tokens)\s*:", text):
files_with_token_limit.append(str(path.relative_to(workspace)).replace("\\", "/"))
looks_model_shaped = True
continue
# Only count files that actually look like they configure a model.
Expand All @@ -606,40 +607,39 @@ def _check_max_tokens_limit(workspace: Path) -> List[Finding]:
text,
):
looks_model_shaped = True
files_without_max_tokens.append(
files_without_token_limit.append(
str(path.relative_to(workspace)).replace("\\", "/")
)

if not looks_model_shaped:
return []
if files_with_max_tokens and not files_without_max_tokens:
if files_with_token_limit and not files_without_token_limit:
return []
if not files_without_max_tokens:
if not files_without_token_limit:
return []

return [
Finding(
id="opex.max_tokens_undefined",
severity=Severity.WARNING,
category=Category.OPERATIONAL_EXCELLENCE,
title="`max_tokens` is not set on model / evaluator configuration",
title="Output token limit is not set on model / evaluator configuration",
summary=(
"Found model / evaluator YAML files that do not declare "
"a `max_tokens:` ceiling. Without an upper bound a single "
"a `max_tokens:` or `max_completion_tokens:` ceiling. Without an upper bound a single "
"runaway completion or a malicious prompt can drive token "
"spend arbitrarily high."
),
recommendation=(
"Add a `max_tokens:` field next to each `model:` / "
"`deployment:` block (and inside `model_config:` for "
"AI-assisted evaluators). Pick a value just above your "
"longest legitimate response so legitimate traffic isn't "
"truncated."
"Add a `max_tokens:` field for chat models, or "
"`max_completion_tokens:` for reasoning models such as "
"`gpt-5` and `o` series deployments. Pick a value just above "
"your longest legitimate response so legitimate traffic isn't truncated."
),
source=SOURCE_NAME,
evidence={
"files_without_max_tokens": files_without_max_tokens[:10],
"files_with_max_tokens": files_with_max_tokens[:10],
"files_without_token_limit": files_without_token_limit[:10],
"files_with_token_limit": files_with_token_limit[:10],
},
)
]
Expand Down
3 changes: 3 additions & 0 deletions src/agentops/pipeline/azd_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ def run_azd_eval(

command = [
"azd",
"--no-prompt",
"ai",
"agent",
"eval",
Expand Down Expand Up @@ -142,6 +143,7 @@ def run_azd_eval(
show = _run_command(
[
"azd",
"--no-prompt",
"ai",
"agent",
"eval",
Expand Down Expand Up @@ -327,6 +329,7 @@ def _run_command(
encoding="utf-8",
errors="replace",
capture_output=True,
stdin=subprocess.DEVNULL,
timeout=timeout_seconds,
check=False,
)
Expand Down
25 changes: 20 additions & 5 deletions src/agentops/pipeline/runtime.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,10 @@ def _credential() -> Any:
return DefaultAzureCredential(exclude_developer_cli_credential=True, process_timeout=30)


def _model_config() -> Dict[str, str]:
# Deployment-name prefixes that `_is_reasoning_model_deployment` treats as
# reasoning-model graders. Kept as a tuple of lowercase strings because the
# helper lowercases the deployment name before comparing.
_REASONING_MODEL_PREFIXES = ("gpt-5", "o1", "o3", "o4")


def _model_config() -> Dict[str, Any]:
from agentops.utils.azure_endpoints import (
derive_openai_endpoint_from_project,
normalize_azure_openai_endpoint,
Expand Down Expand Up @@ -110,14 +113,23 @@ def _model_config() -> Dict[str, str]:
"Missing environment variables: " + ", ".join(missing) + "." + hint
)

config: Dict[str, str] = {
"azure_endpoint": endpoint, # type: ignore[dict-item]
"azure_deployment": deployment, # type: ignore[dict-item]
config: Dict[str, Any] = {
"azure_endpoint": endpoint,
"azure_deployment": deployment,
"api_version": api_version,
}
return config


def _is_reasoning_model_deployment(deployment: Optional[str]) -> bool:
"""Return whether an evaluator deployment needs reasoning-model parameters."""

if not deployment:
return False
normalized = deployment.strip().lower()
return any(normalized.startswith(prefix) for prefix in _REASONING_MODEL_PREFIXES)


def _project_endpoint() -> str:
endpoint = os.getenv("AZURE_AI_FOUNDRY_PROJECT_ENDPOINT")
if not endpoint:
Expand Down Expand Up @@ -152,7 +164,10 @@ def load_evaluator(preset: EvaluatorPreset) -> EvaluatorRuntime:

init_kwargs: Dict[str, Any] = {}
if preset.class_name in _AI_ASSISTED:
init_kwargs["model_config"] = _model_config()
model_config = _model_config()
init_kwargs["model_config"] = model_config
if _is_reasoning_model_deployment(model_config.get("azure_deployment")):
init_kwargs["is_reasoning_model"] = True
if preset.class_name in _SAFETY:
init_kwargs["azure_ai_project"] = _project_endpoint()
init_kwargs["credential"] = _credential()
Expand Down
17 changes: 15 additions & 2 deletions tests/unit/test_agent_checks_opex_workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -412,7 +412,7 @@ def test_workflow_sha_pinning_skips_local_actions(workspace: Path) -> None:


# ---------------------------------------------------------------------------
# AI.26 max_tokens limit (opex.max_tokens_undefined)
# AI.26 output token limit (opex.max_tokens_undefined)
# ---------------------------------------------------------------------------


Expand All @@ -429,7 +429,7 @@ def test_max_tokens_undefined_fires_when_bundle_lacks_max_tokens(tmp_path: Path)
findings = run_opex_workspace_check(tmp_path)
f = next((f for f in findings if f.id == "opex.max_tokens_undefined"), None)
assert f is not None
assert "default.yaml" in f.evidence["files_without_max_tokens"][0]
assert "default.yaml" in f.evidence["files_without_token_limit"][0]


def test_max_tokens_undefined_silent_when_every_file_declares_it(tmp_path: Path) -> None:
Expand All @@ -447,6 +447,19 @@ def test_max_tokens_undefined_silent_when_every_file_declares_it(tmp_path: Path)
assert not any(f.id == "opex.max_tokens_undefined" for f in findings)


def test_max_tokens_undefined_silent_when_reasoning_file_declares_completion_limit(
    tmp_path: Path,
) -> None:
    """A config that sets ``max_completion_tokens`` must not raise the finding.

    Writes a single ``agentops.yaml`` that names a ``gpt-5`` model and declares
    ``max_completion_tokens`` (no ``max_tokens``), then checks that
    ``opex.max_tokens_undefined`` is absent from the workspace findings.
    """
    (tmp_path / "agentops.yaml").write_text(
        "version: 1\nagent: my-agent:2\nmodel: gpt-5\nmax_completion_tokens: 800\n",
        encoding="utf-8",
    )

    findings = run_opex_workspace_check(tmp_path)

    assert not any(f.id == "opex.max_tokens_undefined" for f in findings)


def test_max_tokens_undefined_silent_when_no_model_shaped_files(tmp_path: Path) -> None:
# Empty workspace - no model / evaluator / deployment keys anywhere.
(tmp_path / "agentops.yaml").write_text(
Expand Down
10 changes: 6 additions & 4 deletions tests/unit/test_azd_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,11 +52,12 @@ def test_run_azd_eval_reads_show_out_file_when_run_outputs_text(
_write_recipe(recipe_path)
calls: list[list[str]] = []

def fake_run(command, *, cwd, text, encoding, errors, capture_output, timeout, check):
def fake_run(command, *, cwd, text, encoding, errors, capture_output, stdin, timeout, check):
calls.append(command)
assert encoding == "utf-8"
assert errors == "replace"
if command[:5] == ["azd", "ai", "agent", "eval", "run"]:
assert stdin is azd_runner.subprocess.DEVNULL
if command[:6] == ["azd", "--no-prompt", "ai", "agent", "eval", "run"]:
return mock.Mock(
returncode=0,
stdout=(
Expand All @@ -66,7 +67,7 @@ def fake_run(command, *, cwd, text, encoding, errors, capture_output, timeout, c
),
stderr="",
)
if command[:5] == ["azd", "ai", "agent", "eval", "show"]:
if command[:6] == ["azd", "--no-prompt", "ai", "agent", "eval", "show"]:
out_file = Path(command[command.index("--out-file") + 1])
out_file.write_text(
json.dumps(
Expand Down Expand Up @@ -98,7 +99,8 @@ def fake_run(command, *, cwd, text, encoding, errors, capture_output, timeout, c
assert result.run_id == "evalrun_456"
assert result.payload["eval_id"] == "eval_123"
assert azd_runner._extract_numeric_metrics(result.payload) == {"coherence": 1.0}
assert calls[1][:6] == ["azd", "ai", "agent", "eval", "show", "eval_123"]
assert calls[0][:6] == ["azd", "--no-prompt", "ai", "agent", "eval", "run"]
assert calls[1][:7] == ["azd", "--no-prompt", "ai", "agent", "eval", "show", "eval_123"]


def test_run_command_with_progress_emits_heartbeat(
Expand Down
74 changes: 73 additions & 1 deletion tests/unit/test_runtime_model_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,13 @@

from __future__ import annotations

from types import SimpleNamespace

import pytest

from agentops.pipeline.runtime import _model_config
from agentops.core.evaluators import EvaluatorPreset
from agentops.pipeline import runtime
from agentops.pipeline.runtime import _is_reasoning_model_deployment, _model_config


@pytest.fixture(autouse=True)
Expand Down Expand Up @@ -98,3 +102,71 @@ def test_api_version_override(monkeypatch: pytest.MonkeyPatch) -> None:
cfg = _model_config()

assert cfg["api_version"] == "2024-12-01-preview"


@pytest.mark.parametrize("deployment", ["gpt-5", "gpt-5.4-mini", "o1-preview", "o3-mini", "o4-mini"])
def test_reasoning_model_deployment_detection(deployment: str) -> None:
    """Each listed deployment name is classified as a reasoning deployment."""
    assert _is_reasoning_model_deployment(deployment) is True


@pytest.mark.parametrize("deployment", ["gpt-4o-mini", "gpt-4.1", "my-chat-grader", ""])
def test_non_reasoning_model_deployment_detection(deployment: str) -> None:
    """Chat-model names, a custom name, and the empty string are not reasoning deployments."""
    assert _is_reasoning_model_deployment(deployment) is False


def test_ai_assisted_evaluator_marks_reasoning_deployments(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """``load_evaluator`` passes ``is_reasoning_model=True`` for a ``gpt-5*`` deployment.

    The evaluator class is replaced with a fake that records its constructor
    keyword arguments, so no real evaluator module is imported.
    """
    monkeypatch.setenv("AZURE_OPENAI_ENDPOINT", "https://x.openai.azure.com")
    monkeypatch.setenv("AZURE_OPENAI_DEPLOYMENT", "gpt-5.4-mini")
    captured: dict[str, object] = {}

    class FakeCoherenceEvaluator:
        # Stand-in evaluator: only captures the kwargs it is constructed with.
        def __init__(self, **kwargs):
            captured.update(kwargs)

    # Whatever module name the runtime asks for, hand back a namespace that
    # exposes the fake under the class name used by the preset below.
    monkeypatch.setattr(
        runtime.importlib,
        "import_module",
        lambda _name: SimpleNamespace(CoherenceEvaluator=FakeCoherenceEvaluator),
    )

    preset = EvaluatorPreset(
        name="coherence",
        class_name="CoherenceEvaluator",
        score_key="coherence",
        input_mapping={},
    )
    runtime.load_evaluator(preset)

    assert captured["model_config"]["azure_deployment"] == "gpt-5.4-mini"
    assert captured["is_reasoning_model"] is True


def test_ai_assisted_evaluator_leaves_chat_deployments_default(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """``load_evaluator`` omits ``is_reasoning_model`` for a ``gpt-4o-mini`` deployment.

    The evaluator class is replaced with a fake that records its constructor
    keyword arguments, so the test can assert the key is absent rather than
    merely falsy.
    """
    monkeypatch.setenv("AZURE_OPENAI_ENDPOINT", "https://x.openai.azure.com")
    monkeypatch.setenv("AZURE_OPENAI_DEPLOYMENT", "gpt-4o-mini")
    captured: dict[str, object] = {}

    class FakeCoherenceEvaluator:
        # Stand-in evaluator: only captures the kwargs it is constructed with.
        def __init__(self, **kwargs):
            captured.update(kwargs)

    # Whatever module name the runtime asks for, hand back a namespace that
    # exposes the fake under the class name used by the preset below.
    monkeypatch.setattr(
        runtime.importlib,
        "import_module",
        lambda _name: SimpleNamespace(CoherenceEvaluator=FakeCoherenceEvaluator),
    )

    preset = EvaluatorPreset(
        name="coherence",
        class_name="CoherenceEvaluator",
        score_key="coherence",
        input_mapping={},
    )
    runtime.load_evaluator(preset)

    assert captured["model_config"]["azure_deployment"] == "gpt-4o-mini"
    assert "is_reasoning_model" not in captured