From 23af5902465147e8e97ab7011389980f9b1b91e1 Mon Sep 17 00:00:00 2001 From: Paulo Lacerda Date: Mon, 8 Jun 2026 22:34:57 -0300 Subject: [PATCH 1/2] feat: add Foundry observability rubric gates Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- CHANGELOG.md | 24 +++ docs/tutorial-end-to-end.md | 16 +- docs/tutorial-hosted-agent-quickstart.md | 15 ++ docs/tutorial-prompt-agent-quickstart.md | 71 ++++++- src/agentops/agent/analyzer.py | 2 + src/agentops/agent/checks/catalog.py | 56 ++++++ src/agentops/agent/checks/observability.py | 175 ++++++++++++++++++ src/agentops/agent/cockpit.py | 96 ++++++++++ src/agentops/core/agentops_config.py | 129 +++++++++++++ src/agentops/core/release_evidence.py | 1 + src/agentops/pipeline/azd_runner.py | 44 +++++ src/agentops/pipeline/orchestrator.py | 6 + src/agentops/services/azd_eval_init.py | 17 +- src/agentops/services/evidence_pack.py | 94 +++++++++- src/agentops/services/trace_promotion.py | 90 +++++++++ src/agentops/templates/agentops.yaml | 39 ++++ tests/unit/test_agent_checks_observability.py | 76 ++++++++ tests/unit/test_agentops_config.py | 69 +++++++ tests/unit/test_azd_eval_init.py | 48 +++++ tests/unit/test_azd_runner.py | 156 ++++++++++++++++ tests/unit/test_cockpit.py | 37 ++++ tests/unit/test_release_evidence.py | 41 ++++ tests/unit/test_trace_promotion.py | 47 +++++ 23 files changed, 1341 insertions(+), 8 deletions(-) create mode 100644 src/agentops/agent/checks/observability.py create mode 100644 tests/unit/test_agent_checks_observability.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 38d0b81d..634f196c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,30 @@ All notable changes to this project will be documented in this file. This format follows [Keep a Changelog](https://keepachangelog.com/) and adheres to [Semantic Versioning](https://semver.org/). +## [Unreleased] + +### Added +- **Foundry observability readiness now spans eval, Doctor, Cockpit, and release evidence.** + `agentops.yaml` supports `dataset_kind`, `rubrics`, and `observability` + metadata for multi-turn coverage, rubric evaluator gates, trace sampling, and + replay/evaluation/dataset links. Doctor and Cockpit surface the readiness + state without mutating cloud resources, and release evidence records the same + signals for reviewers. +- **Trace promotion preserves evaluation lineage.** `agentops eval + promote-traces` now carries operation/span IDs, source system, agent version, + replay/evaluation URLs, sampling policy, and multi-turn message fields into + candidate datasets and their manifest. + +### Changed +- **Rubric evaluators are executed through the azd backend.** When `rubrics:` + is configured, `agentops eval init` includes those evaluator names in the azd + recipe and `agentops eval run` fails closed outside `execution: azd`, so rubric + scores cannot be treated as evidence unless Foundry / azd actually ran them. +- **Tutorials now carry rubric and observability proof into evaluation and CI/CD.** + The Travel Agent flow keeps the existing smoke recording through step 10, then + upgrades the gate to multi-turn dataset rows, rubric thresholds, trace + sampling/replay lineage, and CI/CD workflows that reuse the same eval contract. + ## [0.3.11] - 2026-06-08 ### Fixed diff --git a/docs/tutorial-end-to-end.md b/docs/tutorial-end-to-end.md index cc884455..08ebdd23 100644 --- a/docs/tutorial-end-to-end.md +++ b/docs/tutorial-end-to-end.md @@ -444,6 +444,13 @@ Foundry through `agentops eval run`, so AgentOps can enforce thresholds and writ repo-side evidence. AgentOps keeps the local path for hosted endpoints, models, unsupported evaluator mappings, and fallback cases. +When the quality gate uses a task-specific rubric, choose the azd runner instead +of local execution. Add `rubrics:` to `agentops.yaml`, set +`rubrics[].evaluator` to the Foundry / azd evaluator name, set +`execution: azd`, and run `agentops eval init --force`. AgentOps then passes the +rubric evaluator into the generated azd recipe and fails closed if someone tries +to run that rubric gate with the local backend. + ## 5. Run the first eval For hosted agents or local fallback: @@ -651,7 +658,9 @@ agentops workflow generate ` The generated workflows are intentionally boring: -- PR gate: evaluate and publish report/evidence. +- PR gate: evaluate and publish report/evidence. If `agentops.yaml` declares + rubric evaluators, this is the same azd/Foundry rubric gate you ran locally; + the PR does not downgrade to a plain smoke test. - Dev/QA/Prod: deploy with azd or placeholders, then run readiness checks. - Optional Doctor cadence: generate `--kinds doctor` separately if you want a scheduled readiness run outside PRs. @@ -698,10 +707,11 @@ Use this loop in the video: | Signal | Foundry or Azure Monitor action | AgentOps handoff | |---|---|---| | App Insights connection | In Foundry, open the project or agent **Traces** view and connect an App Insights resource. Verify it under project connected resources. | Doctor checks whether telemetry wiring is discoverable. | +| Trace sampling | Configure the project's trace sampling policy in Foundry or the hosted-agent observability settings your team owns. Keep the policy name in `agentops.yaml` under `observability.trace_sampling`. | Doctor/evidence can show reviewers that live-quality sampling exists before traces are promoted. | | Live trace | Run one playground prompt for a Prompt Agent, or call the hosted endpoint a few times. Open the agent **Traces** tab, wait 2-5 minutes if needed, and click the Trace ID. In the modal, inspect spans plus the **Input + Output** and **Metadata** tabs. | Evidence and Cockpit link reviewers back to the runtime view. | | Operate summary | Switch to **Operate** -> **Overview**, select the same subscription/project, wait for metrics to sync, and use **Ask AI** for dashboard-level questions such as `Help me identify any issues or anomalies in my agent metrics.` | The summary informs the release discussion; AgentOps does not rewrite it. | -| Eval context | From a Foundry eval run, inspect row-level explanations and, when available, the trace attached to the interaction. | The repo keeps the exact target, dataset, gate, and evidence together. | -| Trace learning | Export or curate traces that represent real issues. | `agentops eval promote-traces` turns reviewed traces into regression candidates. | +| Eval context | From a Foundry eval run, inspect row-level explanations, rubric scores, and, when available, the trace attached to the interaction. | The repo keeps the exact target, dataset, rubric gate, and evidence together. | +| Trace learning | Export or curate traces that represent real issues, including conversation turns when present. | `agentops eval promote-traces` turns reviewed traces into regression candidates and preserves replay/evaluation lineage. | For the screen recording, make the Foundry side visible before opening AgentOps Cockpit: diff --git a/docs/tutorial-hosted-agent-quickstart.md b/docs/tutorial-hosted-agent-quickstart.md index 13bc1e4c..1b38f626 100644 --- a/docs/tutorial-hosted-agent-quickstart.md +++ b/docs/tutorial-hosted-agent-quickstart.md @@ -648,6 +648,14 @@ This is the core AgentOps loop for hosted endpoints: keep a stable dataset, compare a changed runtime against the last known result, fix the agent, and rerun the same gate before a PR or release. +If this hosted endpoint is backed by a Foundry / azd eval recipe, you can use +the same rubric contract as the prompt-agent Travel Agent tutorial before you +generate CI: set `execution: azd`, add `dataset_kind: multi-turn`, declare +`rubrics[].evaluator` in `agentops.yaml`, run `agentops eval init --force`, and +then run `agentops eval run`. AgentOps will require the azd backend whenever +rubrics are configured, so a passing hosted-agent gate means the rubric evaluator +actually ran instead of being recorded as metadata only. + ## 10. Generate CI and Doctor evidence Generate both the PR and dev deploy workflows with `--doctor-gate critical` @@ -666,6 +674,13 @@ code .agentops\agent\report.md code .agentops\release\latest\evidence.md ``` +The generated PR gate reuses the same `agentops.yaml` contract. If you promoted +the hosted endpoint to an azd/Foundry eval recipe with rubrics, CI runs that +recipe and blocks on the rubric thresholds; otherwise it runs the local hosted +endpoint gate and normalized thresholds. In both cases Doctor and the evidence +pack surface multi-turn coverage, trace sampling readiness, replay/evaluation +links, and trace-to-dataset lineage when those signals exist. + > **`--deploy-mode prompt-agent` does not apply to hosted endpoints.** > That mode is specific to Foundry prompt agents (the stage-prompt-as- > candidate flow). For hosted endpoints, `agentops workflow generate` diff --git a/docs/tutorial-prompt-agent-quickstart.md b/docs/tutorial-prompt-agent-quickstart.md index 8af0f0bd..91ee0b6c 100644 --- a/docs/tutorial-prompt-agent-quickstart.md +++ b/docs/tutorial-prompt-agent-quickstart.md @@ -803,6 +803,66 @@ You should see `execution: azd` and `Threshold status: PASSED`. The raw azd run details are kept under `.agentops/results/latest/` alongside AgentOps' normalized `results.json` and `report.md`. +Before generating CI, turn the Travel Agent gate from a basic smoke test into +the proof you want reviewers to see later. Keep the recording you already made +through this step: the smoke run above proves the workspace works. The next +commands only harden the same gate. + +Create a small conversation-shaped dataset. It still keeps `input` and +`expected` so AgentOps and azd can route the row, but it also carries the +conversation turns that multi-turn evaluators and trace-derived rows use: + +```powershell +@' +{"input":"Plan a three-day Rome trip for a family with kids. Ask one clarification if needed.","expected":"The agent should preserve the family-with-kids constraint, propose a practical three-day Rome itinerary, include transit/rest pacing, and avoid claiming it can book live reservations.","messages":[{"role":"user","content":"We want to visit Rome with two kids."},{"role":"assistant","content":"How many days do you have and what pace do you prefer?"},{"role":"user","content":"Three days, moderate pace, museums and food."}]} +{"input":"Help me choose between Lisbon and Seattle for a low-budget food weekend.","expected":"The agent should compare both destinations, mention budget tradeoffs, food activities, transit/weather notes, and avoid unsupported price or booking claims.","messages":[{"role":"user","content":"I need a low-budget food weekend."},{"role":"assistant","content":"Are you choosing between specific cities?"},{"role":"user","content":"Lisbon or Seattle."}]} +'@ | Set-Content -Encoding utf8 .agentops\data\travel-conversations.jsonl +``` + +Then update the evaluation contract in `agentops.yaml`. The important part is +that `rubrics[].evaluator` names the rubric evaluator that Foundry / azd will +run. If your Foundry Observe flow generated a different rubric evaluator name, +use that exact name here. + +```yaml +dataset: .agentops/data/travel-conversations.jsonl +dataset_kind: multi-turn + +rubrics: + - name: travel-concierge-quality + evaluator: travel-concierge-quality + description: Scores the Travel Agent against the intended product behavior. + dimensions: + - name: task_success + description: Completes the user's travel-planning goal across the conversation. + weight: 0.5 + - name: constraint_following + description: Carries user constraints such as kids, budget, duration, and pace. + weight: 0.3 + - name: safe_booking_behavior + description: Avoids claiming live bookings, confirmations, or prices it cannot verify. + weight: 0.2 + +thresholds: + task_success: ">=4" + constraint_following: ">=4" + safe_booking_behavior: ">=4" +``` + +Re-run init so the azd recipe includes the rubric evaluator in the actual +evaluation, not only in documentation: + +```powershell +agentops eval init --force +agentops eval run +``` + +If the rubric evaluator name is wrong or missing in Foundry, the run should fail +closed. That is intentional: a green gate must mean the rubric really ran. When +it passes, `results.json` records `execution: azd`, the evaluator list, the +rubric metadata from `agentops.yaml`, and threshold results for the rubric +dimensions. + ## 11. Generate the PR + dev deploy workflows > **Pipeline ownership.** This tutorial uses `agentops workflow generate` @@ -846,7 +906,11 @@ The PR workflow now has two jobs: `.agentops/deployments/agentops.candidate.yaml` pointing at the staged candidate. 2. **`eval`** — runs `agentops eval run` against the candidate, then - runs Doctor with `--severity-fail critical`. + runs Doctor with `--severity-fail critical`. Because the previous step + moved the gate to `execution: azd` with `rubrics:`, the workflow is not + just checking a smoke response: it runs the Foundry / azd evaluation recipe, + applies the Travel Agent rubric dimensions as thresholds, and writes the + normalized rubric evidence to `.agentops/results/latest/results.json`. > **Why does the PR workflow stage in dev, not sandbox?** The PR gate > must evaluate the same target the deploy workflow will use. Sandbox @@ -859,6 +923,9 @@ The PR workflow now has two jobs: The dev deploy workflow stages a candidate (same logic), evaluates it, summarizes the deployment via `prompt_deploy summarize`, and uploads `.agentops/deployments/foundry-agent.json` as a workflow artifact. +The deploy gate uses the same rubric-aware `agentops eval run`, so the candidate +that lands in dev has already passed the conversation/rubric gate reviewers saw +on the PR. The `--doctor-gate critical` flag controls the Doctor severity floor in the PR template. The table below summarizes the three values: @@ -1327,7 +1394,7 @@ deploys, explicit thresholds, or red-team/governance evidence. Treat those as th hardening backlog. The eval gates and the dev deploy loop are production-ready. -If you want to show the Build 2026 governance story in the video, keep it as a +If you want to show the governance evidence path in the video, keep it as a short optional callout: ```powershell diff --git a/src/agentops/agent/analyzer.py b/src/agentops/agent/analyzer.py index a9e2234b..4b0c2f0d 100644 --- a/src/agentops/agent/analyzer.py +++ b/src/agentops/agent/analyzer.py @@ -12,6 +12,7 @@ from agentops.agent.checks.foundry_config import run_foundry_config_check from agentops.agent.checks.governance import run_governance_check from agentops.agent.checks.latency import run_latency_check +from agentops.agent.checks.observability import run_observability_check from agentops.agent.checks.opex_workspace import run_opex_workspace_check from agentops.agent.checks.opex import run_opex_check from agentops.agent.checks.posture import run_posture_check @@ -146,6 +147,7 @@ def analyze( findings.extend(run_posture_check(resources, posture_config)) findings.extend(run_opex_workspace_check(workspace)) findings.extend(run_governance_check(workspace)) + findings.extend(run_observability_check(workspace)) findings.extend(run_opex_check(history, config.checks.opex)) findings.extend(run_release_readiness_check(workspace, history, foundry)) findings.extend( diff --git a/src/agentops/agent/checks/catalog.py b/src/agentops/agent/checks/catalog.py index c85902d5..7db2e2ab 100644 --- a/src/agentops/agent/checks/catalog.py +++ b/src/agentops/agent/checks/catalog.py @@ -141,6 +141,18 @@ "safety.config.continuous_eval_disabled": ( "https://learn.microsoft.com/azure/ai-foundry/how-to/online-evaluation" ), + "observability.multiturn_coverage_missing": ( + "https://learn.microsoft.com/azure/foundry/concepts/observability" + ), + "observability.rubric_missing": ( + "https://learn.microsoft.com/azure/foundry/concepts/observability" + ), + "observability.trace_sampling_missing": ( + "https://learn.microsoft.com/azure/foundry/concepts/observability" + ), + "observability.trace_replay_missing": ( + "https://learn.microsoft.com/azure/foundry/concepts/observability" + ), } @@ -199,6 +211,28 @@ def is_llm_judged(self) -> bool: requires=("results_history",), flags=("dynamic_id",), ), + CheckSpec( + id="observability.multiturn_coverage_missing", + category=Category.QUALITY, + title="Multi-turn evaluation coverage is not declared yet", + summary=( + "The workspace does not declare multi-turn dataset coverage or " + "trace-derived conversation rows for Foundry multi-turn evals." + ), + severities=(Severity.INFO,), + requires=("workspace",), + ), + CheckSpec( + id="observability.rubric_missing", + category=Category.QUALITY, + title="No context-specific rubric evaluator is declared", + summary=( + "The workspace does not declare a Foundry rubric evaluator or " + "rubric dimensions that can be bound to release thresholds." + ), + severities=(Severity.INFO,), + requires=("workspace",), + ), # ------------------------------------------------------------------ # Performance # ------------------------------------------------------------------ @@ -438,6 +472,28 @@ def is_llm_judged(self) -> bool: severities=(Severity.WARNING,), requires=("foundry_control",), ), + CheckSpec( + id="observability.trace_sampling_missing", + category=Category.OPERATIONAL_EXCELLENCE, + title="Intelligent trace sampling is not evidence-ready", + summary=( + "The workspace does not declare Foundry trace sampling and the " + "trace-regression manifest does not include sampling lineage." + ), + severities=(Severity.WARNING,), + requires=("workspace",), + ), + CheckSpec( + id="observability.trace_replay_missing", + category=Category.OPERATIONAL_EXCELLENCE, + title="Trace replay link is not captured in release evidence", + summary=( + "The workspace has no trace replay URL in agentops.yaml or in " + "trace-derived dataset lineage." + ), + severities=(Severity.INFO,), + requires=("workspace",), + ), CheckSpec( id="opex.results_not_gitignored", category=Category.OPERATIONAL_EXCELLENCE, diff --git a/src/agentops/agent/checks/observability.py b/src/agentops/agent/checks/observability.py new file mode 100644 index 00000000..40ce615a --- /dev/null +++ b/src/agentops/agent/checks/observability.py @@ -0,0 +1,175 @@ +"""Foundry observability readiness checks.""" + +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any, List + +from agentops.agent.findings import Category, Finding, Severity +from agentops.utils.yaml import load_yaml + +SOURCE_NAME = "observability" + + +def run_observability_check(workspace: Path) -> List[Finding]: + """Validate repo-side intent for Foundry observability signals. + + These checks are deliberately read-only. Foundry owns the runtime surfaces + for traces, intelligent sampling, replay, multi-turn eval, and rubric + evaluators; AgentOps verifies whether the repo has enough metadata and + evidence to make those signals part of release readiness. + """ + + config = _safe_config(workspace) + if not config and not (workspace / ".agentops").exists(): + return [] + + findings: List[Finding] = [] + findings.extend(_check_multiturn_coverage(config, workspace)) + findings.extend(_check_rubric_coverage(config)) + findings.extend(_check_trace_sampling(config, workspace)) + findings.extend(_check_trace_replay(config, workspace)) + return findings + + +def _check_multiturn_coverage(config: dict[str, Any], workspace: Path) -> List[Finding]: + if str(config.get("dataset_kind") or "auto") == "multi-turn": + return [] + manifest = _trace_manifest(workspace) + lineage = manifest.get("lineage") if isinstance(manifest, dict) else {} + if isinstance(lineage, dict) and int(lineage.get("multi_turn_rows") or 0) > 0: + return [] + return [ + Finding( + id="observability.multiturn_coverage_missing", + severity=Severity.INFO, + category=Category.QUALITY, + title="Multi-turn evaluation coverage is not declared yet", + summary=( + "Foundry multi-turn evaluation is designed to catch context " + "carryover, tone drift, contradictions, and task-completion " + "failures across a full conversation. AgentOps did not find " + "`dataset_kind: multi-turn` or trace-derived conversation rows." + ), + recommendation=( + "After the single-turn smoke gate is green, add a conversation " + "dataset or use Foundry traces-to-dataset output with `messages` " + "rows, then set `dataset_kind: multi-turn` in agentops.yaml." + ), + source=SOURCE_NAME, + ) + ] + + +def _check_rubric_coverage(config: dict[str, Any]) -> List[Finding]: + rubrics = config.get("rubrics") + if isinstance(rubrics, list) and rubrics: + return [] + return [ + Finding( + id="observability.rubric_missing", + severity=Severity.INFO, + category=Category.QUALITY, + title="No context-specific rubric evaluator is declared", + summary=( + "Foundry rubric evaluators let teams score the agent against " + "task-specific criteria such as task success, tone, safety, cost, " + "and latency. AgentOps did not find a `rubrics:` block in " + "agentops.yaml." + ), + recommendation=( + "Declare at least one rubric in agentops.yaml and bind its " + "dimension metrics to thresholds, or reference the rubric through " + "the azd eval recipe used by `execution: azd`." + ), + source=SOURCE_NAME, + ) + ] + + +def _check_trace_sampling(config: dict[str, Any], workspace: Path) -> List[Finding]: + observability = config.get("observability") + trace_sampling = ( + observability.get("trace_sampling") + if isinstance(observability, dict) + else None + ) + if isinstance(trace_sampling, dict) and trace_sampling.get("enabled") is True: + return [] + manifest = _trace_manifest(workspace) + lineage = manifest.get("lineage") if isinstance(manifest, dict) else {} + if isinstance(lineage, dict) and lineage.get("sampling_policies"): + return [] + return [ + Finding( + id="observability.trace_sampling_missing", + severity=Severity.WARNING, + category=Category.OPERATIONAL_EXCELLENCE, + title="Intelligent trace sampling is not evidence-ready", + summary=( + "Foundry intelligent trace sampling evaluates the most " + "signal-rich production traces without scoring every request. " + "AgentOps did not find `observability.trace_sampling.enabled: true` " + "or sampling metadata in the trace-regression manifest." + ), + recommendation=( + "Enable Foundry trace sampling or document the sampling policy in " + "`observability.trace_sampling`, then regenerate trace-derived " + "dataset candidates so release evidence includes the lineage." + ), + source=SOURCE_NAME, + ) + ] + + +def _check_trace_replay(config: dict[str, Any], workspace: Path) -> List[Finding]: + observability = config.get("observability") + if isinstance(observability, dict) and observability.get("trace_replay_url"): + return [] + manifest = _trace_manifest(workspace) + lineage = manifest.get("lineage") if isinstance(manifest, dict) else {} + if isinstance(lineage, dict) and lineage.get("replay_urls"): + return [] + return [ + Finding( + id="observability.trace_replay_missing", + severity=Severity.INFO, + category=Category.OPERATIONAL_EXCELLENCE, + title="Trace replay link is not captured in release evidence", + summary=( + "Foundry trace replay and visualization make incident review " + "faster by linking each failure to the exact prompts, decisions, " + "tool calls, and outputs. AgentOps did not find a replay URL in " + "agentops.yaml or the trace-regression manifest." + ), + recommendation=( + "After selecting representative traces in Foundry, keep the replay " + "link in `observability.trace_replay_url` or include it in trace " + "exports before running `agentops eval promote-traces --apply`." + ), + source=SOURCE_NAME, + ) + ] + + +def _trace_manifest(workspace: Path) -> dict[str, Any]: + path = workspace / ".agentops" / "data" / "trace-regression-manifest.json" + if not path.exists(): + return {} + try: + payload = json.loads(path.read_text(encoding="utf-8")) + except (OSError, json.JSONDecodeError): + return {} + return payload if isinstance(payload, dict) else {} + + +def _safe_config(workspace: Path) -> dict[str, Any]: + path = workspace / "agentops.yaml" + if not path.exists(): + return {} + try: + data = load_yaml(path) + except Exception: + return {} + return data if isinstance(data, dict) else {} diff --git a/src/agentops/agent/cockpit.py b/src/agentops/agent/cockpit.py index 12b74f65..b4cd71a4 100644 --- a/src/agentops/agent/cockpit.py +++ b/src/agentops/agent/cockpit.py @@ -27,6 +27,7 @@ from agentops.agent.history import AnalysisRecord, load_analysis_history from agentops.agent.time_range import TimeRange, parse_time_range, preset_keys +from agentops.utils.yaml import load_yaml # --------------------------------------------------------------------------- @@ -1891,6 +1892,12 @@ def _build_readiness_checklist( deep-links panel. """ checks: List[Dict[str, Any]] = [] + agentops_config = _read_agentops_config(workspace) + trace_manifest = _read_trace_regression_manifest(workspace) + raw_trace_lineage = trace_manifest.get("lineage") + trace_lineage: Dict[str, Any] = ( + raw_trace_lineage if isinstance(raw_trace_lineage, dict) else {} + ) tracing_ok = bool(telemetry.get("enabled")) checks.append( @@ -1948,6 +1955,80 @@ def _build_readiness_checklist( } ) + multi_turn_ready = ( + agentops_config.get("dataset_kind") == "multi-turn" + or int(trace_lineage.get("multi_turn_rows") or 0) > 0 + ) + checks.append( + { + "title": "Multi-turn eval coverage", + "status": "ok" if multi_turn_ready else "muted", + "detail": ( + "Detected conversation-level evaluation coverage from " + "dataset_kind: multi-turn or trace-derived rows." + if multi_turn_ready + else "How to complete: add a conversation " + "dataset or promote traces that include messages, " + "then set dataset_kind: multi-turn in " + "agentops.yaml." + ), + } + ) + + rubrics = agentops_config.get("rubrics") + rubric_ready = isinstance(rubrics, list) and bool(rubrics) + checks.append( + { + "title": "Rubric evaluator gate", + "status": "ok" if rubric_ready else "muted", + "detail": ( + "Detected rubrics: in agentops.yaml. " + "AgentOps requires execution: azd so the Foundry " + "rubric evaluator actually runs." + if rubric_ready + else "How to complete: declare a task-specific " + "rubrics: block and bind its dimensions to thresholds. " + "Use execution: azd so Foundry evaluates the rubric." + ), + } + ) + + observability = agentops_config.get("observability") + observability = observability if isinstance(observability, dict) else {} + trace_sampling = observability.get("trace_sampling") + trace_sampling = trace_sampling if isinstance(trace_sampling, dict) else {} + sampling_ready = bool(trace_sampling.get("enabled")) or bool(trace_lineage.get("sampling_policies")) + checks.append( + { + "title": "Trace sampling for live quality", + "status": "ok" if sampling_ready else "muted", + "detail": ( + "Detected trace-sampling intent or sampling lineage in the " + "trace-derived dataset manifest." + if sampling_ready + else "How to complete: enable Foundry trace " + "sampling or document the policy under " + "observability.trace_sampling, then harvest sampled " + "traces into dataset candidates." + ), + } + ) + + replay_ready = bool(observability.get("trace_replay_url")) or bool(trace_lineage.get("replay_urls")) + checks.append( + { + "title": "Trace replay linked to evidence", + "status": "ok" if replay_ready else "muted", + "detail": ( + "Detected a Foundry trace replay link in config or trace lineage." + if replay_ready + else "How to complete: keep a representative " + "Foundry replay link in observability.trace_replay_url " + "or include replay URLs when promoting traces." + ), + } + ) + eval_workflow = _detect_eval_workflow(workspace) cont_eval = bool(eval_workflow.get("present")) eval_runner = str(eval_workflow.get("runner") or "") @@ -2481,6 +2562,21 @@ def _read_json_object(path: Path) -> Dict[str, Any]: return payload if isinstance(payload, dict) else {} +def _read_agentops_config(workspace: Path) -> Dict[str, Any]: + path = workspace / "agentops.yaml" + if not path.exists(): + return {} + try: + payload = load_yaml(path) + except Exception: + return {} + return payload if isinstance(payload, dict) else {} + + +def _read_trace_regression_manifest(workspace: Path) -> Dict[str, Any]: + return _read_json_object(workspace / ".agentops" / "data" / "trace-regression-manifest.json") + + def _official_eval_artifact_status(workspace: Path) -> Dict[str, Any]: base = workspace / ".agentops" / "official-eval" metadata = _read_json_object(base / "metadata.json") diff --git a/src/agentops/core/agentops_config.py b/src/agentops/core/agentops_config.py index 23bef990..a920e545 100644 --- a/src/agentops/core/agentops_config.py +++ b/src/agentops/core/agentops_config.py @@ -67,6 +67,9 @@ #: How cloud evaluation submits local dataset rows to Foundry. DatasetSyncMode = Literal["auto", "inline", "foundry"] +#: Dataset shape used by the evaluator runtime or Foundry / azd recipes. +DatasetKind = Literal["auto", "single-turn", "multi-turn"] + #: Internal-only literal kept for the publisher dispatch table. Derived from #: ``execution`` + ``publish`` via :meth:`AgentOpsConfig.publish_target`. PublishTarget = Literal["foundry", "foundry_cloud"] @@ -209,6 +212,110 @@ def _version_non_empty(cls, value: str) -> str: return value +class RubricDimensionConfig(BaseModel): + """One weighted dimension in a Foundry rubric evaluator. + + Rubrics are optional and additive. AgentOps records them as release + readiness intent and uses thresholds to gate the metrics that Foundry/azd + emits for each dimension. + """ + + name: str + description: str + weight: Optional[float] = None + + model_config = ConfigDict(extra="forbid") + + @field_validator("name", "description") + @classmethod + def _text_non_empty(cls, value: str) -> str: + value = value.strip() + if not value: + raise ValueError("rubric dimension fields must be non-empty") + return value + + +class RubricConfig(BaseModel): + """Context-specific evaluator criteria for Foundry rubric scoring.""" + + name: str + description: Optional[str] = None + dimensions: List[RubricDimensionConfig] = Field(default_factory=list) + evaluator: Optional[str] = Field( + None, + description="Optional Foundry/azd evaluator name when the rubric is registered remotely.", + ) + + model_config = ConfigDict(extra="forbid") + + @field_validator("name") + @classmethod + def _name_non_empty(cls, value: str) -> str: + value = value.strip() + if not value: + raise ValueError("rubric name must be non-empty") + return value + + @field_validator("description", "evaluator") + @classmethod + def _optional_text_non_empty(cls, value: Optional[str]) -> Optional[str]: + if value is None: + return value + value = value.strip() + if not value: + raise ValueError("rubric optional text fields must be non-empty when provided") + return value + + +class TraceSamplingConfig(BaseModel): + """Foundry intelligent trace-sampling readiness contract.""" + + enabled: bool = False + mode: Literal["manual", "foundry", "scheduled"] = "manual" + description: Optional[str] = None + + model_config = ConfigDict(extra="forbid") + + @field_validator("description") + @classmethod + def _description_non_empty(cls, value: Optional[str]) -> Optional[str]: + if value is None: + return value + value = value.strip() + if not value: + raise ValueError("observability.trace_sampling.description must be non-empty") + return value + + +class ObservabilityConfig(BaseModel): + """Foundry observability readiness metadata. + + The fields are read-only intent for Doctor, Cockpit, and release evidence. + AgentOps does not create Foundry trace replay, sampling, or portal resources + from this block. + """ + + tracing_enabled: bool = False + trace_sampling: TraceSamplingConfig = Field(default_factory=TraceSamplingConfig) + trace_replay_url: Optional[str] = None + evaluations_url: Optional[str] = None + datasets_url: Optional[str] = None + + model_config = ConfigDict(extra="forbid") + + @field_validator("trace_replay_url", "evaluations_url", "datasets_url") + @classmethod + def _url_non_empty(cls, value: Optional[str]) -> Optional[str]: + if value is None: + return value + value = value.strip() + if not value: + raise ValueError("observability URLs must be non-empty when provided") + if not value.startswith(("https://", "http://")): + raise ValueError("observability URLs must start with http:// or https://") + return value + + class PromptAgentBootstrap(BaseModel): """Bootstrap defaults for prompt-agent CI/CD when the target Foundry project does not yet contain the seed agent referenced by ``agent``. @@ -369,11 +476,25 @@ class AgentOpsConfig(BaseModel): Optional governance artifact paths. These are read-only inputs for Doctor and release evidence; AgentOps validates and references them but does not execute ASSERT, apply ACS controls, or run red-team campaigns. + + ``dataset_kind`` / ``rubrics`` / ``observability`` + Optional Foundry observability metadata. These fields keep existing + single-turn evals working while letting Doctor, Cockpit, CI evidence, and + azd/Foundry recipes reason about multi-turn coverage, rubric gates, trace + sampling, and trace replay links. """ version: int = Field(..., description="Schema version. Must be 1.") agent: str = Field(..., description="Target identifier (name:version, URL, or model:deployment)") dataset: Path = Field(..., description="Path to a JSONL dataset file") + dataset_kind: DatasetKind = Field( + "auto", + description=( + "Dataset shape. 'auto' preserves current behavior, 'single-turn' " + "requires input/expected rows, and 'multi-turn' documents that rows " + "represent conversations or message histories." + ), + ) prompt_file: Optional[Path] = Field( None, description=( @@ -414,6 +535,10 @@ class AgentOpsConfig(BaseModel): auth_header_env: Optional[str] = None evaluators: Optional[List[EvaluatorOverride]] = None + rubrics: List[RubricConfig] = Field( + default_factory=list, + description="Optional context-specific rubric evaluator definitions.", + ) publish: bool = Field( False, @@ -458,6 +583,10 @@ class AgentOpsConfig(BaseModel): default_factory=DatasetSyncConfig, description="Cloud evaluation dataset submission policy.", ) + observability: ObservabilityConfig = Field( + default_factory=ObservabilityConfig, + description="Foundry observability readiness metadata.", + ) prompt_agent_bootstrap: Optional[PromptAgentBootstrap] = Field( None, description=( diff --git a/src/agentops/core/release_evidence.py b/src/agentops/core/release_evidence.py index 00f55aae..06778399 100644 --- a/src/agentops/core/release_evidence.py +++ b/src/agentops/core/release_evidence.py @@ -51,6 +51,7 @@ class ReleaseEvidence(BaseModel): foundry: Dict[str, Any] = Field(default_factory=dict) monitoring: Dict[str, Any] = Field(default_factory=dict) trace_dataset: Dict[str, Any] = Field(default_factory=dict) + observability: Dict[str, Any] = Field(default_factory=dict) ailz: Dict[str, Any] = Field(default_factory=dict) governance: Dict[str, Any] = Field(default_factory=dict) diff --git a/src/agentops/pipeline/azd_runner.py b/src/agentops/pipeline/azd_runner.py index b8f34b82..cf77a64d 100644 --- a/src/agentops/pipeline/azd_runner.py +++ b/src/agentops/pipeline/azd_runner.py @@ -202,6 +202,7 @@ def normalize_to_results( "azd eval run returned no numeric metrics, so AgentOps cannot apply " "thresholds or claim the gate passed." ) + _validate_rubric_evidence(config=config, recipe=recipe, metrics=aggregate_metrics) metric_binding = bind_threshold_metrics(config.thresholds.keys(), aggregate_metrics.keys()) if metric_binding.unmatched: @@ -270,6 +271,8 @@ def normalize_to_results( "version": config.version, "agent": config.agent, "thresholds": dict(config.thresholds), + "dataset_kind": config.dataset_kind, + "rubrics": [rubric.model_dump(mode="json") for rubric in config.rubrics], "execution": "azd", "backend_requested": "azd", "backend_effective": "azd", @@ -291,6 +294,46 @@ def normalize_to_results( ) +def _validate_rubric_evidence( + *, + config: AgentOpsConfig, + recipe: EvalRecipe, + metrics: Dict[str, float], +) -> None: + if not config.rubrics: + return + + recipe_evaluators = {evaluator.name for evaluator in recipe.evaluators} + threshold_names = set(config.thresholds) + metric_names = set(metrics) + missing: list[str] = [] + + for rubric in config.rubrics: + evaluator_name = (rubric.evaluator or rubric.name).strip() + if evaluator_name not in recipe_evaluators: + missing.append(f"rubric evaluator `{evaluator_name}` in eval.yaml") + dimension_names = [dimension.name for dimension in rubric.dimensions] + thresholded_dimensions = [ + name for name in dimension_names if name in threshold_names + ] + if not thresholded_dimensions: + missing.append( + f"threshold for at least one dimension of rubric `{rubric.name}`" + ) + continue + for dimension_name in thresholded_dimensions: + if dimension_name not in metric_names: + missing.append(f"azd metric for rubric dimension `{dimension_name}`") + + if missing: + raise AzdBackendError( + "rubric evidence is incomplete; " + + "; ".join(missing) + + ". Run `agentops eval init --force` after configuring rubrics and " + "bind rubric dimension thresholds in agentops.yaml." + ) + + def write_raw_artifacts(azd_run: AzdEvalRun, output_dir: Path) -> None: """Write native azd payload and command streams for debugging/evidence.""" @@ -369,6 +412,7 @@ def _run_command_with_progress( errors="replace", stdout=stdout_file, stderr=stderr_file, + stdin=subprocess.DEVNULL, ) while True: returncode = process.poll() diff --git a/src/agentops/pipeline/orchestrator.py b/src/agentops/pipeline/orchestrator.py index e3c9ab4c..a7f15b49 100644 --- a/src/agentops/pipeline/orchestrator.py +++ b/src/agentops/pipeline/orchestrator.py @@ -80,6 +80,12 @@ def _run_evaluation( options: RunOptions, ) -> RunResult: """Run a full evaluation after optional telemetry has been initialized.""" + if config.rubrics and _resolve_execution_backend(config) != "azd": + raise ValueError( + "rubrics require execution: azd so Foundry/azd runs the rubric " + "evaluator. Set `execution: azd`, run `agentops eval init`, and " + "bind rubric dimension thresholds in agentops.yaml." + ) if options.baseline_path is not None and not options.baseline_path.exists(): raise FileNotFoundError( f"baseline file not found: {options.baseline_path}. " diff --git a/src/agentops/services/azd_eval_init.py b/src/agentops/services/azd_eval_init.py index aae84d12..2162769b 100644 --- a/src/agentops/services/azd_eval_init.py +++ b/src/agentops/services/azd_eval_init.py @@ -337,6 +337,8 @@ def _ensure_azd_project_env_metadata( if not location.found or location.env_path is None: location = ensure_azd_env(workspace, "sandbox") env_path = location.env_path + if env_path is None: + return updates = { "AZURE_AI_FOUNDRY_PROJECT_ENDPOINT": endpoint, "FOUNDRY_PROJECT_ENDPOINT": endpoint, @@ -478,7 +480,20 @@ def _azd_evaluators_from_config(config_path: Path) -> tuple[str, ...]: mapped = name if name.startswith("builtin.") else _EVALUATOR_NAME_TO_AZD.get(name) if mapped and mapped not in names: names.append(mapped) - return tuple(names) if names else _DEFAULT_AZD_EVALUATORS + if not names: + names.extend(_DEFAULT_AZD_EVALUATORS) + raw_rubrics = data.get("rubrics") + if isinstance(raw_rubrics, list): + for item in raw_rubrics: + if not isinstance(item, dict): + continue + raw_name = item.get("evaluator") or item.get("name") + if not isinstance(raw_name, str) or not raw_name.strip(): + continue + name = raw_name.strip() + if name not in names: + names.append(name) + return tuple(names) def _azd_dataset_from_agentops_dataset(dataset: Path, *, workspace: Path) -> Path: diff --git a/src/agentops/services/evidence_pack.py b/src/agentops/services/evidence_pack.py index 08dccb8b..fc2814ac 100644 --- a/src/agentops/services/evidence_pack.py +++ b/src/agentops/services/evidence_pack.py @@ -65,6 +65,7 @@ def build_release_evidence( foundry = _foundry_status(analysis) monitoring = _monitoring_status(analysis) trace_dataset = _trace_dataset_status(root) + observability = _observability_status(root, trace_dataset) ailz = _ailz_status(analysis) governance = _governance_status(root) @@ -80,12 +81,13 @@ def build_release_evidence( _add_doctor_check(checks, blockers, warnings, ready, doctor) _add_foundry_check(checks, warnings, ready, foundry) _add_monitoring_check(checks, warnings, ready, monitoring) + _add_observability_check(checks, warnings, ready, observability) _add_trace_dataset_check(checks, warnings, ready, trace_dataset) _add_ailz_check(checks, warnings, ready, ailz) _add_governance_check(checks, warnings, ready, governance) status = "blocked" if blockers else "ready_with_warnings" if warnings else "ready" - links = _links(latest_eval) + links = _links(latest_eval, observability) target = latest_eval.get("target") generated_at = datetime.now(timezone.utc).isoformat() @@ -106,6 +108,7 @@ def build_release_evidence( foundry=foundry, monitoring=monitoring, trace_dataset=trace_dataset, + observability=observability, ailz=ailz, governance=governance, ) @@ -452,6 +455,42 @@ def _trace_dataset_status(root: Path) -> dict[str, Any]: return {"status": "ok", "manifest": str(manifest), **payload} +def _observability_status(root: Path, trace_dataset: dict[str, Any]) -> dict[str, Any]: + config = _agentops_config(root) + observability = config.get("observability") + observability = observability if isinstance(observability, dict) else {} + rubrics = config.get("rubrics") + rubrics = rubrics if isinstance(rubrics, list) else [] + lineage = trace_dataset.get("lineage") + lineage = lineage if isinstance(lineage, dict) else {} + trace_sampling = observability.get("trace_sampling") + trace_sampling = trace_sampling if isinstance(trace_sampling, dict) else {} + + replay_urls = [str(url) for url in _as_list(lineage.get("replay_urls")) if url] + evaluation_urls = [str(url) for url in _as_list(lineage.get("evaluation_urls")) if url] + sampling_policies = [ + str(policy) for policy in _as_list(lineage.get("sampling_policies")) if policy + ] + multi_turn_rows = int(lineage.get("multi_turn_rows") or 0) + + return { + "status": "ok" if observability or rubrics or lineage else "not_configured", + "dataset_kind": config.get("dataset_kind", "auto"), + "multi_turn_ready": config.get("dataset_kind") == "multi-turn" or multi_turn_rows > 0, + "multi_turn_rows": multi_turn_rows, + "rubrics_count": len(rubrics), + "rubrics": rubrics, + "trace_sampling_enabled": bool(trace_sampling.get("enabled")) or bool(sampling_policies), + "trace_sampling_mode": trace_sampling.get("mode"), + "sampling_policies": sampling_policies, + "trace_replay_urls": replay_urls + or ([str(observability["trace_replay_url"])] if observability.get("trace_replay_url") else []), + "evaluation_urls": evaluation_urls + or ([str(observability["evaluations_url"])] if observability.get("evaluations_url") else []), + "datasets_url": observability.get("datasets_url"), + } + + def _ailz_status(analysis: Optional[AnalysisResult]) -> dict[str, Any]: if analysis is None: return {"status": "not_run"} @@ -661,6 +700,50 @@ def _add_monitoring_check( checks.append(ReleaseEvidenceCheck(name="Runtime monitoring", status="warning", summary=message, evidence=monitoring)) +def _add_observability_check( + checks: list[ReleaseEvidenceCheck], + warnings: list[str], + ready: list[str], + observability: dict[str, Any], +) -> None: + missing: list[str] = [] + if not observability.get("multi_turn_ready"): + missing.append("multi-turn eval coverage") + if int(observability.get("rubrics_count") or 0) <= 0: + missing.append("rubric evaluator") + if not observability.get("trace_sampling_enabled"): + missing.append("intelligent trace sampling") + if not observability.get("trace_replay_urls"): + missing.append("trace replay link") + + if not missing: + message = ( + "Foundry observability signals are evidence-ready: " + "multi-turn coverage, rubric scoring, trace sampling, and replay links." + ) + ready.append(message) + checks.append( + ReleaseEvidenceCheck( + name="Foundry observability", + status="ready", + summary=message, + evidence=observability, + ) + ) + return + + message = "Foundry observability evidence is incomplete: " + ", ".join(missing) + warnings.append(message) + checks.append( + ReleaseEvidenceCheck( + name="Foundry observability", + status="warning", + summary=message, + evidence=observability, + ) + ) + + def _add_trace_dataset_check( checks: list[ReleaseEvidenceCheck], warnings: list[str], @@ -751,11 +834,18 @@ def _add_governance_check( ) -def _links(latest_eval: dict[str, Any]) -> list[ReleaseEvidenceLink]: +def _links(latest_eval: dict[str, Any], observability: dict[str, Any]) -> list[ReleaseEvidenceLink]: links: list[ReleaseEvidenceLink] = [] report_url = latest_eval.get("foundry_report_url") if report_url: links.append(ReleaseEvidenceLink(label="Foundry evaluation report", url=str(report_url))) + for url in _as_list(observability.get("trace_replay_urls"))[:3]: + links.append(ReleaseEvidenceLink(label="Foundry trace replay", url=str(url))) + for url in _as_list(observability.get("evaluation_urls"))[:3]: + links.append(ReleaseEvidenceLink(label="Foundry evaluation", url=str(url))) + datasets_url = observability.get("datasets_url") + if datasets_url: + links.append(ReleaseEvidenceLink(label="Foundry datasets", url=str(datasets_url))) return links diff --git a/src/agentops/services/trace_promotion.py b/src/agentops/services/trace_promotion.py index 0c2eae94..fa54f8bc 100644 --- a/src/agentops/services/trace_promotion.py +++ b/src/agentops/services/trace_promotion.py @@ -232,9 +232,46 @@ def _trace_to_row(trace: dict[str, Any], label_mode: LabelMode) -> Optional[dict metadata = { "source": "production_trace", "trace_id": _first_text(trace, "trace_id", "operation_Id", "operationId", "id"), + "operation_id": _first_text(trace, "operation_Id", "operationId"), + "span_id": _first_text(trace, "span_id", "spanId", "id"), "timestamp": _first_text(trace, "timestamp", "time", "TimeGenerated"), "label_mode": label_mode, "needs_review": True, + "source_system": _first_text(trace, "source_system", "source", "customDimensions.source_system"), + "agent": _first_text(trace, "agent", "agent_id", "customDimensions.agent"), + "agent_version": _first_text( + trace, + "agent_version", + "customDimensions.agent_version", + "customDimensions.agentops.agent.version", + ), + "foundry_project": _first_text( + trace, + "foundry_project", + "project", + "customDimensions.foundry_project", + ), + "replay_url": _first_text( + trace, + "replay_url", + "trace_replay_url", + "customDimensions.replay_url", + "customDimensions.trace_replay_url", + ), + "evaluation_url": _first_text( + trace, + "evaluation_url", + "eval_url", + "customDimensions.evaluation_url", + "customDimensions.eval_url", + ), + "sampling_policy": _first_text( + trace, + "sampling_policy", + "sample_reason", + "customDimensions.sampling_policy", + "customDimensions.sample_reason", + ), } row: dict[str, Any] = { "input": input_text, @@ -247,6 +284,9 @@ def _trace_to_row(trace: dict[str, Any], label_mode: LabelMode) -> Optional[dict tool_calls = _first_value(trace, "tool_calls", "customDimensions.tool_calls") if tool_calls: row["tool_calls"] = tool_calls + messages = _first_value(trace, "messages", "conversation", "turns", "customDimensions.messages") + if messages: + row["messages"] = messages return row @@ -256,6 +296,7 @@ def _write_trace_dataset(preview: TracePromotionPreview) -> None: for row in preview.rows: handle.write(json.dumps(row, ensure_ascii=False) + "\n") + lineage = _lineage_from_rows(preview.rows) manifest = { "version": 1, "generated_at": datetime.now(timezone.utc).isoformat(), @@ -265,10 +306,59 @@ def _write_trace_dataset(preview: TracePromotionPreview) -> None: "skipped": preview.skipped, "label_mode": preview.label_mode, "human_review_required": True, + "lineage": lineage, } preview.manifest_path.write_text(json.dumps(manifest, indent=2) + "\n", encoding="utf-8") +def _lineage_from_rows(rows: list[dict[str, Any]]) -> dict[str, Any]: + trace_ids: list[str] = [] + replay_urls: list[str] = [] + evaluation_urls: list[str] = [] + source_systems: set[str] = set() + agents: set[str] = set() + agent_versions: set[str] = set() + sampling_policies: set[str] = set() + multi_turn_rows = 0 + + for row in rows: + metadata = row.get("metadata") + if not isinstance(metadata, dict): + continue + _append_unique(trace_ids, metadata.get("trace_id")) + _append_unique(replay_urls, metadata.get("replay_url")) + _append_unique(evaluation_urls, metadata.get("evaluation_url")) + _add_text(source_systems, metadata.get("source_system")) + _add_text(agents, metadata.get("agent")) + _add_text(agent_versions, metadata.get("agent_version")) + _add_text(sampling_policies, metadata.get("sampling_policy")) + if row.get("messages"): + multi_turn_rows += 1 + + return { + "trace_ids": trace_ids, + "replay_urls": replay_urls, + "evaluation_urls": evaluation_urls, + "source_systems": sorted(source_systems), + "agents": sorted(agents), + "agent_versions": sorted(agent_versions), + "sampling_policies": sorted(sampling_policies), + "multi_turn_rows": multi_turn_rows, + } + + +def _append_unique(values: list[str], value: Any) -> None: + text = str(value).strip() if value is not None else "" + if text and text not in values: + values.append(text) + + +def _add_text(values: set[str], value: Any) -> None: + text = str(value).strip() if value is not None else "" + if text: + values.add(text) + + def _first_text(data: dict[str, Any], *keys: str) -> Optional[str]: value = _first_value(data, *keys) if value is None: diff --git a/src/agentops/templates/agentops.yaml b/src/agentops/templates/agentops.yaml index 910fb318..30493277 100644 --- a/src/agentops/templates/agentops.yaml +++ b/src/agentops/templates/agentops.yaml @@ -20,6 +20,13 @@ agent: "my-agent:1" dataset: .agentops/data/smoke.jsonl +# Optional. Leave as auto for existing single-turn smoke tests. Set to +# multi-turn when the dataset rows represent conversations / message histories +# so Doctor, Cockpit, and release evidence can treat conversation-level evals as +# part of the readiness proof. +# +# dataset_kind: auto + # Optional. Source-controlled instructions file used by prompt-agent CI/CD. # Generated prompt-agent deploy workflows create a candidate Foundry version # from this file, evaluate that exact version, then record it as deployed only @@ -52,6 +59,25 @@ dataset: .agentops/data/smoke.jsonl # coherence: ">=3" # groundedness: ">=3" # avg_latency_seconds: "<=30" +# +# Optional. Context-specific rubric evaluators. When this block is present, +# AgentOps requires execution: azd so the Foundry / azd evaluator actually runs; +# local execution will fail closed instead of pretending rubric scoring happened. +# +# rubrics: +# - name: travel-concierge-quality +# evaluator: travel-concierge-rubric +# dimensions: +# - name: task_success +# description: "Completes the requested task without losing context." +# weight: 0.5 +# - name: safety +# description: "Avoids unsafe or unsupported claims." +# weight: 0.3 +# +# thresholds: +# task_success: ">=4" +# safety: ">=4" # Optional. Foundry prompt agents and Foundry publishing need a project # endpoint. If both this value and AZURE_AI_FOUNDRY_PROJECT_ENDPOINT are set, @@ -67,6 +93,19 @@ dataset: .agentops/data/smoke.jsonl # # execution: local # +# Optional. Foundry observability readiness metadata. AgentOps reads +# this for Doctor/Cockpit/evidence; Foundry still owns tracing, replay, and +# trace sampling at runtime. +# +# observability: +# tracing_enabled: true +# trace_sampling: +# enabled: true +# mode: foundry +# trace_replay_url: "https://ai.azure.com/..." +# evaluations_url: "https://ai.azure.com/..." +# datasets_url: "https://ai.azure.com/..." +# # Optional. Publish local results to the Classic Foundry Evaluations panel. # Only meaningful when execution: local. With execution: cloud the run is # always published (Foundry hosts the run by definition). diff --git a/tests/unit/test_agent_checks_observability.py b/tests/unit/test_agent_checks_observability.py new file mode 100644 index 00000000..aa581e8f --- /dev/null +++ b/tests/unit/test_agent_checks_observability.py @@ -0,0 +1,76 @@ +from __future__ import annotations + +import json +from pathlib import Path + +from agentops.agent.checks.observability import run_observability_check + + +def test_observability_check_flags_missing_build_2026_readiness(tmp_path: Path) -> None: + (tmp_path / "agentops.yaml").write_text( + "version: 1\n" + "agent: travel-agent:2\n" + "dataset: .agentops/data/smoke.jsonl\n", + encoding="utf-8", + ) + + findings = run_observability_check(tmp_path) + ids = {finding.id for finding in findings} + + assert "observability.multiturn_coverage_missing" in ids + assert "observability.rubric_missing" in ids + assert "observability.trace_sampling_missing" in ids + assert "observability.trace_replay_missing" in ids + + +def test_observability_check_accepts_declared_readiness(tmp_path: Path) -> None: + (tmp_path / "agentops.yaml").write_text( + "version: 1\n" + "agent: travel-agent:2\n" + "dataset: .agentops/data/conversations.jsonl\n" + "dataset_kind: multi-turn\n" + "rubrics:\n" + " - name: travel-concierge-quality\n" + " dimensions:\n" + " - name: task_success\n" + " description: Completes the requested travel task.\n" + "observability:\n" + " trace_sampling:\n" + " enabled: true\n" + " mode: foundry\n" + " trace_replay_url: https://ai.azure.com/traces/trace-1\n", + encoding="utf-8", + ) + + findings = run_observability_check(tmp_path) + + assert findings == [] + + +def test_observability_check_accepts_trace_manifest_lineage(tmp_path: Path) -> None: + (tmp_path / "agentops.yaml").write_text( + "version: 1\n" + "agent: travel-agent:2\n" + "dataset: .agentops/data/smoke.jsonl\n" + "rubrics:\n" + " - name: travel-concierge-quality\n", + encoding="utf-8", + ) + manifest = tmp_path / ".agentops" / "data" / "trace-regression-manifest.json" + manifest.parent.mkdir(parents=True) + manifest.write_text( + json.dumps( + { + "lineage": { + "multi_turn_rows": 2, + "sampling_policies": ["foundry-intelligent-sampling"], + "replay_urls": ["https://ai.azure.com/traces/trace-1"], + } + } + ), + encoding="utf-8", + ) + + findings = run_observability_check(tmp_path) + + assert findings == [] diff --git a/tests/unit/test_agentops_config.py b/tests/unit/test_agentops_config.py index e95134a7..288f56a2 100644 --- a/tests/unit/test_agentops_config.py +++ b/tests/unit/test_agentops_config.py @@ -10,7 +10,10 @@ from agentops.core.agentops_config import ( AgentOpsConfig, DatasetSyncConfig, + ObservabilityConfig, PromptAgentBootstrap, + RubricConfig, + RubricDimensionConfig, Threshold, classify_agent, ) @@ -255,6 +258,72 @@ def test_dataset_sync_rejects_empty_name(self) -> None: } ) + def test_accepts_build_2026_eval_metadata(self) -> None: + cfg = AgentOpsConfig( + version=1, + agent="travel-agent:2", + dataset=".agentops/data/conversations.jsonl", + dataset_kind="multi-turn", + rubrics=[ + RubricConfig( + name="travel-concierge-quality", + description="Travel planning behavior", + dimensions=[ + RubricDimensionConfig( + name="task_success", + description="Completes the requested travel planning task.", + weight=0.5, + ), + RubricDimensionConfig( + name="tone", + description="Uses concise and helpful travel-advisor tone.", + weight=0.2, + ), + ], + evaluator="builtin.rubric", + ) + ], + observability=ObservabilityConfig( + tracing_enabled=True, + trace_sampling={"enabled": True, "mode": "foundry"}, + trace_replay_url="https://ai.azure.com/project/traces/trace-1", + ), + thresholds={"task_success": ">=4"}, + ) + + assert cfg.dataset_kind == "multi-turn" + assert cfg.rubrics[0].name == "travel-concierge-quality" + assert cfg.rubrics[0].dimensions[0].weight == 0.5 + assert cfg.observability.tracing_enabled is True + assert cfg.observability.trace_sampling.enabled is True + + def test_observability_rejects_non_url_links(self) -> None: + with pytest.raises(ValidationError, match="observability URLs"): + AgentOpsConfig.model_validate( + { + "version": 1, + "agent": "travel-agent:2", + "dataset": ".agentops/data/smoke.jsonl", + "observability": {"trace_replay_url": "ai.azure.com/traces"}, + } + ) + + def test_rubric_rejects_empty_dimension(self) -> None: + with pytest.raises(ValidationError, match="rubric dimension"): + AgentOpsConfig.model_validate( + { + "version": 1, + "agent": "travel-agent:2", + "dataset": ".agentops/data/smoke.jsonl", + "rubrics": [ + { + "name": "travel", + "dimensions": [{"name": " ", "description": "score"}], + } + ], + } + ) + def test_prompt_agent_bootstrap_defaults_to_none(self) -> None: cfg = AgentOpsConfig(version=1, agent="my-rag:3", dataset="./qa.jsonl") assert cfg.prompt_agent_bootstrap is None diff --git a/tests/unit/test_azd_eval_init.py b/tests/unit/test_azd_eval_init.py index b42ecb5d..29240b6a 100644 --- a/tests/unit/test_azd_eval_init.py +++ b/tests/unit/test_azd_eval_init.py @@ -162,6 +162,54 @@ def fake_run(command, **kwargs): assert result.command_ran is True +def test_run_azd_eval_init_passes_configured_rubric_evaluator( + tmp_path: Path, + monkeypatch, +) -> None: + config_path = tmp_path / "agentops.yaml" + _write_config(config_path) + config_path.write_text( + config_path.read_text(encoding="utf-8") + + """ +rubrics: + - name: travel-quality + evaluator: travel-quality-rubric + dimensions: + - name: task_success + description: Completes the requested travel planning task. +thresholds: + task_success: ">=4" +""", + encoding="utf-8", + ) + dataset = tmp_path / ".agentops" / "data" / "smoke.jsonl" + dataset.parent.mkdir(parents=True) + dataset.write_text('{"input":"hello"}\n', encoding="utf-8") + prompt_file = tmp_path / ".agentops" / "prompts" / "travel.md" + prompt_file.parent.mkdir(parents=True) + prompt_file.write_text("You are a travel planner.", encoding="utf-8") + + monkeypatch.setattr(azd_eval_init, "azd_available", lambda *, cwd=None: True) + + def fake_run(command, **kwargs): + if command[:3] == ["az", "resource", "list"]: + return subprocess.CompletedProcess(command, 0, stdout="[]", stderr="") + assert command[-2:] == ["--evaluator", "travel-quality-rubric"] + assert command.count("--evaluator") == 3 + recipe = Path(kwargs["cwd"]) / "eval.yaml" + recipe.write_text("name: travel-agent-eval\n", encoding="utf-8") + return subprocess.CompletedProcess(command, 0, stdout="created", stderr="") + + monkeypatch.setattr(subprocess, "run", fake_run) + + result = azd_eval_init.run_azd_eval_init( + workspace=tmp_path, + config_path=config_path, + ) + + assert result.command_ran is True + + def test_run_azd_eval_init_bootstraps_before_azd_availability_check( tmp_path: Path, monkeypatch, diff --git a/tests/unit/test_azd_runner.py b/tests/unit/test_azd_runner.py index a14a166d..73a8a612 100644 --- a/tests/unit/test_azd_runner.py +++ b/tests/unit/test_azd_runner.py @@ -124,6 +124,7 @@ def poll(self) -> int | None: def fake_popen(command, **kwargs): assert command == ["azd", "ai", "agent", "eval", "run"] + assert kwargs["stdin"] is azd_runner.subprocess.DEVNULL return FakeProcess(stdout=kwargs["stdout"], stderr=kwargs["stderr"]) monkeypatch.setattr(azd_runner.subprocess, "Popen", fake_popen) @@ -147,6 +148,37 @@ def fake_popen(command, **kwargs): ] +def test_rubrics_require_azd_backend(tmp_path: Path) -> None: + dataset = tmp_path / "smoke.jsonl" + _write_dataset(dataset) + config = AgentOpsConfig( + version=1, + agent="travel-agent:1", + dataset=dataset, + execution="local", + rubrics=[ + { + "name": "travel-quality", + "dimensions": [ + { + "name": "task_success", + "description": "Completes the requested travel task.", + } + ], + } + ], + ) + + with pytest.raises(ValueError, match="rubrics require execution: azd"): + orchestrator.run_evaluation( + config, + options=orchestrator.RunOptions( + config_path=tmp_path / "agentops.yaml", + output_dir=tmp_path / ".agentops" / "results", + ), + ) + + def test_normalize_to_results_binds_azd_metrics_and_thresholds(tmp_path: Path) -> None: recipe_path = tmp_path / "eval.yaml" _write_recipe(recipe_path) @@ -250,6 +282,24 @@ def test_normalize_to_results_binds_rubric_dimensions(tmp_path: Path) -> None: agent="travel-agent:1", dataset="ignored.jsonl", execution="azd", + rubrics=[ + { + "name": "travel_quality", + "evaluator": "travel_quality_rubric", + "dimensions": [ + { + "name": "booking_accuracy", + "description": "Books or recommends options accurately.", + "weight": 0.7, + }, + { + "name": "policy_enforcement", + "description": "Avoids unsupported booking claims.", + "weight": 0.3, + }, + ], + } + ], thresholds={ "travel_quality_rubric": ">=0.8", "booking_accuracy": ">=0.8", @@ -296,6 +346,112 @@ def test_normalize_to_results_binds_rubric_dimensions(tmp_path: Path) -> None: "booking_accuracy": "booking_accuracy", "policy_enforcement": "policy_enforcement", } + assert result.config["rubrics"][0]["evaluator"] == "travel_quality_rubric" + + +def test_rubric_config_requires_dimension_threshold_evidence(tmp_path: Path) -> None: + recipe_path = tmp_path / "eval.yaml" + recipe_path.write_text( + """ +name: rubric-eval +agent: + name: travel-agent + kind: prompt-agent +evaluators: + - builtin.coherence + - travel_quality_rubric +""".lstrip(), + encoding="utf-8", + ) + recipe = load_eval_recipe(recipe_path) + config = AgentOpsConfig( + version=1, + agent="travel-agent:1", + dataset="ignored.jsonl", + execution="azd", + rubrics=[ + { + "name": "travel_quality", + "evaluator": "travel_quality_rubric", + "dimensions": [ + { + "name": "booking_accuracy", + "description": "Books or recommends options accurately.", + } + ], + } + ], + thresholds={"coherence": ">=0.8"}, + ) + azd_run = azd_runner.AzdEvalRun( + recipe_path=recipe_path, + payload={"metrics": {"coherence": 0.91}}, + run_id="run-1", + status="completed", + stdout="{}", + stderr="", + duration_seconds=3.0, + ) + + with pytest.raises(azd_runner.AzdBackendError, match="rubric evidence"): + azd_runner.normalize_to_results( + azd_run, + config=config, + recipe=recipe, + started_at=datetime.now(timezone.utc), + ) + + +def test_rubric_config_requires_recipe_evaluator(tmp_path: Path) -> None: + recipe_path = tmp_path / "eval.yaml" + recipe_path.write_text( + """ +name: rubric-eval +agent: + name: travel-agent + kind: prompt-agent +evaluators: + - builtin.coherence +""".lstrip(), + encoding="utf-8", + ) + recipe = load_eval_recipe(recipe_path) + config = AgentOpsConfig( + version=1, + agent="travel-agent:1", + dataset="ignored.jsonl", + execution="azd", + rubrics=[ + { + "name": "travel_quality", + "evaluator": "travel_quality_rubric", + "dimensions": [ + { + "name": "booking_accuracy", + "description": "Books or recommends options accurately.", + } + ], + } + ], + thresholds={"booking_accuracy": ">=0.8"}, + ) + azd_run = azd_runner.AzdEvalRun( + recipe_path=recipe_path, + payload={"metrics": {"booking_accuracy": 0.91}}, + run_id="run-1", + status="completed", + stdout="{}", + stderr="", + duration_seconds=3.0, + ) + + with pytest.raises(azd_runner.AzdBackendError, match="rubric evaluator"): + azd_runner.normalize_to_results( + azd_run, + config=config, + recipe=recipe, + started_at=datetime.now(timezone.utc), + ) def test_orchestrator_azd_dispatch_never_invokes_local_runtime(tmp_path: Path) -> None: diff --git a/tests/unit/test_cockpit.py b/tests/unit/test_cockpit.py index 3908661d..46edb242 100644 --- a/tests/unit/test_cockpit.py +++ b/tests/unit/test_cockpit.py @@ -427,6 +427,43 @@ def test_readiness_splits_tracing_and_includes_continuous_eval(tmp_path: Path): assert "Server-side tracing (agent → App Insights)" in html +def test_readiness_detects_multiturn_rubric_sampling_and_replay(tmp_path: Path): + from agentops.agent.cockpit import _build_readiness_checklist + + (tmp_path / "agentops.yaml").write_text( + "version: 1\n" + "agent: travel-agent:3\n" + "dataset: .agentops/data/travel-conversations.jsonl\n" + "dataset_kind: multi-turn\n" + "execution: azd\n" + "rubrics:\n" + " - name: travel-concierge-quality\n" + " evaluator: travel-concierge-quality\n" + " dimensions:\n" + " - name: task_success\n" + " description: Completes the requested trip plan.\n" + "observability:\n" + " trace_sampling:\n" + " enabled: true\n" + " mode: foundry\n" + " trace_replay_url: https://ai.azure.com/traces/replay/abc\n", + encoding="utf-8", + ) + + readiness = _build_readiness_checklist( + tmp_path, + {"enabled": True, "detail": "ok", "portal_url": "https://x"}, + {"has_data": False}, + watchdog=None, + ) + by_title = {check["title"]: check for check in readiness["checks"]} + + assert by_title["Multi-turn eval coverage"]["status"] == "ok" + assert by_title["Rubric evaluator gate"]["status"] == "ok" + assert by_title["Trace sampling for live quality"]["status"] == "ok" + assert by_title["Trace replay linked to evidence"]["status"] == "ok" + + def test_readiness_non_ready_items_include_remediation(tmp_path: Path, monkeypatch): from agentops.agent.cockpit import _build_readiness_checklist diff --git a/tests/unit/test_release_evidence.py b/tests/unit/test_release_evidence.py index a5a706ec..a822cc10 100644 --- a/tests/unit/test_release_evidence.py +++ b/tests/unit/test_release_evidence.py @@ -105,6 +105,47 @@ def test_build_release_evidence_ready_with_warning_without_baseline(tmp_path: Pa assert any("No baseline comparison" in warning for warning in evidence.warnings) +def test_build_release_evidence_includes_observability_links_and_rubric_status( + tmp_path: Path, +) -> None: + _write_latest_results(tmp_path, passed=True) + (tmp_path / "agentops.yaml").write_text( + "version: 1\n" + "agent: travel-agent:7\n" + "dataset: .agentops/data/travel-conversations.jsonl\n" + "dataset_kind: multi-turn\n" + "execution: azd\n" + "rubrics:\n" + " - name: travel-concierge-quality\n" + " evaluator: travel-concierge-quality\n" + " dimensions:\n" + " - name: task_success\n" + " description: Completes the requested trip plan.\n" + "observability:\n" + " trace_sampling:\n" + " enabled: true\n" + " mode: foundry\n" + " trace_replay_url: https://ai.azure.com/traces/replay/abc\n" + " evaluations_url: https://ai.azure.com/evaluations/run-1\n" + " datasets_url: https://ai.azure.com/datasets/travel\n", + encoding="utf-8", + ) + + evidence = build_release_evidence(tmp_path) + + assert evidence.observability["multi_turn_ready"] is True + assert evidence.observability["rubrics_count"] == 1 + assert evidence.observability["trace_sampling_enabled"] is True + assert evidence.observability["trace_replay_urls"] == [ + "https://ai.azure.com/traces/replay/abc" + ] + assert any(check.name == "Foundry observability" and check.status == "ready" for check in evidence.checks) + labels = {link.label: link.url for link in evidence.links} + assert labels["Foundry trace replay"] == "https://ai.azure.com/traces/replay/abc" + assert labels["Foundry evaluation"] == "https://ai.azure.com/evaluations/run-1" + assert labels["Foundry datasets"] == "https://ai.azure.com/datasets/travel" + + def test_write_release_evidence_redacts_secret_values(tmp_path: Path) -> None: evidence = ReleaseEvidence( generated_at="2026-01-01T00:00:00+00:00", diff --git a/tests/unit/test_trace_promotion.py b/tests/unit/test_trace_promotion.py index 64c6d0fa..8915f8d6 100644 --- a/tests/unit/test_trace_promotion.py +++ b/tests/unit/test_trace_promotion.py @@ -70,6 +70,53 @@ def test_promote_traces_apply_writes_dataset_and_manifest(tmp_path: Path) -> Non manifest = json.loads(preview.manifest_path.read_text(encoding="utf-8")) assert manifest["human_review_required"] is True assert manifest["rows"] == 1 + assert manifest["lineage"]["trace_ids"] == [] + + +def test_promote_traces_preserves_foundry_lineage_and_multiturn(tmp_path: Path) -> None: + source = tmp_path / "traces.jsonl" + source.write_text( + json.dumps( + { + "operation_Id": "op-1", + "trace_replay_url": "https://ai.azure.com/traces/op-1", + "evaluation_url": "https://ai.azure.com/evaluations/eval-1", + "agent": "travel-agent", + "agent_version": "7", + "sampling_policy": "foundry-intelligent-sampling", + "messages": [ + {"role": "user", "content": "Plan Rome"}, + {"role": "assistant", "content": "How many days?"}, + {"role": "user", "content": "Three"}, + ], + "customDimensions": { + "input": "Plan Rome", + "response": "Here is a three-day Rome itinerary.", + "source_system": "foundry", + }, + } + ) + + "\n", + encoding="utf-8", + ) + output = tmp_path / ".agentops" / "data" / "trace-regression.jsonl" + + preview = promote_traces(source=source, output_path=output, apply=True) + + row = preview.rows[0] + assert row["metadata"]["trace_id"] == "op-1" + assert row["metadata"]["replay_url"] == "https://ai.azure.com/traces/op-1" + assert row["metadata"]["evaluation_url"] == "https://ai.azure.com/evaluations/eval-1" + assert row["metadata"]["sampling_policy"] == "foundry-intelligent-sampling" + assert row["messages"][1]["role"] == "assistant" + manifest = json.loads(preview.manifest_path.read_text(encoding="utf-8")) + assert manifest["lineage"]["trace_ids"] == ["op-1"] + assert manifest["lineage"]["replay_urls"] == ["https://ai.azure.com/traces/op-1"] + assert manifest["lineage"]["evaluation_urls"] == ["https://ai.azure.com/evaluations/eval-1"] + assert manifest["lineage"]["agents"] == ["travel-agent"] + assert manifest["lineage"]["agent_versions"] == ["7"] + assert manifest["lineage"]["sampling_policies"] == ["foundry-intelligent-sampling"] + assert manifest["lineage"]["multi_turn_rows"] == 1 def test_promote_traces_cli_preview_does_not_write(tmp_path: Path) -> None: From 92ed4fe870bd0d7c4ca1281e98a54471069e9416 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 9 Jun 2026 01:40:12 +0000 Subject: [PATCH 2/2] chore: prepare release 0.3.12 --- .claude-plugin/marketplace.json | 2 +- .github/plugin/marketplace.json | 2 +- CHANGELOG.md | 2 ++ plugins/agentops/package.json | 2 +- plugins/agentops/plugin.json | 2 +- 5 files changed, 6 insertions(+), 4 deletions(-) diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index 867782d1..c58e5fcd 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -13,7 +13,7 @@ "name": "agentops-accelerator", "source": "../../plugins/agentops", "description": "Copilot agent skills for running standardized evaluation workflows with AgentOps Toolkit and Microsoft Foundry agents.", - "version": "0.3.8", + "version": "0.3.12", "keywords": [ "agentops", "evaluation", diff --git a/.github/plugin/marketplace.json b/.github/plugin/marketplace.json index 867782d1..c58e5fcd 100644 --- a/.github/plugin/marketplace.json +++ b/.github/plugin/marketplace.json @@ -13,7 +13,7 @@ "name": "agentops-accelerator", "source": "../../plugins/agentops", "description": "Copilot agent skills for running standardized evaluation workflows with AgentOps Toolkit and Microsoft Foundry agents.", - "version": "0.3.8", + "version": "0.3.12", "keywords": [ "agentops", "evaluation", diff --git a/CHANGELOG.md b/CHANGELOG.md index 634f196c..137f6081 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ This format follows [Keep a Changelog](https://keepachangelog.com/) and adheres ## [Unreleased] +## [0.3.12] - 2026-06-09 + ### Added - **Foundry observability readiness now spans eval, Doctor, Cockpit, and release evidence.** `agentops.yaml` supports `dataset_kind`, `rubrics`, and `observability` diff --git a/plugins/agentops/package.json b/plugins/agentops/package.json index 3357c6b2..d9962578 100644 --- a/plugins/agentops/package.json +++ b/plugins/agentops/package.json @@ -2,7 +2,7 @@ "name": "agentops-accelerator", "displayName": "AgentOps Accelerator — Skills for GitHub Copilot", "description": "Copilot agent skills for running standardized evaluation workflows with AgentOps Accelerator and Microsoft Foundry agents.", - "version": "0.3.8", + "version": "0.3.12", "publisher": "AgentOpsAccelerator", "icon": "icon.png", "license": "MIT", diff --git a/plugins/agentops/plugin.json b/plugins/agentops/plugin.json index 4626d03f..95b07344 100644 --- a/plugins/agentops/plugin.json +++ b/plugins/agentops/plugin.json @@ -1,7 +1,7 @@ { "name": "agentops-accelerator", "description": "Copilot agent skills for running standardized evaluation workflows with AgentOps Accelerator and Microsoft Foundry agents.", - "version": "0.3.8", + "version": "0.3.12", "author": { "name": "AgentOps Accelerator", "url": "https://github.com/Azure/agentops"