From 23af5902465147e8e97ab7011389980f9b1b91e1 Mon Sep 17 00:00:00 2001
From: Paulo Lacerda <pclacerda@gmail.com>
Date: Mon, 8 Jun 2026 22:34:57 -0300
Subject: [PATCH 1/2] feat: add Foundry observability rubric gates

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 CHANGELOG.md                                  |  24 +++
 docs/tutorial-end-to-end.md                   |  16 +-
 docs/tutorial-hosted-agent-quickstart.md      |  15 ++
 docs/tutorial-prompt-agent-quickstart.md      |  71 ++++++-
 src/agentops/agent/analyzer.py                |   2 +
 src/agentops/agent/checks/catalog.py          |  56 ++++++
 src/agentops/agent/checks/observability.py    | 175 ++++++++++++++++++
 src/agentops/agent/cockpit.py                 |  96 ++++++++++
 src/agentops/core/agentops_config.py          | 129 +++++++++++++
 src/agentops/core/release_evidence.py         |   1 +
 src/agentops/pipeline/azd_runner.py           |  44 +++++
 src/agentops/pipeline/orchestrator.py         |   6 +
 src/agentops/services/azd_eval_init.py        |  17 +-
 src/agentops/services/evidence_pack.py        |  94 +++++++++-
 src/agentops/services/trace_promotion.py      |  90 +++++++++
 src/agentops/templates/agentops.yaml          |  39 ++++
 tests/unit/test_agent_checks_observability.py |  76 ++++++++
 tests/unit/test_agentops_config.py            |  69 +++++++
 tests/unit/test_azd_eval_init.py              |  48 +++++
 tests/unit/test_azd_runner.py                 | 156 ++++++++++++++++
 tests/unit/test_cockpit.py                    |  37 ++++
 tests/unit/test_release_evidence.py           |  41 ++++
 tests/unit/test_trace_promotion.py            |  47 +++++
 23 files changed, 1341 insertions(+), 8 deletions(-)
 create mode 100644 src/agentops/agent/checks/observability.py
 create mode 100644 tests/unit/test_agent_checks_observability.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 38d0b81d..634f196c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,30 @@
 All notable changes to this project will be documented in this file.
 This format follows [Keep a Changelog](https://keepachangelog.com/) and adheres to [Semantic Versioning](https://semver.org/).
 
+## [Unreleased]
+
+### Added
+- **Foundry observability readiness now spans eval, Doctor, Cockpit, and release evidence.**
+  `agentops.yaml` supports `dataset_kind`, `rubrics`, and `observability`
+  metadata for multi-turn coverage, rubric evaluator gates, trace sampling, and
+  replay/evaluation/dataset links. Doctor and Cockpit surface the readiness
+  state without mutating cloud resources, and release evidence records the same
+  signals for reviewers.
+- **Trace promotion preserves evaluation lineage.**
+  `agentops eval promote-traces` now carries operation/span IDs, source system,
+  agent version, replay/evaluation URLs, sampling policy, and multi-turn message
+  fields into candidate datasets and their manifest.
+
+### Changed
+- **Rubric evaluators are executed through the azd backend.** When `rubrics:`
+  is configured, `agentops eval init` includes those evaluator names in the azd
+  recipe and `agentops eval run` fails closed outside `execution: azd`, so rubric
+  scores cannot be treated as evidence unless Foundry / azd actually ran them.
+- **Tutorials now carry rubric and observability proof into evaluation and CI/CD.**
+  The Travel Agent flow keeps the existing smoke recording through step 10, then
+  upgrades the gate to multi-turn dataset rows, rubric thresholds, trace
+  sampling/replay lineage, and CI/CD workflows that reuse the same eval contract.
+
 ## [0.3.11] - 2026-06-08
 
 ### Fixed
diff --git a/docs/tutorial-end-to-end.md b/docs/tutorial-end-to-end.md
index cc884455..08ebdd23 100644
--- a/docs/tutorial-end-to-end.md
+++ b/docs/tutorial-end-to-end.md
@@ -444,6 +444,13 @@ Foundry through `agentops eval run`, so AgentOps can enforce thresholds and writ
 repo-side evidence. AgentOps keeps the local path for hosted endpoints, models,
 unsupported evaluator mappings, and fallback cases.
 
+When the quality gate uses a task-specific rubric, choose the azd runner instead
+of local execution. Add `rubrics:` to `agentops.yaml`, set
+`rubrics[].evaluator` to the Foundry / azd evaluator name, set
+`execution: azd`, and run `agentops eval init --force`. AgentOps then passes the
+rubric evaluator into the generated azd recipe and fails closed if someone tries
+to run that rubric gate with the local backend.
+
 ## 5. Run the first eval
 
 For hosted agents or local fallback:
@@ -651,7 +658,9 @@ agentops workflow generate `
 
 The generated workflows are intentionally boring:
 
-- PR gate: evaluate and publish report/evidence.
+- PR gate: evaluate and publish report/evidence. If `agentops.yaml` declares
+  rubric evaluators, this is the same azd/Foundry rubric gate you ran locally;
+  the PR does not downgrade to a plain smoke test.
 - Dev/QA/Prod: deploy with azd or placeholders, then run readiness checks.
 - Optional Doctor cadence: generate `--kinds doctor` separately if you want a
   scheduled readiness run outside PRs.
@@ -698,10 +707,11 @@ Use this loop in the video:
 | Signal | Foundry or Azure Monitor action | AgentOps handoff |
 |---|---|---|
 | App Insights connection | In Foundry, open the project or agent **Traces** view and connect an App Insights resource. Verify it under project connected resources. | Doctor checks whether telemetry wiring is discoverable. |
+| Trace sampling | Configure the project's trace sampling policy in Foundry or the hosted-agent observability settings your team owns. Keep the policy name in `agentops.yaml` under `observability.trace_sampling`. | Doctor/evidence can show reviewers that live-quality sampling exists before traces are promoted. |
 | Live trace | Run one playground prompt for a Prompt Agent, or call the hosted endpoint a few times. Open the agent **Traces** tab, wait 2-5 minutes if needed, and click the Trace ID. In the modal, inspect spans plus the **Input + Output** and **Metadata** tabs. | Evidence and Cockpit link reviewers back to the runtime view. |
 | Operate summary | Switch to **Operate** -> **Overview**, select the same subscription/project, wait for metrics to sync, and use **Ask AI** for dashboard-level questions such as `Help me identify any issues or anomalies in my agent metrics.` | The summary informs the release discussion; AgentOps does not rewrite it. |
-| Eval context | From a Foundry eval run, inspect row-level explanations and, when available, the trace attached to the interaction. | The repo keeps the exact target, dataset, gate, and evidence together. |
-| Trace learning | Export or curate traces that represent real issues. | `agentops eval promote-traces` turns reviewed traces into regression candidates. |
+| Eval context | From a Foundry eval run, inspect row-level explanations, rubric scores, and, when available, the trace attached to the interaction. | The repo keeps the exact target, dataset, rubric gate, and evidence together. |
+| Trace learning | Export or curate traces that represent real issues, including conversation turns when present. | `agentops eval promote-traces` turns reviewed traces into regression candidates and preserves replay/evaluation lineage. |
 
 For the screen recording, make the Foundry side visible before opening AgentOps
 Cockpit:
diff --git a/docs/tutorial-hosted-agent-quickstart.md b/docs/tutorial-hosted-agent-quickstart.md
index 13bc1e4c..1b38f626 100644
--- a/docs/tutorial-hosted-agent-quickstart.md
+++ b/docs/tutorial-hosted-agent-quickstart.md
@@ -648,6 +648,14 @@ This is the core AgentOps loop for hosted endpoints: keep a stable dataset,
 compare a changed runtime against the last known result, fix the agent, and
 rerun the same gate before a PR or release.
 
+If this hosted endpoint is backed by a Foundry / azd eval recipe, you can use
+the same rubric contract as the prompt-agent Travel Agent tutorial before you
+generate CI: set `execution: azd`, add `dataset_kind: multi-turn`, declare
+`rubrics[].evaluator` in `agentops.yaml`, run `agentops eval init --force`, and
+then run `agentops eval run`. AgentOps will require the azd backend whenever
+rubrics are configured, so a passing hosted-agent gate means the rubric evaluator
+actually ran instead of being recorded as metadata only.
+
 ## 10. Generate CI and Doctor evidence
 
 Generate both the PR and dev deploy workflows with `--doctor-gate critical`
@@ -666,6 +674,13 @@ code .agentops\agent\report.md
 code .agentops\release\latest\evidence.md
 ```
 
+The generated PR gate reuses the same `agentops.yaml` contract. If you promoted
+the hosted endpoint to an azd/Foundry eval recipe with rubrics, CI runs that
+recipe and blocks on the rubric thresholds; otherwise it runs the local
+hosted-endpoint gate with the normalized thresholds. In both cases, Doctor and
+the evidence pack surface multi-turn coverage, trace sampling readiness,
+replay/evaluation links, and trace-to-dataset lineage when those signals exist.
+
 > **`--deploy-mode prompt-agent` does not apply to hosted endpoints.**
 > That mode is specific to Foundry prompt agents (the stage-prompt-as-
 > candidate flow). For hosted endpoints, `agentops workflow generate`
diff --git a/docs/tutorial-prompt-agent-quickstart.md b/docs/tutorial-prompt-agent-quickstart.md
index 8af0f0bd..91ee0b6c 100644
--- a/docs/tutorial-prompt-agent-quickstart.md
+++ b/docs/tutorial-prompt-agent-quickstart.md
@@ -803,6 +803,66 @@ You should see `execution: azd` and `Threshold status: PASSED`. The raw azd run
 details are kept under `.agentops/results/latest/` alongside AgentOps'
 normalized `results.json` and `report.md`.
 
+Before generating CI, turn the Travel Agent gate from a basic smoke test into
+the proof you want reviewers to see later. Keep the recording you already made
+through this step: the smoke run above proves the workspace works. The next
+commands only harden the same gate.
+
+Create a small conversation-shaped dataset. It still keeps `input` and
+`expected` so AgentOps and azd can route the row, but it also carries the
+conversation turns that multi-turn evaluators and trace-derived rows use:
+
+```powershell
+@'
+{"input":"Plan a three-day Rome trip for a family with kids. Ask one clarification if needed.","expected":"The agent should preserve the family-with-kids constraint, propose a practical three-day Rome itinerary, include transit/rest pacing, and avoid claiming it can book live reservations.","messages":[{"role":"user","content":"We want to visit Rome with two kids."},{"role":"assistant","content":"How many days do you have and what pace do you prefer?"},{"role":"user","content":"Three days, moderate pace, museums and food."}]}
+{"input":"Help me choose between Lisbon and Seattle for a low-budget food weekend.","expected":"The agent should compare both destinations, mention budget tradeoffs, food activities, transit/weather notes, and avoid unsupported price or booking claims.","messages":[{"role":"user","content":"I need a low-budget food weekend."},{"role":"assistant","content":"Are you choosing between specific cities?"},{"role":"user","content":"Lisbon or Seattle."}]}
+'@ | Set-Content -Encoding utf8 .agentops\data\travel-conversations.jsonl
+```
+
+Then update the evaluation contract in `agentops.yaml`. The important part is
+that `rubrics[].evaluator` names the rubric evaluator that Foundry / azd will
+run. If your Foundry Observe flow generated a different rubric evaluator name,
+use that exact name here.
+
+```yaml
+dataset: .agentops/data/travel-conversations.jsonl
+dataset_kind: multi-turn
+
+rubrics:
+  - name: travel-concierge-quality
+    evaluator: travel-concierge-quality
+    description: Scores the Travel Agent against the intended product behavior.
+    dimensions:
+      - name: task_success
+        description: Completes the user's travel-planning goal across the conversation.
+        weight: 0.5
+      - name: constraint_following
+        description: Carries user constraints such as kids, budget, duration, and pace.
+        weight: 0.3
+      - name: safe_booking_behavior
+        description: Avoids claiming live bookings, confirmations, or prices it cannot verify.
+        weight: 0.2
+
+thresholds:
+  task_success: ">=4"
+  constraint_following: ">=4"
+  safe_booking_behavior: ">=4"
+```
+
+Re-run init so the azd recipe actually executes the rubric evaluator instead of
+only recording it as metadata in `agentops.yaml`:
+
+```powershell
+agentops eval init --force
+agentops eval run
+```
+
+If the rubric evaluator name is wrong or missing in Foundry, the run should fail
+closed. That is intentional: a green gate must mean the rubric really ran. When
+it passes, `results.json` records `execution: azd`, the evaluator list, the
+rubric metadata from `agentops.yaml`, and threshold results for the rubric
+dimensions.
+
 ## 11. Generate the PR + dev deploy workflows
 
 > **Pipeline ownership.** This tutorial uses `agentops workflow generate`
@@ -846,7 +906,11 @@ The PR workflow now has two jobs:
    `.agentops/deployments/agentops.candidate.yaml` pointing at the
    staged candidate.
 2. **`eval`** — runs `agentops eval run` against the candidate, then
-   runs Doctor with `--severity-fail critical`.
+   runs Doctor with `--severity-fail critical`. Because the previous step
+   moved the gate to `execution: azd` with `rubrics:`, the workflow is not
+   just checking a smoke response: it runs the Foundry / azd evaluation recipe,
+   applies the Travel Agent rubric dimensions as thresholds, and writes the
+   normalized rubric evidence to `.agentops/results/latest/results.json`.
 
 > **Why does the PR workflow stage in dev, not sandbox?** The PR gate
 > must evaluate the same target the deploy workflow will use. Sandbox
@@ -859,6 +923,9 @@ The PR workflow now has two jobs:
 The dev deploy workflow stages a candidate (same logic), evaluates it,
 summarizes the deployment via `prompt_deploy summarize`, and uploads
 `.agentops/deployments/foundry-agent.json` as a workflow artifact.
+The deploy gate uses the same rubric-aware `agentops eval run`, so the candidate
+that lands in dev has already passed the conversation/rubric gate reviewers saw
+on the PR.
 
 The `--doctor-gate critical` flag controls the Doctor severity floor in
 the PR template. The table below summarizes the three values:
@@ -1327,7 +1394,7 @@ deploys, explicit thresholds, or red-team/governance evidence. Treat those as th
 hardening backlog. The eval gates and the dev deploy loop are
 production-ready.
 
-If you want to show the Build 2026 governance story in the video, keep it as a
+If you want to show the governance evidence path in the video, keep it as a
 short optional callout:
 
 ```powershell
diff --git a/src/agentops/agent/analyzer.py b/src/agentops/agent/analyzer.py
index a9e2234b..4b0c2f0d 100644
--- a/src/agentops/agent/analyzer.py
+++ b/src/agentops/agent/analyzer.py
@@ -12,6 +12,7 @@
 from agentops.agent.checks.foundry_config import run_foundry_config_check
 from agentops.agent.checks.governance import run_governance_check
 from agentops.agent.checks.latency import run_latency_check
+from agentops.agent.checks.observability import run_observability_check
 from agentops.agent.checks.opex_workspace import run_opex_workspace_check
 from agentops.agent.checks.opex import run_opex_check
 from agentops.agent.checks.posture import run_posture_check
@@ -146,6 +147,7 @@ def analyze(
     findings.extend(run_posture_check(resources, posture_config))
     findings.extend(run_opex_workspace_check(workspace))
     findings.extend(run_governance_check(workspace))
+    findings.extend(run_observability_check(workspace))
     findings.extend(run_opex_check(history, config.checks.opex))
     findings.extend(run_release_readiness_check(workspace, history, foundry))
     findings.extend(
diff --git a/src/agentops/agent/checks/catalog.py b/src/agentops/agent/checks/catalog.py
index c85902d5..7db2e2ab 100644
--- a/src/agentops/agent/checks/catalog.py
+++ b/src/agentops/agent/checks/catalog.py
@@ -141,6 +141,18 @@
     "safety.config.continuous_eval_disabled": (
         "https://learn.microsoft.com/azure/ai-foundry/how-to/online-evaluation"
     ),
+    "observability.multiturn_coverage_missing": (
+        "https://learn.microsoft.com/azure/foundry/concepts/observability"
+    ),
+    "observability.rubric_missing": (
+        "https://learn.microsoft.com/azure/foundry/concepts/observability"
+    ),
+    "observability.trace_sampling_missing": (
+        "https://learn.microsoft.com/azure/foundry/concepts/observability"
+    ),
+    "observability.trace_replay_missing": (
+        "https://learn.microsoft.com/azure/foundry/concepts/observability"
+    ),
 }
 
 
@@ -199,6 +211,28 @@ def is_llm_judged(self) -> bool:
         requires=("results_history",),
         flags=("dynamic_id",),
     ),
+    CheckSpec(
+        id="observability.multiturn_coverage_missing",
+        category=Category.QUALITY,
+        title="Multi-turn evaluation coverage is not declared yet",
+        summary=(
+            "The workspace does not declare multi-turn dataset coverage or "
+            "trace-derived conversation rows for Foundry multi-turn evals."
+        ),
+        severities=(Severity.INFO,),
+        requires=("workspace",),
+    ),
+    CheckSpec(
+        id="observability.rubric_missing",
+        category=Category.QUALITY,
+        title="No context-specific rubric evaluator is declared",
+        summary=(
+            "The workspace does not declare a Foundry rubric evaluator or "
+            "rubric dimensions that can be bound to release thresholds."
+        ),
+        severities=(Severity.INFO,),
+        requires=("workspace",),
+    ),
     # ------------------------------------------------------------------
     # Performance
     # ------------------------------------------------------------------
@@ -438,6 +472,28 @@ def is_llm_judged(self) -> bool:
         severities=(Severity.WARNING,),
         requires=("foundry_control",),
     ),
+    CheckSpec(
+        id="observability.trace_sampling_missing",
+        category=Category.OPERATIONAL_EXCELLENCE,
+        title="Intelligent trace sampling is not evidence-ready",
+        summary=(
+            "The workspace does not declare Foundry trace sampling and the "
+            "trace-regression manifest does not include sampling lineage."
+        ),
+        severities=(Severity.WARNING,),
+        requires=("workspace",),
+    ),
+    CheckSpec(
+        id="observability.trace_replay_missing",
+        category=Category.OPERATIONAL_EXCELLENCE,
+        title="Trace replay link is not captured in release evidence",
+        summary=(
+            "The workspace has no trace replay URL in agentops.yaml or in "
+            "trace-derived dataset lineage."
+        ),
+        severities=(Severity.INFO,),
+        requires=("workspace",),
+    ),
     CheckSpec(
         id="opex.results_not_gitignored",
         category=Category.OPERATIONAL_EXCELLENCE,
diff --git a/src/agentops/agent/checks/observability.py b/src/agentops/agent/checks/observability.py
new file mode 100644
index 00000000..40ce615a
--- /dev/null
+++ b/src/agentops/agent/checks/observability.py
@@ -0,0 +1,175 @@
+"""Foundry observability readiness checks."""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any, List
+
+from agentops.agent.findings import Category, Finding, Severity
+from agentops.utils.yaml import load_yaml
+
+SOURCE_NAME = "observability"
+
+
+def run_observability_check(workspace: Path) -> List[Finding]:
+    """Validate repo-side intent for Foundry observability signals.
+
+    These checks are deliberately read-only. Foundry owns the runtime surfaces
+    for traces, intelligent sampling, replay, multi-turn eval, and rubric
+    evaluators; AgentOps verifies whether the repo has enough metadata and
+    evidence to make those signals part of release readiness.
+    """
+
+    config = _safe_config(workspace)
+    if not config and not (workspace / ".agentops").exists():
+        return []
+
+    findings: List[Finding] = []
+    findings.extend(_check_multiturn_coverage(config, workspace))
+    findings.extend(_check_rubric_coverage(config))
+    findings.extend(_check_trace_sampling(config, workspace))
+    findings.extend(_check_trace_replay(config, workspace))
+    return findings
+
+
+def _check_multiturn_coverage(config: dict[str, Any], workspace: Path) -> List[Finding]:
+    if str(config.get("dataset_kind") or "auto") == "multi-turn":
+        return []
+    manifest = _trace_manifest(workspace)
+    lineage = manifest.get("lineage") if isinstance(manifest, dict) else {}
+    if isinstance(lineage, dict) and int(lineage.get("multi_turn_rows") or 0) > 0:
+        return []
+    return [
+        Finding(
+            id="observability.multiturn_coverage_missing",
+            severity=Severity.INFO,
+            category=Category.QUALITY,
+            title="Multi-turn evaluation coverage is not declared yet",
+            summary=(
+                "Foundry multi-turn evaluation is designed to catch context "
+                "carryover, tone drift, contradictions, and task-completion "
+                "failures across a full conversation. AgentOps did not find "
+                "`dataset_kind: multi-turn` or trace-derived conversation rows."
+            ),
+            recommendation=(
+                "After the single-turn smoke gate is green, add a conversation "
+                "dataset or use Foundry traces-to-dataset output with `messages` "
+                "rows, then set `dataset_kind: multi-turn` in agentops.yaml."
+            ),
+            source=SOURCE_NAME,
+        )
+    ]
+
+
+def _check_rubric_coverage(config: dict[str, Any]) -> List[Finding]:
+    rubrics = config.get("rubrics")
+    if isinstance(rubrics, list) and rubrics:
+        return []
+    return [
+        Finding(
+            id="observability.rubric_missing",
+            severity=Severity.INFO,
+            category=Category.QUALITY,
+            title="No context-specific rubric evaluator is declared",
+            summary=(
+                "Foundry rubric evaluators let teams score the agent against "
+                "task-specific criteria such as task success, tone, safety, cost, "
+                "and latency. AgentOps did not find a `rubrics:` block in "
+                "agentops.yaml."
+            ),
+            recommendation=(
+                "Declare at least one rubric in agentops.yaml and bind its "
+                "dimension metrics to thresholds, or reference the rubric through "
+                "the azd eval recipe used by `execution: azd`."
+            ),
+            source=SOURCE_NAME,
+        )
+    ]
+
+
+def _check_trace_sampling(config: dict[str, Any], workspace: Path) -> List[Finding]:
+    observability = config.get("observability")
+    trace_sampling = (
+        observability.get("trace_sampling")
+        if isinstance(observability, dict)
+        else None
+    )
+    if isinstance(trace_sampling, dict) and trace_sampling.get("enabled") is True:
+        return []
+    manifest = _trace_manifest(workspace)
+    lineage = manifest.get("lineage") if isinstance(manifest, dict) else {}
+    if isinstance(lineage, dict) and lineage.get("sampling_policies"):
+        return []
+    return [
+        Finding(
+            id="observability.trace_sampling_missing",
+            severity=Severity.WARNING,
+            category=Category.OPERATIONAL_EXCELLENCE,
+            title="Intelligent trace sampling is not evidence-ready",
+            summary=(
+                "Foundry intelligent trace sampling evaluates the most "
+                "signal-rich production traces without scoring every request. "
+                "AgentOps did not find `observability.trace_sampling.enabled: true` "
+                "or sampling metadata in the trace-regression manifest."
+            ),
+            recommendation=(
+                "Enable Foundry trace sampling or document the sampling policy in "
+                "`observability.trace_sampling`, then regenerate trace-derived "
+                "dataset candidates so release evidence includes the lineage."
+            ),
+            source=SOURCE_NAME,
+        )
+    ]
+
+
+def _check_trace_replay(config: dict[str, Any], workspace: Path) -> List[Finding]:
+    observability = config.get("observability")
+    if isinstance(observability, dict) and observability.get("trace_replay_url"):
+        return []
+    manifest = _trace_manifest(workspace)
+    lineage = manifest.get("lineage") if isinstance(manifest, dict) else {}
+    if isinstance(lineage, dict) and lineage.get("replay_urls"):
+        return []
+    return [
+        Finding(
+            id="observability.trace_replay_missing",
+            severity=Severity.INFO,
+            category=Category.OPERATIONAL_EXCELLENCE,
+            title="Trace replay link is not captured in release evidence",
+            summary=(
+                "Foundry trace replay and visualization make incident review "
+                "faster by linking each failure to the exact prompts, decisions, "
+                "tool calls, and outputs. AgentOps did not find a replay URL in "
+                "agentops.yaml or the trace-regression manifest."
+            ),
+            recommendation=(
+                "After selecting representative traces in Foundry, keep the replay "
+                "link in `observability.trace_replay_url` or include it in trace "
+                "exports before running `agentops eval promote-traces --apply`."
+            ),
+            source=SOURCE_NAME,
+        )
+    ]
+
+
+def _trace_manifest(workspace: Path) -> dict[str, Any]:
+    path = workspace / ".agentops" / "data" / "trace-regression-manifest.json"
+    if not path.exists():
+        return {}
+    try:
+        payload = json.loads(path.read_text(encoding="utf-8"))
+    except (OSError, json.JSONDecodeError):
+        return {}
+    return payload if isinstance(payload, dict) else {}
+
+
+def _safe_config(workspace: Path) -> dict[str, Any]:
+    path = workspace / "agentops.yaml"
+    if not path.exists():
+        return {}
+    try:
+        data = load_yaml(path)
+    except Exception:
+        return {}
+    return data if isinstance(data, dict) else {}
diff --git a/src/agentops/agent/cockpit.py b/src/agentops/agent/cockpit.py
index 12b74f65..b4cd71a4 100644
--- a/src/agentops/agent/cockpit.py
+++ b/src/agentops/agent/cockpit.py
@@ -27,6 +27,7 @@
 
 from agentops.agent.history import AnalysisRecord, load_analysis_history
 from agentops.agent.time_range import TimeRange, parse_time_range, preset_keys
+from agentops.utils.yaml import load_yaml
 
 
 # ---------------------------------------------------------------------------
@@ -1891,6 +1892,12 @@ def _build_readiness_checklist(
     deep-links panel.
     """
     checks: List[Dict[str, Any]] = []
+    agentops_config = _read_agentops_config(workspace)
+    trace_manifest = _read_trace_regression_manifest(workspace)
+    raw_trace_lineage = trace_manifest.get("lineage")
+    trace_lineage: Dict[str, Any] = (
+        raw_trace_lineage if isinstance(raw_trace_lineage, dict) else {}
+    )
 
     tracing_ok = bool(telemetry.get("enabled"))
     checks.append(
@@ -1948,6 +1955,80 @@ def _build_readiness_checklist(
         }
     )
 
+    multi_turn_ready = (
+        agentops_config.get("dataset_kind") == "multi-turn"
+        or int(trace_lineage.get("multi_turn_rows") or 0) > 0
+    )
+    checks.append(
+        {
+            "title": "Multi-turn eval coverage",
+            "status": "ok" if multi_turn_ready else "muted",
+            "detail": (
+                "Detected conversation-level evaluation coverage from "
+                "<code>dataset_kind: multi-turn</code> or trace-derived rows."
+                if multi_turn_ready
+                else "<strong>How to complete:</strong> add a conversation "
+                "dataset or promote traces that include <code>messages</code>, "
+                "then set <code>dataset_kind: multi-turn</code> in "
+                "<code>agentops.yaml</code>."
+            ),
+        }
+    )
+
+    rubrics = agentops_config.get("rubrics")
+    rubric_ready = isinstance(rubrics, list) and bool(rubrics)
+    checks.append(
+        {
+            "title": "Rubric evaluator gate",
+            "status": "ok" if rubric_ready else "muted",
+            "detail": (
+                "Detected <code>rubrics:</code> in <code>agentops.yaml</code>. "
+                "AgentOps requires <code>execution: azd</code> so the Foundry "
+                "rubric evaluator actually runs."
+                if rubric_ready
+                else "<strong>How to complete:</strong> declare a task-specific "
+                "<code>rubrics:</code> block and bind its dimensions to thresholds. "
+                "Use <code>execution: azd</code> so Foundry evaluates the rubric."
+            ),
+        }
+    )
+
+    observability = agentops_config.get("observability")
+    observability = observability if isinstance(observability, dict) else {}
+    trace_sampling = observability.get("trace_sampling")
+    trace_sampling = trace_sampling if isinstance(trace_sampling, dict) else {}
+    sampling_ready = bool(trace_sampling.get("enabled")) or bool(trace_lineage.get("sampling_policies"))
+    checks.append(
+        {
+            "title": "Trace sampling for live quality",
+            "status": "ok" if sampling_ready else "muted",
+            "detail": (
+                "Detected trace-sampling intent or sampling lineage in the "
+                "trace-derived dataset manifest."
+                if sampling_ready
+                else "<strong>How to complete:</strong> enable Foundry trace "
+                "sampling or document the policy under "
+                "<code>observability.trace_sampling</code>, then harvest sampled "
+                "traces into dataset candidates."
+            ),
+        }
+    )
+
+    replay_ready = bool(observability.get("trace_replay_url")) or bool(trace_lineage.get("replay_urls"))
+    checks.append(
+        {
+            "title": "Trace replay linked to evidence",
+            "status": "ok" if replay_ready else "muted",
+            "detail": (
+                "Detected a Foundry trace replay link in config or trace lineage."
+                if replay_ready
+                else "<strong>How to complete:</strong> keep a representative "
+                "Foundry replay link in <code>observability.trace_replay_url</code> "
+                "or include replay URLs when promoting traces."
+            ),
+        }
+    )
+
     eval_workflow = _detect_eval_workflow(workspace)
     cont_eval = bool(eval_workflow.get("present"))
     eval_runner = str(eval_workflow.get("runner") or "")
@@ -2481,6 +2562,21 @@ def _read_json_object(path: Path) -> Dict[str, Any]:
     return payload if isinstance(payload, dict) else {}
 
 
+def _read_agentops_config(workspace: Path) -> Dict[str, Any]:
+    path = workspace / "agentops.yaml"
+    if not path.exists():
+        return {}
+    try:
+        payload = load_yaml(path)
+    except Exception:
+        return {}
+    return payload if isinstance(payload, dict) else {}
+
+
+def _read_trace_regression_manifest(workspace: Path) -> Dict[str, Any]:
+    return _read_json_object(workspace / ".agentops" / "data" / "trace-regression-manifest.json")
+
+
 def _official_eval_artifact_status(workspace: Path) -> Dict[str, Any]:
     base = workspace / ".agentops" / "official-eval"
     metadata = _read_json_object(base / "metadata.json")
diff --git a/src/agentops/core/agentops_config.py b/src/agentops/core/agentops_config.py
index 23bef990..a920e545 100644
--- a/src/agentops/core/agentops_config.py
+++ b/src/agentops/core/agentops_config.py
@@ -67,6 +67,9 @@
 #: How cloud evaluation submits local dataset rows to Foundry.
 DatasetSyncMode = Literal["auto", "inline", "foundry"]
 
+#: Dataset shape used by the evaluator runtime or Foundry / azd recipes.
+DatasetKind = Literal["auto", "single-turn", "multi-turn"]
+
 #: Internal-only literal kept for the publisher dispatch table. Derived from
 #: ``execution`` + ``publish`` via :meth:`AgentOpsConfig.publish_target`.
 PublishTarget = Literal["foundry", "foundry_cloud"]
@@ -209,6 +212,110 @@ def _version_non_empty(cls, value: str) -> str:
         return value
 
 
+class RubricDimensionConfig(BaseModel):
+    """One weighted dimension in a Foundry rubric evaluator.
+
+    Rubrics are optional and additive. AgentOps records them as release
+    readiness intent and uses thresholds to gate the metrics that Foundry/azd
+    emits for each dimension.
+    """
+
+    name: str  # required; stripped and rejected when blank by _text_non_empty
+    description: str  # required; validated the same way as name
+    weight: Optional[float] = None  # optional; no range check is applied in this model
+
+    model_config = ConfigDict(extra="forbid")  # unknown keys fail validation
+
+    @field_validator("name", "description")
+    @classmethod
+    def _text_non_empty(cls, value: str) -> str:
+        value = value.strip()  # the stripped text is what gets stored
+        if not value:
+            raise ValueError("rubric dimension fields must be non-empty")
+        return value
+
+
+class RubricConfig(BaseModel):
+    """Context-specific evaluator criteria for Foundry rubric scoring."""
+
+    name: str  # required; stripped and rejected when blank by _name_non_empty
+    description: Optional[str] = None  # optional; blank text is rejected when provided
+    dimensions: List[RubricDimensionConfig] = Field(default_factory=list)  # may be empty
+    evaluator: Optional[str] = Field(
+        None,
+        description="Optional Foundry/azd evaluator name when the rubric is registered remotely.",
+    )
+
+    model_config = ConfigDict(extra="forbid")  # unknown keys fail validation
+
+    @field_validator("name")
+    @classmethod
+    def _name_non_empty(cls, value: str) -> str:
+        value = value.strip()  # the stripped text is what gets stored
+        if not value:
+            raise ValueError("rubric name must be non-empty")
+        return value
+
+    @field_validator("description", "evaluator")
+    @classmethod
+    def _optional_text_non_empty(cls, value: Optional[str]) -> Optional[str]:
+        if value is None:  # omitted optional fields pass through untouched
+            return value
+        value = value.strip()
+        if not value:
+            raise ValueError("rubric optional text fields must be non-empty when provided")
+        return value
+
+
+class TraceSamplingConfig(BaseModel):
+    """Foundry intelligent trace-sampling readiness contract."""
+
+    enabled: bool = False  # sampling is off unless declared
+    mode: Literal["manual", "foundry", "scheduled"] = "manual"  # any other value fails validation
+    description: Optional[str] = None  # optional; blank text is rejected when provided
+
+    model_config = ConfigDict(extra="forbid")  # unknown keys fail validation
+
+    @field_validator("description")
+    @classmethod
+    def _description_non_empty(cls, value: Optional[str]) -> Optional[str]:
+        if value is None:  # an omitted description passes through untouched
+            return value
+        value = value.strip()
+        if not value:
+            raise ValueError("observability.trace_sampling.description must be non-empty")
+        return value
+
+
+class ObservabilityConfig(BaseModel):
+    """Foundry observability readiness metadata.
+
+    The fields are read-only intent for Doctor, Cockpit, and release evidence.
+    AgentOps does not create Foundry trace replay, sampling, or portal resources
+    from this block.
+    """
+
+    tracing_enabled: bool = False
+    trace_sampling: TraceSamplingConfig = Field(default_factory=TraceSamplingConfig)
+    trace_replay_url: Optional[str] = None  # optional http(s) link, validated below
+    evaluations_url: Optional[str] = None  # optional http(s) link, validated below
+    datasets_url: Optional[str] = None  # optional http(s) link, validated below
+
+    model_config = ConfigDict(extra="forbid")  # unknown keys fail validation
+
+    @field_validator("trace_replay_url", "evaluations_url", "datasets_url")
+    @classmethod
+    def _url_non_empty(cls, value: Optional[str]) -> Optional[str]:
+        if value is None:  # omitted links pass through untouched
+            return value
+        value = value.strip()
+        if not value:
+            raise ValueError("observability URLs must be non-empty when provided")
+        if not value.lower().startswith(("https://", "http://")):  # URI schemes are case-insensitive
+            raise ValueError("observability URLs must start with http:// or https://")
+        return value  # stored with its original casing; only the prefix check is case-folded
+
+
 class PromptAgentBootstrap(BaseModel):
     """Bootstrap defaults for prompt-agent CI/CD when the target Foundry
     project does not yet contain the seed agent referenced by ``agent``.
@@ -369,11 +476,25 @@ class AgentOpsConfig(BaseModel):
         Optional governance artifact paths. These are read-only inputs for
         Doctor and release evidence; AgentOps validates and references them but
         does not execute ASSERT, apply ACS controls, or run red-team campaigns.
+
+    ``dataset_kind`` / ``rubrics`` / ``observability``
+        Optional Foundry observability metadata. These fields keep existing
+        single-turn evals working while letting Doctor, Cockpit, CI evidence, and
+        azd/Foundry recipes reason about multi-turn coverage, rubric gates, trace
+        sampling, and trace replay links.
     """
 
     version: int = Field(..., description="Schema version. Must be 1.")
     agent: str = Field(..., description="Target identifier (name:version, URL, or model:deployment)")
     dataset: Path = Field(..., description="Path to a JSONL dataset file")
+    dataset_kind: DatasetKind = Field(
+        "auto",
+        description=(
+            "Dataset shape. 'auto' preserves current behavior, 'single-turn' "
+            "requires input/expected rows, and 'multi-turn' documents that rows "
+            "represent conversations or message histories."
+        ),
+    )
     prompt_file: Optional[Path] = Field(
         None,
         description=(
@@ -414,6 +535,10 @@ class AgentOpsConfig(BaseModel):
     auth_header_env: Optional[str] = None
 
     evaluators: Optional[List[EvaluatorOverride]] = None
+    rubrics: List[RubricConfig] = Field(
+        default_factory=list,
+        description="Optional context-specific rubric evaluator definitions.",
+    )
 
     publish: bool = Field(
         False,
@@ -458,6 +583,10 @@ class AgentOpsConfig(BaseModel):
         default_factory=DatasetSyncConfig,
         description="Cloud evaluation dataset submission policy.",
     )
+    observability: ObservabilityConfig = Field(
+        default_factory=ObservabilityConfig,
+        description="Foundry observability readiness metadata.",
+    )
     prompt_agent_bootstrap: Optional[PromptAgentBootstrap] = Field(
         None,
         description=(
diff --git a/src/agentops/core/release_evidence.py b/src/agentops/core/release_evidence.py
index 00f55aae..06778399 100644
--- a/src/agentops/core/release_evidence.py
+++ b/src/agentops/core/release_evidence.py
@@ -51,6 +51,7 @@ class ReleaseEvidence(BaseModel):
     foundry: Dict[str, Any] = Field(default_factory=dict)
     monitoring: Dict[str, Any] = Field(default_factory=dict)
     trace_dataset: Dict[str, Any] = Field(default_factory=dict)
+    observability: Dict[str, Any] = Field(default_factory=dict)
     ailz: Dict[str, Any] = Field(default_factory=dict)
     governance: Dict[str, Any] = Field(default_factory=dict)
 
diff --git a/src/agentops/pipeline/azd_runner.py b/src/agentops/pipeline/azd_runner.py
index b8f34b82..cf77a64d 100644
--- a/src/agentops/pipeline/azd_runner.py
+++ b/src/agentops/pipeline/azd_runner.py
@@ -202,6 +202,7 @@ def normalize_to_results(
             "azd eval run returned no numeric metrics, so AgentOps cannot apply "
             "thresholds or claim the gate passed."
         )
+    _validate_rubric_evidence(config=config, recipe=recipe, metrics=aggregate_metrics)
 
     metric_binding = bind_threshold_metrics(config.thresholds.keys(), aggregate_metrics.keys())
     if metric_binding.unmatched:
@@ -270,6 +271,8 @@ def normalize_to_results(
             "version": config.version,
             "agent": config.agent,
             "thresholds": dict(config.thresholds),
+            "dataset_kind": config.dataset_kind,
+            "rubrics": [rubric.model_dump(mode="json") for rubric in config.rubrics],
             "execution": "azd",
             "backend_requested": "azd",
             "backend_effective": "azd",
@@ -291,6 +294,46 @@ def normalize_to_results(
     )
 
 
+def _validate_rubric_evidence(
+    *,
+    config: AgentOpsConfig,
+    recipe: EvalRecipe,
+    metrics: Dict[str, float],
+) -> None:
+    if not config.rubrics:  # nothing to validate when no rubrics are configured
+        return
+
+    recipe_evaluators = {evaluator.name for evaluator in recipe.evaluators}
+    threshold_names = set(config.thresholds)
+    metric_names = set(metrics)
+    missing: list[str] = []  # every gap is collected so one error reports them all
+
+    for rubric in config.rubrics:
+        evaluator_name = (rubric.evaluator or rubric.name).strip()  # explicit evaluator wins over the rubric name
+        if evaluator_name not in recipe_evaluators:
+            missing.append(f"rubric evaluator `{evaluator_name}` in eval.yaml")
+        dimension_names = [dimension.name for dimension in rubric.dimensions]
+        thresholded_dimensions = [
+            name for name in dimension_names if name in threshold_names  # exact-name match only
+        ]
+        if not thresholded_dimensions:  # also true for a rubric that declares no dimensions
+            missing.append(
+                f"threshold for at least one dimension of rubric `{rubric.name}`"
+            )
+            continue  # without a thresholded dimension there is no metric to look for
+        for dimension_name in thresholded_dimensions:
+            if dimension_name not in metric_names:
+                missing.append(f"azd metric for rubric dimension `{dimension_name}`")
+
+    if missing:
+        raise AzdBackendError(
+            "rubric evidence is incomplete; "
+            + "; ".join(missing)
+            + ". Run `agentops eval init --force` after configuring rubrics and "
+            "bind rubric dimension thresholds in agentops.yaml."
+        )
+
+
 def write_raw_artifacts(azd_run: AzdEvalRun, output_dir: Path) -> None:
     """Write native azd payload and command streams for debugging/evidence."""
 
@@ -369,6 +412,7 @@ def _run_command_with_progress(
                     errors="replace",
                     stdout=stdout_file,
                     stderr=stderr_file,
+                    stdin=subprocess.DEVNULL,
                 )
                 while True:
                     returncode = process.poll()
diff --git a/src/agentops/pipeline/orchestrator.py b/src/agentops/pipeline/orchestrator.py
index e3c9ab4c..a7f15b49 100644
--- a/src/agentops/pipeline/orchestrator.py
+++ b/src/agentops/pipeline/orchestrator.py
@@ -80,6 +80,12 @@ def _run_evaluation(
     options: RunOptions,
 ) -> RunResult:
     """Run a full evaluation after optional telemetry has been initialized."""
+    if config.rubrics and _resolve_execution_backend(config) != "azd":
+        raise ValueError(
+            "rubrics require execution: azd so Foundry/azd runs the rubric "
+            "evaluator. Set `execution: azd`, run `agentops eval init`, and "
+            "bind rubric dimension thresholds in agentops.yaml."
+        )
     if options.baseline_path is not None and not options.baseline_path.exists():
         raise FileNotFoundError(
             f"baseline file not found: {options.baseline_path}. "
diff --git a/src/agentops/services/azd_eval_init.py b/src/agentops/services/azd_eval_init.py
index aae84d12..2162769b 100644
--- a/src/agentops/services/azd_eval_init.py
+++ b/src/agentops/services/azd_eval_init.py
@@ -337,6 +337,8 @@ def _ensure_azd_project_env_metadata(
     if not location.found or location.env_path is None:
         location = ensure_azd_env(workspace, "sandbox")
     env_path = location.env_path
+    if env_path is None:
+        return
     updates = {
         "AZURE_AI_FOUNDRY_PROJECT_ENDPOINT": endpoint,
         "FOUNDRY_PROJECT_ENDPOINT": endpoint,
@@ -478,7 +480,20 @@ def _azd_evaluators_from_config(config_path: Path) -> tuple[str, ...]:
             mapped = name if name.startswith("builtin.") else _EVALUATOR_NAME_TO_AZD.get(name)
             if mapped and mapped not in names:
                 names.append(mapped)
-    return tuple(names) if names else _DEFAULT_AZD_EVALUATORS
+    if not names:
+        names.extend(_DEFAULT_AZD_EVALUATORS)
+    raw_rubrics = data.get("rubrics")
+    if isinstance(raw_rubrics, list):
+        for item in raw_rubrics:
+            if not isinstance(item, dict):
+                continue
+            raw_name = item.get("evaluator") or item.get("name")
+            if not isinstance(raw_name, str) or not raw_name.strip():
+                continue
+            name = raw_name.strip()
+            if name not in names:
+                names.append(name)
+    return tuple(names)
 
 
 def _azd_dataset_from_agentops_dataset(dataset: Path, *, workspace: Path) -> Path:
diff --git a/src/agentops/services/evidence_pack.py b/src/agentops/services/evidence_pack.py
index 08dccb8b..fc2814ac 100644
--- a/src/agentops/services/evidence_pack.py
+++ b/src/agentops/services/evidence_pack.py
@@ -65,6 +65,7 @@ def build_release_evidence(
     foundry = _foundry_status(analysis)
     monitoring = _monitoring_status(analysis)
     trace_dataset = _trace_dataset_status(root)
+    observability = _observability_status(root, trace_dataset)
     ailz = _ailz_status(analysis)
     governance = _governance_status(root)
 
@@ -80,12 +81,13 @@ def build_release_evidence(
     _add_doctor_check(checks, blockers, warnings, ready, doctor)
     _add_foundry_check(checks, warnings, ready, foundry)
     _add_monitoring_check(checks, warnings, ready, monitoring)
+    _add_observability_check(checks, warnings, ready, observability)
     _add_trace_dataset_check(checks, warnings, ready, trace_dataset)
     _add_ailz_check(checks, warnings, ready, ailz)
     _add_governance_check(checks, warnings, ready, governance)
 
     status = "blocked" if blockers else "ready_with_warnings" if warnings else "ready"
-    links = _links(latest_eval)
+    links = _links(latest_eval, observability)
     target = latest_eval.get("target")
     generated_at = datetime.now(timezone.utc).isoformat()
 
@@ -106,6 +108,7 @@ def build_release_evidence(
         foundry=foundry,
         monitoring=monitoring,
         trace_dataset=trace_dataset,
+        observability=observability,
         ailz=ailz,
         governance=governance,
     )
@@ -452,6 +455,42 @@ def _trace_dataset_status(root: Path) -> dict[str, Any]:
     return {"status": "ok", "manifest": str(manifest), **payload}
 
 
+def _observability_status(root: Path, trace_dataset: dict[str, Any]) -> dict[str, Any]:
+    """Summarize Foundry observability evidence from agentops.yaml and trace lineage."""
+    config = _agentops_config(root)
+    observability = config.get("observability")
+    observability = observability if isinstance(observability, dict) else {}
+    rubrics = config.get("rubrics")
+    rubrics = rubrics if isinstance(rubrics, list) else []
+    lineage = trace_dataset.get("lineage")
+    lineage = lineage if isinstance(lineage, dict) else {}
+    trace_sampling = observability.get("trace_sampling")
+    trace_sampling = trace_sampling if isinstance(trace_sampling, dict) else {}
+    replay_urls = [str(url) for url in _as_list(lineage.get("replay_urls")) if url]
+    evaluation_urls = [str(url) for url in _as_list(lineage.get("evaluation_urls")) if url]
+    sampling_policies = [str(item) for item in _as_list(lineage.get("sampling_policies")) if item]
+    try:  # a malformed manifest count must not abort the whole evidence build
+        multi_turn_rows = int(lineage.get("multi_turn_rows") or 0)
+    except (TypeError, ValueError, OverflowError):
+        multi_turn_rows = 0
+    return {
+        "status": "ok" if observability or rubrics or lineage else "not_configured",
+        "dataset_kind": config.get("dataset_kind", "auto"),
+        "multi_turn_ready": config.get("dataset_kind") == "multi-turn" or multi_turn_rows > 0,
+        "multi_turn_rows": multi_turn_rows,
+        "rubrics_count": len(rubrics),
+        "rubrics": rubrics,
+        "trace_sampling_enabled": bool(trace_sampling.get("enabled")) or bool(sampling_policies),
+        "trace_sampling_mode": trace_sampling.get("mode"),
+        "sampling_policies": sampling_policies,
+        "trace_replay_urls": replay_urls
+        or ([str(observability["trace_replay_url"])] if observability.get("trace_replay_url") else []),
+        "evaluation_urls": evaluation_urls
+        or ([str(observability["evaluations_url"])] if observability.get("evaluations_url") else []),
+        "datasets_url": observability.get("datasets_url"),
+    }
+
+
 def _ailz_status(analysis: Optional[AnalysisResult]) -> dict[str, Any]:
     if analysis is None:
         return {"status": "not_run"}
@@ -661,6 +700,50 @@ def _add_monitoring_check(
     checks.append(ReleaseEvidenceCheck(name="Runtime monitoring", status="warning", summary=message, evidence=monitoring))
 
 
+def _add_observability_check(
+    checks: list[ReleaseEvidenceCheck],
+    warnings: list[str],
+    ready: list[str],
+    observability: dict[str, Any],
+) -> None:
+    missing: list[str] = []  # human-readable names of the absent signals
+    if not observability.get("multi_turn_ready"):
+        missing.append("multi-turn eval coverage")
+    if int(observability.get("rubrics_count") or 0) <= 0:
+        missing.append("rubric evaluator")
+    if not observability.get("trace_sampling_enabled"):
+        missing.append("intelligent trace sampling")
+    if not observability.get("trace_replay_urls"):
+        missing.append("trace replay link")
+
+    if not missing:  # all four signals present -> record a "ready" check
+        message = (
+            "Foundry observability signals are evidence-ready: "
+            "multi-turn coverage, rubric scoring, trace sampling, and replay links."
+        )
+        ready.append(message)
+        checks.append(
+            ReleaseEvidenceCheck(
+                name="Foundry observability",
+                status="ready",
+                summary=message,
+                evidence=observability,
+            )
+        )
+        return
+
+    message = "Foundry observability evidence is incomplete: " + ", ".join(missing)
+    warnings.append(message)  # gaps are recorded as a warning; this function takes no blockers list
+    checks.append(
+        ReleaseEvidenceCheck(
+            name="Foundry observability",
+            status="warning",
+            summary=message,
+            evidence=observability,
+        )
+    )
+
+
 def _add_trace_dataset_check(
     checks: list[ReleaseEvidenceCheck],
     warnings: list[str],
@@ -751,11 +834,18 @@ def _add_governance_check(
     )
 
 
-def _links(latest_eval: dict[str, Any]) -> list[ReleaseEvidenceLink]:
+def _links(latest_eval: dict[str, Any], observability: dict[str, Any]) -> list[ReleaseEvidenceLink]:
     links: list[ReleaseEvidenceLink] = []
     report_url = latest_eval.get("foundry_report_url")
     if report_url:
         links.append(ReleaseEvidenceLink(label="Foundry evaluation report", url=str(report_url)))
+    for url in _as_list(observability.get("trace_replay_urls"))[:3]:  # at most three replay links
+        links.append(ReleaseEvidenceLink(label="Foundry trace replay", url=str(url)))
+    for url in _as_list(observability.get("evaluation_urls"))[:3]:  # at most three evaluation links
+        links.append(ReleaseEvidenceLink(label="Foundry evaluation", url=str(url)))
+    datasets_url = observability.get("datasets_url")
+    if datasets_url:  # a single optional datasets link
+        links.append(ReleaseEvidenceLink(label="Foundry datasets", url=str(datasets_url)))
     return links
 
 
diff --git a/src/agentops/services/trace_promotion.py b/src/agentops/services/trace_promotion.py
index 0c2eae94..fa54f8bc 100644
--- a/src/agentops/services/trace_promotion.py
+++ b/src/agentops/services/trace_promotion.py
@@ -232,9 +232,46 @@ def _trace_to_row(trace: dict[str, Any], label_mode: LabelMode) -> Optional[dict
     metadata = {
         "source": "production_trace",
         "trace_id": _first_text(trace, "trace_id", "operation_Id", "operationId", "id"),
+        "operation_id": _first_text(trace, "operation_Id", "operationId"),
+        "span_id": _first_text(trace, "span_id", "spanId", "id"),
         "timestamp": _first_text(trace, "timestamp", "time", "TimeGenerated"),
         "label_mode": label_mode,
         "needs_review": True,
+        "source_system": _first_text(trace, "source_system", "source", "customDimensions.source_system"),
+        "agent": _first_text(trace, "agent", "agent_id", "customDimensions.agent"),
+        "agent_version": _first_text(
+            trace,
+            "agent_version",
+            "customDimensions.agent_version",
+            "customDimensions.agentops.agent.version",
+        ),
+        "foundry_project": _first_text(
+            trace,
+            "foundry_project",
+            "project",
+            "customDimensions.foundry_project",
+        ),
+        "replay_url": _first_text(
+            trace,
+            "replay_url",
+            "trace_replay_url",
+            "customDimensions.replay_url",
+            "customDimensions.trace_replay_url",
+        ),
+        "evaluation_url": _first_text(
+            trace,
+            "evaluation_url",
+            "eval_url",
+            "customDimensions.evaluation_url",
+            "customDimensions.eval_url",
+        ),
+        "sampling_policy": _first_text(
+            trace,
+            "sampling_policy",
+            "sample_reason",
+            "customDimensions.sampling_policy",
+            "customDimensions.sample_reason",
+        ),
     }
     row: dict[str, Any] = {
         "input": input_text,
@@ -247,6 +284,9 @@ def _trace_to_row(trace: dict[str, Any], label_mode: LabelMode) -> Optional[dict
     tool_calls = _first_value(trace, "tool_calls", "customDimensions.tool_calls")
     if tool_calls:
         row["tool_calls"] = tool_calls
+    messages = _first_value(trace, "messages", "conversation", "turns", "customDimensions.messages")
+    if messages:
+        row["messages"] = messages
     return row
 
 
@@ -256,6 +296,7 @@ def _write_trace_dataset(preview: TracePromotionPreview) -> None:
         for row in preview.rows:
             handle.write(json.dumps(row, ensure_ascii=False) + "\n")
 
+    lineage = _lineage_from_rows(preview.rows)
     manifest = {
         "version": 1,
         "generated_at": datetime.now(timezone.utc).isoformat(),
@@ -265,10 +306,59 @@ def _write_trace_dataset(preview: TracePromotionPreview) -> None:
         "skipped": preview.skipped,
         "label_mode": preview.label_mode,
         "human_review_required": True,
+        "lineage": lineage,
     }
     preview.manifest_path.write_text(json.dumps(manifest, indent=2) + "\n", encoding="utf-8")
 
 
+def _lineage_from_rows(rows: list[dict[str, Any]]) -> dict[str, Any]:
+    trace_ids: list[str] = []  # lists keep first-seen order via _append_unique
+    replay_urls: list[str] = []
+    evaluation_urls: list[str] = []
+    source_systems: set[str] = set()  # sets are de-duplicated and sorted on return
+    agents: set[str] = set()
+    agent_versions: set[str] = set()
+    sampling_policies: set[str] = set()
+    multi_turn_rows = 0
+
+    for row in rows:
+        metadata = row.get("metadata")
+        if not isinstance(metadata, dict):
+            continue  # such rows contribute nothing, including to multi_turn_rows
+        _append_unique(trace_ids, metadata.get("trace_id"))
+        _append_unique(replay_urls, metadata.get("replay_url"))
+        _append_unique(evaluation_urls, metadata.get("evaluation_url"))
+        _add_text(source_systems, metadata.get("source_system"))
+        _add_text(agents, metadata.get("agent"))
+        _add_text(agent_versions, metadata.get("agent_version"))
+        _add_text(sampling_policies, metadata.get("sampling_policy"))
+        if row.get("messages"):  # a non-empty "messages" value counts the row as multi-turn
+            multi_turn_rows += 1
+
+    return {
+        "trace_ids": trace_ids,
+        "replay_urls": replay_urls,
+        "evaluation_urls": evaluation_urls,
+        "source_systems": sorted(source_systems),
+        "agents": sorted(agents),
+        "agent_versions": sorted(agent_versions),
+        "sampling_policies": sorted(sampling_policies),
+        "multi_turn_rows": multi_turn_rows,
+    }
+
+
+def _append_unique(values: list[str], value: Any) -> None:  # mutates `values` in place
+    text = str(value).strip() if value is not None else ""  # None becomes "" and is skipped below
+    if text and text not in values:  # drop blanks and duplicates, keep first-seen order
+        values.append(text)
+
+
+def _add_text(values: set[str], value: Any) -> None:  # mutates `values` in place
+    text = str(value).strip() if value is not None else ""  # None becomes "" and is skipped below
+    if text:  # blank text is never added
+        values.add(text)
+
+
 def _first_text(data: dict[str, Any], *keys: str) -> Optional[str]:
     value = _first_value(data, *keys)
     if value is None:
diff --git a/src/agentops/templates/agentops.yaml b/src/agentops/templates/agentops.yaml
index 910fb318..30493277 100644
--- a/src/agentops/templates/agentops.yaml
+++ b/src/agentops/templates/agentops.yaml
@@ -20,6 +20,13 @@ agent: "my-agent:1"
 
 dataset: .agentops/data/smoke.jsonl
 
+# Optional. Leave as auto for existing single-turn smoke tests. Set to
+# multi-turn when the dataset rows represent conversations / message histories
+# so Doctor, Cockpit, and release evidence can treat conversation-level evals as
+# part of the readiness proof.
+#
+# dataset_kind: auto
+
 # Optional. Source-controlled instructions file used by prompt-agent CI/CD.
 # Generated prompt-agent deploy workflows create a candidate Foundry version
 # from this file, evaluate that exact version, then record it as deployed only
@@ -52,6 +59,25 @@ dataset: .agentops/data/smoke.jsonl
 #   coherence: ">=3"
 #   groundedness: ">=3"
 #   avg_latency_seconds: "<=30"
+#
+# Optional. Context-specific rubric evaluators. When this block is present,
+# AgentOps requires execution: azd so the Foundry / azd evaluator actually runs;
+# local execution will fail closed instead of pretending rubric scoring happened.
+#
+# rubrics:
+#   - name: travel-concierge-quality
+#     evaluator: travel-concierge-rubric
+#     dimensions:
+#       - name: task_success
+#         description: "Completes the requested task without losing context."
+#         weight: 0.5
+#       - name: safety
+#         description: "Avoids unsafe or unsupported claims."
+#         weight: 0.3
+# Gate rubric dimensions in the thresholds: block (merge with any existing thresholds: keys):
+# thresholds:
+#   task_success: ">=4"
+#   safety: ">=4"
 
 # Optional. Foundry prompt agents and Foundry publishing need a project
 # endpoint. If both this value and AZURE_AI_FOUNDRY_PROJECT_ENDPOINT are set,
@@ -67,6 +93,19 @@ dataset: .agentops/data/smoke.jsonl
 #
 # execution: local
 #
+# Optional. Foundry observability readiness metadata. AgentOps reads
+# this for Doctor/Cockpit/evidence; Foundry still owns tracing, replay, and
+# trace sampling at runtime.
+#
+# observability:
+#   tracing_enabled: true
+#   trace_sampling:
+#     enabled: true
+#     mode: foundry
+#   trace_replay_url: "https://ai.azure.com/..."
+#   evaluations_url: "https://ai.azure.com/..."
+#   datasets_url: "https://ai.azure.com/..."
+#
 # Optional. Publish local results to the Classic Foundry Evaluations panel.
 # Only meaningful when execution: local. With execution: cloud the run is
 # always published (Foundry hosts the run by definition).
diff --git a/tests/unit/test_agent_checks_observability.py b/tests/unit/test_agent_checks_observability.py
new file mode 100644
index 00000000..aa581e8f
--- /dev/null
+++ b/tests/unit/test_agent_checks_observability.py
@@ -0,0 +1,76 @@
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+from agentops.agent.checks.observability import run_observability_check
+
+
+def test_observability_check_flags_missing_build_2026_readiness(tmp_path: Path) -> None:
+    (tmp_path / "agentops.yaml").write_text(  # minimal config: no dataset_kind, rubrics, or observability
+        "version: 1\n"
+        "agent: travel-agent:2\n"
+        "dataset: .agentops/data/smoke.jsonl\n",
+        encoding="utf-8",
+    )
+
+    findings = run_observability_check(tmp_path)
+    ids = {finding.id for finding in findings}
+
+    assert "observability.multiturn_coverage_missing" in ids  # one finding id expected per missing signal
+    assert "observability.rubric_missing" in ids
+    assert "observability.trace_sampling_missing" in ids
+    assert "observability.trace_replay_missing" in ids
+
+
+def test_observability_check_accepts_declared_readiness(tmp_path: Path) -> None:
+    (tmp_path / "agentops.yaml").write_text(  # all four signals are declared in config alone
+        "version: 1\n"
+        "agent: travel-agent:2\n"
+        "dataset: .agentops/data/conversations.jsonl\n"
+        "dataset_kind: multi-turn\n"
+        "rubrics:\n"
+        "  - name: travel-concierge-quality\n"
+        "    dimensions:\n"
+        "      - name: task_success\n"
+        "        description: Completes the requested travel task.\n"
+        "observability:\n"
+        "  trace_sampling:\n"
+        "    enabled: true\n"
+        "    mode: foundry\n"
+        "  trace_replay_url: https://ai.azure.com/traces/trace-1\n",
+        encoding="utf-8",
+    )
+
+    findings = run_observability_check(tmp_path)
+
+    assert findings == []  # no trace manifest is written in this test
+
+
+def test_observability_check_accepts_trace_manifest_lineage(tmp_path: Path) -> None:
+    (tmp_path / "agentops.yaml").write_text(  # config declares only a rubric, with no dimensions
+        "version: 1\n"
+        "agent: travel-agent:2\n"
+        "dataset: .agentops/data/smoke.jsonl\n"
+        "rubrics:\n"
+        "  - name: travel-concierge-quality\n",
+        encoding="utf-8",
+    )
+    manifest = tmp_path / ".agentops" / "data" / "trace-regression-manifest.json"
+    manifest.parent.mkdir(parents=True)
+    manifest.write_text(  # multi-turn, sampling, and replay evidence come from manifest lineage
+        json.dumps(
+            {
+                "lineage": {
+                    "multi_turn_rows": 2,
+                    "sampling_policies": ["foundry-intelligent-sampling"],
+                    "replay_urls": ["https://ai.azure.com/traces/trace-1"],
+                }
+            }
+        ),
+        encoding="utf-8",
+    )
+
+    findings = run_observability_check(tmp_path)
+
+    assert findings == []  # config rubric plus manifest lineage together satisfy every check
diff --git a/tests/unit/test_agentops_config.py b/tests/unit/test_agentops_config.py
index e95134a7..288f56a2 100644
--- a/tests/unit/test_agentops_config.py
+++ b/tests/unit/test_agentops_config.py
@@ -10,7 +10,10 @@
 from agentops.core.agentops_config import (
     AgentOpsConfig,
     DatasetSyncConfig,
+    ObservabilityConfig,
     PromptAgentBootstrap,
+    RubricConfig,
+    RubricDimensionConfig,
     Threshold,
     classify_agent,
 )
@@ -255,6 +258,72 @@ def test_dataset_sync_rejects_empty_name(self) -> None:
                 }
             )
 
+    def test_accepts_build_2026_eval_metadata(self) -> None:
+        cfg = AgentOpsConfig(  # exercises dataset_kind, rubrics, and observability together
+            version=1,
+            agent="travel-agent:2",
+            dataset=".agentops/data/conversations.jsonl",
+            dataset_kind="multi-turn",
+            rubrics=[
+                RubricConfig(
+                    name="travel-concierge-quality",
+                    description="Travel planning behavior",
+                    dimensions=[
+                        RubricDimensionConfig(
+                            name="task_success",
+                            description="Completes the requested travel planning task.",
+                            weight=0.5,
+                        ),
+                        RubricDimensionConfig(
+                            name="tone",
+                            description="Uses concise and helpful travel-advisor tone.",
+                            weight=0.2,
+                        ),
+                    ],
+                    evaluator="builtin.rubric",
+                )
+            ],
+            observability=ObservabilityConfig(
+                tracing_enabled=True,
+                trace_sampling={"enabled": True, "mode": "foundry"},  # plain dict in place of TraceSamplingConfig
+                trace_replay_url="https://ai.azure.com/project/traces/trace-1",
+            ),
+            thresholds={"task_success": ">=4"},  # only one of the two dimensions has a threshold
+        )
+
+        assert cfg.dataset_kind == "multi-turn"
+        assert cfg.rubrics[0].name == "travel-concierge-quality"
+        assert cfg.rubrics[0].dimensions[0].weight == 0.5
+        assert cfg.observability.tracing_enabled is True
+        assert cfg.observability.trace_sampling.enabled is True
+
+    def test_observability_rejects_non_url_links(self) -> None:
+        with pytest.raises(ValidationError, match="observability URLs"):  # matches the validator's message prefix
+            AgentOpsConfig.model_validate(
+                {
+                    "version": 1,
+                    "agent": "travel-agent:2",
+                    "dataset": ".agentops/data/smoke.jsonl",
+                    "observability": {"trace_replay_url": "ai.azure.com/traces"},  # no http(s):// scheme
+                }
+            )
+
+    def test_rubric_rejects_empty_dimension(self) -> None:
+        with pytest.raises(ValidationError, match="rubric dimension"):  # matches the validator's message prefix
+            AgentOpsConfig.model_validate(
+                {
+                    "version": 1,
+                    "agent": "travel-agent:2",
+                    "dataset": ".agentops/data/smoke.jsonl",
+                    "rubrics": [
+                        {
+                            "name": "travel",
+                            "dimensions": [{"name": " ", "description": "score"}],  # whitespace-only name
+                        }
+                    ],
+                }
+            )
+
     def test_prompt_agent_bootstrap_defaults_to_none(self) -> None:
         cfg = AgentOpsConfig(version=1, agent="my-rag:3", dataset="./qa.jsonl")
         assert cfg.prompt_agent_bootstrap is None
diff --git a/tests/unit/test_azd_eval_init.py b/tests/unit/test_azd_eval_init.py
index b42ecb5d..29240b6a 100644
--- a/tests/unit/test_azd_eval_init.py
+++ b/tests/unit/test_azd_eval_init.py
@@ -162,6 +162,54 @@ def fake_run(command, **kwargs):
     assert result.command_ran is True
 
 
+def test_run_azd_eval_init_passes_configured_rubric_evaluator(
+    tmp_path: Path,
+    monkeypatch,
+) -> None:  # the rubric's evaluator name must appear in the spawned command
+    config_path = tmp_path / "agentops.yaml"
+    _write_config(config_path)  # NOTE(review): assumed to emit no rubrics/thresholds keys - confirm
+    config_path.write_text(  # rewrite the file with rubric and threshold sections appended
+        config_path.read_text(encoding="utf-8")
+        + """
+rubrics:
+  - name: travel-quality
+    evaluator: travel-quality-rubric
+    dimensions:
+      - name: task_success
+        description: Completes the requested travel planning task.
+thresholds:
+  task_success: ">=4"
+""",
+        encoding="utf-8",
+    )
+    dataset = tmp_path / ".agentops" / "data" / "smoke.jsonl"
+    dataset.parent.mkdir(parents=True)
+    dataset.write_text('{"input":"hello"}\n', encoding="utf-8")  # single-row JSONL dataset
+    prompt_file = tmp_path / ".agentops" / "prompts" / "travel.md"  # presumably referenced by the base config - verify
+    prompt_file.parent.mkdir(parents=True)
+    prompt_file.write_text("You are a travel planner.", encoding="utf-8")
+
+    monkeypatch.setattr(azd_eval_init, "azd_available", lambda *, cwd=None: True)  # report azd as available
+
+    def fake_run(command, **kwargs):  # stand-in for subprocess.run
+        if command[:3] == ["az", "resource", "list"]:  # answer the resource listing with an empty JSON array
+            return subprocess.CompletedProcess(command, 0, stdout="[]", stderr="")
+        assert command[-2:] == ["--evaluator", "travel-quality-rubric"]  # rubric evaluator is the final flag pair
+        assert command.count("--evaluator") == 3  # presumably two more come from the base config - TODO confirm
+        recipe = Path(kwargs["cwd"]) / "eval.yaml"  # recipe file is created in the command's cwd
+        recipe.write_text("name: travel-agent-eval\n", encoding="utf-8")
+        return subprocess.CompletedProcess(command, 0, stdout="created", stderr="")
+
+    monkeypatch.setattr(subprocess, "run", fake_run)  # patches the shared subprocess module attribute
+
+    result = azd_eval_init.run_azd_eval_init(
+        workspace=tmp_path,
+        config_path=config_path,
+    )
+
+    assert result.command_ran is True
+
 def test_run_azd_eval_init_bootstraps_before_azd_availability_check(
     tmp_path: Path,
     monkeypatch,
diff --git a/tests/unit/test_azd_runner.py b/tests/unit/test_azd_runner.py
index a14a166d..73a8a612 100644
--- a/tests/unit/test_azd_runner.py
+++ b/tests/unit/test_azd_runner.py
@@ -124,6 +124,7 @@ def poll(self) -> int | None:
 
     def fake_popen(command, **kwargs):
         assert command == ["azd", "ai", "agent", "eval", "run"]
+        assert kwargs["stdin"] is azd_runner.subprocess.DEVNULL
         return FakeProcess(stdout=kwargs["stdout"], stderr=kwargs["stderr"])
 
     monkeypatch.setattr(azd_runner.subprocess, "Popen", fake_popen)
@@ -147,6 +148,37 @@ def fake_popen(command, **kwargs):
     ]
 
 
+def test_rubrics_require_azd_backend(tmp_path: Path) -> None:  # rubrics with a local run must raise
+    dataset = tmp_path / "smoke.jsonl"
+    _write_dataset(dataset)  # helper defined elsewhere in this test module
+    config = AgentOpsConfig(
+        version=1,
+        agent="travel-agent:1",
+        dataset=dataset,
+        execution="local",  # deliberately not "azd"
+        rubrics=[
+            {
+                "name": "travel-quality",
+                "dimensions": [
+                    {
+                        "name": "task_success",
+                        "description": "Completes the requested travel task.",
+                    }
+                ],
+            }
+        ],
+    )
+
+    with pytest.raises(ValueError, match="rubrics require execution: azd"):  # regex-searched in the message
+        orchestrator.run_evaluation(
+            config,
+            options=orchestrator.RunOptions(
+                config_path=tmp_path / "agentops.yaml",  # this test never creates the file
+                output_dir=tmp_path / ".agentops" / "results",
+            ),
+        )
+
+
 def test_normalize_to_results_binds_azd_metrics_and_thresholds(tmp_path: Path) -> None:
     recipe_path = tmp_path / "eval.yaml"
     _write_recipe(recipe_path)
@@ -250,6 +282,24 @@ def test_normalize_to_results_binds_rubric_dimensions(tmp_path: Path) -> None:
         agent="travel-agent:1",
         dataset="ignored.jsonl",
         execution="azd",
+        rubrics=[
+            {
+                "name": "travel_quality",
+                "evaluator": "travel_quality_rubric",
+                "dimensions": [
+                    {
+                        "name": "booking_accuracy",
+                        "description": "Books or recommends options accurately.",
+                        "weight": 0.7,
+                    },
+                    {
+                        "name": "policy_enforcement",
+                        "description": "Avoids unsupported booking claims.",
+                        "weight": 0.3,
+                    },
+                ],
+            }
+        ],
         thresholds={
             "travel_quality_rubric": ">=0.8",
             "booking_accuracy": ">=0.8",
@@ -296,6 +346,112 @@ def test_normalize_to_results_binds_rubric_dimensions(tmp_path: Path) -> None:
         "booking_accuracy": "booking_accuracy",
         "policy_enforcement": "policy_enforcement",
     }
+    assert result.config["rubrics"][0]["evaluator"] == "travel_quality_rubric"
+
+
+def test_rubric_config_requires_dimension_threshold_evidence(tmp_path: Path) -> None:
+    recipe_path = tmp_path / "eval.yaml"  # recipe lists the rubric evaluator; the config below omits its dimension
+    recipe_path.write_text(
+        """
+name: rubric-eval
+agent:
+  name: travel-agent
+  kind: prompt-agent
+evaluators:
+  - builtin.coherence
+  - travel_quality_rubric
+""".lstrip(),  # strip the newline that follows the opening quotes
+        encoding="utf-8",
+    )
+    recipe = load_eval_recipe(recipe_path)
+    config = AgentOpsConfig(
+        version=1,
+        agent="travel-agent:1",
+        dataset="ignored.jsonl",
+        execution="azd",
+        rubrics=[
+            {
+                "name": "travel_quality",
+                "evaluator": "travel_quality_rubric",
+                "dimensions": [
+                    {
+                        "name": "booking_accuracy",
+                        "description": "Books or recommends options accurately.",
+                    }
+                ],
+            }
+        ],
+        thresholds={"coherence": ">=0.8"},  # no threshold for booking_accuracy
+    )
+    azd_run = azd_runner.AzdEvalRun(
+        recipe_path=recipe_path,
+        payload={"metrics": {"coherence": 0.91}},  # no booking_accuracy metric either
+        run_id="run-1",
+        status="completed",
+        stdout="{}",
+        stderr="",
+        duration_seconds=3.0,
+    )
+
+    with pytest.raises(azd_runner.AzdBackendError, match="rubric evidence"):  # regex-searched in the message
+        azd_runner.normalize_to_results(
+            azd_run,
+            config=config,
+            recipe=recipe,
+            started_at=datetime.now(timezone.utc),
+        )
+
+
+def test_rubric_config_requires_recipe_evaluator(tmp_path: Path) -> None:
+    recipe_path = tmp_path / "eval.yaml"  # recipe below omits the evaluator named by the config's rubric
+    recipe_path.write_text(
+        """
+name: rubric-eval
+agent:
+  name: travel-agent
+  kind: prompt-agent
+evaluators:
+  - builtin.coherence
+""".lstrip(),  # recipe lists builtin.coherence only
+        encoding="utf-8",
+    )
+    recipe = load_eval_recipe(recipe_path)
+    config = AgentOpsConfig(
+        version=1,
+        agent="travel-agent:1",
+        dataset="ignored.jsonl",
+        execution="azd",
+        rubrics=[
+            {
+                "name": "travel_quality",
+                "evaluator": "travel_quality_rubric",  # not among the recipe's evaluators
+                "dimensions": [
+                    {
+                        "name": "booking_accuracy",
+                        "description": "Books or recommends options accurately.",
+                    }
+                ],
+            }
+        ],
+        thresholds={"booking_accuracy": ">=0.8"},  # dimension threshold is present
+    )
+    azd_run = azd_runner.AzdEvalRun(
+        recipe_path=recipe_path,
+        payload={"metrics": {"booking_accuracy": 0.91}},  # dimension metric is present too
+        run_id="run-1",
+        status="completed",
+        stdout="{}",
+        stderr="",
+        duration_seconds=3.0,
+    )
+
+    with pytest.raises(azd_runner.AzdBackendError, match="rubric evaluator"):  # regex-searched in the message
+        azd_runner.normalize_to_results(
+            azd_run,
+            config=config,
+            recipe=recipe,
+            started_at=datetime.now(timezone.utc),
+        )
 
 
 def test_orchestrator_azd_dispatch_never_invokes_local_runtime(tmp_path: Path) -> None:
diff --git a/tests/unit/test_cockpit.py b/tests/unit/test_cockpit.py
index 3908661d..46edb242 100644
--- a/tests/unit/test_cockpit.py
+++ b/tests/unit/test_cockpit.py
@@ -427,6 +427,43 @@ def test_readiness_splits_tracing_and_includes_continuous_eval(tmp_path: Path):
     assert "Server-side tracing (agent → App Insights)" in html
 
 
+def test_readiness_detects_multiturn_rubric_sampling_and_replay(tmp_path: Path):
+    from agentops.agent.cockpit import _build_readiness_checklist  # private helper, imported inside the test
+
+    (tmp_path / "agentops.yaml").write_text(  # config declaring all four features asserted below
+        "version: 1\n"
+        "agent: travel-agent:3\n"
+        "dataset: .agentops/data/travel-conversations.jsonl\n"
+        "dataset_kind: multi-turn\n"
+        "execution: azd\n"
+        "rubrics:\n"
+        "  - name: travel-concierge-quality\n"
+        "    evaluator: travel-concierge-quality\n"
+        "    dimensions:\n"
+        "      - name: task_success\n"
+        "        description: Completes the requested trip plan.\n"
+        "observability:\n"
+        "  trace_sampling:\n"
+        "    enabled: true\n"
+        "    mode: foundry\n"
+        "  trace_replay_url: https://ai.azure.com/traces/replay/abc\n",
+        encoding="utf-8",
+    )
+
+    readiness = _build_readiness_checklist(
+        tmp_path,
+        {"enabled": True, "detail": "ok", "portal_url": "https://x"},  # presumably tracing status - TODO confirm
+        {"has_data": False},  # NOTE(review): meaning of this positional argument is not visible here
+        watchdog=None,
+    )
+    by_title = {check["title"]: check for check in readiness["checks"]}  # index the checks by title
+
+    assert by_title["Multi-turn eval coverage"]["status"] == "ok"
+    assert by_title["Rubric evaluator gate"]["status"] == "ok"
+    assert by_title["Trace sampling for live quality"]["status"] == "ok"
+    assert by_title["Trace replay linked to evidence"]["status"] == "ok"
+
+
 def test_readiness_non_ready_items_include_remediation(tmp_path: Path, monkeypatch):
     from agentops.agent.cockpit import _build_readiness_checklist
 
diff --git a/tests/unit/test_release_evidence.py b/tests/unit/test_release_evidence.py
index a5a706ec..a822cc10 100644
--- a/tests/unit/test_release_evidence.py
+++ b/tests/unit/test_release_evidence.py
@@ -105,6 +105,47 @@ def test_build_release_evidence_ready_with_warning_without_baseline(tmp_path: Pa
     assert any("No baseline comparison" in warning for warning in evidence.warnings)
 
 
+def test_build_release_evidence_includes_observability_links_and_rubric_status(
+    tmp_path: Path,
+) -> None:  # observability settings in agentops.yaml must show up in the evidence
+    _write_latest_results(tmp_path, passed=True)  # presumably writes a passing results file - verify helper
+    (tmp_path / "agentops.yaml").write_text(  # config with rubric, sampling and three Foundry URLs
+        "version: 1\n"
+        "agent: travel-agent:7\n"
+        "dataset: .agentops/data/travel-conversations.jsonl\n"
+        "dataset_kind: multi-turn\n"
+        "execution: azd\n"
+        "rubrics:\n"
+        "  - name: travel-concierge-quality\n"
+        "    evaluator: travel-concierge-quality\n"
+        "    dimensions:\n"
+        "      - name: task_success\n"
+        "        description: Completes the requested trip plan.\n"
+        "observability:\n"
+        "  trace_sampling:\n"
+        "    enabled: true\n"
+        "    mode: foundry\n"
+        "  trace_replay_url: https://ai.azure.com/traces/replay/abc\n"
+        "  evaluations_url: https://ai.azure.com/evaluations/run-1\n"
+        "  datasets_url: https://ai.azure.com/datasets/travel\n",
+        encoding="utf-8",
+    )
+
+    evidence = build_release_evidence(tmp_path)
+
+    assert evidence.observability["multi_turn_ready"] is True
+    assert evidence.observability["rubrics_count"] == 1
+    assert evidence.observability["trace_sampling_enabled"] is True
+    assert evidence.observability["trace_replay_urls"] == [  # single configured URL is reported as a list
+        "https://ai.azure.com/traces/replay/abc"
+    ]
+    assert any(check.name == "Foundry observability" and check.status == "ready" for check in evidence.checks)
+    labels = {link.label: link.url for link in evidence.links}  # map link label to URL
+    assert labels["Foundry trace replay"] == "https://ai.azure.com/traces/replay/abc"
+    assert labels["Foundry evaluation"] == "https://ai.azure.com/evaluations/run-1"
+    assert labels["Foundry datasets"] == "https://ai.azure.com/datasets/travel"
+
+
 def test_write_release_evidence_redacts_secret_values(tmp_path: Path) -> None:
     evidence = ReleaseEvidence(
         generated_at="2026-01-01T00:00:00+00:00",
diff --git a/tests/unit/test_trace_promotion.py b/tests/unit/test_trace_promotion.py
index 64c6d0fa..8915f8d6 100644
--- a/tests/unit/test_trace_promotion.py
+++ b/tests/unit/test_trace_promotion.py
@@ -70,6 +70,53 @@ def test_promote_traces_apply_writes_dataset_and_manifest(tmp_path: Path) -> Non
     manifest = json.loads(preview.manifest_path.read_text(encoding="utf-8"))
     assert manifest["human_review_required"] is True
     assert manifest["rows"] == 1
+    assert manifest["lineage"]["trace_ids"] == []
+
+
+def test_promote_traces_preserves_foundry_lineage_and_multiturn(tmp_path: Path) -> None:
+    source = tmp_path / "traces.jsonl"  # one-record JSONL input with lineage fields and a 3-message chat
+    source.write_text(
+        json.dumps(
+            {
+                "operation_Id": "op-1",  # asserted below as metadata["trace_id"]
+                "trace_replay_url": "https://ai.azure.com/traces/op-1",
+                "evaluation_url": "https://ai.azure.com/evaluations/eval-1",
+                "agent": "travel-agent",
+                "agent_version": "7",
+                "sampling_policy": "foundry-intelligent-sampling",
+                "messages": [
+                    {"role": "user", "content": "Plan Rome"},
+                    {"role": "assistant", "content": "How many days?"},
+                    {"role": "user", "content": "Three"},
+                ],
+                "customDimensions": {
+                    "input": "Plan Rome",
+                    "response": "Here is a three-day Rome itinerary.",
+                    "source_system": "foundry",
+                },
+            }
+        )
+        + "\n",
+        encoding="utf-8",
+    )
+    output = tmp_path / ".agentops" / "data" / "trace-regression.jsonl"
+
+    preview = promote_traces(source=source, output_path=output, apply=True)  # manifest is read back below
+
+    row = preview.rows[0]  # the only promoted row
+    assert row["metadata"]["trace_id"] == "op-1"
+    assert row["metadata"]["replay_url"] == "https://ai.azure.com/traces/op-1"
+    assert row["metadata"]["evaluation_url"] == "https://ai.azure.com/evaluations/eval-1"
+    assert row["metadata"]["sampling_policy"] == "foundry-intelligent-sampling"
+    assert row["messages"][1]["role"] == "assistant"  # message order from the source record is kept
+    manifest = json.loads(preview.manifest_path.read_text(encoding="utf-8"))
+    assert manifest["lineage"]["trace_ids"] == ["op-1"]
+    assert manifest["lineage"]["replay_urls"] == ["https://ai.azure.com/traces/op-1"]
+    assert manifest["lineage"]["evaluation_urls"] == ["https://ai.azure.com/evaluations/eval-1"]
+    assert manifest["lineage"]["agents"] == ["travel-agent"]
+    assert manifest["lineage"]["agent_versions"] == ["7"]
+    assert manifest["lineage"]["sampling_policies"] == ["foundry-intelligent-sampling"]
+    assert manifest["lineage"]["multi_turn_rows"] == 1  # the single row has a messages list
 
 
 def test_promote_traces_cli_preview_does_not_write(tmp_path: Path) -> None:

From 92ed4fe870bd0d7c4ca1281e98a54471069e9416 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Tue, 9 Jun 2026 01:40:12 +0000
Subject: [PATCH 2/2] chore: prepare release 0.3.12

---
 .claude-plugin/marketplace.json | 2 +-
 .github/plugin/marketplace.json | 2 +-
 CHANGELOG.md                    | 2 ++
 plugins/agentops/package.json   | 2 +-
 plugins/agentops/plugin.json    | 2 +-
 5 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json
index 867782d1..c58e5fcd 100644
--- a/.claude-plugin/marketplace.json
+++ b/.claude-plugin/marketplace.json
@@ -13,7 +13,7 @@
       "name": "agentops-accelerator",
       "source": "../../plugins/agentops",
       "description": "Copilot agent skills for running standardized evaluation workflows with AgentOps Toolkit and Microsoft Foundry agents.",
-      "version": "0.3.8",
+      "version": "0.3.12",
       "keywords": [
         "agentops",
         "evaluation",
diff --git a/.github/plugin/marketplace.json b/.github/plugin/marketplace.json
index 867782d1..c58e5fcd 100644
--- a/.github/plugin/marketplace.json
+++ b/.github/plugin/marketplace.json
@@ -13,7 +13,7 @@
       "name": "agentops-accelerator",
       "source": "../../plugins/agentops",
       "description": "Copilot agent skills for running standardized evaluation workflows with AgentOps Toolkit and Microsoft Foundry agents.",
-      "version": "0.3.8",
+      "version": "0.3.12",
       "keywords": [
         "agentops",
         "evaluation",
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 634f196c..137f6081 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,8 @@ This format follows [Keep a Changelog](https://keepachangelog.com/) and adheres
 
 ## [Unreleased]
 
+## [0.3.12] - 2026-06-09
+
 ### Added
 - **Foundry observability readiness now spans eval, Doctor, Cockpit, and release evidence.**
   `agentops.yaml` supports `dataset_kind`, `rubrics`, and `observability`
diff --git a/plugins/agentops/package.json b/plugins/agentops/package.json
index 3357c6b2..d9962578 100644
--- a/plugins/agentops/package.json
+++ b/plugins/agentops/package.json
@@ -2,7 +2,7 @@
   "name": "agentops-accelerator",
   "displayName": "AgentOps Accelerator — Skills for GitHub Copilot",
   "description": "Copilot agent skills for running standardized evaluation workflows with AgentOps Accelerator and Microsoft Foundry agents.",
-  "version": "0.3.8",
+  "version": "0.3.12",
   "publisher": "AgentOpsAccelerator",
   "icon": "icon.png",
   "license": "MIT",
diff --git a/plugins/agentops/plugin.json b/plugins/agentops/plugin.json
index 4626d03f..95b07344 100644
--- a/plugins/agentops/plugin.json
+++ b/plugins/agentops/plugin.json
@@ -1,7 +1,7 @@
 {
   "name": "agentops-accelerator",
   "description": "Copilot agent skills for running standardized evaluation workflows with AgentOps Accelerator and Microsoft Foundry agents.",
-  "version": "0.3.8",
+  "version": "0.3.12",
   "author": {
     "name": "AgentOps Accelerator",
     "url": "https://github.com/Azure/agentops"