From 8d64281b84ca9b76b2e05f7fe860f0787f1fa5e3 Mon Sep 17 00:00:00 2001
From: Edgecaser <EdgeCaser@users.noreply.github.com>
Date: Thu, 2 Apr 2026 10:28:35 -0700
Subject: [PATCH] Add Shipwright v2 benchmark harness

---
 benchmarks/README.md                          |  69 +++
 benchmarks/baselines/README.md                |   1 +
 .../board-update-ambiguity/final-pass.md      | 105 +++++
 .../board-update-ambiguity/first-pass.md      | 101 +++++
 .../churn-conflicting-signals/final-pass.md   |  92 ++++
 .../churn-conflicting-signals/first-pass.md   |  91 ++++
 .../related/strategy.json                     |  66 +++
 .../feature-weak-evidence/final-pass.md       |  79 ++++
 .../feature-weak-evidence/first-pass.md       |  78 ++++
 .../handoff-contradiction/final-pass.md       | 100 ++++
 .../handoff-contradiction/first-pass.md       |  91 ++++
 .../related/challenge-report.json             |  55 +++
 .../related/strategy.json                     |  66 +++
 .../prd-hidden-scope-creep/final-pass.md      | 100 ++++
 .../prd-hidden-scope-creep/first-pass.md      |  89 ++++
 .../related/challenge-report.json             |  58 +++
 .../pricing-partial-data/final-pass.md        |  99 ++++
 .../pricing-partial-data/first-pass.md        |  89 ++++
 benchmarks/results/README.md                  |   3 +
 .../scenarios/board-update-ambiguity.json     |  35 ++
 .../scenarios/churn-conflicting-signals.json  |  37 ++
 .../scenarios/feature-weak-evidence.json      |  35 ++
 .../scenarios/handoff-contradiction.json      |  38 ++
 .../scenarios/prd-hidden-scope-creep.json     |  37 ++
 .../scenarios/pricing-partial-data.json       |  35 ++
 scripts/run-benchmarks.mjs                    | 426 ++++++++++++++++++
 tests/run-benchmarks.test.mjs                 | 355 +++++++++++++++
 27 files changed, 2430 insertions(+)
 create mode 100644 benchmarks/README.md
 create mode 100644 benchmarks/baselines/README.md
 create mode 100644 benchmarks/fixtures/board-update-ambiguity/final-pass.md
 create mode 100644 benchmarks/fixtures/board-update-ambiguity/first-pass.md
 create mode 100644 benchmarks/fixtures/churn-conflicting-signals/final-pass.md
 create mode 100644 benchmarks/fixtures/churn-conflicting-signals/first-pass.md
 create mode 100644 benchmarks/fixtures/churn-conflicting-signals/related/strategy.json
 create mode 100644 benchmarks/fixtures/feature-weak-evidence/final-pass.md
 create mode 100644 benchmarks/fixtures/feature-weak-evidence/first-pass.md
 create mode 100644 benchmarks/fixtures/handoff-contradiction/final-pass.md
 create mode 100644 benchmarks/fixtures/handoff-contradiction/first-pass.md
 create mode 100644 benchmarks/fixtures/handoff-contradiction/related/challenge-report.json
 create mode 100644 benchmarks/fixtures/handoff-contradiction/related/strategy.json
 create mode 100644 benchmarks/fixtures/prd-hidden-scope-creep/final-pass.md
 create mode 100644 benchmarks/fixtures/prd-hidden-scope-creep/first-pass.md
 create mode 100644 benchmarks/fixtures/prd-hidden-scope-creep/related/challenge-report.json
 create mode 100644 benchmarks/fixtures/pricing-partial-data/final-pass.md
 create mode 100644 benchmarks/fixtures/pricing-partial-data/first-pass.md
 create mode 100644 benchmarks/results/README.md
 create mode 100644 benchmarks/scenarios/board-update-ambiguity.json
 create mode 100644 benchmarks/scenarios/churn-conflicting-signals.json
 create mode 100644 benchmarks/scenarios/feature-weak-evidence.json
 create mode 100644 benchmarks/scenarios/handoff-contradiction.json
 create mode 100644 benchmarks/scenarios/prd-hidden-scope-creep.json
 create mode 100644 benchmarks/scenarios/pricing-partial-data.json
 create mode 100644 scripts/run-benchmarks.mjs
 create mode 100644 tests/run-benchmarks.test.mjs

diff --git a/benchmarks/README.md b/benchmarks/README.md
new file mode 100644
index 0000000..2079b85
--- /dev/null
+++ b/benchmarks/README.md
@@ -0,0 +1,69 @@
+# Benchmarks
+
+Shipwright v2 benchmarks are fixture-based and deterministic by design.
+
+Each scenario lives in `benchmarks/scenarios/` and points, via paths relative to the scenario file, to:
+
+- an initial markdown artifact fixture
+- a final markdown artifact fixture
+- optional related structured artifacts for contradiction and challenge checks
+- optional blind review inputs
+
+The harness in `scripts/run-benchmarks.mjs` validates those fixtures with the existing runtime validator and emits results in the shape required by `docs/shipwright-v2-benchmark-scoring-spec.md`.
+
+## Fixture Rules
+
+- Markdown artifacts remain canonical for human review.
+- Structured payloads embedded in `<!-- shipwright:artifact ... -->` HTML comments are extracted and validated.
+- Related artifacts may be raw `.json` sidecars or markdown artifacts with an embedded structured payload.
+- `blind_review` may be `null` until human review has been run.
+
+## Default Run
+
+```bash
+node scripts/run-benchmarks.mjs
+node scripts/run-benchmarks.mjs --format json
+node scripts/run-benchmarks.mjs --out benchmarks/results/latest.json --format json
+```
+
+## Scenario Shape
+
+```json
+{
+  "id": "prd-hidden-scope-creep",
+  "title": "PRD with hidden scope creep",
+  "inputs": {
+    "prompt": "Write a PRD for ...",
+    "context_files": [],
+    "expected_artifact_type": "prd",
+    "scoring_spec_ref": "docs/shipwright-v2-benchmark-scoring-spec.md"
+  },
+  "validator": {
+    "expect_sections": [
+      "Decision Frame",
+      "Unknowns & Evidence Gaps",
+      "Pass/Fail Readiness",
+      "Recommended Next Artifact"
+    ],
+    "expect_structured": true
+  },
+  "fixtures": {
+    "first_pass_artifact": "../fixtures/prd-hidden-scope-creep/first-pass.md",
+    "final_pass_artifact": "../fixtures/prd-hidden-scope-creep/final-pass.md",
+    "related_artifacts": [
+      "../fixtures/prd-hidden-scope-creep/related/challenge-report.json"
+    ],
+    "blind_review": null
+  },
+  "run_metadata": {
+    "time_to_first_usable_artifact_seconds": 420,
+    "revision_count": 1
+  },
+  "measures": [
+    "time_to_first_usable_artifact",
+    "revision_count",
+    "contradiction_count",
+    "blind_human_rating"
+  ]
+}
+```
diff --git a/benchmarks/baselines/README.md b/benchmarks/baselines/README.md
new file mode 100644
index 0000000..7d0b89b
--- /dev/null
+++ b/benchmarks/baselines/README.md
@@ -0,0 +1 @@
+Baseline benchmark outputs should be written here once the frozen v2 contract has been run against the comparison system.
diff --git a/benchmarks/fixtures/board-update-ambiguity/final-pass.md b/benchmarks/fixtures/board-update-ambiguity/final-pass.md
new file mode 100644
index 0000000..02fbe89
--- /dev/null
+++ b/benchmarks/fixtures/board-update-ambiguity/final-pass.md
@@ -0,0 +1,105 @@
+# Strategy: Board Update Under Ambiguity
+
+## Decision Frame
+
+Recommendation: keep the current investment plan while evidence is still incomplete.
+
+## Unknowns & Evidence Gaps
+
+- Demand quality is improving, but not yet uniform across segments.
+
+## Pass/Fail Readiness
+
+PASS because the board-facing recommendation is explicit and bounded.
+
+## Recommended Next Artifact
+
+- Executive briefing with the same recommendation and evidence trail.
+
+The board draft says expansion pipeline coverage is 42% above plan and should reach 120% next quarter.[1]
+
+## Sources
+
+- [1] Board pipeline pack, Q2 planning draft.
+
+<!-- shipwright:artifact
+{
+  "schema_version": "2.0.0",
+  "artifact_type": "strategy",
+  "mode": "rigorous",
+  "depth": "light",
+  "metadata": {
+    "title": "Strategy: Board Update Under Ambiguity",
+    "status": "approved",
+    "authors": ["Shipwright"],
+    "updated_at": "2026-04-02"
+  },
+  "decision_frame": {
+    "recommendation": "Keep the current investment plan while evidence remains mixed.",
+    "tradeoff": "Preserves strategic consistency, but delays a sharper board narrative.",
+    "confidence": "medium",
+    "owner": "PM",
+    "decision_date": "2026-04-02"
+  },
+  "unknowns": [
+    "Segment demand quality is not yet stable enough for a sharper commitment."
+  ],
+  "pass_fail_readiness": {
+    "status": "PASS",
+    "reason": "The recommendation is explicit, bounded, and evidence-linked."
+  },
+  "evidence": [
+    {
+      "evidence_id": "ev-board-1",
+      "kind": "document",
+      "source_ref": "quarterly-board-pack",
+      "confidence": "medium",
+      "supports": [
+        "decision_frame.recommendation",
+        "bet-hold-line"
+      ]
+    }
+  ],
+  "payload": {
+    "vision": "Preserve credibility with the board while ambiguity remains.",
+    "context": {
+      "current_state": {
+        "product_stage": "growth"
+      }
+    },
+    "primary_segment": "existing enterprise accounts",
+    "bets": [
+      {
+        "bet_id": "bet-hold-line",
+        "name": "Hold current investment line",
+        "thesis": "A steady investment plan is more defensible than a reactive pivot under mixed evidence.",
+        "assumptions": [
+          "Current demand is directionally positive but not yet conclusive."
+        ],
+        "investment_level": "moderate",
+        "success_metric": {
+          "metric_id": "metric-expansion",
+          "name": "expansion pipeline coverage",
+          "segment": "existing enterprise accounts",
+          "unit": "percent",
+          "timeframe": "quarterly",
+          "baseline": 105,
+          "target": 120
+        },
+        "kill_criteria": "If two consecutive quarters miss coverage, revisit the investment plan.",
+        "evidence_ids": ["ev-board-1"]
+      }
+    ],
+    "boundaries": {
+      "not_doing": [
+        "Narrative pivot before demand signal stabilizes"
+      ]
+    },
+    "review_cadence": {
+      "weekly": "Monitor inbound signal quality.",
+      "monthly": "Review board narrative assumptions.",
+      "quarterly": "Revisit investment posture before the next board cycle."
+    }
+  }
+}
+-->
diff --git a/benchmarks/fixtures/board-update-ambiguity/first-pass.md b/benchmarks/fixtures/board-update-ambiguity/first-pass.md
new file mode 100644
index 0000000..917b700
--- /dev/null
+++ b/benchmarks/fixtures/board-update-ambiguity/first-pass.md
@@ -0,0 +1,101 @@
+# Strategy: Board Update Under Ambiguity
+
+## Decision Frame
+
+Recommendation: keep the current investment plan while evidence is still incomplete.
+
+## Unknowns & Evidence Gaps
+
+- Demand quality is improving, but not yet uniform across segments.
+
+## Pass/Fail Readiness
+
+PASS because the board-facing recommendation is explicit and bounded.
+
+## Recommended Next Artifact
+
+- Executive briefing with the same recommendation and evidence trail.
+
+The board draft says expansion pipeline coverage is 42% above plan and should reach 120% next quarter, but the supporting source is not yet cited.
+
+<!-- shipwright:artifact
+{
+  "schema_version": "2.0.0",
+  "artifact_type": "strategy",
+  "mode": "rigorous",
+  "depth": "light",
+  "metadata": {
+    "title": "Strategy: Board Update Under Ambiguity",
+    "status": "approved",
+    "authors": ["Shipwright"],
+    "updated_at": "2026-04-02"
+  },
+  "decision_frame": {
+    "recommendation": "Keep the current investment plan while evidence remains mixed.",
+    "tradeoff": "Preserves strategic consistency, but delays a sharper board narrative.",
+    "confidence": "medium",
+    "owner": "PM",
+    "decision_date": "2026-04-02"
+  },
+  "unknowns": [
+    "Segment demand quality is not yet stable enough for a sharper commitment."
+  ],
+  "pass_fail_readiness": {
+    "status": "PASS",
+    "reason": "The recommendation is explicit, bounded, and evidence-linked."
+  },
+  "evidence": [
+    {
+      "evidence_id": "ev-board-1",
+      "kind": "document",
+      "source_ref": "quarterly-board-pack",
+      "confidence": "medium",
+      "supports": [
+        "decision_frame.recommendation",
+        "bet-hold-line"
+      ]
+    }
+  ],
+  "payload": {
+    "vision": "Preserve credibility with the board while ambiguity remains.",
+    "context": {
+      "current_state": {
+        "product_stage": "growth"
+      }
+    },
+    "primary_segment": "existing enterprise accounts",
+    "bets": [
+      {
+        "bet_id": "bet-hold-line",
+        "name": "Hold current investment line",
+        "thesis": "A steady investment plan is more defensible than a reactive pivot under mixed evidence.",
+        "assumptions": [
+          "Current demand is directionally positive but not yet conclusive."
+        ],
+        "investment_level": "moderate",
+        "success_metric": {
+          "metric_id": "metric-expansion",
+          "name": "expansion pipeline coverage",
+          "segment": "existing enterprise accounts",
+          "unit": "percent",
+          "timeframe": "quarterly",
+          "baseline": 105,
+          "target": 120
+        },
+        "kill_criteria": "If two consecutive quarters miss coverage, revisit the investment plan.",
+        "evidence_ids": ["ev-board-1"]
+      }
+    ],
+    "boundaries": {
+      "not_doing": [
+        "Narrative pivot before demand signal stabilizes"
+      ]
+    },
+    "review_cadence": {
+      "weekly": "Monitor inbound signal quality.",
+      "monthly": "Review board narrative assumptions.",
+      "quarterly": "Revisit investment posture before the next board cycle."
+    }
+  }
+}
+-->
diff --git a/benchmarks/fixtures/churn-conflicting-signals/final-pass.md b/benchmarks/fixtures/churn-conflicting-signals/final-pass.md
new file mode 100644
index 0000000..e320209
--- /dev/null
+++ b/benchmarks/fixtures/churn-conflicting-signals/final-pass.md
@@ -0,0 +1,92 @@
+# PRD: Churn Rescue Pilot
+
+## Decision Frame
+
+Recommendation: run a churn rescue pilot for high-risk accounts.
+
+## Unknowns & Evidence Gaps
+
+- We still need to isolate the highest-signal intervention.
+
+## Pass/Fail Readiness
+
+PASS because the pilot can be run without a broader strategy rewrite.
+
+## Recommended Next Artifact
+
+- Sprint plan for the rescue pilot instrumentation.
+
+<!-- shipwright:artifact
+{
+  "schema_version": "2.0.0",
+  "artifact_type": "prd",
+  "mode": "rigorous",
+  "depth": "standard",
+  "metadata": {
+    "title": "PRD: Churn Rescue Pilot",
+    "status": "approved",
+    "authors": ["Shipwright"],
+    "updated_at": "2026-04-02"
+  },
+  "decision_frame": {
+    "recommendation": "Run a churn rescue pilot for high-risk accounts.",
+    "tradeoff": "Focused intervention now, but metric targets diverge from top-level strategy because the pilot targets a narrower segment.",
+    "confidence": "high",
+    "owner": "PM",
+    "decision_date": "2026-04-02"
+  },
+  "unknowns": [
+    "Which intervention has the highest near-term retention effect."
+  ],
+  "pass_fail_readiness": {
+    "status": "PASS",
+    "reason": "The pilot is scoped well enough to execute."
+  },
+  "evidence": [
+    {
+      "evidence_id": "ev-churn-1",
+      "kind": "metric",
+      "source_ref": "retention-cohort-analysis",
+      "confidence": "high",
+      "supports": [
+        "decision_frame.recommendation",
+        "problem-churn",
+        "metric-retention"
+      ]
+    }
+  ],
+  "payload": {
+    "problem_statement": {
+      "problem_id": "problem-churn",
+      "text": "High-risk accounts churn before the success team can intervene."
+    },
+    "customer_evidence_ids": ["ev-churn-1"],
+    "success_metrics": [
+      {
+        "metric_id": "metric-retention",
+        "name": "gross retention improvement",
+        "segment": "high-risk accounts",
+        "unit": "percent",
+        "timeframe": "90 days",
+        "baseline": 1,
+        "target": 12,
+        "evidence_ids": ["ev-churn-1"],
+        "explanation": "Supersedes the broad strategy target because this pilot only covers the highest-risk segment."
+      }
+    ],
+    "scope": {
+      "in": [
+        "Risk scoring refresh",
+        "Success-team intervention workflow"
+      ],
+      "out": [
+        "Company-wide pricing changes"
+      ]
+    },
+    "open_questions": [
+      "Which intervention mix should the pilot test first?"
+    ],
+    "target_segment": "high-risk accounts"
+  }
+}
+-->
diff --git a/benchmarks/fixtures/churn-conflicting-signals/first-pass.md b/benchmarks/fixtures/churn-conflicting-signals/first-pass.md
new file mode 100644
index 0000000..0690fbb
--- /dev/null
+++ b/benchmarks/fixtures/churn-conflicting-signals/first-pass.md
@@ -0,0 +1,91 @@
+# PRD: Churn Rescue Pilot
+
+## Decision Frame
+
+Recommendation: run a churn rescue pilot for high-risk accounts.
+
+## Unknowns & Evidence Gaps
+
+- We still need to isolate the highest-signal intervention.
+
+## Pass/Fail Readiness
+
+PASS because the pilot can be run without a broader strategy rewrite.
+
+## Recommended Next Artifact
+
+- Sprint plan for the rescue pilot instrumentation.
+
+<!-- shipwright:artifact
+{
+  "schema_version": "2.0.0",
+  "artifact_type": "prd",
+  "mode": "rigorous",
+  "depth": "standard",
+  "metadata": {
+    "title": "PRD: Churn Rescue Pilot",
+    "status": "approved",
+    "authors": ["Shipwright"],
+    "updated_at": "2026-04-02"
+  },
+  "decision_frame": {
+    "recommendation": "Run a churn rescue pilot for high-risk accounts.",
+    "tradeoff": "Focused intervention now, but metric targets may diverge from top-level strategy.",
+    "confidence": "medium",
+    "owner": "PM",
+    "decision_date": "2026-04-02"
+  },
+  "unknowns": [
+    "Which intervention has the highest near-term retention effect."
+  ],
+  "pass_fail_readiness": {
+    "status": "PASS",
+    "reason": "The pilot is scoped well enough to execute."
+  },
+  "evidence": [
+    {
+      "evidence_id": "ev-churn-1",
+      "kind": "metric",
+      "source_ref": "retention-cohort-analysis",
+      "confidence": "high",
+      "supports": [
+        "decision_frame.recommendation",
+        "problem-churn",
+        "metric-retention"
+      ]
+    }
+  ],
+  "payload": {
+    "problem_statement": {
+      "problem_id": "problem-churn",
+      "text": "High-risk accounts churn before the success team can intervene."
+    },
+    "customer_evidence_ids": ["ev-churn-1"],
+    "success_metrics": [
+      {
+        "metric_id": "metric-retention",
+        "name": "gross retention improvement",
+        "segment": "high-risk accounts",
+        "unit": "percent",
+        "timeframe": "90 days",
+        "baseline": 1,
+        "target": 12,
+        "evidence_ids": ["ev-churn-1"]
+      }
+    ],
+    "scope": {
+      "in": [
+        "Risk scoring refresh",
+        "Success-team intervention workflow"
+      ],
+      "out": [
+        "Company-wide pricing changes"
+      ]
+    },
+    "open_questions": [
+      "Which intervention mix should the pilot test first?"
+    ],
+    "target_segment": "high-risk accounts"
+  }
+}
+-->
diff --git a/benchmarks/fixtures/churn-conflicting-signals/related/strategy.json b/benchmarks/fixtures/churn-conflicting-signals/related/strategy.json
new file mode 100644
index 0000000..08a9ac0
--- /dev/null
+++ b/benchmarks/fixtures/churn-conflicting-signals/related/strategy.json
@@ -0,0 +1,66 @@
+{
+  "schema_version": "2.0.0",
+  "artifact_type": "strategy",
+  "mode": "rigorous",
+  "depth": "standard",
+  "metadata": {
+    "title": "Strategy: Churn Stabilization",
+    "status": "approved",
+    "authors": ["Shipwright"],
+    "updated_at": "2026-04-02"
+  },
+  "decision_frame": {
+    "recommendation": "Improve retention across the existing base.",
+    "tradeoff": "Broader coverage, slower local optimization.",
+    "confidence": "medium",
+    "owner": "PM",
+    "decision_date": "2026-04-02"
+  },
+  "unknowns": [],
+  "pass_fail_readiness": {
+    "status": "PASS",
+    "reason": "Strategy direction is approved."
+  },
+  "evidence": [],
+  "payload": {
+    "vision": "Reduce churn without over-rotating the roadmap.",
+    "context": {
+      "current_state": {
+        "product_stage": "growth"
+      }
+    },
+    "primary_segment": "high-risk accounts",
+    "bets": [
+      {
+        "bet_id": "bet-retention",
+        "name": "Retention stabilization",
+        "thesis": "A broad retention program should improve overall results.",
+        "assumptions": [
+          "The broad base can absorb a shared intervention plan."
+        ],
+        "investment_level": "moderate",
+        "success_metric": {
+          "metric_id": "metric-retention",
+          "name": "gross retention improvement",
+          "segment": "high-risk accounts",
+          "unit": "percent",
+          "timeframe": "90 days",
+          "baseline": 1,
+          "target": 8
+        },
+        "kill_criteria": "If retention does not improve, revisit the intervention model.",
+        "evidence_ids": []
+      }
+    ],
+    "boundaries": {
+      "not_doing": [
+        "Immediate pricing changes"
+      ]
+    },
+    "review_cadence": {
+      "weekly": "Monitor retention movement.",
+      "monthly": "Review intervention outcomes.",
+      "quarterly": "Revisit strategy target."
+    }
+  }
+}
diff --git a/benchmarks/fixtures/feature-weak-evidence/final-pass.md b/benchmarks/fixtures/feature-weak-evidence/final-pass.md
new file mode 100644
index 0000000..dfd8aac
--- /dev/null
+++ b/benchmarks/fixtures/feature-weak-evidence/final-pass.md
@@ -0,0 +1,79 @@
+# PRD: Insight Feed
+
+## Decision Frame
+
+Recommendation: build an insight feed for account managers.
+
+## Unknowns & Evidence Gaps
+
+- User demand is still mostly anecdotal.
+- We still lack workflow evidence from live interviews.
+
+## Pass/Fail Readiness
+
+FAIL because the evidence base is still weak.
+
+## Recommended Next Artifact
+
+- Discovery interview prep for account managers.
+
+<!-- shipwright:artifact
+{
+  "schema_version": "2.0.0",
+  "artifact_type": "prd",
+  "mode": "fast",
+  "depth": "light",
+  "metadata": {
+    "title": "PRD: Insight Feed",
+    "status": "draft",
+    "authors": ["Shipwright"],
+    "updated_at": "2026-04-02"
+  },
+  "decision_frame": {
+    "recommendation": "Build an insight feed for account managers.",
+    "tradeoff": "Potentially helpful workflow support, but evidence is still weak.",
+    "confidence": "low",
+    "owner": "PM",
+    "decision_date": "2026-04-02"
+  },
+  "unknowns": [
+    "Which workflow problem matters most to account managers."
+  ],
+  "pass_fail_readiness": {
+    "status": "FAIL",
+    "reason": "The artifact is still not evidence-backed."
+  },
+  "evidence": [],
+  "payload": {
+    "problem_statement": {
+      "problem_id": "problem-insight-feed",
+      "text": "Account managers do not have a single place to review customer signals."
+    },
+    "customer_evidence_ids": [],
+    "success_metrics": [
+      {
+        "metric_id": "metric-dau",
+        "name": "weekly active account managers",
+        "segment": "account managers",
+        "unit": "percent",
+        "timeframe": "30 days",
+        "baseline": 0,
+        "target": 35,
+        "evidence_ids": []
+      }
+    ],
+    "scope": {
+      "in": [
+        "Signal feed prototype"
+      ],
+      "out": [
+        "Full account planning workflow"
+      ]
+    },
+    "open_questions": [
+      "Which signals should appear in the first prototype?"
+    ],
+    "target_segment": "account managers"
+  }
+}
+-->
diff --git a/benchmarks/fixtures/feature-weak-evidence/first-pass.md b/benchmarks/fixtures/feature-weak-evidence/first-pass.md
new file mode 100644
index 0000000..507c3a8
--- /dev/null
+++ b/benchmarks/fixtures/feature-weak-evidence/first-pass.md
@@ -0,0 +1,78 @@
+# PRD: Insight Feed
+
+## Decision Frame
+
+Recommendation: build an insight feed for account managers.
+
+## Unknowns & Evidence Gaps
+
+- User demand is still mostly anecdotal.
+
+## Pass/Fail Readiness
+
+FAIL because the evidence base is still weak.
+
+## Recommended Next Artifact
+
+- Discovery interview prep for account managers.
+
+<!-- shipwright:artifact
+{
+  "schema_version": "2.0.0",
+  "artifact_type": "prd",
+  "mode": "fast",
+  "depth": "light",
+  "metadata": {
+    "title": "PRD: Insight Feed",
+    "status": "draft",
+    "authors": ["Shipwright"],
+    "updated_at": "2026-04-02"
+  },
+  "decision_frame": {
+    "recommendation": "Build an insight feed for account managers.",
+    "tradeoff": "Potentially helpful workflow support, but evidence is weak.",
+    "confidence": "low",
+    "owner": "PM",
+    "decision_date": "2026-04-02"
+  },
+  "unknowns": [
+    "Which workflow problem matters most to account managers."
+  ],
+  "pass_fail_readiness": {
+    "status": "FAIL",
+    "reason": "The artifact is not yet evidence-backed."
+  },
+  "evidence": [],
+  "payload": {
+    "problem_statement": {
+      "problem_id": "problem-insight-feed",
+      "text": "Account managers do not have a single place to review customer signals."
+    },
+    "customer_evidence_ids": [],
+    "success_metrics": [
+      {
+        "metric_id": "metric-dau",
+        "name": "weekly active account managers",
+        "segment": "account managers",
+        "unit": "percent",
+        "timeframe": "30 days",
+        "baseline": 0,
+        "target": 35,
+        "evidence_ids": []
+      }
+    ],
+    "scope": {
+      "in": [
+        "Signal feed prototype"
+      ],
+      "out": [
+        "Full account planning workflow"
+      ]
+    },
+    "open_questions": [
+      "Which signals should appear in the first prototype?"
+    ],
+    "target_segment": "account managers"
+  }
+}
+-->
diff --git a/benchmarks/fixtures/handoff-contradiction/final-pass.md b/benchmarks/fixtures/handoff-contradiction/final-pass.md
new file mode 100644
index 0000000..2997bde
--- /dev/null
+++ b/benchmarks/fixtures/handoff-contradiction/final-pass.md
@@ -0,0 +1,100 @@
+# PRD: Platform Handoff Alignment
+
+## Decision Frame
+
+Recommendation: ship the first platform handoff artifact for enterprise success teams.
+
+## Unknowns & Evidence Gaps
+
+- Observability coverage is still narrower than ideal.
+
+## Pass/Fail Readiness
+
+PASS because the handoff is now aligned to strategy and challenge resolution is explicit.
+
+## Recommended Next Artifact
+
+- Technical spec for the platform event stream.
+
+<!-- shipwright:artifact
+{
+  "schema_version": "2.0.0",
+  "artifact_type": "prd",
+  "mode": "rigorous",
+  "depth": "standard",
+  "metadata": {
+    "title": "PRD: Platform Handoff Alignment",
+    "status": "approved",
+    "authors": ["Shipwright"],
+    "updated_at": "2026-04-02"
+  },
+  "decision_frame": {
+    "recommendation": "Ship the first platform handoff artifact for enterprise success teams.",
+    "tradeoff": "Aligns to platform strategy now, but delays SMB-specific reuse until later.",
+    "confidence": "high",
+    "owner": "PM",
+    "decision_date": "2026-04-02"
+  },
+  "unknowns": [
+    "Observability is not yet complete for every handoff branch."
+  ],
+  "pass_fail_readiness": {
+    "status": "PASS",
+    "reason": "The handoff is aligned to strategy and challenge handling is explicit."
+  },
+  "evidence": [
+    {
+      "evidence_id": "ev-handoff-1",
+      "kind": "document",
+      "source_ref": "platform-handoff-design",
+      "confidence": "medium",
+      "supports": [
+        "decision_frame.recommendation",
+        "problem-platform-handoff",
+        "metric-platform-success"
+      ]
+    }
+  ],
+  "challenge_resolution": [
+    {
+      "finding_id": "finding-observability-gap",
+      "state": "waived",
+      "note": "Existing platform metrics are sufficient for the first enterprise-success rollout.",
+      "waiver_reason": "The remaining coverage gap is low-risk for the initial release.",
+      "owner": "PM"
+    }
+  ],
+  "payload": {
+    "problem_statement": {
+      "problem_id": "problem-platform-handoff",
+      "text": "Platform teams lack a shared handoff contract for workflow events."
+    },
+    "customer_evidence_ids": ["ev-handoff-1"],
+    "success_metrics": [
+      {
+        "metric_id": "metric-platform-success",
+        "name": "handoff success rate",
+        "segment": "enterprise success teams",
+        "unit": "percent",
+        "timeframe": "30 days",
+        "baseline": 71,
+        "target": 85,
+        "evidence_ids": ["ev-handoff-1"]
+      }
+    ],
+    "scope": {
+      "in": [
+        "Shared event contract",
+        "Fallback retry policy"
+      ],
+      "out": [
+        "Admin migration tooling"
+      ]
+    },
+    "open_questions": [
+      "Which fallback path should get the first operational dashboard?"
+    ],
+    "target_segment": "enterprise success teams"
+  }
+}
+-->
diff --git a/benchmarks/fixtures/handoff-contradiction/first-pass.md b/benchmarks/fixtures/handoff-contradiction/first-pass.md
new file mode 100644
index 0000000..81f1477
--- /dev/null
+++ b/benchmarks/fixtures/handoff-contradiction/first-pass.md
@@ -0,0 +1,91 @@
+# PRD: Platform Handoff Alignment
+
+## Decision Frame
+
+Recommendation: ship the first platform handoff artifact for self-serve SMB teams.
+
+## Unknowns & Evidence Gaps
+
+- Observability coverage is still narrower than ideal.
+
+## Pass/Fail Readiness
+
+PASS because the handoff can proceed while follow-up risk remains explicit.
+
+## Recommended Next Artifact
+
+- Technical spec for the platform event stream.
+
+<!-- shipwright:artifact
+{
+  "schema_version": "2.0.0",
+  "artifact_type": "prd",
+  "mode": "rigorous",
+  "depth": "standard",
+  "metadata": {
+    "title": "PRD: Platform Handoff Alignment",
+    "status": "approved",
+    "authors": ["Shipwright"],
+    "updated_at": "2026-04-02"
+  },
+  "decision_frame": {
+    "recommendation": "Ship the first platform handoff artifact for self-serve SMB teams.",
+    "tradeoff": "Moves faster on a smaller segment, but risks drifting from platform strategy and challenge findings.",
+    "confidence": "medium",
+    "owner": "PM",
+    "decision_date": "2026-04-02"
+  },
+  "unknowns": [
+    "Observability is not yet complete for every handoff branch."
+  ],
+  "pass_fail_readiness": {
+    "status": "PASS",
+    "reason": "The handoff is usable even though a few non-blocking contradictions remain."
+  },
+  "evidence": [
+    {
+      "evidence_id": "ev-handoff-1",
+      "kind": "document",
+      "source_ref": "platform-handoff-design",
+      "confidence": "medium",
+      "supports": [
+        "decision_frame.recommendation",
+        "problem-platform-handoff",
+        "metric-platform-success"
+      ]
+    }
+  ],
+  "payload": {
+    "problem_statement": {
+      "problem_id": "problem-platform-handoff",
+      "text": "Platform teams lack a shared handoff contract for workflow events."
+    },
+    "customer_evidence_ids": ["ev-handoff-1"],
+    "success_metrics": [
+      {
+        "metric_id": "metric-platform-success",
+        "name": "handoff success rate",
+        "segment": "smb self-serve",
+        "unit": "percent",
+        "timeframe": "30 days",
+        "baseline": 71,
+        "target": 85,
+        "evidence_ids": ["ev-handoff-1"]
+      }
+    ],
+    "scope": {
+      "in": [
+        "Shared event contract",
+        "Fallback retry policy"
+      ],
+      "out": [
+        "Admin migration tooling"
+      ]
+    },
+    "open_questions": [
+      "Which fallback path should get the first operational dashboard?"
+    ],
+    "target_segment": "smb self-serve"
+  }
+}
+-->
diff --git a/benchmarks/fixtures/handoff-contradiction/related/challenge-report.json b/benchmarks/fixtures/handoff-contradiction/related/challenge-report.json
new file mode 100644
index 0000000..93076c1
--- /dev/null
+++ b/benchmarks/fixtures/handoff-contradiction/related/challenge-report.json
@@ -0,0 +1,55 @@
+{
+  "schema_version": "2.0.0",
+  "artifact_type": "challenge-report",
+  "mode": "rigorous",
+  "depth": "standard",
+  "metadata": {
+    "title": "Challenge: Platform Handoff Alignment",
+    "status": "approved",
+    "authors": ["Shipwright"],
+    "updated_at": "2026-04-02"
+  },
+  "decision_frame": {
+    "recommendation": "Address observability risk before broad rollout.",
+    "tradeoff": "Slightly slower execution, lower operational surprise.",
+    "confidence": "medium",
+    "owner": "PM",
+    "decision_date": "2026-04-02"
+  },
+  "unknowns": [],
+  "pass_fail_readiness": {
+    "status": "FAIL",
+    "reason": "Observability risk still needs an explicit disposition."
+  },
+  "evidence": [
+    {
+      "evidence_id": "ev-handoff-challenge-1",
+      "kind": "document",
+      "source_ref": "platform-observability-review",
+      "confidence": "medium",
+      "supports": ["finding-observability-gap"]
+    }
+  ],
+  "payload": {
+    "reviewed_artifact": {
+      "title": "PRD: Platform Handoff Alignment",
+      "artifact_type": "prd"
+    },
+    "depth": "standard",
+    "findings": [
+      {
+        "finding_id": "finding-observability-gap",
+        "claim": "The rollout depends on observability that is not yet complete.",
+        "vector": "structural-honesty",
+        "severity": "moderate",
+        "rationale": "The first release can ship, but the monitoring gap should be acknowledged explicitly.",
+        "resolution_condition": "Resolve or waive the observability gap before broad rollout.",
+        "evidence_ids": ["ev-handoff-challenge-1"]
+      }
+    ],
+    "verdict": "Proceed only with explicit risk handling.",
+    "action_plan": [
+      "Either add more observability or document a waiver."
+    ]
+  }
+}
diff --git a/benchmarks/fixtures/handoff-contradiction/related/strategy.json b/benchmarks/fixtures/handoff-contradiction/related/strategy.json
new file mode 100644
index 0000000..4c46783
--- /dev/null
+++ b/benchmarks/fixtures/handoff-contradiction/related/strategy.json
@@ -0,0 +1,66 @@
+{
+  "schema_version": "2.0.0",
+  "artifact_type": "strategy",
+  "mode": "rigorous",
+  "depth": "standard",
+  "metadata": {
+    "title": "Strategy: Platform Expansion",
+    "status": "approved",
+    "authors": ["Shipwright"],
+    "updated_at": "2026-04-02"
+  },
+  "decision_frame": {
+    "recommendation": "Focus platform expansion on enterprise success teams first.",
+    "tradeoff": "Higher implementation effort now, better strategic fit.",
+    "confidence": "high",
+    "owner": "PM",
+    "decision_date": "2026-04-02"
+  },
+  "unknowns": [],
+  "pass_fail_readiness": {
+    "status": "PASS",
+    "reason": "Platform direction is approved."
+  },
+  "evidence": [],
+  "payload": {
+    "vision": "Make platform workflows durable for enterprise operations first.",
+    "context": {
+      "current_state": {
+        "product_stage": "growth"
+      }
+    },
+    "primary_segment": "enterprise success teams",
+    "bets": [
+      {
+        "bet_id": "bet-platform-enterprise",
+        "name": "Enterprise-first handoff",
+        "thesis": "Enterprise success teams create the clearest platform leverage.",
+        "assumptions": [
+          "Enterprise workflows have the strongest operational payoff."
+        ],
+        "investment_level": "major",
+        "success_metric": {
+          "metric_id": "metric-platform-success",
+          "name": "handoff success rate",
+          "segment": "enterprise success teams",
+          "unit": "percent",
+          "timeframe": "30 days",
+          "baseline": 71,
+          "target": 85
+        },
+        "kill_criteria": "If enterprise adoption stalls, revisit the segment sequence.",
+        "evidence_ids": []
+      }
+    ],
+    "boundaries": {
+      "not_doing": [
+        "Broad SMB-first rollout"
+      ]
+    },
+    "review_cadence": {
+      "weekly": "Track engineering progress.",
+      "monthly": "Review rollout readiness.",
+      "quarterly": "Revisit platform sequence."
+    }
+  }
+}
diff --git a/benchmarks/fixtures/prd-hidden-scope-creep/final-pass.md b/benchmarks/fixtures/prd-hidden-scope-creep/final-pass.md
new file mode 100644
index 0000000..94505d8
--- /dev/null
+++ b/benchmarks/fixtures/prd-hidden-scope-creep/final-pass.md
@@ -0,0 +1,100 @@
+# PRD: Team Inbox Workflow Handoff
+
+## Decision Frame
+
+Recommendation: ship the workflow handoff improvement with a limited rollout.
+
+## Unknowns & Evidence Gaps
+
+- Manager routing is explicitly deferred from v1.
+- Escalation ownership still needs a follow-up decision log.
+
+## Pass/Fail Readiness
+
+PASS because the hidden scope is now contained.
+
+## Recommended Next Artifact
+
+- Technical handoff for workflow audit trail delivery.
+
+<!-- shipwright:artifact
+{
+  "schema_version": "2.0.0",
+  "artifact_type": "prd",
+  "mode": "rigorous",
+  "depth": "standard",
+  "metadata": {
+    "title": "PRD: Team Inbox Workflow Handoff",
+    "status": "approved",
+    "authors": ["Shipwright"],
+    "updated_at": "2026-04-02"
+  },
+  "decision_frame": {
+    "recommendation": "Ship a limited workflow handoff release for support teams.",
+    "tradeoff": "Keeps v1 shippable now by explicitly excluding manager routing.",
+    "confidence": "high",
+    "owner": "PM",
+    "decision_date": "2026-04-02"
+  },
+  "unknowns": [
+    "Manager routing remains a separate follow-up initiative."
+  ],
+  "pass_fail_readiness": {
+    "status": "PASS",
+    "reason": "Critical scope finding has been resolved and v1 boundaries are explicit."
+  },
+  "evidence": [
+    {
+      "evidence_id": "ev-prd-1",
+      "kind": "document",
+      "source_ref": "support-workflow-audit",
+      "confidence": "high",
+      "supports": [
+        "decision_frame.recommendation",
+        "problem-handoff",
+        "metric-handoff-rate"
+      ]
+    }
+  ],
+  "challenge_resolution": [
+    {
+      "finding_id": "finding-scope-creep",
+      "state": "resolved",
+      "note": "Manager routing is explicitly marked out of scope for v1."
+    }
+  ],
+  "payload": {
+    "problem_statement": {
+      "problem_id": "problem-handoff",
+      "text": "Support teams cannot complete inbox workflow handoff without manual follow-up."
+    },
+    "customer_evidence_ids": ["ev-prd-1"],
+    "success_metrics": [
+      {
+        "metric_id": "metric-handoff-rate",
+        "name": "workflow handoff completion rate",
+        "segment": "mid-market support teams",
+        "unit": "percent",
+        "timeframe": "30 days",
+        "baseline": 42,
+        "target": 65,
+        "evidence_ids": ["ev-prd-1"]
+      }
+    ],
+    "scope": {
+      "in": [
+        "Agent handoff trigger",
+        "Shared workflow audit trail"
+      ],
+      "out": [
+        "Manager routing controls",
+        "Manager escalation exceptions"
+      ]
+    },
+    "open_questions": [
+      "What owner should pick up manager routing in the follow-up cycle?"
+    ],
+    "target_segment": "mid-market support teams"
+  }
+}
+-->
diff --git a/benchmarks/fixtures/prd-hidden-scope-creep/first-pass.md b/benchmarks/fixtures/prd-hidden-scope-creep/first-pass.md
new file mode 100644
index 0000000..085b4a1
--- /dev/null
+++ b/benchmarks/fixtures/prd-hidden-scope-creep/first-pass.md
@@ -0,0 +1,89 @@
+# PRD: Team Inbox Workflow Handoff
+
+## Decision Frame
+
+Recommendation: ship the workflow handoff improvement with a limited rollout.
+
+## Unknowns & Evidence Gaps
+
+- Manager routing rules are still ambiguous.
+- Engineering ownership of escalation handling is not explicit.
+
+## Pass/Fail Readiness
+
+FAIL until the scope challenge is resolved.
+
+<!-- shipwright:artifact
+{
+  "schema_version": "2.0.0",
+  "artifact_type": "prd",
+  "mode": "rigorous",
+  "depth": "standard",
+  "metadata": {
+    "title": "PRD: Team Inbox Workflow Handoff",
+    "status": "in-review",
+    "authors": ["Shipwright"],
+    "updated_at": "2026-04-02"
+  },
+  "decision_frame": {
+    "recommendation": "Ship a limited workflow handoff release for support teams.",
+    "tradeoff": "Faster operator value now, but unresolved routing scope can spill into engineering work.",
+    "confidence": "medium",
+    "owner": "PM",
+    "decision_date": "2026-04-02"
+  },
+  "unknowns": [
+    "Manager routing behavior is still unclear.",
+    "Escalation ownership is not explicit."
+  ],
+  "pass_fail_readiness": {
+    "status": "FAIL",
+    "reason": "Challenge findings are still unresolved."
+  },
+  "evidence": [
+    {
+      "evidence_id": "ev-prd-1",
+      "kind": "document",
+      "source_ref": "support-workflow-audit",
+      "confidence": "high",
+      "supports": [
+        "decision_frame.recommendation",
+        "problem-handoff",
+        "metric-handoff-rate"
+      ]
+    }
+  ],
+  "payload": {
+    "problem_statement": {
+      "problem_id": "problem-handoff",
+      "text": "Support teams cannot complete inbox workflow handoff without manual follow-up."
+    },
+    "customer_evidence_ids": ["ev-prd-1"],
+    "success_metrics": [
+      {
+        "metric_id": "metric-handoff-rate",
+        "name": "workflow handoff completion rate",
+        "segment": "mid-market support teams",
+        "unit": "percent",
+        "timeframe": "30 days",
+        "baseline": 42,
+        "target": 65,
+        "evidence_ids": ["ev-prd-1"]
+      }
+    ],
+    "scope": {
+      "in": [
+        "Agent handoff trigger",
+        "Shared workflow audit trail"
+      ],
+      "out": [
+        "Manager routing controls"
+      ]
+    },
+    "open_questions": [
+      "How should manager escalation exceptions be handled?"
+    ],
+    "target_segment": "mid-market support teams"
+  }
+}
+-->
diff --git a/benchmarks/fixtures/prd-hidden-scope-creep/related/challenge-report.json b/benchmarks/fixtures/prd-hidden-scope-creep/related/challenge-report.json
new file mode 100644
index 0000000..64f8086
--- /dev/null
+++ b/benchmarks/fixtures/prd-hidden-scope-creep/related/challenge-report.json
@@ -0,0 +1,58 @@
+{
+  "schema_version": "2.0.0",
+  "artifact_type": "challenge-report",
+  "mode": "rigorous",
+  "depth": "standard",
+  "metadata": {
+    "title": "Challenge: Team Inbox Workflow Handoff",
+    "status": "approved",
+    "authors": ["Shipwright"],
+    "updated_at": "2026-04-02"
+  },
+  "decision_frame": {
+    "recommendation": "Do not approve until hidden workflow scope is contained.",
+    "tradeoff": "Delay approval to avoid downstream delivery churn.",
+    "confidence": "high",
+    "owner": "PM",
+    "decision_date": "2026-04-02"
+  },
+  "unknowns": [
+    "Instrumentation ownership is still unclear."
+  ],
+  "pass_fail_readiness": {
+    "status": "FAIL",
+    "reason": "Critical hidden-scope finding is unresolved."
+  },
+  "evidence": [
+    {
+      "evidence_id": "ev-challenge-1",
+      "kind": "document",
+      "source_ref": "design-review-notes",
+      "confidence": "high",
+      "supports": ["finding-scope-creep"]
+    }
+  ],
+  "payload": {
+    "reviewed_artifact": {
+      "title": "PRD: Team Inbox Workflow Handoff",
+      "artifact_type": "prd"
+    },
+    "depth": "standard",
+    "findings": [
+      {
+        "finding_id": "finding-scope-creep",
+        "claim": "The PRD implies manager routing work without naming it in scope boundaries.",
+        "vector": "scope-discipline",
+        "severity": "critical",
+        "rationale": "Hidden workflow work will spill into engineering estimates and rollout planning.",
+        "resolution_condition": "Either remove manager routing from scope or mark it explicitly out of scope with a follow-up owner.",
+        "evidence_ids": ["ev-challenge-1"]
+      }
+    ],
+    "verdict": "Revision required before approval.",
+    "action_plan": [
+      "Clarify scope boundaries.",
+      "Record a resolution state in the revised artifact."
+    ]
+  }
+}
diff --git a/benchmarks/fixtures/pricing-partial-data/final-pass.md b/benchmarks/fixtures/pricing-partial-data/final-pass.md
new file mode 100644
index 0000000..65f0d77
--- /dev/null
+++ b/benchmarks/fixtures/pricing-partial-data/final-pass.md
@@ -0,0 +1,99 @@
+# Strategy: Pricing Reset with Partial Data
+
+## Decision Frame
+
+Recommendation: hold the packaging change until evidence improves.
+
+## Unknowns & Evidence Gaps
+
+- We still need more pricing interviews before approving a rollout decision.
+
+## Pass/Fail Readiness
+
+FAIL because the data is still incomplete for approval.
+
+## Recommended Next Artifact
+
+- Pricing research brief with a fixed interview and survey plan.
+
+<!-- shipwright:artifact
+{
+  "schema_version": "2.0.0",
+  "artifact_type": "strategy",
+  "mode": "rigorous",
+  "depth": "standard",
+  "metadata": {
+    "title": "Strategy: Pricing Reset with Partial Data",
+    "status": "approved",
+    "authors": ["Shipwright"],
+    "updated_at": "2026-04-02"
+  },
+  "decision_frame": {
+    "recommendation": "Hold the packaging change until willingness-to-pay evidence improves.",
+    "tradeoff": "Protects credibility with sales and finance at the cost of slower monetization work.",
+    "confidence": "high",
+    "owner": "PM",
+    "decision_date": "2026-04-02"
+  },
+  "unknowns": [
+    "We still need more interview coverage before revisiting approval."
+  ],
+  "pass_fail_readiness": {
+    "status": "FAIL",
+    "reason": "The recommendation is usable and explicit, but not yet approval-ready."
+  },
+  "evidence": [
+    {
+      "evidence_id": "ev-pricing-1",
+      "kind": "research",
+      "source_ref": "pricing-interview-sample",
+      "confidence": "medium",
+      "supports": [
+        "decision_frame.recommendation",
+        "bet-delay-pricing-change"
+      ]
+    }
+  ],
+  "payload": {
+    "vision": "Reach a packaging decision we can defend externally.",
+    "context": {
+      "current_state": {
+        "product_stage": "growth"
+      }
+    },
+    "primary_segment": "mid-market finance teams",
+    "bets": [
+      {
+        "bet_id": "bet-delay-pricing-change",
+        "name": "Delay packaging change",
+        "thesis": "Waiting for better evidence reduces the odds of a pricing reversal.",
+        "assumptions": [
+          "The current sample is too small to defend a packaging move."
+        ],
+        "investment_level": "moderate",
+        "success_metric": {
+          "metric_id": "metric-win-rate",
+          "name": "pilot close rate",
+          "segment": "mid-market finance teams",
+          "unit": "percent",
+          "timeframe": "quarterly",
+          "baseline": 18,
+          "target": 22
+        },
+        "kill_criteria": "If additional interviews still support waiting, keep the hold decision in place.",
+        "evidence_ids": ["ev-pricing-1"]
+      }
+    ],
+    "boundaries": {
+      "not_doing": [
+        "Immediate pricing rollout"
+      ]
+    },
+    "review_cadence": {
+      "weekly": "Review new interview signal.",
+      "monthly": "Revisit packaging options.",
+      "quarterly": "Reassess go-to-market pricing posture."
+    }
+  }
+}
+-->
diff --git a/benchmarks/fixtures/pricing-partial-data/first-pass.md b/benchmarks/fixtures/pricing-partial-data/first-pass.md
new file mode 100644
index 0000000..bc5d250
--- /dev/null
+++ b/benchmarks/fixtures/pricing-partial-data/first-pass.md
@@ -0,0 +1,89 @@
+# Strategy: Pricing Reset with Partial Data
+
+## Decision Frame
+
+Recommendation: hold the packaging change until evidence improves.
+
+## Unknowns & Evidence Gaps
+
+- We only have partial willingness-to-pay interviews.
+- Competitor packaging was sampled, not exhaustively mapped.
+
+## Pass/Fail Readiness
+
+FAIL because the current recommendation is still under-evidenced.
+
+## Recommended Next Artifact
+
+- Pricing research brief with a tighter interview plan.
+
+<!-- shipwright:artifact
+{
+  "schema_version": "2.0.0",
+  "artifact_type": "strategy",
+  "mode": "rigorous",
+  "depth": "standard",
+  "metadata": {
+    "title": "Strategy: Pricing Reset with Partial Data",
+    "status": "in-review",
+    "authors": ["Shipwright"],
+    "updated_at": "2026-04-02"
+  },
+  "decision_frame": {
+    "recommendation": "Hold the packaging change until willingness-to-pay evidence improves.",
+    "tradeoff": "Slower monetization work now, lower reversal risk later.",
+    "confidence": "medium",
+    "owner": "PM",
+    "decision_date": "2026-04-02"
+  },
+  "unknowns": [
+    "How sensitive mid-market buyers are to seat minimums."
+  ],
+  "pass_fail_readiness": {
+    "status": "FAIL",
+    "reason": "Evidence is still too thin for approval."
+  },
+  "evidence": [],
+  "payload": {
+    "vision": "Reach a packaging decision we can defend externally.",
+    "context": {
+      "current_state": {
+        "product_stage": "growth"
+      }
+    },
+    "primary_segment": "mid-market finance teams",
+    "bets": [
+      {
+        "bet_id": "bet-delay-pricing-change",
+        "name": "Delay packaging change",
+        "thesis": "Waiting for better evidence reduces the odds of a pricing reversal.",
+        "assumptions": [
+          "The current sample is too small to defend a packaging move."
+        ],
+        "investment_level": "moderate",
+        "success_metric": {
+          "metric_id": "metric-win-rate",
+          "name": "pilot close rate",
+          "segment": "mid-market finance teams",
+          "unit": "percent",
+          "timeframe": "quarterly",
+          "baseline": 18,
+          "target": 22
+        },
+        "kill_criteria": "If additional interviews do not change the conclusion, revisit the hold decision.",
+        "evidence_ids": []
+      }
+    ],
+    "boundaries": {
+      "not_doing": [
+        "Immediate pricing rollout"
+      ]
+    },
+    "review_cadence": {
+      "weekly": "Review new interview signal.",
+      "monthly": "Revisit packaging options.",
+      "quarterly": "Reassess go-to-market pricing posture."
+    }
+  }
+}
+-->
diff --git a/benchmarks/results/README.md b/benchmarks/results/README.md
new file mode 100644
index 0000000..fba977f
--- /dev/null
+++ b/benchmarks/results/README.md
@@ -0,0 +1,3 @@
+Generated benchmark summaries can be written here with `node scripts/run-benchmarks.mjs --out benchmarks/results/<name>.json --format json`.
+
+Until blind human review has been run, `mean_first_pass_blind_rating` and `mean_final_pass_blind_rating` will remain `null` in the generated summaries.
diff --git a/benchmarks/scenarios/board-update-ambiguity.json b/benchmarks/scenarios/board-update-ambiguity.json
new file mode 100644
index 0000000..ec77ab7
--- /dev/null
+++ b/benchmarks/scenarios/board-update-ambiguity.json
@@ -0,0 +1,35 @@
+{
+  "id": "board-update-ambiguity",
+  "title": "Board update under ambiguity",
+  "inputs": {
+    "prompt": "Write a strategy update for a board audience under ambiguity.",
+    "context_files": [],
+    "expected_artifact_type": "strategy",
+    "scoring_spec_ref": "docs/shipwright-v2-benchmark-scoring-spec.md"
+  },
+  "validator": {
+    "expect_sections": [
+      "Decision Frame",
+      "Unknowns & Evidence Gaps",
+      "Pass/Fail Readiness",
+      "Recommended Next Artifact"
+    ],
+    "expect_structured": true
+  },
+  "fixtures": {
+    "first_pass_artifact": "../fixtures/board-update-ambiguity/first-pass.md",
+    "final_pass_artifact": "../fixtures/board-update-ambiguity/final-pass.md",
+    "related_artifacts": [],
+    "blind_review": null
+  },
+  "run_metadata": {
+    "time_to_first_usable_artifact_seconds": 180,
+    "revision_count": 0
+  },
+  "measures": [
+    "time_to_first_usable_artifact",
+    "revision_count",
+    "contradiction_count",
+    "blind_human_rating"
+  ]
+}
diff --git a/benchmarks/scenarios/churn-conflicting-signals.json b/benchmarks/scenarios/churn-conflicting-signals.json
new file mode 100644
index 0000000..de180f9
--- /dev/null
+++ b/benchmarks/scenarios/churn-conflicting-signals.json
@@ -0,0 +1,37 @@
+{
+  "id": "churn-conflicting-signals",
+  "title": "Churn diagnosis with conflicting signals",
+  "inputs": {
+    "prompt": "Write a churn reduction PRD when signals conflict with strategy targets.",
+    "context_files": [],
+    "expected_artifact_type": "prd",
+    "scoring_spec_ref": "docs/shipwright-v2-benchmark-scoring-spec.md"
+  },
+  "validator": {
+    "expect_sections": [
+      "Decision Frame",
+      "Unknowns & Evidence Gaps",
+      "Pass/Fail Readiness",
+      "Recommended Next Artifact"
+    ],
+    "expect_structured": true
+  },
+  "fixtures": {
+    "first_pass_artifact": "../fixtures/churn-conflicting-signals/first-pass.md",
+    "final_pass_artifact": "../fixtures/churn-conflicting-signals/final-pass.md",
+    "related_artifacts": [
+      "../fixtures/churn-conflicting-signals/related/strategy.json"
+    ],
+    "blind_review": null
+  },
+  "run_metadata": {
+    "time_to_first_usable_artifact_seconds": 210,
+    "revision_count": 1
+  },
+  "measures": [
+    "time_to_first_usable_artifact",
+    "revision_count",
+    "contradiction_count",
+    "blind_human_rating"
+  ]
+}
diff --git a/benchmarks/scenarios/feature-weak-evidence.json b/benchmarks/scenarios/feature-weak-evidence.json
new file mode 100644
index 0000000..7846bf8
--- /dev/null
+++ b/benchmarks/scenarios/feature-weak-evidence.json
@@ -0,0 +1,35 @@
+{
+  "id": "feature-weak-evidence",
+  "title": "New feature with weak evidence",
+  "inputs": {
+    "prompt": "Draft a PRD for a new feature with weak supporting evidence.",
+    "context_files": [],
+    "expected_artifact_type": "prd",
+    "scoring_spec_ref": "docs/shipwright-v2-benchmark-scoring-spec.md"
+  },
+  "validator": {
+    "expect_sections": [
+      "Decision Frame",
+      "Unknowns & Evidence Gaps",
+      "Pass/Fail Readiness",
+      "Recommended Next Artifact"
+    ],
+    "expect_structured": true
+  },
+  "fixtures": {
+    "first_pass_artifact": "../fixtures/feature-weak-evidence/first-pass.md",
+    "final_pass_artifact": "../fixtures/feature-weak-evidence/final-pass.md",
+    "related_artifacts": [],
+    "blind_review": null
+  },
+  "run_metadata": {
+    "time_to_first_usable_artifact_seconds": null,
+    "revision_count": 2
+  },
+  "measures": [
+    "time_to_first_usable_artifact",
+    "revision_count",
+    "contradiction_count",
+    "blind_human_rating"
+  ]
+}
diff --git a/benchmarks/scenarios/handoff-contradiction.json b/benchmarks/scenarios/handoff-contradiction.json
new file mode 100644
index 0000000..46b5c5f
--- /dev/null
+++ b/benchmarks/scenarios/handoff-contradiction.json
@@ -0,0 +1,38 @@
+{
+  "id": "handoff-contradiction",
+  "title": "Handoff artifact with cross-document contradictions",
+  "inputs": {
+    "prompt": "Write a technical handoff PRD aligned to a platform strategy and challenge review.",
+    "context_files": [],
+    "expected_artifact_type": "prd",
+    "scoring_spec_ref": "docs/shipwright-v2-benchmark-scoring-spec.md"
+  },
+  "validator": {
+    "expect_sections": [
+      "Decision Frame",
+      "Unknowns & Evidence Gaps",
+      "Pass/Fail Readiness",
+      "Recommended Next Artifact"
+    ],
+    "expect_structured": true
+  },
+  "fixtures": {
+    "first_pass_artifact": "../fixtures/handoff-contradiction/first-pass.md",
+    "final_pass_artifact": "../fixtures/handoff-contradiction/final-pass.md",
+    "related_artifacts": [
+      "../fixtures/handoff-contradiction/related/strategy.json",
+      "../fixtures/handoff-contradiction/related/challenge-report.json"
+    ],
+    "blind_review": null
+  },
+  "run_metadata": {
+    "time_to_first_usable_artifact_seconds": 240,
+    "revision_count": 1
+  },
+  "measures": [
+    "time_to_first_usable_artifact",
+    "revision_count",
+    "contradiction_count",
+    "blind_human_rating"
+  ]
+}
diff --git a/benchmarks/scenarios/prd-hidden-scope-creep.json b/benchmarks/scenarios/prd-hidden-scope-creep.json
new file mode 100644
index 0000000..70ee187
--- /dev/null
+++ b/benchmarks/scenarios/prd-hidden-scope-creep.json
@@ -0,0 +1,37 @@
+{
+  "id": "prd-hidden-scope-creep",
+  "title": "PRD with hidden scope creep",
+  "inputs": {
+    "prompt": "Write a PRD for a team inbox workflow handoff improvement.",
+    "context_files": [],
+    "expected_artifact_type": "prd",
+    "scoring_spec_ref": "docs/shipwright-v2-benchmark-scoring-spec.md"
+  },
+  "validator": {
+    "expect_sections": [
+      "Decision Frame",
+      "Unknowns & Evidence Gaps",
+      "Pass/Fail Readiness",
+      "Recommended Next Artifact"
+    ],
+    "expect_structured": true
+  },
+  "fixtures": {
+    "first_pass_artifact": "../fixtures/prd-hidden-scope-creep/first-pass.md",
+    "final_pass_artifact": "../fixtures/prd-hidden-scope-creep/final-pass.md",
+    "related_artifacts": [
+      "../fixtures/prd-hidden-scope-creep/related/challenge-report.json"
+    ],
+    "blind_review": null
+  },
+  "run_metadata": {
+    "time_to_first_usable_artifact_seconds": 420,
+    "revision_count": 1
+  },
+  "measures": [
+    "time_to_first_usable_artifact",
+    "revision_count",
+    "contradiction_count",
+    "blind_human_rating"
+  ]
+}
diff --git a/benchmarks/scenarios/pricing-partial-data.json b/benchmarks/scenarios/pricing-partial-data.json
new file mode 100644
index 0000000..43c2cbb
--- /dev/null
+++ b/benchmarks/scenarios/pricing-partial-data.json
@@ -0,0 +1,35 @@
+{
+  "id": "pricing-partial-data",
+  "title": "Pricing change with partial market data",
+  "inputs": {
+    "prompt": "Draft a pricing strategy recommendation under partial market data.",
+    "context_files": [],
+    "expected_artifact_type": "strategy",
+    "scoring_spec_ref": "docs/shipwright-v2-benchmark-scoring-spec.md"
+  },
+  "validator": {
+    "expect_sections": [
+      "Decision Frame",
+      "Unknowns & Evidence Gaps",
+      "Pass/Fail Readiness",
+      "Recommended Next Artifact"
+    ],
+    "expect_structured": true
+  },
+  "fixtures": {
+    "first_pass_artifact": "../fixtures/pricing-partial-data/first-pass.md",
+    "final_pass_artifact": "../fixtures/pricing-partial-data/final-pass.md",
+    "related_artifacts": [],
+    "blind_review": null
+  },
+  "run_metadata": {
+    "time_to_first_usable_artifact_seconds": 540,
+    "revision_count": 1
+  },
+  "measures": [
+    "time_to_first_usable_artifact",
+    "revision_count",
+    "contradiction_count",
+    "blind_human_rating"
+  ]
+}
diff --git a/scripts/run-benchmarks.mjs b/scripts/run-benchmarks.mjs
new file mode 100644
index 0000000..c5580ce
--- /dev/null
+++ b/scripts/run-benchmarks.mjs
@@ -0,0 +1,426 @@
+#!/usr/bin/env node
+
+import { mkdir, readdir, readFile, writeFile } from 'node:fs/promises';
+import path from 'node:path';
+import { pathToFileURL } from 'node:url';
+
+import { extractStructuredArtifact } from './extract-structured-artifact.mjs';
+import {
+  IssueType,
+  Severity,
+  validateArtifact,
+} from './validate-artifact.mjs';
+
+const DEFAULT_SCENARIO_DIR = path.resolve('benchmarks', 'scenarios'); // resolved against the process cwd
+const DEFAULT_SCORING_SPEC_REF = 'docs/shipwright-v2-benchmark-scoring-spec.md';
+// Dimensions that computeBlindRating requires from every rater, for each pass.
+const SCORE_DIMENSIONS = Object.freeze([
+  'decision_usefulness',
+  'evidence_discipline',
+  'internal_consistency',
+  'actionability',
+]);
+// Issue types that countContradictions tallies as contradictions.
+const CONTRADICTION_TYPES = new Set([
+  IssueType.METRIC_CONTRADICTION,
+  IssueType.SEGMENT_CONTRADICTION,
+  IssueType.CHALLENGE_FINDING_UNRESOLVED,
+]);
+// Issue types that make isArtifactUsable return false whatever their severity.
+const USABILITY_BLOCKING_WARNING_TYPES = new Set([
+  IssueType.MISSING_SECTION,
+  IssueType.MISSING_STRUCTURED_ARTIFACT,
+]);
+
+/**
+ * Reads a scenario JSON file, validates its required fields, and returns the
+ * parsed record with `source_path` set to the resolved file path.
+ * @throws {Error} If the root is not an object or a required field is missing.
+ */
+export async function loadBenchmarkScenario(filePath) {
+  const absolutePath = path.resolve(filePath);
+  const scenario = JSON.parse(await readFile(absolutePath, 'utf8'));
+  const isPlainObject =
+    Boolean(scenario) && typeof scenario === 'object' && !Array.isArray(scenario);
+  if (!isPlainObject) {
+    throw new Error(`Benchmark scenario must be a JSON object: ${filePath}`);
+  }
+  if (typeof scenario.id !== 'string' || scenario.id.trim().length === 0) {
+    throw new Error(`Benchmark scenario is missing id: ${filePath}`);
+  }
+  // Required string fields, listed in the order their errors are reported.
+  const requiredStrings = [
+    ['inputs.expected_artifact_type', scenario.inputs?.expected_artifact_type],
+    ['fixtures.first_pass_artifact', scenario.fixtures?.first_pass_artifact],
+    ['fixtures.final_pass_artifact', scenario.fixtures?.final_pass_artifact],
+  ];
+  for (const [fieldName, fieldValue] of requiredStrings) {
+    if (typeof fieldValue !== 'string') {
+      throw new Error(`Benchmark scenario is missing ${fieldName}: ${filePath}`);
+    }
+  }
+  return { ...scenario, source_path: absolutePath };
+}
+
+/**
+ * Evaluates a scenario's first-pass and final-pass fixtures and reports
+ * per-pass metrics, first-to-final deltas, and diagnostics.
+ * @param {object|string} scenario - A loaded scenario record (anything with a
+ *   truthy `source_path`); any other value is passed to loadBenchmarkScenario.
+ * @param {object} [options] - Accepted, but not read by this function.
+ * @returns {Promise<object>} Result with `scenario_id`, `status`, `first_pass`,
+ *   `final_pass`, `delta`, and `diagnostics`.
+ * @throws {Error} When the run metadata fails the normalize* checks.
+ */
+export async function runBenchmarkScenario(scenario, options = {}) {
+  const record = scenario?.source_path
+    ? scenario
+    : await loadBenchmarkScenario(scenario);
+  const { id, title, inputs, run_metadata: runMetadata } = record;
+
+  const relatedArtifacts = await loadRelatedArtifacts(record);
+  const blindReview = await loadBlindReview(record);
+  const evaluatePass = (passKey) =>
+    evaluateScenarioPass(record, passKey, relatedArtifacts, blindReview);
+  const initial = await evaluatePass('first_pass');
+  const revised = await evaluatePass('final_pass');
+
+  const status = deriveScenarioStatus(revised);
+  // Time-to-usable is validated and reported only when the final pass is usable.
+  const timeToFirstUsable = revised.usable
+    ? normalizeTimeToFirstUsable(runMetadata?.time_to_first_usable_artifact_seconds, id)
+    : null;
+  const revisionCount = normalizeRevisionCount(runMetadata?.revision_count, id);
+
+  return {
+    scenario_id: id,
+    status,
+    first_pass: {
+      usable: initial.usable,
+      validator_error_count: initial.validator_error_count,
+      contradiction_count: initial.contradiction_count,
+      blind_rating: initial.blind_rating,
+    },
+    final_pass: {
+      usable: revised.usable,
+      time_to_first_usable_artifact_seconds: timeToFirstUsable,
+      revision_count: revisionCount,
+      validator_error_count: revised.validator_error_count,
+      contradiction_count: revised.contradiction_count,
+      blind_rating: revised.blind_rating,
+    },
+    // Numeric deltas are final pass minus first pass.
+    delta: {
+      usable_changed: revised.usable !== initial.usable,
+      blind_rating_change: computeNumericDelta(
+        revised.blind_rating,
+        initial.blind_rating,
+      ),
+      contradiction_count_change:
+        revised.contradiction_count - initial.contradiction_count,
+      validator_error_count_change:
+        revised.validator_error_count - initial.validator_error_count,
+    },
+    diagnostics: {
+      title,
+      expected_artifact_type: inputs.expected_artifact_type,
+      scoring_spec_ref: inputs.scoring_spec_ref || DEFAULT_SCORING_SPEC_REF,
+      // Issue types keep the validator's order; duplicates are not removed.
+      first_pass_issue_types: initial.issues.map(({ type }) => type),
+      final_pass_issue_types: revised.issues.map(({ type }) => type),
+    },
+  };
+}
+
+/**
+ * Runs the selected scenarios in `options.scenarioDir` and aggregates results.
+ * `options.scenarioIds` filters by file basename (without `.json`); when
+ * `options.outPath` is set, the summary is also written there as JSON.
+ * @throws {Error} When no scenario file is selected.
+ */
+export async function runBenchmarkSuite(options = {}) {
+  const scenarioDir = path.resolve(options.scenarioDir || DEFAULT_SCENARIO_DIR);
+  const scenarioIds = new Set(options.scenarioIds || []);
+  const scenarioFiles = await discoverScenarioFiles(scenarioDir);
+  const selectedScenarioFiles = scenarioFiles.filter(
+    (filePath) => scenarioIds.size === 0 || scenarioIds.has(path.basename(filePath, '.json')),
+  );
+
+  if (selectedScenarioFiles.length === 0) {
+    // An id filter that matched nothing is reported differently from an empty directory.
+    const reason = scenarioIds.size > 0 && scenarioFiles.length > 0
+      ? `No benchmark scenarios in ${scenarioDir} match: ${[...scenarioIds].join(', ')}`
+      : `No benchmark scenarios found in ${scenarioDir}`;
+    throw new Error(reason);
+  }
+
+  const results = [];
+  for (const scenarioFile of selectedScenarioFiles) {
+    const scenario = await loadBenchmarkScenario(scenarioFile);
+    results.push(await runBenchmarkScenario(scenario));
+  }
+  const statusCounts = { PASS: 0, FAIL: 0, DNF: 0 };
+  for (const { status } of results) statusCounts[status] += 1;
+
+  const summary = {
+    generated_at: new Date().toISOString(),
+    scoring_spec_ref: DEFAULT_SCORING_SPEC_REF,
+    scenario_count: results.length,
+    status_counts: statusCounts,
+    mean_first_pass_blind_rating: computeMean(results.map((r) => r.first_pass.blind_rating)),
+    mean_final_pass_blind_rating: computeMean(results.map((r) => r.final_pass.blind_rating)),
+    results,
+  };
+  if (options.outPath) {
+    const outPath = path.resolve(options.outPath);
+    await mkdir(path.dirname(outPath), { recursive: true });
+    await writeFile(outPath, `${JSON.stringify(summary, null, 2)}\n`, 'utf8');
+  }
+  return summary;
+}
+
+/** Returns `value` when it is a non-negative integer; throws if it is missing or invalid. */
+function normalizeTimeToFirstUsable(value, scenarioId) {
+  // `== null` matches both null and undefined, which share the "missing" message.
+  if (value == null) {
+    throw new Error(`Scenario "${scenarioId}" is missing run_metadata.time_to_first_usable_artifact_seconds.`);
+  }
+
+  const isValid = Number.isInteger(value) && value >= 0;
+  if (isValid) return value;
+  throw new Error(
+    `Scenario "${scenarioId}" has invalid time_to_first_usable_artifact_seconds.`,
+  );
+}
+
+/** Returns `value` when it is a non-negative integer; otherwise throws. */
+function normalizeRevisionCount(value, scenarioId) {
+  const isValid = Number.isInteger(value) && value >= 0;
+  if (isValid) return value;
+  throw new Error(`Scenario "${scenarioId}" has invalid run_metadata.revision_count.`);
+}
+
+/**
+ * Reads the fixture for `passKey`, validates it, and derives that pass's metrics.
+ * `passKey` selects both the `${passKey}_artifact` fixture and the rater scores.
+ */
+async function evaluateScenarioPass(scenario, passKey, relatedArtifacts, blindReview) {
+  const fixtureRef = scenario.fixtures[`${passKey}_artifact`];
+  const text = await readFile(resolveScenarioPath(scenario, fixtureRef), 'utf8');
+  const validationOptions = buildValidationOptions(scenario, relatedArtifacts);
+  const { issues, artifact } = validateArtifact(text, validationOptions);
+
+  return {
+    issues,
+    artifact,
+    usable: isArtifactUsable(issues),
+    validator_error_count: issues.filter(({ severity }) => severity === Severity.ERROR).length,
+    contradiction_count: countContradictions(issues),
+    blind_rating: computeBlindRating(blindReview, passKey),
+  };
+}
+
+/** Maps a scenario's validator settings onto the options passed to validateArtifact. */
+function buildValidationOptions(scenario, relatedArtifacts) {
+  const sections = scenario.validator?.expect_sections;
+  const artifactType = scenario.inputs?.expected_artifact_type;
+  return {
+    expectSections: Array.isArray(sections) ? sections : [],
+    // Without an explicit setting, structure is expected whenever a type is declared.
+    expectStructured: scenario.validator?.expect_structured ?? Boolean(artifactType),
+    artifactType,
+    relatedArtifacts,
+  };
+}
+
+/** An artifact is usable when no issue is an error or has a usability-blocking type. */
+function isArtifactUsable(issues) {
+  const blocksUsability = ({ severity, type }) =>
+    severity === Severity.ERROR || USABILITY_BLOCKING_WARNING_TYPES.has(type);
+
+  return ![...issues].some(blocksUsability);
+}
+
+/**
+ * Counts contradiction-type issues, treating issues that share a type, line
+ * number, and message as a single contradiction.
+ */
+function countContradictions(issues) {
+  const signatures = [...issues]
+    .filter((issue) => CONTRADICTION_TYPES.has(issue.type))
+    .map((issue) => `${issue.type}|${issue.lineNumber}|${issue.message}`);
+  return new Set(signatures).size;
+}
+
+/**
+ * Averages the raters' per-dimension scores for one pass and expresses the mean
+ * as a percentage of the maximum score of 5. Returns null without a blind review.
+ * @throws {Error} With fewer than 3 raters, or on a missing or out-of-range score.
+ */
+function computeBlindRating(blindReview, passKey) {
+  if (!blindReview) return null;
+
+  const raters = Array.isArray(blindReview.raters) ? blindReview.raters : [];
+  if (raters.length < 3) {
+    throw new Error('Blind review requires at least 3 raters.');
+  }
+
+  let sumOfRaterMeans = 0;
+  for (const rater of raters) {
+    const passScores = rater?.[passKey];
+    if (!passScores || typeof passScores !== 'object') {
+      throw new Error(`Blind review is missing ${passKey} scores for rater "${rater?.rater_id || 'unknown'}".`);
+    }
+    const raterTotal = SCORE_DIMENSIONS.reduce((subtotal, dimension) => {
+      const score = passScores[dimension];
+      const inRange = Number.isFinite(score) && score >= 1 && score <= 5;
+      if (!inRange) {
+        throw new Error(`Blind review score must be between 1 and 5 for ${dimension}.`);
+      }
+      return subtotal + score;
+    }, 0);
+    sumOfRaterMeans += raterTotal / SCORE_DIMENSIONS.length;
+  }
+  return roundToOneDecimal((sumOfRaterMeans / raters.length / 5) * 100);
+}
+
+/** DNF when the final pass is unusable; otherwise FAIL only if the artifact declares FAIL. */
+function deriveScenarioStatus(finalPass) {
+  const { usable, artifact } = finalPass;
+  if (!usable) return 'DNF';
+  return artifact?.pass_fail_readiness?.status === 'FAIL' ? 'FAIL' : 'PASS';
+}
+
+/**
+ * Final minus first, rounded to one decimal; null if either value is not finite.
+ */
+function computeNumericDelta(finalValue, firstValue) {
+  const bothFinite = Number.isFinite(finalValue) && Number.isFinite(firstValue);
+  return bothFinite ? roundToOneDecimal(finalValue - firstValue) : null;
+}
+
+/** Mean of the finite entries, rounded to one decimal; null when there are none. */
+function computeMean(values) {
+  const finiteValues = values.filter(Number.isFinite);
+  let sum = 0;
+  for (const value of finiteValues) sum += value;
+  return finiteValues.length === 0 ? null : roundToOneDecimal(sum / finiteValues.length);
+}
+
+function roundToOneDecimal(value) {
+  return Math.round(value * 10) / 10; // one decimal place; .5 ties round toward +Infinity (Math.round)
+}
+
+/** Lists the `.json` files directly inside `scenarioDir`, joined onto it and sorted. */
+async function discoverScenarioFiles(scenarioDir) {
+  const entries = await readdir(scenarioDir, { withFileTypes: true });
+  const isScenarioFile = (entry) => entry.isFile() && entry.name.endsWith('.json');
+  const toScenarioPath = (entry) => path.join(scenarioDir, entry.name);
+  return entries.filter(isScenarioFile).map(toScenarioPath).sort();
+}
+
+/**
+ * Loads each path in `fixtures.related_artifacts`. `.json` files are parsed
+ * directly; any other file must yield a payload from extractStructuredArtifact.
+ * @throws {Error} When a non-JSON file has an invalid or missing structured payload.
+ */
+async function loadRelatedArtifacts(scenario) {
+  const declaredPaths = scenario.fixtures?.related_artifacts;
+  const relatedPaths = Array.isArray(declaredPaths) ? declaredPaths : [];
+  const loaded = [];
+  for (const relatedPath of relatedPaths) {
+    const raw = await readFile(resolveScenarioPath(scenario, relatedPath), 'utf8');
+    if (path.extname(relatedPath).toLowerCase() === '.json') {
+      loaded.push(JSON.parse(raw));
+      continue;
+    }
+
+    const { error, artifact } = extractStructuredArtifact(raw);
+    if (error) {
+      throw new Error(`Related artifact contains invalid structured JSON: ${relatedPath}: ${error}`);
+    }
+    if (!artifact) {
+      throw new Error(`Related artifact is missing structured payload: ${relatedPath}`);
+    }
+    loaded.push(artifact);
+  }
+
+  return loaded;
+}
+
+/** Parses the scenario's blind-review JSON file, or returns null if none is configured. */
+async function loadBlindReview(scenario) {
+  const reviewPath = scenario.fixtures?.blind_review;
+  if (!reviewPath) return null;
+
+  const absolutePath = resolveScenarioPath(scenario, reviewPath);
+  return JSON.parse(await readFile(absolutePath, 'utf8'));
+}
+
+function resolveScenarioPath(scenario, relativePath) {
+  return path.resolve(path.dirname(scenario.source_path), relativePath); // relative to the scenario file's directory
+}
+
+/** Collects the value after each `flagName`; a following `--flag` is never a value. */
+function collectFlagValues(argv, flagName) {
+  const values = [];
+  for (let i = 0; i < argv.length; i += 1) {
+    const next = argv[i + 1];
+    if (argv[i] !== flagName || !next || next.startsWith('--')) continue;
+    values.push(next);
+    i += 1; // the value is consumed; do not re-read it as a flag
+  }
+  return values;
+}
+
+/** Returns the first usable value given for `flagName`, or `fallback` when there is none. */
+function readFlagValue(argv, flagName, fallback) {
+  return collectFlagValues(argv, flagName)[0] ?? fallback;
+}
+
+/** Renders the suite summary as text: two header lines, then one line per scenario. */
+function formatSuiteSummary(summary) {
+  const { PASS, FAIL, DNF } = summary.status_counts;
+  const header = [
+    `Benchmark suite: ${summary.scenario_count} scenario(s)`,
+    `Status counts: PASS ${PASS} | FAIL ${FAIL} | DNF ${DNF}`,
+  ];
+  const scenarioLines = summary.results.map(
+    ({ status, scenario_id: scenarioId, first_pass: firstPass, final_pass: finalPass }) =>
+      `- ${status} ${scenarioId} | revisions ${finalPass.revision_count} | first-pass errors ${firstPass.validator_error_count} | final errors ${finalPass.validator_error_count}`,
+  );
+
+  return [...header, ...scenarioLines].join('\n');
+}
+
+/**
+ * CLI entry point. Reads `--scenario-dir`, `--out`, `--format`, and any number
+ * of `--scenario` flags, runs the suite, and prints the summary as JSON when
+ * `--format json` is given, otherwise as text.
+ */
+async function main(argv = process.argv.slice(2)) {
+  const format = readFlagValue(argv, '--format', 'text');
+
+  const summary = await runBenchmarkSuite({
+    scenarioDir: readFlagValue(argv, '--scenario-dir', DEFAULT_SCENARIO_DIR),
+    scenarioIds: collectFlagValues(argv, '--scenario'),
+    outPath: readFlagValue(argv, '--out', null),
+  });
+
+  const rendered = format === 'json'
+    ? JSON.stringify(summary, null, 2)
+    : formatSuiteSummary(summary);
+  console.log(rendered);
+}
+
+/** True when this module is the script Node was launched with rather than an import. */
+function isDirectRun() {
+  return Boolean(process.argv[1]) && import.meta.url === pathToFileURL(path.resolve(process.argv[1])).href;
+}
+
+if (isDirectRun()) {
+  main().catch((error) => {
+    console.error(error instanceof Error ? error.message : String(error)); // message only, no stack trace
+    process.exitCode = 1; // mark the run as failed
+  });
+}
diff --git a/tests/run-benchmarks.test.mjs b/tests/run-benchmarks.test.mjs
new file mode 100644
index 0000000..2635919
--- /dev/null
+++ b/tests/run-benchmarks.test.mjs
@@ -0,0 +1,355 @@
+import assert from 'node:assert/strict';
+import { mkdir, mkdtemp, writeFile } from 'node:fs/promises';
+import os from 'node:os';
+import path from 'node:path';
+import test from 'node:test';
+
+import {
+  loadBenchmarkScenario,
+  runBenchmarkScenario,
+  runBenchmarkSuite,
+} from '../scripts/run-benchmarks.mjs';
+
+test('runBenchmarkSuite evaluates the default benchmark fixture suite', { concurrency: false }, async () => {
+  const summary = await runBenchmarkSuite({
+    scenarioDir: path.resolve('benchmarks/scenarios'),
+  });
+  // scenarioDir is resolved against the cwd, so this test assumes it runs from the repo root.
+  assert.equal(summary.scenario_count, 6);
+  assert.deepEqual(summary.status_counts, {
+    PASS: 4,
+    FAIL: 1,
+    DNF: 1,
+  });
+  // A usable final pass whose scenario status is still FAIL.
+  const pricingScenario = summary.results.find((result) => result.scenario_id === 'pricing-partial-data');
+  assert.equal(pricingScenario.status, 'FAIL');
+  assert.equal(pricingScenario.final_pass.usable, true);
+  // An issue type reported on the first pass, and no issues at all on the final pass.
+  const boardScenario = summary.results.find((result) => result.scenario_id === 'board-update-ambiguity');
+  assert.ok(boardScenario.diagnostics.first_pass_issue_types.includes('unsupported-numeric'));
+  assert.equal(boardScenario.diagnostics.final_pass_issue_types.length, 0);
+  // An unusable final pass is DNF and reports no time-to-usable.
+  const dnfScenario = summary.results.find((result) => result.scenario_id === 'feature-weak-evidence');
+  assert.equal(dnfScenario.status, 'DNF');
+  assert.equal(dnfScenario.final_pass.usable, false);
+  assert.equal(dnfScenario.final_pass.time_to_first_usable_artifact_seconds, null);
+});
+
+test('runBenchmarkSuite filters to requested scenario ids', { concurrency: false }, async () => {
+  const summary = await runBenchmarkSuite({
+    scenarioDir: path.resolve('benchmarks/scenarios'),
+    scenarioIds: ['board-update-ambiguity', 'pricing-partial-data'],
+  });
+  // Only the two requested scenarios run; results follow the sorted scenario file order.
+  assert.equal(summary.scenario_count, 2);
+  assert.deepEqual(summary.status_counts, {
+    PASS: 1,
+    FAIL: 1,
+    DNF: 0,
+  });
+  assert.deepEqual(
+    summary.results.map((result) => result.scenario_id),
+    ['board-update-ambiguity', 'pricing-partial-data'],
+  );
+});
+
+test('runBenchmarkScenario normalizes blind ratings and computes deltas', { concurrency: false }, async () => {
+  const rootDir = await mkdtemp(path.join(os.tmpdir(), 'shipwright-benchmarks-')); // NOTE(review): never removed
+  const scenarioFile = await writeScenarioFixture(rootDir, {
+    id: 'blind-rating-normalization',
+    artifactType: 'prd',
+    runMetadata: {
+      time_to_first_usable_artifact_seconds: 300,
+      revision_count: 1,
+    },
+    firstArtifact: (() => {
+      const artifact = basePrdArtifact();
+      artifact.evidence = [];
+      artifact.payload.customer_evidence_ids = [];
+      artifact.payload.success_metrics[0].evidence_ids = [];
+      artifact.pass_fail_readiness.reason = 'Missing evidence.';
+      return artifact;
+    })(),
+    finalArtifact: basePrdArtifact(),
+    blindReview: {
+      raters: [
+        makeRater('r1', 2, 4),
+        makeRater('r2', 2, 4),
+        makeRater('r3', 2, 4),
+      ],
+    },
+  });
+  // Uniform scores of 2 and 4 out of 5 normalize to 40 and 80.
+  const result = await runBenchmarkScenario(scenarioFile);
+  assert.equal(result.status, 'PASS');
+  assert.equal(result.first_pass.usable, false);
+  assert.equal(result.final_pass.usable, true);
+  assert.equal(result.first_pass.blind_rating, 40);
+  assert.equal(result.final_pass.blind_rating, 80);
+  assert.equal(result.delta.blind_rating_change, 40);
+  assert.equal(result.final_pass.time_to_first_usable_artifact_seconds, 300);
+});
+
+test('runBenchmarkScenario preserves diagnostic blind ratings for DNF scenarios', { concurrency: false }, async () => {
+  const rootDir = await mkdtemp(path.join(os.tmpdir(), 'shipwright-benchmarks-')); // NOTE(review): never removed
+  const firstArtifact = basePrdArtifact();
+  firstArtifact.evidence = [];
+  firstArtifact.payload.customer_evidence_ids = [];
+  firstArtifact.payload.success_metrics[0].evidence_ids = [];
+  firstArtifact.pass_fail_readiness.status = 'FAIL';
+  firstArtifact.pass_fail_readiness.reason = 'Still under-evidenced.';
+  firstArtifact.decision_frame.owner = '';
+  // The final pass keeps the evidence gaps but leaves the decision owner in place.
+  const finalArtifact = basePrdArtifact();
+  finalArtifact.evidence = [];
+  finalArtifact.payload.customer_evidence_ids = [];
+  finalArtifact.payload.success_metrics[0].evidence_ids = [];
+  finalArtifact.pass_fail_readiness.status = 'FAIL';
+  finalArtifact.pass_fail_readiness.reason = 'Still under-evidenced after one revision.';
+  // A null time-to-usable is accepted here because the final pass is expected to be unusable.
+  const scenarioFile = await writeScenarioFixture(rootDir, {
+    id: 'dnf-diagnostic-rating',
+    artifactType: 'prd',
+    runMetadata: {
+      time_to_first_usable_artifact_seconds: null,
+      revision_count: 2,
+    },
+    firstArtifact,
+    finalArtifact,
+    blindReview: {
+      raters: [
+        makeRater('r1', 1, 3),
+        makeRater('r2', 1, 3),
+        makeRater('r3', 1, 3),
+      ],
+    },
+  });
+  // Blind ratings and deltas are still reported even though the scenario is DNF.
+  const result = await runBenchmarkScenario(scenarioFile);
+  assert.equal(result.status, 'DNF');
+  assert.equal(result.final_pass.usable, false);
+  assert.equal(result.final_pass.time_to_first_usable_artifact_seconds, null);
+  assert.equal(result.first_pass.validator_error_count, 2);
+  assert.equal(result.final_pass.validator_error_count, 1);
+  assert.equal(result.delta.validator_error_count_change, -1);
+  assert.equal(result.final_pass.blind_rating, 60);
+  assert.equal(result.delta.blind_rating_change, 40);
+});
+
+test('loadBenchmarkScenario rejects scenarios missing id', { concurrency: false }, async () => {
+  const rootDir = await mkdtemp(path.join(os.tmpdir(), 'shipwright-benchmarks-')); // NOTE(review): never removed
+  const scenarioPath = path.join(rootDir, 'missing-id.json');
+  await writeFile(
+    scenarioPath,
+    `${JSON.stringify({
+      title: 'missing id',
+      inputs: { expected_artifact_type: 'prd' },
+      fixtures: {
+        first_pass_artifact: '../fixtures/example/first-pass.md',
+        final_pass_artifact: '../fixtures/example/final-pass.md',
+      },
+    }, null, 2)}\n`,
+    'utf8',
+  );
+  // Loading must fail because the scenario has no `id`.
+  await assert.rejects(
+    loadBenchmarkScenario(scenarioPath),
+    /missing id/,
+  );
+});
+
+test('loadBenchmarkScenario rejects scenarios missing first-pass fixture path', { concurrency: false }, async () => {
+  const rootDir = await mkdtemp(path.join(os.tmpdir(), 'shipwright-benchmarks-')); // NOTE(review): never removed
+  const scenarioPath = path.join(rootDir, 'missing-first-pass.json');
+  await writeFile(
+    scenarioPath,
+    `${JSON.stringify({
+      id: 'missing-first-pass',
+      title: 'missing first pass fixture',
+      inputs: { expected_artifact_type: 'prd' },
+      fixtures: {
+        final_pass_artifact: '../fixtures/example/final-pass.md',
+      },
+    }, null, 2)}\n`,
+    'utf8',
+  );
+  // Loading must fail because `fixtures.first_pass_artifact` is absent.
+  await assert.rejects(
+    loadBenchmarkScenario(scenarioPath),
+    /missing fixtures\.first_pass_artifact/,
+  );
+});
+
+/**
+ * Builds a blind-review rater who gives `firstScore` to every first-pass
+ * dimension and `finalScore` to every final-pass dimension.
+ */
+function makeRater(raterId, firstScore, finalScore) {
+  const uniformScores = (score) => ({
+    decision_usefulness: score,
+    evidence_discipline: score,
+    internal_consistency: score,
+    actionability: score,
+  });
+  return {
+    rater_id: raterId,
+    first_pass: uniformScores(firstScore),
+    final_pass: uniformScores(finalScore),
+  };
+}
+/** Returns a new baseline PRD artifact object on every call, so tests can mutate their copy. */
+function basePrdArtifact() {
+  return {
+    schema_version: '2.0.0',
+    artifact_type: 'prd',
+    mode: 'rigorous',
+    depth: 'standard',
+    metadata: {
+      title: 'PRD: Test Artifact',
+      status: 'approved',
+      authors: ['Shipwright'],
+      updated_at: '2026-04-02',
+    },
+    decision_frame: {
+      recommendation: 'Ship the bounded pilot.',
+      tradeoff: 'Moves quickly with known follow-up work.',
+      confidence: 'medium',
+      owner: 'PM',
+      decision_date: '2026-04-02',
+    },
+    unknowns: ['Which edge case shows up first.'],
+    pass_fail_readiness: {
+      status: 'PASS',
+      reason: 'The artifact is structured and evidence-linked.',
+    },
+    evidence: [
+      {
+        evidence_id: 'ev-test-1',
+        kind: 'document',
+        source_ref: 'test-source',
+        confidence: 'high',
+        supports: [
+          'decision_frame.recommendation',
+          'problem-test',
+          'metric-test',
+        ],
+      },
+    ],
+    payload: {
+      problem_statement: {
+        problem_id: 'problem-test',
+        text: 'Teams need a more reliable handoff flow.',
+      },
+      customer_evidence_ids: ['ev-test-1'],
+      success_metrics: [
+        {
+          metric_id: 'metric-test',
+          name: 'handoff success rate',
+          segment: 'operations teams',
+          unit: 'percent',
+          timeframe: '30 days',
+          baseline: 55,
+          target: 75,
+          evidence_ids: ['ev-test-1'],
+        },
+      ],
+      scope: {
+        in: ['Retry flow'],
+        out: ['Migration tooling'],
+      },
+      open_questions: ['Which notification should ship first?'],
+      target_segment: 'operations teams',
+    },
+  };
+}
+/** Renders four markdown sections followed by the artifact JSON in a `shipwright:artifact` HTML comment. */
+function renderArtifactMarkdown(artifact) {
+  return `# ${artifact.metadata.title}
+
+## Decision Frame
+
+Recommendation: ${artifact.decision_frame.recommendation}
+
+## Unknowns & Evidence Gaps
+
+- ${artifact.unknowns[0]}
+
+## Pass/Fail Readiness
+
+${artifact.pass_fail_readiness.status} because ${artifact.pass_fail_readiness.reason}
+
+## Recommended Next Artifact
+
+- Sprint plan.
+
+<!-- shipwright:artifact
+${JSON.stringify(artifact, null, 2)}
+-->
+`;
+}
+
+/**
+ * Writes both pass fixtures, an optional blind-review file, and the scenario
+ * JSON beneath `rootDir`, and returns the path of the scenario file.
+ */
+async function writeScenarioFixture(rootDir, options) {
+  const { id, artifactType, blindReview, runMetadata } = options;
+  const scenarioDir = path.join(rootDir, 'scenarios');
+  const fixtureDir = path.join(rootDir, 'fixtures', id);
+  const fixtureRef = (fileName) => `../fixtures/${id}/${fileName}`;
+  const writeJson = (filePath, value) =>
+    writeFile(filePath, `${JSON.stringify(value, null, 2)}\n`, 'utf8');
+
+  await mkdir(scenarioDir, { recursive: true });
+  await mkdir(fixtureDir, { recursive: true });
+
+  const passFixtures = [
+    ['first-pass.md', options.firstArtifact],
+    ['final-pass.md', options.finalArtifact],
+  ];
+  for (const [fileName, artifact] of passFixtures) {
+    await writeFile(path.join(fixtureDir, fileName), renderArtifactMarkdown(artifact), 'utf8');
+  }
+
+  if (blindReview) {
+    await writeJson(path.join(fixtureDir, 'blind-review.json'), blindReview);
+  }
+
+  const scenario = {
+    id,
+    title: id,
+    inputs: {
+      prompt: 'Test benchmark scenario',
+      context_files: [],
+      expected_artifact_type: artifactType,
+      scoring_spec_ref: 'docs/shipwright-v2-benchmark-scoring-spec.md',
+    },
+    validator: {
+      expect_sections: [
+        'Decision Frame',
+        'Unknowns & Evidence Gaps',
+        'Pass/Fail Readiness',
+        'Recommended Next Artifact',
+      ],
+      expect_structured: true,
+    },
+    fixtures: {
+      first_pass_artifact: fixtureRef('first-pass.md'),
+      final_pass_artifact: fixtureRef('final-pass.md'),
+      related_artifacts: [],
+      blind_review: blindReview ? fixtureRef('blind-review.json') : null,
+    },
+    run_metadata: runMetadata,
+    measures: [
+      'time_to_first_usable_artifact',
+      'revision_count',
+      'contradiction_count',
+      'blind_human_rating',
+    ],
+  };
+
+  const scenarioFile = path.join(scenarioDir, `${id}.json`);
+  await writeJson(scenarioFile, scenario);
+  return scenarioFile;
+}