EdgeCaser · EdgeCaser · Apr 2, 2026 · Apr 2, 2026
diff --git a/benchmarks/README.md b/benchmarks/README.md
@@ -0,0 +1,69 @@
+# Benchmarks
+
+Shipwright v2 benchmarks are fixture-based and deterministic by design.
+
+Each scenario lives in `benchmarks/scenarios/` and points to:
+
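+- an initial markdown artifact fixture (`fixtures.first_pass_artifact`)
+- a final markdown artifact fixture (`fixtures.final_pass_artifact`)
+- optional related structured artifacts for contradiction and challenge checks (`fixtures.related_artifacts`)
+- optional blind review inputs (`fixtures.blind_review`)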
+
+The harness in `scripts/run-benchmarks.mjs` validates those fixtures with the existing runtime validator and emits results in the shape required by `docs/shipwright-v2-benchmark-scoring-spec.md`.
+
+## Fixture Rules
+
+- Markdown artifacts remain canonical for human review.
+- Structured payloads embedded in artifact HTML comments (`<!-- shipwright:artifact ... -->`) are extracted and validated.
+- Related artifacts may be raw `.json` sidecars or markdown artifacts with an embedded structured payload.
+- `blind_review` may be `null` until human review has been run.
+
+## Default Run
+
+```bash
+node scripts/run-benchmarks.mjs
+node scripts/run-benchmarks.mjs --format json
+node scripts/run-benchmarks.mjs --out benchmarks/results/latest.json --format json
+```
+
+## Scenario Shape
+
+```json
+{
+  "id": "prd-hidden-scope-creep",
+  "title": "PRD with hidden scope creep",
+  "inputs": {
+    "prompt": "Write a PRD for ...",
+    "context_files": [],
+    "expected_artifact_type": "prd",
+    "scoring_spec_ref": "docs/shipwright-v2-benchmark-scoring-spec.md"
+  },
+  "validator": {
+    "expect_sections": [
+      "Decision Frame",
+      "Unknowns & Evidence Gaps",
+      "Pass/Fail Readiness",
+      "Recommended Next Artifact"
+    ],
+    "expect_structured": true
+  },
+  "fixtures": {
+    "first_pass_artifact": "../fixtures/prd-hidden-scope-creep/first-pass.md",
+    "final_pass_artifact": "../fixtures/prd-hidden-scope-creep/final-pass.md",
+    "related_artifacts": [
+      "../fixtures/prd-hidden-scope-creep/related/challenge-report.json"
+    ],
+    "blind_review": null
+  },
+  "run_metadata": {
+    "time_to_first_usable_artifact_seconds": 420,
+    "revision_count": 1
+  },
+  "measures": [
+    "time_to_first_usable_artifact",
+    "revision_count",
+    "contradiction_count",
+    "blind_human_rating"
+  ]
+}
+```
diff --git a/benchmarks/baselines/README.md b/benchmarks/baselines/README.md
@@ -0,0 +1 @@
+Baseline benchmark outputs should be written here once the frozen v2 contract has been run against the comparison system.
diff --git a/benchmarks/fixtures/board-update-ambiguity/final-pass.md b/benchmarks/fixtures/board-update-ambiguity/final-pass.md
@@ -0,0 +1,105 @@
+# Strategy: Board Update Under Ambiguity
+
+## Decision Frame
+
+Recommendation: keep the current investment plan while evidence is still incomplete.
+
+## Unknowns & Evidence Gaps
+
+- Demand quality is improving, but not yet uniform across segments.
+
+## Pass/Fail Readiness
+
+PASS because the board-facing recommendation is explicit and bounded.
+
+## Recommended Next Artifact
+
+- Executive briefing with the same recommendation and evidence trail.
+
+The board draft says expansion pipeline coverage is 42% above plan and should reach 120% next quarter.[1]
+
+## Sources
+
+- [1] Board pipeline pack, Q2 planning draft.
+
+<!-- shipwright:artifact
+{
+  "schema_version": "2.0.0",
+  "artifact_type": "strategy",
+  "mode": "rigorous",
+  "depth": "light",
+  "metadata": {
+    "title": "Strategy: Board Update Under Ambiguity",
+    "status": "approved",
+    "authors": ["Shipwright"],
+    "updated_at": "2026-04-02"
+  },
+  "decision_frame": {
+    "recommendation": "Keep the current investment plan while evidence remains mixed.",
+    "tradeoff": "Preserves strategic consistency, but delays a sharper board narrative.",
+    "confidence": "medium",
+    "owner": "PM",
+    "decision_date": "2026-04-02"
+  },
+  "unknowns": [
+    "Segment demand quality is not yet stable enough for a sharper commitment."
+  ],
+  "pass_fail_readiness": {
+    "status": "PASS",
+    "reason": "The recommendation is explicit, bounded, and evidence-linked."
+  },
+  "evidence": [
+    {
+      "evidence_id": "ev-board-1",
+      "kind": "document",
+      "source_ref": "quarterly-board-pack",
+      "confidence": "medium",
+      "supports": [
+        "decision_frame.recommendation",
+        "bet-hold-line"
+      ]
+    }
+  ],
+  "payload": {
+    "vision": "Preserve credibility with the board while ambiguity remains.",
+    "context": {
+      "current_state": {
+        "product_stage": "growth"
+      }
+    },
+    "primary_segment": "existing enterprise accounts",
+    "bets": [
+      {
+        "bet_id": "bet-hold-line",
+        "name": "Hold current investment line",
+        "thesis": "A steady investment plan is more defensible than a reactive pivot under mixed evidence.",
+        "assumptions": [
+          "Current demand is directionally positive but not yet conclusive."
+        ],
+        "investment_level": "moderate",
+        "success_metric": {
+          "metric_id": "metric-expansion",
+          "name": "expansion pipeline coverage",
+          "segment": "existing enterprise accounts",
+          "unit": "percent",
+          "timeframe": "quarterly",
+          "baseline": 105,
+          "target": 120
+        },
+        "kill_criteria": "If two consecutive quarters miss coverage, revisit the investment plan.",
+        "evidence_ids": ["ev-board-1"]
+      }
+    ],
+    "boundaries": {
+      "not_doing": [
+        "Narrative pivot before demand signal stabilizes"
+      ]
+    },
+    "review_cadence": {
+      "weekly": "Monitor inbound signal quality.",
+      "monthly": "Review board narrative assumptions.",
+      "quarterly": "Revisit investment posture before the next board cycle."
+    }
+  }
+}
+-->
diff --git a/benchmarks/fixtures/board-update-ambiguity/first-pass.md b/benchmarks/fixtures/board-update-ambiguity/first-pass.md
@@ -0,0 +1,101 @@
+# Strategy: Board Update Under Ambiguity
+
+## Decision Frame
+
+Recommendation: keep the current investment plan while evidence is still incomplete.
+
+## Unknowns & Evidence Gaps
+
+- Demand quality is improving, but not yet uniform across segments.
+
+## Pass/Fail Readiness
+
+PASS because the board-facing recommendation is explicit and bounded.
+
+## Recommended Next Artifact
+
+- Executive briefing with the same recommendation and evidence trail.
+
+The board draft says expansion pipeline coverage is 42% above plan and should reach 120% next quarter, but the supporting source is not yet cited.
+
+<!-- shipwright:artifact
+{
+  "schema_version": "2.0.0",
+  "artifact_type": "strategy",
+  "mode": "rigorous",
+  "depth": "light",
+  "metadata": {
+    "title": "Strategy: Board Update Under Ambiguity",
+    "status": "approved",
+    "authors": ["Shipwright"],
+    "updated_at": "2026-04-02"
+  },
+  "decision_frame": {
+    "recommendation": "Keep the current investment plan while evidence remains mixed.",
+    "tradeoff": "Preserves strategic consistency, but delays a sharper board narrative.",
+    "confidence": "medium",
+    "owner": "PM",
+    "decision_date": "2026-04-02"
+  },
+  "unknowns": [
+    "Segment demand quality is not yet stable enough for a sharper commitment."
+  ],
+  "pass_fail_readiness": {
+    "status": "PASS",
+    "reason": "The recommendation is explicit, bounded, and evidence-linked."
+  },
+  "evidence": [
+    {
+      "evidence_id": "ev-board-1",
+      "kind": "document",
+      "source_ref": "quarterly-board-pack",
+      "confidence": "medium",
+      "supports": [
+        "decision_frame.recommendation",
+        "bet-hold-line"
+      ]
+    }
+  ],
+  "payload": {
+    "vision": "Preserve credibility with the board while ambiguity remains.",
+    "context": {
+      "current_state": {
+        "product_stage": "growth"
+      }
+    },
+    "primary_segment": "existing enterprise accounts",
+    "bets": [
+      {
+        "bet_id": "bet-hold-line",
+        "name": "Hold current investment line",
+        "thesis": "A steady investment plan is more defensible than a reactive pivot under mixed evidence.",
+        "assumptions": [
+          "Current demand is directionally positive but not yet conclusive."
+        ],
+        "investment_level": "moderate",
+        "success_metric": {
+          "metric_id": "metric-expansion",
+          "name": "expansion pipeline coverage",
+          "segment": "existing enterprise accounts",
+          "unit": "percent",
+          "timeframe": "quarterly",
+          "baseline": 105,
+          "target": 120
+        },
+        "kill_criteria": "If two consecutive quarters miss coverage, revisit the investment plan.",
+        "evidence_ids": ["ev-board-1"]
+      }
+    ],
+    "boundaries": {
+      "not_doing": [
+        "Narrative pivot before demand signal stabilizes"
+      ]
+    },
+    "review_cadence": {
+      "weekly": "Monitor inbound signal quality.",
+      "monthly": "Review board narrative assumptions.",
+      "quarterly": "Revisit investment posture before the next board cycle."
+    }
+  }
+}
+-->
diff --git a/benchmarks/fixtures/churn-conflicting-signals/final-pass.md b/benchmarks/fixtures/churn-conflicting-signals/final-pass.md
@@ -0,0 +1,92 @@
+# PRD: Churn Rescue Pilot
+
+## Decision Frame
+
+Recommendation: run a churn rescue pilot for high-risk accounts.
+
+## Unknowns & Evidence Gaps
+
+- We still need to isolate the highest-signal intervention.
+
+## Pass/Fail Readiness
+
+PASS because the pilot can be run without a broader strategy rewrite.
+
+## Recommended Next Artifact
+
+- Sprint plan for the rescue pilot instrumentation.
+
+<!-- shipwright:artifact
+{
+  "schema_version": "2.0.0",
+  "artifact_type": "prd",
+  "mode": "rigorous",
+  "depth": "standard",
+  "metadata": {
+    "title": "PRD: Churn Rescue Pilot",
+    "status": "approved",
+    "authors": ["Shipwright"],
+    "updated_at": "2026-04-02"
+  },
+  "decision_frame": {
+    "recommendation": "Run a churn rescue pilot for high-risk accounts.",
+    "tradeoff": "Focused intervention now, but metric targets diverge from top-level strategy because the pilot targets a narrower segment.",
+    "confidence": "high",
+    "owner": "PM",
+    "decision_date": "2026-04-02"
+  },
+  "unknowns": [
+    "Which intervention has the highest near-term retention effect."
+  ],
+  "pass_fail_readiness": {
+    "status": "PASS",
+    "reason": "The pilot is scoped well enough to execute."
+  },
+  "evidence": [
+    {
+      "evidence_id": "ev-churn-1",
+      "kind": "metric",
+      "source_ref": "retention-cohort-analysis",
+      "confidence": "high",
+      "supports": [
+        "decision_frame.recommendation",
+        "problem-churn",
+        "metric-retention"
+      ]
+    }
+  ],
+  "payload": {
+    "problem_statement": {
+      "problem_id": "problem-churn",
+      "text": "High-risk accounts churn before the success team can intervene."
+    },
+    "customer_evidence_ids": ["ev-churn-1"],
+    "success_metrics": [
+      {
+        "metric_id": "metric-retention",
+        "name": "gross retention improvement",
+        "segment": "high-risk accounts",
+        "unit": "percent",
+        "timeframe": "90 days",
+        "baseline": 1,
+        "target": 12,
+        "evidence_ids": ["ev-churn-1"],
+        "explanation": "Supersedes the broad strategy target because this pilot only covers the highest-risk segment."
+      }
+    ],
+    "scope": {
+      "in": [
+        "Risk scoring refresh",
+        "Success-team intervention workflow"
+      ],
+      "out": [
+        "Company-wide pricing changes"
+      ]
+    },
+    "open_questions": [
+      "Which intervention mix should the pilot test first?"
+    ],
+    "target_segment": "high-risk accounts"
+  }
+}
+-->
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Baseline benchmark outputs should be written here once the frozen v2 contract has been run against the comparison system.