From 8d64281b84ca9b76b2e05f7fe860f0787f1fa5e3 Mon Sep 17 00:00:00 2001 From: Edgecaser Date: Thu, 2 Apr 2026 10:28:35 -0700 Subject: [PATCH] Add Shipwright v2 benchmark harness --- benchmarks/README.md | 69 +++ benchmarks/baselines/README.md | 1 + .../board-update-ambiguity/final-pass.md | 105 +++++ .../board-update-ambiguity/first-pass.md | 101 +++++ .../churn-conflicting-signals/final-pass.md | 92 ++++ .../churn-conflicting-signals/first-pass.md | 91 ++++ .../related/strategy.json | 66 +++ .../feature-weak-evidence/final-pass.md | 79 ++++ .../feature-weak-evidence/first-pass.md | 78 ++++ .../handoff-contradiction/final-pass.md | 100 ++++ .../handoff-contradiction/first-pass.md | 91 ++++ .../related/challenge-report.json | 55 +++ .../related/strategy.json | 66 +++ .../prd-hidden-scope-creep/final-pass.md | 100 ++++ .../prd-hidden-scope-creep/first-pass.md | 89 ++++ .../related/challenge-report.json | 58 +++ .../pricing-partial-data/final-pass.md | 99 ++++ .../pricing-partial-data/first-pass.md | 89 ++++ benchmarks/results/README.md | 3 + .../scenarios/board-update-ambiguity.json | 35 ++ .../scenarios/churn-conflicting-signals.json | 37 ++ .../scenarios/feature-weak-evidence.json | 35 ++ .../scenarios/handoff-contradiction.json | 38 ++ .../scenarios/prd-hidden-scope-creep.json | 37 ++ .../scenarios/pricing-partial-data.json | 35 ++ scripts/run-benchmarks.mjs | 426 ++++++++++++++++++ tests/run-benchmarks.test.mjs | 355 +++++++++++++++ 27 files changed, 2430 insertions(+) create mode 100644 benchmarks/README.md create mode 100644 benchmarks/baselines/README.md create mode 100644 benchmarks/fixtures/board-update-ambiguity/final-pass.md create mode 100644 benchmarks/fixtures/board-update-ambiguity/first-pass.md create mode 100644 benchmarks/fixtures/churn-conflicting-signals/final-pass.md create mode 100644 benchmarks/fixtures/churn-conflicting-signals/first-pass.md create mode 100644 benchmarks/fixtures/churn-conflicting-signals/related/strategy.json create mode 
100644 benchmarks/fixtures/feature-weak-evidence/final-pass.md create mode 100644 benchmarks/fixtures/feature-weak-evidence/first-pass.md create mode 100644 benchmarks/fixtures/handoff-contradiction/final-pass.md create mode 100644 benchmarks/fixtures/handoff-contradiction/first-pass.md create mode 100644 benchmarks/fixtures/handoff-contradiction/related/challenge-report.json create mode 100644 benchmarks/fixtures/handoff-contradiction/related/strategy.json create mode 100644 benchmarks/fixtures/prd-hidden-scope-creep/final-pass.md create mode 100644 benchmarks/fixtures/prd-hidden-scope-creep/first-pass.md create mode 100644 benchmarks/fixtures/prd-hidden-scope-creep/related/challenge-report.json create mode 100644 benchmarks/fixtures/pricing-partial-data/final-pass.md create mode 100644 benchmarks/fixtures/pricing-partial-data/first-pass.md create mode 100644 benchmarks/results/README.md create mode 100644 benchmarks/scenarios/board-update-ambiguity.json create mode 100644 benchmarks/scenarios/churn-conflicting-signals.json create mode 100644 benchmarks/scenarios/feature-weak-evidence.json create mode 100644 benchmarks/scenarios/handoff-contradiction.json create mode 100644 benchmarks/scenarios/prd-hidden-scope-creep.json create mode 100644 benchmarks/scenarios/pricing-partial-data.json create mode 100644 scripts/run-benchmarks.mjs create mode 100644 tests/run-benchmarks.test.mjs diff --git a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 0000000..2079b85 --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,69 @@ +# Benchmarks + +Shipwright v2 benchmarks are fixture-based and deterministic by design. 
+ +Each scenario lives in `benchmarks/scenarios/` and points to: + +- an initial markdown artifact fixture +- a final markdown artifact fixture +- optional related structured artifacts for contradiction and challenge checks +- optional blind review inputs + +The harness in `scripts/run-benchmarks.mjs` validates those fixtures with the existing runtime validator and emits the required result shape from `docs/shipwright-v2-benchmark-scoring-spec.md`. + +## Fixture Rules + +- Markdown artifacts remain canonical for human review. +- Structured payloads embedded in artifact comments are extracted and validated. +- Related artifacts may be raw `.json` sidecars or markdown artifacts with an embedded structured payload. +- `blind_review` may be `null` until human review has been run. + +## Default Run + +```bash +node scripts/run-benchmarks.mjs +node scripts/run-benchmarks.mjs --format json +node scripts/run-benchmarks.mjs --out benchmarks/results/latest.json --format json +``` + +## Scenario Shape + +```json +{ + "id": "prd-hidden-scope-creep", + "title": "PRD with hidden scope creep", + "inputs": { + "prompt": "Write a PRD for ...", + "context_files": [], + "expected_artifact_type": "prd", + "scoring_spec_ref": "docs/shipwright-v2-benchmark-scoring-spec.md" + }, + "validator": { + "expect_sections": [ + "Decision Frame", + "Unknowns & Evidence Gaps", + "Pass/Fail Readiness", + "Recommended Next Artifact" + ], + "expect_structured": true + }, + "fixtures": { + "first_pass_artifact": "../fixtures/prd-hidden-scope-creep/first-pass.md", + "final_pass_artifact": "../fixtures/prd-hidden-scope-creep/final-pass.md", + "related_artifacts": [ + "../fixtures/prd-hidden-scope-creep/related/challenge-report.json" + ], + "blind_review": null + }, + "run_metadata": { + "time_to_first_usable_artifact_seconds": 420, + "revision_count": 1 + }, + "measures": [ + "time_to_first_usable_artifact", + "revision_count", + "contradiction_count", + "blind_human_rating" + ] +} +``` diff --git 
a/benchmarks/baselines/README.md b/benchmarks/baselines/README.md new file mode 100644 index 0000000..7d0b89b --- /dev/null +++ b/benchmarks/baselines/README.md @@ -0,0 +1 @@ +Baseline benchmark outputs should be written here once the frozen v2 contract has been run against the comparison system. diff --git a/benchmarks/fixtures/board-update-ambiguity/final-pass.md b/benchmarks/fixtures/board-update-ambiguity/final-pass.md new file mode 100644 index 0000000..02fbe89 --- /dev/null +++ b/benchmarks/fixtures/board-update-ambiguity/final-pass.md @@ -0,0 +1,105 @@ +# Strategy: Board Update Under Ambiguity + +## Decision Frame + +Recommendation: keep the current investment plan while evidence is still incomplete. + +## Unknowns & Evidence Gaps + +- Demand quality is improving, but not yet uniform across segments. + +## Pass/Fail Readiness + +PASS because the board-facing recommendation is explicit and bounded. + +## Recommended Next Artifact + +- Executive briefing with the same recommendation and evidence trail. + +The board draft says expansion pipeline coverage is 42% above plan and should reach 120% next quarter.[1] + +## Sources + +- [1] Board pipeline pack, Q2 planning draft. + + diff --git a/benchmarks/fixtures/board-update-ambiguity/first-pass.md b/benchmarks/fixtures/board-update-ambiguity/first-pass.md new file mode 100644 index 0000000..917b700 --- /dev/null +++ b/benchmarks/fixtures/board-update-ambiguity/first-pass.md @@ -0,0 +1,101 @@ +# Strategy: Board Update Under Ambiguity + +## Decision Frame + +Recommendation: keep the current investment plan while evidence is still incomplete. + +## Unknowns & Evidence Gaps + +- Demand quality is improving, but not yet uniform across segments. + +## Pass/Fail Readiness + +PASS because the board-facing recommendation is explicit and bounded. + +## Recommended Next Artifact + +- Executive briefing with the same recommendation and evidence trail. 
+ +The board draft says expansion pipeline coverage is 42% above plan and should reach 120% next quarter, but the supporting source is not yet cited. + + diff --git a/benchmarks/fixtures/churn-conflicting-signals/final-pass.md b/benchmarks/fixtures/churn-conflicting-signals/final-pass.md new file mode 100644 index 0000000..e320209 --- /dev/null +++ b/benchmarks/fixtures/churn-conflicting-signals/final-pass.md @@ -0,0 +1,92 @@ +# PRD: Churn Rescue Pilot + +## Decision Frame + +Recommendation: run a churn rescue pilot for high-risk accounts. + +## Unknowns & Evidence Gaps + +- We still need to isolate the highest-signal intervention. + +## Pass/Fail Readiness + +PASS because the pilot can be run without a broader strategy rewrite. + +## Recommended Next Artifact + +- Sprint plan for the rescue pilot instrumentation. + + diff --git a/benchmarks/fixtures/churn-conflicting-signals/first-pass.md b/benchmarks/fixtures/churn-conflicting-signals/first-pass.md new file mode 100644 index 0000000..0690fbb --- /dev/null +++ b/benchmarks/fixtures/churn-conflicting-signals/first-pass.md @@ -0,0 +1,91 @@ +# PRD: Churn Rescue Pilot + +## Decision Frame + +Recommendation: run a churn rescue pilot for high-risk accounts. + +## Unknowns & Evidence Gaps + +- We still need to isolate the highest-signal intervention. + +## Pass/Fail Readiness + +PASS because the pilot can be run without a broader strategy rewrite. + +## Recommended Next Artifact + +- Sprint plan for the rescue pilot instrumentation. 
+ + diff --git a/benchmarks/fixtures/churn-conflicting-signals/related/strategy.json b/benchmarks/fixtures/churn-conflicting-signals/related/strategy.json new file mode 100644 index 0000000..08a9ac0 --- /dev/null +++ b/benchmarks/fixtures/churn-conflicting-signals/related/strategy.json @@ -0,0 +1,66 @@ +{ + "schema_version": "2.0.0", + "artifact_type": "strategy", + "mode": "rigorous", + "depth": "standard", + "metadata": { + "title": "Strategy: Churn Stabilization", + "status": "approved", + "authors": ["Shipwright"], + "updated_at": "2026-04-02" + }, + "decision_frame": { + "recommendation": "Improve retention across the existing base.", + "tradeoff": "Broader coverage, slower local optimization.", + "confidence": "medium", + "owner": "PM", + "decision_date": "2026-04-02" + }, + "unknowns": [], + "pass_fail_readiness": { + "status": "PASS", + "reason": "Strategy direction is approved." + }, + "evidence": [], + "payload": { + "vision": "Reduce churn without over-rotating the roadmap.", + "context": { + "current_state": { + "product_stage": "growth" + } + }, + "primary_segment": "high-risk accounts", + "bets": [ + { + "bet_id": "bet-retention", + "name": "Retention stabilization", + "thesis": "A broad retention program should improve overall results.", + "assumptions": [ + "The broad base can absorb a shared intervention plan." + ], + "investment_level": "moderate", + "success_metric": { + "metric_id": "metric-retention", + "name": "gross retention improvement", + "segment": "high-risk accounts", + "unit": "percent", + "timeframe": "90 days", + "baseline": 1, + "target": 8 + }, + "kill_criteria": "If retention does not improve, revisit the intervention model.", + "evidence_ids": [] + } + ], + "boundaries": { + "not_doing": [ + "Immediate pricing changes" + ] + }, + "review_cadence": { + "weekly": "Monitor retention movement.", + "monthly": "Review intervention outcomes.", + "quarterly": "Revisit strategy target." 
+ } + } +} diff --git a/benchmarks/fixtures/feature-weak-evidence/final-pass.md b/benchmarks/fixtures/feature-weak-evidence/final-pass.md new file mode 100644 index 0000000..dfd8aac --- /dev/null +++ b/benchmarks/fixtures/feature-weak-evidence/final-pass.md @@ -0,0 +1,79 @@ +# PRD: Insight Feed + +## Decision Frame + +Recommendation: build an insight feed for account managers. + +## Unknowns & Evidence Gaps + +- User demand is still mostly anecdotal. +- We still lack workflow evidence from live interviews. + +## Pass/Fail Readiness + +FAIL because the evidence base is still weak. + +## Recommended Next Artifact + +- Discovery interview prep for account managers. + + diff --git a/benchmarks/fixtures/feature-weak-evidence/first-pass.md b/benchmarks/fixtures/feature-weak-evidence/first-pass.md new file mode 100644 index 0000000..507c3a8 --- /dev/null +++ b/benchmarks/fixtures/feature-weak-evidence/first-pass.md @@ -0,0 +1,78 @@ +# PRD: Insight Feed + +## Decision Frame + +Recommendation: build an insight feed for account managers. + +## Unknowns & Evidence Gaps + +- User demand is still mostly anecdotal. + +## Pass/Fail Readiness + +FAIL because the evidence base is still weak. + +## Recommended Next Artifact + +- Discovery interview prep for account managers. + + diff --git a/benchmarks/fixtures/handoff-contradiction/final-pass.md b/benchmarks/fixtures/handoff-contradiction/final-pass.md new file mode 100644 index 0000000..2997bde --- /dev/null +++ b/benchmarks/fixtures/handoff-contradiction/final-pass.md @@ -0,0 +1,100 @@ +# PRD: Platform Handoff Alignment + +## Decision Frame + +Recommendation: ship the first platform handoff artifact for enterprise success teams. + +## Unknowns & Evidence Gaps + +- Observability coverage is still narrower than ideal. + +## Pass/Fail Readiness + +PASS because the handoff is now aligned to strategy and challenge resolution is explicit. + +## Recommended Next Artifact + +- Technical spec for the platform event stream. 
+ + diff --git a/benchmarks/fixtures/handoff-contradiction/first-pass.md b/benchmarks/fixtures/handoff-contradiction/first-pass.md new file mode 100644 index 0000000..81f1477 --- /dev/null +++ b/benchmarks/fixtures/handoff-contradiction/first-pass.md @@ -0,0 +1,91 @@ +# PRD: Platform Handoff Alignment + +## Decision Frame + +Recommendation: ship the first platform handoff artifact for self-serve SMB teams. + +## Unknowns & Evidence Gaps + +- Observability coverage is still narrower than ideal. + +## Pass/Fail Readiness + +PASS because the handoff can proceed while follow-up risk remains explicit. + +## Recommended Next Artifact + +- Technical spec for the platform event stream. + + diff --git a/benchmarks/fixtures/handoff-contradiction/related/challenge-report.json b/benchmarks/fixtures/handoff-contradiction/related/challenge-report.json new file mode 100644 index 0000000..93076c1 --- /dev/null +++ b/benchmarks/fixtures/handoff-contradiction/related/challenge-report.json @@ -0,0 +1,55 @@ +{ + "schema_version": "2.0.0", + "artifact_type": "challenge-report", + "mode": "rigorous", + "depth": "standard", + "metadata": { + "title": "Challenge: Platform Handoff Alignment", + "status": "approved", + "authors": ["Shipwright"], + "updated_at": "2026-04-02" + }, + "decision_frame": { + "recommendation": "Address observability risk before broad rollout.", + "tradeoff": "Slightly slower execution, lower operational surprise.", + "confidence": "medium", + "owner": "PM", + "decision_date": "2026-04-02" + }, + "unknowns": [], + "pass_fail_readiness": { + "status": "FAIL", + "reason": "Observability risk still needs an explicit disposition." 
+ }, + "evidence": [ + { + "evidence_id": "ev-handoff-challenge-1", + "kind": "document", + "source_ref": "platform-observability-review", + "confidence": "medium", + "supports": ["finding-observability-gap"] + } + ], + "payload": { + "reviewed_artifact": { + "title": "PRD: Platform Handoff Alignment", + "artifact_type": "prd" + }, + "depth": "standard", + "findings": [ + { + "finding_id": "finding-observability-gap", + "claim": "The rollout depends on observability that is not yet complete.", + "vector": "structural-honesty", + "severity": "moderate", + "rationale": "The first release can ship, but the monitoring gap should be acknowledged explicitly.", + "resolution_condition": "Resolve or waive the observability gap before broad rollout.", + "evidence_ids": ["ev-handoff-challenge-1"] + } + ], + "verdict": "Proceed only with explicit risk handling.", + "action_plan": [ + "Either add more observability or document a waiver." + ] + } +} diff --git a/benchmarks/fixtures/handoff-contradiction/related/strategy.json b/benchmarks/fixtures/handoff-contradiction/related/strategy.json new file mode 100644 index 0000000..4c46783 --- /dev/null +++ b/benchmarks/fixtures/handoff-contradiction/related/strategy.json @@ -0,0 +1,66 @@ +{ + "schema_version": "2.0.0", + "artifact_type": "strategy", + "mode": "rigorous", + "depth": "standard", + "metadata": { + "title": "Strategy: Platform Expansion", + "status": "approved", + "authors": ["Shipwright"], + "updated_at": "2026-04-02" + }, + "decision_frame": { + "recommendation": "Focus platform expansion on enterprise success teams first.", + "tradeoff": "Higher implementation effort now, better strategic fit.", + "confidence": "high", + "owner": "PM", + "decision_date": "2026-04-02" + }, + "unknowns": [], + "pass_fail_readiness": { + "status": "PASS", + "reason": "Platform direction is approved." 
+ }, + "evidence": [], + "payload": { + "vision": "Make platform workflows durable for enterprise operations first.", + "context": { + "current_state": { + "product_stage": "growth" + } + }, + "primary_segment": "enterprise success teams", + "bets": [ + { + "bet_id": "bet-platform-enterprise", + "name": "Enterprise-first handoff", + "thesis": "Enterprise success teams create the clearest platform leverage.", + "assumptions": [ + "Enterprise workflows have the strongest operational payoff." + ], + "investment_level": "major", + "success_metric": { + "metric_id": "metric-platform-success", + "name": "handoff success rate", + "segment": "enterprise success teams", + "unit": "percent", + "timeframe": "30 days", + "baseline": 71, + "target": 85 + }, + "kill_criteria": "If enterprise adoption stalls, revisit the segment sequence.", + "evidence_ids": [] + } + ], + "boundaries": { + "not_doing": [ + "Broad SMB-first rollout" + ] + }, + "review_cadence": { + "weekly": "Track engineering progress.", + "monthly": "Review rollout readiness.", + "quarterly": "Revisit platform sequence." + } + } +} diff --git a/benchmarks/fixtures/prd-hidden-scope-creep/final-pass.md b/benchmarks/fixtures/prd-hidden-scope-creep/final-pass.md new file mode 100644 index 0000000..94505d8 --- /dev/null +++ b/benchmarks/fixtures/prd-hidden-scope-creep/final-pass.md @@ -0,0 +1,100 @@ +# PRD: Team Inbox Workflow Handoff + +## Decision Frame + +Recommendation: ship the workflow handoff improvement with a limited rollout. + +## Unknowns & Evidence Gaps + +- Manager routing is explicitly deferred from v1. +- Escalation ownership still needs a follow-up decision log. + +## Pass/Fail Readiness + +PASS because the hidden scope is now contained. + +## Recommended Next Artifact + +- Technical handoff for workflow audit trail delivery. 
+ + diff --git a/benchmarks/fixtures/prd-hidden-scope-creep/first-pass.md b/benchmarks/fixtures/prd-hidden-scope-creep/first-pass.md new file mode 100644 index 0000000..085b4a1 --- /dev/null +++ b/benchmarks/fixtures/prd-hidden-scope-creep/first-pass.md @@ -0,0 +1,89 @@ +# PRD: Team Inbox Workflow Handoff + +## Decision Frame + +Recommendation: ship the workflow handoff improvement with a limited rollout. + +## Unknowns & Evidence Gaps + +- Manager routing rules are still ambiguous. +- Engineering ownership of escalation handling is not explicit. + +## Pass/Fail Readiness + +FAIL until the scope challenge is resolved. + + diff --git a/benchmarks/fixtures/prd-hidden-scope-creep/related/challenge-report.json b/benchmarks/fixtures/prd-hidden-scope-creep/related/challenge-report.json new file mode 100644 index 0000000..64f8086 --- /dev/null +++ b/benchmarks/fixtures/prd-hidden-scope-creep/related/challenge-report.json @@ -0,0 +1,58 @@ +{ + "schema_version": "2.0.0", + "artifact_type": "challenge-report", + "mode": "rigorous", + "depth": "standard", + "metadata": { + "title": "Challenge: Team Inbox Workflow Handoff", + "status": "approved", + "authors": ["Shipwright"], + "updated_at": "2026-04-02" + }, + "decision_frame": { + "recommendation": "Do not approve until hidden workflow scope is contained.", + "tradeoff": "Delay approval to avoid downstream delivery churn.", + "confidence": "high", + "owner": "PM", + "decision_date": "2026-04-02" + }, + "unknowns": [ + "Instrumentation ownership is still unclear." + ], + "pass_fail_readiness": { + "status": "FAIL", + "reason": "Critical hidden-scope finding is unresolved." 
+ }, + "evidence": [ + { + "evidence_id": "ev-challenge-1", + "kind": "document", + "source_ref": "design-review-notes", + "confidence": "high", + "supports": ["finding-scope-creep"] + } + ], + "payload": { + "reviewed_artifact": { + "title": "PRD: Team Inbox Workflow Handoff", + "artifact_type": "prd" + }, + "depth": "standard", + "findings": [ + { + "finding_id": "finding-scope-creep", + "claim": "The PRD implies manager routing work without naming it in scope boundaries.", + "vector": "scope-discipline", + "severity": "critical", + "rationale": "Hidden workflow work will spill into engineering estimates and rollout planning.", + "resolution_condition": "Either remove manager routing from scope or mark it explicitly out of scope with a follow-up owner.", + "evidence_ids": ["ev-challenge-1"] + } + ], + "verdict": "Revision required before approval.", + "action_plan": [ + "Clarify scope boundaries.", + "Record a resolution state in the revised artifact." + ] + } +} diff --git a/benchmarks/fixtures/pricing-partial-data/final-pass.md b/benchmarks/fixtures/pricing-partial-data/final-pass.md new file mode 100644 index 0000000..65f0d77 --- /dev/null +++ b/benchmarks/fixtures/pricing-partial-data/final-pass.md @@ -0,0 +1,99 @@ +# Strategy: Pricing Reset with Partial Data + +## Decision Frame + +Recommendation: hold the packaging change until evidence improves. + +## Unknowns & Evidence Gaps + +- We still need more pricing interviews before approving a rollout decision. + +## Pass/Fail Readiness + +FAIL because the data is still incomplete for approval. + +## Recommended Next Artifact + +- Pricing research brief with a fixed interview and survey plan. 
+ + diff --git a/benchmarks/fixtures/pricing-partial-data/first-pass.md b/benchmarks/fixtures/pricing-partial-data/first-pass.md new file mode 100644 index 0000000..bc5d250 --- /dev/null +++ b/benchmarks/fixtures/pricing-partial-data/first-pass.md @@ -0,0 +1,89 @@ +# Strategy: Pricing Reset with Partial Data + +## Decision Frame + +Recommendation: hold the packaging change until evidence improves. + +## Unknowns & Evidence Gaps + +- We only have partial willingness-to-pay interviews. +- Competitor packaging was sampled, not exhaustively mapped. + +## Pass/Fail Readiness + +FAIL because the current recommendation is still under-evidenced. + +## Recommended Next Artifact + +- Pricing research brief with a tighter interview plan. + + diff --git a/benchmarks/results/README.md b/benchmarks/results/README.md new file mode 100644 index 0000000..fba977f --- /dev/null +++ b/benchmarks/results/README.md @@ -0,0 +1,3 @@ +Generated benchmark summaries can be written here with `node scripts/run-benchmarks.mjs --out benchmarks/results/latest.json --format json`. + +Until blind human review has been run, `mean_first_pass_blind_rating` and `mean_final_pass_blind_rating` will remain `null` in the generated summaries. 
diff --git a/benchmarks/scenarios/board-update-ambiguity.json b/benchmarks/scenarios/board-update-ambiguity.json new file mode 100644 index 0000000..ec77ab7 --- /dev/null +++ b/benchmarks/scenarios/board-update-ambiguity.json @@ -0,0 +1,35 @@ +{ + "id": "board-update-ambiguity", + "title": "Board update under ambiguity", + "inputs": { + "prompt": "Write a strategy update for a board audience under ambiguity.", + "context_files": [], + "expected_artifact_type": "strategy", + "scoring_spec_ref": "docs/shipwright-v2-benchmark-scoring-spec.md" + }, + "validator": { + "expect_sections": [ + "Decision Frame", + "Unknowns & Evidence Gaps", + "Pass/Fail Readiness", + "Recommended Next Artifact" + ], + "expect_structured": true + }, + "fixtures": { + "first_pass_artifact": "../fixtures/board-update-ambiguity/first-pass.md", + "final_pass_artifact": "../fixtures/board-update-ambiguity/final-pass.md", + "related_artifacts": [], + "blind_review": null + }, + "run_metadata": { + "time_to_first_usable_artifact_seconds": 180, + "revision_count": 0 + }, + "measures": [ + "time_to_first_usable_artifact", + "revision_count", + "contradiction_count", + "blind_human_rating" + ] +} diff --git a/benchmarks/scenarios/churn-conflicting-signals.json b/benchmarks/scenarios/churn-conflicting-signals.json new file mode 100644 index 0000000..de180f9 --- /dev/null +++ b/benchmarks/scenarios/churn-conflicting-signals.json @@ -0,0 +1,37 @@ +{ + "id": "churn-conflicting-signals", + "title": "Churn diagnosis with conflicting signals", + "inputs": { + "prompt": "Write a churn reduction PRD when signals conflict with strategy targets.", + "context_files": [], + "expected_artifact_type": "prd", + "scoring_spec_ref": "docs/shipwright-v2-benchmark-scoring-spec.md" + }, + "validator": { + "expect_sections": [ + "Decision Frame", + "Unknowns & Evidence Gaps", + "Pass/Fail Readiness", + "Recommended Next Artifact" + ], + "expect_structured": true + }, + "fixtures": { + "first_pass_artifact": 
"../fixtures/churn-conflicting-signals/first-pass.md", + "final_pass_artifact": "../fixtures/churn-conflicting-signals/final-pass.md", + "related_artifacts": [ + "../fixtures/churn-conflicting-signals/related/strategy.json" + ], + "blind_review": null + }, + "run_metadata": { + "time_to_first_usable_artifact_seconds": 210, + "revision_count": 1 + }, + "measures": [ + "time_to_first_usable_artifact", + "revision_count", + "contradiction_count", + "blind_human_rating" + ] +} diff --git a/benchmarks/scenarios/feature-weak-evidence.json b/benchmarks/scenarios/feature-weak-evidence.json new file mode 100644 index 0000000..7846bf8 --- /dev/null +++ b/benchmarks/scenarios/feature-weak-evidence.json @@ -0,0 +1,35 @@ +{ + "id": "feature-weak-evidence", + "title": "New feature with weak evidence", + "inputs": { + "prompt": "Draft a PRD for a new feature with weak supporting evidence.", + "context_files": [], + "expected_artifact_type": "prd", + "scoring_spec_ref": "docs/shipwright-v2-benchmark-scoring-spec.md" + }, + "validator": { + "expect_sections": [ + "Decision Frame", + "Unknowns & Evidence Gaps", + "Pass/Fail Readiness", + "Recommended Next Artifact" + ], + "expect_structured": true + }, + "fixtures": { + "first_pass_artifact": "../fixtures/feature-weak-evidence/first-pass.md", + "final_pass_artifact": "../fixtures/feature-weak-evidence/final-pass.md", + "related_artifacts": [], + "blind_review": null + }, + "run_metadata": { + "time_to_first_usable_artifact_seconds": null, + "revision_count": 2 + }, + "measures": [ + "time_to_first_usable_artifact", + "revision_count", + "contradiction_count", + "blind_human_rating" + ] +} diff --git a/benchmarks/scenarios/handoff-contradiction.json b/benchmarks/scenarios/handoff-contradiction.json new file mode 100644 index 0000000..46b5c5f --- /dev/null +++ b/benchmarks/scenarios/handoff-contradiction.json @@ -0,0 +1,38 @@ +{ + "id": "handoff-contradiction", + "title": "Handoff artifact with cross-document contradictions", + 
"inputs": { + "prompt": "Write a technical handoff PRD aligned to a platform strategy and challenge review.", + "context_files": [], + "expected_artifact_type": "prd", + "scoring_spec_ref": "docs/shipwright-v2-benchmark-scoring-spec.md" + }, + "validator": { + "expect_sections": [ + "Decision Frame", + "Unknowns & Evidence Gaps", + "Pass/Fail Readiness", + "Recommended Next Artifact" + ], + "expect_structured": true + }, + "fixtures": { + "first_pass_artifact": "../fixtures/handoff-contradiction/first-pass.md", + "final_pass_artifact": "../fixtures/handoff-contradiction/final-pass.md", + "related_artifacts": [ + "../fixtures/handoff-contradiction/related/strategy.json", + "../fixtures/handoff-contradiction/related/challenge-report.json" + ], + "blind_review": null + }, + "run_metadata": { + "time_to_first_usable_artifact_seconds": 240, + "revision_count": 1 + }, + "measures": [ + "time_to_first_usable_artifact", + "revision_count", + "contradiction_count", + "blind_human_rating" + ] +} diff --git a/benchmarks/scenarios/prd-hidden-scope-creep.json b/benchmarks/scenarios/prd-hidden-scope-creep.json new file mode 100644 index 0000000..70ee187 --- /dev/null +++ b/benchmarks/scenarios/prd-hidden-scope-creep.json @@ -0,0 +1,37 @@ +{ + "id": "prd-hidden-scope-creep", + "title": "PRD with hidden scope creep", + "inputs": { + "prompt": "Write a PRD for a team inbox workflow handoff improvement.", + "context_files": [], + "expected_artifact_type": "prd", + "scoring_spec_ref": "docs/shipwright-v2-benchmark-scoring-spec.md" + }, + "validator": { + "expect_sections": [ + "Decision Frame", + "Unknowns & Evidence Gaps", + "Pass/Fail Readiness", + "Recommended Next Artifact" + ], + "expect_structured": true + }, + "fixtures": { + "first_pass_artifact": "../fixtures/prd-hidden-scope-creep/first-pass.md", + "final_pass_artifact": "../fixtures/prd-hidden-scope-creep/final-pass.md", + "related_artifacts": [ + "../fixtures/prd-hidden-scope-creep/related/challenge-report.json" + ], + 
"blind_review": null + }, + "run_metadata": { + "time_to_first_usable_artifact_seconds": 420, + "revision_count": 1 + }, + "measures": [ + "time_to_first_usable_artifact", + "revision_count", + "contradiction_count", + "blind_human_rating" + ] +} diff --git a/benchmarks/scenarios/pricing-partial-data.json b/benchmarks/scenarios/pricing-partial-data.json new file mode 100644 index 0000000..43c2cbb --- /dev/null +++ b/benchmarks/scenarios/pricing-partial-data.json @@ -0,0 +1,35 @@ +{ + "id": "pricing-partial-data", + "title": "Pricing change with partial market data", + "inputs": { + "prompt": "Draft a pricing strategy recommendation under partial market data.", + "context_files": [], + "expected_artifact_type": "strategy", + "scoring_spec_ref": "docs/shipwright-v2-benchmark-scoring-spec.md" + }, + "validator": { + "expect_sections": [ + "Decision Frame", + "Unknowns & Evidence Gaps", + "Pass/Fail Readiness", + "Recommended Next Artifact" + ], + "expect_structured": true + }, + "fixtures": { + "first_pass_artifact": "../fixtures/pricing-partial-data/first-pass.md", + "final_pass_artifact": "../fixtures/pricing-partial-data/final-pass.md", + "related_artifacts": [], + "blind_review": null + }, + "run_metadata": { + "time_to_first_usable_artifact_seconds": 540, + "revision_count": 1 + }, + "measures": [ + "time_to_first_usable_artifact", + "revision_count", + "contradiction_count", + "blind_human_rating" + ] +} diff --git a/scripts/run-benchmarks.mjs b/scripts/run-benchmarks.mjs new file mode 100644 index 0000000..c5580ce --- /dev/null +++ b/scripts/run-benchmarks.mjs @@ -0,0 +1,426 @@ +#!/usr/bin/env node + +import { mkdir, readdir, readFile, writeFile } from 'node:fs/promises'; +import path from 'node:path'; +import { pathToFileURL } from 'node:url'; + +import { extractStructuredArtifact } from './extract-structured-artifact.mjs'; +import { + IssueType, + Severity, + validateArtifact, +} from './validate-artifact.mjs'; + +const DEFAULT_SCENARIO_DIR = 
path.resolve('benchmarks', 'scenarios'); +const DEFAULT_SCORING_SPEC_REF = 'docs/shipwright-v2-benchmark-scoring-spec.md'; + +const SCORE_DIMENSIONS = Object.freeze([ + 'decision_usefulness', + 'evidence_discipline', + 'internal_consistency', + 'actionability', +]); + +const CONTRADICTION_TYPES = new Set([ + IssueType.METRIC_CONTRADICTION, + IssueType.SEGMENT_CONTRADICTION, + IssueType.CHALLENGE_FINDING_UNRESOLVED, +]); + +const USABILITY_BLOCKING_WARNING_TYPES = new Set([ + IssueType.MISSING_SECTION, + IssueType.MISSING_STRUCTURED_ARTIFACT, +]); + +export async function loadBenchmarkScenario(filePath) { + const resolved = path.resolve(filePath); + const raw = JSON.parse(await readFile(resolved, 'utf8')); + + if (!raw || typeof raw !== 'object' || Array.isArray(raw)) { + throw new Error(`Benchmark scenario must be a JSON object: ${filePath}`); + } + + if (typeof raw.id !== 'string' || raw.id.trim().length === 0) { + throw new Error(`Benchmark scenario is missing id: ${filePath}`); + } + + if (typeof raw.inputs?.expected_artifact_type !== 'string') { + throw new Error(`Benchmark scenario is missing inputs.expected_artifact_type: ${filePath}`); + } + + if (typeof raw.fixtures?.first_pass_artifact !== 'string') { + throw new Error(`Benchmark scenario is missing fixtures.first_pass_artifact: ${filePath}`); + } + + if (typeof raw.fixtures?.final_pass_artifact !== 'string') { + throw new Error(`Benchmark scenario is missing fixtures.final_pass_artifact: ${filePath}`); + } + + return { + ...raw, + source_path: resolved, + }; +} + +export async function runBenchmarkScenario(scenario, options = {}) { + const scenarioRecord = scenario?.source_path + ? 
scenario + : await loadBenchmarkScenario(scenario); + + const relatedArtifacts = await loadRelatedArtifacts(scenarioRecord); + const blindReview = await loadBlindReview(scenarioRecord); + const firstPass = await evaluateScenarioPass( + scenarioRecord, + 'first_pass', + relatedArtifacts, + blindReview, + ); + const finalPass = await evaluateScenarioPass( + scenarioRecord, + 'final_pass', + relatedArtifacts, + blindReview, + ); + + const status = deriveScenarioStatus(finalPass); + const result = { + scenario_id: scenarioRecord.id, + status, + first_pass: { + usable: firstPass.usable, + validator_error_count: firstPass.validator_error_count, + contradiction_count: firstPass.contradiction_count, + blind_rating: firstPass.blind_rating, + }, + final_pass: { + usable: finalPass.usable, + time_to_first_usable_artifact_seconds: finalPass.usable + ? normalizeTimeToFirstUsable( + scenarioRecord.run_metadata?.time_to_first_usable_artifact_seconds, + scenarioRecord.id, + ) + : null, + revision_count: normalizeRevisionCount( + scenarioRecord.run_metadata?.revision_count, + scenarioRecord.id, + ), + validator_error_count: finalPass.validator_error_count, + contradiction_count: finalPass.contradiction_count, + blind_rating: finalPass.blind_rating, + }, + delta: { + usable_changed: finalPass.usable !== firstPass.usable, + blind_rating_change: computeNumericDelta( + finalPass.blind_rating, + firstPass.blind_rating, + ), + contradiction_count_change: + finalPass.contradiction_count - firstPass.contradiction_count, + validator_error_count_change: + finalPass.validator_error_count - firstPass.validator_error_count, + }, + diagnostics: { + title: scenarioRecord.title, + expected_artifact_type: scenarioRecord.inputs.expected_artifact_type, + scoring_spec_ref: + scenarioRecord.inputs.scoring_spec_ref || DEFAULT_SCORING_SPEC_REF, + first_pass_issue_types: firstPass.issues.map((issue) => issue.type), + final_pass_issue_types: finalPass.issues.map((issue) => issue.type), + }, + }; + + return 
result; +} + +export async function runBenchmarkSuite(options = {}) { + const scenarioDir = path.resolve(options.scenarioDir || DEFAULT_SCENARIO_DIR); + const scenarioIds = new Set(options.scenarioIds || []); + const scenarioFiles = await discoverScenarioFiles(scenarioDir); + const selectedScenarioFiles = scenarioFiles.filter((filePath) => { + if (scenarioIds.size === 0) return true; + return scenarioIds.has(path.basename(filePath, '.json')); + }); + + if (selectedScenarioFiles.length === 0) { + throw new Error(`No benchmark scenarios found in ${scenarioDir}`); + } + + const results = []; + for (const scenarioFile of selectedScenarioFiles) { + const scenario = await loadBenchmarkScenario(scenarioFile); + results.push(await runBenchmarkScenario(scenario)); + } + + const statusCounts = { PASS: 0, FAIL: 0, DNF: 0 }; + for (const result of results) { + statusCounts[result.status] += 1; + } + + const summary = { + generated_at: new Date().toISOString(), + scoring_spec_ref: DEFAULT_SCORING_SPEC_REF, + scenario_count: results.length, + status_counts: statusCounts, + mean_first_pass_blind_rating: computeMean( + results.map((result) => result.first_pass.blind_rating), + ), + mean_final_pass_blind_rating: computeMean( + results.map((result) => result.final_pass.blind_rating), + ), + results, + }; + + if (options.outPath) { + const outPath = path.resolve(options.outPath); + await mkdir(path.dirname(outPath), { recursive: true }); + await writeFile(`${outPath}`, `${JSON.stringify(summary, null, 2)}\n`, 'utf8'); + } + + return summary; +} + +function normalizeTimeToFirstUsable(value, scenarioId) { + if (value === null || value === undefined) { + throw new Error(`Scenario "${scenarioId}" is missing run_metadata.time_to_first_usable_artifact_seconds.`); + } + + if (!Number.isInteger(value) || value < 0) { + throw new Error( + `Scenario "${scenarioId}" has invalid time_to_first_usable_artifact_seconds.`, + ); + } + + return value; +} + +function normalizeRevisionCount(value, 
scenarioId) { + if (!Number.isInteger(value) || value < 0) { + throw new Error(`Scenario "${scenarioId}" has invalid run_metadata.revision_count.`); + } + return value; +} + +async function evaluateScenarioPass(scenario, passKey, relatedArtifacts, blindReview) { + const fixturePath = resolveScenarioPath( + scenario, + scenario.fixtures[`${passKey}_artifact`], + ); + const text = await readFile(fixturePath, 'utf8'); + const validation = validateArtifact(text, buildValidationOptions(scenario, relatedArtifacts)); + + return { + issues: validation.issues, + artifact: validation.artifact, + usable: isArtifactUsable(validation.issues), + validator_error_count: validation.issues.filter( + (issue) => issue.severity === Severity.ERROR, + ).length, + contradiction_count: countContradictions(validation.issues), + blind_rating: computeBlindRating(blindReview, passKey), + }; +} + +function buildValidationOptions(scenario, relatedArtifacts) { + return { + expectSections: Array.isArray(scenario.validator?.expect_sections) + ? scenario.validator.expect_sections + : [], + expectStructured: + scenario.validator?.expect_structured ?? + Boolean(scenario.inputs?.expected_artifact_type), + artifactType: scenario.inputs?.expected_artifact_type, + relatedArtifacts, + }; +} + +function isArtifactUsable(issues) { + for (const issue of issues) { + if (issue.severity === Severity.ERROR) return false; + if (USABILITY_BLOCKING_WARNING_TYPES.has(issue.type)) return false; + } + return true; +} + +function countContradictions(issues) { + const signatures = new Set(); + + for (const issue of issues) { + if (!CONTRADICTION_TYPES.has(issue.type)) continue; + signatures.add(`${issue.type}|${issue.lineNumber}|${issue.message}`); + } + + return signatures.size; +} + +function computeBlindRating(blindReview, passKey) { + if (!blindReview) return null; + + const raters = Array.isArray(blindReview.raters) ? 
blindReview.raters : []; + if (raters.length < 3) { + throw new Error('Blind review requires at least 3 raters.'); + } + + let total = 0; + for (const rater of raters) { + const passScores = rater?.[passKey]; + if (!passScores || typeof passScores !== 'object') { + throw new Error(`Blind review is missing ${passKey} scores for rater "${rater?.rater_id || 'unknown'}".`); + } + + let raterTotal = 0; + for (const dimension of SCORE_DIMENSIONS) { + const score = passScores[dimension]; + if (!Number.isFinite(score) || score < 1 || score > 5) { + throw new Error( + `Blind review score must be between 1 and 5 for ${dimension}.`, + ); + } + raterTotal += score; + } + + total += raterTotal / SCORE_DIMENSIONS.length; + } + + return roundToOneDecimal((total / raters.length / 5) * 100); +} + +function deriveScenarioStatus(finalPass) { + if (!finalPass.usable) return 'DNF'; + + const readinessStatus = finalPass.artifact?.pass_fail_readiness?.status; + return readinessStatus === 'FAIL' ? 'FAIL' : 'PASS'; +} + +function computeNumericDelta(finalValue, firstValue) { + if (!Number.isFinite(finalValue) || !Number.isFinite(firstValue)) { + return null; + } + + return roundToOneDecimal(finalValue - firstValue); +} + +function computeMean(values) { + const numericValues = values.filter((value) => Number.isFinite(value)); + if (numericValues.length === 0) return null; + + const sum = numericValues.reduce((total, value) => total + value, 0); + return roundToOneDecimal(sum / numericValues.length); +} + +function roundToOneDecimal(value) { + return Math.round(value * 10) / 10; +} + +async function discoverScenarioFiles(scenarioDir) { + const entries = await readdir(scenarioDir, { withFileTypes: true }); + return entries + .filter((entry) => entry.isFile() && entry.name.endsWith('.json')) + .map((entry) => path.join(scenarioDir, entry.name)) + .sort(); +} + +async function loadRelatedArtifacts(scenario) { + const relatedPaths = Array.isArray(scenario.fixtures?.related_artifacts) + ? 
scenario.fixtures.related_artifacts + : []; + + const relatedArtifacts = []; + for (const relatedPath of relatedPaths) { + const raw = await readFile(resolveScenarioPath(scenario, relatedPath), 'utf8'); + const ext = path.extname(relatedPath).toLowerCase(); + if (ext === '.json') { + relatedArtifacts.push(JSON.parse(raw)); + continue; + } + + const extracted = extractStructuredArtifact(raw); + if (extracted.error) { + throw new Error( + `Related artifact contains invalid structured JSON: ${relatedPath}: ${extracted.error}`, + ); + } + if (!extracted.artifact) { + throw new Error(`Related artifact is missing structured payload: ${relatedPath}`); + } + relatedArtifacts.push(extracted.artifact); + } + + return relatedArtifacts; +} + +async function loadBlindReview(scenario) { + const reviewPath = scenario.fixtures?.blind_review; + if (!reviewPath) return null; + + return JSON.parse( + await readFile(resolveScenarioPath(scenario, reviewPath), 'utf8'), + ); +} + +function resolveScenarioPath(scenario, relativePath) { + return path.resolve(path.dirname(scenario.source_path), relativePath); +} + +function collectFlagValues(argv, flagName) { + const values = []; + for (let i = 0; i < argv.length; i += 1) { + if (argv[i] === flagName && argv[i + 1]) { + values.push(argv[i + 1]); + i += 1; + } + } + return values; +} + +function readFlagValue(argv, flagName, fallback) { + const index = argv.findIndex((arg) => arg === flagName); + if (index !== -1 && argv[index + 1]) return argv[index + 1]; + return fallback; +} + +function formatSuiteSummary(summary) { + const lines = [ + `Benchmark suite: ${summary.scenario_count} scenario(s)`, + `Status counts: PASS ${summary.status_counts.PASS} | FAIL ${summary.status_counts.FAIL} | DNF ${summary.status_counts.DNF}`, + ]; + + for (const result of summary.results) { + lines.push( + `- ${result.status} ${result.scenario_id} | revisions ${result.final_pass.revision_count} | first-pass errors ${result.first_pass.validator_error_count} | final 
errors ${result.final_pass.validator_error_count}`, + ); + } + + return lines.join('\n'); +} + +async function main(argv = process.argv.slice(2)) { + const scenarioDir = readFlagValue(argv, '--scenario-dir', DEFAULT_SCENARIO_DIR); + const outPath = readFlagValue(argv, '--out', null); + const format = readFlagValue(argv, '--format', 'text'); + const scenarioIds = collectFlagValues(argv, '--scenario'); + + const summary = await runBenchmarkSuite({ + scenarioDir, + scenarioIds, + outPath, + }); + + if (format === 'json') { + console.log(JSON.stringify(summary, null, 2)); + return; + } + + console.log(formatSuiteSummary(summary)); +} + +function isDirectRun() { + if (!process.argv[1]) return false; + return import.meta.url === pathToFileURL(path.resolve(process.argv[1])).href; +} + +if (isDirectRun()) { + main().catch((error) => { + console.error(error instanceof Error ? error.message : String(error)); + process.exitCode = 1; + }); +} diff --git a/tests/run-benchmarks.test.mjs b/tests/run-benchmarks.test.mjs new file mode 100644 index 0000000..2635919 --- /dev/null +++ b/tests/run-benchmarks.test.mjs @@ -0,0 +1,355 @@ +import assert from 'node:assert/strict'; +import { mkdir, mkdtemp, writeFile } from 'node:fs/promises'; +import os from 'node:os'; +import path from 'node:path'; +import test from 'node:test'; + +import { + loadBenchmarkScenario, + runBenchmarkScenario, + runBenchmarkSuite, +} from '../scripts/run-benchmarks.mjs'; + +test('runBenchmarkSuite evaluates the default benchmark fixture suite', { concurrency: false }, async () => { + const summary = await runBenchmarkSuite({ + scenarioDir: path.resolve('benchmarks/scenarios'), + }); + + assert.equal(summary.scenario_count, 6); + assert.deepEqual(summary.status_counts, { + PASS: 4, + FAIL: 1, + DNF: 1, + }); + + const pricingScenario = summary.results.find((result) => result.scenario_id === 'pricing-partial-data'); + assert.equal(pricingScenario.status, 'FAIL'); + assert.equal(pricingScenario.final_pass.usable, 
true); + + const boardScenario = summary.results.find((result) => result.scenario_id === 'board-update-ambiguity'); + assert.ok(boardScenario.diagnostics.first_pass_issue_types.includes('unsupported-numeric')); + assert.equal(boardScenario.diagnostics.final_pass_issue_types.length, 0); + + const dnfScenario = summary.results.find((result) => result.scenario_id === 'feature-weak-evidence'); + assert.equal(dnfScenario.status, 'DNF'); + assert.equal(dnfScenario.final_pass.usable, false); + assert.equal(dnfScenario.final_pass.time_to_first_usable_artifact_seconds, null); +}); + +test('runBenchmarkSuite filters to requested scenario ids', { concurrency: false }, async () => { + const summary = await runBenchmarkSuite({ + scenarioDir: path.resolve('benchmarks/scenarios'), + scenarioIds: ['board-update-ambiguity', 'pricing-partial-data'], + }); + + assert.equal(summary.scenario_count, 2); + assert.deepEqual(summary.status_counts, { + PASS: 1, + FAIL: 1, + DNF: 0, + }); + assert.deepEqual( + summary.results.map((result) => result.scenario_id), + ['board-update-ambiguity', 'pricing-partial-data'], + ); +}); + +test('runBenchmarkScenario normalizes blind ratings and computes deltas', { concurrency: false }, async () => { + const rootDir = await mkdtemp(path.join(os.tmpdir(), 'shipwright-benchmarks-')); + const scenarioFile = await writeScenarioFixture(rootDir, { + id: 'blind-rating-normalization', + artifactType: 'prd', + runMetadata: { + time_to_first_usable_artifact_seconds: 300, + revision_count: 1, + }, + firstArtifact: (() => { + const artifact = basePrdArtifact(); + artifact.evidence = []; + artifact.payload.customer_evidence_ids = []; + artifact.payload.success_metrics[0].evidence_ids = []; + artifact.pass_fail_readiness.reason = 'Missing evidence.'; + return artifact; + })(), + finalArtifact: basePrdArtifact(), + blindReview: { + raters: [ + makeRater('r1', 2, 4), + makeRater('r2', 2, 4), + makeRater('r3', 2, 4), + ], + }, + }); + + const result = await 
runBenchmarkScenario(scenarioFile); + assert.equal(result.status, 'PASS'); + assert.equal(result.first_pass.usable, false); + assert.equal(result.final_pass.usable, true); + assert.equal(result.first_pass.blind_rating, 40); + assert.equal(result.final_pass.blind_rating, 80); + assert.equal(result.delta.blind_rating_change, 40); + assert.equal(result.final_pass.time_to_first_usable_artifact_seconds, 300); +}); + +test('runBenchmarkScenario preserves diagnostic blind ratings for DNF scenarios', { concurrency: false }, async () => { + const rootDir = await mkdtemp(path.join(os.tmpdir(), 'shipwright-benchmarks-')); + const firstArtifact = basePrdArtifact(); + firstArtifact.evidence = []; + firstArtifact.payload.customer_evidence_ids = []; + firstArtifact.payload.success_metrics[0].evidence_ids = []; + firstArtifact.pass_fail_readiness.status = 'FAIL'; + firstArtifact.pass_fail_readiness.reason = 'Still under-evidenced.'; + firstArtifact.decision_frame.owner = ''; + + const finalArtifact = basePrdArtifact(); + finalArtifact.evidence = []; + finalArtifact.payload.customer_evidence_ids = []; + finalArtifact.payload.success_metrics[0].evidence_ids = []; + finalArtifact.pass_fail_readiness.status = 'FAIL'; + finalArtifact.pass_fail_readiness.reason = 'Still under-evidenced after one revision.'; + + const scenarioFile = await writeScenarioFixture(rootDir, { + id: 'dnf-diagnostic-rating', + artifactType: 'prd', + runMetadata: { + time_to_first_usable_artifact_seconds: null, + revision_count: 2, + }, + firstArtifact, + finalArtifact, + blindReview: { + raters: [ + makeRater('r1', 1, 3), + makeRater('r2', 1, 3), + makeRater('r3', 1, 3), + ], + }, + }); + + const result = await runBenchmarkScenario(scenarioFile); + assert.equal(result.status, 'DNF'); + assert.equal(result.final_pass.usable, false); + assert.equal(result.final_pass.time_to_first_usable_artifact_seconds, null); + assert.equal(result.first_pass.validator_error_count, 2); + 
assert.equal(result.final_pass.validator_error_count, 1); + assert.equal(result.delta.validator_error_count_change, -1); + assert.equal(result.final_pass.blind_rating, 60); + assert.equal(result.delta.blind_rating_change, 40); +}); + +test('loadBenchmarkScenario rejects scenarios missing id', { concurrency: false }, async () => { + const rootDir = await mkdtemp(path.join(os.tmpdir(), 'shipwright-benchmarks-')); + const scenarioPath = path.join(rootDir, 'missing-id.json'); + await writeFile( + scenarioPath, + `${JSON.stringify({ + title: 'missing id', + inputs: { expected_artifact_type: 'prd' }, + fixtures: { + first_pass_artifact: '../fixtures/example/first-pass.md', + final_pass_artifact: '../fixtures/example/final-pass.md', + }, + }, null, 2)}\n`, + 'utf8', + ); + + await assert.rejects( + loadBenchmarkScenario(scenarioPath), + /missing id/, + ); +}); + +test('loadBenchmarkScenario rejects scenarios missing first-pass fixture path', { concurrency: false }, async () => { + const rootDir = await mkdtemp(path.join(os.tmpdir(), 'shipwright-benchmarks-')); + const scenarioPath = path.join(rootDir, 'missing-first-pass.json'); + await writeFile( + scenarioPath, + `${JSON.stringify({ + id: 'missing-first-pass', + title: 'missing first pass fixture', + inputs: { expected_artifact_type: 'prd' }, + fixtures: { + final_pass_artifact: '../fixtures/example/final-pass.md', + }, + }, null, 2)}\n`, + 'utf8', + ); + + await assert.rejects( + loadBenchmarkScenario(scenarioPath), + /missing fixtures\.first_pass_artifact/, + ); +}); + +function makeRater(raterId, firstScore, finalScore) { + return { + rater_id: raterId, + first_pass: { + decision_usefulness: firstScore, + evidence_discipline: firstScore, + internal_consistency: firstScore, + actionability: firstScore, + }, + final_pass: { + decision_usefulness: finalScore, + evidence_discipline: finalScore, + internal_consistency: finalScore, + actionability: finalScore, + }, + }; +} + +function basePrdArtifact() { + return { + 
schema_version: '2.0.0', + artifact_type: 'prd', + mode: 'rigorous', + depth: 'standard', + metadata: { + title: 'PRD: Test Artifact', + status: 'approved', + authors: ['Shipwright'], + updated_at: '2026-04-02', + }, + decision_frame: { + recommendation: 'Ship the bounded pilot.', + tradeoff: 'Moves quickly with known follow-up work.', + confidence: 'medium', + owner: 'PM', + decision_date: '2026-04-02', + }, + unknowns: ['Which edge case shows up first.'], + pass_fail_readiness: { + status: 'PASS', + reason: 'The artifact is structured and evidence-linked.', + }, + evidence: [ + { + evidence_id: 'ev-test-1', + kind: 'document', + source_ref: 'test-source', + confidence: 'high', + supports: [ + 'decision_frame.recommendation', + 'problem-test', + 'metric-test', + ], + }, + ], + payload: { + problem_statement: { + problem_id: 'problem-test', + text: 'Teams need a more reliable handoff flow.', + }, + customer_evidence_ids: ['ev-test-1'], + success_metrics: [ + { + metric_id: 'metric-test', + name: 'handoff success rate', + segment: 'operations teams', + unit: 'percent', + timeframe: '30 days', + baseline: 55, + target: 75, + evidence_ids: ['ev-test-1'], + }, + ], + scope: { + in: ['Retry flow'], + out: ['Migration tooling'], + }, + open_questions: ['Which notification should ship first?'], + target_segment: 'operations teams', + }, + }; +} + +function renderArtifactMarkdown(artifact) { + return `# ${artifact.metadata.title} + +## Decision Frame + +Recommendation: ${artifact.decision_frame.recommendation} + +## Unknowns & Evidence Gaps + +- ${artifact.unknowns[0]} + +## Pass/Fail Readiness + +${artifact.pass_fail_readiness.status} because ${artifact.pass_fail_readiness.reason} + +## Recommended Next Artifact + +- Sprint plan. 
+ + +`; +} + +async function writeScenarioFixture(rootDir, options) { + const scenarioDir = path.join(rootDir, 'scenarios'); + const fixtureDir = path.join(rootDir, 'fixtures', options.id); + await mkdir(scenarioDir, { recursive: true }); + await mkdir(fixtureDir, { recursive: true }); + + await writeFile( + path.join(fixtureDir, 'first-pass.md'), + renderArtifactMarkdown(options.firstArtifact), + 'utf8', + ); + await writeFile( + path.join(fixtureDir, 'final-pass.md'), + renderArtifactMarkdown(options.finalArtifact), + 'utf8', + ); + + let blindReviewPath = null; + if (options.blindReview) { + blindReviewPath = path.join(fixtureDir, 'blind-review.json'); + await writeFile( + blindReviewPath, + `${JSON.stringify(options.blindReview, null, 2)}\n`, + 'utf8', + ); + } + + const scenario = { + id: options.id, + title: options.id, + inputs: { + prompt: 'Test benchmark scenario', + context_files: [], + expected_artifact_type: options.artifactType, + scoring_spec_ref: 'docs/shipwright-v2-benchmark-scoring-spec.md', + }, + validator: { + expect_sections: [ + 'Decision Frame', + 'Unknowns & Evidence Gaps', + 'Pass/Fail Readiness', + 'Recommended Next Artifact', + ], + expect_structured: true, + }, + fixtures: { + first_pass_artifact: `../fixtures/${options.id}/first-pass.md`, + final_pass_artifact: `../fixtures/${options.id}/final-pass.md`, + related_artifacts: [], + blind_review: blindReviewPath ? `../fixtures/${options.id}/blind-review.json` : null, + }, + run_metadata: options.runMetadata, + measures: [ + 'time_to_first_usable_artifact', + 'revision_count', + 'contradiction_count', + 'blind_human_rating', + ], + }; + + const scenarioFile = path.join(scenarioDir, `${options.id}.json`); + await writeFile(scenarioFile, `${JSON.stringify(scenario, null, 2)}\n`, 'utf8'); + return scenarioFile; +}