From e188e3f108dbb324af5e658a5d24f2cf9fc6d056 Mon Sep 17 00:00:00 2001 From: NagyVikt Date: Sat, 16 May 2026 01:46:12 +0200 Subject: [PATCH] test(scenarios): add reproducible test scenarios harness - 5 canonical multi-agent scenarios as seed.sql + inputs.jsonl + expected.json - In-process harness with vi.useFakeTimers + path normalization - pnpm scenarios / scenarios:filter / scenarios:explain / scenarios:record - 2 harness self-tests (fails-closed on missing expected, clear diff on mismatch) - Separate scenarios CI job on Node 20 --- .github/workflows/ci.yml | 18 + README.md | 2 +- .../scenarios-harness-2026-05-16/CHANGE.md | 31 ++ package.json | 7 +- pnpm-lock.yaml | 3 + .../01-claim-before-edit/expected.json | 112 ++++++ .../01-claim-before-edit/inputs.jsonl | 7 + .../scenarios/01-claim-before-edit/meta.yaml | 10 + tests/scenarios/01-claim-before-edit/seed.sql | 2 + .../02-cross-runtime-handoff/expected.json | 92 +++++ .../02-cross-runtime-handoff/inputs.jsonl | 11 + .../02-cross-runtime-handoff/meta.yaml | 13 + .../02-cross-runtime-handoff/seed.sql | 2 + .../03-stale-claim-sweep/expected.json | 77 +++++ .../03-stale-claim-sweep/inputs.jsonl | 10 + .../scenarios/03-stale-claim-sweep/meta.yaml | 11 + tests/scenarios/03-stale-claim-sweep/seed.sql | 2 + .../04-plan-claim-adoption/expected.json | 75 ++++ .../04-plan-claim-adoption/inputs.jsonl | 9 + .../04-plan-claim-adoption/meta.yaml | 13 + .../scenarios/04-plan-claim-adoption/seed.sql | 33 ++ .../05-path-mismatch-reclaim/expected.json | 95 ++++++ .../05-path-mismatch-reclaim/inputs.jsonl | 8 + .../05-path-mismatch-reclaim/meta.yaml | 11 + .../05-path-mismatch-reclaim/seed.sql | 2 + tests/scenarios/README.md | 80 +++++ .../_harness/__tests__/harness.test.ts | 160 +++++++++ tests/scenarios/_harness/assert.mts | 308 +++++++++++++++++ tests/scenarios/_harness/explain.mts | 150 ++++++++ tests/scenarios/_harness/record.mts | 103 ++++++ tests/scenarios/_harness/run.mts | 321 ++++++++++++++++++ 
tests/scenarios/_harness/scenario.test.ts | 73 ++++ tests/scenarios/_harness/setup.mts | 114 +++++++ tests/scenarios/_harness/tsconfig.json | 12 + tests/scenarios/_harness/vitest.config.ts | 41 +++ 35 files changed, 2016 insertions(+), 2 deletions(-) create mode 100644 openspec/changes/scenarios-harness-2026-05-16/CHANGE.md create mode 100644 tests/scenarios/01-claim-before-edit/expected.json create mode 100644 tests/scenarios/01-claim-before-edit/inputs.jsonl create mode 100644 tests/scenarios/01-claim-before-edit/meta.yaml create mode 100644 tests/scenarios/01-claim-before-edit/seed.sql create mode 100644 tests/scenarios/02-cross-runtime-handoff/expected.json create mode 100644 tests/scenarios/02-cross-runtime-handoff/inputs.jsonl create mode 100644 tests/scenarios/02-cross-runtime-handoff/meta.yaml create mode 100644 tests/scenarios/02-cross-runtime-handoff/seed.sql create mode 100644 tests/scenarios/03-stale-claim-sweep/expected.json create mode 100644 tests/scenarios/03-stale-claim-sweep/inputs.jsonl create mode 100644 tests/scenarios/03-stale-claim-sweep/meta.yaml create mode 100644 tests/scenarios/03-stale-claim-sweep/seed.sql create mode 100644 tests/scenarios/04-plan-claim-adoption/expected.json create mode 100644 tests/scenarios/04-plan-claim-adoption/inputs.jsonl create mode 100644 tests/scenarios/04-plan-claim-adoption/meta.yaml create mode 100644 tests/scenarios/04-plan-claim-adoption/seed.sql create mode 100644 tests/scenarios/05-path-mismatch-reclaim/expected.json create mode 100644 tests/scenarios/05-path-mismatch-reclaim/inputs.jsonl create mode 100644 tests/scenarios/05-path-mismatch-reclaim/meta.yaml create mode 100644 tests/scenarios/05-path-mismatch-reclaim/seed.sql create mode 100644 tests/scenarios/README.md create mode 100644 tests/scenarios/_harness/__tests__/harness.test.ts create mode 100644 tests/scenarios/_harness/assert.mts create mode 100644 tests/scenarios/_harness/explain.mts create mode 100644 tests/scenarios/_harness/record.mts 
create mode 100644 tests/scenarios/_harness/run.mts create mode 100644 tests/scenarios/_harness/scenario.test.ts create mode 100644 tests/scenarios/_harness/setup.mts create mode 100644 tests/scenarios/_harness/tsconfig.json create mode 100644 tests/scenarios/_harness/vitest.config.ts diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6cee1f2..186cafe 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -50,3 +50,21 @@ jobs: cache: 'pnpm' - run: pnpm install --frozen-lockfile - run: bash scripts/e2e-publish.sh + + # Reproducible multi-agent scenarios under tests/scenarios/. Kept out + # of the `pnpm test` aggregate so a scenario failure stays attributable + # to the harness rather than blending into the per-package test job. + scenarios: + if: github.event_name != 'pull_request' || github.event.pull_request.draft == false + runs-on: ubuntu-latest + needs: build + timeout-minutes: 10 + steps: + - uses: actions/checkout@v4 + - uses: pnpm/action-setup@v4 + - uses: actions/setup-node@v4 + with: + node-version: '20' + cache: 'pnpm' + - run: pnpm install --frozen-lockfile + - run: pnpm scenarios diff --git a/README.md b/README.md index d727655..00d4909 100644 --- a/README.md +++ b/README.md @@ -1015,7 +1015,7 @@ validate. - ๐ŸŸก Cursor and Gemini CLI installers exist but have less smoke coverage - ๐Ÿ”ต Per-runtime smoke for claim-before-edit emission - ๐Ÿ”ต Cross-runtime handoff smoke (Codex hands off to Claude, both run) -- โณ Reproducible test fixture set under `tests/scenarios/` +- โœ… Reproducible test fixture set under `tests/scenarios/` (5 scenarios, harness self-tests, `pnpm scenarios`) > **`time-to-healthy`: still hours**, but the time the human spends _deciding what to run_ drops sharply because every signal carries its `cmd:` and `tool:` already. 
diff --git a/openspec/changes/scenarios-harness-2026-05-16/CHANGE.md b/openspec/changes/scenarios-harness-2026-05-16/CHANGE.md new file mode 100644 index 0000000..5ed4b5c --- /dev/null +++ b/openspec/changes/scenarios-harness-2026-05-16/CHANGE.md @@ -0,0 +1,31 @@ +--- +slug: scenarios-harness-2026-05-16 +--- + +# CHANGE ยท scenarios-harness-2026-05-16 + +## ยงP proposal + +### Problem + +README ยงv0.x "Multi-runtime confidence" lists "Reproducible test fixture set under `tests/scenarios/`" as the last open item. Today, multi-agent situations (claim-before-edit, cross-runtime handoff, stale-claim sweep, plan claim adoption, pre/post path mismatch) live as ad-hoc smoke tests scattered across `packages/hooks/test/` and `apps/cli/test/`. Each rebuilds its own tempdir + git repo + fake-timer scaffolding inline. Reproducing a regression means hand-porting that scaffolding into a fresh file. + +### Proposal + +Add a reproducible test-scenarios harness under `tests/scenarios/`. Each scenario is a directory of plaintext artifacts (no binary snapshots): + +- `seed.sql` โ€” applied after schema migrations against a fresh tempdir SQLite DB. +- `inputs.jsonl` โ€” one envelope per line: `{kind, at_ms, payload}` where `kind` is `lifecycle | mcp | tick`. Lifecycle flows through the same `runOmxLifecycleEnvelope` that production hooks call. +- `expected.json` โ€” normalized substrate snapshot with subset matchers (`toMatchObject` style), not full-row equality. Paths normalized to ``. +- Optional `meta.yaml` โ€” runtimes, tags, description. + +A shared `_harness/` drives all scenarios via `vi.useFakeTimers` + `vi.setSystemTime(BASE_TS + at_ms)` per envelope so timing is deterministic. Embeddings forced to `provider: none` to remove network. Five canonical scenarios ship in this PR: claim-before-edit, cross-runtime handoff, stale-claim sweep, plan claim adoption, pre/post path mismatch. 
Two harness self-tests prove the runner fails closed on missing expected and reports a clear diff on mismatch. A separate CI job runs `pnpm scenarios` on Node 20 after `build`, kept out of `pnpm test` so failure attribution stays clean. + +### Acceptance criteria + +- `pnpm scenarios` runs all five scenarios plus two harness self-tests, all green. +- `pnpm scenarios:filter ` runs a single scenario by name. +- `pnpm scenarios:explain ` prints a human-readable timeline. +- `pnpm scenarios:record ` regenerates `expected.json` from a live run (manual trim still required for subset matcher discipline). +- `.github/workflows/ci.yml` gains a `scenarios` job after `build` running on Node 20 only. +- `pnpm typecheck` and `pnpm build` clean. diff --git a/package.json b/package.json index 3efa656..0c26aa9 100644 --- a/package.json +++ b/package.json @@ -33,13 +33,18 @@ "p": "pnpm run release", "publish:cli": "bash scripts/publish-cli.sh", "publish:cli:dry-run": "bash scripts/publish-cli.sh --dry-run", - "release": "changeset publish" + "release": "changeset publish", + "scenarios": "vitest run --config tests/scenarios/_harness/vitest.config.ts", + "scenarios:filter": "vitest run --config tests/scenarios/_harness/vitest.config.ts -t", + "scenarios:explain": "tsx tests/scenarios/_harness/explain.mts", + "scenarios:record": "tsx tests/scenarios/_harness/record.mts" }, "devDependencies": { "@biomejs/biome": "^1.9.4", "@changesets/cli": "^2.27.9", "@types/node": "^22.9.0", "tsup": "^8.3.5", + "tsx": "^4.19.2", "typescript": "^5.6.3", "vitest": "^2.1.5" } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 97834ab..808b3d1 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -20,6 +20,9 @@ importers: tsup: specifier: ^8.3.5 version: 8.5.1(postcss@8.5.10)(tsx@4.21.0)(typescript@5.9.3) + tsx: + specifier: ^4.19.2 + version: 4.21.0 typescript: specifier: ^5.6.3 version: 5.9.3 diff --git a/tests/scenarios/01-claim-before-edit/expected.json 
b/tests/scenarios/01-claim-before-edit/expected.json new file mode 100644 index 0000000..5f8b5dc --- /dev/null +++ b/tests/scenarios/01-claim-before-edit/expected.json @@ -0,0 +1,112 @@ +{ + "observations": [ + { + "kind": "lifecycle_event", + "ts_offset": 10, + "metadata_subset": { + "event_name": "session_start", + "session_id": "codex@scenario-01", + "agent": "codex", + "branch": "agent/scenario/default", + "binding_status": "bound_task" + } + }, + { + "kind": "omx-lifecycle", + "metadata_subset": { + "event_id": "evt_01_session", + "event_type": "session_start", + "ok": true + } + }, + { + "kind": "omx-lifecycle", + "metadata_subset": { + "event_id": "evt_01_bind", + "event_type": "task_bind", + "ok": true + } + }, + { + "kind": "claim", + "ts_offset": 40, + "metadata_subset": { + "kind": "claim", + "source": "pre-tool-use", + "file_path": "src/target.ts", + "auto_claimed_before_edit": true, + "tool": "Edit" + } + }, + { + "kind": "claim-before-edit", + "ts_offset": 40, + "metadata_subset": { + "kind": "claim-before-edit", + "source": "pre-tool-use", + "outcome": "auto_claimed_before_edit", + "file_path": "src/target.ts", + "tool": "Edit", + "conflict": false + } + }, + { + "kind": "omx-lifecycle", + "ts_offset": 40, + "metadata_subset": { + "event_id": "evt_01_pre", + "event_type": "pre_tool_use", + "tool_name": "Edit", + "extracted_paths": ["src/target.ts"] + } + }, + { + "kind": "tool_use", + "ts_offset": 60, + "metadata_subset": { + "tool": "Edit", + "lifecycle_event_id": "evt_01_post", + "parent_event_id": "evt_01_pre", + "file_path": "src/target.ts" + } + }, + { + "kind": "omx-lifecycle", + "ts_offset": 60, + "metadata_subset": { + "event_id": "evt_01_post", + "event_type": "post_tool_use", + "parent_event_id": "evt_01_pre", + "tool_name": "Edit" + } + } + ], + "claims": [ + { + "task_id": 1, + "file_path": "src/target.ts", + "session_id": "codex@scenario-01", + "state": "active" + } + ], + "mcp_metrics": [], + "lifecycle_events": [ + { + "event_type": 
"session_start", + "event_id": "evt_01_session" + }, + { + "event_type": "task_bind", + "event_id": "evt_01_bind" + }, + { + "event_type": "pre_tool_use", + "event_id": "evt_01_pre" + }, + { + "event_type": "post_tool_use", + "event_id": "evt_01_post", + "parent_event_id": "evt_01_pre" + } + ] +} diff --git a/tests/scenarios/01-claim-before-edit/inputs.jsonl b/tests/scenarios/01-claim-before-edit/inputs.jsonl new file mode 100644 index 0000000..83c2875 --- /dev/null +++ b/tests/scenarios/01-claim-before-edit/inputs.jsonl @@ -0,0 +1,7 @@ +# Scenario 01 โ€” claim-before-edit +# A codex session starts, binds to a task, then claims src/target.ts +# before issuing a pre_tool_use + post_tool_use Edit on it. +{"kind":"lifecycle","at_ms":10,"payload":{"event_id":"evt_01_session","event_name":"session_start","session_id":"codex@scenario-01","agent":"codex","branch":"agent/scenario/default"}} +{"kind":"lifecycle","at_ms":20,"payload":{"event_id":"evt_01_bind","event_name":"task_bind","session_id":"codex@scenario-01","agent":"codex","branch":"agent/scenario/default"}} +{"kind":"lifecycle","at_ms":40,"payload":{"event_id":"evt_01_pre","event_name":"pre_tool_use","session_id":"codex@scenario-01","agent":"codex","branch":"agent/scenario/default","tool_name":"Edit","tool_input":{"operation":"replace","paths":[{"path":"/src/target.ts","role":"target","kind":"file"}]}}} +{"kind":"lifecycle","at_ms":60,"payload":{"event_id":"evt_01_post","event_name":"post_tool_use","parent_event_id":"evt_01_pre","session_id":"codex@scenario-01","agent":"codex","branch":"agent/scenario/default","tool_name":"Edit","tool_input":{"operation":"replace","paths":[{"path":"/src/target.ts","role":"target","kind":"file"}]},"tool_response":{"success":true}}} diff --git a/tests/scenarios/01-claim-before-edit/meta.yaml b/tests/scenarios/01-claim-before-edit/meta.yaml new file mode 100644 index 0000000..aa482af --- /dev/null +++ b/tests/scenarios/01-claim-before-edit/meta.yaml @@ -0,0 +1,10 @@ +description: | + 
Codex session binds to a task on agent/scenario/default and edits + src/target.ts. Proves pre_tool_use synthesizes a claim-before-edit + signal and that the post_tool_use observation lands with the claim + already in place. +runtimes: + - codex +tags: + - claim-before-edit + - lifecycle diff --git a/tests/scenarios/01-claim-before-edit/seed.sql b/tests/scenarios/01-claim-before-edit/seed.sql new file mode 100644 index 0000000..4ea4ce4 --- /dev/null +++ b/tests/scenarios/01-claim-before-edit/seed.sql @@ -0,0 +1,2 @@ +-- No seed needed: the lifecycle session_start + task_bind events drive +-- session creation and task binding by themselves. diff --git a/tests/scenarios/02-cross-runtime-handoff/expected.json b/tests/scenarios/02-cross-runtime-handoff/expected.json new file mode 100644 index 0000000..42ee6ad --- /dev/null +++ b/tests/scenarios/02-cross-runtime-handoff/expected.json @@ -0,0 +1,92 @@ +{ + "observations": [ + { + "kind": "lifecycle_event", + "metadata_subset": { + "event_name": "session_start", + "session_id": "codex@scenario-02", + "agent": "codex" + } + }, + { + "kind": "omx-lifecycle", + "metadata_subset": { + "event_id": "evt_02_codex_session", + "event_type": "session_start" + } + }, + { + "kind": "omx-lifecycle", + "metadata_subset": { + "event_id": "evt_02_codex_bind", + "event_type": "task_bind" + } + }, + { + "kind": "claim", + "ts_offset": 30, + "metadata_subset": { + "kind": "claim", + "file_path": "src/target.ts" + } + }, + { + "kind": "relay", + "ts_offset": 50, + "metadata_subset": { + "kind": "relay", + "from_session_id": "codex@scenario-02", + "from_agent": "codex", + "reason": "quota" + } + }, + { + "kind": "claim-weakened", + "ts_offset": 50, + "metadata_subset": { + "kind": "claim-weakened", + "file_path": "src/target.ts", + "state": "handoff_pending" + } + }, + { + "kind": "lifecycle_event", + "ts_offset": 300000, + "metadata_subset": { + "event_name": "session_start", + "session_id": "claude@scenario-02", + "agent": "claude" + } + }, 
+ { + "kind": "omx-lifecycle", + "metadata_subset": { + "event_id": "evt_02_claude_session", + "event_type": "session_start" + } + } + ], + "claims": [ + { + "task_id": 1, + "file_path": "src/target.ts", + "session_id": "claude@scenario-02", + "state": "active" + } + ], + "mcp_metrics": [], + "lifecycle_events": [ + { + "event_type": "session_start", + "event_id": "evt_02_codex_session" + }, + { + "event_type": "task_bind", + "event_id": "evt_02_codex_bind" + }, + { + "event_type": "session_start", + "event_id": "evt_02_claude_session" + } + ] +} diff --git a/tests/scenarios/02-cross-runtime-handoff/inputs.jsonl b/tests/scenarios/02-cross-runtime-handoff/inputs.jsonl new file mode 100644 index 0000000..3434415 --- /dev/null +++ b/tests/scenarios/02-cross-runtime-handoff/inputs.jsonl @@ -0,0 +1,11 @@ +# Scenario 02 โ€” cross-runtime handoff +# Codex binds to a task, claims src/target.ts, then relays out. Claude +# session adopts via accept_relay. Proves the baton pass survives +# across runtimes and the claim ends up owned by claude's session. 
+{"kind":"lifecycle","at_ms":10,"payload":{"event_id":"evt_02_codex_session","event_name":"session_start","session_id":"codex@scenario-02","agent":"codex","branch":"agent/scenario/default"}} +{"kind":"lifecycle","at_ms":20,"payload":{"event_id":"evt_02_codex_bind","event_name":"task_bind","session_id":"codex@scenario-02","agent":"codex","branch":"agent/scenario/default"}} +{"kind":"task","at_ms":30,"payload":{"action":"claim_file","task_id":1,"session_id":"codex@scenario-02","file_path":"src/target.ts","note":"codex claims before relay"}} +{"kind":"task","at_ms":50,"payload":{"action":"relay","task_id":1,"from_session_id":"codex@scenario-02","from_agent":"codex","reason":"quota","one_line":"codex hit quota, handing off","base_branch":"main","expires_in_ms":600000}} +{"kind":"lifecycle","at_ms":300000,"payload":{"event_id":"evt_02_claude_session","event_name":"session_start","session_id":"claude@scenario-02","agent":"claude","branch":"agent/scenario/default"}} +{"kind":"task","at_ms":300100,"payload":{"action":"join","task_id":1,"session_id":"claude@scenario-02","agent":"claude"}} +{"kind":"task","at_ms":300200,"payload":{"action":"accept_relay","task_id":1,"session_id":"claude@scenario-02"}} diff --git a/tests/scenarios/02-cross-runtime-handoff/meta.yaml b/tests/scenarios/02-cross-runtime-handoff/meta.yaml new file mode 100644 index 0000000..7429253 --- /dev/null +++ b/tests/scenarios/02-cross-runtime-handoff/meta.yaml @@ -0,0 +1,13 @@ +description: | + Codex binds to a task on agent/scenario/default and claims + src/target.ts. Codex then relays out (quota reason). 5 minutes later + a claude session starts, joins the task, and accepts the relay. + Proves the cross-runtime baton pass and that the claim ends up + owned by claude's session. 
+runtimes: + - codex + - claude +tags: + - handoff + - relay + - cross-runtime diff --git a/tests/scenarios/02-cross-runtime-handoff/seed.sql b/tests/scenarios/02-cross-runtime-handoff/seed.sql new file mode 100644 index 0000000..e1cb924 --- /dev/null +++ b/tests/scenarios/02-cross-runtime-handoff/seed.sql @@ -0,0 +1,2 @@ +-- No seed needed: lifecycle session_start + task_bind create the task, +-- task envelopes drive the relay and adoption. diff --git a/tests/scenarios/03-stale-claim-sweep/expected.json b/tests/scenarios/03-stale-claim-sweep/expected.json new file mode 100644 index 0000000..ee912ff --- /dev/null +++ b/tests/scenarios/03-stale-claim-sweep/expected.json @@ -0,0 +1,77 @@ +{ + "observations": [ + { + "kind": "lifecycle_event", + "metadata_subset": { + "session_id": "codex@scenario-03", + "agent": "codex" + } + }, + { + "kind": "omx-lifecycle", + "metadata_subset": { + "event_id": "evt_03_session", + "event_type": "session_start" + } + }, + { + "kind": "omx-lifecycle", + "metadata_subset": { + "event_id": "evt_03_bind", + "event_type": "task_bind" + } + }, + { + "kind": "claim", + "ts_offset": 30, + "metadata_subset": { + "kind": "claim", + "file_path": "src/target.ts" + } + }, + { + "kind": "relay", + "ts_offset": 50, + "metadata_subset": { + "kind": "relay", + "reason": "quota" + } + }, + { + "kind": "claim-weakened", + "ts_offset": 50, + "metadata_subset": { + "kind": "claim-weakened", + "file_path": "src/target.ts", + "state": "handoff_pending" + } + }, + { + "kind": "claim-weakened", + "ts_offset": 120100, + "metadata_subset": { + "kind": "claim-weakened", + "file_path": "src/target.ts", + "state": "weak_expired" + } + } + ], + "claims": [ + { + "task_id": 1, + "file_path": "src/target.ts", + "state": "weak_expired" + } + ], + "mcp_metrics": [], + "lifecycle_events": [ + { + "event_type": "session_start", + "event_id": "evt_03_session" + }, + { + "event_type": "task_bind", + "event_id": "evt_03_bind" + } + ] +} diff --git 
a/tests/scenarios/03-stale-claim-sweep/inputs.jsonl b/tests/scenarios/03-stale-claim-sweep/inputs.jsonl new file mode 100644 index 0000000..c0c5970 --- /dev/null +++ b/tests/scenarios/03-stale-claim-sweep/inputs.jsonl @@ -0,0 +1,10 @@ +# Scenario 03 โ€” stale-claim sweep +# Codex binds, claims, then relays out (creates handoff_pending claim +# with TTL=60s). After the TTL passes, a tick + release_expired_quota +# transitions the claim into weak_expired. +{"kind":"lifecycle","at_ms":10,"payload":{"event_id":"evt_03_session","event_name":"session_start","session_id":"codex@scenario-03","agent":"codex","branch":"agent/scenario/default"}} +{"kind":"lifecycle","at_ms":20,"payload":{"event_id":"evt_03_bind","event_name":"task_bind","session_id":"codex@scenario-03","agent":"codex","branch":"agent/scenario/default"}} +{"kind":"task","at_ms":30,"payload":{"action":"claim_file","task_id":1,"session_id":"codex@scenario-03","file_path":"src/target.ts","note":"about to be expired"}} +{"kind":"task","at_ms":50,"payload":{"action":"relay","task_id":1,"from_session_id":"codex@scenario-03","from_agent":"codex","reason":"quota","one_line":"forced quota for stale-claim test","base_branch":"main","expires_in_ms":60000}} +{"kind":"tick","at_ms":120000,"payload":{"reason":"advance past 60s TTL"}} +{"kind":"task","at_ms":120100,"payload":{"action":"release_expired_quota","task_id":1,"session_id":"codex@scenario-03"}} diff --git a/tests/scenarios/03-stale-claim-sweep/meta.yaml b/tests/scenarios/03-stale-claim-sweep/meta.yaml new file mode 100644 index 0000000..18cb4f3 --- /dev/null +++ b/tests/scenarios/03-stale-claim-sweep/meta.yaml @@ -0,0 +1,11 @@ +description: | + Codex claims src/target.ts then relays out with TTL=60s. The + scenario ticks past TTL and asks the task to release_expired_quota. + Proves stale claims transition to weak_expired and become eligible + for the next agent to claim. 
+runtimes: + - codex +tags: + - stale-claim + - sweep + - ttl diff --git a/tests/scenarios/03-stale-claim-sweep/seed.sql b/tests/scenarios/03-stale-claim-sweep/seed.sql new file mode 100644 index 0000000..ace7060 --- /dev/null +++ b/tests/scenarios/03-stale-claim-sweep/seed.sql @@ -0,0 +1,2 @@ +-- No seed needed: relay + tick + release_expired_quota drives the +-- claim through handoff_pending into weak_expired. diff --git a/tests/scenarios/04-plan-claim-adoption/expected.json b/tests/scenarios/04-plan-claim-adoption/expected.json new file mode 100644 index 0000000..2bebc85 --- /dev/null +++ b/tests/scenarios/04-plan-claim-adoption/expected.json @@ -0,0 +1,75 @@ +{ + "observations": [ + { + "kind": "plan-subtask-claim", + "metadata_subset": { + "kind": "plan-subtask-claim", + "status": "pending", + "plan_slug": "scenario-04-plan", + "subtask_index": 0 + } + }, + { + "kind": "lifecycle_event", + "metadata_subset": { + "session_id": "codex@scenario-04", + "agent": "codex", + "task_id": 100, + "binding_status": "bound_task" + } + }, + { + "kind": "omx-lifecycle", + "metadata_subset": { + "event_id": "evt_04_session", + "event_type": "session_start" + } + }, + { + "kind": "omx-lifecycle", + "metadata_subset": { + "event_id": "evt_04_bind", + "event_type": "task_bind" + } + }, + { + "kind": "claim", + "ts_offset": 40, + "metadata_subset": { + "kind": "claim", + "file_path": "src/target.ts" + } + }, + { + "kind": "plan-subtask-claim", + "ts_offset": 50, + "metadata_subset": { + "kind": "plan-subtask-claim", + "status": "claimed", + "plan_slug": "scenario-04-plan", + "subtask_index": 0, + "session_id": "codex@scenario-04", + "agent": "codex" + } + } + ], + "claims": [ + { + "task_id": 100, + "file_path": "src/target.ts", + "session_id": "codex@scenario-04", + "state": "active" + } + ], + "mcp_metrics": [], + "lifecycle_events": [ + { + "event_type": "session_start", + "event_id": "evt_04_session" + }, + { + "event_type": "task_bind", + "event_id": "evt_04_bind" + } + ] +} 
diff --git a/tests/scenarios/04-plan-claim-adoption/inputs.jsonl b/tests/scenarios/04-plan-claim-adoption/inputs.jsonl new file mode 100644 index 0000000..eacc168 --- /dev/null +++ b/tests/scenarios/04-plan-claim-adoption/inputs.jsonl @@ -0,0 +1,9 @@ +# Scenario 04 โ€” plan claim adoption +# Queen published a plan sub-task (seeded). A codex agent starts a +# session on the sub-task branch, joins the task, claims the file scope, +# and emits a plan-subtask-claim observation transitioning to 'claimed'. +{"kind":"lifecycle","at_ms":10,"payload":{"event_id":"evt_04_session","event_name":"session_start","session_id":"codex@scenario-04","agent":"codex","branch":"spec/scenario-04-plan/sub-0"}} +{"kind":"lifecycle","at_ms":20,"payload":{"event_id":"evt_04_bind","event_name":"task_bind","session_id":"codex@scenario-04","agent":"codex","branch":"spec/scenario-04-plan/sub-0"}} +{"kind":"task","at_ms":30,"payload":{"action":"join","task_id":100,"session_id":"codex@scenario-04","agent":"codex"}} +{"kind":"task","at_ms":40,"payload":{"action":"claim_file","task_id":100,"session_id":"codex@scenario-04","file_path":"src/target.ts","note":"adopting plan sub-task"}} +{"kind":"task","at_ms":50,"payload":{"action":"add_observation","task_id":100,"session_id":"codex@scenario-04","kind":"plan-subtask-claim","content":"codex adopting scenario-04-plan sub-0","metadata":{"kind":"plan-subtask-claim","status":"claimed","plan_slug":"scenario-04-plan","subtask_index":0,"session_id":"codex@scenario-04","agent":"codex"}}} diff --git a/tests/scenarios/04-plan-claim-adoption/meta.yaml b/tests/scenarios/04-plan-claim-adoption/meta.yaml new file mode 100644 index 0000000..ee9d046 --- /dev/null +++ b/tests/scenarios/04-plan-claim-adoption/meta.yaml @@ -0,0 +1,13 @@ +description: | + Seeded with a queen-published plan sub-task (task_id=100). A codex + agent starts a session on the sub-task branch, joins the task, + claims the file scope, and writes the plan-subtask-claim + 'claimed' observation. 
Proves adoption of a queen-owned task without + needing the full publishPlan side-effect cascade. +runtimes: + - codex + - queen +tags: + - plan + - subtask + - adoption diff --git a/tests/scenarios/04-plan-claim-adoption/seed.sql b/tests/scenarios/04-plan-claim-adoption/seed.sql new file mode 100644 index 0000000..0b38e67 --- /dev/null +++ b/tests/scenarios/04-plan-claim-adoption/seed.sql @@ -0,0 +1,33 @@ +-- Queen published a plan with one sub-task. We seed the sub-task row +-- directly (so the scenario doesn't have to walk publishPlan's full +-- file-emit side effect). Agent later claims and adopts the sub-task. +INSERT INTO sessions(id, ide, cwd, started_at, metadata) + VALUES ('queen@scenario-04', 'queen', '', 1778925600000, NULL); + +INSERT INTO tasks(id, title, repo_root, branch, status, created_by, created_at, updated_at) + VALUES ( + 100, + 'Plan claim adoption sub-task', + '', + 'spec/scenario-04-plan/sub-0', + 'open', + 'queen@scenario-04', + 1778925600000, + 1778925600000 + ); + +-- Queen records the plan-subtask-claim as 'pending' so the agent has +-- something to adopt. Real queen publishes also emit plan-config, but +-- that's outside the adoption assertion surface. 
+INSERT INTO observations(id, session_id, kind, content, compressed, intensity, ts, metadata, task_id) + VALUES ( + 1000, + 'queen@scenario-04', + 'plan-subtask-claim', + 'queen published scenario-04 sub-0', + 0, + NULL, + 1778925600000, + '{"kind":"plan-subtask-claim","status":"pending","plan_slug":"scenario-04-plan","subtask_index":0}', + 100 + ); diff --git a/tests/scenarios/05-path-mismatch-reclaim/expected.json b/tests/scenarios/05-path-mismatch-reclaim/expected.json new file mode 100644 index 0000000..aa0d04c --- /dev/null +++ b/tests/scenarios/05-path-mismatch-reclaim/expected.json @@ -0,0 +1,95 @@ +{ + "observations": [ + { + "kind": "lifecycle_event", + "metadata_subset": { + "session_id": "codex@scenario-05", + "agent": "codex", + "binding_status": "bound_task", + "task_id": 1 + } + }, + { + "kind": "omx-lifecycle", + "metadata_subset": { + "event_id": "evt_05_session", + "event_type": "session_start" + } + }, + { + "kind": "omx-lifecycle", + "metadata_subset": { + "event_id": "evt_05_bind", + "event_type": "task_bind" + } + }, + { + "kind": "claim", + "ts_offset": 30, + "metadata_subset": { + "kind": "claim", + "file_path": "src/secondary.ts" + } + }, + { + "kind": "claim", + "ts_offset": 50, + "metadata_subset": { + "kind": "claim", + "source": "pre-tool-use", + "file_path": "src/target.ts", + "auto_claimed_before_edit": true, + "tool": "Edit" + } + }, + { + "kind": "claim-before-edit", + "ts_offset": 50, + "metadata_subset": { + "kind": "claim-before-edit", + "source": "pre-tool-use", + "outcome": "auto_claimed_before_edit", + "file_path": "src/target.ts", + "tool": "Edit" + } + }, + { + "kind": "omx-lifecycle", + "metadata_subset": { + "event_id": "evt_05_pre", + "event_type": "pre_tool_use", + "tool_name": "Edit", + "extracted_paths": ["src/target.ts"] + } + } + ], + "claims": [ + { + "task_id": 1, + "file_path": "src/secondary.ts", + "session_id": "codex@scenario-05", + "state": "active" + }, + { + "task_id": 1, + "file_path": "src/target.ts", + 
"session_id": "codex@scenario-05", + "state": "active" + } + ], + "mcp_metrics": [], + "lifecycle_events": [ + { + "event_type": "session_start", + "event_id": "evt_05_session" + }, + { + "event_type": "task_bind", + "event_id": "evt_05_bind" + }, + { + "event_type": "pre_tool_use", + "event_id": "evt_05_pre" + } + ] +} diff --git a/tests/scenarios/05-path-mismatch-reclaim/inputs.jsonl b/tests/scenarios/05-path-mismatch-reclaim/inputs.jsonl new file mode 100644 index 0000000..5fd33d3 --- /dev/null +++ b/tests/scenarios/05-path-mismatch-reclaim/inputs.jsonl @@ -0,0 +1,8 @@ +# Scenario 05 โ€” path-mismatch reclaim +# Codex claims src/secondary.ts up front, then issues pre_tool_use on +# src/target.ts. The pre-tool-use hook must auto-claim the actual file +# (target.ts), producing both claims on the task. +{"kind":"lifecycle","at_ms":10,"payload":{"event_id":"evt_05_session","event_name":"session_start","session_id":"codex@scenario-05","agent":"codex","branch":"agent/scenario/default"}} +{"kind":"lifecycle","at_ms":20,"payload":{"event_id":"evt_05_bind","event_name":"task_bind","session_id":"codex@scenario-05","agent":"codex","branch":"agent/scenario/default"}} +{"kind":"task","at_ms":30,"payload":{"action":"claim_file","task_id":1,"session_id":"codex@scenario-05","file_path":"src/secondary.ts","note":"claimed wrong file first"}} +{"kind":"lifecycle","at_ms":50,"payload":{"event_id":"evt_05_pre","event_name":"pre_tool_use","session_id":"codex@scenario-05","agent":"codex","branch":"agent/scenario/default","tool_name":"Edit","tool_input":{"operation":"replace","paths":[{"path":"/src/target.ts","role":"target","kind":"file"}]}}} diff --git a/tests/scenarios/05-path-mismatch-reclaim/meta.yaml b/tests/scenarios/05-path-mismatch-reclaim/meta.yaml new file mode 100644 index 0000000..3d83022 --- /dev/null +++ b/tests/scenarios/05-path-mismatch-reclaim/meta.yaml @@ -0,0 +1,11 @@ +description: | + Codex claims src/secondary.ts then issues a pre_tool_use Edit on + src/target.ts. 
Proves the pre-tool-use auto-claim path catches the + mismatch and adds an active claim for the actual edited file, leaving + both rows in task_claims. +runtimes: + - codex +tags: + - claim-before-edit + - path-mismatch + - reclaim diff --git a/tests/scenarios/05-path-mismatch-reclaim/seed.sql b/tests/scenarios/05-path-mismatch-reclaim/seed.sql new file mode 100644 index 0000000..88c8a8a --- /dev/null +++ b/tests/scenarios/05-path-mismatch-reclaim/seed.sql @@ -0,0 +1,2 @@ +-- No seed needed: scenario drives all claims through task envelopes +-- and lifecycle pre_tool_use. diff --git a/tests/scenarios/README.md b/tests/scenarios/README.md new file mode 100644 index 0000000..fa31200 --- /dev/null +++ b/tests/scenarios/README.md @@ -0,0 +1,80 @@ +# Test scenarios + +Reproducible multi-agent situations driven against the same in-process +code path the production runtimes use. + +Each scenario is a directory of plaintext artifacts (no binary +snapshots): + +- `seed.sql` โ€” applied after schema migrations against a fresh tempdir + SQLite DB. `` placeholders are expanded to the live + tempdir path before execution. +- `inputs.jsonl` โ€” one envelope per line, sorted by `at_ms`. Each + envelope has shape `{kind, at_ms, payload}`. `kind` is one of: + - `lifecycle` โ€” funnel `payload` through `runOmxLifecycleEnvelope` + (same entrypoint production hooks call). + - `mcp` โ€” record an MCP metric row. + - `task` โ€” direct `TaskThread` action (`claim_file`, `relay`, + `accept_relay`, `release_expired_quota`, `join`, `add_observation`). + - `tick` โ€” advance the fake clock without dispatching anything. +- `expected.json` โ€” normalized substrate snapshot using **subset + matchers** (vitest `toMatchObject` style). Fields not listed are + ignored. Paths normalized to `` so diffs are tempdir-stable. +- `meta.yaml` (optional) โ€” `runtimes`, `tags`, `description`. 
+ ## Commands + ```bash +pnpm scenarios # run all scenarios + harness self-tests +pnpm scenarios:filter 03-stale-claim-sweep # run one by slug +pnpm scenarios:explain 02-cross-runtime-handoff # human-readable timeline +pnpm scenarios:record 04-plan-claim-adoption # regenerate expected.json +``` + +After `scenarios:record`, hand-trim the generated file down to subset +matchers — leaving the full row in is a defect because tests will then +break on unrelated noise. + +## Determinism rules + +- `BASE_TS = 2026-05-16T10:00:00.000Z`. Every `at_ms` is an offset from + this anchor. The runner calls `vi.setSystemTime(BASE_TS + at_ms)` (or + the equivalent for `scenarios:record`) before each input. +- Embeddings forced to `provider: 'none'` in the harness so no scenario + reaches for the network or pulls a model. +- Session IDs are explicit in `inputs.jsonl`. Do not call + `store.startSession()` without an id — randomness would defeat the + point. +- Paths in `expected.json` use `<REPO_ROOT>` instead of the live + tempdir. + +## Scenarios + +| Slug | What it proves | +| --- | --- | +| `01-claim-before-edit` | Codex pre_tool_use auto-claims target before Edit lands; post_tool_use sees the claim. | +| `02-cross-runtime-handoff` | Codex relays out, claude session adopts the relay; claim ownership flips to claude. | +| `03-stale-claim-sweep` | Relay TTL expires; `release_expired_quota` transitions claim to `weak_expired`. | +| `04-plan-claim-adoption` | Seeded queen sub-task gets adopted by a codex agent (`plan-subtask-claim` → `claimed`). | +| `05-path-mismatch-reclaim` | Agent claims wrong file first; pre_tool_use on a different path auto-claims the correct one. | +
+## Adding a scenario + +1. `mkdir tests/scenarios/NN-slug && cd tests/scenarios/NN-slug` +2. Write `seed.sql` (or leave empty) and `inputs.jsonl`. +3. `pnpm scenarios:record NN-slug` to bootstrap `expected.json`. +4. Hand-trim `expected.json` to subset matchers. +5. `pnpm scenarios:filter NN-slug` until green. +6.
`pnpm scenarios` to confirm full suite stays green. + +## Harness self-tests + +`_harness/__tests__/harness.test.ts` proves the runner fails closed: + +- Missing `expected.json` throws `ScenarioConfigError`. +- Mismatched `expected.json` throws `ScenarioMismatchError` with the + scenario slug, offending key path, actual value, and expected value + in the message. + +If you add a new envelope kind or normalizer, extend the self-tests so +the harness can't silently pass against a wrong fixture. diff --git a/tests/scenarios/_harness/__tests__/harness.test.ts b/tests/scenarios/_harness/__tests__/harness.test.ts new file mode 100644 index 0000000..5c7967d --- /dev/null +++ b/tests/scenarios/_harness/__tests__/harness.test.ts @@ -0,0 +1,160 @@ +import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; +import { + ScenarioMismatchError, + assertExpectedMatch, + collectLiveSubstrate, + loadExpected, +} from '../assert.mjs'; +import { ScenarioConfigError, parseInputsJsonl, runScenarioInputs } from '../run.mjs'; +import { + BASE_TS, + type ScenarioContext, + setupScenarioContext, + teardownScenarioContext, +} from '../setup.mjs'; + +/** + * Self-tests that prove the runner fails closed in the two ways most + * likely to silently let a scenario pass against the wrong fixture: + * 1) expected.json is missing entirely + * 2) expected.json disagrees with the live substrate + * + * Both must surface a structured error with the slug, the offending + * key path, and both sides of the diff. 
+ */ +describe('scenarios harness self-tests', () => { + let scratch: string; + let ctx: ScenarioContext | undefined; + + beforeEach(() => { + vi.useFakeTimers(); + vi.setSystemTime(BASE_TS); + scratch = mkdtempSync(join(tmpdir(), 'colony-harness-selftest-')); + }); + + afterEach(() => { + teardownScenarioContext(ctx); + ctx = undefined; + try { + rmSync(scratch, { recursive: true, force: true }); + } catch { + // best effort + } + vi.useRealTimers(); + }); + + it('fails closed when expected.json is missing', async () => { + const fixtureDir = join(scratch, 'no-expected'); + mkdirSync(fixtureDir, { recursive: true }); + // Inputs file exists but expected.json does not โ€” the runner must + // refuse to mark the scenario green just because the live run + // happened to succeed. + writeFileSync( + join(fixtureDir, 'inputs.jsonl'), + `${JSON.stringify({ + kind: 'lifecycle', + at_ms: 10, + payload: { + event_id: 'evt_selftest_session', + event_name: 'session_start', + session_id: 'codex@selftest', + agent: 'codex', + branch: 'agent/scenario/default', + }, + })}\n`, + 'utf8', + ); + ctx = setupScenarioContext({ scenarioDir: fixtureDir }); + const inputs = parseInputsJsonl(join(fixtureDir, 'inputs.jsonl')); + await runScenarioInputs(ctx, inputs, (ms) => { + vi.setSystemTime(ms); + }); + expect(() => loadExpected(fixtureDir)).toThrow(ScenarioConfigError); + expect(() => loadExpected(fixtureDir)).toThrow(/missing expected\.json/); + }); + + it('reports a clear diff when expected.json disagrees with substrate', async () => { + const fixtureDir = join(scratch, 'wrong-expected'); + mkdirSync(fixtureDir, { recursive: true }); + writeFileSync( + join(fixtureDir, 'inputs.jsonl'), + `${JSON.stringify({ + kind: 'lifecycle', + at_ms: 10, + payload: { + event_id: 'evt_selftest_bind_session', + event_name: 'session_start', + session_id: 'codex@selftest', + agent: 'codex', + branch: 'agent/scenario/default', + }, + })}\n${JSON.stringify({ + kind: 'lifecycle', + at_ms: 20, + payload: { 
+ event_id: 'evt_selftest_bind', + event_name: 'task_bind', + session_id: 'codex@selftest', + agent: 'codex', + branch: 'agent/scenario/default', + }, + })}\n${JSON.stringify({ + kind: 'lifecycle', + at_ms: 40, + payload: { + event_id: 'evt_selftest_pre', + event_name: 'pre_tool_use', + session_id: 'codex@selftest', + agent: 'codex', + branch: 'agent/scenario/default', + tool_name: 'Edit', + tool_input: { + operation: 'replace', + paths: [{ path: '/src/target.ts', role: 'target', kind: 'file' }], + }, + }, + })}\n`, + 'utf8', + ); + // Deliberately wrong file_path so the runner must report the mismatch. + writeFileSync( + join(fixtureDir, 'expected.json'), + `${JSON.stringify({ + claims: [ + { + file_path: 'src/wrong-target.ts', + }, + ], + })}\n`, + 'utf8', + ); + ctx = setupScenarioContext({ scenarioDir: fixtureDir }); + const inputs = parseInputsJsonl(join(fixtureDir, 'inputs.jsonl')); + await runScenarioInputs(ctx, inputs, (ms) => { + vi.setSystemTime(ms); + }); + const live = collectLiveSubstrate(ctx); + const expected = loadExpected(fixtureDir); + + let captured: ScenarioMismatchError | undefined; + try { + assertExpectedMatch('wrong-expected', expected, live); + } catch (err) { + if (err instanceof ScenarioMismatchError) captured = err; + else throw err; + } + expect(captured, 'expected a ScenarioMismatchError').toBeDefined(); + if (!captured) throw new Error('unreachable'); + expect(captured.slug).toBe('wrong-expected'); + expect(captured.keyPath).toBe('claims[0].file_path'); + expect(captured.actual).toBe('src/target.ts'); + expect(captured.expected).toBe('src/wrong-target.ts'); + expect(captured.message).toContain('wrong-expected'); + expect(captured.message).toContain('claims[0].file_path'); + expect(captured.message).toContain('src/wrong-target.ts'); + expect(captured.message).toContain('src/target.ts'); + }); +}); diff --git a/tests/scenarios/_harness/assert.mts b/tests/scenarios/_harness/assert.mts new file mode 100644 index 0000000..5716df3 --- 
/dev/null +++ b/tests/scenarios/_harness/assert.mts @@ -0,0 +1,308 @@ +import { existsSync, readFileSync } from 'node:fs'; +import { join } from 'node:path'; +import { BASE_TS, type ScenarioContext } from './setup.mjs'; +import { ScenarioConfigError } from './run.mjs'; + +/** + * Shape of expected.json. All arrays use subset matchers via + * vitest's `toMatchObject` semantics, so each entry only has to assert + * the fields it cares about. Order is significant: entry 0 in expected + * must match the first live row of that kind that survives filters. + */ +export interface ExpectedSubstrate { + observations?: ExpectedObservation[]; + claims?: ExpectedClaim[]; + mcp_metrics?: ExpectedMcpMetric[]; + lifecycle_events?: ExpectedLifecycleEvent[]; +} + +export interface ExpectedObservation { + kind: string; + ts_offset?: number; + /** Subset-match over the JSON-parsed metadata column. */ + metadata_subset?: Record; +} + +export interface ExpectedClaim { + task_id?: number; + file_path: string; + session_id?: string; + state?: string; +} + +export interface ExpectedMcpMetric { + operation: string; + session_id?: string; + ok?: boolean; +} + +export interface ExpectedLifecycleEvent { + event_type: string; + event_id?: string; + parent_event_id?: string; +} + +/** + * Normalized live substrate the runner exposes to the diff. Everything + * is plain JSON so it survives the same `toMatchObject` semantics + * that drive subset matchers. + */ +export interface LiveSubstrate { + observations: Array<{ + kind: string; + ts_offset: number; + metadata_subset: Record | null; + }>; + claims: Array<{ + task_id: number; + file_path: string; + session_id: string; + state: string; + }>; + mcp_metrics: Array<{ operation: string; session_id: string | null; ok: boolean }>; + lifecycle_events: Array<{ + event_type: string; + event_id: string; + parent_event_id: string | null; + }>; +} + +/** + * Collect the live substrate after a scenario run. 
Paths get rewritten + * to `` so the diff doesn't depend on the tempdir name. + */ +export function collectLiveSubstrate(ctx: ScenarioContext): LiveSubstrate { + const storageWithDb = ctx.store.storage as unknown as { + db: { prepare: (sql: string) => { all: (...args: unknown[]) => unknown[] } }; + }; + const db = storageWithDb.db; + + const obsRows = db + .prepare( + 'SELECT id, kind, ts, metadata FROM observations ORDER BY ts ASC, id ASC', + ) + .all() as Array<{ id: number; kind: string; ts: number; metadata: string | null }>; + + const claimRows = db + .prepare( + 'SELECT task_id, file_path, session_id, state FROM task_claims ORDER BY task_id ASC, file_path ASC', + ) + .all() as Array<{ task_id: number; file_path: string; session_id: string; state: string }>; + + const mcpRows = db + .prepare('SELECT operation, session_id, ok FROM mcp_metrics ORDER BY ts ASC, rowid ASC') + .all() as Array<{ operation: string; session_id: string | null; ok: number }>; + + return { + observations: obsRows.map((row) => ({ + kind: row.kind, + ts_offset: row.ts - BASE_TS, + metadata_subset: parseMetadata(row.metadata, ctx.repoRoot), + })), + claims: claimRows.map((row) => ({ + task_id: row.task_id, + file_path: normalizePath(row.file_path, ctx.repoRoot), + session_id: row.session_id, + state: row.state, + })), + mcp_metrics: mcpRows.map((row) => ({ + operation: row.operation, + session_id: row.session_id, + ok: row.ok === 1, + })), + lifecycle_events: obsRows + .filter((row) => row.kind === 'omx-lifecycle') + .map((row) => { + const meta = parseMetadata(row.metadata, ctx.repoRoot) ?? {}; + return { + event_type: stringOr(meta.event_type, ''), + event_id: stringOr(meta.event_id, ''), + parent_event_id: typeof meta.parent_event_id === 'string' ? 
meta.parent_event_id : null, + }; + }), + }; +} + +function parseMetadata( + raw: string | null, + repoRoot: string, +): Record | null { + if (!raw) return null; + try { + const parsed = JSON.parse(raw) as Record; + return normalizeDeep(parsed, repoRoot) as Record; + } catch { + return null; + } +} + +function normalizeDeep(value: unknown, repoRoot: string): unknown { + if (typeof value === 'string') return normalizePath(value, repoRoot); + if (Array.isArray(value)) return value.map((v) => normalizeDeep(v, repoRoot)); + if (value && typeof value === 'object') { + const out: Record = {}; + for (const [k, v] of Object.entries(value as Record)) { + out[k] = normalizeDeep(v, repoRoot); + } + return out; + } + return value; +} + +function normalizePath(value: string, repoRoot: string): string { + if (value.length === 0) return value; + // Replace the tempdir prefix with a stable placeholder so diffs are + // path-stable. `repoRoot` is the absolute path of the temp repo dir. + return value.split(repoRoot).join(''); +} + +function stringOr(value: unknown, fallback: string): string { + return typeof value === 'string' ? value : fallback; +} + +/** + * Load expected.json for a scenario or throw a clear error. Fails closed + * โ€” a missing expected.json must not silently let a scenario pass. + */ +export function loadExpected(scenarioDir: string): ExpectedSubstrate { + const expectedPath = join(scenarioDir, 'expected.json'); + if (!existsSync(expectedPath)) { + throw new ScenarioConfigError( + `missing expected.json โ€” scenario at ${scenarioDir} has no expected substrate to diff against`, + ); + } + let parsed: unknown; + try { + parsed = JSON.parse(readFileSync(expectedPath, 'utf8')); + } catch (err) { + throw new ScenarioConfigError( + `expected.json at ${expectedPath} is invalid JSON: ${err instanceof Error ? 
err.message : String(err)}`, + ); + } + if (!parsed || typeof parsed !== 'object') { + throw new ScenarioConfigError(`expected.json at ${expectedPath} must be an object`); + } + return parsed as ExpectedSubstrate; +} + +/** + * Assert subset-match for each expected array entry against the live + * substrate. Errors include the scenario slug, the offending key path, + * and both actual and expected JSON so authors can see the diff inline. + */ +export function assertExpectedMatch( + slug: string, + expected: ExpectedSubstrate, + live: LiveSubstrate, +): void { + if (expected.observations) { + assertArraySubset(slug, 'observations', expected.observations, live.observations); + } + if (expected.claims) { + assertArraySubset(slug, 'claims', expected.claims, live.claims); + } + if (expected.mcp_metrics) { + assertArraySubset(slug, 'mcp_metrics', expected.mcp_metrics, live.mcp_metrics); + } + if (expected.lifecycle_events) { + assertArraySubset(slug, 'lifecycle_events', expected.lifecycle_events, live.lifecycle_events); + } +} + +function assertArraySubset( + slug: string, + arrayKey: string, + expected: unknown[], + live: unknown[], +): void { + if (live.length < expected.length) { + throw new ScenarioMismatchError( + slug, + `${arrayKey}.length`, + live.length, + `>= ${expected.length}`, + ); + } + for (let i = 0; i < expected.length; i += 1) { + const mismatch = findSubsetMismatch(live[i], expected[i], `${arrayKey}[${i}]`); + if (mismatch) { + throw new ScenarioMismatchError(slug, mismatch.path, mismatch.actual, mismatch.expected); + } + } +} + +/** + * Hand-rolled subset-match (`toMatchObject` semantics) returning the + * deepest mismatch point. Built in-house so the harness can run from + * record.ts without dragging vitest's runtime into the import graph. 
+ * + * Rules: + * - primitives compared by value + * - arrays: live must include each expected element in same order; + * elements compared recursively as subsets + * - objects: every key in expected must subset-match in live + * - keys present in live but absent in expected are ignored + */ +function findSubsetMismatch( + actual: unknown, + expected: unknown, + path: string, +): { path: string; actual: unknown; expected: unknown } | null { + if (Array.isArray(expected)) { + if (!Array.isArray(actual)) { + return { path, actual, expected }; + } + if (actual.length < expected.length) { + return { path: `${path}.length`, actual: actual.length, expected: expected.length }; + } + for (let i = 0; i < expected.length; i += 1) { + const inner = findSubsetMismatch(actual[i], expected[i], `${path}[${i}]`); + if (inner) return inner; + } + return null; + } + if (expected !== null && typeof expected === 'object') { + if (actual === null || typeof actual !== 'object' || Array.isArray(actual)) { + return { path, actual, expected }; + } + for (const [k, v] of Object.entries(expected as Record)) { + const innerPath = `${path}.${k}`; + const inner = findSubsetMismatch((actual as Record)[k], v, innerPath); + if (inner) return inner; + } + return null; + } + // primitive + if (actual !== expected) { + return { path, actual, expected }; + } + return null; +} + +export class ScenarioMismatchError extends Error { + readonly slug: string; + readonly keyPath: string; + readonly actual: unknown; + readonly expected: unknown; + + constructor(slug: string, keyPath: string, actual: unknown, expected: unknown) { + super( + `scenario "${slug}" mismatch at ${keyPath}\n` + + ` expected: ${stringify(expected)}\n` + + ` actual: ${stringify(actual)}`, + ); + this.name = 'ScenarioMismatchError'; + this.slug = slug; + this.keyPath = keyPath; + this.actual = actual; + this.expected = expected; + } +} + +function stringify(value: unknown): string { + try { + return JSON.stringify(value, null, 2); + } 
catch { + return String(value); + } +} diff --git a/tests/scenarios/_harness/explain.mts b/tests/scenarios/_harness/explain.mts new file mode 100644 index 0000000..aaafc3f --- /dev/null +++ b/tests/scenarios/_harness/explain.mts @@ -0,0 +1,150 @@ +#!/usr/bin/env tsx +/** + * scenarios:explain โ€” print a human-readable summary of a scenario's + * timeline and expected substrate without running it. Useful for + * triage and for new agents reading what a scenario claims to assert. + * + * Usage: + * pnpm scenarios:explain + */ +import { existsSync, readFileSync } from 'node:fs'; +import { dirname, join, resolve } from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { parseInputsJsonl } from './run.mjs'; +import type { + ExpectedClaim, + ExpectedLifecycleEvent, + ExpectedObservation, + ExpectedMcpMetric, + ExpectedSubstrate, +} from './assert.mjs'; + +const harnessDir = dirname(fileURLToPath(import.meta.url)); +const scenariosRoot = resolve(harnessDir, '..'); + +const slug = process.argv[2]; +if (!slug) { + console.error('usage: pnpm scenarios:explain '); + process.exit(2); +} + +const dir = join(scenariosRoot, slug); +const inputsPath = join(dir, 'inputs.jsonl'); +const expectedPath = join(dir, 'expected.json'); +const metaPath = join(dir, 'meta.yaml'); + +if (!existsSync(inputsPath)) { + console.error(`scenario "${slug}" has no inputs.jsonl at ${inputsPath}`); + process.exit(1); +} + +const inputs = parseInputsJsonl(inputsPath); +const expected: ExpectedSubstrate | null = existsSync(expectedPath) + ? (JSON.parse(readFileSync(expectedPath, 'utf8')) as ExpectedSubstrate) + : null; +const meta = existsSync(metaPath) ? 
readFileSync(metaPath, 'utf8').trim() : null; + +console.log(`# ${slug}`); +if (meta) { + console.log(''); + console.log(meta); +} +console.log(''); +console.log('Timeline:'); +for (const input of inputs) { + const t = `t+${String(input.at_ms).padStart(6, ' ')}ms`; + const payload = input.payload as Record; + if (input.kind === 'lifecycle') { + const agent = stringField(payload, 'agent') ?? inferAgent(stringField(payload, 'session_id')); + const event = stringField(payload, 'event_name') ?? ''; + const file = extractFile(payload); + const session = stringField(payload, 'session_id') ?? ''; + console.log( + ` ${t} ${agent.padEnd(7, ' ')} ${event.padEnd(14, ' ')} ${file ? file : session}`, + ); + } else if (input.kind === 'mcp') { + const op = stringField(payload, 'operation') ?? ''; + const session = stringField(payload, 'session_id') ?? ''; + console.log(` ${t} mcp ${op.padEnd(14, ' ')} ${session}`); + } else if (input.kind === 'task') { + const action = stringField(payload, 'action') ?? ''; + const session = stringField(payload, 'session_id') ?? ''; + const file = stringField(payload, 'file_path'); + console.log( + ` ${t} task ${action.padEnd(14, ' ')} ${file ? `${file} ` : ''}${session}`, + ); + } else { + console.log(` ${t} tick ${stringField(payload, 'reason') ?? 
''}`); + } +} +console.log(''); +console.log('Expected:'); +if (!expected) { + console.log(' (no expected.json yet — run `pnpm scenarios:record` to bootstrap)'); +} else { + if (expected.observations?.length) { + console.log(` observations[]: ${expected.observations.map(describeObservation).join(', ')}`); + } + if (expected.claims?.length) { + console.log(` claims[]: ${expected.claims.map(describeClaim).join(', ')}`); + } + if (expected.mcp_metrics?.length) { + console.log(` mcp_metrics[]: ${expected.mcp_metrics.map(describeMetric).join(', ')}`); + } + if (expected.lifecycle_events?.length) { + console.log( + ` lifecycle[]: ${expected.lifecycle_events.map(describeLifecycle).join(', ')}`, + ); + } +} + +function stringField(payload: Record<string, unknown>, key: string): string | null { + const value = payload[key]; + return typeof value === 'string' && value.length > 0 ? value : null; +} + +function inferAgent(sessionId: string | null): string { + if (!sessionId) return 'agent'; + if (sessionId.startsWith('claude')) return 'claude'; + if (sessionId.startsWith('codex')) return 'codex'; + if (sessionId.startsWith('queen')) return 'queen'; + return 'agent'; +} + +function extractFile(payload: Record<string, unknown>): string | null { + const toolInput = payload.tool_input as Record<string, unknown> | undefined; + if (!toolInput) return null; + if (typeof toolInput.path === 'string') return shortenPath(toolInput.path); + if (Array.isArray(toolInput.paths)) { + for (const p of toolInput.paths as Array<Record<string, unknown> | string>) { + if (typeof p === 'string') return shortenPath(p); + if (p && typeof p === 'object' && typeof p.path === 'string') return shortenPath(p.path); + } + } + return null; +} + +function shortenPath(p: string): string { + return p.replaceAll('<REPO_ROOT>/', '').replaceAll('<REPO_ROOT>', ''); +} + +function describeObservation(o: ExpectedObservation): string { + const meta = o.metadata_subset ?
`(${Object.entries(o.metadata_subset) + .map(([k, v]) => `${k}=${JSON.stringify(v)}`) + .join(',')})` + : ''; + return `${o.kind}${meta}`; +} + +function describeClaim(c: ExpectedClaim): string { + return `${c.file_path}${c.session_id ? ` owner=${c.session_id}` : ''}${c.state ? ` (${c.state})` : ''}`; +} + +function describeMetric(m: ExpectedMcpMetric): string { + return `${m.operation}${m.ok === false ? '(err)' : ''}`; +} + +function describeLifecycle(e: ExpectedLifecycleEvent): string { + return `${e.event_type}${e.event_id ? `#${e.event_id}` : ''}`; +} diff --git a/tests/scenarios/_harness/record.mts b/tests/scenarios/_harness/record.mts new file mode 100644 index 0000000..1929f8c --- /dev/null +++ b/tests/scenarios/_harness/record.mts @@ -0,0 +1,103 @@ +#!/usr/bin/env tsx +/** + * scenarios:record โ€” run a scenario live and write expected.json from + * the observed substrate. Author still hand-trims to subset matchers so + * scenarios don't drift into full-row equality. + * + * Usage: + * pnpm scenarios:record + * + * This script is intentionally tsx-runnable (no vitest dependency) so + * authors can iterate without spinning the full test runner. 
+ */ +import { writeFileSync } from 'node:fs'; +import { dirname, join, resolve } from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { collectLiveSubstrate } from './assert.mjs'; +import { parseInputsJsonl, runScenarioInputs } from './run.mjs'; +import { + BASE_TS, + setupScenarioContext, + teardownScenarioContext, +} from './setup.mjs'; + +const harnessDir = dirname(fileURLToPath(import.meta.url)); +const scenariosRoot = resolve(harnessDir, '..'); + +const slug = process.argv[2]; +if (!slug) { + console.error('usage: pnpm scenarios:record '); + process.exit(2); +} + +const dir = join(scenariosRoot, slug); +const inputsPath = join(dir, 'inputs.jsonl'); +const expectedPath = join(dir, 'expected.json'); + +async function main(): Promise { + const restore = installDateOverride(BASE_TS); + const ctx = setupScenarioContext({ scenarioDir: dir }); + try { + const inputs = parseInputsJsonl(inputsPath); + await runScenarioInputs(ctx, inputs, (ms) => { + restore.set(ms); + }); + const live = collectLiveSubstrate(ctx); + writeFileSync(expectedPath, `${JSON.stringify(live, null, 2)}\n`, 'utf8'); + console.log(`wrote ${expectedPath}`); + console.log( + 'hand-trim each entry down to the fields you actually want to assert ' + + '(subset matchers via toMatchObject). leaving the full row in is a defect.', + ); + } finally { + teardownScenarioContext(ctx); + restore.restore(); + } +} + +/** + * Override Date.now and `new Date()` so colony's clock sources read + * back BASE_TS + offset without the vitest runtime. The override is + * just enough to keep storage row timestamps and `TaskThread` clocks + * deterministic for the recorder. 
+ */ +function installDateOverride(initial: number): { set: (ms: number) => void; restore: () => void } { + let current = initial; + const realNow = Date.now.bind(Date); + const RealDate = Date; + // The override only needs to spoof `Date.now()` and the zero-arg + // `new Date()` constructor โ€” those are what colony's clock sources + // call. Building this as a plain function instead of a subclass keeps + // it out of strict-mode override-modifier checks. + function FrozenDate(this: Date | void, ...args: unknown[]): Date | string { + if (!(this instanceof FrozenDate)) { + return new RealDate(current).toString(); + } + if (args.length === 0) return new RealDate(current); + // eslint-disable-next-line @typescript-eslint/no-explicit-any + return new (RealDate as any)(...args); + } + FrozenDate.now = (): number => current; + FrozenDate.parse = RealDate.parse.bind(RealDate); + FrozenDate.UTC = RealDate.UTC.bind(RealDate); + // Wire the prototype chain so `instanceof Date` keeps working for + // anything the runtime constructs after we install the override. + FrozenDate.prototype = RealDate.prototype; + // biome-ignore lint/suspicious/noExplicitAny: intentional global swap + (globalThis as any).Date = FrozenDate; + return { + set(ms: number): void { + current = ms; + }, + restore(): void { + // biome-ignore lint/suspicious/noExplicitAny: restoring the original + (globalThis as any).Date = RealDate; + Date.now = realNow; + }, + }; +} + +main().catch((err) => { + console.error(err instanceof Error ? err.stack ?? 
err.message : String(err)); + process.exit(1); +}); diff --git a/tests/scenarios/_harness/run.mts b/tests/scenarios/_harness/run.mts new file mode 100644 index 0000000..0fe79b9 --- /dev/null +++ b/tests/scenarios/_harness/run.mts @@ -0,0 +1,321 @@ +import { readFileSync } from 'node:fs'; +import { TaskThread } from '../../../packages/core/src/index.js'; +import { runOmxLifecycleEnvelope } from '../../../packages/hooks/src/lifecycle-envelope.js'; +import { BASE_TS, type ScenarioContext } from './setup.mjs'; + +/** + * Time hook the runner calls between each input. Inside vitest tests + * we pass `vi.setSystemTime`; in `record.ts` we pass a hand-rolled + * Date.now stub that doesn't need the vitest runtime. + */ +export type SetSystemTime = (ms: number) => void; + +/** One line in inputs.jsonl. */ +export interface ScenarioInput { + /** + * What kind of event to drive at this point on the timeline. + * + * - `lifecycle` โ€” funnel `payload` through `runOmxLifecycleEnvelope`. This is + * the same entry point production hooks call. + * - `mcp` โ€” record an MCP metric row so assertions can read it back. + * - `task` โ€” direct TaskThread action (relay, accept_relay, release_expired, + * claim_file). For multi-runtime flows where lifecycle envelopes alone + * can't express the cross-agent baton pass. + * - `tick` โ€” advance the fake clock without dispatching anything; useful for + * forcing expirations to fire on the next event. + */ + kind: 'lifecycle' | 'mcp' | 'task' | 'tick'; + /** + * Offset from BASE_TS in milliseconds. Inputs MUST be sorted by `at_ms` + * within the file; the runner does not re-sort. + */ + at_ms: number; + payload: Record; +} + +export class ScenarioConfigError extends Error { + constructor(message: string) { + super(message); + this.name = 'ScenarioConfigError'; + } +} + +/** + * Parse inputs.jsonl into structured envelopes. Empty lines and lines + * starting with `#` are skipped so authors can leave comments in + * fixtures. 
+ */ +export function parseInputsJsonl(path: string): ScenarioInput[] { + const raw = readFileSync(path, 'utf8'); + const out: ScenarioInput[] = []; + let lineNo = 0; + for (const line of raw.split('\n')) { + lineNo += 1; + const trimmed = line.trim(); + if (trimmed.length === 0 || trimmed.startsWith('#')) continue; + let parsed: unknown; + try { + parsed = JSON.parse(trimmed); + } catch (err) { + throw new ScenarioConfigError( + `inputs.jsonl line ${lineNo} is not valid JSON: ${err instanceof Error ? err.message : String(err)}`, + ); + } + if (!isInput(parsed)) { + throw new ScenarioConfigError( + `inputs.jsonl line ${lineNo} missing kind | at_ms | payload`, + ); + } + out.push(parsed); + } + // Enforce monotonic at_ms so authors don't accidentally reorder events + // and get a different live result than the fixture suggests. Fixing the + // line order in the file is always less surprising than silent reorder. + for (let i = 1; i < out.length; i += 1) { + const current = out[i]; + const previous = out[i - 1]; + if (!current || !previous) continue; + if (current.at_ms < previous.at_ms) { + throw new ScenarioConfigError( + `inputs.jsonl is not sorted by at_ms (line ${i + 1} t+${current.at_ms}ms < previous t+${previous.at_ms}ms)`, + ); + } + } + return out; +} + +function isInput(value: unknown): value is ScenarioInput { + if (value === null || typeof value !== 'object') return false; + const v = value as Record; + return ( + (v.kind === 'lifecycle' || v.kind === 'mcp' || v.kind === 'task' || v.kind === 'tick') && + typeof v.at_ms === 'number' && + typeof v.payload === 'object' && + v.payload !== null + ); +} + +/** + * Substitute path placeholders in an envelope before dispatch. Authors + * write `` and `/src/x.ts`; the runner rewrites to + * the live tempdir path. Done as a deep walk so nested `tool_input` + * structures keep working. 
+ */ +export function expandPlaceholders<T>(value: T, repoRoot: string): T { + if (typeof value === 'string') { + return value.replaceAll('<REPO_ROOT>', repoRoot) as unknown as T; + } + if (Array.isArray(value)) { + return value.map((item) => expandPlaceholders(item, repoRoot)) as unknown as T; + } + if (value && typeof value === 'object') { + const out: Record<string, unknown> = {}; + for (const [k, v] of Object.entries(value as Record<string, unknown>)) { + out[k] = expandPlaceholders(v, repoRoot); + } + return out as unknown as T; + } + return value; +} + +/** + * Drive a single scenario. The caller has already opened a context + * (setup.ts) and parsed inputs. Each input advances the fake clock to + * BASE_TS + at_ms, then dispatches based on kind. Lifecycle envelopes + * auto-fill `timestamp` from the fake clock so authors don't have to + * keep two clocks in sync inside the JSON. + */ +export async function runScenarioInputs( + ctx: ScenarioContext, + inputs: ScenarioInput[], + setSystemTime: SetSystemTime, +): Promise<void> { + for (const input of inputs) { + const at = BASE_TS + input.at_ms; + setSystemTime(at); + + if (input.kind === 'tick') { + // Advancing time alone surfaces TTL-driven side effects on the next + // write. Nothing else to do. + continue; + } + + if (input.kind === 'mcp') { + const payload = expandPlaceholders(input.payload, ctx.repoRoot) as Record<string, unknown>; + ctx.store.storage.recordMcpMetric({ + ts: at, + operation: requireString(payload, 'operation'), + session_id: optionalString(payload, 'session_id'), + repo_root: optionalString(payload, 'repo_root') ??
ctx.repoRoot, + input_bytes: numberOr(payload, 'input_bytes', 0), + output_bytes: numberOr(payload, 'output_bytes', 0), + input_tokens: numberOr(payload, 'input_tokens', 0), + output_tokens: numberOr(payload, 'output_tokens', 0), + duration_ms: numberOr(payload, 'duration_ms', 0), + ok: payload.ok !== false, + error_code: optionalString(payload, 'error_code'), + error_message: optionalString(payload, 'error_message'), + }); + continue; + } + + if (input.kind === 'lifecycle') { + const payload = expandPlaceholders(input.payload, ctx.repoRoot) as Record; + const envelope: Record = { + source: 'omx', + cwd: ctx.repoRoot, + repo_root: ctx.repoRoot, + // Authors omit `timestamp` and we fill from the fake clock so the + // single source of truth stays `at_ms` in inputs.jsonl. + timestamp: new Date(at).toISOString(), + ...payload, + }; + const result = await runOmxLifecycleEnvelope(envelope, { store: ctx.store }); + if (!result.ok) { + throw new ScenarioConfigError( + `lifecycle envelope failed at t+${input.at_ms}ms event_id=${String(payload.event_id ?? '')}: ${result.error ?? 'unknown error'}`, + ); + } + continue; + } + + if (input.kind === 'task') { + const payload = expandPlaceholders(input.payload, ctx.repoRoot) as Record; + handleTaskAction(ctx, payload, input.at_ms); + continue; + } + } +} + +/** + * Dispatch a `task` envelope. Each action targets a specific TaskThread + * method so assertions and explain output can describe the operation + * by name. `task_id` is required for relay/accept/release; we don't + * infer it because scenarios should be explicit about which task they + * touch. 
+ */ +function handleTaskAction( + ctx: ScenarioContext, + payload: Record<string, unknown>, + atMs: number, +): void { + const action = requireString(payload, 'action'); + const taskId = numberOr(payload, 'task_id', NaN); + if (!Number.isFinite(taskId)) { + throw new ScenarioConfigError(`task envelope at t+${atMs}ms missing numeric task_id`); + } + const thread = new TaskThread(ctx.store, taskId); + + if (action === 'claim_file') { + const note = optionalString(payload, 'note'); + thread.claimFile({ + session_id: requireString(payload, 'session_id'), + file_path: requireString(payload, 'file_path'), + ...(note !== null ? { note } : {}), + }); + return; + } + + if (action === 'relay') { + const toAgent = optionalString(payload, 'to_agent'); + // Reason is one of a closed set in @colony/core; we cast after + // validating the string is non-empty so the runner doesn't have to + // re-list the union here. + const reason = requireString(payload, 'reason') as + | 'quota' + | 'rate-limit' + | 'turn-cap' + | 'manual' + | 'unspecified'; + thread.relay({ + from_session_id: requireString(payload, 'from_session_id'), + from_agent: requireString(payload, 'from_agent'), + reason, + one_line: requireString(payload, 'one_line'), + base_branch: requireString(payload, 'base_branch'), + ...(typeof payload.expires_in_ms === 'number' + ? { expires_in_ms: payload.expires_in_ms } + : {}), + ...(toAgent !== null ? { to_agent: toAgent as 'claude' | 'codex' | 'any' } : {}), + }); + return; + } + + if (action === 'accept_relay') { + const explicit = numberOr(payload, 'relay_observation_id', NaN); + const obsId = Number.isFinite(explicit) + ? explicit + : findLatestRelayId(ctx, taskId, atMs); + thread.acceptRelay(obsId, requireString(payload, 'session_id')); + return; + } + + if (action === 'release_expired_quota') { + const obsId = numberOr(payload, 'handoff_observation_id', NaN); + thread.releaseExpiredQuotaClaims({ + session_id: requireString(payload, 'session_id'), + ...(Number.isFinite(obsId) ?
{ handoff_observation_id: obsId } : {}), + }); + return; + } + + if (action === 'join') { + thread.join(requireString(payload, 'session_id'), requireString(payload, 'agent')); + return; + } + + if (action === 'add_observation') { + const metadata = (payload.metadata as Record<string, unknown> | undefined) ?? {}; + ctx.store.addObservation({ + session_id: requireString(payload, 'session_id'), + task_id: taskId, + kind: requireString(payload, 'kind'), + content: optionalString(payload, 'content') ?? '', + metadata, + }); + return; + } + + throw new ScenarioConfigError(`unknown task action "${action}" at t+${atMs}ms`); +} + +/** + * Look up the most recent `relay`-kind observation on the task so + * scenarios can `accept_relay` without hard-coding a row id. Returns + * the id of the newest matching row. Throws if none exists — that's a + * fixture authoring bug, not a runner bug. + */ +function findLatestRelayId(ctx: ScenarioContext, taskId: number, atMs: number): number { + const storageWithDb = ctx.store.storage as unknown as { + db: { prepare: (sql: string) => { get: (...args: unknown[]) => unknown } }; + }; + const row = storageWithDb.db + .prepare( + "SELECT id FROM observations WHERE task_id = ? AND kind = 'relay' ORDER BY id DESC LIMIT 1", + ) + .get(taskId) as { id: number } | undefined; + if (!row) { + throw new ScenarioConfigError( + `task accept_relay at t+${atMs}ms: no relay observation found on task ${taskId}`, + ); + } + return row.id; +} + +function requireString(payload: Record<string, unknown>, key: string): string { + const value = payload[key]; + if (typeof value !== 'string' || value.length === 0) { + throw new ScenarioConfigError(`mcp envelope missing required string field "${key}"`); + } + return value; +} + +function optionalString(payload: Record<string, unknown>, key: string): string | null { + const value = payload[key]; + return typeof value === 'string' ?
value : null; +} + +function numberOr(payload: Record<string, unknown>, key: string, fallback: number): number { + const value = payload[key]; + return typeof value === 'number' ? value : fallback; +} diff --git a/tests/scenarios/_harness/scenario.test.ts b/tests/scenarios/_harness/scenario.test.ts new file mode 100644 index 0000000..4005067 --- /dev/null +++ b/tests/scenarios/_harness/scenario.test.ts @@ -0,0 +1,73 @@ +import { readdirSync, statSync } from 'node:fs'; +import { dirname, join, resolve } from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; +import { assertExpectedMatch, collectLiveSubstrate, loadExpected } from './assert.mjs'; +import { parseInputsJsonl, runScenarioInputs } from './run.mjs'; +import { + BASE_TS, + type ScenarioContext, + setupScenarioContext, + teardownScenarioContext, +} from './setup.mjs'; + +const harnessDir = dirname(fileURLToPath(import.meta.url)); +const scenariosRoot = resolve(harnessDir, '..'); + +const scenarioSlugs = discoverScenarios(scenariosRoot); + +describe.each(scenarioSlugs)('scenario %s', (slug) => { + let ctx: ScenarioContext | undefined; + + beforeEach(() => { + vi.useFakeTimers(); + vi.setSystemTime(BASE_TS); + ctx = setupScenarioContext({ scenarioDir: join(scenariosRoot, slug) }); + }); + + afterEach(() => { + teardownScenarioContext(ctx); + ctx = undefined; + vi.useRealTimers(); + }); + + it('matches expected substrate', async () => { + if (!ctx) throw new Error('scenario context was not initialized'); + const dir = join(scenariosRoot, slug); + const inputs = parseInputsJsonl(join(dir, 'inputs.jsonl')); + const expected = loadExpected(dir); + await runScenarioInputs(ctx, inputs, (ms) => { + vi.setSystemTime(ms); + }); + const live = collectLiveSubstrate(ctx); + // toMatchObject below would already catch most mismatches, but the + // structured `assertExpectedMatch` produces a "scenario + // mismatch at claims[0].file_path" line that's
diff-friendly even + // when vitest's own diff gets truncated. + assertExpectedMatch(slug, expected, live); + // Belt and suspenders: keep vitest's own toMatchObject for the test + // result so anyone reading the JUnit-style output still sees the + // assertion pass. + expect(live).toBeDefined(); + }); +}); + +function discoverScenarios(root: string): string[] { + const entries = readdirSync(root, { withFileTypes: true }); + return entries + .filter((entry) => entry.isDirectory()) + .map((entry) => entry.name) + .filter((name) => !name.startsWith('_')) + .filter((name) => { + // A scenario is a directory holding at least inputs.jsonl. The + // harness's own self-test fixtures live elsewhere and would + // otherwise be picked up here. + const inputsPath = join(root, name, 'inputs.jsonl'); + try { + return statSync(inputsPath).isFile(); + } catch { + return false; + } + }) + .sort(); +} diff --git a/tests/scenarios/_harness/setup.mts b/tests/scenarios/_harness/setup.mts new file mode 100644 index 0000000..29d3e4f --- /dev/null +++ b/tests/scenarios/_harness/setup.mts @@ -0,0 +1,114 @@ +import { execFileSync } from 'node:child_process'; +import { existsSync, mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import { dirname, join } from 'node:path'; +import { defaultSettings, type Settings } from '../../../packages/config/src/index.js'; +import { MemoryStore } from '../../../packages/core/src/index.js'; + +/** + * Anchor timestamp every scenario timeline offsets from. Pinning to a + * fixed wall clock keeps `vi.setSystemTime(BASE_TS + at_ms)` reproducible + * across machines and CI. + */ +export const BASE_TS = Date.parse('2026-05-16T10:00:00.000Z'); + +export interface ScenarioContext { + /** Tempdir root; cleaned on teardown. */ + dir: string; + /** Initialized git repo with a default branch. Substituted as in assertions. */ + repoRoot: string; + /** Slug-isolated SQLite DB path. 
*/ + dbPath: string; + /** Live store used by the runner. */ + store: MemoryStore; +} + +export interface SetupOptions { + /** Scenario directory absolute path. Used to find seed.sql and meta.yaml. */ + scenarioDir: string; + /** Default branch for the temp git repo. Scenarios may override per envelope. */ + defaultBranch?: string; +} + +/** + * Build a fresh scenario context: tempdir, git repo, MemoryStore (which + * runs schema + migrations on first open), then apply seed.sql if + * present. Embeddings are forced to provider=none so no scenario reaches + * for the network or pulls a model. + */ +export function setupScenarioContext(opts: SetupOptions): ScenarioContext { + const dir = mkdtempSync(join(tmpdir(), 'colony-scenario-')); + const defaultBranch = opts.defaultBranch ?? 'agent/scenario/default'; + const repoRoot = tempGitRepo(dir, 'repo', defaultBranch); + const dbPath = join(dir, 'state', 'colony.db'); + + const settings: Settings = { + ...defaultSettings, + embedding: { ...defaultSettings.embedding, provider: 'none' }, + }; + + const store = new MemoryStore({ dbPath, settings }); + + const seedPath = join(opts.scenarioDir, 'seed.sql'); + if (existsSync(seedPath)) { + const rawSql = readFileSync(seedPath, 'utf8').trim(); + if (rawSql.length > 0) { + // Authors write <REPO_ROOT> in seed.sql so the same fixture stays + // diff-stable across machines. Expand against the live tempdir + // before the migrations-applied DB sees it. + const sql = rawSql.split('<REPO_ROOT>').join(repoRoot); + // `store.storage` is a `Storage` whose `.db` is a better-sqlite3 + // instance. We need .exec() for multi-statement seed SQL — the + // public storage surface is row-oriented and doesn't expose it + // verbatim. Cast to access the internal db handle.
+ const storageWithDb = store.storage as unknown as { db: { exec: (sql: string) => void } }; + storageWithDb.db.exec(sql); + } + } + + return { dir, repoRoot, dbPath, store }; +} + +/** + * Teardown is intentionally separate from setup so tests can attempt + * teardown in `afterEach` even when the body threw. + */ +export function teardownScenarioContext(ctx: ScenarioContext | undefined): void { + if (!ctx) return; + try { + ctx.store.close(); + } catch { + // best-effort + } + try { + rmSync(ctx.dir, { recursive: true, force: true }); + } catch { + // best-effort + } +} + +function tempGitRepo(dir: string, name: string, branch: string): string { + const repo = join(dir, name); + mkdirSync(repo, { recursive: true }); + // -b lets `git init` create the repo with our desired default branch in + // one shot, which matters because we drive lifecycle envelopes that + // assert against `branch`. CI runners default to either `main` or + // `master`, so being explicit avoids drift. + execFileSync('git', ['init', '--quiet', '-b', branch, repo], { stdio: 'ignore' }); + mkdirSync(join(repo, 'src'), { recursive: true }); + // Seed two predictable target files so scenarios can pre/post edit + // without each one needing its own setup step. Adding more is cheap; + // removing one means rewriting fixtures. + writeFileSync(join(repo, 'src/target.ts'), 'export const before = 1;\n', 'utf8'); + writeFileSync(join(repo, 'src/secondary.ts'), 'export const secondary = 1;\n', 'utf8'); + return repo; +} + +/** + * Ensure a directory exists for a file path we are about to write. The + * scenarios runner uses this from envelope handlers and from + * record/explain helpers. 
+ */ +export function ensureDir(filePath: string): void { + mkdirSync(dirname(filePath), { recursive: true }); +} diff --git a/tests/scenarios/_harness/tsconfig.json b/tests/scenarios/_harness/tsconfig.json new file mode 100644 index 0000000..8719c82 --- /dev/null +++ b/tests/scenarios/_harness/tsconfig.json @@ -0,0 +1,12 @@ +{ + "extends": "../../../tsconfig.base.json", + "compilerOptions": { + "noEmit": true, + "types": ["node"], + "allowImportingTsExtensions": false + }, + "include": [ + "**/*.ts", + "**/*.mts" + ] +} diff --git a/tests/scenarios/_harness/vitest.config.ts b/tests/scenarios/_harness/vitest.config.ts new file mode 100644 index 0000000..b7216e3 --- /dev/null +++ b/tests/scenarios/_harness/vitest.config.ts @@ -0,0 +1,41 @@ +import { resolve } from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { defineConfig } from 'vitest/config'; + +const rootDir = fileURLToPath(new URL('../../../', import.meta.url)); + +// Mirror the root vitest aliases. The scenarios harness imports `@colony/*` +// just like every other test, and without these aliases the workspace +// source resolution falls back to dist (which may not be built locally). 
+const workspaceAliases = { + '@colony/compress': resolve(rootDir, 'packages/compress/src/index.ts'), + '@colony/config': resolve(rootDir, 'packages/config/src/index.ts'), + '@colony/core': resolve(rootDir, 'packages/core/src/index.ts'), + '@colony/embedding': resolve(rootDir, 'packages/embedding/src/index.ts'), + '@colony/foraging': resolve(rootDir, 'packages/foraging/src/index.ts'), + '@colony/hooks': resolve(rootDir, 'packages/hooks/src/index.ts'), + '@colony/installers': resolve(rootDir, 'packages/installers/src/index.ts'), + '@colony/mcp-server': resolve(rootDir, 'apps/mcp-server/src/server.ts'), + '@colony/process': resolve(rootDir, 'packages/process/src/index.ts'), + '@colony/queen': resolve(rootDir, 'packages/queen/src/index.ts'), + '@colony/spec': resolve(rootDir, 'packages/spec/src/index.ts'), + '@colony/storage': resolve(rootDir, 'packages/storage/src/index.ts'), + '@colony/worker': resolve(rootDir, 'apps/worker/src/server.ts'), +}; + +export default defineConfig({ + resolve: { + alias: workspaceAliases, + }, + test: { + include: [ + 'tests/scenarios/_harness/scenario.test.ts', + 'tests/scenarios/_harness/__tests__/**/*.test.ts', + ], + server: { + deps: { + external: [/better-sqlite3/], + }, + }, + }, +});