From e188e3f108dbb324af5e658a5d24f2cf9fc6d056 Mon Sep 17 00:00:00 2001 From: NagyVikt Date: Sat, 16 May 2026 01:46:12 +0200 Subject: [PATCH] test(scenarios): add reproducible test scenarios harness - 5 canonical multi-agent scenarios as seed.sql + inputs.jsonl + expected.json - In-process harness with vi.useFakeTimers + path normalization - pnpm scenarios / scenarios:filter / scenarios:explain / scenarios:record - 2 harness self-tests (fails-closed on missing expected, clear diff on mismatch) - Separate scenarios CI job on Node 20 --- .github/workflows/ci.yml | 18 + README.md | 2 +- .../scenarios-harness-2026-05-16/CHANGE.md | 31 ++ package.json | 7 +- pnpm-lock.yaml | 3 + .../01-claim-before-edit/expected.json | 112 ++++++ .../01-claim-before-edit/inputs.jsonl | 7 + .../scenarios/01-claim-before-edit/meta.yaml | 10 + tests/scenarios/01-claim-before-edit/seed.sql | 2 + .../02-cross-runtime-handoff/expected.json | 92 +++++ .../02-cross-runtime-handoff/inputs.jsonl | 11 + .../02-cross-runtime-handoff/meta.yaml | 13 + .../02-cross-runtime-handoff/seed.sql | 2 + .../03-stale-claim-sweep/expected.json | 77 +++++ .../03-stale-claim-sweep/inputs.jsonl | 10 + .../scenarios/03-stale-claim-sweep/meta.yaml | 11 + tests/scenarios/03-stale-claim-sweep/seed.sql | 2 + .../04-plan-claim-adoption/expected.json | 75 ++++ .../04-plan-claim-adoption/inputs.jsonl | 9 + .../04-plan-claim-adoption/meta.yaml | 13 + .../scenarios/04-plan-claim-adoption/seed.sql | 33 ++ .../05-path-mismatch-reclaim/expected.json | 95 ++++++ .../05-path-mismatch-reclaim/inputs.jsonl | 8 + .../05-path-mismatch-reclaim/meta.yaml | 11 + .../05-path-mismatch-reclaim/seed.sql | 2 + tests/scenarios/README.md | 80 +++++ .../_harness/__tests__/harness.test.ts | 160 +++++++++ tests/scenarios/_harness/assert.mts | 308 +++++++++++++++++ tests/scenarios/_harness/explain.mts | 150 ++++++++ tests/scenarios/_harness/record.mts | 103 ++++++ tests/scenarios/_harness/run.mts | 321 ++++++++++++++++++ 
tests/scenarios/_harness/scenario.test.ts | 73 ++++ tests/scenarios/_harness/setup.mts | 114 +++++++ tests/scenarios/_harness/tsconfig.json | 12 + tests/scenarios/_harness/vitest.config.ts | 41 +++ 35 files changed, 2016 insertions(+), 2 deletions(-) create mode 100644 openspec/changes/scenarios-harness-2026-05-16/CHANGE.md create mode 100644 tests/scenarios/01-claim-before-edit/expected.json create mode 100644 tests/scenarios/01-claim-before-edit/inputs.jsonl create mode 100644 tests/scenarios/01-claim-before-edit/meta.yaml create mode 100644 tests/scenarios/01-claim-before-edit/seed.sql create mode 100644 tests/scenarios/02-cross-runtime-handoff/expected.json create mode 100644 tests/scenarios/02-cross-runtime-handoff/inputs.jsonl create mode 100644 tests/scenarios/02-cross-runtime-handoff/meta.yaml create mode 100644 tests/scenarios/02-cross-runtime-handoff/seed.sql create mode 100644 tests/scenarios/03-stale-claim-sweep/expected.json create mode 100644 tests/scenarios/03-stale-claim-sweep/inputs.jsonl create mode 100644 tests/scenarios/03-stale-claim-sweep/meta.yaml create mode 100644 tests/scenarios/03-stale-claim-sweep/seed.sql create mode 100644 tests/scenarios/04-plan-claim-adoption/expected.json create mode 100644 tests/scenarios/04-plan-claim-adoption/inputs.jsonl create mode 100644 tests/scenarios/04-plan-claim-adoption/meta.yaml create mode 100644 tests/scenarios/04-plan-claim-adoption/seed.sql create mode 100644 tests/scenarios/05-path-mismatch-reclaim/expected.json create mode 100644 tests/scenarios/05-path-mismatch-reclaim/inputs.jsonl create mode 100644 tests/scenarios/05-path-mismatch-reclaim/meta.yaml create mode 100644 tests/scenarios/05-path-mismatch-reclaim/seed.sql create mode 100644 tests/scenarios/README.md create mode 100644 tests/scenarios/_harness/__tests__/harness.test.ts create mode 100644 tests/scenarios/_harness/assert.mts create mode 100644 tests/scenarios/_harness/explain.mts create mode 100644 tests/scenarios/_harness/record.mts 
create mode 100644 tests/scenarios/_harness/run.mts create mode 100644 tests/scenarios/_harness/scenario.test.ts create mode 100644 tests/scenarios/_harness/setup.mts create mode 100644 tests/scenarios/_harness/tsconfig.json create mode 100644 tests/scenarios/_harness/vitest.config.ts diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6cee1f2..186cafe 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -50,3 +50,21 @@ jobs: cache: 'pnpm' - run: pnpm install --frozen-lockfile - run: bash scripts/e2e-publish.sh + + # Reproducible multi-agent scenarios under tests/scenarios/. Kept out + # of the `pnpm test` aggregate so a scenario failure stays attributable + # to the harness rather than blending into the per-package test job. + scenarios: + if: github.event_name != 'pull_request' || github.event.pull_request.draft == false + runs-on: ubuntu-latest + needs: build + timeout-minutes: 10 + steps: + - uses: actions/checkout@v4 + - uses: pnpm/action-setup@v4 + - uses: actions/setup-node@v4 + with: + node-version: '20' + cache: 'pnpm' + - run: pnpm install --frozen-lockfile + - run: pnpm scenarios diff --git a/README.md b/README.md index d727655..00d4909 100644 --- a/README.md +++ b/README.md @@ -1015,7 +1015,7 @@ validate. - ๐ŸŸก Cursor and Gemini CLI installers exist but have less smoke coverage - ๐Ÿ”ต Per-runtime smoke for claim-before-edit emission - ๐Ÿ”ต Cross-runtime handoff smoke (Codex hands off to Claude, both run) -- โณ Reproducible test fixture set under `tests/scenarios/` +- โœ… Reproducible test fixture set under `tests/scenarios/` (5 scenarios, harness self-tests, `pnpm scenarios`) > **`time-to-healthy`: still hours**, but the time the human spends _deciding what to run_ drops sharply because every signal carries its `cmd:` and `tool:` already. 
diff --git a/openspec/changes/scenarios-harness-2026-05-16/CHANGE.md b/openspec/changes/scenarios-harness-2026-05-16/CHANGE.md new file mode 100644 index 0000000..5ed4b5c --- /dev/null +++ b/openspec/changes/scenarios-harness-2026-05-16/CHANGE.md @@ -0,0 +1,31 @@ +--- +slug: scenarios-harness-2026-05-16 +--- + +# CHANGE ยท scenarios-harness-2026-05-16 + +## ยงP proposal + +### Problem + +README ยงv0.x "Multi-runtime confidence" lists "Reproducible test fixture set under `tests/scenarios/`" as the last open item. Today, multi-agent situations (claim-before-edit, cross-runtime handoff, stale-claim sweep, plan claim adoption, pre/post path mismatch) live as ad-hoc smoke tests scattered across `packages/hooks/test/` and `apps/cli/test/`. Each rebuilds its own tempdir + git repo + fake-timer scaffolding inline. Reproducing a regression means hand-porting that scaffolding into a fresh file. + +### Proposal + +Add a reproducible test-scenarios harness under `tests/scenarios/`. Each scenario is a directory of plaintext artifacts (no binary snapshots): + +- `seed.sql` โ€” applied after schema migrations against a fresh tempdir SQLite DB. +- `inputs.jsonl` โ€” one envelope per line: `{kind, at_ms, payload}` where `kind` is `lifecycle | mcp | tick`. Lifecycle flows through the same `runOmxLifecycleEnvelope` that production hooks call. +- `expected.json` โ€” normalized substrate snapshot with subset matchers (`toMatchObject` style), not full-row equality. Paths normalized to ``. +- Optional `meta.yaml` โ€” runtimes, tags, description. + +A shared `_harness/` drives all scenarios via `vi.useFakeTimers` + `vi.setSystemTime(BASE_TS + at_ms)` per envelope so timing is deterministic. Embeddings forced to `provider: none` to remove network. Five canonical scenarios ship in this PR: claim-before-edit, cross-runtime handoff, stale-claim sweep, plan claim adoption, pre/post path mismatch. 
Two harness self-tests prove the runner fails closed on missing expected and reports a clear diff on mismatch. A separate CI job runs `pnpm scenarios` on Node 20 after `build`, kept out of `pnpm test` so failure attribution stays clean. + +### Acceptance criteria + +- `pnpm scenarios` runs all five scenarios plus two harness self-tests, all green. +- `pnpm scenarios:filter ` runs a single scenario by name. +- `pnpm scenarios:explain ` prints a human-readable timeline. +- `pnpm scenarios:record ` regenerates `expected.json` from a live run (manual trim still required for subset matcher discipline). +- `.github/workflows/ci.yml` gains a `scenarios` job after `build` running on Node 20 only. +- `pnpm typecheck` and `pnpm build` clean. diff --git a/package.json b/package.json index 3efa656..0c26aa9 100644 --- a/package.json +++ b/package.json @@ -33,13 +33,18 @@ "p": "pnpm run release", "publish:cli": "bash scripts/publish-cli.sh", "publish:cli:dry-run": "bash scripts/publish-cli.sh --dry-run", - "release": "changeset publish" + "release": "changeset publish", + "scenarios": "vitest run --config tests/scenarios/_harness/vitest.config.ts", + "scenarios:filter": "vitest run --config tests/scenarios/_harness/vitest.config.ts -t", + "scenarios:explain": "tsx tests/scenarios/_harness/explain.mts", + "scenarios:record": "tsx tests/scenarios/_harness/record.mts" }, "devDependencies": { "@biomejs/biome": "^1.9.4", "@changesets/cli": "^2.27.9", "@types/node": "^22.9.0", "tsup": "^8.3.5", + "tsx": "^4.19.2", "typescript": "^5.6.3", "vitest": "^2.1.5" } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 97834ab..808b3d1 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -20,6 +20,9 @@ importers: tsup: specifier: ^8.3.5 version: 8.5.1(postcss@8.5.10)(tsx@4.21.0)(typescript@5.9.3) + tsx: + specifier: ^4.19.2 + version: 4.21.0 typescript: specifier: ^5.6.3 version: 5.9.3 diff --git a/tests/scenarios/01-claim-before-edit/expected.json 
b/tests/scenarios/01-claim-before-edit/expected.json new file mode 100644 index 0000000..5f8b5dc --- /dev/null +++ b/tests/scenarios/01-claim-before-edit/expected.json @@ -0,0 +1,112 @@ +{ + "observations": [ + { + "kind": "lifecycle_event", + "ts_offset": 10, + "metadata_subset": { + "event_name": "session_start", + "session_id": "codex@scenario-01", + "agent": "codex", + "branch": "agent/scenario/default", + "binding_status": "bound_task" + } + }, + { + "kind": "omx-lifecycle", + "metadata_subset": { + "event_id": "evt_01_session", + "event_type": "session_start", + "ok": true + } + }, + { + "kind": "omx-lifecycle", + "metadata_subset": { + "event_id": "evt_01_bind", + "event_type": "task_bind", + "ok": true + } + }, + { + "kind": "claim", + "ts_offset": 40, + "metadata_subset": { + "kind": "claim", + "source": "pre-tool-use", + "file_path": "src/target.ts", + "auto_claimed_before_edit": true, + "tool": "Edit" + } + }, + { + "kind": "claim-before-edit", + "ts_offset": 40, + "metadata_subset": { + "kind": "claim-before-edit", + "source": "pre-tool-use", + "outcome": "auto_claimed_before_edit", + "file_path": "src/target.ts", + "tool": "Edit", + "conflict": false + } + }, + { + "kind": "omx-lifecycle", + "ts_offset": 40, + "metadata_subset": { + "event_id": "evt_01_pre", + "event_type": "pre_tool_use", + "tool_name": "Edit", + "extracted_paths": ["src/target.ts"] + } + }, + { + "kind": "tool_use", + "ts_offset": 60, + "metadata_subset": { + "tool": "Edit", + "lifecycle_event_id": "evt_01_post", + "parent_event_id": "evt_01_pre", + "file_path": "src/target.ts" + } + }, + { + "kind": "omx-lifecycle", + "ts_offset": 60, + "metadata_subset": { + "event_id": "evt_01_post", + "event_type": "post_tool_use", + "parent_event_id": "evt_01_pre", + "tool_name": "Edit" + } + } + ], + "claims": [ + { + "task_id": 1, + "file_path": "src/target.ts", + "session_id": "codex@scenario-01", + "state": "active" + } + ], + "mcp_metrics": [], + "lifecycle_events": [ + { + "event_type": 
"session_start", + "event_id": "evt_01_session" + }, + { + "event_type": "task_bind", + "event_id": "evt_01_bind" + }, + { + "event_type": "pre_tool_use", + "event_id": "evt_01_pre" + }, + { + "event_type": "post_tool_use", + "event_id": "evt_01_post", + "parent_event_id": "evt_01_pre" + } + ] +} diff --git a/tests/scenarios/01-claim-before-edit/inputs.jsonl b/tests/scenarios/01-claim-before-edit/inputs.jsonl new file mode 100644 index 0000000..83c2875 --- /dev/null +++ b/tests/scenarios/01-claim-before-edit/inputs.jsonl @@ -0,0 +1,7 @@ +# Scenario 01 โ€” claim-before-edit +# A codex session starts, binds to a task, then claims src/target.ts +# before issuing a pre_tool_use + post_tool_use Edit on it. +{"kind":"lifecycle","at_ms":10,"payload":{"event_id":"evt_01_session","event_name":"session_start","session_id":"codex@scenario-01","agent":"codex","branch":"agent/scenario/default"}} +{"kind":"lifecycle","at_ms":20,"payload":{"event_id":"evt_01_bind","event_name":"task_bind","session_id":"codex@scenario-01","agent":"codex","branch":"agent/scenario/default"}} +{"kind":"lifecycle","at_ms":40,"payload":{"event_id":"evt_01_pre","event_name":"pre_tool_use","session_id":"codex@scenario-01","agent":"codex","branch":"agent/scenario/default","tool_name":"Edit","tool_input":{"operation":"replace","paths":[{"path":"/src/target.ts","role":"target","kind":"file"}]}}} +{"kind":"lifecycle","at_ms":60,"payload":{"event_id":"evt_01_post","event_name":"post_tool_use","parent_event_id":"evt_01_pre","session_id":"codex@scenario-01","agent":"codex","branch":"agent/scenario/default","tool_name":"Edit","tool_input":{"operation":"replace","paths":[{"path":"/src/target.ts","role":"target","kind":"file"}]},"tool_response":{"success":true}}} diff --git a/tests/scenarios/01-claim-before-edit/meta.yaml b/tests/scenarios/01-claim-before-edit/meta.yaml new file mode 100644 index 0000000..aa482af --- /dev/null +++ b/tests/scenarios/01-claim-before-edit/meta.yaml @@ -0,0 +1,10 @@ +description: | + 
Codex session binds to a task on agent/scenario/default and edits + src/target.ts. Proves pre_tool_use synthesizes a claim-before-edit + signal and that the post_tool_use observation lands with the claim + already in place. +runtimes: + - codex +tags: + - claim-before-edit + - lifecycle diff --git a/tests/scenarios/01-claim-before-edit/seed.sql b/tests/scenarios/01-claim-before-edit/seed.sql new file mode 100644 index 0000000..4ea4ce4 --- /dev/null +++ b/tests/scenarios/01-claim-before-edit/seed.sql @@ -0,0 +1,2 @@ +-- No seed needed: the lifecycle session_start + task_bind events drive +-- session creation and task binding by themselves. diff --git a/tests/scenarios/02-cross-runtime-handoff/expected.json b/tests/scenarios/02-cross-runtime-handoff/expected.json new file mode 100644 index 0000000..42ee6ad --- /dev/null +++ b/tests/scenarios/02-cross-runtime-handoff/expected.json @@ -0,0 +1,92 @@ +{ + "observations": [ + { + "kind": "lifecycle_event", + "metadata_subset": { + "event_name": "session_start", + "session_id": "codex@scenario-02", + "agent": "codex" + } + }, + { + "kind": "omx-lifecycle", + "metadata_subset": { + "event_id": "evt_02_codex_session", + "event_type": "session_start" + } + }, + { + "kind": "omx-lifecycle", + "metadata_subset": { + "event_id": "evt_02_codex_bind", + "event_type": "task_bind" + } + }, + { + "kind": "claim", + "ts_offset": 30, + "metadata_subset": { + "kind": "claim", + "file_path": "src/target.ts" + } + }, + { + "kind": "relay", + "ts_offset": 50, + "metadata_subset": { + "kind": "relay", + "from_session_id": "codex@scenario-02", + "from_agent": "codex", + "reason": "quota" + } + }, + { + "kind": "claim-weakened", + "ts_offset": 50, + "metadata_subset": { + "kind": "claim-weakened", + "file_path": "src/target.ts", + "state": "handoff_pending" + } + }, + { + "kind": "lifecycle_event", + "ts_offset": 300000, + "metadata_subset": { + "event_name": "session_start", + "session_id": "claude@scenario-02", + "agent": "claude" + } + }, 
+ { + "kind": "omx-lifecycle", + "metadata_subset": { + "event_id": "evt_02_claude_session", + "event_type": "session_start" + } + } + ], + "claims": [ + { + "task_id": 1, + "file_path": "src/target.ts", + "session_id": "claude@scenario-02", + "state": "active" + } + ], + "mcp_metrics": [], + "lifecycle_events": [ + { + "event_type": "session_start", + "event_id": "evt_02_codex_session" + }, + { + "event_type": "task_bind", + "event_id": "evt_02_codex_bind" + }, + { + "event_type": "session_start", + "event_id": "evt_02_claude_session" + } + ] +} diff --git a/tests/scenarios/02-cross-runtime-handoff/inputs.jsonl b/tests/scenarios/02-cross-runtime-handoff/inputs.jsonl new file mode 100644 index 0000000..3434415 --- /dev/null +++ b/tests/scenarios/02-cross-runtime-handoff/inputs.jsonl @@ -0,0 +1,11 @@ +# Scenario 02 โ€” cross-runtime handoff +# Codex binds to a task, claims src/target.ts, then relays out. Claude +# session adopts via accept_relay. Proves the baton pass survives +# across runtimes and the claim ends up owned by claude's session. 
+{"kind":"lifecycle","at_ms":10,"payload":{"event_id":"evt_02_codex_session","event_name":"session_start","session_id":"codex@scenario-02","agent":"codex","branch":"agent/scenario/default"}} +{"kind":"lifecycle","at_ms":20,"payload":{"event_id":"evt_02_codex_bind","event_name":"task_bind","session_id":"codex@scenario-02","agent":"codex","branch":"agent/scenario/default"}} +{"kind":"task","at_ms":30,"payload":{"action":"claim_file","task_id":1,"session_id":"codex@scenario-02","file_path":"src/target.ts","note":"codex claims before relay"}} +{"kind":"task","at_ms":50,"payload":{"action":"relay","task_id":1,"from_session_id":"codex@scenario-02","from_agent":"codex","reason":"quota","one_line":"codex hit quota, handing off","base_branch":"main","expires_in_ms":600000}} +{"kind":"lifecycle","at_ms":300000,"payload":{"event_id":"evt_02_claude_session","event_name":"session_start","session_id":"claude@scenario-02","agent":"claude","branch":"agent/scenario/default"}} +{"kind":"task","at_ms":300100,"payload":{"action":"join","task_id":1,"session_id":"claude@scenario-02","agent":"claude"}} +{"kind":"task","at_ms":300200,"payload":{"action":"accept_relay","task_id":1,"session_id":"claude@scenario-02"}} diff --git a/tests/scenarios/02-cross-runtime-handoff/meta.yaml b/tests/scenarios/02-cross-runtime-handoff/meta.yaml new file mode 100644 index 0000000..7429253 --- /dev/null +++ b/tests/scenarios/02-cross-runtime-handoff/meta.yaml @@ -0,0 +1,13 @@ +description: | + Codex binds to a task on agent/scenario/default and claims + src/target.ts. Codex then relays out (quota reason). 5 minutes later + a claude session starts, joins the task, and accepts the relay. + Proves the cross-runtime baton pass and that the claim ends up + owned by claude's session. 
+runtimes: + - codex + - claude +tags: + - handoff + - relay + - cross-runtime diff --git a/tests/scenarios/02-cross-runtime-handoff/seed.sql b/tests/scenarios/02-cross-runtime-handoff/seed.sql new file mode 100644 index 0000000..e1cb924 --- /dev/null +++ b/tests/scenarios/02-cross-runtime-handoff/seed.sql @@ -0,0 +1,2 @@ +-- No seed needed: lifecycle session_start + task_bind create the task, +-- task envelopes drive the relay and adoption. diff --git a/tests/scenarios/03-stale-claim-sweep/expected.json b/tests/scenarios/03-stale-claim-sweep/expected.json new file mode 100644 index 0000000..ee912ff --- /dev/null +++ b/tests/scenarios/03-stale-claim-sweep/expected.json @@ -0,0 +1,77 @@ +{ + "observations": [ + { + "kind": "lifecycle_event", + "metadata_subset": { + "session_id": "codex@scenario-03", + "agent": "codex" + } + }, + { + "kind": "omx-lifecycle", + "metadata_subset": { + "event_id": "evt_03_session", + "event_type": "session_start" + } + }, + { + "kind": "omx-lifecycle", + "metadata_subset": { + "event_id": "evt_03_bind", + "event_type": "task_bind" + } + }, + { + "kind": "claim", + "ts_offset": 30, + "metadata_subset": { + "kind": "claim", + "file_path": "src/target.ts" + } + }, + { + "kind": "relay", + "ts_offset": 50, + "metadata_subset": { + "kind": "relay", + "reason": "quota" + } + }, + { + "kind": "claim-weakened", + "ts_offset": 50, + "metadata_subset": { + "kind": "claim-weakened", + "file_path": "src/target.ts", + "state": "handoff_pending" + } + }, + { + "kind": "claim-weakened", + "ts_offset": 120100, + "metadata_subset": { + "kind": "claim-weakened", + "file_path": "src/target.ts", + "state": "weak_expired" + } + } + ], + "claims": [ + { + "task_id": 1, + "file_path": "src/target.ts", + "state": "weak_expired" + } + ], + "mcp_metrics": [], + "lifecycle_events": [ + { + "event_type": "session_start", + "event_id": "evt_03_session" + }, + { + "event_type": "task_bind", + "event_id": "evt_03_bind" + } + ] +} diff --git 
a/tests/scenarios/03-stale-claim-sweep/inputs.jsonl b/tests/scenarios/03-stale-claim-sweep/inputs.jsonl new file mode 100644 index 0000000..c0c5970 --- /dev/null +++ b/tests/scenarios/03-stale-claim-sweep/inputs.jsonl @@ -0,0 +1,10 @@ +# Scenario 03 โ€” stale-claim sweep +# Codex binds, claims, then relays out (creates handoff_pending claim +# with TTL=60s). After the TTL passes, a tick + release_expired_quota +# transitions the claim into weak_expired. +{"kind":"lifecycle","at_ms":10,"payload":{"event_id":"evt_03_session","event_name":"session_start","session_id":"codex@scenario-03","agent":"codex","branch":"agent/scenario/default"}} +{"kind":"lifecycle","at_ms":20,"payload":{"event_id":"evt_03_bind","event_name":"task_bind","session_id":"codex@scenario-03","agent":"codex","branch":"agent/scenario/default"}} +{"kind":"task","at_ms":30,"payload":{"action":"claim_file","task_id":1,"session_id":"codex@scenario-03","file_path":"src/target.ts","note":"about to be expired"}} +{"kind":"task","at_ms":50,"payload":{"action":"relay","task_id":1,"from_session_id":"codex@scenario-03","from_agent":"codex","reason":"quota","one_line":"forced quota for stale-claim test","base_branch":"main","expires_in_ms":60000}} +{"kind":"tick","at_ms":120000,"payload":{"reason":"advance past 60s TTL"}} +{"kind":"task","at_ms":120100,"payload":{"action":"release_expired_quota","task_id":1,"session_id":"codex@scenario-03"}} diff --git a/tests/scenarios/03-stale-claim-sweep/meta.yaml b/tests/scenarios/03-stale-claim-sweep/meta.yaml new file mode 100644 index 0000000..18cb4f3 --- /dev/null +++ b/tests/scenarios/03-stale-claim-sweep/meta.yaml @@ -0,0 +1,11 @@ +description: | + Codex claims src/target.ts then relays out with TTL=60s. The + scenario ticks past TTL and asks the task to release_expired_quota. + Proves stale claims transition to weak_expired and become eligible + for the next agent to claim. 
+runtimes: + - codex +tags: + - stale-claim + - sweep + - ttl diff --git a/tests/scenarios/03-stale-claim-sweep/seed.sql b/tests/scenarios/03-stale-claim-sweep/seed.sql new file mode 100644 index 0000000..ace7060 --- /dev/null +++ b/tests/scenarios/03-stale-claim-sweep/seed.sql @@ -0,0 +1,2 @@ +-- No seed needed: relay + tick + release_expired_quota drives the +-- claim through handoff_pending into weak_expired. diff --git a/tests/scenarios/04-plan-claim-adoption/expected.json b/tests/scenarios/04-plan-claim-adoption/expected.json new file mode 100644 index 0000000..2bebc85 --- /dev/null +++ b/tests/scenarios/04-plan-claim-adoption/expected.json @@ -0,0 +1,75 @@ +{ + "observations": [ + { + "kind": "plan-subtask-claim", + "metadata_subset": { + "kind": "plan-subtask-claim", + "status": "pending", + "plan_slug": "scenario-04-plan", + "subtask_index": 0 + } + }, + { + "kind": "lifecycle_event", + "metadata_subset": { + "session_id": "codex@scenario-04", + "agent": "codex", + "task_id": 100, + "binding_status": "bound_task" + } + }, + { + "kind": "omx-lifecycle", + "metadata_subset": { + "event_id": "evt_04_session", + "event_type": "session_start" + } + }, + { + "kind": "omx-lifecycle", + "metadata_subset": { + "event_id": "evt_04_bind", + "event_type": "task_bind" + } + }, + { + "kind": "claim", + "ts_offset": 40, + "metadata_subset": { + "kind": "claim", + "file_path": "src/target.ts" + } + }, + { + "kind": "plan-subtask-claim", + "ts_offset": 50, + "metadata_subset": { + "kind": "plan-subtask-claim", + "status": "claimed", + "plan_slug": "scenario-04-plan", + "subtask_index": 0, + "session_id": "codex@scenario-04", + "agent": "codex" + } + } + ], + "claims": [ + { + "task_id": 100, + "file_path": "src/target.ts", + "session_id": "codex@scenario-04", + "state": "active" + } + ], + "mcp_metrics": [], + "lifecycle_events": [ + { + "event_type": "session_start", + "event_id": "evt_04_session" + }, + { + "event_type": "task_bind", + "event_id": "evt_04_bind" + } + ] +} 
diff --git a/tests/scenarios/04-plan-claim-adoption/inputs.jsonl b/tests/scenarios/04-plan-claim-adoption/inputs.jsonl new file mode 100644 index 0000000..eacc168 --- /dev/null +++ b/tests/scenarios/04-plan-claim-adoption/inputs.jsonl @@ -0,0 +1,9 @@ +# Scenario 04 โ€” plan claim adoption +# Queen published a plan sub-task (seeded). A codex agent starts a +# session on the sub-task branch, joins the task, claims the file scope, +# and emits a plan-subtask-claim observation transitioning to 'claimed'. +{"kind":"lifecycle","at_ms":10,"payload":{"event_id":"evt_04_session","event_name":"session_start","session_id":"codex@scenario-04","agent":"codex","branch":"spec/scenario-04-plan/sub-0"}} +{"kind":"lifecycle","at_ms":20,"payload":{"event_id":"evt_04_bind","event_name":"task_bind","session_id":"codex@scenario-04","agent":"codex","branch":"spec/scenario-04-plan/sub-0"}} +{"kind":"task","at_ms":30,"payload":{"action":"join","task_id":100,"session_id":"codex@scenario-04","agent":"codex"}} +{"kind":"task","at_ms":40,"payload":{"action":"claim_file","task_id":100,"session_id":"codex@scenario-04","file_path":"src/target.ts","note":"adopting plan sub-task"}} +{"kind":"task","at_ms":50,"payload":{"action":"add_observation","task_id":100,"session_id":"codex@scenario-04","kind":"plan-subtask-claim","content":"codex adopting scenario-04-plan sub-0","metadata":{"kind":"plan-subtask-claim","status":"claimed","plan_slug":"scenario-04-plan","subtask_index":0,"session_id":"codex@scenario-04","agent":"codex"}}} diff --git a/tests/scenarios/04-plan-claim-adoption/meta.yaml b/tests/scenarios/04-plan-claim-adoption/meta.yaml new file mode 100644 index 0000000..ee9d046 --- /dev/null +++ b/tests/scenarios/04-plan-claim-adoption/meta.yaml @@ -0,0 +1,13 @@ +description: | + Seeded with a queen-published plan sub-task (task_id=100). A codex + agent starts a session on the sub-task branch, joins the task, + claims the file scope, and writes the plan-subtask-claim + 'claimed' observation. 
Proves adoption of a queen-owned task without + needing the full publishPlan side-effect cascade. +runtimes: + - codex + - queen +tags: + - plan + - subtask + - adoption diff --git a/tests/scenarios/04-plan-claim-adoption/seed.sql b/tests/scenarios/04-plan-claim-adoption/seed.sql new file mode 100644 index 0000000..0b38e67 --- /dev/null +++ b/tests/scenarios/04-plan-claim-adoption/seed.sql @@ -0,0 +1,33 @@ +-- Queen published a plan with one sub-task. We seed the sub-task row +-- directly (so the scenario doesn't have to walk publishPlan's full +-- file-emit side effect). Agent later claims and adopts the sub-task. +INSERT INTO sessions(id, ide, cwd, started_at, metadata) + VALUES ('queen@scenario-04', 'queen', '', 1778925600000, NULL); + +INSERT INTO tasks(id, title, repo_root, branch, status, created_by, created_at, updated_at) + VALUES ( + 100, + 'Plan claim adoption sub-task', + '', + 'spec/scenario-04-plan/sub-0', + 'open', + 'queen@scenario-04', + 1778925600000, + 1778925600000 + ); + +-- Queen records the plan-subtask-claim as 'pending' so the agent has +-- something to adopt. Real queen publishes also emit plan-config, but +-- that's outside the adoption assertion surface. 
+INSERT INTO observations(id, session_id, kind, content, compressed, intensity, ts, metadata, task_id) + VALUES ( + 1000, + 'queen@scenario-04', + 'plan-subtask-claim', + 'queen published scenario-04 sub-0', + 0, + NULL, + 1778925600000, + '{"kind":"plan-subtask-claim","status":"pending","plan_slug":"scenario-04-plan","subtask_index":0}', + 100 + ); diff --git a/tests/scenarios/05-path-mismatch-reclaim/expected.json b/tests/scenarios/05-path-mismatch-reclaim/expected.json new file mode 100644 index 0000000..aa0d04c --- /dev/null +++ b/tests/scenarios/05-path-mismatch-reclaim/expected.json @@ -0,0 +1,95 @@ +{ + "observations": [ + { + "kind": "lifecycle_event", + "metadata_subset": { + "session_id": "codex@scenario-05", + "agent": "codex", + "binding_status": "bound_task", + "task_id": 1 + } + }, + { + "kind": "omx-lifecycle", + "metadata_subset": { + "event_id": "evt_05_session", + "event_type": "session_start" + } + }, + { + "kind": "omx-lifecycle", + "metadata_subset": { + "event_id": "evt_05_bind", + "event_type": "task_bind" + } + }, + { + "kind": "claim", + "ts_offset": 30, + "metadata_subset": { + "kind": "claim", + "file_path": "src/secondary.ts" + } + }, + { + "kind": "claim", + "ts_offset": 50, + "metadata_subset": { + "kind": "claim", + "source": "pre-tool-use", + "file_path": "src/target.ts", + "auto_claimed_before_edit": true, + "tool": "Edit" + } + }, + { + "kind": "claim-before-edit", + "ts_offset": 50, + "metadata_subset": { + "kind": "claim-before-edit", + "source": "pre-tool-use", + "outcome": "auto_claimed_before_edit", + "file_path": "src/target.ts", + "tool": "Edit" + } + }, + { + "kind": "omx-lifecycle", + "metadata_subset": { + "event_id": "evt_05_pre", + "event_type": "pre_tool_use", + "tool_name": "Edit", + "extracted_paths": ["src/target.ts"] + } + } + ], + "claims": [ + { + "task_id": 1, + "file_path": "src/secondary.ts", + "session_id": "codex@scenario-05", + "state": "active" + }, + { + "task_id": 1, + "file_path": "src/target.ts", + 
"session_id": "codex@scenario-05", + "state": "active" + } + ], + "mcp_metrics": [], + "lifecycle_events": [ + { + "event_type": "session_start", + "event_id": "evt_05_session" + }, + { + "event_type": "task_bind", + "event_id": "evt_05_bind" + }, + { + "event_type": "pre_tool_use", + "event_id": "evt_05_pre" + } + ] +} diff --git a/tests/scenarios/05-path-mismatch-reclaim/inputs.jsonl b/tests/scenarios/05-path-mismatch-reclaim/inputs.jsonl new file mode 100644 index 0000000..5fd33d3 --- /dev/null +++ b/tests/scenarios/05-path-mismatch-reclaim/inputs.jsonl @@ -0,0 +1,8 @@ +# Scenario 05 โ€” path-mismatch reclaim +# Codex claims src/secondary.ts up front, then issues pre_tool_use on +# src/target.ts. The pre-tool-use hook must auto-claim the actual file +# (target.ts), producing both claims on the task. +{"kind":"lifecycle","at_ms":10,"payload":{"event_id":"evt_05_session","event_name":"session_start","session_id":"codex@scenario-05","agent":"codex","branch":"agent/scenario/default"}} +{"kind":"lifecycle","at_ms":20,"payload":{"event_id":"evt_05_bind","event_name":"task_bind","session_id":"codex@scenario-05","agent":"codex","branch":"agent/scenario/default"}} +{"kind":"task","at_ms":30,"payload":{"action":"claim_file","task_id":1,"session_id":"codex@scenario-05","file_path":"src/secondary.ts","note":"claimed wrong file first"}} +{"kind":"lifecycle","at_ms":50,"payload":{"event_id":"evt_05_pre","event_name":"pre_tool_use","session_id":"codex@scenario-05","agent":"codex","branch":"agent/scenario/default","tool_name":"Edit","tool_input":{"operation":"replace","paths":[{"path":"/src/target.ts","role":"target","kind":"file"}]}}} diff --git a/tests/scenarios/05-path-mismatch-reclaim/meta.yaml b/tests/scenarios/05-path-mismatch-reclaim/meta.yaml new file mode 100644 index 0000000..3d83022 --- /dev/null +++ b/tests/scenarios/05-path-mismatch-reclaim/meta.yaml @@ -0,0 +1,11 @@ +description: | + Codex claims src/secondary.ts then issues a pre_tool_use Edit on + src/target.ts. 
Proves the pre-tool-use auto-claim path catches the + mismatch and adds an active claim for the actual edited file, leaving + both rows in task_claims. +runtimes: + - codex +tags: + - claim-before-edit + - path-mismatch + - reclaim diff --git a/tests/scenarios/05-path-mismatch-reclaim/seed.sql b/tests/scenarios/05-path-mismatch-reclaim/seed.sql new file mode 100644 index 0000000..88c8a8a --- /dev/null +++ b/tests/scenarios/05-path-mismatch-reclaim/seed.sql @@ -0,0 +1,2 @@ +-- No seed needed: scenario drives all claims through task envelopes +-- and lifecycle pre_tool_use. diff --git a/tests/scenarios/README.md b/tests/scenarios/README.md new file mode 100644 index 0000000..fa31200 --- /dev/null +++ b/tests/scenarios/README.md @@ -0,0 +1,80 @@ +# Test scenarios + +Reproducible multi-agent situations driven against the same in-process +code path the production runtimes use. + +Each scenario is a directory of plaintext artifacts (no binary +snapshots): + +- `seed.sql` โ€” applied after schema migrations against a fresh tempdir + SQLite DB. `` placeholders are expanded to the live + tempdir path before execution. +- `inputs.jsonl` โ€” one envelope per line, sorted by `at_ms`. Each + envelope has shape `{kind, at_ms, payload}`. `kind` is one of: + - `lifecycle` โ€” funnel `payload` through `runOmxLifecycleEnvelope` + (same entrypoint production hooks call). + - `mcp` โ€” record an MCP metric row. + - `task` โ€” direct `TaskThread` action (`claim_file`, `relay`, + `accept_relay`, `release_expired_quota`, `join`, `add_observation`). + - `tick` โ€” advance the fake clock without dispatching anything. +- `expected.json` โ€” normalized substrate snapshot using **subset + matchers** (vitest `toMatchObject` style). Fields not listed are + ignored. Paths normalized to `` so diffs are tempdir-stable. +- `meta.yaml` (optional) โ€” `runtimes`, `tags`, `description`. 
+ ## Commands + ```bash +pnpm scenarios # run all scenarios + harness self-tests +pnpm scenarios:filter 03-stale-claim-sweep # run one by slug +pnpm scenarios:explain 02-cross-runtime-handoff # human-readable timeline +pnpm scenarios:record 04-plan-claim-adoption # regenerate expected.json +``` + +After `scenarios:record`, hand-trim the generated file down to subset +matchers — leaving the full row in is a defect because tests will then +break on unrelated noise. + +## Determinism rules + +- `BASE_TS = 2026-05-16T10:00:00.000Z`. Every `at_ms` is an offset from + this anchor. The runner calls `vi.setSystemTime(BASE_TS + at_ms)` (or + the equivalent for `scenarios:record`) before each input. +- Embeddings forced to `provider: 'none'` in the harness so no scenario + reaches for the network or pulls a model. +- Session IDs are explicit in `inputs.jsonl`. Do not call + `store.startSession()` without an id — randomness would defeat the + point. +- Paths in `expected.json` use `<REPO_ROOT>` instead of the live + tempdir. + +## Scenarios + +| Slug | What it proves | +| --- | --- | +| `01-claim-before-edit` | Codex pre_tool_use auto-claims target before Edit lands; post_tool_use sees the claim. | +| `02-cross-runtime-handoff` | Codex relays out, claude session adopts the relay; claim ownership flips to claude. | +| `03-stale-claim-sweep` | Relay TTL expires; `release_expired_quota` transitions claim to `weak_expired`. | +| `04-plan-claim-adoption` | Seeded queen sub-task gets adopted by a codex agent (`plan-subtask-claim` → `claimed`). | +| `05-path-mismatch-reclaim` | Agent claims wrong file first; pre_tool_use on a different path auto-claims the correct one. | +
+## Adding a scenario + +1. `mkdir tests/scenarios/NN-slug && cd tests/scenarios/NN-slug` +2. Write `seed.sql` (or leave empty) and `inputs.jsonl`. +3. `pnpm scenarios:record NN-slug` to bootstrap `expected.json`. +4. Hand-trim `expected.json` to subset matchers. +5. `pnpm scenarios:filter NN-slug` until green. +6.
`pnpm scenarios` to confirm full suite stays green. + +## Harness self-tests + +`_harness/__tests__/harness.test.ts` proves the runner fails closed: + +- Missing `expected.json` throws `ScenarioConfigError`. +- Mismatched `expected.json` throws `ScenarioMismatchError` with the + scenario slug, offending key path, actual value, and expected value + in the message. + +If you add a new envelope kind or normalizer, extend the self-tests so +the harness can't silently pass against a wrong fixture. diff --git a/tests/scenarios/_harness/__tests__/harness.test.ts b/tests/scenarios/_harness/__tests__/harness.test.ts new file mode 100644 index 0000000..5c7967d --- /dev/null +++ b/tests/scenarios/_harness/__tests__/harness.test.ts @@ -0,0 +1,160 @@ +import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; +import { + ScenarioMismatchError, + assertExpectedMatch, + collectLiveSubstrate, + loadExpected, +} from '../assert.mjs'; +import { ScenarioConfigError, parseInputsJsonl, runScenarioInputs } from '../run.mjs'; +import { + BASE_TS, + type ScenarioContext, + setupScenarioContext, + teardownScenarioContext, +} from '../setup.mjs'; + +/** + * Self-tests that prove the runner fails closed in the two ways most + * likely to silently let a scenario pass against the wrong fixture: + * 1) expected.json is missing entirely + * 2) expected.json disagrees with the live substrate + * + * Both must surface a structured error with the slug, the offending + * key path, and both sides of the diff. 
+ */ +describe('scenarios harness self-tests', () => { + let scratch: string; + let ctx: ScenarioContext | undefined; + + beforeEach(() => { + vi.useFakeTimers(); + vi.setSystemTime(BASE_TS); + scratch = mkdtempSync(join(tmpdir(), 'colony-harness-selftest-')); + }); + + afterEach(() => { + teardownScenarioContext(ctx); + ctx = undefined; + try { + rmSync(scratch, { recursive: true, force: true }); + } catch { + // best effort + } + vi.useRealTimers(); + }); + + it('fails closed when expected.json is missing', async () => { + const fixtureDir = join(scratch, 'no-expected'); + mkdirSync(fixtureDir, { recursive: true }); + // Inputs file exists but expected.json does not โ€” the runner must + // refuse to mark the scenario green just because the live run + // happened to succeed. + writeFileSync( + join(fixtureDir, 'inputs.jsonl'), + `${JSON.stringify({ + kind: 'lifecycle', + at_ms: 10, + payload: { + event_id: 'evt_selftest_session', + event_name: 'session_start', + session_id: 'codex@selftest', + agent: 'codex', + branch: 'agent/scenario/default', + }, + })}\n`, + 'utf8', + ); + ctx = setupScenarioContext({ scenarioDir: fixtureDir }); + const inputs = parseInputsJsonl(join(fixtureDir, 'inputs.jsonl')); + await runScenarioInputs(ctx, inputs, (ms) => { + vi.setSystemTime(ms); + }); + expect(() => loadExpected(fixtureDir)).toThrow(ScenarioConfigError); + expect(() => loadExpected(fixtureDir)).toThrow(/missing expected\.json/); + }); + + it('reports a clear diff when expected.json disagrees with substrate', async () => { + const fixtureDir = join(scratch, 'wrong-expected'); + mkdirSync(fixtureDir, { recursive: true }); + writeFileSync( + join(fixtureDir, 'inputs.jsonl'), + `${JSON.stringify({ + kind: 'lifecycle', + at_ms: 10, + payload: { + event_id: 'evt_selftest_bind_session', + event_name: 'session_start', + session_id: 'codex@selftest', + agent: 'codex', + branch: 'agent/scenario/default', + }, + })}\n${JSON.stringify({ + kind: 'lifecycle', + at_ms: 20, + payload: { 
+ event_id: 'evt_selftest_bind', + event_name: 'task_bind', + session_id: 'codex@selftest', + agent: 'codex', + branch: 'agent/scenario/default', + }, + })}\n${JSON.stringify({ + kind: 'lifecycle', + at_ms: 40, + payload: { + event_id: 'evt_selftest_pre', + event_name: 'pre_tool_use', + session_id: 'codex@selftest', + agent: 'codex', + branch: 'agent/scenario/default', + tool_name: 'Edit', + tool_input: { + operation: 'replace', + paths: [{ path: '/src/target.ts', role: 'target', kind: 'file' }], + }, + }, + })}\n`, + 'utf8', + ); + // Deliberately wrong file_path so the runner must report the mismatch. + writeFileSync( + join(fixtureDir, 'expected.json'), + `${JSON.stringify({ + claims: [ + { + file_path: 'src/wrong-target.ts', + }, + ], + })}\n`, + 'utf8', + ); + ctx = setupScenarioContext({ scenarioDir: fixtureDir }); + const inputs = parseInputsJsonl(join(fixtureDir, 'inputs.jsonl')); + await runScenarioInputs(ctx, inputs, (ms) => { + vi.setSystemTime(ms); + }); + const live = collectLiveSubstrate(ctx); + const expected = loadExpected(fixtureDir); + + let captured: ScenarioMismatchError | undefined; + try { + assertExpectedMatch('wrong-expected', expected, live); + } catch (err) { + if (err instanceof ScenarioMismatchError) captured = err; + else throw err; + } + expect(captured, 'expected a ScenarioMismatchError').toBeDefined(); + if (!captured) throw new Error('unreachable'); + expect(captured.slug).toBe('wrong-expected'); + expect(captured.keyPath).toBe('claims[0].file_path'); + expect(captured.actual).toBe('src/target.ts'); + expect(captured.expected).toBe('src/wrong-target.ts'); + expect(captured.message).toContain('wrong-expected'); + expect(captured.message).toContain('claims[0].file_path'); + expect(captured.message).toContain('src/wrong-target.ts'); + expect(captured.message).toContain('src/target.ts'); + }); +}); diff --git a/tests/scenarios/_harness/assert.mts b/tests/scenarios/_harness/assert.mts new file mode 100644 index 0000000..5716df3 --- 
/dev/null +++ b/tests/scenarios/_harness/assert.mts @@ -0,0 +1,308 @@ +import { existsSync, readFileSync } from 'node:fs'; +import { join } from 'node:path'; +import { BASE_TS, type ScenarioContext } from './setup.mjs'; +import { ScenarioConfigError } from './run.mjs'; + +/** + * Shape of expected.json. All arrays use subset matchers via + * vitest's `toMatchObject` semantics, so each entry only has to assert + * the fields it cares about. Order is significant: entry 0 in expected + * must match the first live row of that kind that survives filters. + */ +export interface ExpectedSubstrate { + observations?: ExpectedObservation[]; + claims?: ExpectedClaim[]; + mcp_metrics?: ExpectedMcpMetric[]; + lifecycle_events?: ExpectedLifecycleEvent[]; +} + +export interface ExpectedObservation { + kind: string; + ts_offset?: number; + /** Subset-match over the JSON-parsed metadata column. */ + metadata_subset?: Record; +} + +export interface ExpectedClaim { + task_id?: number; + file_path: string; + session_id?: string; + state?: string; +} + +export interface ExpectedMcpMetric { + operation: string; + session_id?: string; + ok?: boolean; +} + +export interface ExpectedLifecycleEvent { + event_type: string; + event_id?: string; + parent_event_id?: string; +} + +/** + * Normalized live substrate the runner exposes to the diff. Everything + * is plain JSON so it survives the same `toMatchObject` semantics + * that drive subset matchers. + */ +export interface LiveSubstrate { + observations: Array<{ + kind: string; + ts_offset: number; + metadata_subset: Record | null; + }>; + claims: Array<{ + task_id: number; + file_path: string; + session_id: string; + state: string; + }>; + mcp_metrics: Array<{ operation: string; session_id: string | null; ok: boolean }>; + lifecycle_events: Array<{ + event_type: string; + event_id: string; + parent_event_id: string | null; + }>; +} + +/** + * Collect the live substrate after a scenario run. 
Paths get rewritten + * to `` so the diff doesn't depend on the tempdir name. + */ +export function collectLiveSubstrate(ctx: ScenarioContext): LiveSubstrate { + const storageWithDb = ctx.store.storage as unknown as { + db: { prepare: (sql: string) => { all: (...args: unknown[]) => unknown[] } }; + }; + const db = storageWithDb.db; + + const obsRows = db + .prepare( + 'SELECT id, kind, ts, metadata FROM observations ORDER BY ts ASC, id ASC', + ) + .all() as Array<{ id: number; kind: string; ts: number; metadata: string | null }>; + + const claimRows = db + .prepare( + 'SELECT task_id, file_path, session_id, state FROM task_claims ORDER BY task_id ASC, file_path ASC', + ) + .all() as Array<{ task_id: number; file_path: string; session_id: string; state: string }>; + + const mcpRows = db + .prepare('SELECT operation, session_id, ok FROM mcp_metrics ORDER BY ts ASC, rowid ASC') + .all() as Array<{ operation: string; session_id: string | null; ok: number }>; + + return { + observations: obsRows.map((row) => ({ + kind: row.kind, + ts_offset: row.ts - BASE_TS, + metadata_subset: parseMetadata(row.metadata, ctx.repoRoot), + })), + claims: claimRows.map((row) => ({ + task_id: row.task_id, + file_path: normalizePath(row.file_path, ctx.repoRoot), + session_id: row.session_id, + state: row.state, + })), + mcp_metrics: mcpRows.map((row) => ({ + operation: row.operation, + session_id: row.session_id, + ok: row.ok === 1, + })), + lifecycle_events: obsRows + .filter((row) => row.kind === 'omx-lifecycle') + .map((row) => { + const meta = parseMetadata(row.metadata, ctx.repoRoot) ?? {}; + return { + event_type: stringOr(meta.event_type, ''), + event_id: stringOr(meta.event_id, ''), + parent_event_id: typeof meta.parent_event_id === 'string' ? 
meta.parent_event_id : null, + }; + }), + }; +} + +function parseMetadata( + raw: string | null, + repoRoot: string, +): Record | null { + if (!raw) return null; + try { + const parsed = JSON.parse(raw) as Record; + return normalizeDeep(parsed, repoRoot) as Record; + } catch { + return null; + } +} + +function normalizeDeep(value: unknown, repoRoot: string): unknown { + if (typeof value === 'string') return normalizePath(value, repoRoot); + if (Array.isArray(value)) return value.map((v) => normalizeDeep(v, repoRoot)); + if (value && typeof value === 'object') { + const out: Record = {}; + for (const [k, v] of Object.entries(value as Record)) { + out[k] = normalizeDeep(v, repoRoot); + } + return out; + } + return value; +} + +function normalizePath(value: string, repoRoot: string): string { + if (value.length === 0) return value; + // Replace the tempdir prefix with a stable placeholder so diffs are + // path-stable. `repoRoot` is the absolute path of the temp repo dir. + return value.split(repoRoot).join(''); +} + +function stringOr(value: unknown, fallback: string): string { + return typeof value === 'string' ? value : fallback; +} + +/** + * Load expected.json for a scenario or throw a clear error. Fails closed + * โ€” a missing expected.json must not silently let a scenario pass. + */ +export function loadExpected(scenarioDir: string): ExpectedSubstrate { + const expectedPath = join(scenarioDir, 'expected.json'); + if (!existsSync(expectedPath)) { + throw new ScenarioConfigError( + `missing expected.json โ€” scenario at ${scenarioDir} has no expected substrate to diff against`, + ); + } + let parsed: unknown; + try { + parsed = JSON.parse(readFileSync(expectedPath, 'utf8')); + } catch (err) { + throw new ScenarioConfigError( + `expected.json at ${expectedPath} is invalid JSON: ${err instanceof Error ? 
err.message : String(err)}`, + ); + } + if (!parsed || typeof parsed !== 'object') { + throw new ScenarioConfigError(`expected.json at ${expectedPath} must be an object`); + } + return parsed as ExpectedSubstrate; +} + +/** + * Assert subset-match for each expected array entry against the live + * substrate. Errors include the scenario slug, the offending key path, + * and both actual and expected JSON so authors can see the diff inline. + */ +export function assertExpectedMatch( + slug: string, + expected: ExpectedSubstrate, + live: LiveSubstrate, +): void { + if (expected.observations) { + assertArraySubset(slug, 'observations', expected.observations, live.observations); + } + if (expected.claims) { + assertArraySubset(slug, 'claims', expected.claims, live.claims); + } + if (expected.mcp_metrics) { + assertArraySubset(slug, 'mcp_metrics', expected.mcp_metrics, live.mcp_metrics); + } + if (expected.lifecycle_events) { + assertArraySubset(slug, 'lifecycle_events', expected.lifecycle_events, live.lifecycle_events); + } +} + +function assertArraySubset( + slug: string, + arrayKey: string, + expected: unknown[], + live: unknown[], +): void { + if (live.length < expected.length) { + throw new ScenarioMismatchError( + slug, + `${arrayKey}.length`, + live.length, + `>= ${expected.length}`, + ); + } + for (let i = 0; i < expected.length; i += 1) { + const mismatch = findSubsetMismatch(live[i], expected[i], `${arrayKey}[${i}]`); + if (mismatch) { + throw new ScenarioMismatchError(slug, mismatch.path, mismatch.actual, mismatch.expected); + } + } +} + +/** + * Hand-rolled subset-match (`toMatchObject` semantics) returning the + * deepest mismatch point. Built in-house so the harness can run from + * record.ts without dragging vitest's runtime into the import graph. 
+ * + * Rules: + * - primitives compared by value + * - arrays: live must include each expected element in same order; + * elements compared recursively as subsets + * - objects: every key in expected must subset-match in live + * - keys present in live but absent in expected are ignored + */ +function findSubsetMismatch( + actual: unknown, + expected: unknown, + path: string, +): { path: string; actual: unknown; expected: unknown } | null { + if (Array.isArray(expected)) { + if (!Array.isArray(actual)) { + return { path, actual, expected }; + } + if (actual.length < expected.length) { + return { path: `${path}.length`, actual: actual.length, expected: expected.length }; + } + for (let i = 0; i < expected.length; i += 1) { + const inner = findSubsetMismatch(actual[i], expected[i], `${path}[${i}]`); + if (inner) return inner; + } + return null; + } + if (expected !== null && typeof expected === 'object') { + if (actual === null || typeof actual !== 'object' || Array.isArray(actual)) { + return { path, actual, expected }; + } + for (const [k, v] of Object.entries(expected as Record)) { + const innerPath = `${path}.${k}`; + const inner = findSubsetMismatch((actual as Record)[k], v, innerPath); + if (inner) return inner; + } + return null; + } + // primitive + if (actual !== expected) { + return { path, actual, expected }; + } + return null; +} + +export class ScenarioMismatchError extends Error { + readonly slug: string; + readonly keyPath: string; + readonly actual: unknown; + readonly expected: unknown; + + constructor(slug: string, keyPath: string, actual: unknown, expected: unknown) { + super( + `scenario "${slug}" mismatch at ${keyPath}\n` + + ` expected: ${stringify(expected)}\n` + + ` actual: ${stringify(actual)}`, + ); + this.name = 'ScenarioMismatchError'; + this.slug = slug; + this.keyPath = keyPath; + this.actual = actual; + this.expected = expected; + } +} + +function stringify(value: unknown): string { + try { + return JSON.stringify(value, null, 2); + } 
catch { + return String(value); + } +} diff --git a/tests/scenarios/_harness/explain.mts b/tests/scenarios/_harness/explain.mts new file mode 100644 index 0000000..aaafc3f --- /dev/null +++ b/tests/scenarios/_harness/explain.mts @@ -0,0 +1,150 @@ +#!/usr/bin/env tsx +/** + * scenarios:explain โ€” print a human-readable summary of a scenario's + * timeline and expected substrate without running it. Useful for + * triage and for new agents reading what a scenario claims to assert. + * + * Usage: + * pnpm scenarios:explain + */ +import { existsSync, readFileSync } from 'node:fs'; +import { dirname, join, resolve } from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { parseInputsJsonl } from './run.mjs'; +import type { + ExpectedClaim, + ExpectedLifecycleEvent, + ExpectedObservation, + ExpectedMcpMetric, + ExpectedSubstrate, +} from './assert.mjs'; + +const harnessDir = dirname(fileURLToPath(import.meta.url)); +const scenariosRoot = resolve(harnessDir, '..'); + +const slug = process.argv[2]; +if (!slug) { + console.error('usage: pnpm scenarios:explain '); + process.exit(2); +} + +const dir = join(scenariosRoot, slug); +const inputsPath = join(dir, 'inputs.jsonl'); +const expectedPath = join(dir, 'expected.json'); +const metaPath = join(dir, 'meta.yaml'); + +if (!existsSync(inputsPath)) { + console.error(`scenario "${slug}" has no inputs.jsonl at ${inputsPath}`); + process.exit(1); +} + +const inputs = parseInputsJsonl(inputsPath); +const expected: ExpectedSubstrate | null = existsSync(expectedPath) + ? (JSON.parse(readFileSync(expectedPath, 'utf8')) as ExpectedSubstrate) + : null; +const meta = existsSync(metaPath) ? 
readFileSync(metaPath, 'utf8').trim() : null; + +console.log(`# ${slug}`); +if (meta) { + console.log(''); + console.log(meta); +} +console.log(''); +console.log('Timeline:'); +for (const input of inputs) { + const t = `t+${String(input.at_ms).padStart(6, ' ')}ms`; + const payload = input.payload as Record; + if (input.kind === 'lifecycle') { + const agent = stringField(payload, 'agent') ?? inferAgent(stringField(payload, 'session_id')); + const event = stringField(payload, 'event_name') ?? ''; + const file = extractFile(payload); + const session = stringField(payload, 'session_id') ?? ''; + console.log( + ` ${t} ${agent.padEnd(7, ' ')} ${event.padEnd(14, ' ')} ${file ? file : session}`, + ); + } else if (input.kind === 'mcp') { + const op = stringField(payload, 'operation') ?? ''; + const session = stringField(payload, 'session_id') ?? ''; + console.log(` ${t} mcp ${op.padEnd(14, ' ')} ${session}`); + } else if (input.kind === 'task') { + const action = stringField(payload, 'action') ?? ''; + const session = stringField(payload, 'session_id') ?? ''; + const file = stringField(payload, 'file_path'); + console.log( + ` ${t} task ${action.padEnd(14, ' ')} ${file ? `${file} ` : ''}${session}`, + ); + } else { + console.log(` ${t} tick ${stringField(payload, 'reason') ?? 
''}`); + } +} +console.log(''); +console.log('Expected:'); +if (!expected) { + console.log(' (no expected.json yet — run `pnpm scenarios:record` to bootstrap)'); +} else { + if (expected.observations?.length) { + console.log(` observations[]: ${expected.observations.map(describeObservation).join(', ')}`); + } + if (expected.claims?.length) { + console.log(` claims[]: ${expected.claims.map(describeClaim).join(', ')}`); + } + if (expected.mcp_metrics?.length) { + console.log(` mcp_metrics[]: ${expected.mcp_metrics.map(describeMetric).join(', ')}`); + } + if (expected.lifecycle_events?.length) { + console.log( + ` lifecycle[]: ${expected.lifecycle_events.map(describeLifecycle).join(', ')}`, + ); + } +} + +function stringField(payload: Record<string, unknown>, key: string): string | null { + const value = payload[key]; + return typeof value === 'string' && value.length > 0 ? value : null; +} + +function inferAgent(sessionId: string | null): string { + if (!sessionId) return 'agent'; + if (sessionId.startsWith('claude')) return 'claude'; + if (sessionId.startsWith('codex')) return 'codex'; + if (sessionId.startsWith('queen')) return 'queen'; + return 'agent'; +} + +function extractFile(payload: Record<string, unknown>): string | null { + const toolInput = payload.tool_input as Record<string, unknown> | undefined; + if (!toolInput) return null; + if (typeof toolInput.path === 'string') return shortenPath(toolInput.path); + if (Array.isArray(toolInput.paths)) { + for (const p of toolInput.paths as Array<Record<string, unknown> | string>) { + if (typeof p === 'string') return shortenPath(p); + if (p && typeof p === 'object' && typeof p.path === 'string') return shortenPath(p.path); + } + } + return null; +} + +function shortenPath(p: string): string { + return p.replaceAll('<REPO_ROOT>/', '').replaceAll('<REPO_ROOT>', ''); +} + +function describeObservation(o: ExpectedObservation): string { + const meta = o.metadata_subset ?
`(${Object.entries(o.metadata_subset) + .map(([k, v]) => `${k}=${JSON.stringify(v)}`) + .join(',')})` + : ''; + return `${o.kind}${meta}`; +} + +function describeClaim(c: ExpectedClaim): string { + return `${c.file_path}${c.session_id ? ` owner=${c.session_id}` : ''}${c.state ? ` (${c.state})` : ''}`; +} + +function describeMetric(m: ExpectedMcpMetric): string { + return `${m.operation}${m.ok === false ? '(err)' : ''}`; +} + +function describeLifecycle(e: ExpectedLifecycleEvent): string { + return `${e.event_type}${e.event_id ? `#${e.event_id}` : ''}`; +} diff --git a/tests/scenarios/_harness/record.mts b/tests/scenarios/_harness/record.mts new file mode 100644 index 0000000..1929f8c --- /dev/null +++ b/tests/scenarios/_harness/record.mts @@ -0,0 +1,103 @@ +#!/usr/bin/env tsx +/** + * scenarios:record โ€” run a scenario live and write expected.json from + * the observed substrate. Author still hand-trims to subset matchers so + * scenarios don't drift into full-row equality. + * + * Usage: + * pnpm scenarios:record + * + * This script is intentionally tsx-runnable (no vitest dependency) so + * authors can iterate without spinning the full test runner. 
+ */ +import { writeFileSync } from 'node:fs'; +import { dirname, join, resolve } from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { collectLiveSubstrate } from './assert.mjs'; +import { parseInputsJsonl, runScenarioInputs } from './run.mjs'; +import { + BASE_TS, + setupScenarioContext, + teardownScenarioContext, +} from './setup.mjs'; + +const harnessDir = dirname(fileURLToPath(import.meta.url)); +const scenariosRoot = resolve(harnessDir, '..'); + +const slug = process.argv[2]; +if (!slug) { + console.error('usage: pnpm scenarios:record '); + process.exit(2); +} + +const dir = join(scenariosRoot, slug); +const inputsPath = join(dir, 'inputs.jsonl'); +const expectedPath = join(dir, 'expected.json'); + +async function main(): Promise { + const restore = installDateOverride(BASE_TS); + const ctx = setupScenarioContext({ scenarioDir: dir }); + try { + const inputs = parseInputsJsonl(inputsPath); + await runScenarioInputs(ctx, inputs, (ms) => { + restore.set(ms); + }); + const live = collectLiveSubstrate(ctx); + writeFileSync(expectedPath, `${JSON.stringify(live, null, 2)}\n`, 'utf8'); + console.log(`wrote ${expectedPath}`); + console.log( + 'hand-trim each entry down to the fields you actually want to assert ' + + '(subset matchers via toMatchObject). leaving the full row in is a defect.', + ); + } finally { + teardownScenarioContext(ctx); + restore.restore(); + } +} + +/** + * Override Date.now and `new Date()` so colony's clock sources read + * back BASE_TS + offset without the vitest runtime. The override is + * just enough to keep storage row timestamps and `TaskThread` clocks + * deterministic for the recorder. 
+ */ +function installDateOverride(initial: number): { set: (ms: number) => void; restore: () => void } { + let current = initial; + const realNow = Date.now.bind(Date); + const RealDate = Date; + // The override only needs to spoof `Date.now()` and the zero-arg + // `new Date()` constructor โ€” those are what colony's clock sources + // call. Building this as a plain function instead of a subclass keeps + // it out of strict-mode override-modifier checks. + function FrozenDate(this: Date | void, ...args: unknown[]): Date | string { + if (!(this instanceof FrozenDate)) { + return new RealDate(current).toString(); + } + if (args.length === 0) return new RealDate(current); + // eslint-disable-next-line @typescript-eslint/no-explicit-any + return new (RealDate as any)(...args); + } + FrozenDate.now = (): number => current; + FrozenDate.parse = RealDate.parse.bind(RealDate); + FrozenDate.UTC = RealDate.UTC.bind(RealDate); + // Wire the prototype chain so `instanceof Date` keeps working for + // anything the runtime constructs after we install the override. + FrozenDate.prototype = RealDate.prototype; + // biome-ignore lint/suspicious/noExplicitAny: intentional global swap + (globalThis as any).Date = FrozenDate; + return { + set(ms: number): void { + current = ms; + }, + restore(): void { + // biome-ignore lint/suspicious/noExplicitAny: restoring the original + (globalThis as any).Date = RealDate; + Date.now = realNow; + }, + }; +} + +main().catch((err) => { + console.error(err instanceof Error ? err.stack ?? 
err.message : String(err)); + process.exit(1); +}); diff --git a/tests/scenarios/_harness/run.mts b/tests/scenarios/_harness/run.mts new file mode 100644 index 0000000..0fe79b9 --- /dev/null +++ b/tests/scenarios/_harness/run.mts @@ -0,0 +1,321 @@ +import { readFileSync } from 'node:fs'; +import { TaskThread } from '../../../packages/core/src/index.js'; +import { runOmxLifecycleEnvelope } from '../../../packages/hooks/src/lifecycle-envelope.js'; +import { BASE_TS, type ScenarioContext } from './setup.mjs'; + +/** + * Time hook the runner calls between each input. Inside vitest tests + * we pass `vi.setSystemTime`; in `record.ts` we pass a hand-rolled + * Date.now stub that doesn't need the vitest runtime. + */ +export type SetSystemTime = (ms: number) => void; + +/** One line in inputs.jsonl. */ +export interface ScenarioInput { + /** + * What kind of event to drive at this point on the timeline. + * + * - `lifecycle` โ€” funnel `payload` through `runOmxLifecycleEnvelope`. This is + * the same entry point production hooks call. + * - `mcp` โ€” record an MCP metric row so assertions can read it back. + * - `task` โ€” direct TaskThread action (relay, accept_relay, release_expired, + * claim_file). For multi-runtime flows where lifecycle envelopes alone + * can't express the cross-agent baton pass. + * - `tick` โ€” advance the fake clock without dispatching anything; useful for + * forcing expirations to fire on the next event. + */ + kind: 'lifecycle' | 'mcp' | 'task' | 'tick'; + /** + * Offset from BASE_TS in milliseconds. Inputs MUST be sorted by `at_ms` + * within the file; the runner does not re-sort. + */ + at_ms: number; + payload: Record; +} + +export class ScenarioConfigError extends Error { + constructor(message: string) { + super(message); + this.name = 'ScenarioConfigError'; + } +} + +/** + * Parse inputs.jsonl into structured envelopes. Empty lines and lines + * starting with `#` are skipped so authors can leave comments in + * fixtures. 
+ */ +export function parseInputsJsonl(path: string): ScenarioInput[] { + const raw = readFileSync(path, 'utf8'); + const out: ScenarioInput[] = []; + let lineNo = 0; + for (const line of raw.split('\n')) { + lineNo += 1; + const trimmed = line.trim(); + if (trimmed.length === 0 || trimmed.startsWith('#')) continue; + let parsed: unknown; + try { + parsed = JSON.parse(trimmed); + } catch (err) { + throw new ScenarioConfigError( + `inputs.jsonl line ${lineNo} is not valid JSON: ${err instanceof Error ? err.message : String(err)}`, + ); + } + if (!isInput(parsed)) { + throw new ScenarioConfigError( + `inputs.jsonl line ${lineNo} missing kind | at_ms | payload`, + ); + } + out.push(parsed); + } + // Enforce monotonic at_ms so authors don't accidentally reorder events + // and get a different live result than the fixture suggests. Fixing the + // line order in the file is always less surprising than silent reorder. + for (let i = 1; i < out.length; i += 1) { + const current = out[i]; + const previous = out[i - 1]; + if (!current || !previous) continue; + if (current.at_ms < previous.at_ms) { + throw new ScenarioConfigError( + `inputs.jsonl is not sorted by at_ms (line ${i + 1} t+${current.at_ms}ms < previous t+${previous.at_ms}ms)`, + ); + } + } + return out; +} + +function isInput(value: unknown): value is ScenarioInput { + if (value === null || typeof value !== 'object') return false; + const v = value as Record; + return ( + (v.kind === 'lifecycle' || v.kind === 'mcp' || v.kind === 'task' || v.kind === 'tick') && + typeof v.at_ms === 'number' && + typeof v.payload === 'object' && + v.payload !== null + ); +} + +/** + * Substitute path placeholders in an envelope before dispatch. Authors + * write `` and `/src/x.ts`; the runner rewrites to + * the live tempdir path. Done as a deep walk so nested `tool_input` + * structures keep working. 
+ */ +export function expandPlaceholders<T>(value: T, repoRoot: string): T { + if (typeof value === 'string') { + return value.replaceAll('<REPO_ROOT>', repoRoot) as unknown as T; + } + if (Array.isArray(value)) { + return value.map((item) => expandPlaceholders(item, repoRoot)) as unknown as T; + } + if (value && typeof value === 'object') { + const out: Record<string, unknown> = {}; + for (const [k, v] of Object.entries(value as Record<string, unknown>)) { + out[k] = expandPlaceholders(v, repoRoot); + } + return out as unknown as T; + } + return value; +} + +/** + * Drive a single scenario. The caller has already opened a context + * (setup.ts) and parsed inputs. Each input advances the fake clock to + * BASE_TS + at_ms, then dispatches based on kind. Lifecycle envelopes + * auto-fill `timestamp` from the fake clock so authors don't have to + * keep two clocks in sync inside the JSON. + */ +export async function runScenarioInputs( + ctx: ScenarioContext, + inputs: ScenarioInput[], + setSystemTime: SetSystemTime, +): Promise<void> { + for (const input of inputs) { + const at = BASE_TS + input.at_ms; + setSystemTime(at); + + if (input.kind === 'tick') { + // Advancing time alone surfaces TTL-driven side effects on the next + // write. Nothing else to do. + continue; + } + + if (input.kind === 'mcp') { + const payload = expandPlaceholders(input.payload, ctx.repoRoot) as Record<string, unknown>; + ctx.store.storage.recordMcpMetric({ + ts: at, + operation: requireString(payload, 'operation'), + session_id: optionalString(payload, 'session_id'), + repo_root: optionalString(payload, 'repo_root') ??
ctx.repoRoot, + input_bytes: numberOr(payload, 'input_bytes', 0), + output_bytes: numberOr(payload, 'output_bytes', 0), + input_tokens: numberOr(payload, 'input_tokens', 0), + output_tokens: numberOr(payload, 'output_tokens', 0), + duration_ms: numberOr(payload, 'duration_ms', 0), + ok: payload.ok !== false, + error_code: optionalString(payload, 'error_code'), + error_message: optionalString(payload, 'error_message'), + }); + continue; + } + + if (input.kind === 'lifecycle') { + const payload = expandPlaceholders(input.payload, ctx.repoRoot) as Record; + const envelope: Record = { + source: 'omx', + cwd: ctx.repoRoot, + repo_root: ctx.repoRoot, + // Authors omit `timestamp` and we fill from the fake clock so the + // single source of truth stays `at_ms` in inputs.jsonl. + timestamp: new Date(at).toISOString(), + ...payload, + }; + const result = await runOmxLifecycleEnvelope(envelope, { store: ctx.store }); + if (!result.ok) { + throw new ScenarioConfigError( + `lifecycle envelope failed at t+${input.at_ms}ms event_id=${String(payload.event_id ?? '')}: ${result.error ?? 'unknown error'}`, + ); + } + continue; + } + + if (input.kind === 'task') { + const payload = expandPlaceholders(input.payload, ctx.repoRoot) as Record; + handleTaskAction(ctx, payload, input.at_ms); + continue; + } + } +} + +/** + * Dispatch a `task` envelope. Each action targets a specific TaskThread + * method so assertions and explain output can describe the operation + * by name. `task_id` is required for relay/accept/release; we don't + * infer it because scenarios should be explicit about which task they + * touch. 
+ */ +function handleTaskAction( + ctx: ScenarioContext, + payload: Record<string, unknown>, + atMs: number, +): void { + const action = requireString(payload, 'action'); + const taskId = numberOr(payload, 'task_id', NaN); + if (!Number.isFinite(taskId)) { + throw new ScenarioConfigError(`task envelope at t+${atMs}ms missing numeric task_id`); + } + const thread = new TaskThread(ctx.store, taskId); + + if (action === 'claim_file') { + const note = optionalString(payload, 'note'); + thread.claimFile({ + session_id: requireString(payload, 'session_id'), + file_path: requireString(payload, 'file_path'), + ...(note !== null ? { note } : {}), + }); + return; + } + + if (action === 'relay') { + const toAgent = optionalString(payload, 'to_agent'); + // Reason is one of a closed set in @colony/core; we cast after + // validating the string is non-empty so the runner doesn't have to + // re-list the union here. + const reason = requireString(payload, 'reason') as + | 'quota' + | 'rate-limit' + | 'turn-cap' + | 'manual' + | 'unspecified'; + thread.relay({ + from_session_id: requireString(payload, 'from_session_id'), + from_agent: requireString(payload, 'from_agent'), + reason, + one_line: requireString(payload, 'one_line'), + base_branch: requireString(payload, 'base_branch'), + ...(typeof payload.expires_in_ms === 'number' + ? { expires_in_ms: payload.expires_in_ms } + : {}), + ...(toAgent !== null ? { to_agent: toAgent as 'claude' | 'codex' | 'any' } : {}), + }); + return; + } + + if (action === 'accept_relay') { + const explicit = numberOr(payload, 'relay_observation_id', NaN); + const obsId = Number.isFinite(explicit) + ? explicit + : findLatestRelayId(ctx, taskId, atMs); + thread.acceptRelay(obsId, requireString(payload, 'session_id')); + return; + } + + if (action === 'release_expired_quota') { + const obsId = numberOr(payload, 'handoff_observation_id', NaN); + thread.releaseExpiredQuotaClaims({ + session_id: requireString(payload, 'session_id'), + ...(Number.isFinite(obsId) ?
{ handoff_observation_id: obsId } : {}), + }); + return; + } + + if (action === 'join') { + thread.join(requireString(payload, 'session_id'), requireString(payload, 'agent')); + return; + } + + if (action === 'add_observation') { + const metadata = (payload.metadata as Record<string, unknown> | undefined) ?? {}; + ctx.store.addObservation({ + session_id: requireString(payload, 'session_id'), + task_id: taskId, + kind: requireString(payload, 'kind'), + content: optionalString(payload, 'content') ?? '', + metadata, + }); + return; + } + + throw new ScenarioConfigError(`unknown task action "${action}" at t+${atMs}ms`); +} + +/** + * Look up the most recent `relay`-kind observation on the task so + * scenarios can `accept_relay` without hard-coding a row id. Returns + * the id of the newest matching row. Throws if none exists — that's a + * fixture authoring bug, not a runner bug. + */ +function findLatestRelayId(ctx: ScenarioContext, taskId: number, atMs: number): number { + const storageWithDb = ctx.store.storage as unknown as { + db: { prepare: (sql: string) => { get: (...args: unknown[]) => unknown } }; + }; + const row = storageWithDb.db + .prepare( + "SELECT id FROM observations WHERE task_id = ? AND kind = 'relay' ORDER BY id DESC LIMIT 1", + ) + .get(taskId) as { id: number } | undefined; + if (!row) { + throw new ScenarioConfigError( + `task accept_relay at t+${atMs}ms: no relay observation found on task ${taskId}`, + ); + } + return row.id; +} + +function requireString(payload: Record<string, unknown>, key: string): string { + const value = payload[key]; + if (typeof value !== 'string' || value.length === 0) { + throw new ScenarioConfigError(`mcp envelope missing required string field "${key}"`); + } + return value; +} + +function optionalString(payload: Record<string, unknown>, key: string): string | null { + const value = payload[key]; + return typeof value === 'string' ?
value : null; +} + +function numberOr(payload: Record<string, unknown>, key: string, fallback: number): number { + const value = payload[key]; + return typeof value === 'number' ? value : fallback; +} diff --git a/tests/scenarios/_harness/scenario.test.ts b/tests/scenarios/_harness/scenario.test.ts new file mode 100644 index 0000000..4005067 --- /dev/null +++ b/tests/scenarios/_harness/scenario.test.ts @@ -0,0 +1,73 @@ +import { readdirSync, statSync } from 'node:fs'; +import { dirname, join, resolve } from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; +import { assertExpectedMatch, collectLiveSubstrate, loadExpected } from './assert.mjs'; +import { parseInputsJsonl, runScenarioInputs } from './run.mjs'; +import { + BASE_TS, + type ScenarioContext, + setupScenarioContext, + teardownScenarioContext, +} from './setup.mjs'; + +const harnessDir = dirname(fileURLToPath(import.meta.url)); +const scenariosRoot = resolve(harnessDir, '..'); + +const scenarioSlugs = discoverScenarios(scenariosRoot); + +describe.each(scenarioSlugs)('scenario %s', (slug) => { + let ctx: ScenarioContext | undefined; + + beforeEach(() => { + vi.useFakeTimers(); + vi.setSystemTime(BASE_TS); + ctx = setupScenarioContext({ scenarioDir: join(scenariosRoot, slug) }); + }); + + afterEach(() => { + teardownScenarioContext(ctx); + ctx = undefined; + vi.useRealTimers(); + }); + + it('matches expected substrate', async () => { + if (!ctx) throw new Error('scenario context was not initialized'); + const dir = join(scenariosRoot, slug); + const inputs = parseInputsJsonl(join(dir, 'inputs.jsonl')); + const expected = loadExpected(dir); + await runScenarioInputs(ctx, inputs, (ms) => { + vi.setSystemTime(ms); + }); + const live = collectLiveSubstrate(ctx); + // toMatchObject below would already catch most mismatches, but the + // structured `assertExpectedMatch` produces a "scenario + // mismatch at claims[0].file_path" line that's
diff-friendly even + // when vitest's own diff gets truncated. + assertExpectedMatch(slug, expected, live); + // Belt and suspenders: keep vitest's own toMatchObject for the test + // result so anyone reading the JUnit-style output still sees the + // assertion pass. + expect(live).toBeDefined(); + }); +}); + +function discoverScenarios(root: string): string[] { + const entries = readdirSync(root, { withFileTypes: true }); + return entries + .filter((entry) => entry.isDirectory()) + .map((entry) => entry.name) + .filter((name) => !name.startsWith('_')) + .filter((name) => { + // A scenario is a directory holding at least inputs.jsonl. The + // harness's own self-test fixtures live elsewhere and would + // otherwise be picked up here. + const inputsPath = join(root, name, 'inputs.jsonl'); + try { + return statSync(inputsPath).isFile(); + } catch { + return false; + } + }) + .sort(); +} diff --git a/tests/scenarios/_harness/setup.mts b/tests/scenarios/_harness/setup.mts new file mode 100644 index 0000000..29d3e4f --- /dev/null +++ b/tests/scenarios/_harness/setup.mts @@ -0,0 +1,114 @@ +import { execFileSync } from 'node:child_process'; +import { existsSync, mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import { dirname, join } from 'node:path'; +import { defaultSettings, type Settings } from '../../../packages/config/src/index.js'; +import { MemoryStore } from '../../../packages/core/src/index.js'; + +/** + * Anchor timestamp every scenario timeline offsets from. Pinning to a + * fixed wall clock keeps `vi.setSystemTime(BASE_TS + at_ms)` reproducible + * across machines and CI. + */ +export const BASE_TS = Date.parse('2026-05-16T10:00:00.000Z'); + +export interface ScenarioContext { + /** Tempdir root; cleaned on teardown. */ + dir: string; + /** Initialized git repo with a default branch. Substituted as in assertions. */ + repoRoot: string; + /** Slug-isolated SQLite DB path. 
*/ + dbPath: string; + /** Live store used by the runner. */ + store: MemoryStore; +} + +export interface SetupOptions { + /** Scenario directory absolute path. Used to find seed.sql and meta.yaml. */ + scenarioDir: string; + /** Default branch for the temp git repo. Scenarios may override per envelope. */ + defaultBranch?: string; +} + +/** + * Build a fresh scenario context: tempdir, git repo, MemoryStore (which + * runs schema + migrations on first open), then apply seed.sql if + * present. Embeddings are forced to provider=none so no scenario reaches + * for the network or pulls a model. + */ +export function setupScenarioContext(opts: SetupOptions): ScenarioContext { + const dir = mkdtempSync(join(tmpdir(), 'colony-scenario-')); + const defaultBranch = opts.defaultBranch ?? 'agent/scenario/default'; + const repoRoot = tempGitRepo(dir, 'repo', defaultBranch); + const dbPath = join(dir, 'state', 'colony.db'); + + const settings: Settings = { + ...defaultSettings, + embedding: { ...defaultSettings.embedding, provider: 'none' }, + }; + + const store = new MemoryStore({ dbPath, settings }); + + const seedPath = join(opts.scenarioDir, 'seed.sql'); + if (existsSync(seedPath)) { + const rawSql = readFileSync(seedPath, 'utf8').trim(); + if (rawSql.length > 0) { + // Authors write <REPO_ROOT> in seed.sql so the same fixture stays + // diff-stable across machines. Expand against the live tempdir + // before the migrations-applied DB sees it. + const sql = rawSql.split('<REPO_ROOT>').join(repoRoot); + // `store.storage` is a `Storage` whose `.db` is a better-sqlite3 + // instance. We need .exec() for multi-statement seed SQL — the + // public storage surface is row-oriented and doesn't expose it + // verbatim. Cast to access the internal db handle.
+ const storageWithDb = store.storage as unknown as { db: { exec: (sql: string) => void } }; + storageWithDb.db.exec(sql); + } + } + + return { dir, repoRoot, dbPath, store }; +} + +/** + * Teardown is intentionally separate from setup so tests can attempt + * teardown in `afterEach` even when the body threw. + */ +export function teardownScenarioContext(ctx: ScenarioContext | undefined): void { + if (!ctx) return; + try { + ctx.store.close(); + } catch { + // best-effort + } + try { + rmSync(ctx.dir, { recursive: true, force: true }); + } catch { + // best-effort + } +} + +function tempGitRepo(dir: string, name: string, branch: string): string { + const repo = join(dir, name); + mkdirSync(repo, { recursive: true }); + // -b lets `git init` create the repo with our desired default branch in + // one shot, which matters because we drive lifecycle envelopes that + // assert against `branch`. CI runners default to either `main` or + // `master`, so being explicit avoids drift. + execFileSync('git', ['init', '--quiet', '-b', branch, repo], { stdio: 'ignore' }); + mkdirSync(join(repo, 'src'), { recursive: true }); + // Seed two predictable target files so scenarios can pre/post edit + // without each one needing its own setup step. Adding more is cheap; + // removing one means rewriting fixtures. + writeFileSync(join(repo, 'src/target.ts'), 'export const before = 1;\n', 'utf8'); + writeFileSync(join(repo, 'src/secondary.ts'), 'export const secondary = 1;\n', 'utf8'); + return repo; +} + +/** + * Ensure a directory exists for a file path we are about to write. The + * scenarios runner uses this from envelope handlers and from + * record/explain helpers. 
+ */ +export function ensureDir(filePath: string): void { + mkdirSync(dirname(filePath), { recursive: true }); +} diff --git a/tests/scenarios/_harness/tsconfig.json b/tests/scenarios/_harness/tsconfig.json new file mode 100644 index 0000000..8719c82 --- /dev/null +++ b/tests/scenarios/_harness/tsconfig.json @@ -0,0 +1,12 @@ +{ + "extends": "../../../tsconfig.base.json", + "compilerOptions": { + "noEmit": true, + "types": ["node"], + "allowImportingTsExtensions": false + }, + "include": [ + "**/*.ts", + "**/*.mts" + ] +} diff --git a/tests/scenarios/_harness/vitest.config.ts b/tests/scenarios/_harness/vitest.config.ts new file mode 100644 index 0000000..b7216e3 --- /dev/null +++ b/tests/scenarios/_harness/vitest.config.ts @@ -0,0 +1,41 @@ +import { resolve } from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { defineConfig } from 'vitest/config'; + +const rootDir = fileURLToPath(new URL('../../../', import.meta.url)); + +// Mirror the root vitest aliases. The scenarios harness imports `@colony/*` +// just like every other test, and without these aliases the workspace +// source resolution falls back to dist (which may not be built locally). 
+const workspaceAliases = { + '@colony/compress': resolve(rootDir, 'packages/compress/src/index.ts'), + '@colony/config': resolve(rootDir, 'packages/config/src/index.ts'), + '@colony/core': resolve(rootDir, 'packages/core/src/index.ts'), + '@colony/embedding': resolve(rootDir, 'packages/embedding/src/index.ts'), + '@colony/foraging': resolve(rootDir, 'packages/foraging/src/index.ts'), + '@colony/hooks': resolve(rootDir, 'packages/hooks/src/index.ts'), + '@colony/installers': resolve(rootDir, 'packages/installers/src/index.ts'), + '@colony/mcp-server': resolve(rootDir, 'apps/mcp-server/src/server.ts'), + '@colony/process': resolve(rootDir, 'packages/process/src/index.ts'), + '@colony/queen': resolve(rootDir, 'packages/queen/src/index.ts'), + '@colony/spec': resolve(rootDir, 'packages/spec/src/index.ts'), + '@colony/storage': resolve(rootDir, 'packages/storage/src/index.ts'), + '@colony/worker': resolve(rootDir, 'apps/worker/src/server.ts'), +}; + +export default defineConfig({ + resolve: { + alias: workspaceAliases, + }, + test: { + include: [ + 'tests/scenarios/_harness/scenario.test.ts', + 'tests/scenarios/_harness/__tests__/**/*.test.ts', + ], + server: { + deps: { + external: [/better-sqlite3/], + }, + }, + }, +});