From c34753d174fd6f11cb577e24d5db84abcdd4c0f5 Mon Sep 17 00:00:00 2001 From: Peter Wielander Date: Fri, 22 May 2026 13:40:04 +0200 Subject: [PATCH 1/6] [tests] Re-enable describe.concurrent in e2e suite Flips packages/core/e2e/e2e.test.ts back to describe.concurrent. The suite was made sequential ("temporarily disabling concurrent tests to avoid flakiness") and never re-enabled. Each workbench now runs ~87 tests serially against its preview deploy, dominating CI wall-clock. Putting this up as a draft to see which (if any) tests are still flake-prone under concurrency, after recent fixes to the abort-fetch class of flakes. --- .changeset/concurrent-e2e.md | 4 ++++ packages/core/e2e/e2e.test.ts | 4 +--- 2 files changed, 5 insertions(+), 3 deletions(-) create mode 100644 .changeset/concurrent-e2e.md diff --git a/.changeset/concurrent-e2e.md b/.changeset/concurrent-e2e.md new file mode 100644 index 0000000000..3bb05ef22b --- /dev/null +++ b/.changeset/concurrent-e2e.md @@ -0,0 +1,4 @@ +--- +--- + +chore(tests): re-enable `describe.concurrent` in the e2e suite to cut CI wall-clock per workbench. diff --git a/packages/core/e2e/e2e.test.ts b/packages/core/e2e/e2e.test.ts index 5f00f28ae1..cc478b637b 100644 --- a/packages/core/e2e/e2e.test.ts +++ b/packages/core/e2e/e2e.test.ts @@ -182,9 +182,7 @@ async function startWorkflowViaHttp( return run; } -// NOTE: Temporarily disabling concurrent tests to avoid flakiness. -// TODO: Re-enable concurrent tests after conf when we have more time to investigate. -describe('e2e', () => { +describe.concurrent('e2e', () => { // Configure the World for the test runner process so that start() and // run.returnValue can communicate with the same backend as the workbench app. beforeAll(async () => { From a11577405681a4b5865c5610dce84ec397e1edb1 Mon Sep 17 00:00:00 2001 From: Peter Wielander Date: Fri, 22 May 2026 15:10:04 +0200 Subject: [PATCH 2/6] [tests] Isolate diagnostic tracking per concurrent task MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The e2e suite tracked failed-run diagnostics via two module-level mutables (`trackedRuns` and `currentTestName`) that every beforeEach reset. Under describe.concurrent that races: when a test times out, the displayed diagnostic typically belongs to whichever sibling task most recently called setupRunTracking(). That's why PR #2083's first CI run showed an unrelated `errorStepCrossFile` run as the "diagnostic" for the FatalError catchability test that actually timed out — leaving us blind to the real failure cause. Switch to a `Map` keyed off `getCurrentTest().id`, with the entry dropped via onTestFinished to keep the map bounded. Also unconditionally log `[trackRun] ` so we can still correlate runs from stdout even when diagnostics fetching itself stalls past the test's deadline. --- .changeset/concurrent-e2e.md | 2 +- packages/core/e2e/utils.ts | 74 ++++++++++++++++++++++++++++-------- 2 files changed, 60 insertions(+), 16 deletions(-) diff --git a/.changeset/concurrent-e2e.md b/.changeset/concurrent-e2e.md index 3bb05ef22b..f050cf9eed 100644 --- a/.changeset/concurrent-e2e.md +++ b/.changeset/concurrent-e2e.md @@ -1,4 +1,4 @@ --- --- -chore(tests): re-enable `describe.concurrent` in the e2e suite to cut CI wall-clock per workbench. +chore(tests): re-enable `describe.concurrent` in the e2e suite to cut CI wall-clock per workbench, and isolate diagnostic tracking state per-task so concurrent tests no longer clobber each other's `trackedRuns` (each failing test now surfaces its own run's diagnostics, and every tracked run is logged with `[trackRun] ` for stdout correlation even when diagnostics fetching fails). diff --git a/packages/core/e2e/utils.ts b/packages/core/e2e/utils.ts index 509775fbd3..d84e96eff4 100644 --- a/packages/core/e2e/utils.ts +++ b/packages/core/e2e/utils.ts @@ -4,7 +4,8 @@ import path, { dirname } from 'node:path'; import { setTimeout as sleep } from 'node:timers/promises'; import { fileURLToPath } from 'node:url'; import { createVercelWorld } from '@workflow/world-vercel'; -import { onTestFailed } from 'vitest'; +import { onTestFailed, onTestFinished } from 'vitest'; +import { getCurrentTest } from 'vitest/suite'; import { getTrustedSourcesHeaders } from '../../../scripts/trusted-sources-headers.mjs'; import { parseEnvironmentFlag } from '../../next/src/environment-flag.js'; import type { Run } from '../src/runtime'; @@ -460,8 +461,28 @@ interface TrackedRun { workflowFn?: string; } -// Per-test tracked runs — reset between tests via setupRunTracking() -let trackedRuns: TrackedRun[] = []; +interface PerTaskState { + testName: string; + trackedRuns: TrackedRun[]; +} + +// Keyed by vitest task id so concurrent tests get isolated state. +// Stored on a module-level Map (rather than a `let` slot) because +// `describe.concurrent` interleaves beforeEach/test/onTestFailed across +// tasks — a shared `let` gets clobbered by whichever task last entered +// beforeEach, surfacing the wrong run's diagnostics on the failing test. +const perTaskState = new Map(); + +function currentTaskState(): PerTaskState | undefined { + const task = getCurrentTest(); + if (!task) return undefined; + let state = perTaskState.get(task.id); + if (!state) { + state = { testName: task.name, trackedRuns: [] }; + perTaskState.set(task.id, state); + } + return state; +} // Global list of run IDs collected for metadata (observability links) const globalCollectedRunIds: { @@ -492,17 +513,24 @@ export function trackRun( workflowFn?: string; } ): Run { - const testName = options?.testName ?? currentTestName; - trackedRuns.push({ - run, - workflowFile: options?.workflowFile, - workflowFn: options?.workflowFn, - }); + const state = currentTaskState(); + const testName = options?.testName ?? state?.testName ?? 'unknown'; + if (state) { + state.trackedRuns.push({ + run, + workflowFile: options?.workflowFile, + workflowFn: options?.workflowFn, + }); + } globalCollectedRunIds.push({ testName, runId: run.runId, timestamp: new Date().toISOString(), }); + // Unconditionally log run-id <-> test correlation so that if diagnostics + // fail to fetch (or the test hangs past the timeout), we can still trace + // back from CI stdout which workflow run belongs to which test. + console.log(`[trackRun] ${testName} → ${run.runId}`); return run; } @@ -661,13 +689,27 @@ function emitGitHubAnnotation( * beforeEach((ctx) => { setupRunTracking(ctx.task.name); }); */ export function setupRunTracking(testName: string) { - currentTestName = testName; - trackedRuns = []; + const task = getCurrentTest(); + if (!task) { + // Outside of a test context — nothing to wire up. + return; + } + const state: PerTaskState = { testName, trackedRuns: [] }; + perTaskState.set(task.id, state); + onTestFailed( async (result) => { const errorMessage = result.errors?.[0]?.message || 'Test failed'; - for (const tracked of trackedRuns) { + if (state.trackedRuns.length === 0) { + // Make the absence of tracked runs explicit: helps distinguish "test + // failed before start()" from "diagnostics infrastructure is broken". + console.error( + `[diagnostics] ${testName}: no tracked runs for this test` + ); + } + + for (const tracked of state.trackedRuns) { try { const diagnostics = await getRunDiagnostics(tracked); console.error(diagnostics); @@ -681,10 +723,12 @@ export function setupRunTracking(testName: string) { }, 30_000 // Allow 30s for diagnostics fetching (default hookTimeout is 10s) ); -} -// Current test name for auto-tracking -let currentTestName = 'unknown'; + // Drop the per-task slot after the test settles to keep the Map bounded. + onTestFinished(() => { + perTaskState.delete(task.id); + }); +} /** * Write diagnostics sidecar file with per-test run info for the aggregation script. From 2c0a47dc62ca11f387c21669ed077708fe432054 Mon Sep 17 00:00:00 2001 From: Peter Wielander Date: Fri, 22 May 2026 15:22:21 +0200 Subject: [PATCH 3/6] [world-local] Annotate readJSON SyntaxError with file path + content snippet MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #2083's concurrent-mode e2e run surfaced a bare `SyntaxError: Unexpected end of JSON input` inside Run.#pollReturnValue with no information about which file ended up empty. All write paths into the runs/events/hooks JSON files go through `writeJSON`'s atomic tmp+rename, so an empty file shouldn't be possible on POSIX — but the diagnostic is generic enough that we can't tell which entity is involved, whether the file is actually zero bytes vs. truncated, or how stale it is. Wrap `readJSON` so an empty file or any `SyntaxError` becomes `readJSON: parsing (size=, content=)`. No behavior change for callers — a thrown error is still thrown — but the next time this fires we'll know exactly what to chase. --- .changeset/readjson-diagnostics.md | 5 +++++ packages/world-local/src/fs.ts | 24 ++++++++++++++++++++++++ 2 files changed, 29 insertions(+) create mode 100644 .changeset/readjson-diagnostics.md diff --git a/.changeset/readjson-diagnostics.md b/.changeset/readjson-diagnostics.md new file mode 100644 index 0000000000..a608715905 --- /dev/null +++ b/.changeset/readjson-diagnostics.md @@ -0,0 +1,5 @@ +--- +'@workflow/world-local': patch +--- + +`readJSON` now surfaces the file path, on-disk size, and a snippet of the (possibly empty) content when it encounters a `SyntaxError`. Previously a corrupted or zero-byte file produced a bare `Unexpected end of JSON input` with no indication of which entity (run / event / hook / etc.) was affected, making concurrent-read races extremely hard to diagnose from CI logs. diff --git a/packages/world-local/src/fs.ts b/packages/world-local/src/fs.ts index 5cccf744a0..803474a162 100644 --- a/packages/world-local/src/fs.ts +++ b/packages/world-local/src/fs.ts @@ -359,9 +359,33 @@ export async function readJSON( ): Promise { try { const content = await fs.readFile(filePath, 'utf-8'); + if (content.length === 0) { + // An empty file here means a write path produced zero bytes without + // the atomic tmp+rename guard, or that a reader caught the file + // briefly empty during creation. Surface enough state to pin it down + // from CI logs. + const stat = await fs.stat(filePath).catch(() => null); + throw new Error( + `readJSON: empty file at ${filePath} (size=${stat?.size ?? '?'}, mtime=${stat?.mtimeMs ?? '?'})` + ); + } return decoder.parse(JSON.parse(content, jsonReviver)); } catch (error) { if ((error as any).code === 'ENOENT') return null; + if (error instanceof SyntaxError) { + // Attach the file path + size so the SyntaxError isn't anonymous + // when it bubbles up to a test. Without this we can't tell which + // entity (run / event / hook / etc.) the corruption was on. + const stat = await fs.stat(filePath).catch(() => null); + const content = await fs.readFile(filePath, 'utf-8').catch(() => null); + throw new Error( + `readJSON: ${error.message} parsing ${filePath} (size=${stat?.size ?? '?'}, content=${ + content === null + ? '' + : JSON.stringify(content.slice(0, 200)) + })` + ); + } throw error; } } From 2fce0bbe8ec2a1359dec54a31c37c03ca9416808 Mon Sep 17 00:00:00 2001 From: Peter Wielander Date: Fri, 22 May 2026 15:34:00 +0200 Subject: [PATCH 4/6] [world-local] Make writeExclusive atomic for content, preserve SyntaxError type in readJSON MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous writeExclusive used `fs.writeFile(path, data, { flag: 'wx' })`, which is atomic for the file *creation* (O_CREAT|O_EXCL) but NOT for the subsequent data write. A concurrent reader could catch the file with zero bytes (or just `{`) before the writer's content reached the page cache. This race had been silently papered over by `readHookTokenClaim`, which swallows any SyntaxError as "no claim exists" — which is wrong: a partial write means a claim *does* exist, just not yet flushed, and treating it as absent lets duplicate hook tokens through. PR #2083's concurrent e2e run also surfaced the same race on runs/.json creation, producing size=0 byte run files in CI logs. Fix: write content to a temp file first, then `link(2)` it atomically into place. POSIX link is atomic and fails with EEXIST if the target already exists, preserving the exclusive-claim semantics. Readers now either see ENOENT or the fully-populated inode — never a half-written file. Also preserve the SyntaxError type when annotating readJSON failures with file path / size / content snippet, so callers like readHookTokenClaim that intentionally swallow `error instanceof SyntaxError` continue to work (the previous diagnostic threw a plain Error and broke them). --- .changeset/readjson-diagnostics.md | 2 +- packages/world-local/src/fs.ts | 70 ++++++++++++++++++------------ 2 files changed, 44 insertions(+), 28 deletions(-) diff --git a/.changeset/readjson-diagnostics.md b/.changeset/readjson-diagnostics.md index a608715905..4deaccf760 100644 --- a/.changeset/readjson-diagnostics.md +++ b/.changeset/readjson-diagnostics.md @@ -2,4 +2,4 @@ '@workflow/world-local': patch --- -`readJSON` now surfaces the file path, on-disk size, and a snippet of the (possibly empty) content when it encounters a `SyntaxError`. Previously a corrupted or zero-byte file produced a bare `Unexpected end of JSON input` with no indication of which entity (run / event / hook / etc.) was affected, making concurrent-read races extremely hard to diagnose from CI logs. +Fix a concurrent-read race in `writeExclusive`. `fs.writeFile(path, data, { flag: 'wx' })` is atomic for file *creation* (`O_CREAT | O_EXCL`) but the subsequent data write is not, so a concurrent reader could observe a half-written file (e.g. just `{`) before the writer's content was flushed. Callers that combined exclusive claim semantics with a follow-up read of the payload — notably the hook-token claim path and run-creation — would intermittently see `SyntaxError: Unexpected end of JSON input` (or worse, treat a partial file as an empty claim, masking conflicts). Switch to a two-step pattern: write the full content to a temp file first, then `link(2)` it into place. POSIX `link` is atomic and fails with `EEXIST` if the target already exists, so readers either see `ENOENT` or the fully-populated inode. `readJSON` also annotates any `SyntaxError` with the file path, on-disk size, and a snippet of the offending content, making future cases of this class actionable from CI logs (the error type is preserved so callers that intentionally swallow `SyntaxError` continue to work). diff --git a/packages/world-local/src/fs.ts b/packages/world-local/src/fs.ts index 803474a162..a86ce4718f 100644 --- a/packages/world-local/src/fs.ts +++ b/packages/world-local/src/fs.ts @@ -357,34 +357,26 @@ export async function readJSON( filePath: string, decoder: z.ZodType ): Promise { + let content: string; try { - const content = await fs.readFile(filePath, 'utf-8'); - if (content.length === 0) { - // An empty file here means a write path produced zero bytes without - // the atomic tmp+rename guard, or that a reader caught the file - // briefly empty during creation. Surface enough state to pin it down - // from CI logs. - const stat = await fs.stat(filePath).catch(() => null); - throw new Error( - `readJSON: empty file at ${filePath} (size=${stat?.size ?? '?'}, mtime=${stat?.mtimeMs ?? '?'})` - ); - } - return decoder.parse(JSON.parse(content, jsonReviver)); + content = await fs.readFile(filePath, 'utf-8'); } catch (error) { if ((error as any).code === 'ENOENT') return null; + throw error; + } + try { + return decoder.parse(JSON.parse(content, jsonReviver)); + } catch (error) { if (error instanceof SyntaxError) { - // Attach the file path + size so the SyntaxError isn't anonymous - // when it bubbles up to a test. Without this we can't tell which - // entity (run / event / hook / etc.) the corruption was on. + // Annotate the SyntaxError with the file path, size, and a snippet of + // the (possibly empty / partial) content. Some writers — notably + // `writeExclusive` — are atomic for file *creation* but not for the + // data write that follows, so a concurrent reader can briefly observe + // a half-written file. Preserve the SyntaxError type so callers like + // `readHookTokenClaim` that intentionally swallow it via + // `error instanceof SyntaxError` continue to work. const stat = await fs.stat(filePath).catch(() => null); - const content = await fs.readFile(filePath, 'utf-8').catch(() => null); - throw new Error( - `readJSON: ${error.message} parsing ${filePath} (size=${stat?.size ?? '?'}, content=${ - content === null - ? '' - : JSON.stringify(content.slice(0, 200)) - })` - ); + error.message = `${error.message} (parsing ${filePath}, size=${stat?.size ?? '?'}, content=${JSON.stringify(content.slice(0, 200))})`; } throw error; } @@ -404,23 +396,47 @@ export async function deleteJSON(filePath: string): Promise { } /** - * Atomically create a file using O_CREAT | O_EXCL flags. - * Returns true if the file was created, false if it already exists. - * This is atomic at the OS level, safe for concurrent access. + * Atomically create a file containing `data`, failing if the target already + * exists. Returns true on successful claim, false if the file is already + * present. + * + * The naive `fs.writeFile(path, data, { flag: 'wx' })` form is atomic for the + * file *creation* (`O_CREAT | O_EXCL`) but the subsequent content write is + * not — a concurrent reader can observe the file with partial bytes (e.g. + * just `{`) before the writer's content has been flushed. Callers that + * combine `writeExclusive` for claim-style exclusion with a follow-up read + * of the claim payload would intermittently see a `SyntaxError` instead of + * the claim data. + * + * Use a two-step atomic pattern instead: write the full content to a temp + * file first, then `link(2)` it into place. POSIX guarantees `link` is + * atomic and fails with `EEXIST` if the target already exists. The reader + * either sees nothing (ENOENT) or the fully-populated linked inode — never + * a half-written file. */ export async function writeExclusive( filePath: string, data: string ): Promise { await ensureDir(path.dirname(filePath)); + const tempPath = `${filePath}.tmp.${ulid()}`; + let tempCreated = false; try { - await fs.writeFile(filePath, data, { flag: 'wx' }); + await fs.writeFile(tempPath, data); + tempCreated = true; + await withWindowsRetry(() => fs.link(tempPath, filePath)); return true; } catch (error: any) { if (error.code === 'EEXIST') { return false; } throw error; + } finally { + if (tempCreated) { + // Best-effort cleanup of the temp inode regardless of whether the + // link succeeded — leaving it behind would leak files in the data dir. + await withWindowsRetry(() => fs.unlink(tempPath), 3).catch(() => {}); + } } } From c7060b131f9ae1a17401c7a90c8efddda81724d5 Mon Sep 17 00:00:00 2001 From: Peter Wielander Date: Fri, 22 May 2026 15:43:51 +0200 Subject: [PATCH 5/6] [tests] Cap e2e concurrency at 3 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Vitest's default `maxConcurrency` of 5 saturates the workbench preview deploys and the in-process world runtime under `describe.concurrent`, pushing load-heavy tests (fibonacci tree spawn, multi-step abort sequencing, AbortSignal.any composition) into their per-test deadlines on PR #2083's run 4 even though the same tests pass cleanly when run sequentially. Drop to 3 in the root vitest config (only `test:e2e` and `bench` consume it — per-package configs are untouched). The Vercel-deploy matrix went from ~14-17min sequential → ~5-6min at concurrency 5; trading some of that headroom for stability is the right call now that the isolation/correctness fixes have landed. --- vitest.config.ts | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/vitest.config.ts b/vitest.config.ts index 7d49659dbd..1ca89f864a 100644 --- a/vitest.config.ts +++ b/vitest.config.ts @@ -3,6 +3,14 @@ import { defineConfig } from 'vitest/config'; export default defineConfig({ test: { testTimeout: 60_000, + // Cap parallel test execution at 3 within `describe.concurrent` blocks. + // The e2e suite is the only consumer of this config (only `test:e2e` + // and `bench` use the root vitest setup); the default of 5 saturates + // the workbench preview deploy / local runtime enough that load-heavy + // tests (fibonacci tree spawn, multi-step abort sequencing) bump into + // their per-test timeouts even though the same tests pass comfortably + // when run sequentially. + maxConcurrency: 3, }, benchmark: { include: ['**/*.bench.ts'], From 6e264eb7364ea20e10992069ff2c0974b7a5459f Mon Sep 17 00:00:00 2001 From: Peter Wielander Date: Fri, 22 May 2026 15:54:41 +0200 Subject: [PATCH 6/6] [tests] Revert e2e maxConcurrency cap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Dropping vitest's default concurrency cap from 5 to 3 (commit c7060b131f) didn't reduce the remaining e2e flake count — runs 3/4 at default concurrency=5 and run 5 at concurrency=3 all landed on exactly 15 failures. The remaining failures are external-network and source-map class issues (httpbin/postman-echo abort tests, nuxt/nextjs-webpack sourcemap assertions, fibonacci tree spawn timeouts) that lower test parallelism doesn't address. Keeping the speedup is more valuable than the small load reduction. --- vitest.config.ts | 8 -------- 1 file changed, 8 deletions(-) diff --git a/vitest.config.ts b/vitest.config.ts index 1ca89f864a..7d49659dbd 100644 --- a/vitest.config.ts +++ b/vitest.config.ts @@ -3,14 +3,6 @@ import { defineConfig } from 'vitest/config'; export default defineConfig({ test: { testTimeout: 60_000, - // Cap parallel test execution at 3 within `describe.concurrent` blocks. - // The e2e suite is the only consumer of this config (only `test:e2e` - // and `bench` use the root vitest setup); the default of 5 saturates - // the workbench preview deploy / local runtime enough that load-heavy - // tests (fibonacci tree spawn, multi-step abort sequencing) bump into - // their per-test timeouts even though the same tests pass comfortably - // when run sequentially. - maxConcurrency: 3, }, benchmark: { include: ['**/*.bench.ts'],