From c34753d174fd6f11cb577e24d5db84abcdd4c0f5 Mon Sep 17 00:00:00 2001
From: Peter Wielander <peter.wielander@vercel.com>
Date: Fri, 22 May 2026 13:40:04 +0200
Subject: [PATCH 1/6] [tests] Re-enable describe.concurrent in e2e suite

Flips packages/core/e2e/e2e.test.ts back to describe.concurrent. The
suite was made sequential ("temporarily disabling concurrent tests to
avoid flakiness") and never re-enabled. Each workbench now runs ~87
tests serially against its preview deploy, dominating CI wall-clock.

Putting this up as a draft to see which (if any) tests are still
flake-prone under concurrency, after recent fixes to the abort-fetch
class of flakes.
---
 .changeset/concurrent-e2e.md  | 4 ++++
 packages/core/e2e/e2e.test.ts | 4 +---
 2 files changed, 5 insertions(+), 3 deletions(-)
 create mode 100644 .changeset/concurrent-e2e.md
diff --git a/.changeset/concurrent-e2e.md b/.changeset/concurrent-e2e.md
new file mode 100644
index 0000000000..3bb05ef22b
--- /dev/null
+++ b/.changeset/concurrent-e2e.md
@@ -0,0 +1,4 @@
+---
+---
+
+chore(tests): re-enable `describe.concurrent` in the e2e suite to cut CI wall-clock per workbench.
diff --git a/packages/core/e2e/e2e.test.ts b/packages/core/e2e/e2e.test.ts
index 5f00f28ae1..cc478b637b 100644
--- a/packages/core/e2e/e2e.test.ts
+++ b/packages/core/e2e/e2e.test.ts
@@ -182,9 +182,7 @@ async function startWorkflowViaHttp(
   return run;
 }
 
-// NOTE: Temporarily disabling concurrent tests to avoid flakiness.
-// TODO: Re-enable concurrent tests after conf when we have more time to investigate.
-describe('e2e', () => {
+describe.concurrent('e2e', () => {
   // Configure the World for the test runner process so that start() and
   // run.returnValue can communicate with the same backend as the workbench app.
   beforeAll(async () => {

From a11577405681a4b5865c5610dce84ec397e1edb1 Mon Sep 17 00:00:00 2001
From: Peter Wielander <peter.wielander@vercel.com>
Date: Fri, 22 May 2026 15:10:04 +0200
Subject: [PATCH 2/6] [tests] Isolate diagnostic tracking per concurrent task
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The e2e suite tracked failed-run diagnostics via two module-level
mutables (`trackedRuns` and `currentTestName`) that every beforeEach
reset. Under describe.concurrent that races: when a test times out,
the displayed diagnostic typically belongs to whichever sibling task
most recently called setupRunTracking(). That's why PR #2083's first
CI run showed an unrelated `errorStepCrossFile` run as the
"diagnostic" for the FatalError catchability test that actually
timed out — leaving us blind to the real failure cause.

Switch to a `Map<taskId, PerTaskState>` keyed off
`getCurrentTest().id`, with the entry dropped via onTestFinished to
keep the map bounded. Also unconditionally log
`[trackRun] <test> → <runId>` so we can still correlate runs from
stdout even when diagnostics fetching itself stalls past the test's
deadline.
---
 .changeset/concurrent-e2e.md |  2 +-
 packages/core/e2e/utils.ts   | 74 ++++++++++++++++++++++++++++--------
 2 files changed, 60 insertions(+), 16 deletions(-)

diff --git a/.changeset/concurrent-e2e.md b/.changeset/concurrent-e2e.md
index 3bb05ef22b..f050cf9eed 100644
--- a/.changeset/concurrent-e2e.md
+++ b/.changeset/concurrent-e2e.md
@@ -1,4 +1,4 @@
 ---
 ---
 
-chore(tests): re-enable `describe.concurrent` in the e2e suite to cut CI wall-clock per workbench.
+chore(tests): re-enable `describe.concurrent` in the e2e suite to cut CI wall-clock per workbench, and isolate diagnostic tracking state per-task so concurrent tests no longer clobber each other's `trackedRuns` (each failing test now surfaces its own run's diagnostics, and every tracked run is logged with `[trackRun] <test> → <runId>` for stdout correlation even when diagnostics fetching fails).
diff --git a/packages/core/e2e/utils.ts b/packages/core/e2e/utils.ts
index 509775fbd3..d84e96eff4 100644
--- a/packages/core/e2e/utils.ts
+++ b/packages/core/e2e/utils.ts
@@ -4,7 +4,8 @@ import path, { dirname } from 'node:path';
 import { setTimeout as sleep } from 'node:timers/promises';
 import { fileURLToPath } from 'node:url';
 import { createVercelWorld } from '@workflow/world-vercel';
-import { onTestFailed } from 'vitest';
+import { onTestFailed, onTestFinished } from 'vitest';
+import { getCurrentTest } from 'vitest/suite';
 import { getTrustedSourcesHeaders } from '../../../scripts/trusted-sources-headers.mjs';
 import { parseEnvironmentFlag } from '../../next/src/environment-flag.js';
 import type { Run } from '../src/runtime';
@@ -460,8 +461,28 @@ interface TrackedRun {
   workflowFn?: string;
 }
 
-// Per-test tracked runs — reset between tests via setupRunTracking()
-let trackedRuns: TrackedRun[] = [];
+interface PerTaskState {
+  testName: string;
+  trackedRuns: TrackedRun[];
+}
+
+// Keyed by vitest task id so concurrent tests get isolated state.
+// Stored on a module-level Map (rather than a `let` slot) because
+// `describe.concurrent` interleaves beforeEach/test/onTestFailed across
+// tasks — a shared `let` gets clobbered by whichever task last entered
+// beforeEach, surfacing the wrong run's diagnostics on the failing test.
+const perTaskState = new Map<string, PerTaskState>();
+
+function currentTaskState(): PerTaskState | undefined {
+  const task = getCurrentTest();
+  if (!task) return undefined;
+  let state = perTaskState.get(task.id);
+  if (!state) {
+    state = { testName: task.name, trackedRuns: [] };
+    perTaskState.set(task.id, state);
+  }
+  return state;
+}
 
 // Global list of run IDs collected for metadata (observability links)
 const globalCollectedRunIds: {
@@ -492,17 +513,24 @@ export function trackRun<T>(
     workflowFn?: string;
   }
 ): Run<T> {
-  const testName = options?.testName ?? currentTestName;
-  trackedRuns.push({
-    run,
-    workflowFile: options?.workflowFile,
-    workflowFn: options?.workflowFn,
-  });
+  const state = currentTaskState();
+  const testName = options?.testName ?? state?.testName ?? 'unknown';
+  if (state) {
+    state.trackedRuns.push({
+      run,
+      workflowFile: options?.workflowFile,
+      workflowFn: options?.workflowFn,
+    });
+  }
   globalCollectedRunIds.push({
     testName,
     runId: run.runId,
     timestamp: new Date().toISOString(),
   });
+  // Unconditionally log run-id <-> test correlation so that if diagnostics
+  // fail to fetch (or the test hangs past the timeout), we can still trace
+  // back from CI stdout which workflow run belongs to which test.
+  console.log(`[trackRun] ${testName} → ${run.runId}`);
   return run;
 }
 
@@ -661,13 +689,27 @@ function emitGitHubAnnotation(
  *   beforeEach((ctx) => { setupRunTracking(ctx.task.name); });
  */
 export function setupRunTracking(testName: string) {
-  currentTestName = testName;
-  trackedRuns = [];
+  const task = getCurrentTest();
+  if (!task) {
+    // Outside of a test context — nothing to wire up.
+    return;
+  }
+  const state: PerTaskState = { testName, trackedRuns: [] };
+  perTaskState.set(task.id, state);
+
   onTestFailed(
     async (result) => {
       const errorMessage = result.errors?.[0]?.message || 'Test failed';
 
-      for (const tracked of trackedRuns) {
+      if (state.trackedRuns.length === 0) {
+        // Make the absence of tracked runs explicit: helps distinguish "test
+        // failed before start()" from "diagnostics infrastructure is broken".
+        console.error(
+          `[diagnostics] ${testName}: no tracked runs for this test`
+        );
+      }
+
+      for (const tracked of state.trackedRuns) {
         try {
           const diagnostics = await getRunDiagnostics(tracked);
           console.error(diagnostics);
@@ -681,10 +723,12 @@ export function setupRunTracking(testName: string) {
     },
     30_000 // Allow 30s for diagnostics fetching (default hookTimeout is 10s)
   );
-}
 
-// Current test name for auto-tracking
-let currentTestName = 'unknown';
+  // Drop the per-task slot after the test settles to keep the Map bounded.
+  onTestFinished(() => {
+    perTaskState.delete(task.id);
+  });
+}
 
 /**
  * Write diagnostics sidecar file with per-test run info for the aggregation script.

From 2c0a47dc62ca11f387c21669ed077708fe432054 Mon Sep 17 00:00:00 2001
From: Peter Wielander <peter.wielander@vercel.com>
Date: Fri, 22 May 2026 15:22:21 +0200
Subject: [PATCH 3/6] [world-local] Annotate readJSON SyntaxError with file
 path + content snippet
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PR #2083's concurrent-mode e2e run surfaced a bare
`SyntaxError: Unexpected end of JSON input` inside Run.#pollReturnValue
with no information about which file ended up empty. All write paths
into the runs/events/hooks JSON files go through `writeJSON`'s
atomic tmp+rename, so an empty file shouldn't be possible on POSIX
— but the diagnostic is generic enough that we can't tell which
entity is involved, whether the file is actually zero bytes vs.
truncated, or how stale it is.

Wrap `readJSON` so an empty file or any `SyntaxError` becomes
`readJSON: <message> parsing <path> (size=<bytes>, content=<first 200 chars>)`.
No behavior change for callers — a thrown error is still thrown —
but the next time this fires we'll know exactly what to chase.
---
 .changeset/readjson-diagnostics.md |  5 +++++
 packages/world-local/src/fs.ts     | 24 ++++++++++++++++++++++++
 2 files changed, 29 insertions(+)
 create mode 100644 .changeset/readjson-diagnostics.md

diff --git a/.changeset/readjson-diagnostics.md b/.changeset/readjson-diagnostics.md
new file mode 100644
index 0000000000..a608715905
--- /dev/null
+++ b/.changeset/readjson-diagnostics.md
@@ -0,0 +1,5 @@
+---
+'@workflow/world-local': patch
+---
+
+`readJSON` now surfaces the file path, on-disk size, and a snippet of the (possibly empty) content when it encounters a `SyntaxError`. Previously a corrupted or zero-byte file produced a bare `Unexpected end of JSON input` with no indication of which entity (run / event / hook / etc.) was affected, making concurrent-read races extremely hard to diagnose from CI logs.
diff --git a/packages/world-local/src/fs.ts b/packages/world-local/src/fs.ts
index 5cccf744a0..803474a162 100644
--- a/packages/world-local/src/fs.ts
+++ b/packages/world-local/src/fs.ts
@@ -359,9 +359,33 @@ export async function readJSON<T>(
 ): Promise<T | null> {
   try {
     const content = await fs.readFile(filePath, 'utf-8');
+    if (content.length === 0) {
+      // An empty file here means a write path produced zero bytes without
+      // the atomic tmp+rename guard, or that a reader caught the file
+      // briefly empty during creation. Surface enough state to pin it down
+      // from CI logs.
+      const stat = await fs.stat(filePath).catch(() => null);
+      throw new Error(
+        `readJSON: empty file at ${filePath} (size=${stat?.size ?? '?'}, mtime=${stat?.mtimeMs ?? '?'})`
+      );
+    }
     return decoder.parse(JSON.parse(content, jsonReviver));
   } catch (error) {
     if ((error as any).code === 'ENOENT') return null;
+    if (error instanceof SyntaxError) {
+      // Attach the file path + size so the SyntaxError isn't anonymous
+      // when it bubbles up to a test. Without this we can't tell which
+      // entity (run / event / hook / etc.) the corruption was on.
+      const stat = await fs.stat(filePath).catch(() => null);
+      const content = await fs.readFile(filePath, 'utf-8').catch(() => null);
+      throw new Error(
+        `readJSON: ${error.message} parsing ${filePath} (size=${stat?.size ?? '?'}, content=${
+          content === null
+            ? '<unreadable>'
+            : JSON.stringify(content.slice(0, 200))
+        })`
+      );
+    }
     throw error;
   }
 }

From 2fce0bbe8ec2a1359dec54a31c37c03ca9416808 Mon Sep 17 00:00:00 2001
From: Peter Wielander <peter.wielander@vercel.com>
Date: Fri, 22 May 2026 15:34:00 +0200
Subject: [PATCH 4/6] [world-local] Make writeExclusive atomic for content,
 preserve SyntaxError type in readJSON
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous writeExclusive used `fs.writeFile(path, data, { flag: 'wx' })`,
which is atomic for the file *creation* (O_CREAT|O_EXCL) but NOT for the
subsequent data write. A concurrent reader could catch the file with
zero bytes (or just `{`) before the writer's content reached the page
cache.

This race had been silently papered over by `readHookTokenClaim`, which
swallows any SyntaxError as "no claim exists" — which is wrong: a partial
write means a claim *does* exist, just not yet flushed, and treating it as
absent lets duplicate hook tokens through. PR #2083's concurrent e2e run
also surfaced the same race on runs/<id>.json creation, producing
size=0 byte run files in CI logs.

Fix: write content to a temp file first, then `link(2)` it atomically
into place. POSIX link is atomic and fails with EEXIST if the target
already exists, preserving the exclusive-claim semantics. Readers now
either see ENOENT or the fully-populated inode — never a half-written
file.

Also preserve the SyntaxError type when annotating readJSON failures
with file path / size / content snippet, so callers like
readHookTokenClaim that intentionally swallow `error instanceof SyntaxError`
continue to work (the previous diagnostic threw a plain Error and broke
them).
---
 .changeset/readjson-diagnostics.md |  2 +-
 packages/world-local/src/fs.ts     | 70 ++++++++++++++++++------------
 2 files changed, 44 insertions(+), 28 deletions(-)

diff --git a/.changeset/readjson-diagnostics.md b/.changeset/readjson-diagnostics.md
index a608715905..4deaccf760 100644
--- a/.changeset/readjson-diagnostics.md
+++ b/.changeset/readjson-diagnostics.md
@@ -2,4 +2,4 @@
 '@workflow/world-local': patch
 ---
 
-`readJSON` now surfaces the file path, on-disk size, and a snippet of the (possibly empty) content when it encounters a `SyntaxError`. Previously a corrupted or zero-byte file produced a bare `Unexpected end of JSON input` with no indication of which entity (run / event / hook / etc.) was affected, making concurrent-read races extremely hard to diagnose from CI logs.
+Fix a concurrent-read race in `writeExclusive`. `fs.writeFile(path, data, { flag: 'wx' })` is atomic for file *creation* (`O_CREAT | O_EXCL`) but the subsequent data write is not, so a concurrent reader could observe a half-written file (e.g. just `{`) before the writer's content was flushed. Callers that combined exclusive claim semantics with a follow-up read of the payload — notably the hook-token claim path and run-creation — would intermittently see `SyntaxError: Unexpected end of JSON input` (or worse, treat a partial file as an empty claim, masking conflicts). Switch to a two-step pattern: write the full content to a temp file first, then `link(2)` it into place. POSIX `link` is atomic and fails with `EEXIST` if the target already exists, so readers either see `ENOENT` or the fully-populated inode. `readJSON` also annotates any `SyntaxError` with the file path, on-disk size, and a snippet of the offending content, making future cases of this class actionable from CI logs (the error type is preserved so callers that intentionally swallow `SyntaxError` continue to work).
diff --git a/packages/world-local/src/fs.ts b/packages/world-local/src/fs.ts
index 803474a162..a86ce4718f 100644
--- a/packages/world-local/src/fs.ts
+++ b/packages/world-local/src/fs.ts
@@ -357,34 +357,26 @@ export async function readJSON<T>(
   filePath: string,
   decoder: z.ZodType<T>
 ): Promise<T | null> {
+  let content: string;
   try {
-    const content = await fs.readFile(filePath, 'utf-8');
-    if (content.length === 0) {
-      // An empty file here means a write path produced zero bytes without
-      // the atomic tmp+rename guard, or that a reader caught the file
-      // briefly empty during creation. Surface enough state to pin it down
-      // from CI logs.
-      const stat = await fs.stat(filePath).catch(() => null);
-      throw new Error(
-        `readJSON: empty file at ${filePath} (size=${stat?.size ?? '?'}, mtime=${stat?.mtimeMs ?? '?'})`
-      );
-    }
-    return decoder.parse(JSON.parse(content, jsonReviver));
+    content = await fs.readFile(filePath, 'utf-8');
   } catch (error) {
     if ((error as any).code === 'ENOENT') return null;
+    throw error;
+  }
+  try {
+    return decoder.parse(JSON.parse(content, jsonReviver));
+  } catch (error) {
     if (error instanceof SyntaxError) {
-      // Attach the file path + size so the SyntaxError isn't anonymous
-      // when it bubbles up to a test. Without this we can't tell which
-      // entity (run / event / hook / etc.) the corruption was on.
+      // Annotate the SyntaxError with the file path, size, and a snippet of
+      // the (possibly empty / partial) content. Some writers — notably
+      // `writeExclusive` — are atomic for file *creation* but not for the
+      // data write that follows, so a concurrent reader can briefly observe
+      // a half-written file. Preserve the SyntaxError type so callers like
+      // `readHookTokenClaim` that intentionally swallow it via
+      // `error instanceof SyntaxError` continue to work.
       const stat = await fs.stat(filePath).catch(() => null);
-      const content = await fs.readFile(filePath, 'utf-8').catch(() => null);
-      throw new Error(
-        `readJSON: ${error.message} parsing ${filePath} (size=${stat?.size ?? '?'}, content=${
-          content === null
-            ? '<unreadable>'
-            : JSON.stringify(content.slice(0, 200))
-        })`
-      );
+      error.message = `${error.message} (parsing ${filePath}, size=${stat?.size ?? '?'}, content=${JSON.stringify(content.slice(0, 200))})`;
     }
     throw error;
   }
@@ -404,23 +396,47 @@ export async function deleteJSON(filePath: string): Promise<void> {
 }
 
 /**
- * Atomically create a file using O_CREAT | O_EXCL flags.
- * Returns true if the file was created, false if it already exists.
- * This is atomic at the OS level, safe for concurrent access.
+ * Atomically create a file containing `data`, failing if the target already
+ * exists. Returns true on successful claim, false if the file is already
+ * present.
+ *
+ * The naive `fs.writeFile(path, data, { flag: 'wx' })` form is atomic for the
+ * file *creation* (`O_CREAT | O_EXCL`) but the subsequent content write is
+ * not — a concurrent reader can observe the file with partial bytes (e.g.
+ * just `{`) before the writer's content has been flushed. Callers that
+ * combine `writeExclusive` for claim-style exclusion with a follow-up read
+ * of the claim payload would intermittently see a `SyntaxError` instead of
+ * the claim data.
+ *
+ * Use a two-step atomic pattern instead: write the full content to a temp
+ * file first, then `link(2)` it into place. POSIX guarantees `link` is
+ * atomic and fails with `EEXIST` if the target already exists. The reader
+ * either sees nothing (ENOENT) or the fully-populated linked inode — never
+ * a half-written file.
  */
 export async function writeExclusive(
   filePath: string,
   data: string
 ): Promise<boolean> {
   await ensureDir(path.dirname(filePath));
+  const tempPath = `${filePath}.tmp.${ulid()}`;
+  let tempCreated = false;
   try {
-    await fs.writeFile(filePath, data, { flag: 'wx' });
+    await fs.writeFile(tempPath, data);
+    tempCreated = true;
+    await withWindowsRetry(() => fs.link(tempPath, filePath));
     return true;
   } catch (error: any) {
     if (error.code === 'EEXIST') {
       return false;
     }
     throw error;
+  } finally {
+    if (tempCreated) {
+      // Best-effort cleanup of the temp inode regardless of whether the
+      // link succeeded — leaving it behind would leak files in the data dir.
+      await withWindowsRetry(() => fs.unlink(tempPath), 3).catch(() => {});
+    }
   }
 }
 

From c7060b131f9ae1a17401c7a90c8efddda81724d5 Mon Sep 17 00:00:00 2001
From: Peter Wielander <peter.wielander@vercel.com>
Date: Fri, 22 May 2026 15:43:51 +0200
Subject: [PATCH 5/6] [tests] Cap e2e concurrency at 3
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Vitest's default `maxConcurrency` of 5 saturates the workbench
preview deploys and the in-process world runtime under
`describe.concurrent`, pushing load-heavy tests (fibonacci tree
spawn, multi-step abort sequencing, AbortSignal.any composition)
into their per-test deadlines on PR #2083's run 4 even though
the same tests pass cleanly when run sequentially.

Drop to 3 in the root vitest config (only `test:e2e` and `bench`
consume it — per-package configs are untouched). The Vercel-deploy
matrix went from ~14-17min sequential → ~5-6min at concurrency 5;
trading some of that headroom for stability is the right call now
that the isolation/correctness fixes have landed.
---
 vitest.config.ts | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/vitest.config.ts b/vitest.config.ts
index 7d49659dbd..1ca89f864a 100644
--- a/vitest.config.ts
+++ b/vitest.config.ts
@@ -3,6 +3,14 @@ import { defineConfig } from 'vitest/config';
 export default defineConfig({
   test: {
     testTimeout: 60_000,
+    // Cap parallel test execution at 3 within `describe.concurrent` blocks.
+    // The e2e suite is the only consumer of this config (only `test:e2e`
+    // and `bench` use the root vitest setup); the default of 5 saturates
+    // the workbench preview deploy / local runtime enough that load-heavy
+    // tests (fibonacci tree spawn, multi-step abort sequencing) bump into
+    // their per-test timeouts even though the same tests pass comfortably
+    // when run sequentially.
+    maxConcurrency: 3,
   },
   benchmark: {
     include: ['**/*.bench.ts'],

From 6e264eb7364ea20e10992069ff2c0974b7a5459f Mon Sep 17 00:00:00 2001
From: Peter Wielander <peter.wielander@vercel.com>
Date: Fri, 22 May 2026 15:54:41 +0200
Subject: [PATCH 6/6] [tests] Revert e2e maxConcurrency cap
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Dropping vitest's default concurrency cap from 5 to 3 (commit c7060b131f)
didn't reduce the remaining e2e flake count — runs 3/4 at default
concurrency=5 and run 5 at concurrency=3 all landed on exactly 15
failures. The remaining failures are external-network and source-map
class issues (httpbin/postman-echo abort tests, nuxt/nextjs-webpack
sourcemap assertions, fibonacci tree spawn timeouts) that lower test
parallelism doesn't address. Keeping the speedup is more valuable than
the small load reduction.
---
 vitest.config.ts | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/vitest.config.ts b/vitest.config.ts
index 1ca89f864a..7d49659dbd 100644
--- a/vitest.config.ts
+++ b/vitest.config.ts
@@ -3,14 +3,6 @@ import { defineConfig } from 'vitest/config';
 export default defineConfig({
   test: {
     testTimeout: 60_000,
-    // Cap parallel test execution at 3 within `describe.concurrent` blocks.
-    // The e2e suite is the only consumer of this config (only `test:e2e`
-    // and `bench` use the root vitest setup); the default of 5 saturates
-    // the workbench preview deploy / local runtime enough that load-heavy
-    // tests (fibonacci tree spawn, multi-step abort sequencing) bump into
-    // their per-test timeouts even though the same tests pass comfortably
-    // when run sequentially.
-    maxConcurrency: 3,
   },
   benchmark: {
     include: ['**/*.bench.ts'],