triggerdotdev
diff --git a/.github/workflows/unit-tests-internal.yml b/.github/workflows/unit-tests-internal.yml
Lines changed: 17 additions & 7 deletions
diff --git a/.github/workflows/unit-tests-packages.yml b/.github/workflows/unit-tests-packages.yml
Lines changed: 19 additions & 7 deletions
diff --git a/.github/workflows/unit-tests-webapp.yml b/.github/workflows/unit-tests-webapp.yml
Lines changed: 18 additions & 8 deletions
diff --git a/.gitignore b/.gitignore
Lines changed: 3 additions & 1 deletion
diff --git a/apps/webapp/test/engine/streamBatchItems.test.ts b/apps/webapp/test/engine/streamBatchItems.test.ts
Lines changed: 9 additions & 6 deletions
diff --git a/apps/webapp/test/runsBackfiller.test.ts b/apps/webapp/test/runsBackfiller.test.ts
Lines changed: 2 additions & 2 deletions
diff --git a/apps/webapp/test/runsReplicationBenchmark.test.ts b/apps/webapp/test/runsReplicationBenchmark.test.ts
Lines changed: 2 additions & 2 deletions
@@ -19,8 +19,8 @@ jobs:
       # one flaky shard shouldn't cancel its siblings - lets us re-run only the failed shard
       fail-fast: false
       matrix:
-        shardIndex: [1, 2, 3, 4, 5, 6, 7, 8]
-        shardTotal: [8]
+        shardIndex: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
+        shardTotal: [12]
     env:
       DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
       SHARD_INDEX: ${{ matrix.shardIndex }}
@@ -83,12 +83,22 @@ jobs:
       - name: 🐳 Pre-pull testcontainer images
         if: ${{ env.DOCKERHUB_USERNAME }}
         run: |
+          # pull IMAGE: docker pull with up to 3 attempts, 10s apart - DockerHub registry timeouts are a recurring transient CI flake.
+          pull() {
+            for attempt in 1 2 3; do
+              docker pull "$1" && return 0
+              [ "$attempt" -lt 3 ] || break # final attempt failed: skip the misleading "retrying" warning and the wasted sleep
+              echo "::warning::docker pull $1 failed (attempt ${attempt}/3); retrying in 10s"; sleep 10
+            done
+            echo "::error::docker pull $1 failed after 3 attempts"
+            return 1
+          }
           echo "Pre-pulling Docker images with authenticated session..."
-          docker pull postgres:14
-          docker pull clickhouse/clickhouse-server:25.4-alpine
-          docker pull redis:7.2
-          docker pull testcontainers/ryuk:0.14.0
-          docker pull electricsql/electric:1.2.4
+          pull postgres:14
+          pull clickhouse/clickhouse-server:25.4-alpine
+          pull redis:7.2
+          pull testcontainers/ryuk:0.14.0
+          pull electricsql/electric:1.2.4
           echo "Image pre-pull complete"
 
       - name: 📥 Download deps
 
@@ -16,9 +16,11 @@ jobs:
     name: "🧪 Unit Tests: Packages"
     runs-on: ubuntu-latest
     strategy:
+      # one flaky shard shouldn't cancel its siblings - lets us re-run only the failed shard
+      fail-fast: false
       matrix:
-        shardIndex: [1]
-        shardTotal: [1]
+        shardIndex: [1, 2, 3]
+        shardTotal: [3]
     env:
       DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
       SHARD_INDEX: ${{ matrix.shardIndex }}
@@ -81,12 +83,22 @@ jobs:
       - name: 🐳 Pre-pull testcontainer images
         if: ${{ env.DOCKERHUB_USERNAME }}
         run: |
+          # pull IMAGE: docker pull with up to 3 attempts, 10s apart - DockerHub registry timeouts are a recurring transient CI flake.
+          pull() {
+            for attempt in 1 2 3; do
+              docker pull "$1" && return 0
+              [ "$attempt" -lt 3 ] || break # final attempt failed: skip the misleading "retrying" warning and the wasted sleep
+              echo "::warning::docker pull $1 failed (attempt ${attempt}/3); retrying in 10s"; sleep 10
+            done
+            echo "::error::docker pull $1 failed after 3 attempts"
+            return 1
+          }
           echo "Pre-pulling Docker images with authenticated session..."
-          docker pull postgres:14
-          docker pull clickhouse/clickhouse-server:25.4-alpine
-          docker pull redis:7.2
-          docker pull testcontainers/ryuk:0.14.0
-          docker pull electricsql/electric:1.2.4
+          pull postgres:14
+          pull clickhouse/clickhouse-server:25.4-alpine
+          pull redis:7.2
+          pull testcontainers/ryuk:0.14.0
+          pull electricsql/electric:1.2.4
           echo "Image pre-pull complete"
 
       - name: 📥 Download deps
 
@@ -19,8 +19,8 @@ jobs:
       # one flaky shard shouldn't cancel its siblings - lets us re-run only the failed shard
       fail-fast: false
       matrix:
-        shardIndex: [1, 2, 3, 4, 5, 6, 7, 8]
-        shardTotal: [8]
+        shardIndex: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+        shardTotal: [10]
     env:
       DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
       SHARD_INDEX: ${{ matrix.shardIndex }}
@@ -83,13 +83,23 @@ jobs:
       - name: 🐳 Pre-pull testcontainer images
         if: ${{ env.DOCKERHUB_USERNAME }}
         run: |
+          # pull IMAGE: docker pull with up to 3 attempts, 10s apart - DockerHub registry timeouts are a recurring transient CI flake.
+          pull() {
+            for attempt in 1 2 3; do
+              docker pull "$1" && return 0
+              [ "$attempt" -lt 3 ] || break # final attempt failed: skip the misleading "retrying" warning and the wasted sleep
+              echo "::warning::docker pull $1 failed (attempt ${attempt}/3); retrying in 10s"; sleep 10
+            done
+            echo "::error::docker pull $1 failed after 3 attempts"
+            return 1
+          }
           echo "Pre-pulling Docker images with authenticated session..."
-          docker pull postgres:14
-          docker pull clickhouse/clickhouse-server:25.4-alpine
-          docker pull redis:7.2
-          docker pull testcontainers/ryuk:0.14.0
-          docker pull electricsql/electric:1.2.4
-          docker pull minio/minio:latest
+          pull postgres:14
+          pull clickhouse/clickhouse-server:25.4-alpine
+          pull redis:7.2
+          pull testcontainers/ryuk:0.14.0
+          pull electricsql/electric:1.2.4
+          pull minio/minio:latest
           echo "Image pre-pull complete"
 
       - name: 📥 Download deps
 
@@ -72,4 +72,6 @@ apps/**/public/build
 .mcp.log
 .mcp.json
 .cursor/debug.log
-ailogger-output.log
+ailogger-output.log
+# per-package vitest timing capture (transient; merged into root test-timings.json)
+.vitest-timing.json
@@ -16,7 +16,11 @@ vi.mock("~/services/platform.v3.server", async (importOriginal) => {
 
 import { RunEngine } from "@internal/run-engine";
 import { setupAuthenticatedEnvironment } from "@internal/run-engine/tests";
-import { containerTest } from "@internal/testcontainers";
+// Per-test redis (isolated): each test spins up its own RunEngine and runs batch work, which leaves
+// background activity on redis that outlives the test - sharing a worker redis across the 16 cases
+// here caused cross-test interference and 30s seal-timeout flakes. Same carve-out as the run-engine
+// batch tests.
+import { containerTestWithIsolatedRedis as containerTest } from "@internal/testcontainers";
 import { trace } from "@opentelemetry/api";
 import { PrismaClient } from "@trigger.dev/database";
 import { BatchId } from "@trigger.dev/core/v3/isomorphic";
@@ -1584,10 +1588,7 @@ describe("createNdjsonParserStream", () => {
     const parser = createNdjsonParserStream(1024);
     const results = await collectStream(stream.pipeThrough(parser));
 
-    expect(results).toEqual([
-      { payload: "line1\nline2\nline3" },
-      { payload: "no newlines" },
-    ]);
+    expect(results).toEqual([{ payload: "line1\nline2\nline3" }, { payload: "no newlines" }]);
   });
 
   it("should skip empty lines", async () => {
@@ -1888,7 +1889,9 @@ describe("extractIndexAndTask", () => {
   });
 
   it("should not match nested keys", () => {
-    const bytes = encoder.encode('{"nested":{"index":999,"task":"inner"},"index":5,"task":"outer"}');
+    const bytes = encoder.encode(
+      '{"nested":{"index":999,"task":"inner"},"index":5,"task":"outer"}'
+    );
     const result = extractIndexAndTask(bytes);
     expect(result.index).toBe(5);
     expect(result.task).toBe("outer");
 
@@ -7,7 +7,7 @@ vi.mock("~/db.server", () => ({
 }));
 
 import { ClickHouse } from "@internal/clickhouse";
-import { containerTest } from "@internal/testcontainers";
+import { replicationContainerTest } from "@internal/testcontainers";
 import { z } from "zod";
 import { RunsBackfillerService } from "~/services/runsBackfiller.server";
 import { RunsReplicationService } from "~/services/runsReplicationService.server";
@@ -17,7 +17,7 @@ import { TestReplicationClickhouseFactory } from "./utils/testReplicationClickho
 vi.setConfig({ testTimeout: 60_000 });
 
 describe("RunsBackfillerService", () => {
-  containerTest(
+  replicationContainerTest(
     "should backfill completed runs to clickhouse",
     async ({ clickhouseContainer, redisOptions, postgresContainer, prisma }) => {
       const clickhouse = new ClickHouse({
 
@@ -1,5 +1,5 @@
 import { ClickHouse } from "@internal/clickhouse";
-import { containerTest } from "@internal/testcontainers";
+import { replicationContainerTest } from "@internal/testcontainers";
 import { fork, type ChildProcess } from "node:child_process";
 import { performance, PerformanceObserver } from "node:perf_hooks";
 import { setTimeout } from "node:timers/promises";
@@ -501,7 +501,7 @@ function compareBenchmarks(baseline: BenchmarkResult, comparison: BenchmarkResul
 }
 
 describe("RunsReplicationService Benchmark", () => {
-  containerTest.skipIf(process.env.BENCHMARKS_ENABLED !== "1")(
+  replicationContainerTest.skipIf(process.env.BENCHMARKS_ENABLED !== "1")(
     "should benchmark error fingerprinting performance impact",
     async ({ clickhouseContainer, redisOptions, postgresContainer, prisma }) => {
       // Enable replica identity for TaskRun table