Skip to content

Commit fa15438

Browse files
authored
perf(ci): speed up unit tests with LPT sharding + container scoping (#3855)
Speeds up and de-flakes the unit-test suite: testcontainers booted once per vitest worker (per-test isolation kept only where a test runs background redis work that outlives it), a duration-weighted shard sequencer so each shard does roughly equal work, the slowest suites split, two genuine flakes fixed (`streamBatchItems` shared-redis leak; run-engine waits that relied on fixed sleeps), and transient DockerHub pulls retried. **Timings (CI, per-shard wall):** worst unit-test shard ~771s → ~294s; packages/webapp shards ~250-270s, most internal ~190-240s. All 25 shards green. A shard breaks down as ~70s fixed setup (install / image-pull / generate) + ~70s cold `^build` + the actual container tests. So the remaining cost is mostly the tests themselves plus that fixed setup. **Next (separate, timings):** - **typecheck (~6m24s)** — the slowest check overall; bound by full-graph `tsc`, not the TS version (a TS6 branch is still ~6m17s). The real lever is **tsgo** (the Go compiler). - Possible later: turbo CI caching could trim the ~70s cold build on *warm* runs, but it's conditional (cold runs rebuild anyway) and doesn't touch setup or test time — secondary. `cli-v3` e2e and `sdk-compat` are path-gated (don't run on test-infra changes) and already comfortably fast.
1 parent 97036fb commit fa15438

45 files changed

Lines changed: 4239 additions & 3123 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/workflows/unit-tests-internal.yml

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@ jobs:
1919
# one flaky shard shouldn't cancel its siblings - lets us re-run only the failed shard
2020
fail-fast: false
2121
matrix:
22-
shardIndex: [1, 2, 3, 4, 5, 6, 7, 8]
23-
shardTotal: [8]
22+
shardIndex: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
23+
shardTotal: [12]
2424
env:
2525
DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
2626
SHARD_INDEX: ${{ matrix.shardIndex }}
@@ -83,12 +83,22 @@ jobs:
8383
- name: 🐳 Pre-pull testcontainer images
8484
if: ${{ env.DOCKERHUB_USERNAME }}
8585
run: |
86+
# Retry each pull - DockerHub registry timeouts are a recurring transient CI flake.
87+
pull() {
88+
for attempt in 1 2 3; do
89+
docker pull "$1" && return 0
90+
echo "::warning::docker pull $1 failed (attempt ${attempt}/3); retrying in 10s"
91+
sleep 10
92+
done
93+
echo "::error::docker pull $1 failed after 3 attempts"
94+
return 1
95+
}
8696
echo "Pre-pulling Docker images with authenticated session..."
87-
docker pull postgres:14
88-
docker pull clickhouse/clickhouse-server:25.4-alpine
89-
docker pull redis:7.2
90-
docker pull testcontainers/ryuk:0.14.0
91-
docker pull electricsql/electric:1.2.4
97+
pull postgres:14
98+
pull clickhouse/clickhouse-server:25.4-alpine
99+
pull redis:7.2
100+
pull testcontainers/ryuk:0.14.0
101+
pull electricsql/electric:1.2.4
92102
echo "Image pre-pull complete"
93103
94104
- name: 📥 Download deps

.github/workflows/unit-tests-packages.yml

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,11 @@ jobs:
1616
name: "🧪 Unit Tests: Packages"
1717
runs-on: ubuntu-latest
1818
strategy:
19+
# one flaky shard shouldn't cancel its siblings - lets us re-run only the failed shard
20+
fail-fast: false
1921
matrix:
20-
shardIndex: [1]
21-
shardTotal: [1]
22+
shardIndex: [1, 2, 3]
23+
shardTotal: [3]
2224
env:
2325
DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
2426
SHARD_INDEX: ${{ matrix.shardIndex }}
@@ -81,12 +83,22 @@ jobs:
8183
- name: 🐳 Pre-pull testcontainer images
8284
if: ${{ env.DOCKERHUB_USERNAME }}
8385
run: |
86+
# Retry each pull - DockerHub registry timeouts are a recurring transient CI flake.
87+
pull() {
88+
for attempt in 1 2 3; do
89+
docker pull "$1" && return 0
90+
echo "::warning::docker pull $1 failed (attempt ${attempt}/3); retrying in 10s"
91+
sleep 10
92+
done
93+
echo "::error::docker pull $1 failed after 3 attempts"
94+
return 1
95+
}
8496
echo "Pre-pulling Docker images with authenticated session..."
85-
docker pull postgres:14
86-
docker pull clickhouse/clickhouse-server:25.4-alpine
87-
docker pull redis:7.2
88-
docker pull testcontainers/ryuk:0.14.0
89-
docker pull electricsql/electric:1.2.4
97+
pull postgres:14
98+
pull clickhouse/clickhouse-server:25.4-alpine
99+
pull redis:7.2
100+
pull testcontainers/ryuk:0.14.0
101+
pull electricsql/electric:1.2.4
90102
echo "Image pre-pull complete"
91103
92104
- name: 📥 Download deps

.github/workflows/unit-tests-webapp.yml

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@ jobs:
1919
# one flaky shard shouldn't cancel its siblings - lets us re-run only the failed shard
2020
fail-fast: false
2121
matrix:
22-
shardIndex: [1, 2, 3, 4, 5, 6, 7, 8]
23-
shardTotal: [8]
22+
shardIndex: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
23+
shardTotal: [10]
2424
env:
2525
DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
2626
SHARD_INDEX: ${{ matrix.shardIndex }}
@@ -83,13 +83,23 @@ jobs:
8383
- name: 🐳 Pre-pull testcontainer images
8484
if: ${{ env.DOCKERHUB_USERNAME }}
8585
run: |
86+
# Retry each pull - DockerHub registry timeouts are a recurring transient CI flake.
87+
pull() {
88+
for attempt in 1 2 3; do
89+
docker pull "$1" && return 0
90+
echo "::warning::docker pull $1 failed (attempt ${attempt}/3); retrying in 10s"
91+
sleep 10
92+
done
93+
echo "::error::docker pull $1 failed after 3 attempts"
94+
return 1
95+
}
8696
echo "Pre-pulling Docker images with authenticated session..."
87-
docker pull postgres:14
88-
docker pull clickhouse/clickhouse-server:25.4-alpine
89-
docker pull redis:7.2
90-
docker pull testcontainers/ryuk:0.14.0
91-
docker pull electricsql/electric:1.2.4
92-
docker pull minio/minio:latest
97+
pull postgres:14
98+
pull clickhouse/clickhouse-server:25.4-alpine
99+
pull redis:7.2
100+
pull testcontainers/ryuk:0.14.0
101+
pull electricsql/electric:1.2.4
102+
pull minio/minio:latest
93103
echo "Image pre-pull complete"
94104
95105
- name: 📥 Download deps

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,4 +72,6 @@ apps/**/public/build
7272
.mcp.log
7373
.mcp.json
7474
.cursor/debug.log
75-
ailogger-output.log
75+
ailogger-output.log
76+
# per-package vitest timing capture (transient; merged into root test-timings.json)
77+
.vitest-timing.json

apps/webapp/test/engine/streamBatchItems.test.ts

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,11 @@ vi.mock("~/services/platform.v3.server", async (importOriginal) => {
1616

1717
import { RunEngine } from "@internal/run-engine";
1818
import { setupAuthenticatedEnvironment } from "@internal/run-engine/tests";
19-
import { containerTest } from "@internal/testcontainers";
19+
// Per-test redis (isolated): each test spins up its own RunEngine and runs batch work, which leaves
20+
// background activity on redis that outlives the test - sharing a worker redis across the 16 cases
21+
// here caused cross-test interference and 30s seal-timeout flakes. Same carve-out as the run-engine
22+
// batch tests.
23+
import { containerTestWithIsolatedRedis as containerTest } from "@internal/testcontainers";
2024
import { trace } from "@opentelemetry/api";
2125
import { PrismaClient } from "@trigger.dev/database";
2226
import { BatchId } from "@trigger.dev/core/v3/isomorphic";
@@ -1584,10 +1588,7 @@ describe("createNdjsonParserStream", () => {
15841588
const parser = createNdjsonParserStream(1024);
15851589
const results = await collectStream(stream.pipeThrough(parser));
15861590

1587-
expect(results).toEqual([
1588-
{ payload: "line1\nline2\nline3" },
1589-
{ payload: "no newlines" },
1590-
]);
1591+
expect(results).toEqual([{ payload: "line1\nline2\nline3" }, { payload: "no newlines" }]);
15911592
});
15921593

15931594
it("should skip empty lines", async () => {
@@ -1888,7 +1889,9 @@ describe("extractIndexAndTask", () => {
18881889
});
18891890

18901891
it("should not match nested keys", () => {
1891-
const bytes = encoder.encode('{"nested":{"index":999,"task":"inner"},"index":5,"task":"outer"}');
1892+
const bytes = encoder.encode(
1893+
'{"nested":{"index":999,"task":"inner"},"index":5,"task":"outer"}'
1894+
);
18921895
const result = extractIndexAndTask(bytes);
18931896
expect(result.index).toBe(5);
18941897
expect(result.task).toBe("outer");

apps/webapp/test/runsBackfiller.test.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ vi.mock("~/db.server", () => ({
77
}));
88

99
import { ClickHouse } from "@internal/clickhouse";
10-
import { containerTest } from "@internal/testcontainers";
10+
import { replicationContainerTest } from "@internal/testcontainers";
1111
import { z } from "zod";
1212
import { RunsBackfillerService } from "~/services/runsBackfiller.server";
1313
import { RunsReplicationService } from "~/services/runsReplicationService.server";
@@ -17,7 +17,7 @@ import { TestReplicationClickhouseFactory } from "./utils/testReplicationClickho
1717
vi.setConfig({ testTimeout: 60_000 });
1818

1919
describe("RunsBackfillerService", () => {
20-
containerTest(
20+
replicationContainerTest(
2121
"should backfill completed runs to clickhouse",
2222
async ({ clickhouseContainer, redisOptions, postgresContainer, prisma }) => {
2323
const clickhouse = new ClickHouse({

apps/webapp/test/runsReplicationBenchmark.test.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import { ClickHouse } from "@internal/clickhouse";
2-
import { containerTest } from "@internal/testcontainers";
2+
import { replicationContainerTest } from "@internal/testcontainers";
33
import { fork, type ChildProcess } from "node:child_process";
44
import { performance, PerformanceObserver } from "node:perf_hooks";
55
import { setTimeout } from "node:timers/promises";
@@ -501,7 +501,7 @@ function compareBenchmarks(baseline: BenchmarkResult, comparison: BenchmarkResul
501501
}
502502

503503
describe("RunsReplicationService Benchmark", () => {
504-
containerTest.skipIf(process.env.BENCHMARKS_ENABLED !== "1")(
504+
replicationContainerTest.skipIf(process.env.BENCHMARKS_ENABLED !== "1")(
505505
"should benchmark error fingerprinting performance impact",
506506
async ({ clickhouseContainer, redisOptions, postgresContainer, prisma }) => {
507507
// Enable replica identity for TaskRun table

0 commit comments

Comments
 (0)