From 05765df5d31be09f7c7446f74442eed39397d771 Mon Sep 17 00:00:00 2001 From: Zbigniew Sobiecki Date: Mon, 11 May 2026 15:40:55 +0000 Subject: [PATCH 01/12] fix(codex): compute cost from token deltas, not nonexistent cost_usd MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex cost logging was silently broken in three compounding ways: (1) `codex exec --json` never emits cost_usd fields (openai/codex#17539), so the parser branches reading total_cost_usd / cost_usd / usage.cost_usd were dead code; (2) `usage` on turn.completed is the CUMULATIVE session total, not per-turn — persisting it per row gave dashboards O(N²) inflated totals when summed; (3) the pricing table had no entry for gpt-5.4 / gpt-5.3-codex / gpt-5.3-codex-spark / codex-mini-latest, so even if cost-from-tokens were wired, calculateCost returned 0. Now: the engine tracks a run-level cumulative high-water mark, computes per-turn deltas on each turn.completed (clamping out-of-order regressions to 0), folds reasoning_output_tokens into output for billing, and computes cost via calculateCost('openai:'+model, delta) — mirroring the working claude-code path. context.cost accumulates additively instead of overwriting. Public OpenAI metered rates added for every Codex model; gpt-5.3-codex-spark proxies to gpt-5.3-codex rates since it has no metered API listing. A coverage test fails CI when a new Codex model is added without a pricing row. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/backends/codex/index.ts | 116 ++++- src/backends/codex/jsonlParser.ts | 30 +- src/utils/llmMetrics.ts | 16 +- tests/unit/backends/codex-cost.test.ts | 463 ++++++++++++++++++ tests/unit/backends/codex-jsonlParser.test.ts | 29 +- tests/unit/backends/codex.test.ts | 40 +- tests/unit/utils/llmMetrics-codex.test.ts | 76 +++ 7 files changed, 730 insertions(+), 40 deletions(-) create mode 100644 tests/unit/backends/codex-cost.test.ts create mode 100644 tests/unit/utils/llmMetrics-codex.test.ts diff --git a/src/backends/codex/index.ts b/src/backends/codex/index.ts index 61f62a6e0..23fe4d860 100644 --- a/src/backends/codex/index.ts +++ b/src/backends/codex/index.ts @@ -6,6 +6,7 @@ import { join } from 'node:path'; import { createInterface } from 'node:readline'; import { writeProjectCredential } from '../../db/repositories/credentialsRepository.js'; +import { calculateCost } from '../../utils/llmMetrics.js'; import { CODEX_ENGINE_DEFINITION } from '../catalog.js'; import { cleanupContextFiles } from '../shared/contextFiles.js'; import { appendEngineLog } from '../shared/engineLog.js'; @@ -53,6 +54,24 @@ type CodexTurnAccumulator = { usage: UsageSummary | null; }; +/** + * Run-level cumulative usage high-water mark. + * + * `codex exec --json` emits `usage` on every `turn.completed` as the CUMULATIVE + * session total — NOT a per-turn delta (upstream openai/codex#17539). We track + * the previous cumulative here so each turn persists its DELTA, not the running + * total. Without this, a 10-turn run with 100k tokens each would persist + * {100k, 200k, 300k, ...} = 5.5M summed instead of the true 1M. + * + * Reset/init: all zeros at the start of a run. Never reset per-turn. + */ +type CodexCumulativeUsage = { + inputTokens: number; + outputTokens: number; + cachedTokens: number; + reasoningTokens: number; +}; + type CodexLineContext = { input: AgentExecutionPlan; model: string; @@ -64,6 +83,8 @@ type CodexLineContext = { finalError?: string; /** Accumulator for the turn currently in progress. Reset on turn.started/thread.started. */ currentTurn: CodexTurnAccumulator; + /** Previous turn's cumulative usage — used to compute per-turn deltas. */ + cumulativeUsage: CodexCumulativeUsage; }; function tomlString(value: string): string { @@ -99,39 +120,110 @@ function accumulateTurnUsage(context: CodexLineContext, usage: UsageSummary): vo if (usage.inputTokens !== undefined) acc.usage.inputTokens = usage.inputTokens; if (usage.outputTokens !== undefined) acc.usage.outputTokens = usage.outputTokens; if (usage.cachedTokens !== undefined) acc.usage.cachedTokens = usage.cachedTokens; - if (usage.costUsd !== undefined) acc.usage.costUsd = usage.costUsd; + if (usage.reasoningTokens !== undefined) acc.usage.reasoningTokens = usage.reasoningTokens; } } +/** + * Compute the per-turn delta against the run-level cumulative high-water mark, + * then advance the high-water mark. Returns the delta. Clamps to 0 on out-of-order + * events (cumulative goes backwards) and logs a WARN. + * + * Upstream codex emits the SESSION-WIDE cumulative on every turn.completed (see + * openai/codex#17539). The delta is what we want to persist per-turn-row so that + * dashboard aggregations sum correctly. + */ +function computeTurnDelta(context: CodexLineContext, usage: UsageSummary): CodexCumulativeUsage { + const prev = context.cumulativeUsage; + const curr: CodexCumulativeUsage = { + inputTokens: usage.inputTokens ?? prev.inputTokens, + outputTokens: usage.outputTokens ?? prev.outputTokens, + cachedTokens: usage.cachedTokens ?? prev.cachedTokens, + reasoningTokens: usage.reasoningTokens ?? prev.reasoningTokens, + }; + const delta: CodexCumulativeUsage = { + inputTokens: Math.max(0, curr.inputTokens - prev.inputTokens), + outputTokens: Math.max(0, curr.outputTokens - prev.outputTokens), + cachedTokens: Math.max(0, curr.cachedTokens - prev.cachedTokens), + reasoningTokens: Math.max(0, curr.reasoningTokens - prev.reasoningTokens), + }; + + if ( + curr.inputTokens < prev.inputTokens || + curr.outputTokens < prev.outputTokens || + curr.cachedTokens < prev.cachedTokens || + curr.reasoningTokens < prev.reasoningTokens + ) { + context.input.logWriter( + 'WARN', + 'Codex turn.completed reported lower cumulative usage than previous turn — clamping delta to 0', + { prev, curr }, + ); + // Don't advance backwards. Keep the high-water mark. + return delta; + } + + context.cumulativeUsage = curr; + return delta; +} + /** * Persist exactly one storeLlmCall row for the completed turn, then reset the accumulator. * Called only from turn.completed to guarantee one row per turn, never from intermediate events. + * + * Cost = calculateCost('openai:', delta) — Codex never emits cost_usd + * upstream (openai/codex#17539), so cost is always computed CASCADE-side from + * the per-turn token DELTA × the pricing table at src/utils/llmMetrics.ts. + * Reasoning output tokens are folded into output for billing (OpenAI bills them + * at the output rate) while being preserved separately in the stored payload. */ function persistTurnLlmCall(context: CodexLineContext): void { const acc = context.currentTurn; const usage = acc.usage; + context.llmCallCount += 1; + + let delta: CodexCumulativeUsage | null = null; + let costUsd: number | undefined; + let outputForRow: number | undefined; + if (usage) { - context.cost = usage.costUsd ?? context.cost; + delta = computeTurnDelta(context, usage); + // Output billing includes reasoning tokens — OpenAI bills reasoning at the + // output rate. Fold them in for cost math; the stored row's `outputTokens` + // reflects the same combined figure for dashboard accuracy. + const billableOutput = delta.outputTokens + delta.reasoningTokens; + outputForRow = delta.outputTokens === 0 && billableOutput === 0 ? 0 : billableOutput; + const turnCost = calculateCost(`openai:${context.model}`, { + inputTokens: delta.inputTokens, + outputTokens: billableOutput, + totalTokens: delta.inputTokens + billableOutput, + cachedInputTokens: delta.cachedTokens, + }); + if (turnCost > 0) { + costUsd = turnCost; + context.cost = (context.cost ?? 0) + turnCost; + } } - context.llmCallCount += 1; - // Build a compact turn-scoped payload: text summary + tool names + usage. - // Storing this instead of the raw event JSONL keeps the payload small and readable. const turnPayload = JSON.stringify({ turn: context.llmCallCount, text: acc.textSummary.join(' ').slice(0, 500) || undefined, tools: acc.toolNames.length > 0 ? acc.toolNames : undefined, usage: usage ?? undefined, + delta: delta ?? undefined, + // Surfaced explicitly so a future maintainer triaging cost drift can + // see at a glance that reasoning was folded into output. + reasoning: delta && delta.reasoningTokens > 0 ? delta.reasoningTokens : undefined, }); logLlmCall({ runId: context.input.runId, callNumber: context.llmCallCount, model: context.model, - inputTokens: usage?.inputTokens, - outputTokens: usage?.outputTokens, - cachedTokens: usage?.cachedTokens, - costUsd: usage?.costUsd, + inputTokens: delta?.inputTokens, + outputTokens: outputForRow, + cachedTokens: delta?.cachedTokens, + costUsd, response: turnPayload, engineLabel: 'Codex', }); @@ -516,6 +608,12 @@ export class CodexEngine extends NativeToolEngine { cost, finalError, currentTurn: { textSummary: [], toolNames: [], usage: null }, + cumulativeUsage: { + inputTokens: 0, + outputTokens: 0, + cachedTokens: 0, + reasoningTokens: 0, + }, }; child.once('error', (error) => { diff --git a/src/backends/codex/jsonlParser.ts b/src/backends/codex/jsonlParser.ts index 17037bb94..08f35447c 100644 --- a/src/backends/codex/jsonlParser.ts +++ b/src/backends/codex/jsonlParser.ts @@ -19,7 +19,12 @@ export type UsageSummary = { inputTokens?: number; outputTokens?: number; cachedTokens?: number; - costUsd?: number; + /** + * Reasoning-model output tokens (gpt-5-codex / o1 / o3-class). OpenAI bills + * these as output tokens; CASCADE folds them into outputTokens for cost math + * while keeping the breakdown visible in the stored response payload. + */ + reasoningTokens?: number; }; export function extractTextFromContentParts(candidate: unknown): string[] { @@ -161,17 +166,18 @@ export function extractUsage(event: JsonRecord): UsageSummary | null { : undefined; const cachedTokens = typeof usage?.cached_input_tokens === 'number' ? usage.cached_input_tokens : undefined; - const costUsd = - typeof event.total_cost_usd === 'number' - ? event.total_cost_usd - : typeof event.cost_usd === 'number' - ? event.cost_usd - : typeof usage?.cost_usd === 'number' - ? usage.cost_usd - : undefined; - - return inputTokens !== undefined || outputTokens !== undefined || costUsd !== undefined - ? { inputTokens, outputTokens, cachedTokens, costUsd } + const reasoningTokens = + typeof usage?.reasoning_output_tokens === 'number' ? usage.reasoning_output_tokens : undefined; + + // `codex exec --json` does not emit any cost field today (see openai/codex#17539). + // Cost is computed CASCADE-side from token deltas × the pricing table; any + // cost-shaped fields on the event are intentionally ignored. + + return inputTokens !== undefined || + outputTokens !== undefined || + cachedTokens !== undefined || + reasoningTokens !== undefined + ? { inputTokens, outputTokens, cachedTokens, reasoningTokens } : null; } diff --git a/src/utils/llmMetrics.ts b/src/utils/llmMetrics.ts index e68a3c33b..03924dae7 100644 --- a/src/utils/llmMetrics.ts +++ b/src/utils/llmMetrics.ts @@ -23,8 +23,20 @@ const MODEL_PRICING: Record Promise>(); +const mockWriteFile = vi.fn<() => Promise>(); +const mockMkdir = vi.fn<() => Promise>(); +const mockReadFile = vi.fn<() => Promise>(); + +vi.mock('node:child_process', () => ({ + spawn: (...args: unknown[]) => mockSpawn(...args), +})); + +vi.mock('node:fs/promises', () => ({ + mkdir: (...args: unknown[]) => mockMkdir(...args), + writeFile: (...args: unknown[]) => mockWriteFile(...args), + readFile: (...args: unknown[]) => mockReadFile(...args), +})); + +vi.mock('../../../src/db/repositories/credentialsRepository.js', () => ({ + writeProjectCredential: (...args: unknown[]) => mockWriteProjectCredential(...args), +})); + +vi.mock('../../../src/db/repositories/runsRepository.js', () => ({ + storeLlmCall: (...args: unknown[]) => mockStoreLlmCall(...args), +})); + +vi.mock('../../../src/utils/logging.js', () => ({ + logger: { + info: vi.fn(), + warn: vi.fn(), + error: vi.fn(), + debug: vi.fn(), + }, +})); + +import { CodexEngine } from '../../../src/backends/codex/index.js'; +import { DEFAULT_CODEX_MODEL } from '../../../src/backends/codex/models.js'; +import type { AgentExecutionPlan } from '../../../src/backends/types.js'; +import { calculateCost } from '../../../src/utils/llmMetrics.js'; + +function makeInput(overrides: Partial = {}): AgentExecutionPlan { + return { + agentType: 'implementation', + project: { + id: 'test-project', + orgId: 'org-1', + name: 'Test Project', + repo: 'owner/repo', + baseBranch: 'main', + branchPrefix: 'feature/', + pm: { type: 'trello' }, + trello: { boardId: 'b1', lists: {}, labels: {} }, + engineSettings: undefined, + }, + config: { projects: [] }, + repoDir: '/tmp/repo', + systemPrompt: 'You are an agent.', + taskPrompt: 'Implement feature X.', + cliToolsDir: '/usr/bin', + availableTools: [], + contextInjections: [], + maxIterations: 20, + budgetUsd: 5, + model: DEFAULT_CODEX_MODEL, + nativeToolCapabilities: ['fs:read', 'fs:write', 'shell:exec'], + progressReporter: { + onIteration: vi.fn().mockResolvedValue(undefined), + onToolCall: vi.fn(), + onText: vi.fn(), + }, + logWriter: vi.fn(), + agentInput: { workItemId: 'card-1' }, + projectSecrets: { OPENAI_API_KEY: 'sk-test' }, + engineLogPath: undefined, + ...overrides, + }; +} + +function createMockChild({ + stdoutLines = [], + stderr = '', + exitCode = 0, + onBeforeClose, +}: { + stdoutLines?: string[]; + stderr?: string; + exitCode?: number; + onBeforeClose?: () => void; +}) { + const child = new EventEmitter() as EventEmitter & { + stdout: PassThrough; + stderr: PassThrough; + stdin: PassThrough; + }; + child.stdout = new PassThrough(); + child.stderr = new PassThrough(); + child.stdin = new PassThrough(); + + queueMicrotask(() => { + for (const line of stdoutLines) child.stdout.write(`${line}\n`); + if (stderr) child.stderr.write(stderr); + onBeforeClose?.(); + child.stdout.end(); + child.stderr.end(); + child.emit('close', exitCode); + }); + + return child; +} + +function stdoutFor(events: object[]): string[] { + return events.map((e) => JSON.stringify(e)); +} + +describe('CodexEngine — cost and token deltas', () => { + let workspaceDir: string; + + beforeEach(() => { + workspaceDir = mkdtempSync(join(tmpdir(), 'cascade-codex-cost-test-')); + mockStoreLlmCall.mockClear(); + mockMkdir.mockResolvedValue(undefined); + mockWriteFile.mockResolvedValue(undefined); + mockReadFile.mockRejectedValue(Object.assign(new Error('ENOENT'), { code: 'ENOENT' })); + mockWriteProjectCredential.mockResolvedValue(undefined); + }); + + afterEach(() => { + rmSync(workspaceDir, { recursive: true, force: true }); + Reflect.deleteProperty(process.env, 'OPENAI_API_KEY'); + }); + + // ─── Test 1: single-turn ────────────────────────────────────────────────── + + it('persists one row with delta tokens and computed cost on a single-turn run', async () => { + mockSpawn.mockImplementation((_cmd: string, args: string[]) => { + const outputPath = args[args.indexOf('-o') + 1]; + return createMockChild({ + stdoutLines: stdoutFor([ + { type: 'turn.started' }, + { + type: 'turn.completed', + usage: { input_tokens: 1000, output_tokens: 500 }, + }, + ]), + onBeforeClose: () => writeFileSync(outputPath, 'Finished.', 'utf-8'), + }); + }); + + const engine = new CodexEngine(); + const input = makeInput({ + repoDir: workspaceDir, + engineLogPath: join(workspaceDir, 'codex.log'), + runId: 'run-single', + }); + + const result = await engine.execute(input); + + expect(result.success).toBe(true); + expect(mockStoreLlmCall).toHaveBeenCalledTimes(1); + const [call] = mockStoreLlmCall.mock.calls[0]; + expect(call.inputTokens).toBe(1000); + expect(call.outputTokens).toBe(500); + expect(call.costUsd).toBeGreaterThan(0); + + const expected = calculateCost(`openai:${DEFAULT_CODEX_MODEL}`, { + inputTokens: 1000, + outputTokens: 500, + }); + expect(call.costUsd).toBeCloseTo(expected, 6); + expect(result.cost).toBeCloseTo(expected, 6); + }); + + // ─── Test 2: three-turn cumulative → delta ──────────────────────────────── + + it('converts cumulative session usage into per-turn deltas across multiple turns', async () => { + // Upstream codex emits cumulative session totals on each turn.completed. + // CASCADE must persist DELTAS, not the running totals. + // Cumulative: turn 1 → {1000,500}, turn 2 → {2500,1100}, turn 3 → {4000,1800} + // Deltas: turn 1 → {1000,500}, turn 2 → {1500,600}, turn 3 → {1500,700} + mockSpawn.mockImplementation((_cmd: string, args: string[]) => { + const outputPath = args[args.indexOf('-o') + 1]; + return createMockChild({ + stdoutLines: stdoutFor([ + { type: 'turn.started' }, + { type: 'turn.completed', usage: { input_tokens: 1000, output_tokens: 500 } }, + { type: 'turn.started' }, + { type: 'turn.completed', usage: { input_tokens: 2500, output_tokens: 1100 } }, + { type: 'turn.started' }, + { type: 'turn.completed', usage: { input_tokens: 4000, output_tokens: 1800 } }, + ]), + onBeforeClose: () => writeFileSync(outputPath, 'Done.', 'utf-8'), + }); + }); + + const engine = new CodexEngine(); + const result = await engine.execute( + makeInput({ + repoDir: workspaceDir, + engineLogPath: join(workspaceDir, 'codex.log'), + runId: 'run-multi', + }), + ); + + expect(result.success).toBe(true); + expect(mockStoreLlmCall).toHaveBeenCalledTimes(3); + + const rows = mockStoreLlmCall.mock.calls.map((c) => c[0]); + expect(rows[0].inputTokens).toBe(1000); + expect(rows[0].outputTokens).toBe(500); + expect(rows[1].inputTokens).toBe(1500); + expect(rows[1].outputTokens).toBe(600); + expect(rows[2].inputTokens).toBe(1500); + expect(rows[2].outputTokens).toBe(700); + + // result.cost = sum of per-turn deltas priced individually. + // (Equivalently, since pricing is linear, it should equal the cost of the + // final cumulative — this asserts the additive accumulator is correct.) + const expectedTotal = calculateCost(`openai:${DEFAULT_CODEX_MODEL}`, { + inputTokens: 4000, + outputTokens: 1800, + }); + expect(result.cost).toBeCloseTo(expectedTotal, 6); + }); + + // ─── Test 3: reasoning tokens ───────────────────────────────────────────── + + it('folds reasoning_output_tokens into output for cost math and stores them in the row', async () => { + mockSpawn.mockImplementation((_cmd: string, args: string[]) => { + const outputPath = args[args.indexOf('-o') + 1]; + return createMockChild({ + stdoutLines: stdoutFor([ + { type: 'turn.started' }, + { + type: 'turn.completed', + usage: { + input_tokens: 100, + output_tokens: 200, + reasoning_output_tokens: 800, + }, + }, + ]), + onBeforeClose: () => writeFileSync(outputPath, 'Done.', 'utf-8'), + }); + }); + + const engine = new CodexEngine(); + const result = await engine.execute( + makeInput({ + repoDir: workspaceDir, + engineLogPath: join(workspaceDir, 'codex.log'), + runId: 'run-reasoning', + }), + ); + + const [row] = mockStoreLlmCall.mock.calls[0]; + // Cost is computed against output = visible_output + reasoning = 1000. + const expected = calculateCost(`openai:${DEFAULT_CODEX_MODEL}`, { + inputTokens: 100, + outputTokens: 1000, + }); + expect(row.costUsd).toBeCloseTo(expected, 6); + expect(result.cost).toBeCloseTo(expected, 6); + // Reasoning is preserved in the stored response JSON for observability. + expect(row.response).toContain('reasoning'); + }); + + // ─── Test 4: cached input ───────────────────────────────────────────────── + + it('applies cached-input discount when cached_input_tokens are present', async () => { + mockSpawn.mockImplementation((_cmd: string, args: string[]) => { + const outputPath = args[args.indexOf('-o') + 1]; + return createMockChild({ + stdoutLines: stdoutFor([ + { type: 'turn.started' }, + { + type: 'turn.completed', + usage: { input_tokens: 1000, output_tokens: 0, cached_input_tokens: 800 }, + }, + ]), + onBeforeClose: () => writeFileSync(outputPath, 'Done.', 'utf-8'), + }); + }); + + const engine = new CodexEngine(); + const result = await engine.execute( + makeInput({ + repoDir: workspaceDir, + engineLogPath: join(workspaceDir, 'codex.log'), + runId: 'run-cached', + }), + ); + + const expected = calculateCost(`openai:${DEFAULT_CODEX_MODEL}`, { + inputTokens: 1000, + outputTokens: 0, + cachedInputTokens: 800, + }); + const noCacheCost = calculateCost(`openai:${DEFAULT_CODEX_MODEL}`, { + inputTokens: 1000, + outputTokens: 0, + }); + expect(result.cost).toBeCloseTo(expected, 6); + expect(expected).toBeLessThan(noCacheCost); + }); + + // ─── Test 5: out-of-order event (defensive clamp) ───────────────────────── + + it('clamps deltas to zero when cumulative usage goes backwards (out-of-order event)', async () => { + mockSpawn.mockImplementation((_cmd: string, args: string[]) => { + const outputPath = args[args.indexOf('-o') + 1]; + return createMockChild({ + stdoutLines: stdoutFor([ + { type: 'turn.started' }, + { type: 'turn.completed', usage: { input_tokens: 1000, output_tokens: 500 } }, + { type: 'turn.started' }, + // Lower cumulative — should be clamped, not produce negative tokens. + { type: 'turn.completed', usage: { input_tokens: 900, output_tokens: 400 } }, + ]), + onBeforeClose: () => writeFileSync(outputPath, 'Done.', 'utf-8'), + }); + }); + + const engine = new CodexEngine(); + await engine.execute( + makeInput({ + repoDir: workspaceDir, + engineLogPath: join(workspaceDir, 'codex.log'), + runId: 'run-out-of-order', + }), + ); + + const rows = mockStoreLlmCall.mock.calls.map((c) => c[0]); + expect(rows).toHaveLength(2); + expect(rows[1].inputTokens).toBe(0); + expect(rows[1].outputTokens).toBe(0); + }); + + // ─── Test 6: no usage on turn.completed ─────────────────────────────────── + + it('persists a row with undefined token fields when turn.completed carries no usage', async () => { + mockSpawn.mockImplementation((_cmd: string, args: string[]) => { + const outputPath = args[args.indexOf('-o') + 1]; + return createMockChild({ + stdoutLines: stdoutFor([{ type: 'turn.started' }, { type: 'turn.completed' }]), + onBeforeClose: () => writeFileSync(outputPath, 'Done.', 'utf-8'), + }); + }); + + const engine = new CodexEngine(); + const result = await engine.execute( + makeInput({ + repoDir: workspaceDir, + engineLogPath: join(workspaceDir, 'codex.log'), + runId: 'run-no-usage', + }), + ); + + expect(result.success).toBe(true); + expect(mockStoreLlmCall).toHaveBeenCalledTimes(1); + const [row] = mockStoreLlmCall.mock.calls[0]; + expect(row.inputTokens).toBeUndefined(); + expect(row.outputTokens).toBeUndefined(); + expect(row.costUsd).toBeUndefined(); + expect(result.cost).toBeUndefined(); + }); + + // ─── Test 7: smuggled cost_usd field is ignored (no-rug-sweep) ──────────── + + it('ignores any cost_usd field on the event — cost always derives from tokens', async () => { + // Upstream codex doesn't emit cost_usd, but if a future version (or a + // malformed event) smuggles one in, we MUST compute from tokens and + // ignore the smuggled value. Pins the "delete dead branches" decision. + mockSpawn.mockImplementation((_cmd: string, args: string[]) => { + const outputPath = args[args.indexOf('-o') + 1]; + return createMockChild({ + stdoutLines: stdoutFor([ + { type: 'turn.started' }, + { + type: 'turn.completed', + usage: { input_tokens: 100, output_tokens: 50 }, + total_cost_usd: 999, // ← smuggled value; must be ignored + cost_usd: 999, // ← also ignored + }, + ]), + onBeforeClose: () => writeFileSync(outputPath, 'Done.', 'utf-8'), + }); + }); + + const engine = new CodexEngine(); + const result = await engine.execute( + makeInput({ + repoDir: workspaceDir, + engineLogPath: join(workspaceDir, 'codex.log'), + runId: 'run-smuggled', + }), + ); + + const expected = calculateCost(`openai:${DEFAULT_CODEX_MODEL}`, { + inputTokens: 100, + outputTokens: 50, + }); + expect(result.cost).toBeCloseTo(expected, 6); + expect(result.cost).toBeLessThan(1); // certainly not 999 + }); + + // ─── Test 8: response.completed intermediate event does not persist ────── + + it('accumulates intermediate response.completed usage but persists only on turn.completed', async () => { + // Pins the existing invariant: exactly one row per turn, not per usage event. + mockSpawn.mockImplementation((_cmd: string, args: string[]) => { + const outputPath = args[args.indexOf('-o') + 1]; + return createMockChild({ + stdoutLines: stdoutFor([ + { type: 'turn.started' }, + // Intermediate usage — should NOT persist a row. + { + type: 'response.completed', + response: { usage: { input_tokens: 50, output_tokens: 20 } }, + }, + // Final usage (cumulative session total at end of turn). + { + type: 'turn.completed', + usage: { input_tokens: 100, output_tokens: 40 }, + }, + ]), + onBeforeClose: () => writeFileSync(outputPath, 'Done.', 'utf-8'), + }); + }); + + const engine = new CodexEngine(); + await engine.execute( + makeInput({ + repoDir: workspaceDir, + engineLogPath: join(workspaceDir, 'codex.log'), + runId: 'run-intermediate', + }), + ); + + expect(mockStoreLlmCall).toHaveBeenCalledTimes(1); + const [row] = mockStoreLlmCall.mock.calls[0]; + expect(row.inputTokens).toBe(100); + expect(row.outputTokens).toBe(40); + }); +}); diff --git a/tests/unit/backends/codex-jsonlParser.test.ts b/tests/unit/backends/codex-jsonlParser.test.ts index 8340998e6..53b1e37b9 100644 --- a/tests/unit/backends/codex-jsonlParser.test.ts +++ b/tests/unit/backends/codex-jsonlParser.test.ts @@ -297,7 +297,7 @@ describe('parseCodexEvent', () => { inputTokens: 100, outputTokens: 50, cachedTokens: 80, - costUsd: undefined, + reasoningTokens: undefined, }); expect(result.textParts).toEqual([]); expect(result.toolCall).toBeNull(); @@ -313,7 +313,20 @@ describe('parseCodexEvent', () => { inputTokens: 42, outputTokens: 7, cachedTokens: undefined, - costUsd: undefined, + reasoningTokens: undefined, + }); + }); + + it('parses a turn.completed event with reasoning_output_tokens', () => { + const result = parseCodexEvent({ + type: 'turn.completed', + usage: { input_tokens: 100, output_tokens: 200, reasoning_output_tokens: 800 }, + }); + expect(result.usage).toEqual({ + inputTokens: 100, + outputTokens: 200, + cachedTokens: undefined, + reasoningTokens: 800, }); }); @@ -403,17 +416,23 @@ describe('parseCodexEvent', () => { expect(result.toolCall).toBeNull(); }); - it('parses event with total_cost_usd', () => { + it('ignores any cost_usd / total_cost_usd field on the event (cost is computed CASCADE-side from tokens)', () => { + // codex exec --json does not emit cost fields upstream (openai/codex#17539). + // If a future version smuggles one in, the parser must NOT surface it — + // cost is computed by the engine from token deltas × pricing table. const result = parseCodexEvent({ usage: { input_tokens: 10, output_tokens: 5 }, total_cost_usd: 0.0042, + cost_usd: 0.0042, }); expect(result.usage).toEqual({ inputTokens: 10, outputTokens: 5, cachedTokens: undefined, - costUsd: 0.0042, + reasoningTokens: undefined, }); + // Sanity: no costUsd-shaped field leaks through. + expect(JSON.stringify(result.usage)).not.toContain('costUsd'); }); it('parses event with camelCase usage fields (inputTokens/outputTokens)', () => { @@ -424,7 +443,7 @@ describe('parseCodexEvent', () => { inputTokens: 200, outputTokens: 80, cachedTokens: undefined, - costUsd: undefined, + reasoningTokens: undefined, }); }); }); diff --git a/tests/unit/backends/codex.test.ts b/tests/unit/backends/codex.test.ts index df2db8ad2..676345d1d 100644 --- a/tests/unit/backends/codex.test.ts +++ b/tests/unit/backends/codex.test.ts @@ -335,7 +335,7 @@ describe('extractUsage', () => { inputTokens: 100, outputTokens: 50, cachedTokens: undefined, - costUsd: undefined, + reasoningTokens: undefined, }); }); @@ -348,11 +348,27 @@ describe('extractUsage', () => { inputTokens: 500, outputTokens: 30, cachedTokens: 450, - costUsd: undefined, + reasoningTokens: undefined, }); }); - it('still extracts top-level usage field (backward compat)', () => { + it('extracts reasoning_output_tokens from turn.completed usage', () => { + const result = extractUsage({ + type: 'turn.completed', + usage: { input_tokens: 100, output_tokens: 200, reasoning_output_tokens: 800 }, + }); + expect(result).toEqual({ + inputTokens: 100, + outputTokens: 200, + cachedTokens: undefined, + reasoningTokens: 800, + }); + }); + + it('ignores any cost_usd-shaped fields on the event (cost is computed CASCADE-side)', () => { + // Pins the "delete dead branches" decision — codex exec --json doesn't + // emit cost upstream (openai/codex#17539); the engine computes cost from + // token deltas via calculateCost, so cost fields must NOT leak through. const result = extractUsage({ usage: { input_tokens: 10, output_tokens: 5 }, total_cost_usd: 0.01, @@ -361,7 +377,7 @@ describe('extractUsage', () => { inputTokens: 10, outputTokens: 5, cachedTokens: undefined, - costUsd: 0.01, + reasoningTokens: undefined, }); }); @@ -495,15 +511,11 @@ describe('CodexEngine', () => { tool_input: { command: 'cascade-tools session finish --comment done' }, }), // Intermediate usage event — accumulates into turn, does NOT persist a row - JSON.stringify({ - usage: { input_tokens: 11, output_tokens: 7 }, - total_cost_usd: 0.42, - }), + JSON.stringify({ usage: { input_tokens: 11, output_tokens: 7 } }), // turn.completed finalizes and persists the accumulated turn data JSON.stringify({ type: 'turn.completed', usage: { input_tokens: 11, output_tokens: 7 }, - total_cost_usd: 0.42, }), ], onBeforeClose: () => { @@ -528,7 +540,10 @@ describe('CodexEngine', () => { expect(result.success).toBe(true); expect(result.output).toContain('Finished work.'); expect(result.prUrl).toBe('https://github.com/owner/repo/pull/123'); - expect(result.cost).toBe(0.42); + // Cost is computed CASCADE-side from tokens × pricing table; the + // per-1M-token rate keeps the absolute number tiny for 11 in / 7 out. + expect(result.cost).toBeGreaterThan(0); + expect(result.cost).toBeLessThan(0.01); expect(input.progressReporter.onIteration).toHaveBeenCalled(); expect(input.progressReporter.onToolCall).toHaveBeenCalledWith('Bash', { command: 'cascade-tools session finish --comment done', @@ -1011,14 +1026,15 @@ describe('CodexEngine', () => { expect(result.success).toBe(true); // Exactly two rows — one per completed turn expect(mockStoreLlmCall).toHaveBeenCalledTimes(2); - // Stable, sequential callNumber values + // Codex emits CUMULATIVE session usage; rows must store per-turn DELTAS. + // Feeding cumulative {50,20} then {80,30} → deltas {50,20} and {30,10}. expect(mockStoreLlmCall).toHaveBeenNthCalledWith( 1, expect.objectContaining({ callNumber: 1, inputTokens: 50, outputTokens: 20 }), ); expect(mockStoreLlmCall).toHaveBeenNthCalledWith( 2, - expect.objectContaining({ callNumber: 2, inputTokens: 80, outputTokens: 30 }), + expect.objectContaining({ callNumber: 2, inputTokens: 30, outputTokens: 10 }), ); }); diff --git a/tests/unit/utils/llmMetrics-codex.test.ts b/tests/unit/utils/llmMetrics-codex.test.ts new file mode 100644 index 000000000..674c63624 --- /dev/null +++ b/tests/unit/utils/llmMetrics-codex.test.ts @@ -0,0 +1,76 @@ +import { describe, expect, it } from 'vitest'; + +import { CODEX_MODEL_IDS } from '../../../src/backends/codex/models.js'; +import { calculateCost } from '../../../src/utils/llmMetrics.js'; + +/** + * Pricing-table coverage for the Codex engine. + * + * If a new Codex model is added to src/backends/codex/models.ts without a + * matching entry in src/utils/llmMetrics.ts MODEL_PRICING, this test fails + * loudly with the missing model name — preventing the silent "cost = 0" + * regression that plagued every Codex run before this fix. + */ +describe('codex pricing coverage', () => { + for (const modelId of CODEX_MODEL_IDS) { + it(`has a non-zero pricing row for "${modelId}"`, () => { + // 1M input + 1M output tokens — any priced model must return > 0. + const cost = calculateCost(`openai:${modelId}`, { + inputTokens: 1_000_000, + outputTokens: 1_000_000, + }); + expect( + cost, + `Missing pricing entry for openai:${modelId} in src/utils/llmMetrics.ts. ` + + `Every Codex model in CODEX_MODEL_IDS must have a pricing row.`, + ).toBeGreaterThan(0); + }); + } + + it('applies cached-input discount for codex-mini-latest (25% cached, not 10%)', () => { + // codex-mini-latest published rate is cachedInput=$0.375/1M (25% of $1.50 input) + // — verify the row preserves this non-standard ratio. + const noCache = calculateCost('openai:codex-mini-latest', { + inputTokens: 1_000_000, + outputTokens: 0, + }); + const fullCache = calculateCost('openai:codex-mini-latest', { + inputTokens: 1_000_000, + outputTokens: 0, + cachedInputTokens: 1_000_000, + }); + // Discount = (input - cachedInput) * 1M tokens. + // If preserved at 25% ratio, discount = (1.50 - 0.375) = $1.125, leaving $0.375 cost. + expect(fullCache).toBeCloseTo(0.375, 4); + expect(noCache).toBeCloseTo(1.5, 4); + }); + + it('proxies gpt-5.3-codex-spark to gpt-5.3-codex rates', () => { + // spark has no metered API listing — we proxy to gpt-5.3-codex rates so + // ChatGPT Pro users still get an API-equivalent cost figure. If a future + // commit decouples them, this test alerts the maintainer. + const codex = calculateCost('openai:gpt-5.3-codex', { + inputTokens: 1_000_000, + outputTokens: 1_000_000, + }); + const spark = calculateCost('openai:gpt-5.3-codex-spark', { + inputTokens: 1_000_000, + outputTokens: 1_000_000, + }); + expect(spark).toBeCloseTo(codex, 6); + }); + + it('gpt-5.5 has a cachedInput rate (not undefined)', () => { + // Regression net for the existing gpt-5.5 row that shipped without cachedInput. + const noCache = calculateCost('openai:gpt-5.5', { + inputTokens: 1_000_000, + outputTokens: 0, + }); + const fullCache = calculateCost('openai:gpt-5.5', { + inputTokens: 1_000_000, + outputTokens: 0, + cachedInputTokens: 1_000_000, + }); + expect(fullCache).toBeLessThan(noCache); + }); +}); From c483f4403352fa734347fe19253c059618778e12 Mon Sep 17 00:00:00 2001 From: Cascade Bot Date: Mon, 11 May 2026 15:55:53 +0000 Subject: [PATCH 02/12] fix(codex): treat reasoning_output_tokens as output subset, not extra counter OpenAI/Codex usage reports reasoning_output_tokens as a breakdown of output_tokens (not an additional counter). A turn with output_tokens:47 and reasoning_output_tokens:41 has 47 total output tokens, 41 of which are reasoning. The previous code computed billableOutput = outputTokens + reasoningTokens, inflating a turn with {output:200, reasoning:800} to 1000 billed output tokens instead of the correct 200. Fix: use delta.outputTokens directly for billing and outputForRow. Store delta.reasoningTokens in the response payload for observability only. Update Test 3 in codex-cost.test.ts to use realistic values where reasoning_output_tokens is a subset of output_tokens and assert that outputTokens (not outputTokens+reasoning) is billed. Co-Authored-By: Claude Sonnet 4.6 --- src/backends/codex/index.ts | 25 ++++++++++++++----------- tests/unit/backends/codex-cost.test.ts | 16 +++++++++++----- 2 files changed, 25 insertions(+), 16 deletions(-) diff --git a/src/backends/codex/index.ts b/src/backends/codex/index.ts index 23fe4d860..e3d2b5a21 100644 --- a/src/backends/codex/index.ts +++ b/src/backends/codex/index.ts @@ -174,8 +174,13 @@ function computeTurnDelta(context: CodexLineContext, usage: UsageSummary): Codex * Cost = calculateCost('openai:', delta) — Codex never emits cost_usd * upstream (openai/codex#17539), so cost is always computed CASCADE-side from * the per-turn token DELTA × the pricing table at src/utils/llmMetrics.ts. - * Reasoning output tokens are folded into output for billing (OpenAI bills them - * at the output rate) while being preserved separately in the stored payload. + * + * Reasoning tokens: OpenAI/Codex treats `reasoning_output_tokens` as a + * breakdown/subset of `output_tokens` — NOT an additional counter. A turn + * with `output_tokens: 47, reasoning_output_tokens: 41` has 47 total output + * tokens (41 of which are reasoning). We store the reasoning count separately + * in the response payload for observability, but billing always uses + * `delta.outputTokens` as-is (it already includes reasoning). */ function persistTurnLlmCall(context: CodexLineContext): void { const acc = context.currentTurn; @@ -188,15 +193,13 @@ function persistTurnLlmCall(context: CodexLineContext): void { if (usage) { delta = computeTurnDelta(context, usage); - // Output billing includes reasoning tokens — OpenAI bills reasoning at the - // output rate. Fold them in for cost math; the stored row's `outputTokens` - // reflects the same combined figure for dashboard accuracy. - const billableOutput = delta.outputTokens + delta.reasoningTokens; - outputForRow = delta.outputTokens === 0 && billableOutput === 0 ? 0 : billableOutput; + // Use delta.outputTokens directly — reasoning_output_tokens is already + // counted within output_tokens (it is a breakdown, not an extra counter). + outputForRow = delta.outputTokens; const turnCost = calculateCost(`openai:${context.model}`, { inputTokens: delta.inputTokens, - outputTokens: billableOutput, - totalTokens: delta.inputTokens + billableOutput, + outputTokens: delta.outputTokens, + totalTokens: delta.inputTokens + delta.outputTokens, cachedInputTokens: delta.cachedTokens, }); if (turnCost > 0) { @@ -211,8 +214,8 @@ function persistTurnLlmCall(context: CodexLineContext): void { tools: acc.toolNames.length > 0 ? acc.toolNames : undefined, usage: usage ?? undefined, delta: delta ?? undefined, - // Surfaced explicitly so a future maintainer triaging cost drift can - // see at a glance that reasoning was folded into output. + // Reasoning breakdown preserved for observability; it is already counted + // within outputTokens above and must NOT be added to it for billing. reasoning: delta && delta.reasoningTokens > 0 ? delta.reasoningTokens : undefined, }); diff --git a/tests/unit/backends/codex-cost.test.ts b/tests/unit/backends/codex-cost.test.ts index 672b96a56..623e0f9de 100644 --- a/tests/unit/backends/codex-cost.test.ts +++ b/tests/unit/backends/codex-cost.test.ts @@ -243,7 +243,12 @@ describe('CodexEngine — cost and token deltas', () => { // ─── Test 3: reasoning tokens ───────────────────────────────────────────── - it('folds reasoning_output_tokens into output for cost math and stores them in the row', async () => { + it('uses output_tokens as-is for billing — reasoning_output_tokens is a subset, not extra', async () => { + // OpenAI/Codex: reasoning_output_tokens is a BREAKDOWN of output_tokens, + // not an additional counter. A turn with output_tokens: 1000 and + // reasoning_output_tokens: 800 has 1000 total output tokens (800 of which + // are internal reasoning). Billing uses output_tokens (1000) directly; + // adding reasoning on top would over-count to 1800. mockSpawn.mockImplementation((_cmd: string, args: string[]) => { const outputPath = args[args.indexOf('-o') + 1]; return createMockChild({ @@ -253,8 +258,8 @@ describe('CodexEngine — cost and token deltas', () => { type: 'turn.completed', usage: { input_tokens: 100, - output_tokens: 200, - reasoning_output_tokens: 800, + output_tokens: 1000, + reasoning_output_tokens: 800, // subset of the 1000 output tokens }, }, ]), @@ -272,14 +277,15 @@ describe('CodexEngine — cost and token deltas', () => { ); const [row] = mockStoreLlmCall.mock.calls[0]; - // Cost is computed against output = visible_output + reasoning = 1000. + // Cost uses output_tokens (1000) directly — NOT output + reasoning (1800). const expected = calculateCost(`openai:${DEFAULT_CODEX_MODEL}`, { inputTokens: 100, outputTokens: 1000, }); + expect(row.outputTokens).toBe(1000); // not 1800 expect(row.costUsd).toBeCloseTo(expected, 6); expect(result.cost).toBeCloseTo(expected, 6); - // Reasoning is preserved in the stored response JSON for observability. + // Reasoning breakdown is preserved in the stored response JSON for observability. expect(row.response).toContain('reasoning'); }); From b31dcddcf653a58d386d0344322f0eb3d4ddf832 Mon Sep 17 00:00:00 2001 From: Cascade Bot Date: Mon, 11 May 2026 18:07:45 +0000 Subject: [PATCH 03/12] fix: address feedback --- tests/unit/backends/codex-cost.test.ts | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/unit/backends/codex-cost.test.ts b/tests/unit/backends/codex-cost.test.ts index 623e0f9de..f146cf3d9 100644 --- a/tests/unit/backends/codex-cost.test.ts +++ b/tests/unit/backends/codex-cost.test.ts @@ -286,7 +286,10 @@ describe('CodexEngine — cost and token deltas', () => { expect(row.costUsd).toBeCloseTo(expected, 6); expect(result.cost).toBeCloseTo(expected, 6); // Reasoning breakdown is preserved in the stored response JSON for observability. - expect(row.response).toContain('reasoning'); + const response = JSON.parse(row.response); + expect(response.reasoning).toBe(800); + expect(response.delta.outputTokens).toBe(1000); + expect(response.delta.reasoningTokens).toBe(800); }); // ─── Test 4: cached input ───────────────────────────────────────────────── From 4de56aa05ca3e21298f98c5c4430562fa22e2a35 Mon Sep 17 00:00:00 2001 From: Cascade Bot Date: Mon, 11 May 2026 18:26:49 +0000 Subject: [PATCH 04/12] fix(codex): return all-zero delta on regression; constrain resolver to priced models MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two accounting paths could silently persist wrong or zero Codex cost: 1. computeTurnDelta returned the per-field Math.max(0, curr-prev) delta even when the backwards-cumulative guard tripped. If inputTokens regressed while outputTokens increased, the positive outputTokens delta was still persisted — and double-counted on the next valid event because the high-water mark was not advanced. Fix: return an all-zero delta when any counter regresses, discarding the entire event rather than a mix of zero and positive fields. 2. resolveCodexModel() accepted any openai:* string and any gpt-*codex* pattern, but MODEL_PRICING only covers CODEX_MODEL_IDS. A project configured with e.g. openai:gpt-5-codex would run through the Codex engine and then persist cost=0 because calculateCost('openai:gpt-5-codex', ...) returns 0. Fix: constrain the resolver to CODEX_MODEL_IDS (with optional openai: prefix); remove the wildcard gpt-*codex* branch entirely. Tests added: - codex-cost.test.ts Test 5b: mixed-regression scenario (inputTokens down, outputTokens up) verifies all-zero delta and no double-counting on the following valid event. - codex.test.ts: two new resolveCodexModel tests — openai:gpt-5-codex and gpt-5-turbo-codex both throw rather than silently accepting unpriced models. Co-Authored-By: Claude Sonnet 4.6 --- src/backends/codex/index.ts | 21 +++++++-- tests/unit/backends/codex-cost.test.ts | 62 ++++++++++++++++++++++++++ tests/unit/backends/codex.test.ts | 17 +++++++ 3 files changed, 96 insertions(+), 4 deletions(-) diff --git a/src/backends/codex/index.ts b/src/backends/codex/index.ts index e3d2b5a21..eb31798a8 100644 --- a/src/backends/codex/index.ts +++ b/src/backends/codex/index.ts @@ -159,8 +159,13 @@ function computeTurnDelta(context: CodexLineContext, usage: UsageSummary): Codex 'Codex turn.completed reported lower cumulative usage than previous turn — clamping delta to 0', { prev, curr }, ); - // Don't advance backwards. Keep the high-water mark. - return delta; + // Return all-zero delta — discard the entire backwards event rather than + // persisting partial positive fields (e.g. outputTokens increased while + // inputTokens went backwards). The high-water mark stays at `prev` so + // the next valid cumulative is correctly subtracted from the last known + // good baseline; without this, any positive per-field delta from the + // discarded event would be double-counted on the following valid event. + return { inputTokens: 0, outputTokens: 0, cachedTokens: 0, reasoningTokens: 0 }; } context.cumulativeUsage = curr; @@ -337,8 +342,16 @@ async function processStdoutLine(context: CodexLineContext, line: string): Promi function resolveCodexModel(cascadeModel: string): string { if (CODEX_MODEL_IDS.includes(cascadeModel)) return cascadeModel; - if (cascadeModel.startsWith('openai:')) return cascadeModel.replace('openai:', ''); - if (cascadeModel.startsWith('gpt-') && cascadeModel.includes('codex')) return cascadeModel; + // Accept openai: prefix as a convenience shorthand (e.g. "openai:gpt-5.4"). + // Only resolve to a known Codex model ID — the old gpt-*codex* wildcard was + // removed because unrecognised model IDs have no pricing row in MODEL_PRICING + // and would silently persist zero cost. Add new models to CODEX_MODEL_IDS in + // src/backends/codex/models.ts AND add a pricing row to MODEL_PRICING in + // src/utils/llmMetrics.ts before accepting them here. + if (cascadeModel.startsWith('openai:')) { + const bareId = cascadeModel.replace('openai:', ''); + if (CODEX_MODEL_IDS.includes(bareId)) return bareId; + } throw new Error( `Model "${cascadeModel}" is not compatible with the Codex engine. Configure a Codex-compatible model (e.g. "${DEFAULT_CODEX_MODEL}") or switch to a different engine.`, diff --git a/tests/unit/backends/codex-cost.test.ts b/tests/unit/backends/codex-cost.test.ts index f146cf3d9..ba8bc2dbf 100644 --- a/tests/unit/backends/codex-cost.test.ts +++ b/tests/unit/backends/codex-cost.test.ts @@ -363,6 +363,68 @@ describe('CodexEngine — cost and token deltas', () => { expect(rows[1].outputTokens).toBe(0); }); + // ─── Test 5b: mixed regression — one counter up, one down ───────────────── + + it('returns all-zero delta when any single counter regresses — prevents double-counting', async () => { + // Regression test for the bug where computeTurnDelta returned per-field + // Math.max(0, curr-prev) even when the backwards guard tripped. If inputTokens + // went backwards while outputTokens increased, the positive outputTokens delta + // was still persisted AND double-counted on the next valid event (because the + // high-water mark was not advanced for the rejected event). + // + // Scenario: T1 valid {1000,500} → T2 mixed {900,600} → T3 valid {2000,1200} + // Expected: + // T1 delta: {1000, 500} (first event, HWM=0→{1000,500}) + // T2 delta: {0, 0} (inputTokens regressed — whole event discarded) + // T3 delta: {1000, 700} (2000-1000, 1200-500 from unchanged HWM) + // Bug produced T2 delta {0, 100} which was then double-counted in T3. + mockSpawn.mockImplementation((_cmd: string, args: string[]) => { + const outputPath = args[args.indexOf('-o') + 1]; + return createMockChild({ + stdoutLines: stdoutFor([ + { type: 'turn.started' }, + // T1: valid baseline + { type: 'turn.completed', usage: { input_tokens: 1000, output_tokens: 500 } }, + { type: 'turn.started' }, + // T2: inputTokens goes backwards (1000→900), outputTokens increases (500→600) + { type: 'turn.completed', usage: { input_tokens: 900, output_tokens: 600 } }, + { type: 'turn.started' }, + // T3: both counters advance from valid T1 baseline (HWM was not advanced for T2) + { type: 'turn.completed', usage: { input_tokens: 2000, output_tokens: 1200 } }, + ]), + onBeforeClose: () => writeFileSync(outputPath, 'Done.', 'utf-8'), + }); + }); + + const engine = new CodexEngine(); + await engine.execute( + makeInput({ + repoDir: workspaceDir, + engineLogPath: join(workspaceDir, 'codex.log'), + runId: 'run-mixed-regression', + }), + ); + + const rows = mockStoreLlmCall.mock.calls.map((c) => c[0]); + expect(rows).toHaveLength(3); + + // T1: valid delta + expect(rows[0].inputTokens).toBe(1000); + expect(rows[0].outputTokens).toBe(500); + + // T2: all-zero delta — whole event discarded because inputTokens regressed + // (was {0, 100} before the fix — outputTokens was charged even though the + // event was invalid, and would be double-counted in T3) + expect(rows[1].inputTokens).toBe(0); + expect(rows[1].outputTokens).toBe(0); // NOT 100 + expect(rows[1].costUsd).toBeUndefined(); + + // T3: delta computed from the unchanged HWM {1000, 500} + // → {2000-1000, 1200-500} = {1000, 700} + expect(rows[2].inputTokens).toBe(1000); + expect(rows[2].outputTokens).toBe(700); + }); + // ─── Test 6: no usage on turn.completed ─────────────────────────────────── it('persists a row with undefined token fields when turn.completed carries no usage', async () => { diff --git a/tests/unit/backends/codex.test.ts b/tests/unit/backends/codex.test.ts index 676345d1d..7caf72f44 100644 --- a/tests/unit/backends/codex.test.ts +++ b/tests/unit/backends/codex.test.ts @@ -151,6 +151,23 @@ describe('resolveCodexModel', () => { 'not compatible with the Codex engine', ); }); + + it('throws for openai: prefix with an unlisted model (no pricing row)', () => { + // Pins the constraint that openai:* only resolves when the bare ID is in + // CODEX_MODEL_IDS — prevents silent zero-cost runs on models with no + // pricing entry in MODEL_PRICING. + expect(() => resolveCodexModel('openai:gpt-5-codex')).toThrow( + 'not compatible with the Codex engine', + ); + }); + + it('throws for gpt-*codex* pattern not in CODEX_MODEL_IDS', () => { + // The old wildcard gpt-*codex* branch was removed; unrecognised model IDs + // must throw rather than silently pass through with zero cost. + expect(() => resolveCodexModel('gpt-5-turbo-codex')).toThrow( + 'not compatible with the Codex engine', + ); + }); }); describe('extractErrorMessage', () => { From e21c3393e1a93017fc7f36e83f01ca58a3071763 Mon Sep 17 00:00:00 2001 From: Cascade Bot Date: Mon, 11 May 2026 18:39:53 +0000 Subject: [PATCH 05/12] fix(codex): add gpt-5.4-mini to CODEX_MODELS and MODEL_PRICING MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Restores support for openai:gpt-5.4-mini which was previously accepted via the old openai:* wildcard in resolveCodexModel but became inaccessible after the resolver was tightened to only allow models in CODEX_MODEL_IDS. Adds gpt-5.4-mini to CODEX_MODELS (models.ts) and a pricing row at input=$0.75, cachedInput=$0.075, output=$4.50 per 1M (from developers.openai.com/api/docs/models/gpt-5.4-mini, 2026-05-11). The existing llmMetrics-codex.test.ts coverage test automatically pins the new row — failing loudly if pricing is ever dropped. Co-Authored-By: Claude Sonnet 4.6 --- src/backends/codex/models.ts | 1 + src/utils/llmMetrics.ts | 2 ++ 2 files changed, 3 insertions(+) diff --git a/src/backends/codex/models.ts b/src/backends/codex/models.ts index 2b8648f67..7d3d96124 100644 --- a/src/backends/codex/models.ts +++ b/src/backends/codex/models.ts @@ -1,6 +1,7 @@ export const CODEX_MODELS = [ { value: 'gpt-5.5', label: 'GPT-5.5' }, { value: 'gpt-5.4', label: 'GPT-5.4' }, + { value: 'gpt-5.4-mini', label: 'GPT-5.4 mini' }, { value: 'gpt-5.3-codex', label: 'GPT-5.3 Codex' }, { value: 'gpt-5.3-codex-spark', label: 'GPT-5.3 Codex Spark' }, { value: 'codex-mini-latest', label: 'Codex Mini (latest)' }, diff --git a/src/utils/llmMetrics.ts b/src/utils/llmMetrics.ts index 03924dae7..b58e9b25e 100644 --- a/src/utils/llmMetrics.ts +++ b/src/utils/llmMetrics.ts @@ -28,6 +28,8 @@ const MODEL_PRICING: Record Date: Mon, 11 May 2026 19:19:18 +0000 Subject: [PATCH 06/12] fix(scm): support body-file for update PR comment --- src/gadgets/github/definitions.ts | 7 ++ .../backends/shared-nativeToolPrompts.test.ts | 9 ++- tests/unit/backends/toolManifests.test.ts | 10 +++ tests/unit/cli/file-input-flags.test.ts | 72 +++++++++++++++++++ tests/unit/gadgets/github/definitions.test.ts | 9 +++ 5 files changed, 106 insertions(+), 1 deletion(-) diff --git a/src/gadgets/github/definitions.ts b/src/gadgets/github/definitions.ts index dc81f2eeb..43407d035 100644 --- a/src/gadgets/github/definitions.ts +++ b/src/gadgets/github/definitions.ts @@ -545,6 +545,13 @@ export const updatePRCommentDef: ToolDefinition = { ], cli: { autoResolved: ownerRepoAutoResolved, + fileInputAlternatives: [ + { + paramName: 'body', + fileFlag: 'body-file', + description: 'Read comment body from file (use - for stdin)', + }, + ], }, }; diff --git a/tests/unit/backends/shared-nativeToolPrompts.test.ts b/tests/unit/backends/shared-nativeToolPrompts.test.ts index e788c7ad5..fb0747b1b 100644 --- a/tests/unit/backends/shared-nativeToolPrompts.test.ts +++ b/tests/unit/backends/shared-nativeToolPrompts.test.ts @@ -25,7 +25,7 @@ import { buildTaskPrompt, buildToolGuidance, } from '../../../src/backends/shared/nativeToolPrompts.js'; -import { createPRReviewDef } from '../../../src/gadgets/github/definitions.js'; +import { createPRReviewDef, updatePRCommentDef } from '../../../src/gadgets/github/definitions.js'; import { readWorkItemDef } from '../../../src/gadgets/pm/definitions.js'; import { generateToolManifest } from '../../../src/gadgets/shared/manifestGenerator.js'; @@ -188,6 +188,13 @@ describe('buildToolGuidance', () => { expect(result).toContain('[--body-file ]'); expect(result).toContain('Read review body from file (use - for stdin)'); }); + + it('renders UpdatePRComment body-file guidance from definition metadata', () => { + const result = buildToolGuidance([generateToolManifest(updatePRCommentDef)]); + + expect(result).toContain('[--body-file ]'); + expect(result).toContain('Read comment body from file (use - for stdin)'); + }); }); describe('formatParam — optional string param', () => { diff --git a/tests/unit/backends/toolManifests.test.ts b/tests/unit/backends/toolManifests.test.ts index 0dae5b39d..83079e232 100644 --- a/tests/unit/backends/toolManifests.test.ts +++ b/tests/unit/backends/toolManifests.test.ts @@ -148,4 +148,14 @@ describe('getToolManifests', () => { head: { type: 'string', required: true }, }); }); + + it('UpdatePRComment has body-file support', () => { + const manifests = getToolManifests(); + const updatePRComment = manifests.find((m) => m.name === 'UpdatePRComment'); + expect(updatePRComment).toBeDefined(); + expect(updatePRComment?.parameters).toMatchObject({ + body: { type: 'string', required: true }, + 'body-file': { type: 'string' }, + }); + }); }); diff --git a/tests/unit/cli/file-input-flags.test.ts b/tests/unit/cli/file-input-flags.test.ts index 2df3b30d8..ec1110532 100644 --- a/tests/unit/cli/file-input-flags.test.ts +++ b/tests/unit/cli/file-input-flags.test.ts @@ -47,6 +47,9 @@ vi.mock('../../../src/gadgets/github/core/createPRReview.js', () => ({ vi.mock('../../../src/gadgets/github/core/postPRComment.js', () => ({ postPRComment: vi.fn().mockResolvedValue({ id: 123 }), })); +vi.mock('../../../src/gadgets/github/core/updatePRComment.js', () => ({ + updatePRComment: vi.fn().mockResolvedValue({ id: 456 }), +})); import CreateWorkItem from '../../../src/cli/pm/create-work-item.js'; import PostComment from '../../../src/cli/pm/post-comment.js'; @@ -55,9 +58,11 @@ import UpdateWorkItem from '../../../src/cli/pm/update-work-item.js'; import CreatePR from '../../../src/cli/scm/create-pr.js'; import CreatePRReview from '../../../src/cli/scm/create-pr-review.js'; import PostPRComment from '../../../src/cli/scm/post-pr-comment.js'; +import UpdatePRComment from '../../../src/cli/scm/update-pr-comment.js'; import { createPR } from '../../../src/gadgets/github/core/createPR.js'; import { createPRReview } from '../../../src/gadgets/github/core/createPRReview.js'; import { postPRComment } from '../../../src/gadgets/github/core/postPRComment.js'; +import { updatePRComment } from '../../../src/gadgets/github/core/updatePRComment.js'; import { createWorkItem } from '../../../src/gadgets/pm/core/createWorkItem.js'; import { postComment } from '../../../src/gadgets/pm/core/postComment.js'; import { reportFriction } from '../../../src/gadgets/pm/core/reportFriction.js'; @@ -69,6 +74,7 @@ let tmpDir: string; const mockConfig = { runHook: vi.fn().mockResolvedValue({ successes: [], failures: [] }) }; beforeEach(() => { + vi.clearAllMocks(); tmpDir = mkdtempSync(join(tmpdir(), 'cascade-cli-test-')); }); @@ -495,3 +501,69 @@ describe('PostPRComment --body-file', () => { expect(output.error.flag).toBe('body'); }); }); + +describe('UpdatePRComment --body-file', () => { + const originalEnv = process.env; + + beforeEach(() => { + process.env = { + ...originalEnv, + CASCADE_REPO_OWNER: 'owner', + CASCADE_REPO_NAME: 'repo', + }; + }); + + afterEach(() => { + process.env = originalEnv; + }); + + it('reads comment body from file', async () => { + const filePath = writeTempFile('comment.md', 'Updated PR comment from file'); + const cmd = new UpdatePRComment( + ['--commentId', '456', '--body-file', filePath], + mockConfig as never, + ); + await cmd.run(); + + expect(updatePRComment).toHaveBeenCalledWith( + 'owner', + 'repo', + 456, + 'Updated PR comment from file', + ); + }); + + it('prefers --body-file over --body', async () => { + const filePath = writeTempFile('comment.md', 'from file'); + const cmd = new UpdatePRComment( + ['--commentId', '456', '--body', 'from flag', '--body-file', filePath], + mockConfig as never, + ); + await cmd.run(); + + expect(updatePRComment).toHaveBeenCalledWith('owner', 'repo', 456, 'from file'); + }); + + it('still works with inline --body flag', async () => { + const cmd = new UpdatePRComment( + ['--commentId', '456', '--body', 'inline body'], + mockConfig as never, + ); + await cmd.run(); + + expect(updatePRComment).toHaveBeenCalledWith('owner', 'repo', 456, 'inline body'); + }); + + it('errors when neither --body nor --body-file is provided (spec 014 envelope)', async () => { + const cmd = new UpdatePRComment(['--commentId', '456'], mockConfig as never); + const logSpy = vi.spyOn(cmd, 'log'); + await expect(cmd.run()).rejects.toThrow(); + const output = JSON.parse(logSpy.mock.calls[0][0] as string) as { + success: boolean; + error: { type: string; flag?: string }; + }; + expect(output.success).toBe(false); + expect(output.error.type).toBe('missing-required'); + expect(output.error.flag).toBe('body'); + }); +}); diff --git a/tests/unit/gadgets/github/definitions.test.ts b/tests/unit/gadgets/github/definitions.test.ts index 2d1cce598..aef7f3df8 100644 --- a/tests/unit/gadgets/github/definitions.test.ts +++ b/tests/unit/gadgets/github/definitions.test.ts @@ -266,6 +266,15 @@ describe('GitHub SCM gadget definitions', () => { it('does not have prNumber (comment ID is enough)', () => { expect(updatePRCommentDef.parameters.prNumber).toBeUndefined(); }); + + it('has body file input alternative', () => { + const bodyAlt = updatePRCommentDef.cli?.fileInputAlternatives?.find( + (a) => a.paramName === 'body', + ); + expect(bodyAlt).toBeDefined(); + expect(bodyAlt?.fileFlag).toBe('body-file'); + expect(bodyAlt?.description).toBeTruthy(); + }); }); }); From 852964271bde89225603b745348b737368b9548a Mon Sep 17 00:00:00 2001 From: Zbigniew Sobiecki Date: Mon, 11 May 2026 19:49:38 +0000 Subject: [PATCH 07/12] fix(triggers): auto-review fires for stacked PRs + survives late check-suite completion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two distinct production bugs surfaced on project ucho (2026-05-11) where the review agent failed to auto-dispatch after a PR's CI passed, both requiring manual review requests to unblock. Bug 1 (PR #394, MNG-683): check-suite-success deferred-on-incomplete returned a plain skip and relied on GitHub firing another check_suite event when the final suite finished. When the Actions API lagged webhook delivery, the API still showed the final suite as in_progress at query time AND no further webhook arrived — review never fired. Fix: schedule a deferredRecheck (30s, coalesce-keyed on owner/repo/PR/ head-SHA) so the trigger re-evaluates against fresh API state. Bug 2 (PR #393, MNG-691): stacked PR targeting feature/MNG-690-… was rejected by the base-branch gate even though cascade authored it. The gate exists to filter human or third-party-bot drive-bys, which the upstream persona check already handles. Fix: skip the base-branch gate for cascade-authored PRs; non-cascade authors still get the rejection. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/triggers/github/check-suite-decision.ts | 12 +++- src/triggers/github/check-suite-success.ts | 16 ++++- .../unit/triggers/check-suite-success.test.ts | 70 ++++++++++++++++--- .../github/check-suite-decision.test.ts | 55 ++++++++++++++- 4 files changed, 140 insertions(+), 13 deletions(-) diff --git a/src/triggers/github/check-suite-decision.ts b/src/triggers/github/check-suite-decision.ts index 163690e5b..a20939aba 100644 --- a/src/triggers/github/check-suite-decision.ts +++ b/src/triggers/github/check-suite-decision.ts @@ -91,7 +91,17 @@ export function decideCheckSuiteGates( : cascadePersonaDecision(prAuthorLogin, personaIdentities, prNumber); if (authorSkip) return authorSkip; - if (prBaseRef !== project.baseBranch) { + // Bug 2 (2026-05-11 prod incident on ucho PR #393, MNG-691): + // the base-branch gate was rejecting stacked PRs (MNG-691 → MNG-690 + // feature branch) even though the PR was opened by the cascade + // implementer persona. The gate exists to filter human-authored / + // third-party-bot drive-bys against random branches in the repo; that + // case is already covered by the upstream persona check. Cascade- + // authored PRs targeting any base branch are legitimate work product. + const authorIsCascade = personaIdentities + ? isCascadeBot(prAuthorLogin, personaIdentities) + : false; + if (!authorIsCascade && prBaseRef !== project.baseBranch) { return { action: 'skip', message: `PR #${prNumber} targets ${prBaseRef}, not project base branch ${project.baseBranch}`, diff --git a/src/triggers/github/check-suite-success.ts b/src/triggers/github/check-suite-success.ts index 946519de9..ae2ac50d2 100644 --- a/src/triggers/github/check-suite-success.ts +++ b/src/triggers/github/check-suite-success.ts @@ -2,6 +2,7 @@ import { githubClient } from '../../github/client.js'; import type { TriggerContext, TriggerHandler, TriggerResult } from '../../types/index.js'; import { logger } from '../../utils/logging.js'; import { parseRepoFullName } from '../../utils/repo.js'; +import { buildDeferredRecheckResult } from '../shared/result-builders.js'; import { skip } from '../shared/skip.js'; import { checkTriggerEnabledWithParams } from '../shared/trigger-check.js'; import { decideCheckSuiteGates, decideCheckSuiteOutcome } from './check-suite-decision.js'; @@ -157,13 +158,24 @@ export class CheckSuiteSuccessTrigger implements TriggerHandler { }); if (decision.action === 'defer') { - logger.info('Not all checks complete yet, waiting for next check_suite event', { + // Bug 1 (2026-05-11 prod incident on ucho PR #394, MNG-683): + // returning a plain skip() relies on GitHub firing another + // check_suite event when the final suite completes. But when + // the Actions API lags webhook delivery, the API still shows + // the final suite as "in_progress" at query time AND no further + // webhook arrives (GitHub already fired its one event for that + // workflow). Schedule a deferred re-check so the trigger + // re-evaluates against fresh API state ~30s later. + const coalesceKey = `check-suite-success:${owner}/${repo}:pr-${prNumber}:${headSha}`; + logger.info('Not all checks complete yet, scheduling deferred re-check', { handler: this.name, prNumber, totalChecks: checkStatus.totalCount, incompleteChecks: decision.incompleteChecks, + coalesceKey, + delayMs: 30_000, }); - return skip(this.name, decision.message); + return buildDeferredRecheckResult({ delayMs: 30_000, coalesceKey }); } if (decision.action === 'skip') { diff --git a/tests/unit/triggers/check-suite-success.test.ts b/tests/unit/triggers/check-suite-success.test.ts index 7eab8a804..ba3f121be 100644 --- a/tests/unit/triggers/check-suite-success.test.ts +++ b/tests/unit/triggers/check-suite-success.test.ts @@ -294,7 +294,10 @@ describe('CheckSuiteSuccessTrigger', () => { expectSkip(checkSuiteResult); }); - it('returns null when PR targets non-base branch', async () => { + it('returns skip when non-cascade-authored PR targets non-base branch', async () => { + // Fix B (2026-05-11): the base-branch gate still applies to NON- + // cascade-authored PRs. Cascade-authored stacked PRs now bypass + // the gate — see the "cascade-authored stacked PR" test below. vi.mocked(githubClient.getPR).mockResolvedValue({ number: 42, title: 'Test PR', @@ -305,7 +308,7 @@ describe('CheckSuiteSuccessTrigger', () => { baseRef: 'develop', merged: false, htmlUrl: 'https://github.com/owner/repo/pull/42', - user: { login: 'cascade-impl' }, + user: { login: 'random-contributor' }, }); const ctx: TriggerContext = { @@ -908,7 +911,15 @@ describe('CheckSuiteSuccessTrigger', () => { // later success event. Mirrors the existing check-suite-failure deferral // shape — the LAST check_suite event makes the dispatch decision based // on full aggregate state. - it('skips with "Not all checks complete yet" when an in-progress check exists, naming the pending check', async () => { + // Bug 1 (2026-05-11 prod incident on ucho PR #394, MNG-683): + // the handler used to return a plain skip when checks were incomplete, + // relying on GitHub to fire another check_suite event when the final + // suite finishes. But when the GitHub Actions API lags webhook delivery + // by >0ms, the API still shows that final suite as "in_progress" — and + // no further webhook arrives because GitHub already fired its event. + // Review never dispatched; user had to manually request from the + // reviewer persona. Fix: schedule a deferred re-check with delay. + it('returns deferredRecheck (not plain skip) when an in-progress check exists', async () => { vi.mocked(githubClient.getPR).mockResolvedValue(baseImplementerPR); vi.mocked(githubClient.getPRReviews).mockResolvedValue([]); vi.mocked(githubClient.getCheckSuiteStatus).mockResolvedValue({ @@ -929,11 +940,19 @@ describe('CheckSuiteSuccessTrigger', () => { const result = await trigger.handle(ctx); - expectSkip(result, /Not all checks complete yet.*lint-and-test/); - // Crucially, no agent dispatched — the worker bail-out path is gone. + // No agent dispatch — the worker bail-out path is gone. expect(result?.agentType).toBeNull(); - // Dedup must NOT be claimed for a deferred event; the next - // check_suite event for this SHA must be free to make the call. + // New contract: a deferredRecheck job is scheduled so the trigger + // re-evaluates even when GitHub does not fire another check_suite event. + expect(result?.deferredRecheck).toBeDefined(); + expect(result?.deferredRecheck?.delayMs).toBe(30_000); + // coalesceKey must include owner/repo + PR number + head SHA so + // concurrent deferrals collapse to a single delayed job. + expect(result?.deferredRecheck?.coalesceKey).toBe( + 'check-suite-success:owner/repo:pr-42:sha123', + ); + // Dedup must NOT be claimed for a deferred event; the recheck must + // be free to make the dispatch call. expect(result?.onBlocked).toBeUndefined(); }); @@ -968,12 +987,47 @@ describe('CheckSuiteSuccessTrigger', () => { }; const firstResult = await trigger.handle(ctx); - expectSkip(firstResult, /Not all checks complete yet/); + // First event: defer-with-recheck (no dedup claim yet). + expect(firstResult?.agentType).toBeNull(); + expect(firstResult?.deferredRecheck).toBeDefined(); + expect(mockClaimReviewDispatch).not.toHaveBeenCalled(); const secondResult = await trigger.handle(ctx); expect(secondResult?.agentType).toBe('review'); }); + // Bug 2 (2026-05-11 prod incident on ucho PR #393, MNG-691): + // the handler rejected stacked PRs targeting a feature branch even + // when the PR was opened by the cascade implementer persona. Fix: + // the base-branch gate now skips cascade-authored PRs. + it('dispatches review for cascade-authored stacked PR targeting a feature branch', async () => { + vi.mocked(githubClient.getPR).mockResolvedValue({ + ...baseImplementerPR, + baseRef: 'feature/MNG-690-calendar-event-context-tables', + }); + vi.mocked(githubClient.getPRReviews).mockResolvedValue([]); + vi.mocked(githubClient.getCheckSuiteStatus).mockResolvedValue({ + allPassing: true, + totalCount: 2, + checkRuns: [ + { name: 'CI', status: 'completed', conclusion: 'success' }, + { name: 'lint-and-test', status: 'completed', conclusion: 'success' }, + ], + }); + + const ctx: TriggerContext = { + project: mockProject, + source: 'github', + payload: makeCheckSuitePayload(), + personaIdentities: mockPersonaIdentities, + }; + + const result = await trigger.handle(ctx); + + expect(result?.agentType).toBe('review'); + expect(result?.prNumber).toBe(42); + }); + it('dispatches respond-to-ci on timed_out conclusion', async () => { vi.mocked(githubClient.getPR).mockResolvedValue(baseImplementerPR); vi.mocked(githubClient.getPRReviews).mockResolvedValue([]); diff --git a/tests/unit/triggers/github/check-suite-decision.test.ts b/tests/unit/triggers/github/check-suite-decision.test.ts index 20c5d8045..7d72d50a4 100644 --- a/tests/unit/triggers/github/check-suite-decision.test.ts +++ b/tests/unit/triggers/github/check-suite-decision.test.ts @@ -87,11 +87,15 @@ describe('decideCheckSuiteOutcome', () => { }); }); - it('returns base-branch skip before aggregate decisions', () => { + it('returns base-branch skip for non-cascade-authored PR targeting a non-canonical branch', () => { + // Non-cascade authors hitting an unrelated branch are still rejected — + // the gate exists to filter human-authored / third-party-bot drive-bys + // against random branches in the repo. const decision = decideCheckSuiteGates({ ...baseOptions, + prAuthorLogin: 'random-contributor', prBaseRef: 'develop', - mode: { kind: 'review', parameters: {} }, + mode: { kind: 'review', parameters: { authorMode: 'all' } }, }); expect(decision).toEqual({ @@ -99,6 +103,53 @@ describe('decideCheckSuiteOutcome', () => { message: 'PR #42 targets develop, not project base branch main', }); }); + + // Bug 2 (2026-05-11 prod incident on ucho PR #393, MNG-691): + // stacked PRs targeting a feature branch (MNG-691 → MNG-690's branch) + // were rejected by the base-branch gate even though the PR was opened + // by the cascade implementer. The persona check upstream already trusts + // these — the base-branch check adds no value for that case. + it('allows cascade-authored stacked PR through the base-branch gate (Bug 2)', () => { + const decision = decideCheckSuiteGates({ + ...baseOptions, + prAuthorLogin: 'cascade-impl', // matches mockPersonaIdentities.implementer + prBaseRef: 'feature/MNG-690-calendar-event-context-tables', + mode: { kind: 'review', parameters: {} }, + }); + + expect(decision).toBeNull(); + }); + + it('preserves the existing pass-through for cascade-authored canonical PRs', () => { + const decision = decideCheckSuiteGates({ + ...baseOptions, + prAuthorLogin: 'cascade-impl', + prBaseRef: 'main', // canonical base + mode: { kind: 'review', parameters: {} }, + }); + + expect(decision).toBeNull(); + }); + + it('rejects cascade-authored stacked PR if persona identities cannot be resolved', () => { + // Defense in depth: with no personaIdentities, isCascadeBot is unreliable. + // The author-mode gate already returns its own skip in that case (see + // cascadePersonaDecision); this assertion pins that we never grant the + // stacked-PR bypass on a degraded identity path. + const decision = decideCheckSuiteGates({ + ...baseOptions, + prAuthorLogin: 'cascade-impl', + prBaseRef: 'feature/MNG-690-x', + personaIdentities: undefined, + mode: { kind: 'review', parameters: {} }, + }); + + // The persona-failure skip from authorModeDecision takes precedence. + expect(decision?.action).toBe('skip'); + expect((decision as { action: 'skip'; message: string }).message).toMatch( + /Cascade persona identities could not be resolved/, + ); + }); }); describe('resolveCheckSuitePRNumber', () => { From f0817e39bfd322b3cc6f7bcc80fe553a8d937904 Mon Sep 17 00:00:00 2001 From: Zbigniew Sobiecki Date: Mon, 11 May 2026 20:00:24 +0000 Subject: [PATCH 08/12] fix(codex): late shell-state corruption signal no longer voids successful runs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When codex's persistent bash session breaks with `write_stdin failed: stdin is closed`, the engine used to fail the run unconditionally — even when all real work was already captured. Prod evidence (cascade/MNG-718, run f801342b, 2026-05-11): an implementation agent opened PR #1350, ran the full verification suite, filed a follow-up friction ticket, and was still marked failed because the signal fired during session-close. This cost cascade the post-completion review dispatch and surfaced a misleading "agent failed" comment via aaight42. The discriminator is whether success evidence was captured BEFORE the signal: if `prUrl` is set and `finalOutput` is non-empty, the corruption was late and the work is real → log WARN, fall through to success. Otherwise the corruption may have masked the agent's work → fail loudly so ops retry (preserved behavior from PR #1245). Extracted classifyShellCorruption() to keep execute()'s cognitive complexity unchanged. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/backends/codex/index.ts | 89 +++++++++++++++++++++++++------ tests/unit/backends/codex.test.ts | 75 +++++++++++++++++++++++--- 2 files changed, 141 insertions(+), 23 deletions(-) diff --git a/src/backends/codex/index.ts b/src/backends/codex/index.ts index 61f62a6e0..1bf20d437 100644 --- a/src/backends/codex/index.ts +++ b/src/backends/codex/index.ts @@ -344,6 +344,59 @@ async function captureRefreshedToken( } } +/** + * Inspect stderr for the SHELL_CORRUPTED signal and decide the outcome. + * + * The signal can fire mid-work (corruption masks real output → fail) or + * at session-close (real work already captured → success-with-warning). + * Discriminator: if `prUrl` is set and `finalOutput` is non-empty, the + * agent emitted real artifacts before the signal fired. See MNG-718 + * (run f801342b, 2026-05-11) for the late-corruption prod case. + * + * Returns the failure `AgentEngineResult` to surface, or `null` when + * the run should continue through the normal success path (either no + * signal present, or signal fired after success evidence was captured). + */ +function classifyShellCorruption( + stderrOutput: string, + exitCode: number, + prUrl: string | undefined, + prEvidence: ReturnType['prEvidence'], + finalOutput: string, + cost: number | undefined, + logWriter: LogWriter, +): AgentEngineResult | null { + if (exitCode !== 0 || !SHELL_CORRUPTED_RE.test(stderrOutput)) return null; + + const hasSuccessEvidence = !!prUrl && finalOutput.length > 0; + if (hasSuccessEvidence) { + logWriter( + 'WARN', + 'Codex shell-state corruption signal fired after success evidence captured — treating as success-with-warning', + { + stderr: stderrOutput.slice(-500), + prUrl, + finalOutputLength: finalOutput.length, + hint: 'PR was created and final output captured before the corruption signal; verify the PR manually if anything looks off', + }, + ); + return null; + } + + logWriter('ERROR', 'Codex shell-state corrupted (write_stdin closed) — failing run', { + stderr: stderrOutput, + hint: 'codex_core::tools::router lost its persistent bash session; subsequent commands may have inherited stale state', + }); + return buildEngineResult({ + success: false, + output: finalOutput, + error: 'codex shell-state corrupted: write_stdin failed (stdin closed for session)', + cost, + prUrl, + prEvidence, + }); +} + /** * Codex CLI backend for CASCADE. * @@ -595,22 +648,26 @@ export class CodexEngine extends NativeToolEngine { // failures with PR-creation evidence missing despite a real PR // existing on GitHub. Codex itself exits cleanly (exit=0) — the // stderr signal is the only evidence the session was corrupted. - // Surface it as a run failure so ops can retry against a fresh - // session rather than continuing in a corrupted state. - if (exitCode === 0 && SHELL_CORRUPTED_RE.test(stderrOutput)) { - input.logWriter('ERROR', 'Codex shell-state corrupted (write_stdin closed) — failing run', { - stderr: stderrOutput, - hint: 'codex_core::tools::router lost its persistent bash session; subsequent commands may have inherited stale state', - }); - return buildEngineResult({ - success: false, - output: finalOutput, - error: 'codex shell-state corrupted: write_stdin failed (stdin closed for session)', - cost, - prUrl, - prEvidence, - }); - } + // + // MNG-718 follow-up (run f801342b, 2026-05-11): the signal can + // also fire LATE, at session-close, after all real work has been + // captured. That run opened PR #1350, ran the full verification + // suite, and filed a follow-up friction ticket — yet was marked + // failed, costing cascade the post-completion review dispatch + // and surfacing a misleading "agent failed" comment. Split the + // two cases by whether success evidence (prUrl + finalOutput) + // was captured before the signal: late → WARN + success, early + // → ERROR + fail. + const shellCorruptionResult = classifyShellCorruption( + stderrOutput, + exitCode, + prUrl, + prEvidence, + finalOutput, + cost, + input.logWriter, + ); + if (shellCorruptionResult) return shellCorruptionResult; if (exitCode !== 0) { return buildEngineResult({ diff --git a/tests/unit/backends/codex.test.ts b/tests/unit/backends/codex.test.ts index df2db8ad2..c4acd8682 100644 --- a/tests/unit/backends/codex.test.ts +++ b/tests/unit/backends/codex.test.ts @@ -1224,13 +1224,21 @@ describe('Codex subscription auth', () => { // with `ERROR codex_core::tools::router: error=write_stdin failed: stdin // is closed for this session`. Once that signal fires, every subsequent // command in the session inherits a corrupted stdout buffer (lint output - // from one command bleeding into the next, sidecar writes racing). We - // observed this leading to silent run failures with PR-creation evidence - // missing despite a real PR existing on GitHub. Fail loudly: when the - // signal appears in stderr, surface it as a run failure rather than - // continuing in a corrupted state. + // from one command bleeding into the next, sidecar writes racing). + // + // Originally we failed the run on ANY occurrence of this signal. That + // turned out to over-correct: the signal can fire LATE (at session-close, + // after all real work was captured). MNG-718 (run f801342b, 2026-05-11) + // opened a valid PR #1350 and ran a full verification suite, then was + // marked failed because the signal fired during cleanup. The fix below + // splits the two cases: late signal with success evidence (prUrl + + // finalOutput) → WARN + success; early signal with no evidence → fail. describe('shell-state corruption (write_stdin closed)', () => { - it('marks the run as failed when stderr contains the persistent-shell stdin-closed signal', async () => { + // EARLY-corruption variant: signal fires mid-work, no PR URL extracted, + // no meaningful finalOutput. The corruption likely masked the agent's + // real work, so we must fail loudly. Note the empty finalOutput — that's + // the discriminator from the late-corruption path below. + it('marks the run as failed when the signal fires with no success evidence (no PR URL, no output)', async () => { mockSpawn.mockImplementation((_cmd: string, args: string[]) => { const outputPath = args[args.indexOf('-o') + 1]; return createMockChild({ @@ -1244,7 +1252,8 @@ describe('Codex subscription auth', () => { stderr: '2026-05-09T15:36:59.079680Z ERROR codex_core::tools::router: error=write_stdin failed: stdin is closed for this session; rerun exec_command with tty=true to keep stdin open\n', onBeforeClose: () => { - writeFileSync(outputPath, 'PR created at https://github.com/o/r/pull/1', 'utf-8'); + // No PR URL, no meaningful output — this is the early-corruption case. + writeFileSync(outputPath, '', 'utf-8'); }, exitCode: 0, // <-- exit code 0 — codex itself doesn't fail; the shell-state signal is the only evidence }); @@ -1265,6 +1274,58 @@ describe('Codex subscription auth', () => { ); }); + // LATE-corruption regression net: MNG-718 (run f801342b-1129-4502-a4b0- + // b7808e9b2a2e, project=cascade, 2026-05-11). The agent opened PR #1350 + // AND ran a full verification suite AND filed a follow-up friction + // ticket — all of which made it into finalOutput. The corruption signal + // fired during/after session close. Marking the run failed cost cascade + // the post-completion review dispatch and surfaced a misleading + // "agent failed" comment via aaight42. The fix preserves success when + // the evidence is real. + it('treats late-arriving signal as success-with-warning when a PR URL was extracted (MNG-718 regression)', async () => { + mockSpawn.mockImplementation((_cmd: string, args: string[]) => { + const outputPath = args[args.indexOf('-o') + 1]; + return createMockChild({ + stdoutLines: [ + JSON.stringify({ type: 'turn.started' }), + JSON.stringify({ + type: 'turn.completed', + usage: { input_tokens: 5, output_tokens: 3 }, + }), + ], + stderr: + '2026-05-11T19:21:55.000000Z ERROR codex_core::tools::router: error=write_stdin failed: stdin is closed for this session; rerun exec_command with tty=true to keep stdin open\n', + onBeforeClose: () => { + // PR URL extracted + non-empty final output = real work was captured + // before the signal fired. + writeFileSync( + outputPath, + 'PR https://github.com/o/r/pull/1 created. Verification passed.', + 'utf-8', + ); + }, + exitCode: 0, + }); + }); + + const engine = new CodexEngine(); + const input = makeInput({ repoDir: workspaceDir }); + const result = await engine.execute(input); + + // Despite the signal, success evidence was captured before the corruption. + expect(result.success).toBe(true); + expect(result.error).toBeUndefined(); + expect(result.prUrl).toBe('https://github.com/o/r/pull/1'); + // And we logged loudly so operators can spot-check the PR. + expect(input.logWriter).toHaveBeenCalledWith( + 'WARN', + expect.stringContaining('success-with-warning'), + expect.objectContaining({ + prUrl: 'https://github.com/o/r/pull/1', + }), + ); + }); + it('passes through cleanly when stderr is benign (no false positives)', async () => { mockSpawn.mockImplementation((_cmd: string, args: string[]) => { const outputPath = args[args.indexOf('-o') + 1]; From 5d8e3c30b9995056c3555b32197737c5d9111080 Mon Sep 17 00:00:00 2001 From: Cascade Bot Date: Mon, 11 May 2026 20:27:20 +0000 Subject: [PATCH 09/12] fix(check-suite): distinguish check-suite rechecks from mergeability rechecks Adds `recheckKind: 'check-suite'` to `TriggerResult.deferredRecheck` so the router stamps `checkSuiteRecheckAttempt` on the job (not `mergeabilityRecheckAttempt`). On the worker side, `processGitHubWebhook` now reschedules another coalesced delayed job when `isCheckSuiteRecheckJob=true` and the trigger is still stale, rather than silently dropping the dispatch via the `mergeability_recheck_exhausted` path. The recheck-handling logic is extracted into a `handleRecheckResult` helper to keep `processGitHubWebhook` within the lint complexity budget. Closes the silent-drop bug where a 30s check-suite recheck that found the Actions API still stale would never trigger review again. Co-Authored-By: Claude Sonnet 4.6 --- src/router/adapters/github.ts | 6 +- src/router/queue.ts | 9 ++ src/triggers/github/check-suite-success.ts | 6 +- src/triggers/github/webhook-handler.ts | 89 +++++++++++++++--- src/triggers/shared/result-builders.ts | 3 + src/types/index.ts | 12 +++ src/worker-entry.ts | 7 ++ tests/unit/router/deferred-recheck.test.ts | 44 +++++++++ .../unit/triggers/check-suite-success.test.ts | 4 + .../github-webhook-handler-recheck.test.ts | 90 +++++++++++++++++++ tests/unit/worker-entry.test.ts | 2 + 11 files changed, 259 insertions(+), 13 deletions(-) diff --git a/src/router/adapters/github.ts b/src/router/adapters/github.ts index 9b352c999..c8adf7595 100644 --- a/src/router/adapters/github.ts +++ b/src/router/adapters/github.ts @@ -374,6 +374,9 @@ export class GitHubRouterAdapter implements RouterPlatformAdapter { ackResult?: AckResult, ): CascadeJob { const isDeferredRecheck = !!result.deferredRecheck; + const isCheckSuiteRecheck = + isDeferredRecheck && result.deferredRecheck?.recheckKind === 'check-suite'; + const isMergeabilityRecheck = isDeferredRecheck && !isCheckSuiteRecheck; const job: GitHubJob = { type: 'github', source: 'github', @@ -384,7 +387,8 @@ export class GitHubRouterAdapter implements RouterPlatformAdapter { triggerResult: isDeferredRecheck ? undefined : result, ackCommentId: ackResult?.commentId as number | undefined, ackMessage: ackResult?.message, - ...(isDeferredRecheck ? { mergeabilityRecheckAttempt: 1 } : {}), + ...(isMergeabilityRecheck ? { mergeabilityRecheckAttempt: 1 } : {}), + ...(isCheckSuiteRecheck ? { checkSuiteRecheckAttempt: 1 } : {}), }; return job; } diff --git a/src/router/queue.ts b/src/router/queue.ts index 69d9c3bb7..2fada3c16 100644 --- a/src/router/queue.ts +++ b/src/router/queue.ts @@ -49,6 +49,15 @@ export interface GitHubJob { * mergeability. */ mergeabilityRecheckAttempt?: number; + /** + * Set to 1 when this job is a deferred check-suite re-check (router side). + * Unlike `mergeabilityRecheckAttempt`, a still-stale response from the + * trigger reschedules another delayed job (coalesceKey deduplicates) instead + * of emitting a Sentry capture and giving up. This handles the Actions API + * lag case where the final `check_suite.completed` webhook arrives while the + * API still reports the suite as in-progress. + */ + checkSuiteRecheckAttempt?: number; } export interface JiraJob { diff --git a/src/triggers/github/check-suite-success.ts b/src/triggers/github/check-suite-success.ts index ae2ac50d2..8c0c050fe 100644 --- a/src/triggers/github/check-suite-success.ts +++ b/src/triggers/github/check-suite-success.ts @@ -175,7 +175,11 @@ export class CheckSuiteSuccessTrigger implements TriggerHandler { coalesceKey, delayMs: 30_000, }); - return buildDeferredRecheckResult({ delayMs: 30_000, coalesceKey }); + return buildDeferredRecheckResult({ + delayMs: 30_000, + coalesceKey, + recheckKind: 'check-suite', + }); } if (decision.action === 'skip') { diff --git a/src/triggers/github/webhook-handler.ts b/src/triggers/github/webhook-handler.ts index 6ba7fd4be..39ea0dae8 100644 --- a/src/triggers/github/webhook-handler.ts +++ b/src/triggers/github/webhook-handler.ts @@ -15,6 +15,8 @@ import { isPMFocusedAgent } from '../../agents/definitions/loader.js'; import { withGitHubToken } from '../../github/client.js'; import { getPersonaToken, resolvePersonaIdentities } from '../../github/personas.js'; import { extractGitHubContext, generateAckMessage } from '../../router/ackMessageGenerator.js'; +import type { GitHubJob } from '../../router/queue.js'; +import { scheduleCoalescedJob } from '../../router/queue.js'; import { captureException } from '../../sentry.js'; import type { CascadeConfig, ProjectConfig, TriggerContext } from '../../types/index.js'; import { logger, startWatchdog } from '../../utils/index.js'; @@ -197,6 +199,72 @@ async function runGitHubAgent( } } +/** + * Handles the case where a re-check job finds the trigger still returning + * `deferredRecheck`. Returns `true` when the caller should return immediately. + * + * - Mergeability re-check (`isRecheckJob=true`): one-shot — fire Sentry and + * give up. A second deferred result means the mergeability flag is stuck + * and we should not queue-flood. + * - Check-suite re-check (`isCheckSuiteRecheckJob=true`): safe rescheduling — + * the Actions API is still stale; schedule another delayed job via the same + * coalesceKey so the trigger re-evaluates when the API catches up. Incoming + * `check_suite.completed` webhooks for the same SHA coalesce with the + * pending job, preventing queue flooding. + */ +async function handleRecheckResult( + result: TriggerResult | null, + isRecheckJob: boolean, + isCheckSuiteRecheckJob: boolean, + eventType: string, + payload: unknown, + projectIdentifier: string, +): Promise { + if (!result?.deferredRecheck) return false; + + if (isRecheckJob) { + logger.warn('Mergeability still null after deferred re-check — giving up', { eventType }); + captureException( + new Error('mergeability_recheck_exhausted: still null after deferred re-check'), + { tags: { source: 'mergeability_recheck_exhausted' }, extra: { eventType } }, + ); + return true; + } + + if (isCheckSuiteRecheckJob) { + const { coalesceKey, delayMs } = result.deferredRecheck; + logger.info('Check-suite state still stale after deferred re-check — rescheduling', { + eventType, + coalesceKey, + delayMs, + }); + const recheckJob: GitHubJob = { + type: 'github', + source: 'github', + payload, + eventType, + repoFullName: projectIdentifier, + receivedAt: new Date().toISOString(), + checkSuiteRecheckAttempt: 1, + }; + try { + await scheduleCoalescedJob(recheckJob, coalesceKey, delayMs); + } catch (err) { + captureException(err instanceof Error ? err : new Error(String(err)), { + tags: { source: 'check_suite_recheck_reschedule_failure' }, + extra: { coalesceKey, eventType }, + }); + logger.error('Failed to reschedule check-suite recheck', { + error: String(err), + coalesceKey, + }); + } + return true; + } + + return false; +} + export async function processGitHubWebhook( payload: unknown, eventType: string, @@ -205,6 +273,7 @@ export async function processGitHubWebhook( ackMessage?: string, triggerResult?: TriggerResult, isRecheckJob?: boolean, + isCheckSuiteRecheckJob?: boolean, ): Promise { logger.info('Processing GitHub webhook', { eventType, hasTriggerResult: !!triggerResult }); @@ -238,17 +307,15 @@ export async function processGitHubWebhook( }); if (!triggerResult) { - if (result?.deferredRecheck && isRecheckJob) { - logger.warn('Mergeability still null after deferred re-check — giving up', { eventType }); - captureException( - new Error('mergeability_recheck_exhausted: still null after deferred re-check'), - { - tags: { source: 'mergeability_recheck_exhausted' }, - extra: { eventType }, - }, - ); - return; - } + const recheckHandled = await handleRecheckResult( + result, + !!isRecheckJob, + !!isCheckSuiteRecheckJob, + eventType, + payload, + event.projectIdentifier, + ); + if (recheckHandled) return; } if (!result) { diff --git a/src/triggers/shared/result-builders.ts b/src/triggers/shared/result-builders.ts index f8c595811..f09ff62bd 100644 --- a/src/triggers/shared/result-builders.ts +++ b/src/triggers/shared/result-builders.ts @@ -41,6 +41,8 @@ interface DeferredRecheckOptions { delayMs: number; coalesceKey: string; agentInput?: AgentInput; + /** See `TriggerResult.deferredRecheck.recheckKind` for semantics. */ + recheckKind?: 'check-suite'; } interface AgentInputExtraContext { @@ -163,6 +165,7 @@ export function buildDeferredRecheckResult(options: DeferredRecheckOptions): Tri deferredRecheck: { delayMs: options.delayMs, coalesceKey: options.coalesceKey, + ...(options.recheckKind ? { recheckKind: options.recheckKind } : {}), }, }; } diff --git a/src/types/index.ts b/src/types/index.ts index 2a1db8fbb..3eb4069ce 100644 --- a/src/types/index.ts +++ b/src/types/index.ts @@ -177,6 +177,18 @@ export interface TriggerResult { deferredRecheck?: { delayMs: number; coalesceKey: string; + /** + * Distinguishes the type of deferred re-check so the router adapter can + * stamp the right job field and the worker can apply the right retry policy. + * + * - `'check-suite'` — Actions API was stale at first dispatch; the job gets + * `checkSuiteRecheckAttempt: 1` and the worker reschedules safely on a + * still-stale response (coalesceKey deduplication prevents queue flooding). + * - absent (undefined) — treated as a mergeability re-check; the job gets + * `mergeabilityRecheckAttempt: 1` and the worker emits + * `mergeability_recheck_exhausted` when the trigger is still indeterminate. + */ + recheckKind?: 'check-suite'; }; } diff --git a/src/worker-entry.ts b/src/worker-entry.ts index e281970b4..630ac7f05 100644 --- a/src/worker-entry.ts +++ b/src/worker-entry.ts @@ -77,6 +77,12 @@ export interface GitHubJobData { ackMessage?: string; triggerResult?: TriggerResult; mergeabilityRecheckAttempt?: number; + /** + * Set to 1 when this job is a check-suite deferred re-check. + * Unlike mergeabilityRecheckAttempt, a second deferredRecheck result + * causes processGitHubWebhook to reschedule rather than exhaust. + */ + checkSuiteRecheckAttempt?: number; } export interface JiraJobData { @@ -329,6 +335,7 @@ export async function dispatchJob( jobData.ackMessage, jobData.triggerResult, !!jobData.mergeabilityRecheckAttempt, + !!jobData.checkSuiteRecheckAttempt, ); break; case 'jira': { diff --git a/tests/unit/router/deferred-recheck.test.ts b/tests/unit/router/deferred-recheck.test.ts index cb68645b8..a76ce7f6f 100644 --- a/tests/unit/router/deferred-recheck.test.ts +++ b/tests/unit/router/deferred-recheck.test.ts @@ -174,6 +174,20 @@ describe('GitHubJob.mergeabilityRecheckAttempt type field', () => { }; expect(job.mergeabilityRecheckAttempt).toBe(1); }); + + it('accepts checkSuiteRecheckAttempt field', () => { + // TypeScript compile-time check for the new field. + const job: GitHubJob = { + type: 'github', + source: 'github', + payload: {}, + eventType: 'check_suite', + repoFullName: 'owner/repo', + receivedAt: '', + checkSuiteRecheckAttempt: 1, + }; + expect(job.checkSuiteRecheckAttempt).toBe(1); + }); }); describe('GitHubRouterAdapter.buildJob — deferred re-check behavior', () => { @@ -236,6 +250,36 @@ describe('GitHubRouterAdapter.buildJob — deferred re-check behavior', () => { const job = adapter.buildJob(event, {}, mockProject, normalResult, undefined) as GitHubJob; expect(job.mergeabilityRecheckAttempt).toBeUndefined(); }); + + it('sets checkSuiteRecheckAttempt: 1 when recheckKind is check-suite', () => { + // Check-suite rechecks must NOT get mergeabilityRecheckAttempt — that field + // triggers the exhaustion path in processGitHubWebhook. + const checkSuiteResult: TriggerResult = { + agentType: null, + agentInput: {}, + deferredRecheck: { + delayMs: 30_000, + coalesceKey: 'check-suite-success:owner/repo:pr-42:sha123', + recheckKind: 'check-suite', + }, + }; + const job = adapter.buildJob(event, {}, mockProject, checkSuiteResult, undefined) as GitHubJob; + expect(job.checkSuiteRecheckAttempt).toBe(1); + expect(job.mergeabilityRecheckAttempt).toBeUndefined(); + }); + + it('sets mergeabilityRecheckAttempt: 1 when recheckKind is absent (backward-compat)', () => { + // The existing mergeability-recheck case: no recheckKind → mergeabilityRecheckAttempt. + const job = adapter.buildJob( + event, + {}, + mockProject, + deferredRecheckResult, + undefined, + ) as GitHubJob; + expect(job.mergeabilityRecheckAttempt).toBe(1); + expect(job.checkSuiteRecheckAttempt).toBeUndefined(); + }); }); describe('processRouterWebhook — deferred re-check branch', () => { diff --git a/tests/unit/triggers/check-suite-success.test.ts b/tests/unit/triggers/check-suite-success.test.ts index ba3f121be..302eafd1d 100644 --- a/tests/unit/triggers/check-suite-success.test.ts +++ b/tests/unit/triggers/check-suite-success.test.ts @@ -951,6 +951,10 @@ describe('CheckSuiteSuccessTrigger', () => { expect(result?.deferredRecheck?.coalesceKey).toBe( 'check-suite-success:owner/repo:pr-42:sha123', ); + // recheckKind must be 'check-suite' so buildJob sets checkSuiteRecheckAttempt + // (not mergeabilityRecheckAttempt) on the delayed job, allowing safe + // rescheduling if the Actions API is still stale on the first re-check. + expect(result?.deferredRecheck?.recheckKind).toBe('check-suite'); // Dedup must NOT be claimed for a deferred event; the recheck must // be free to make the dispatch call. expect(result?.onBlocked).toBeUndefined(); diff --git a/tests/unit/triggers/github-webhook-handler-recheck.test.ts b/tests/unit/triggers/github-webhook-handler-recheck.test.ts index 0af7f6910..69c2f32e0 100644 --- a/tests/unit/triggers/github-webhook-handler-recheck.test.ts +++ b/tests/unit/triggers/github-webhook-handler-recheck.test.ts @@ -7,6 +7,10 @@ vi.mock('../../../src/sentry.js', () => ({ flush: vi.fn().mockResolvedValue(undefined), })); +vi.mock('../../../src/router/queue.js', () => ({ + scheduleCoalescedJob: vi.fn().mockResolvedValue({ jobId: 'cj-1', superseded: false }), +})); + vi.mock('../../../src/triggers/github/integration.js', () => { const mockIntegration = { type: 'github', @@ -97,6 +101,7 @@ vi.mock('../../../src/utils/index.js', () => ({ // ── Imports ─────────────────────────────────────────────────────────────────── +import { scheduleCoalescedJob } from '../../../src/router/queue.js'; import { captureException } from '../../../src/sentry.js'; import { processGitHubWebhook } from '../../../src/triggers/github/webhook-handler.js'; import type { TriggerResult } from '../../../src/types/index.js'; @@ -109,6 +114,18 @@ const deferredRecheckResult: TriggerResult = { deferredRecheck: { delayMs: 45_000, coalesceKey: 'project-1:pr-conflict-recheck:42' }, }; +// Fixture used to simulate the check-suite trigger still returning stale state +// after a re-check dispatch. +const checkSuiteDeferredRecheckResult: TriggerResult = { + agentType: null, + agentInput: {}, + deferredRecheck: { + delayMs: 30_000, + coalesceKey: 'check-suite-success:owner/repo:pr-42:sha123', + recheckKind: 'check-suite', + }, +}; + const normalResult: TriggerResult = { agentType: 'resolve-conflicts', agentInput: { prNumber: 42 }, @@ -171,3 +188,76 @@ describe('processGitHubWebhook — re-check exhaustion detection', () => { expect(registry.dispatch).not.toHaveBeenCalled(); }); }); + +describe('processGitHubWebhook — check-suite re-check rescheduling', () => { + beforeEach(() => { + vi.clearAllMocks(); + }); + + it('reschedules via scheduleCoalescedJob when isCheckSuiteRecheckJob=true and trigger returns deferredRecheck', async () => { + // This is the case where the worker fires a check-suite recheck but the + // Actions API is still stale (trigger returns another deferredRecheck). + // The fix: reschedule instead of treating as exhausted. + const registry = makeRegistry(checkSuiteDeferredRecheckResult); + await processGitHubWebhook( + {}, + 'check_suite', + registry, + undefined, + undefined, + undefined, + false, // isRecheckJob (mergeability) = false + true, // isCheckSuiteRecheckJob = true + ); + // Must reschedule with checkSuiteRecheckAttempt: 1 + expect(scheduleCoalescedJob).toHaveBeenCalledOnce(); + expect(scheduleCoalescedJob).toHaveBeenCalledWith( + expect.objectContaining({ type: 'github', checkSuiteRecheckAttempt: 1 }), + 'check-suite-success:owner/repo:pr-42:sha123', + 30_000, + ); + // Must NOT fire the mergeability_recheck_exhausted Sentry capture + expect(captureException).not.toHaveBeenCalled(); + }); + + it('does NOT reschedule when isCheckSuiteRecheckJob=false (first-time dispatch falls through normally)', async () => { + // First-time dispatch where trigger returns deferredRecheck: handled by the + // router (handleDeferredRecheck); processGitHubWebhook exits via the + // no-agent path without calling scheduleCoalescedJob. + const registry = makeRegistry(checkSuiteDeferredRecheckResult); + await processGitHubWebhook( + {}, + 'check_suite', + registry, + undefined, + undefined, + undefined, + false, // isRecheckJob (mergeability) = false + false, // isCheckSuiteRecheckJob = false + ); + expect(scheduleCoalescedJob).not.toHaveBeenCalled(); + expect(captureException).not.toHaveBeenCalled(); + }); + + it('Sentry-captures and keeps running when scheduleCoalescedJob throws on check-suite reschedule', async () => { + vi.mocked(scheduleCoalescedJob).mockRejectedValueOnce(new Error('Redis down')); + const registry = makeRegistry(checkSuiteDeferredRecheckResult); + // Should not throw even if Redis is down + await expect( + processGitHubWebhook( + {}, + 'check_suite', + registry, + undefined, + undefined, + undefined, + false, + true, + ), + ).resolves.toBeUndefined(); + expect(captureException).toHaveBeenCalledWith( + expect.any(Error), + expect.objectContaining({ tags: { source: 'check_suite_recheck_reschedule_failure' } }), + ); + }); +}); diff --git a/tests/unit/worker-entry.test.ts b/tests/unit/worker-entry.test.ts index c3ba74fc7..b73c1af17 100644 --- a/tests/unit/worker-entry.test.ts +++ b/tests/unit/worker-entry.test.ts @@ -185,6 +185,7 @@ describe('dispatchJob routing', () => { 'Starting implementation...', triggerResult, false, + false, ); }); @@ -212,6 +213,7 @@ describe('dispatchJob routing', () => { undefined, undefined, true, + false, ); }); From 438748ef5a7f46d47260975ced5edc5966535365 Mon Sep 17 00:00:00 2001 From: Cascade Bot Date: Mon, 11 May 2026 21:01:42 +0000 Subject: [PATCH 10/12] =?UTF-8?q?fix(triggers):=20address=20review=20feedb?= =?UTF-8?q?ack=20=E2=80=94=20failure=20gate=20bypass=20+=20respond-to-ci?= =?UTF-8?q?=20cross-process=20dedup?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue 1: CheckSuiteFailureTrigger used `gateCascadePersona ?? gateBaseBranch` which evaluated gateBaseBranch even for cascade-authored PRs. Cascade-authored stacked PRs targeting a non-base branch were incorrectly skipped instead of dispatching respond-to-ci. Fix: replace the `??` chain with an explicit gateCascadePersona check, mirroring the authorIsCascade bypass already present in decideCheckSuiteGates. Issue 2: The success handler's 30s deferred recheck fired in a fresh worker container with an empty in-process fixAttempts Map — it had no memory of what the router had already dispatched. This allowed duplicate respond-to-ci agents for the same PR+SHA. Fix: add Redis-backed dedup (respond-to-ci-dedup.ts) to dispatchRespondToCi, mirroring the review-dispatch-dedup pattern. Co-Authored-By: Claude Sonnet 4.6 --- src/triggers/github/check-suite-failure.ts | 17 ++- src/triggers/github/respond-to-ci-dedup.ts | 142 ++++++++++++++++++ src/triggers/github/respond-to-ci-dispatch.ts | 33 +++- .../unit/triggers/check-suite-failure.test.ts | 61 +++++++- .../unit/triggers/check-suite-success.test.ts | 15 +- 5 files changed, 255 insertions(+), 13 deletions(-) create mode 100644 src/triggers/github/respond-to-ci-dedup.ts diff --git a/src/triggers/github/check-suite-failure.ts b/src/triggers/github/check-suite-failure.ts index f7d9a6995..a2146a88e 100644 --- a/src/triggers/github/check-suite-failure.ts +++ b/src/triggers/github/check-suite-failure.ts @@ -2,7 +2,7 @@ import { githubClient } from '../../github/client.js'; import type { TriggerContext, TriggerHandler, TriggerResult } from '../../types/index.js'; import { logger } from '../../utils/logging.js'; import { parseRepoFullName } from '../../utils/repo.js'; -import { gateBaseBranch, gateCascadePersona, requirePersonaIdentities } from '../shared/gates.js'; +import { gateCascadePersona, requirePersonaIdentities } from '../shared/gates.js'; import { skip } from '../shared/skip.js'; import { checkTriggerEnabled } from '../shared/trigger-check.js'; import { decideCheckSuiteOutcome } from './check-suite-decision.js'; @@ -74,10 +74,17 @@ export class CheckSuiteFailureTrigger implements TriggerHandler { const personasResult = requirePersonaIdentities(ctx.personaIdentities, prNumber, this.name); if (!personasResult.ok) return personasResult.skip; - const gateChainSkip = - gateCascadePersona(prDetails.user.login, prNumber, personasResult.value, this.name) ?? - gateBaseBranch(prDetails.baseRef, prNumber, ctx.project, this.name); - if (gateChainSkip) return gateChainSkip; + // Cascade-authored PRs bypass the base-branch gate — a cascade PR + // targeting a non-base branch is a stacked PR, not a drive-by. + // Non-cascade authors are filtered here and never reach the gate. + // Mirrors the authorIsCascade bypass in decideCheckSuiteGates (lines 101-109). + const cascadePersonaSkip = gateCascadePersona( + prDetails.user.login, + prNumber, + personasResult.value, + this.name, + ); + if (cascadePersonaSkip) return cascadePersonaSkip; // Resolve work item from DB const workItemId = await resolveWorkItemId(ctx.project.id, prNumber); diff --git a/src/triggers/github/respond-to-ci-dedup.ts b/src/triggers/github/respond-to-ci-dedup.ts new file mode 100644 index 000000000..4885bf8e0 --- /dev/null +++ b/src/triggers/github/respond-to-ci-dedup.ts @@ -0,0 +1,142 @@ +/** + * Respond-to-CI dispatch deduplication, Redis-backed. + * + * Guards against duplicate dispatch when a success-side deferred recheck fires + * ~30 s after a failure handler has already dispatched respond-to-ci for the + * same PR+SHA. The failure handler (or the first success-side dispatch) claims + * the slot; the recheck finds it taken and skips. + * + * Why Redis (and not a process-local Map): + * The deferred recheck runs inside a *worker* container that starts with an + * empty in-process Map — it has no memory of what the router process dispatched + * 30 s earlier. Only a shared external store bridges that gap. Mirrors the + * cross-process dedup rationale in `review-dispatch-dedup.ts`. + * + * Redis primitive: `SET key value NX EX ` — atomic check-and-set with + * TTL. Returns `'OK'` on first claim, `null` on duplicate. No race window. + * + * Failure mode: when Redis is unreachable, `claim` returns `false` (treats + * the call as a duplicate) and Sentry-captures under + * `respond_to_ci_dedup_redis_down`. Better to skip a legit dispatch than to + * launch duplicate fix agents; mirrors spec-017's fail-closed posture. + */ + +import { Redis } from 'ioredis'; +import { routerConfig } from '../../router/config.js'; +import { captureException } from '../../sentry.js'; +import { logger } from '../../utils/logging.js'; + +// 2 minutes — covers the 30 s deferred-recheck window with generous buffer. +// Kept shorter than the review-dispatch TTL (5 min) because the dedup scope +// is narrower: we only need to block the one pending recheck job. +const DEDUP_TTL_SEC = 2 * 60; + +const KEY_NS = 'cascade:respond-to-ci-dedup:'; + +let redisInstance: Redis | null = null; + +/** + * Lazy singleton — first call connects, subsequent calls reuse the same + * client. Reads `routerConfig.redisUrl` (captured at config.ts module load) + * rather than `process.env.REDIS_URL` so the URL survives `scrubSensitiveEnv()` + * in worker containers. + */ +function getRedis(): Redis { + if (!redisInstance) { + if (!routerConfig.redisUrl) { + throw new Error('REDIS_URL is required for respond-to-ci-dispatch dedup'); + } + redisInstance = new Redis(routerConfig.redisUrl); + } + return redisInstance; +} + +export function buildRespondToCiDispatchKey( + owner: string, + repo: string, + prNumber: number, + headSha: string, +): string { + return `${owner}/${repo}:${prNumber}:${headSha}`; +} + +/** + * Atomically claim a dispatch slot for the given key. Returns `true` exactly + * once per key within the TTL window across ALL connected processes. + * + * Fails closed on Redis errors: returns `false` so the caller skips the + * dispatch. Sentry-captures under `respond_to_ci_dedup_redis_down`. + */ +export async function claimRespondToCiDispatch( + key: string, + triggerName: string, + context: { prNumber: number; headSha: string }, +): Promise { + const namespacedKey = `${KEY_NS}${key}`; + try { + const result = await getRedis().set(namespacedKey, triggerName, 'EX', DEDUP_TTL_SEC, 'NX'); + if (result === 'OK') { + logger.info('Claimed respond-to-ci dispatch for PR+SHA', { + trigger: triggerName, + respondToCiDispatchKey: key, + prNumber: context.prNumber, + headSha: context.headSha, + }); + return true; + } + logger.info('Respond-to-ci already dispatched for this PR+SHA, skipping', { + trigger: triggerName, + respondToCiDispatchKey: key, + prNumber: context.prNumber, + headSha: context.headSha, + }); + return false; + } catch (err) { + logger.error('Respond-to-ci dedup Redis call failed — failing closed', { + trigger: triggerName, + respondToCiDispatchKey: key, + error: String(err), + }); + captureException(err, { + tags: { source: 'respond_to_ci_dedup_redis_down' }, + extra: { respondToCiDispatchKey: key, trigger: triggerName }, + level: 'error', + }); + return false; + } +} + +/** + * Release a previously-claimed dispatch slot. Used by `onBlocked` callbacks + * when downstream rejects the dispatch (work-item lock, agent-type concurrency, + * etc.) so the next legitimate trigger can claim. + * + * Errors are logged but never thrown — release is best-effort, and the TTL + * is the safety net. + */ +export async function releaseRespondToCiDispatch(key: string): Promise { + const namespacedKey = `${KEY_NS}${key}`; + try { + await getRedis().del(namespacedKey); + } catch (err) { + logger.warn('Respond-to-ci dedup release failed (TTL will reap)', { + respondToCiDispatchKey: key, + error: String(err), + }); + } +} + +/** + * Test-only: flush the dedup namespace and reset the singleton. Intended for + * `beforeEach` in unit tests and for the integration suite's per-test cleanup. + * Never call from production code. + * + * @internal + */ +export async function __resetForTests(): Promise { + if (!redisInstance) return; + const keys = await redisInstance.keys(`${KEY_NS}*`); + if (keys.length > 0) await redisInstance.del(...keys); + await redisInstance.quit().catch(() => {}); + redisInstance = null; +} diff --git a/src/triggers/github/respond-to-ci-dispatch.ts b/src/triggers/github/respond-to-ci-dispatch.ts index 5fa3300d8..48ba3afe0 100644 --- a/src/triggers/github/respond-to-ci-dispatch.ts +++ b/src/triggers/github/respond-to-ci-dispatch.ts @@ -3,7 +3,13 @@ import type { TriggerContext, TriggerResult } from '../../types/index.js'; import { logger } from '../../utils/logging.js'; import { parseRepoFullName } from '../../utils/repo.js'; import { gateAttemptLimit } from '../shared/gates.js'; +import { skip } from '../shared/skip.js'; import { checkTriggerEnabled } from '../shared/trigger-check.js'; +import { + buildRespondToCiDispatchKey, + claimRespondToCiDispatch, + releaseRespondToCiDispatch, +} from './respond-to-ci-dedup.js'; import { buildRespondToCiResult } from './result-builders.js'; import type { GitHubCheckSuitePayload } from './types.js'; @@ -58,10 +64,29 @@ export async function dispatchRespondToCi(opts: { return null; } + const { owner, repo } = parseRepoFullName(opts.payload.repository.full_name); + const headSha = opts.payload.check_suite.head_sha; + + // Cross-process dedup: the success handler's 30 s deferred recheck fires + // in a fresh worker container with an empty in-process Map — it has no + // memory of what the router dispatched earlier. Claim the Redis slot here; + // the recheck finds it taken and skips, preventing duplicate fix agents for + // the same PR+SHA. Mirrors the review-dispatch-dedup pattern. + const dedupKey = buildRespondToCiDispatchKey(owner, repo, opts.prNumber, headSha); + const claimed = await claimRespondToCiDispatch(dedupKey, opts.handlerName, { + prNumber: opts.prNumber, + headSha, + }); + if (!claimed) { + return skip( + opts.handlerName, + `Respond-to-ci already dispatched for PR #${opts.prNumber}@${headSha} (dedup)`, + ); + } + const attempts = fixAttempts.get(opts.prNumber) ?? 0; const limitSkip = gateAttemptLimit(attempts, MAX_ATTEMPTS, opts.prNumber, opts.handlerName); if (limitSkip) { - const { owner, repo } = parseRepoFullName(opts.payload.repository.full_name); await githubClient.createPRComment( owner, repo, @@ -94,10 +119,14 @@ export async function dispatchRespondToCi(opts: { prNumber: opts.prNumber, prDetails: opts.prDetails, repoFullName: opts.payload.repository.full_name, - headSha: opts.payload.check_suite.head_sha, + headSha, workItemId: opts.workItemId, workItemUrl: opts.workItemUrl, workItemTitle: opts.workItemTitle, }), + onBlocked: () => { + // Fire-and-forget — release is best-effort, TTL is the safety net. + void releaseRespondToCiDispatch(dedupKey); + }, }; } diff --git a/tests/unit/triggers/check-suite-failure.test.ts b/tests/unit/triggers/check-suite-failure.test.ts index f624ba7c6..a9e4201a5 100644 --- a/tests/unit/triggers/check-suite-failure.test.ts +++ b/tests/unit/triggers/check-suite-failure.test.ts @@ -11,6 +11,18 @@ vi.mock('../../../src/triggers/shared/trigger-check.js', () => mockTriggerCheckM vi.mock('../../../src/github/client.js', () => mockGitHubClientModule); +// Stub the Redis-backed dedup module so tests don't need a Redis connection. +// Each `claim` resolves to true (success) by default; per-test overrides via +// `mockClaimRespondToCiDispatch.mockResolvedValueOnce(false)` simulate a duplicate. +const mockClaimRespondToCiDispatch = vi.fn().mockResolvedValue(true); +const mockReleaseRespondToCiDispatch = vi.fn().mockResolvedValue(undefined); +vi.mock('../../../src/triggers/github/respond-to-ci-dedup.js', () => ({ + buildRespondToCiDispatchKey: (owner: string, repo: string, prNumber: number, headSha: string) => + `${owner}/${repo}:${prNumber}:${headSha}`, + claimRespondToCiDispatch: (...args: unknown[]) => mockClaimRespondToCiDispatch(...args), + releaseRespondToCiDispatch: (...args: unknown[]) => mockReleaseRespondToCiDispatch(...args), +})); + import { githubClient } from '../../../src/github/client.js'; import { CheckSuiteFailureTrigger, @@ -51,6 +63,8 @@ describe('CheckSuiteFailureTrigger', () => { beforeEach(() => { resetFixAttempts(42); vi.mocked(lookupWorkItemForPR).mockResolvedValue('abc123'); + mockClaimRespondToCiDispatch.mockReset().mockResolvedValue(true); + mockReleaseRespondToCiDispatch.mockReset().mockResolvedValue(undefined); }); describe('matches', () => { @@ -246,12 +260,16 @@ describe('CheckSuiteFailureTrigger', () => { workItemId: 'abc123', workItemUrl: undefined, workItemTitle: undefined, - onBlocked: undefined, + onBlocked: expect.any(Function), coalesceKey: undefined, }); }); - it('returns a structured skip when PR targets non-base branch', async () => { + // Bug 2 fix (2026-05-11): cascade-authored stacked PRs bypass the base-branch + // gate — only non-cascade authors are filtered here. The base-branch gate + // used to be applied unconditionally via `??` which also blocked cascade- + // authored stacked PRs. Now only non-cascade authors hit the persona gate. + it('returns a structured skip when PR not authored by cascade but targets non-base branch', async () => { vi.mocked(githubClient.getPR).mockResolvedValue({ number: 42, title: 'Test PR', @@ -262,7 +280,7 @@ describe('CheckSuiteFailureTrigger', () => { headSha: 'sha123', baseRef: 'develop', merged: false, - user: { login: 'cascade-impl' }, + user: { login: 'some-human' }, }); const ctx: TriggerContext = { @@ -274,10 +292,43 @@ describe('CheckSuiteFailureTrigger', () => { const result = await trigger.handle(ctx); - expectSkip(result, /targets develop, not project base branch main/); + expectSkip(result, /not authored by a cascade persona.*author: some-human/i); expect(githubClient.getCheckSuiteStatus).not.toHaveBeenCalled(); }); + it('dispatches respond-to-ci for cascade-authored stacked PR targeting non-base branch', async () => { + // Bug 2 fix: cascade-authored PRs bypass the base-branch gate. + vi.mocked(githubClient.getPR).mockResolvedValue({ + number: 42, + title: 'Stacked PR', + body: null, + state: 'open', + htmlUrl: 'https://github.com/owner/repo/pull/42', + headRef: 'feature/stacked-child', + headSha: 'sha123', + baseRef: 'feature/stacked-parent', + merged: false, + user: { login: 'cascade-impl' }, + }); + vi.mocked(githubClient.getCheckSuiteStatus).mockResolvedValue({ + allPassing: false, + totalCount: 1, + checkRuns: [{ name: 'test', status: 'completed', conclusion: 'failure' }], + }); + + const ctx: TriggerContext = { + project: mockProject, + source: 'github', + payload: makeFailurePayload(), + personaIdentities: mockPersonaIdentities, + }; + + const result = await trigger.handle(ctx); + + expect(result?.agentType).toBe('respond-to-ci'); + expect(result?.prNumber).toBe(42); + }); + it('returns a structured skip when PR not authored by any cascade persona', async () => { vi.mocked(githubClient.getPR).mockResolvedValue({ number: 42, @@ -570,7 +621,7 @@ describe('CheckSuiteFailureTrigger', () => { workItemId: 'abc123', workItemUrl: undefined, workItemTitle: undefined, - onBlocked: undefined, + onBlocked: expect.any(Function), coalesceKey: undefined, }); }); diff --git a/tests/unit/triggers/check-suite-success.test.ts b/tests/unit/triggers/check-suite-success.test.ts index 302eafd1d..60be682fd 100644 --- a/tests/unit/triggers/check-suite-success.test.ts +++ b/tests/unit/triggers/check-suite-success.test.ts @@ -14,7 +14,7 @@ vi.mock('../../../src/triggers/shared/trigger-check.js', () => mockTriggerCheckM vi.mock('../../../src/github/client.js', () => mockGitHubClientModule); -// Stub the Redis-backed dedup module so tests don't need a Redis connection. +// Stub the Redis-backed review dedup module so tests don't need a Redis connection. // Each `claim` resolves to true (success) by default; per-test overrides via // `mockClaimReviewDispatch.mockResolvedValueOnce(false)` simulate a duplicate. const mockClaimReviewDispatch = vi.fn().mockResolvedValue(true); @@ -26,6 +26,17 @@ vi.mock('../../../src/triggers/github/review-dispatch-dedup.js', () => ({ releaseReviewDispatch: (...args: unknown[]) => mockReleaseReviewDispatch(...args), })); +// Stub the Redis-backed respond-to-ci dedup module (used by dispatchRespondToCi +// called from the success handler's mixed-state fork). +const mockClaimRespondToCiDispatch = vi.fn().mockResolvedValue(true); +const mockReleaseRespondToCiDispatch = vi.fn().mockResolvedValue(undefined); +vi.mock('../../../src/triggers/github/respond-to-ci-dedup.js', () => ({ + buildRespondToCiDispatchKey: (owner: string, repo: string, prNumber: number, headSha: string) => + `${owner}/${repo}:${prNumber}:${headSha}`, + claimRespondToCiDispatch: (...args: unknown[]) => mockClaimRespondToCiDispatch(...args), + releaseRespondToCiDispatch: (...args: unknown[]) => mockReleaseRespondToCiDispatch(...args), +})); + import { githubClient } from '../../../src/github/client.js'; import { resetFixAttempts } from '../../../src/triggers/github/check-suite-failure.js'; import { CheckSuiteSuccessTrigger } from '../../../src/triggers/github/check-suite-success.js'; @@ -54,6 +65,8 @@ describe('CheckSuiteSuccessTrigger', () => { vi.mocked(lookupWorkItemForPR).mockResolvedValue('abc123'); mockClaimReviewDispatch.mockReset().mockResolvedValue(true); mockReleaseReviewDispatch.mockReset().mockResolvedValue(undefined); + mockClaimRespondToCiDispatch.mockReset().mockResolvedValue(true); + mockReleaseRespondToCiDispatch.mockReset().mockResolvedValue(undefined); resetFixAttempts(42); // Default: aggregate status reflects all checks passing. Tests that need // a mixed-state SHA override this per-case. From 76175c02ff7e96e14afc73c5923f773a39e9fff5 Mon Sep 17 00:00:00 2001 From: Cascade Bot Date: Mon, 11 May 2026 21:17:30 +0000 Subject: [PATCH 11/12] =?UTF-8?q?fix:=20address=20review=20feedback=20?= =?UTF-8?q?=E2=80=94=20failure=20defer=20recheck=20+=20extend=20dedup=20TT?= =?UTF-8?q?L?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two gaps closed from the 2026-05-11 review (nhopeatall, PR #1351): 1. check-suite-failure.ts — the `defer` branch now returns a deferredRecheck (30s delay, recheckKind: 'check-suite') instead of a plain skip(), matching the API-lag fix already applied to check-suite-success. Repro: final check_suite.completed webhook with conclusion=failure arrives while Actions API still reports a check as in_progress; no follow-up webhook fires, so the plain skip would permanently lose respond-to-ci dispatch. 2. respond-to-ci-dedup.ts — TTL extended from 2 min to 10 min to cover the full duplicate window: 30s deferred-recheck delay + up to 5min worker-slot wait (slotWaitTimeoutMs default) + buffer. The 2-min key could expire before a recheck job even started under backlog, allowing the recheck to reclaim the key and launch a second fix agent for the same PR+SHA. Co-Authored-By: Claude Sonnet 4.6 --- src/triggers/github/check-suite-failure.ts | 19 ++++++++++++++-- src/triggers/github/respond-to-ci-dedup.ts | 11 ++++++---- .../unit/triggers/check-suite-failure.test.ts | 22 +++++++++++++++++-- 3 files changed, 44 insertions(+), 8 deletions(-) diff --git a/src/triggers/github/check-suite-failure.ts b/src/triggers/github/check-suite-failure.ts index a2146a88e..f1f0b9c22 100644 --- a/src/triggers/github/check-suite-failure.ts +++ b/src/triggers/github/check-suite-failure.ts @@ -3,6 +3,7 @@ import type { TriggerContext, TriggerHandler, TriggerResult } from '../../types/ import { logger } from '../../utils/logging.js'; import { parseRepoFullName } from '../../utils/repo.js'; import { gateCascadePersona, requirePersonaIdentities } from '../shared/gates.js'; +import { buildDeferredRecheckResult } from '../shared/result-builders.js'; import { skip } from '../shared/skip.js'; import { checkTriggerEnabled } from '../shared/trigger-check.js'; import { decideCheckSuiteOutcome } from './check-suite-decision.js'; @@ -103,12 +104,26 @@ export class CheckSuiteFailureTrigger implements TriggerHandler { }); if (decision.action === 'defer') { - logger.info('Not all checks complete yet, waiting', { + // Same API-lag fix applied to check-suite-success (Bug 1, 2026-05-11): + // returning a plain skip() would rely on GitHub firing another + // check_suite.completed event, but when Actions API lags webhook + // delivery the API still shows a check as in_progress even after + // GitHub has already fired its final event. Schedule a deferred + // re-check so the trigger re-evaluates against fresh API state ~30s later. + const coalesceKey = `check-suite-failure:${owner}/${repo}:pr-${prNumber}:${headSha}`; + logger.info('Not all checks complete yet, scheduling deferred re-check', { + handler: this.name, prNumber, totalChecks: checkStatus.totalCount, incompleteChecks: decision.incompleteChecks, + coalesceKey, + delayMs: 30_000, + }); + return buildDeferredRecheckResult({ + delayMs: 30_000, + coalesceKey, + recheckKind: 'check-suite', }); - return skip(this.name, decision.message); } if (decision.action === 'skip') { if (decision.message.startsWith('All ')) { diff --git a/src/triggers/github/respond-to-ci-dedup.ts b/src/triggers/github/respond-to-ci-dedup.ts index 4885bf8e0..24e42f1ee 100644 --- a/src/triggers/github/respond-to-ci-dedup.ts +++ b/src/triggers/github/respond-to-ci-dedup.ts @@ -26,10 +26,13 @@ import { routerConfig } from '../../router/config.js'; import { captureException } from '../../sentry.js'; import { logger } from '../../utils/logging.js'; -// 2 minutes — covers the 30 s deferred-recheck window with generous buffer. -// Kept shorter than the review-dispatch TTL (5 min) because the dedup scope -// is narrower: we only need to block the one pending recheck job. -const DEDUP_TTL_SEC = 2 * 60; +// 10 minutes — must cover the full duplicate window: 30 s deferred-recheck +// delay + up to 5 min waiting for a worker slot (slotWaitTimeoutMs default) +// + buffer for active dispatch and edge cases. The previous 2-min TTL could +// expire before a recheck job even started under backlog, allowing the recheck +// to claim the key and launch a second fix agent for the same PR+SHA. +// See review comment 2026-05-11 (nhopeatall) on PR #1351 for the failure mode. +const DEDUP_TTL_SEC = 10 * 60; const KEY_NS = 'cascade:respond-to-ci-dedup:'; diff --git a/tests/unit/triggers/check-suite-failure.test.ts b/tests/unit/triggers/check-suite-failure.test.ts index a9e4201a5..67103e80a 100644 --- a/tests/unit/triggers/check-suite-failure.test.ts +++ b/tests/unit/triggers/check-suite-failure.test.ts @@ -451,7 +451,12 @@ describe('CheckSuiteFailureTrigger', () => { expect(result?.agentInput.workItemId).toBeUndefined(); }); - it('returns a structured skip when not all checks are complete', async () => { + // API-lag fix (same as check-suite-success Bug 1, 2026-05-11): + // when the Actions API reports a check as in_progress even after the + // final check_suite.completed event, a plain skip would wait for a + // follow-up webhook that GitHub has already sent. The handler now + // schedules a deferred re-check so it re-evaluates against fresh state. + it('returns deferredRecheck (not plain skip) when not all checks are complete', async () => { vi.mocked(githubClient.getPR).mockResolvedValue({ number: 42, title: 'Test PR', @@ -482,7 +487,20 @@ describe('CheckSuiteFailureTrigger', () => { const result = await trigger.handle(ctx); - expectSkip(result, /Not all checks complete yet.*test/); + // No agent dispatch — wait for the deferred re-check. + expect(result?.agentType).toBeNull(); + expect(result?.deferredRecheck).toBeDefined(); + expect(result?.deferredRecheck?.delayMs).toBe(30_000); + // coalesceKey must include owner/repo + PR number + head SHA. + expect(result?.deferredRecheck?.coalesceKey).toBe( + 'check-suite-failure:owner/repo:pr-42:sha123', + ); + // recheckKind must be 'check-suite' so the router stamps + // checkSuiteRecheckAttempt (not mergeabilityRecheckAttempt) on the + // delayed job, enabling safe rescheduling if still stale. + expect(result?.deferredRecheck?.recheckKind).toBe('check-suite'); + // Dedup must NOT be claimed for the deferred event. + expect(result?.onBlocked).toBeUndefined(); }); it('returns a structured skip when all checks actually passed (no failures)', async () => { From e4b6929cc23e2d610875b5465404e90c0a24e4b0 Mon Sep 17 00:00:00 2001 From: Cascade Bot Date: Mon, 11 May 2026 21:36:00 +0000 Subject: [PATCH 12/12] fix: extend respond-to-ci dedup TTL to cover full worker window + update docs Increases DEDUP_TTL_SEC from 10 min to 35 min to cover the full 30-minute workerTimeoutMs window (+ 5-min buffer). The 10-min key could expire while the original respond-to-ci job was still running, allowing a check-suite recheck to claim the same PR+SHA and launch a duplicate fix agent. Also updates AGENTS.md/CLAUDE.md, src/triggers/README.md, and docs/architecture/{01,02,03,10} to document the two deferred-recheck kinds (mergeabilityRecheckAttempt one-shot vs checkSuiteRecheckAttempt safe-rescheduling), replacing stale descriptions that only mentioned the mergeability path. Co-Authored-By: Claude Sonnet 4.6 --- CLAUDE.md | 2 +- docs/architecture/01-services.md | 2 +- docs/architecture/02-webhook-pipeline.md | 2 +- docs/architecture/03-trigger-system.md | 9 ++++++-- docs/architecture/10-resilience.md | 7 ++++-- src/triggers/README.md | 2 +- src/triggers/github/respond-to-ci-dedup.ts | 25 ++++++++++++++++------ 7 files changed, 35 insertions(+), 14 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 51d4ff117..f7eaf6134 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -123,7 +123,7 @@ Some triggers take params (e.g. `review` + `scm:check-suite-success` accepts `{" **Post-completion review dispatch** — when an implementation agent succeeds with a PR, the execution pipeline checks CI status and fires the review agent deterministically (before the container exits). This guarantees review dispatch within seconds of implementation completion, regardless of GitHub webhook timing. Uses the same `claimReviewDispatch` dedup key as the `check-suite-success` trigger, so the two paths cannot double-enqueue. -**Deferred re-check** — a trigger handler can return `TriggerResult.deferredRecheck: { delayMs, coalesceKey }` (with `agentType: null`) to schedule a bare delayed job via `scheduleCoalescedJob`. The router scheduling is adapter-agnostic, but **bare re-dispatch is currently GitHub-only**: `GitHubRouterAdapter.buildJob()` strips `triggerResult` from the job so the GitHub worker re-dispatches through the trigger registry for fresh provider state. Non-GitHub adapters (Trello, JIRA, Linear, Sentry) embed `triggerResult` in the job; their workers pass it to `resolveTriggerResult()`, which returns the pre-resolved `agentType: null` result without re-dispatching — a non-GitHub handler using this field would schedule a job that reuses the same result rather than re-evaluating provider state. The worker detects GitHub re-check jobs via `GitHubJob.mergeabilityRecheckAttempt` and Sentry-captures under tag `mergeability_recheck_exhausted` when state still cannot resolve. Workers do not re-queue — a second `deferredRecheck` return exits gracefully. +**Deferred re-check** — a trigger handler can return `TriggerResult.deferredRecheck: { delayMs, coalesceKey, recheckKind? }` (with `agentType: null`) to schedule a bare delayed job via `scheduleCoalescedJob`. The router scheduling is adapter-agnostic, but **bare re-dispatch is currently GitHub-only**: `GitHubRouterAdapter.buildJob()` strips `triggerResult` from the job so the GitHub worker re-dispatches through the trigger registry for fresh provider state. Non-GitHub adapters (Trello, JIRA, Linear, Sentry) embed `triggerResult` in the job; their workers pass it to `resolveTriggerResult()`, which returns the pre-resolved `agentType: null` result without re-dispatching — a non-GitHub handler using this field would schedule a job that reuses the same result rather than re-evaluating provider state. There are two recheck kinds, controlled by the optional `recheckKind` field on `deferredRecheck`: **mergeability re-check** (no `recheckKind`, sets `mergeabilityRecheckAttempt: 1` on the job) — one-shot; if the re-check still cannot resolve state, the worker Sentry-captures under `mergeability_recheck_exhausted` and stops without re-queueing. **Check-suite re-check** (`recheckKind: 'check-suite'`, sets `checkSuiteRecheckAttempt: 1` on the job) — safe rescheduling; if the Actions API is still stale when the job fires, the worker reschedules another coalesced delayed job instead of exhausting, so review/respond-to-ci dispatch stays alive until the API catches up. Used by `check-suite-success` and `check-suite-failure` handlers for the Actions-API-lag case (ucho PR #394/MNG-683, 2026-05-11). **Worker exit diagnostics** — when a worker container exits non-zero, the router calls `container.inspect()` *before* AutoRemove reaps it and stamps the run record's `error` field with a structured, grep-stable string: `Worker crashed with exit code N · OOMKilled= · reason=""`. The `OOMKilled=true` marker is the definitive cgroup-OOM signal (per Docker's own `State.OOMKilled`); a 137 exit *without* `OOMKilled=true` means the kill came from inside the container or from a non-cgroup signal — *not* memory. The `[WorkerManager] Resolved spawn settings` log emitted at every spawn includes both `projectWatchdogTimeoutMs` and `globalWorkerTimeoutMs` so post-mortems can confirm whether the per-project override actually won. See `src/router/active-workers.ts:formatCrashReason` for the format and `tests/unit/router/container-manager-diagnostics.test.ts` for regression pins. diff --git a/docs/architecture/01-services.md b/docs/architecture/01-services.md index a95eb1522..9ee911e60 100644 --- a/docs/architecture/01-services.md +++ b/docs/architecture/01-services.md @@ -100,7 +100,7 @@ The router passes job data to workers via Docker container env vars: |----------|---------| | `JOB_ID` | Unique job identifier | | `JOB_TYPE` | `trello`, `github`, `jira`, `linear`, `sentry`, `manual-run`, `retry-run`, `debug-analysis` | -| `JOB_DATA` | JSON-encoded job payload; GitHub jobs include `mergeabilityRecheckAttempt` in this payload for deferred re-checks | +| `JOB_DATA` | JSON-encoded job payload; GitHub jobs include `mergeabilityRecheckAttempt` (mergeability re-check) or `checkSuiteRecheckAttempt` (check-suite Actions-API-lag re-check) in this payload for deferred re-checks | | `CASCADE_CREDENTIAL_KEYS` | Comma-separated list of credential env var names | | Individual credential vars | Pre-loaded project credentials (e.g., `GITHUB_TOKEN_IMPLEMENTER`) | diff --git a/docs/architecture/02-webhook-pipeline.md b/docs/architecture/02-webhook-pipeline.md index fd87b5b48..fda4e4ef3 100644 --- a/docs/architecture/02-webhook-pipeline.md +++ b/docs/architecture/02-webhook-pipeline.md @@ -147,7 +147,7 @@ Structured skip is intentionally different from bare `null`: it preserves the ha PM status-change dispatches can include a `coalesceKey`, normally `${projectId}:${workItemId}`. When `PM_COALESCE_WINDOW_MS` is positive, the router schedules a delayed job via `scheduleCoalescedJob`; a newer dispatch with the same key supersedes the pending one and releases the superseded job's in-memory locks. PM ack comments are deferred to job fire time for coalesced jobs so superseded work does not leave orphan comments. -Deferred re-check also uses `scheduleCoalescedJob` and exits without dispatch locks or an ack comment. The bare re-dispatch on job fire is currently **GitHub-only**: `GitHubRouterAdapter.buildJob()` strips `triggerResult` and sets `mergeabilityRecheckAttempt: 1`, so the GitHub worker re-dispatches through the trigger registry to evaluate fresh provider state. Non-GitHub adapters (Trello, JIRA, Linear, Sentry) embed `triggerResult` in the job regardless of `deferredRecheck`, so their workers return the pre-resolved `agentType: null` result directly without re-dispatching. If a deferred re-check schedule call fails, the router captures Sentry under `deferred_recheck_schedule_failure` and still returns `Deferred re-check scheduled` — it does not call `onBlocked`. GitHub mergeability uses this when `mergeable` is still `null` after the synchronous retry budget; if the re-check still cannot resolve state, the GitHub worker records `mergeability_recheck_exhausted` and stops rather than re-queueing indefinitely. +Deferred re-check also uses `scheduleCoalescedJob` and exits without dispatch locks or an ack comment. The bare re-dispatch on job fire is currently **GitHub-only**: `GitHubRouterAdapter.buildJob()` strips `triggerResult` and stamps a re-check field determined by the optional `recheckKind` discriminator on `deferredRecheck`. Two kinds exist: **mergeability re-check** (no `recheckKind`, stamps `mergeabilityRecheckAttempt: 1`) — one-shot; if the re-check still cannot resolve state the worker records `mergeability_recheck_exhausted` and stops. **Check-suite re-check** (`recheckKind: 'check-suite'`, stamps `checkSuiteRecheckAttempt: 1`) — safe rescheduling; when the Actions API is still stale the worker reschedules another coalesced delayed job so dispatch stays alive until the API catches up (used for Actions-API-lag). Non-GitHub adapters (Trello, JIRA, Linear, Sentry) embed `triggerResult` in the job regardless of `deferredRecheck`, so their workers return the pre-resolved `agentType: null` result directly without re-dispatching. If a deferred re-check schedule call fails, the router captures Sentry under `deferred_recheck_schedule_failure` and still returns `Deferred re-check scheduled` — it does not call `onBlocked`. ### Concurrency controls diff --git a/docs/architecture/03-trigger-system.md b/docs/architecture/03-trigger-system.md index 6352548be..39e1a1d29 100644 --- a/docs/architecture/03-trigger-system.md +++ b/docs/architecture/03-trigger-system.md @@ -172,9 +172,14 @@ The router preserves structured skips in webhook logs with `Trigger sk ### Deferred re-checks -Handlers that cannot make a final decision yet can return `deferredRecheck: { delayMs, coalesceKey }` with `agentType: null`. The router schedules a coalesced delayed BullMQ job and exits without spawning an agent. GitHub mergeability checks use this path; the worker recognizes re-check jobs via `mergeabilityRecheckAttempt` and captures a Sentry diagnostic if the second pass still cannot resolve state. +Handlers that cannot make a final decision yet can return `deferredRecheck: { delayMs, coalesceKey, recheckKind? }` with `agentType: null`. The router schedules a coalesced delayed BullMQ job and exits without spawning an agent. -The bare re-dispatch on job fire is currently **GitHub-only**: `GitHubRouterAdapter.buildJob()` strips `triggerResult` and sets `mergeabilityRecheckAttempt: 1`, so the GitHub worker re-dispatches through the trigger registry to evaluate fresh provider state. Non-GitHub adapters (Trello, JIRA, Linear, Sentry) embed `triggerResult` in the job regardless of `deferredRecheck`; `resolveTriggerResult()` returns the pre-resolved result directly, skipping registry dispatch. A non-GitHub handler returning `buildDeferredRecheckResult` would therefore schedule a job that reuses the same `agentType: null` result rather than re-evaluating provider state. See `src/triggers/README.md` for the full authoring contract. Workers do not schedule another re-check after exhaustion. +The bare re-dispatch on job fire is currently **GitHub-only**: `GitHubRouterAdapter.buildJob()` strips `triggerResult` and stamps the right re-check field based on the optional `recheckKind` discriminator. Two re-check kinds exist: + +- **Mergeability re-check** (`recheckKind` absent) — `mergeabilityRecheckAttempt: 1` is set on the job. The GitHub worker re-dispatches through the registry for fresh provider state. If the re-check still cannot resolve state, the worker Sentry-captures under `mergeability_recheck_exhausted` and stops (one-shot — no further rescheduling). +- **Check-suite re-check** (`recheckKind: 'check-suite'`) — `checkSuiteRecheckAttempt: 1` is set on the job. If the Actions API is still stale when the job fires, the worker reschedules another coalesced delayed job instead of exhausting, so review/respond-to-ci dispatch stays alive until the API catches up. Used by `check-suite-success` and `check-suite-failure` for the Actions-API-lag case (ucho PR #394/MNG-683, 2026-05-11). + +Non-GitHub adapters (Trello, JIRA, Linear, Sentry) embed `triggerResult` in the job regardless of `deferredRecheck`; `resolveTriggerResult()` returns the pre-resolved result directly, skipping registry dispatch. A non-GitHub handler returning `buildDeferredRecheckResult` would therefore schedule a job that reuses the same `agentType: null` result rather than re-evaluating provider state. See `src/triggers/README.md` for the full authoring contract. ### Config resolution diff --git a/docs/architecture/10-resilience.md b/docs/architecture/10-resilience.md index a2d5020e7..b0713e315 100644 --- a/docs/architecture/10-resilience.md +++ b/docs/architecture/10-resilience.md @@ -97,9 +97,12 @@ This split prevents both classes of failure from wedging a work item for the loc ### Deferred re-check exhaustion -Some provider state is eventually consistent and has no follow-up webhook. A trigger can return `TriggerResult.deferredRecheck` with `agentType: null`; the router schedules a coalesced delayed bare job and does not take normal dispatch locks. The bare re-dispatch on job fire is currently **GitHub-only**: `GitHubRouterAdapter.buildJob()` strips `triggerResult` and sets `mergeabilityRecheckAttempt: 1`, so the GitHub worker re-dispatches through the registry to get fresh provider state. Non-GitHub adapters (Trello, JIRA, Linear, Sentry) embed `triggerResult` in the job; their workers return the pre-resolved `agentType: null` result directly without re-dispatching through the registry. +Some provider state is eventually consistent and has no follow-up webhook. A trigger can return `TriggerResult.deferredRecheck` (with `agentType: null` and an optional `recheckKind`); the router schedules a coalesced delayed bare job and does not take normal dispatch locks. The bare re-dispatch on job fire is currently **GitHub-only**: `GitHubRouterAdapter.buildJob()` strips `triggerResult` and stamps the right re-check field. Two re-check kinds exist: -GitHub mergeability uses this for `pull_request` events where `mergeable === null`. If the deferred job still gets another deferred result, workers do not schedule a second re-check. The GitHub worker emits a WARN and captures to Sentry with tag `mergeability_recheck_exhausted`, making pathological provider latency visible without creating an infinite retry loop. +- **Mergeability re-check** (`recheckKind` absent, sets `mergeabilityRecheckAttempt: 1`) — used for `pull_request` events where `mergeable === null`. One-shot: if the deferred job still gets another deferred result, the worker does not schedule a second re-check; it emits a WARN and captures to Sentry with tag `mergeability_recheck_exhausted`. +- **Check-suite re-check** (`recheckKind: 'check-suite'`, sets `checkSuiteRecheckAttempt: 1`) — used by `check-suite-success` and `check-suite-failure` when the Actions API lags webhook delivery. Safe rescheduling: if the deferred job still sees a stale result, the worker reschedules another coalesced delayed job instead of exhausting. This keeps review/respond-to-ci dispatch alive until the API catches up without risk of infinite loop (coalesceKey deduplicates concurrent rechecks). + +Non-GitHub adapters (Trello, JIRA, Linear, Sentry) embed `triggerResult` in the job; their workers return the pre-resolved `agentType: null` result directly without re-dispatching through the registry. ### Wedged-lock canary diff --git a/src/triggers/README.md b/src/triggers/README.md index c2f61fc40..7c4d7cc7f 100644 --- a/src/triggers/README.md +++ b/src/triggers/README.md @@ -135,7 +135,7 @@ This distinction prevents "No trigger matched for event" from hiding real handle `buildDeferredRecheckResult` is currently a **GitHub-only** contract for bare re-dispatch. The router always schedules the delayed job via `scheduleCoalescedJob` and exits without taking dispatch locks or posting an ack — that part is generic. What differs is what the worker does when the job fires: -- **GitHub adapter** — `GitHubRouterAdapter.buildJob()` strips `triggerResult` from the job and sets `mergeabilityRecheckAttempt: 1`. The GitHub worker detects the bare job and re-dispatches through the registry to evaluate fresh provider state. If the re-check still cannot resolve state, the GitHub worker emits `mergeability_recheck_exhausted` and stops without re-queueing. +- **GitHub adapter** — `GitHubRouterAdapter.buildJob()` strips `triggerResult` from the job. The recheck kind is controlled by the optional `recheckKind` field on `deferredRecheck`: if absent (mergeability re-check), `mergeabilityRecheckAttempt: 1` is set and the worker emits `mergeability_recheck_exhausted` if the re-check still cannot resolve state (one-shot, no re-queueing); if `recheckKind === 'check-suite'`, `checkSuiteRecheckAttempt: 1` is set and the worker safely reschedules another coalesced delayed job when the Actions API is still stale, so dispatch stays alive until the API catches up. `check-suite-success` and `check-suite-failure` use the `check-suite` kind for the Actions-API-lag case. - **Non-GitHub adapters (Trello, JIRA, Linear, Sentry)** — these adapters always embed `triggerResult` in the job regardless of `deferredRecheck`. When the job fires, `resolveTriggerResult()` receives the pre-resolved result and returns it directly, skipping registry dispatch. A non-GitHub handler returning `buildDeferredRecheckResult` would therefore schedule a job that re-uses the same `agentType: null` result instead of re-evaluating provider state. Use this builder only in GitHub handlers unless the adapter and worker for your provider have been updated to support bare re-dispatch. diff --git a/src/triggers/github/respond-to-ci-dedup.ts b/src/triggers/github/respond-to-ci-dedup.ts index 24e42f1ee..c34de14e1 100644 --- a/src/triggers/github/respond-to-ci-dedup.ts +++ b/src/triggers/github/respond-to-ci-dedup.ts @@ -26,13 +26,26 @@ import { routerConfig } from '../../router/config.js'; import { captureException } from '../../sentry.js'; import { logger } from '../../utils/logging.js'; -// 10 minutes — must cover the full duplicate window: 30 s deferred-recheck +// 35 minutes — must cover the full duplicate window: 30 s deferred-recheck // delay + up to 5 min waiting for a worker slot (slotWaitTimeoutMs default) -// + buffer for active dispatch and edge cases. The previous 2-min TTL could -// expire before a recheck job even started under backlog, allowing the recheck -// to claim the key and launch a second fix agent for the same PR+SHA. -// See review comment 2026-05-11 (nhopeatall) on PR #1351 for the failure mode. -const DEDUP_TTL_SEC = 10 * 60; +// + up to 30 min of active worker execution (workerTimeoutMs default) + buffer. +// +// The critical scenario this prevents: +// 1. Failure webhook fires → key claimed → respond-to-ci dispatched (starts running). +// 2. Success-side deferred recheck fires 30 s later → key taken → skips. ✓ +// 3. respond-to-ci worker runs for e.g. 25 min. +// 4. With a 10-min TTL, the key has already expired by now. +// 5. A second delayed check-suite recheck fires → key NOT taken → would +// dispatch ANOTHER fix agent for the same PR+SHA. ✗ +// +// By setting the TTL to 35 min (30-min workerTimeoutMs + 5-min buffer), the key +// stays alive for the entire execution window of the original respond-to-ci job. +// After that job exits and the work-item lock is released, the next legitimate +// trigger can go through the router's own lock mechanism. +// +// Matches the work-item lock TTL (30 min, src/router/work-item-lock.ts) so the +// two mechanisms age out together. +const DEDUP_TTL_SEC = 35 * 60; const KEY_NS = 'cascade:respond-to-ci-dedup:';