diff --git a/.changeset/gain-drift-detector.md b/.changeset/gain-drift-detector.md new file mode 100644 index 0000000..9bcdda8 --- /dev/null +++ b/.changeset/gain-drift-detector.md @@ -0,0 +1,21 @@ +--- +'colonyq': minor +'@colony/storage': minor +'@colony/core': minor +'@colony/mcp-server': minor +--- + +`colony gain drift` and a matching `savings_drift_report` MCP tool flag +tools whose median tokens-per-call has drifted up or down. Default windows +are non-overlapping: recent = last 3 days, baseline = 14 days ending 3 days +before recent. Default thresholds: `--threshold 1.25` (up), `--down-threshold +0.75`, `--min-calls 20` per window. Classifications: `up_drift`, +`down_drift`, `new_tool` (no baseline), `gone` (no recent), `insufficient_data`, +`stable`. + +Storage gains `Storage.mcpTokenDriftPerOperation()` which computes per-operation +medians with a `ROW_NUMBER() OVER (PARTITION BY operation ORDER BY tpc)` +window function — chosen over the correlated `LIMIT 1 OFFSET (COUNT-1)/2` +form because SQLite forbids outer aggregate references in scalar-subquery +`OFFSET`. A `mcpMetricsMinTs()` helper surfaces a one-line warning when the +baseline window starts before the first recorded metric. 
diff --git a/apps/cli/src/commands/gain.ts b/apps/cli/src/commands/gain.ts index 3864a6e..4685957 100644 --- a/apps/cli/src/commands/gain.ts +++ b/apps/cli/src/commands/gain.ts @@ -5,6 +5,8 @@ import { type SavingsLiveComparisonCost, type SavingsReferenceRow, type SavingsReferenceTotals, + classifyDrift, + type DriftReport, savingsLiveComparison, savingsLiveComparisonCost, savingsReferenceTotals, @@ -40,6 +42,16 @@ interface GainOptions { topOps?: string; } +interface GainDriftOptions { + json?: boolean; + baselineDays?: string; + recentDays?: string; + minCalls?: string; + threshold?: string; + downThreshold?: string; + operation?: string; +} + export interface MoverRow { operation: string; recent_calls: number; @@ -78,7 +90,7 @@ interface TopErrorReason { } export function registerGainCommand(program: Command): void { - program + const gain = program .command('gain') .description('Show colony token/cost savings from live mcp_metrics receipts') .option('--json', 'emit structured JSON') @@ -250,6 +262,91 @@ export function registerGainCommand(program: Command): void { movers, ); }); + + gain + .command('drift') + .description( + 'Flag tools whose median tokens-per-call has drifted vs a baseline window (no schema change)', + ) + .option('--baseline-days <days>', 'baseline window length in days (default 14)') + .option('--recent-days <days>', 'recent window length in days (default 3)') + .option('--min-calls <n>', 'minimum sample size per window to trust signal (default 20)') + .option('--threshold <ratio>', 'up-drift trigger ratio (default 1.25 = +25%)') + .option('--down-threshold <ratio>', 'down-drift trigger ratio (default 0.75 = -25%)') + .option('--operation <name>', 'show only this operation row in the table') + .option('--json', 'emit structured JSON') + .action(async (opts: GainDriftOptions) => { + const settings = loadSettings(); + const baselineDays = parsePositiveFloat(opts.baselineDays) ?? 14; + const recentDays = parsePositiveFloat(opts.recentDays) ?? 
3; + const minCalls = parsePositiveInt(opts.minCalls) ?? 20; + const threshold = parsePositiveFloat(opts.threshold) ?? 1.25; + const downThreshold = parsePositiveFloat(opts.downThreshold) ?? 0.75; + + const now = Date.now(); + const recentSince = now - recentDays * 24 * 60 * 60_000; + // 3-day gap between recent and baseline so day-of-week noise does not + // bleed across — see spec/v0.x roadmap. + const baselineUntil = recentSince - 3 * 24 * 60 * 60_000; + const baselineSince = baselineUntil - baselineDays * 24 * 60 * 60_000; + const recentUntil = now; + + const { rawRows, minTs } = await withStorage( + settings, + (storage) => { + const allRows = storage.mcpTokenDriftPerOperation({ + baseline_since: baselineSince, + baseline_until: baselineUntil, + recent_since: recentSince, + recent_until: recentUntil, + }); + const filtered = + opts.operation !== undefined + ? allRows.filter((row) => row.operation === opts.operation) + : allRows; + return { rawRows: filtered, minTs: storage.mcpMetricsMinTs() }; + }, + { readonly: true }, + ); + + const report = classifyDrift(rawRows, { + threshold, + down_threshold: downThreshold, + min_calls: minCalls, + }); + + const baselineWarning = + minTs !== null && minTs > baselineSince + ? `baseline window starts before first recorded metric — drift detection needs ~${ + Math.ceil((recentDays + baselineDays + 3) - (now - minTs) / (24 * 60 * 60_000)) + } more day${baselineDays > 1 ? 's' : ''} of history` + : null; + + if (opts.json === true) { + const payload = { + window: { + baseline_since: baselineSince, + baseline_until: baselineUntil, + recent_since: recentSince, + recent_until: recentUntil, + }, + threshold: report.threshold, + rows: report.rows, + new_tools: report.new_tools, + gone_tools: report.gone_tools, + insufficient_data: report.insufficient_data, + ...(baselineWarning !== null ? 
{ warning: baselineWarning } : {}), + }; + process.stdout.write(`${JSON.stringify(payload, null, 2)}\n`); + return; + } + + writeDriftReport(report, { + recentDays, + baselineDays, + baselineWarning, + }); + }); } export function writeGainReport( @@ -1175,6 +1272,12 @@ function parsePositiveInt(raw: string | undefined): number | undefined { return Number.isFinite(parsed) && parsed > 0 ? Math.floor(parsed) : undefined; } +function parsePositiveFloat(raw: string | undefined): number | undefined { + if (raw === undefined || raw.trim() === '') return undefined; + const parsed = Number(raw); + return Number.isFinite(parsed) && parsed > 0 ? parsed : undefined; +} + // rtk-style proportional bar. The row whose value equals `max` gets a full // bar; smaller rows scale linearly. Empty when max <= 0. export function renderImpactBar(value: number, max: number, width: number): string { @@ -1586,3 +1689,132 @@ function colorByEfficiency(pct: number, text: string): string { if (pct >= 40) return kleur.yellow().bold(text); return kleur.red().bold(text); } + +export interface DriftReportInput { + recentDays: number; + baselineDays: number; + baselineWarning: string | null; +} + +// Plaintext rendering for `colony gain drift`. Mirrors the gain layout +// (kleur-colored, padded columns). Drift-classified rows print first; +// new/gone/insufficient sets get one-line summaries underneath so a +// quick scan answers "is anything regressing?" without re-running. 
+export function writeDriftReport(report: DriftReport, input: DriftReportInput): void { + const w = process.stdout; + const { recentDays, baselineDays, baselineWarning } = input; + w.write( + `${kleur.bold( + `colony gain drift (recent ${formatDaysLabel(recentDays)} vs baseline ${formatDaysLabel( + baselineDays, + )})`, + )}\n`, + ); + w.write( + kleur.dim( + `Thresholds: up >= ${report.threshold.up.toFixed(2)}x, down <= ${report.threshold.down.toFixed( + 2, + )}x, min ${report.threshold.min_calls} calls per window.\n`, + ), + ); + if (baselineWarning !== null) { + w.write(`${kleur.yellow('[warn] ')}${baselineWarning}\n`); + } + const tableRows = report.rows.filter( + (row) => + row.classification === 'up_drift' || + row.classification === 'down_drift' || + row.classification === 'stable', + ); + if (tableRows.length === 0 && report.new_tools.length === 0 && report.gone_tools.length === 0) { + w.write(kleur.dim('No operations had enough samples in both windows.\n')); + if (report.insufficient_data.length > 0) { + writeDriftInsufficient(report); + } + return; + } + if (tableRows.length > 0) { + const widths = [24, 13, 11, 8, 7, 7, 18]; + const head = padRow( + ['Operation', 'Baseline med', 'Recent med', 'Ratio', 'n_base', 'n_rec', 'Class'], + widths, + ); + w.write(`${kleur.dim(head)}\n`); + // Up-drift first (most urgent), then down, then stable. Within each + // bucket keep the storage-emitted alphabetical order so output is + // deterministic for tests. + const ordered = [ + ...tableRows.filter((row) => row.classification === 'up_drift'), + ...tableRows.filter((row) => row.classification === 'down_drift'), + ...tableRows.filter((row) => row.classification === 'stable'), + ]; + for (const row of ordered) { + const cells = [ + truncate(row.operation, widths[0] ?? 24), + formatTokens(row.baseline_median ?? 0), + formatTokens(row.recent_median ?? 
0), + formatDriftRatio(row.ratio, row.classification), + formatInt(row.baseline_n), + formatInt(row.recent_n), + formatDriftClass(row.classification), + ]; + w.write(`${padRow(cells, widths)}\n`); + } + } + if (report.insufficient_data.length > 0) { + writeDriftInsufficient(report); + } + if (report.new_tools.length > 0) { + w.write( + `${kleur.dim('New tools (no baseline):')} ${report.new_tools.join(', ')}\n`, + ); + } + if (report.gone_tools.length > 0) { + w.write( + `${kleur.dim('Gone tools (no recent calls):')} ${report.gone_tools.join(', ')}\n`, + ); + } +} + +function writeDriftInsufficient(report: DriftReport): void { + const names = report.insufficient_data + .map((row) => row.operation) + .slice(0, 12) + .join(', '); + const more = report.insufficient_data.length > 12 + ? `, +${report.insufficient_data.length - 12} more` + : ''; + process.stdout.write( + `${kleur.dim(`Insufficient data (n<${report.threshold.min_calls}):`)} ${names}${more}\n`, + ); +} + +function formatDriftRatio(ratio: number | null, classification: DriftReport['rows'][number]['classification']): string { + if (ratio === null) return '-'; + const rounded = ratio >= 10 ? 
ratio.toFixed(1) : ratio.toFixed(2); + if (classification === 'up_drift') return kleur.red(`▲${rounded}x`); + if (classification === 'down_drift') return kleur.green(`▼${rounded}x`); + return `${rounded}x`; +} + +function formatDriftClass(classification: DriftReport['rows'][number]['classification']): string { + switch (classification) { + case 'up_drift': + return kleur.red('up_drift'); + case 'down_drift': + return kleur.green('down_drift'); + case 'stable': + return kleur.dim('stable'); + case 'new_tool': + return kleur.cyan('new_tool'); + case 'gone': + return kleur.dim('gone'); + case 'insufficient_data': + return kleur.dim('insufficient'); + } +} + +function formatDaysLabel(days: number): string { + if (Number.isInteger(days)) return `${days}d`; + return `${days.toFixed(1)}d`; +} diff --git a/apps/cli/test/gain-drift.test.ts b/apps/cli/test/gain-drift.test.ts new file mode 100644 index 0000000..b1a394d --- /dev/null +++ b/apps/cli/test/gain-drift.test.ts @@ -0,0 +1,175 @@ +import { classifyDrift, type DriftRawRow } from '@colony/core'; +import { afterEach, describe, expect, it, vi } from 'vitest'; +import { writeDriftReport } from '../src/commands/gain.js'; + +// kleur emits ANSI when COLORTERM is set (Anthropic harness frequently +// flips this on). Strip escapes so substring checks hold regardless of +// the local color mode. 
+const ANSI_RE = new RegExp(`${String.fromCharCode(27)}\\[[0-9;]*m`, 'g'); +const stripAnsi = (chunk: string | Uint8Array): string => String(chunk).replace(ANSI_RE, ''); + +function capture(): { read: () => string } { + let buf = ''; + vi.spyOn(process.stdout, 'write').mockImplementation((chunk: string | Uint8Array) => { + buf += stripAnsi(chunk); + return true; + }); + return { read: () => buf }; +} + +describe('writeDriftReport', () => { + afterEach(() => { + vi.restoreAllMocks(); + }); + + it('renders header, threshold note, and an up-drift row first', () => { + const rows: DriftRawRow[] = [ + { + operation: 'search', + baseline_median: 400, + baseline_n: 2_140, + recent_median: 580, + recent_n: 412, + }, + { + operation: 'task_post', + baseline_median: 180, + baseline_n: 980, + recent_median: 182, + recent_n: 198, + }, + ]; + const report = classifyDrift(rows, { + threshold: 1.25, + down_threshold: 0.75, + min_calls: 20, + }); + const out = capture(); + writeDriftReport(report, { recentDays: 3, baselineDays: 14, baselineWarning: null }); + const text = out.read(); + + expect(text).toContain('colony gain drift (recent 3d vs baseline 14d)'); + expect(text).toContain('Thresholds: up >= 1.25x, down <= 0.75x, min 20 calls'); + expect(text).toContain('Operation'); + expect(text).toContain('Baseline med'); + expect(text).toContain('Class'); + // up_drift row appears before stable row. 
+ const searchIdx = text.indexOf('search'); + const taskIdx = text.indexOf('task_post'); + expect(searchIdx).toBeGreaterThan(-1); + expect(taskIdx).toBeGreaterThan(searchIdx); + expect(text).toContain('up_drift'); + expect(text).toContain('stable'); + }); + + it('lists new tools, gone tools, and insufficient data under the table', () => { + const rows: DriftRawRow[] = [ + { + operation: 'savings_drift_report', + baseline_median: null, + baseline_n: 0, + recent_median: 220, + recent_n: 25, + }, + { + operation: 'legacy_search', + baseline_median: 300, + baseline_n: 50, + recent_median: null, + recent_n: 0, + }, + { + operation: 'suggest', + baseline_median: 80, + baseline_n: 3, + recent_median: 88, + recent_n: 18, + }, + ]; + const report = classifyDrift(rows, { + threshold: 1.25, + down_threshold: 0.75, + min_calls: 20, + }); + const out = capture(); + writeDriftReport(report, { recentDays: 3, baselineDays: 14, baselineWarning: null }); + const text = out.read(); + expect(text).toContain('New tools (no baseline): savings_drift_report'); + expect(text).toContain('Gone tools (no recent calls): legacy_search'); + expect(text).toContain('Insufficient data (n<20): suggest'); + }); + + it('emits a [warn] line when the baseline window predates the first receipt', () => { + const report = classifyDrift([], { + threshold: 1.25, + down_threshold: 0.75, + min_calls: 20, + }); + const out = capture(); + writeDriftReport(report, { + recentDays: 3, + baselineDays: 14, + baselineWarning: + 'baseline window starts before first recorded metric — drift detection needs ~5 more days of history', + }); + const text = out.read(); + expect(text).toContain('[warn]'); + expect(text).toContain('baseline window starts before first recorded metric'); + }); + + it('classifies a down-drift row separately and renders down_drift class', () => { + const rows: DriftRawRow[] = [ + { + operation: 'task_post', + baseline_median: 600, + baseline_n: 30, + recent_median: 300, + recent_n: 30, + }, + ]; + 
const report = classifyDrift(rows, { + threshold: 1.25, + down_threshold: 0.75, + min_calls: 20, + }); + expect(report.rows[0]?.classification).toBe('down_drift'); + expect(report.rows[0]?.ratio).toBeCloseTo(0.5, 6); + const out = capture(); + writeDriftReport(report, { recentDays: 3, baselineDays: 14, baselineWarning: null }); + expect(out.read()).toContain('down_drift'); + }); + + it('classifyDrift round-trips JSON-friendly shape', () => { + const rows: DriftRawRow[] = [ + { + operation: 'search', + baseline_median: 400, + baseline_n: 100, + recent_median: 600, + recent_n: 100, + }, + ]; + const report = classifyDrift(rows, { + threshold: 1.25, + down_threshold: 0.75, + min_calls: 20, + }); + expect(JSON.parse(JSON.stringify(report))).toEqual({ + threshold: { up: 1.25, down: 0.75, min_calls: 20 }, + rows: [ + { + operation: 'search', + baseline_median: 400, + baseline_n: 100, + recent_median: 600, + recent_n: 100, + ratio: 1.5, + classification: 'up_drift', + }, + ], + new_tools: [], + gone_tools: [], + insufficient_data: [], + }); + }); +}); diff --git a/apps/mcp-server/src/server.ts b/apps/mcp-server/src/server.ts index 93138b7..b71aee6 100644 --- a/apps/mcp-server/src/server.ts +++ b/apps/mcp-server/src/server.ts @@ -29,6 +29,7 @@ import * as recall from './tools/recall.js'; import * as relay from './tools/relay.js'; import * as rescue from './tools/rescue.js'; import * as savings from './tools/savings.js'; +import * as savingsDrift from './tools/savings-drift.js'; import * as search from './tools/search.js'; import * as spec from './tools/spec.js'; import * as startupPanel from './tools/startup-panel.js'; @@ -120,6 +121,7 @@ export function buildServer( suggest.register(server, ctx); rescue.register(server, ctx); savings.register(server, ctx); + savingsDrift.register(server, ctx); // Autopilot lane (tick advisor + drift checker). 
Cheap compositions of // existing primitives; registered after the core surface so the heartbeat diff --git a/apps/mcp-server/src/tools/savings-drift.ts b/apps/mcp-server/src/tools/savings-drift.ts new file mode 100644 index 0000000..e68a073 --- /dev/null +++ b/apps/mcp-server/src/tools/savings-drift.ts @@ -0,0 +1,136 @@ +import { classifyDrift } from '@colony/core'; +import type { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js'; +import { z } from 'zod'; +import { type ToolContext, defaultWrapHandler } from './context.js'; + +const DAY_MS = 24 * 60 * 60_000; + +/** + * Reports per-operation tokens-per-call drift across two non-overlapping + * windows (recent vs baseline) using only the existing mcp_metrics table. + * No schema change. + * + * Progressive disclosure: the response is the classified rows plus three + * one-line summary arrays (new_tools, gone_tools, insufficient_data) — no + * observation bodies, no per-call samples. Callers reach for + * savings_report for the wider per-operation view. + * + * Lives in its own file (rather than tools/drift.ts which already owns + * task_drift_check) so the file-edit-drift checker and the token-drift + * detector keep separate module surfaces. + */ +export function register(server: McpServer, ctx: ToolContext): void { + const wrapHandler = ctx.wrapHandler ?? defaultWrapHandler; + const { store } = ctx; + + server.tool( + 'savings_drift_report', + 'Flag tools whose median tokens-per-call has drifted up or down over a recent window vs a baseline window. Pure read path over mcp_metrics; no schema change. 
Pass baseline_days/recent_days to scope the windows, min_calls to set the sample-size guard, and threshold/down_threshold to tune up/down sensitivity.', + { + baseline_days: z + .number() + .positive() + .max(180) + .optional() + .describe('baseline window length in days; defaults to 14'), + recent_days: z + .number() + .positive() + .max(60) + .optional() + .describe('recent window length in days; defaults to 3'), + min_calls: z + .number() + .int() + .min(1) + .max(10_000) + .optional() + .describe('minimum sample size per window before flagging drift; defaults to 20'), + threshold: z + .number() + .positive() + .optional() + .describe('up-drift trigger ratio (recent_median / baseline_median); defaults to 1.25'), + down_threshold: z + .number() + .positive() + .optional() + .describe('down-drift trigger ratio; defaults to 0.75'), + operation: z + .string() + .min(1) + .optional() + .describe('filter rows by exact operation name (e.g. "search")'), + }, + wrapHandler( + 'savings_drift_report', + async ({ + baseline_days, + recent_days, + min_calls, + threshold, + down_threshold, + operation, + }) => { + const baselineDays = baseline_days ?? 14; + const recentDays = recent_days ?? 3; + const minCalls = min_calls ?? 20; + const up = threshold ?? 1.25; + const down = down_threshold ?? 0.75; + + const now = Date.now(); + const recentSince = now - recentDays * DAY_MS; + // 3-day gap mirrors the CLI: keeps day-of-week noise from bleeding + // across windows. + const baselineUntil = recentSince - 3 * DAY_MS; + const baselineSince = baselineUntil - baselineDays * DAY_MS; + const recentUntil = now; + + const allRows = store.storage.mcpTokenDriftPerOperation({ + baseline_since: baselineSince, + baseline_until: baselineUntil, + recent_since: recentSince, + recent_until: recentUntil, + }); + const filtered = + operation !== undefined + ? 
allRows.filter((row) => row.operation === operation) + : allRows; + + const report = classifyDrift(filtered, { + threshold: up, + down_threshold: down, + min_calls: minCalls, + }); + + const minTs = store.storage.mcpMetricsMinTs(); + const warning = + minTs !== null && minTs > baselineSince + ? 'baseline window starts before first recorded metric — drift detection needs more history before signals can be trusted' + : null; + + return { + content: [ + { + type: 'text', + text: JSON.stringify({ + window: { + baseline_since: baselineSince, + baseline_until: baselineUntil, + recent_since: recentSince, + recent_until: recentUntil, + }, + threshold: report.threshold, + rows: report.rows, + new_tools: report.new_tools, + gone_tools: report.gone_tools, + insufficient_data: report.insufficient_data, + ...(warning !== null ? { warning } : {}), + }), + }, + ], + }; + }, + ), + ); +} diff --git a/apps/mcp-server/test/server.test.ts b/apps/mcp-server/test/server.test.ts index 8d8ecea..58de97f 100644 --- a/apps/mcp-server/test/server.test.ts +++ b/apps/mcp-server/test/server.test.ts @@ -65,6 +65,7 @@ describe('MCP server', () => { 'recall_session', 'rescue_stranded_run', 'rescue_stranded_scan', + 'savings_drift_report', 'savings_report', 'search', 'semantic_search', @@ -419,6 +420,82 @@ describe('MCP server', () => { }); }); + it('savings_drift_report classifies up-drift from mcp_metrics receipts', async () => { + const DAY_MS = 24 * 60 * 60_000; + const now = Date.now(); + // Recent window (last 3 days): 25 search receipts @ ~200 tpc. + for (let i = 0; i < 25; i += 1) { + store.storage.recordMcpMetric({ + ts: now - i * 1_000, + operation: 'search', + input_bytes: 100, + output_bytes: 200, + input_tokens: 60, + output_tokens: 140, + duration_ms: 5, + ok: true, + }); + } + // Baseline window: starts after recent + 3-day gap. With default + // recent_days=3 and baseline_days=14, baseline is [now - 20d, now - 3d). 
+ // Plant baseline at now - 10d so both windows have data with no overlap. + const baselineTs = now - 10 * DAY_MS; + for (let i = 0; i < 25; i += 1) { + store.storage.recordMcpMetric({ + ts: baselineTs + i * 1_000, + operation: 'search', + input_bytes: 100, + output_bytes: 200, + input_tokens: 30, + output_tokens: 70, + duration_ms: 5, + ok: true, + }); + } + + const res = await client.callTool({ + name: 'savings_drift_report', + arguments: { baseline_days: 14, recent_days: 3, min_calls: 20 }, + }); + const text = (res.content as Array<{ type: string; text: string }>)[0]?.text ?? '{}'; + const payload = JSON.parse(text) as { + window: { + baseline_since: number; + baseline_until: number; + recent_since: number; + recent_until: number; + }; + threshold: { up: number; down: number; min_calls: number }; + rows: Array<{ + operation: string; + baseline_median: number | null; + recent_median: number | null; + baseline_n: number; + recent_n: number; + ratio: number | null; + classification: string; + }>; + new_tools: string[]; + gone_tools: string[]; + insufficient_data: Array<{ operation: string; baseline_n: number; recent_n: number }>; + }; + + expect(payload.window.recent_until).toBeGreaterThan(payload.window.recent_since); + expect(payload.window.baseline_until).toBeLessThan(payload.window.recent_since); + expect(payload.threshold).toEqual({ up: 1.25, down: 0.75, min_calls: 20 }); + + const search = payload.rows.find((row) => row.operation === 'search'); + expect(search).toBeDefined(); + expect(search?.baseline_median).toBe(100); + expect(search?.recent_median).toBe(200); + expect(search?.baseline_n).toBe(25); + expect(search?.recent_n).toBe(25); + expect(search?.classification).toBe('up_drift'); + expect(search?.ratio).toBeCloseTo(2, 6); + expect(payload.new_tools).toEqual([]); + expect(payload.gone_tools).toEqual([]); + }); + it('task_post reports a structured error for stale task ids', async () => { store.startSession({ id: 's-post', ide: 'test', cwd: '/tmp' }); 
diff --git a/docs/mcp.md b/docs/mcp.md index e992307..77cfb00 100644 --- a/docs/mcp.md +++ b/docs/mcp.md @@ -101,6 +101,7 @@ workflow guidance. | Rescue | `rescue_stranded_scan` | Dry-run stranded lane/claim rescue candidates. | | Rescue | `rescue_stranded_run` | Emit rescue relays and release abandoned claims. | | Metrics | `savings_report` | Report live MCP token receipts and reference savings model. | +| Metrics | `savings_drift_report` | Flag tools whose median tokens-per-call drifted vs a baseline window. | ## Ruflo sidecar boundary @@ -2235,6 +2236,36 @@ Response shape: Live tokens are counted with `@colony/compress#countTokens` — the same primitive that produces observation token receipts, so values line up across surfaces. Estimated USD cost is computed at report time from the live token totals and caller-provided USD-per-1M rates; unset rates keep `cost_basis.configured=false` and report zero cost fields. New failure rows record `error_code` / `error_message`; older failure rows may have unknown reason fields. The reference table is static and sourced from `packages/core/src/savings-reference.ts`; the CLI command `colony gain` and the worker's `/savings` page render the same payload. +## `savings_drift_report` + +Flags tools whose median tokens-per-call has drifted up or down over a recent window vs. a baseline window. Pure read path over `mcp_metrics` — no schema change. Mirrors the `colony gain drift` CLI subcommand. + +Args: + +- `baseline_days?` — baseline window length in days. Defaults to 14, max 180. +- `recent_days?` — recent window length in days. Defaults to 3, max 60. +- `min_calls?` — minimum sample size per window before flagging drift. Defaults to 20. +- `threshold?` — up-drift trigger ratio (`recent_median / baseline_median`). Defaults to 1.25 (+25%). +- `down_threshold?` — down-drift trigger ratio. Defaults to 0.75 (-25%). +- `operation?` — filter rows by exact operation name (e.g. `"search"`). 
+ +Windows are non-overlapping with a 3-day gap (`baseline` ends 3 days before `recent` begins) so day-of-week noise does not bleed across. Only `ok=1` receipts are considered — retry storms and rejections never inflate the signal. The median is exact (no interpolation) and computed with `ROW_NUMBER()` window aggregation in SQLite. + +Response shape: + +```json +{ + "window": { "baseline_since": 1729000000000, "baseline_until": 1730000000000, "recent_since": 1730259200000, "recent_until": 1730518400000 }, + "threshold": { "up": 1.25, "down": 0.75, "min_calls": 20 }, + "rows": [{ "operation": "search", "baseline_median": 412, "baseline_n": 2140, "recent_median": 587, "recent_n": 412, "ratio": 1.42, "classification": "up_drift" }], + "new_tools": ["savings_drift_report"], + "gone_tools": [], + "insufficient_data": [{ "operation": "suggest", "baseline_n": 3, "recent_n": 18 }] +} +``` + +Classifications: `up_drift`, `down_drift`, `new_tool` (no baseline data), `gone` (no recent calls), `insufficient_data` (either window below `min_calls`), or `stable`. When the baseline window starts before the earliest `mcp_metrics` receipt the response adds a `warning` field nudging callers to wait for more history before trusting signals. + ## Plan observation kinds The lane introduces several observation kinds on the parent spec task and on the sub-task threads. They are written through `MemoryStore.addObservation`, so content is compressed and `metadata` carries the structured payload. 
diff --git a/openspec/changes/gain-drift-detector-2026-05-16/CHANGE.md b/openspec/changes/gain-drift-detector-2026-05-16/CHANGE.md new file mode 100644 index 0000000..ee28a4b --- /dev/null +++ b/openspec/changes/gain-drift-detector-2026-05-16/CHANGE.md @@ -0,0 +1,63 @@ +--- +slug: gain-drift-detector-2026-05-16 +--- + +# CHANGE · gain-drift-detector-2026-05-16 + +## §P proposal +# Long-run tokens-per-call drift detector + +## Problem + +README §v0.x "Receipts and observability" calls out a missing long-run +regression detector for tool token spend. Operators today can spot a single +hot loop with `colony gain`, but they cannot answer "did `search` quietly +get 40% more expensive in the last three days?" without manually charting +the daily aggregates. We need a deterministic signal that fires when a +tool's median tokens-per-call drifts outside a configurable band against a +baseline window. + +## Acceptance criteria + +- `colony gain drift` ships with sensible defaults + (`--baseline-days 14 --recent-days 3 --min-calls 20 --threshold 1.25 + --down-threshold 0.75`) and a `--json` mode whose payload includes window + bounds, classified rows, and `new_tools` / `gone_tools` / `insufficient_data` + arrays. +- `savings_drift_report` MCP tool exposes the same surface, named so it + groups with `savings_report` and does not collide with the existing + `task_drift_check` (file-edit drift). +- No schema change. The signal reads `mcp_metrics` only. +- Windows are non-overlapping with a 3-day gap so day-of-week noise does + not bleed across baseline and recent. +- Only `ok=1` receipts contribute; retry storms cannot inflate the median. +- A baseline-shorter-than-history warning appears in both CLI and MCP + outputs. +- Storage method, classifier, CLI render, MCP envelope, and listed-tools + set are covered by focused tests. 
+ +## Sub-tasks + +### Sub-task 0: Implement and verify + +File scope: packages/storage/src/storage.ts, packages/core/src/drift.ts, +packages/core/src/index.ts, apps/cli/src/commands/gain.ts, +apps/mcp-server/src/tools/savings-drift.ts, apps/mcp-server/src/server.ts, +docs/mcp.md, packages/storage/test/mcp-metrics.test.ts, +apps/cli/test/gain-drift.test.ts, apps/mcp-server/test/server.test.ts. + +Verification: `pnpm --filter @colony/storage test`, +`pnpm --filter colonyq typecheck`, `pnpm --filter colonyq test -- gain`, +`pnpm --filter @colony/mcp-server test`, `pnpm --filter colonyq build`. + +## §S delta +op|target|row +-|-|- + +## §T tasks +id|status|task|cites +-|-|-|- + +## §B bugs +id|status|task|cites +-|-|-|- diff --git a/packages/core/src/drift.ts b/packages/core/src/drift.ts new file mode 100644 index 0000000..5f06d2a --- /dev/null +++ b/packages/core/src/drift.ts @@ -0,0 +1,139 @@ +// Token-per-call drift detector. Pure function: takes the raw rows the +// storage method emits and classifies each operation into one of six +// buckets. No DB access here — keeps the classifier easy to unit-test +// in isolation and lets the CLI and MCP tool share one implementation. + +export type DriftClassification = + | 'up_drift' + | 'down_drift' + | 'new_tool' + | 'gone' + | 'insufficient_data' + | 'stable'; + +export interface DriftRawRow { + operation: string; + baseline_median: number | null; + baseline_n: number; + recent_median: number | null; + recent_n: number; +} + +export interface DriftRow { + operation: string; + baseline_median: number | null; + baseline_n: number; + recent_median: number | null; + recent_n: number; + ratio: number | null; + classification: DriftClassification; +} + +export interface DriftClassifyOptions { + /** Inclusive ratio cut-off for `up_drift` (e.g. 1.25 = +25%). */ + threshold: number; + /** Inclusive ratio cut-off for `down_drift` (e.g. 0.75 = -25%). 
*/ + down_threshold: number; + /** Minimum sample size in each window to trust the median signal. */ + min_calls: number; +} + +export interface DriftReport { + threshold: { + up: number; + down: number; + min_calls: number; + }; + rows: DriftRow[]; + new_tools: string[]; + gone_tools: string[]; + insufficient_data: Array<{ operation: string; baseline_n: number; recent_n: number }>; +} + +/** + * Classify each tool's tokens-per-call drift between a baseline and a + * recent window. The caller is responsible for picking non-overlapping + * windows; this function only consumes the rows. + * + * Buckets (evaluated in order): + * - `new_tool`: no baseline data, recent has >= min_calls samples + * - `gone`: no recent data, baseline has >= min_calls samples + * - `insufficient_data`: either window below min_calls (not new/gone) + * - `up_drift`: ratio >= threshold AND both windows >= min_calls + * - `down_drift`: ratio <= down_threshold AND both windows >= min_calls + * - `stable`: otherwise + * + * The returned `rows` array preserves every row from the input so callers + * can render the full table; the convenience arrays (`new_tools`, + * `gone_tools`, `insufficient_data`) point at the same operations for + * one-line summary lines. 
+ */ +export function classifyDrift( + rawRows: ReadonlyArray<DriftRawRow>, + opts: DriftClassifyOptions, +): DriftReport { + const rows: DriftRow[] = rawRows.map((raw) => { + const ratio = computeRatio(raw.baseline_median, raw.recent_median); + const classification = classifyOne(raw, ratio, opts); + return { + operation: raw.operation, + baseline_median: raw.baseline_median, + baseline_n: raw.baseline_n, + recent_median: raw.recent_median, + recent_n: raw.recent_n, + ratio, + classification, + }; + }); + + const new_tools = rows + .filter((row) => row.classification === 'new_tool') + .map((row) => row.operation); + const gone_tools = rows + .filter((row) => row.classification === 'gone') + .map((row) => row.operation); + const insufficient_data = rows + .filter((row) => row.classification === 'insufficient_data') + .map((row) => ({ + operation: row.operation, + baseline_n: row.baseline_n, + recent_n: row.recent_n, + })); + + return { + threshold: { + up: opts.threshold, + down: opts.down_threshold, + min_calls: opts.min_calls, + }, + rows, + new_tools, + gone_tools, + insufficient_data, + }; +} + +function classifyOne( + raw: DriftRawRow, + ratio: number | null, + opts: DriftClassifyOptions, +): DriftClassification { + const hasBaseline = raw.baseline_n >= opts.min_calls; + const hasRecent = raw.recent_n >= opts.min_calls; + if (raw.baseline_n === 0 && hasRecent) return 'new_tool'; + if (raw.recent_n === 0 && hasBaseline) return 'gone'; + if (!hasBaseline || !hasRecent) return 'insufficient_data'; + if (ratio === null) return 'stable'; + if (ratio >= opts.threshold) return 'up_drift'; + if (ratio <= opts.down_threshold) return 'down_drift'; + return 'stable'; +} + +function computeRatio( + baseline: number | null, + recent: number | null, +): number | null { + if (baseline === null || recent === null) return null; + if (baseline <= 0) return null; + return recent / baseline; +} diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index ebeeecc..402f430 100644
--- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -17,6 +17,14 @@ export { type SavingsReferenceRow, type SavingsReferenceTotals, } from './savings-reference.js'; +export { + classifyDrift, + type DriftClassification, + type DriftClassifyOptions, + type DriftRawRow, + type DriftReport, + type DriftRow, +} from './drift.js'; export { autoReleaseStalePlanSubtaskClaims, bulkRescueStrandedSessions, diff --git a/packages/storage/src/storage.ts b/packages/storage/src/storage.ts index 2c89e3f..f2e39a2 100644 --- a/packages/storage/src/storage.ts +++ b/packages/storage/src/storage.ts @@ -1419,6 +1419,98 @@ export class Storage { })); } + // Earliest ts recorded in mcp_metrics. Used by the drift detector to warn + // when the requested baseline window starts before the first recorded + // receipt — drift signals are noisy when the baseline lacks history. + mcpMetricsMinTs(): number | null { + const row = this.db + .prepare('SELECT MIN(ts) AS min_ts FROM mcp_metrics') + .get() as { min_ts: number | null } | undefined; + return row?.min_ts ?? null; + } + + // Per-operation median tokens-per-call across two non-overlapping windows. + // Reads from the existing mcp_metrics table — no schema change. SQLite has + // no PERCENTILE_CONT, so the median is computed via correlated subquery + // with LIMIT 1 OFFSET (count-1)/2 on ordered rows. Only ok=1 receipts are + // considered so retry storms and rejections do not skew the signal. The + // FULL OUTER JOIN produces a row for every operation present in either + // window, with NULL median+n=0 for the missing side (used to classify + // new_tool / gone in the caller). SQLite 3.39+ supports FULL OUTER JOIN + // (better-sqlite3 ^11 ships 3.49). 
+ mcpTokenDriftPerOperation(opts: { + baseline_since: number; + baseline_until: number; + recent_since: number; + recent_until: number; + }): Array<{ + operation: string; + baseline_median: number | null; + baseline_n: number; + recent_median: number | null; + recent_n: number; + }> { + const rows = this.db + .prepare( + `WITH + baseline AS ( + SELECT operation, + (input_tokens + output_tokens) AS tpc, + ROW_NUMBER() OVER (PARTITION BY operation ORDER BY input_tokens + output_tokens) AS rn, + COUNT(*) OVER (PARTITION BY operation) AS n + FROM mcp_metrics + WHERE ok = 1 AND ts >= ? AND ts < ? + ), + recent AS ( + SELECT operation, + (input_tokens + output_tokens) AS tpc, + ROW_NUMBER() OVER (PARTITION BY operation ORDER BY input_tokens + output_tokens) AS rn, + COUNT(*) OVER (PARTITION BY operation) AS n + FROM mcp_metrics + WHERE ok = 1 AND ts >= ? AND ts <= ? + ), + baseline_med AS ( + SELECT operation, tpc AS median, n + FROM baseline + WHERE rn = (n - 1) / 2 + 1 + ), + recent_med AS ( + SELECT operation, tpc AS median, n + FROM recent + WHERE rn = (n - 1) / 2 + 1 + ) + SELECT COALESCE(b.operation, r.operation) AS operation, + b.median AS baseline_median, + b.n AS baseline_n, + r.median AS recent_median, + r.n AS recent_n + FROM baseline_med b + FULL OUTER JOIN recent_med r ON b.operation = r.operation + ORDER BY operation`, + ) + .all( + opts.baseline_since, + opts.baseline_until, + opts.recent_since, + opts.recent_until, + ) as Array<{ + operation: string | null; + baseline_median: number | null; + baseline_n: number | null; + recent_median: number | null; + recent_n: number | null; + }>; + return rows + .filter((row): row is { operation: string } & typeof row => row.operation !== null) + .map((row) => ({ + operation: row.operation, + baseline_median: row.baseline_median ?? null, + baseline_n: row.baseline_n ?? 0, + recent_median: row.recent_median ?? null, + recent_n: row.recent_n ?? 
0, + })); + } + private mcpMetricSessionCount(where: string, args: ReadonlyArray<unknown>): number { const row = this.db .prepare( diff --git a/packages/storage/test/mcp-metrics.test.ts b/packages/storage/test/mcp-metrics.test.ts index 6f3f038..a18563c 100644 --- a/packages/storage/test/mcp-metrics.test.ts +++ b/packages/storage/test/mcp-metrics.test.ts @@ -292,4 +292,134 @@ describe('mcp_metrics storage', () => { it('aggregateMcpMetricsDaily returns empty array when no rows in window', () => { expect(storage.aggregateMcpMetricsDaily({ since: 0 })).toEqual([]); }); + + describe('mcpTokenDriftPerOperation', () => { + // Helper: build a row at ts with explicit tokens-per-call sum. Each row + // is one mcp_metrics receipt; tpc is the (input + output) total. + function recordTpc( + storage: Storage, + ts: number, + operation: string, + tpc: number, + ok = true, + ): void { + // Split the tpc as 1/3 input, 2/3 output to keep the receipts realistic. + const input = Math.round(tpc / 3); + const output = tpc - input; + record(storage, { ts, operation, input_tokens: input, output_tokens: output, ok }); + } + + it('returns no rows when both windows are empty', () => { + const rows = storage.mcpTokenDriftPerOperation({ + baseline_since: 0, + baseline_until: 1_000, + recent_since: 2_000, + recent_until: 3_000, + }); + expect(rows).toEqual([]); + }); + + it('reports stable median when baseline and recent overlap closely', () => { + // Baseline: 5 calls @ 100 tpc, baseline window [0, 1000). + for (let i = 0; i < 5; i += 1) recordTpc(storage, 100 + i, 'search', 100); + // Recent: 5 calls @ 105 tpc, recent window [2000, 3000].
+ for (let i = 0; i < 5; i += 1) recordTpc(storage, 2_000 + i, 'search', 105); + + const rows = storage.mcpTokenDriftPerOperation({ + baseline_since: 0, + baseline_until: 1_000, + recent_since: 2_000, + recent_until: 3_000, + }); + expect(rows).toHaveLength(1); + expect(rows[0]).toMatchObject({ + operation: 'search', + baseline_median: 100, + baseline_n: 5, + recent_median: 105, + recent_n: 5, + }); + }); + + it('captures clear up-drift when recent median is much higher than baseline', () => { + // Baseline: 21 calls @ ~100 tpc; recent: 21 calls @ ~200 tpc. + for (let i = 0; i < 21; i += 1) recordTpc(storage, 100 + i, 'search', 100); + for (let i = 0; i < 21; i += 1) recordTpc(storage, 2_000 + i, 'search', 200); + + const rows = storage.mcpTokenDriftPerOperation({ + baseline_since: 0, + baseline_until: 1_000, + recent_since: 2_000, + recent_until: 3_000, + }); + expect(rows).toHaveLength(1); + expect(rows[0]?.baseline_median).toBe(100); + expect(rows[0]?.recent_median).toBe(200); + expect(rows[0]?.baseline_n).toBe(21); + expect(rows[0]?.recent_n).toBe(21); + }); + + it('captures down-drift when recent median is much lower than baseline', () => { + for (let i = 0; i < 25; i += 1) recordTpc(storage, 100 + i, 'task_post', 600); + for (let i = 0; i < 25; i += 1) recordTpc(storage, 2_000 + i, 'task_post', 300); + + const rows = storage.mcpTokenDriftPerOperation({ + baseline_since: 0, + baseline_until: 1_000, + recent_since: 2_000, + recent_until: 3_000, + }); + const tp = rows.find((r) => r.operation === 'task_post'); + expect(tp?.baseline_median).toBe(600); + expect(tp?.recent_median).toBe(300); + }); + + it('reports new tools with baseline_n=0 and recent_n>0', () => { + // Only recent window has the operation. 
+ for (let i = 0; i < 20; i += 1) recordTpc(storage, 2_000 + i, 'savings_drift_report', 150); + + const rows = storage.mcpTokenDriftPerOperation({ + baseline_since: 0, + baseline_until: 1_000, + recent_since: 2_000, + recent_until: 3_000, + }); + const newOp = rows.find((r) => r.operation === 'savings_drift_report'); + expect(newOp?.baseline_n).toBe(0); + expect(newOp?.baseline_median).toBeNull(); + expect(newOp?.recent_n).toBe(20); + expect(newOp?.recent_median).toBe(150); + }); + + it('honors ok=0 filter so retry storms do not skew the median', () => { + // 10 ok=1 receipts and 10 ok=0 receipts in recent; only ok=1 should + // contribute to recent_median + recent_n. + for (let i = 0; i < 10; i += 1) recordTpc(storage, 100 + i, 'search', 100); + for (let i = 0; i < 10; i += 1) recordTpc(storage, 2_000 + i, 'search', 200, true); + for (let i = 0; i < 10; i += 1) recordTpc(storage, 2_500 + i, 'search', 5_000, false); + + const rows = storage.mcpTokenDriftPerOperation({ + baseline_since: 0, + baseline_until: 1_000, + recent_since: 2_000, + recent_until: 3_000, + }); + const search = rows.find((r) => r.operation === 'search'); + expect(search?.recent_n).toBe(10); + expect(search?.recent_median).toBe(200); + }); + }); + + describe('mcpMetricsMinTs', () => { + it('returns null on empty table', () => { + expect(storage.mcpMetricsMinTs()).toBeNull(); + }); + + it('returns the earliest ts when receipts exist', () => { + record(storage, { ts: 500, operation: 'search' }); + record(storage, { ts: 200, operation: 'search' }); + record(storage, { ts: 900, operation: 'task_post' }); + expect(storage.mcpMetricsMinTs()).toBe(200); + }); + }); });