From a7e96b9406fcc2c7c41c0209ca5536546790e239 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pierzcha=C5=82a?= Date: Mon, 13 Apr 2026 21:54:41 +0200 Subject: [PATCH 1/3] feat: summarize video transitions --- .../agent-device/references/verification.md | 19 + src/__tests__/cli-diff.test.ts | 76 ++++ src/cli/commands/screenshot.ts | 110 +++++- .../__tests__/transition-summary.test.ts | 89 +++++ src/utils/command-schema.ts | 50 ++- src/utils/output.ts | 70 ++++ src/utils/screenshot-diff.ts | 3 +- src/utils/transition-summary.ts | 355 ++++++++++++++++++ src/utils/video-frames.ts | 113 ++++++ website/docs/docs/commands.md | 5 + 10 files changed, 875 insertions(+), 15 deletions(-) create mode 100644 src/utils/__tests__/transition-summary.test.ts create mode 100644 src/utils/transition-summary.ts create mode 100644 src/utils/video-frames.ts diff --git a/skills/agent-device/references/verification.md b/skills/agent-device/references/verification.md index 40500119..836c6e62 100644 --- a/skills/agent-device/references/verification.md +++ b/skills/agent-device/references/verification.md @@ -9,6 +9,8 @@ Open this file when the task needs evidence, regression checks, replay maintenan - `screenshot` - `diff snapshot` - `diff screenshot` +- `diff frames` +- `diff video` - `record` - `replay -u` - `perf` @@ -65,6 +67,23 @@ agent-device diff screenshot --baseline ./baseline.png --out /tmp/diff.png --ove - When OCR is available, `diff screenshot` also reports best-effort non-text visual deltas by masking OCR text boxes out of the pixel diff and clustering the remaining residuals. Treat these as hints for icons, controls, and separators, not semantic icon recognition. - Add `--overlay-refs` to `diff screenshot` when you also want a separate current-screen overlay guide for a live capture. The raw screenshot is still used for pixel comparison; the overlay guide is only context for non-text controls, icons, and tappable regions. 
When overlay refs intersect changed regions, the output lists the best current-screen ref matches under the affected region. Saved-image comparisons do not have live accessibility refs, so omit `--overlay-refs` when passing a current image path. +## Transition summaries with diff frames/video + +Use `diff frames` or `diff video` when a screenshot pair is too static and you need a compact timeline for a transition, animation, or recorded interaction. + +```bash +agent-device diff frames ./frames --out /tmp/settings-transition +agent-device diff frames ./frame-001.png ./frame-002.png ./frame-003.png --out /tmp/settings-transition +agent-device diff video ./recordings/settings.mov --out /tmp/settings-transition --telemetry ./recordings/settings.gesture-telemetry.json +agent-device diff video ./recordings/settings.mov --sample-fps 8 --max-frames 120 --json +``` + +- `diff frames` accepts a directory of PNG frames or explicit PNG paths. It works without external video tools. +- `diff video` requires `ffmpeg` and `ffprobe` in `PATH`; it samples the recording into PNG frames, then runs the same transition summarizer. +- Add `--telemetry <path>` with a recording gesture sidecar when available. The output can then anchor transitions to events such as `after tap` or `during up scroll`. +- The text output stays capped to the top transitions, keyframes, changed-region summaries, and optional OCR movement hints. Use `--json` when you need the structured metrics. +- Install `tesseract` for OCR movement hints on selected transition boundaries. OCR is optional and is not run for every sampled frame. + ## Session recording Use `record` for debugging, documentation, or shareable verification artifacts. 
diff --git a/src/__tests__/cli-diff.test.ts b/src/__tests__/cli-diff.test.ts index e76ad6f5..49ab1dda 100644 --- a/src/__tests__/cli-diff.test.ts +++ b/src/__tests__/cli-diff.test.ts @@ -43,6 +43,26 @@ function solidPngBuffer( return PNG.sync.write(png); } +function movingBlockPngBuffer(offset: number): Buffer { + const png = new PNG({ width: 40, height: 40 }); + for (let i = 0; i < png.data.length; i += 4) { + png.data[i] = 240; + png.data[i + 1] = 240; + png.data[i + 2] = 240; + png.data[i + 3] = 255; + } + for (let y = 12; y < 28; y += 1) { + for (let x = 8 + offset; x < 24 + offset; x += 1) { + const index = (y * png.width + x) * 4; + png.data[index] = 30; + png.data[index + 1] = 30; + png.data[index + 2] = 30; + png.data[index + 3] = 255; + } + } + return PNG.sync.write(png); +} + async function runCliCapture( argv: string[], options: RunCliCaptureOptions = {}, @@ -436,4 +456,60 @@ describe('cli diff commands', () => { fs.rmSync(dir, { recursive: true, force: true }); } }); + + test('diff frames summarizes a local PNG frame sequence without daemon calls', async () => { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'cli-diff-frames-test-')); + const outputDir = path.join(dir, 'out'); + const telemetryPath = path.join(dir, 'capture.gesture-telemetry.json'); + for (const [index, offset] of [0, 6, 12, 12].entries()) { + fs.writeFileSync(path.join(dir, `frame-${index}.png`), movingBlockPngBuffer(offset)); + } + fs.writeFileSync( + telemetryPath, + JSON.stringify({ + version: 1, + generatedAt: new Date(0).toISOString(), + events: [{ kind: 'tap', tMs: 10, x: 20, y: 20 }], + }), + ); + + try { + const result = await runCliCapture([ + 'diff', + 'frames', + path.join(dir, 'frame-0.png'), + path.join(dir, 'frame-1.png'), + path.join(dir, 'frame-2.png'), + path.join(dir, 'frame-3.png'), + '--out', + outputDir, + '--telemetry', + telemetryPath, + '--threshold', + '0', + ]); + assert.equal(result.code, null); + assert.equal(result.calls.length, 0); + 
assert.match(result.stdout, /Frame transition summary: 1 transition/); + assert.match(result.stdout, /after tap x=20 y=20/); + assert.match(result.stdout, /keyframes:/); + assert.equal(fs.existsSync(path.join(outputDir, 'transition-1.diff.png')), true); + } finally { + fs.rmSync(dir, { recursive: true, force: true }); + } + }); + + test('diff frames rejects screenshot-only overlay refs flag', async () => { + const result = await runCliCapture([ + 'diff', + 'frames', + './frame-1.png', + './frame-2.png', + '--overlay-refs', + ]); + + assert.equal(result.code, 1); + assert.equal(result.calls.length, 0); + assert.match(result.stderr, /diff frames does not support --overlay-refs/); + }); }); diff --git a/src/cli/commands/screenshot.ts b/src/cli/commands/screenshot.ts index 0f6077f1..1c0519a3 100644 --- a/src/cli/commands/screenshot.ts +++ b/src/cli/commands/screenshot.ts @@ -1,11 +1,17 @@ import fs from 'node:fs'; import os from 'node:os'; import path from 'node:path'; -import { formatScreenshotDiffText, formatSnapshotDiffText } from '../../utils/output.ts'; +import { + formatScreenshotDiffText, + formatSnapshotDiffText, + formatTransitionSummaryText, +} from '../../utils/output.ts'; import { AppError } from '../../utils/errors.ts'; import { compareScreenshots, type ScreenshotDiffResult } from '../../utils/screenshot-diff.ts'; import { attachCurrentOverlayMatches } from '../../utils/screenshot-diff-overlay-matches.ts'; import { resolveUserPath } from '../../utils/path-resolution.ts'; +import { collectFrameInputs, summarizeFrameTransitions } from '../../utils/transition-summary.ts'; +import { extractVideoFrames } from '../../utils/video-frames.ts'; import { buildSelectionOptions, writeCommandOutput } from './shared.ts'; import type { ClientCommandHandler } from './router.ts'; @@ -43,7 +49,66 @@ export const diffCommand: ClientCommandHandler = async ({ positionals, flags, cl return true; } + if (positionals[0] === 'frames') { + rejectUnsupportedDiffFlags(flags, 
['overlayRefs', 'sampleFps', 'maxFrames'], 'diff frames'); + const outputDir = resolveTransitionOutputDir(flags.out); + const frames = await collectFrameInputs(positionals.slice(1)); + const result = await summarizeFrameTransitions({ + frames, + input: { + kind: 'frames', + frameCount: frames.length, + sampledFrameCount: frames.length, + ...(flags.telemetry ? { telemetryPath: resolveUserPath(flags.telemetry) } : {}), + }, + options: { + threshold: readDiffThreshold(flags.threshold), + outputDir, + ...(flags.telemetry ? { telemetryPath: flags.telemetry } : {}), + }, + }); + writeCommandOutput(flags, result, () => formatTransitionSummaryText(result)); + return true; + } + + if (positionals[0] === 'video') { + rejectUnsupportedDiffFlags(flags, ['overlayRefs'], 'diff video'); + const videoRaw = positionals[1]; + if (!videoRaw || positionals.length > 2) { + throw new AppError('INVALID_ARGS', 'diff video requires exactly one video path'); + } + const videoPath = resolveUserPath(videoRaw); + const outputDir = resolveTransitionOutputDir(flags.out); + const framesDir = path.join(outputDir, 'frames'); + const extracted = await extractVideoFrames({ + videoPath, + outputDir: framesDir, + sampleFps: flags.sampleFps, + maxFrames: flags.maxFrames, + }); + const result = await summarizeFrameTransitions({ + frames: extracted.frames, + input: { + kind: 'video', + path: videoPath, + frameCount: extracted.frames.length, + sampledFrameCount: extracted.frames.length, + sampleFps: extracted.sampleFps, + ...(extracted.durationMs ? { durationMs: extracted.durationMs } : {}), + ...(flags.telemetry ? { telemetryPath: resolveUserPath(flags.telemetry) } : {}), + }, + options: { + threshold: readDiffThreshold(flags.threshold), + outputDir, + ...(flags.telemetry ? 
{ telemetryPath: flags.telemetry } : {}), + }, + }); + writeCommandOutput(flags, result, () => formatTransitionSummaryText(result)); + return true; + } + if (positionals[0] !== 'screenshot') return false; + rejectUnsupportedDiffFlags(flags, ['sampleFps', 'maxFrames', 'telemetry'], 'diff screenshot'); const baselineRaw = flags.baseline; if (!baselineRaw || typeof baselineRaw !== 'string') { @@ -60,13 +125,7 @@ export const diffCommand: ClientCommandHandler = async ({ positionals, flags, cl ); } - let thresholdNum = 0.1; - if (flags.threshold != null && flags.threshold !== '') { - thresholdNum = Number(flags.threshold); - if (Number.isNaN(thresholdNum) || thresholdNum < 0 || thresholdNum > 1) { - throw new AppError('INVALID_ARGS', '--threshold must be a number between 0 and 1'); - } - } + const thresholdNum = readDiffThreshold(flags.threshold); if (currentRaw) { if (flags.overlayRefs) { @@ -144,3 +203,38 @@ function removeStaleCurrentOverlay(outputPath: string): void { function isFsError(error: unknown, code: string): error is NodeJS.ErrnoException { return typeof error === 'object' && error !== null && 'code' in error && error.code === code; } + +function readDiffThreshold(rawThreshold: unknown): number { + if (rawThreshold == null || rawThreshold === '') return 0.1; + const threshold = Number(rawThreshold); + if (Number.isNaN(threshold) || threshold < 0 || threshold > 1) { + throw new AppError('INVALID_ARGS', '--threshold must be a number between 0 and 1'); + } + return threshold; +} + +function resolveTransitionOutputDir(rawOut: unknown): string { + const outputDir = + typeof rawOut === 'string' + ? 
resolveUserPath(rawOut) + : fs.mkdtempSync(path.join(os.tmpdir(), 'agent-device-transition-diff-')); + fs.mkdirSync(outputDir, { recursive: true }); + return outputDir; +} + +function rejectUnsupportedDiffFlags( + flags: Record, + flagKeys: string[], + commandLabel: string, +): void { + const unsupported = flagKeys.filter((key) => flags[key] !== undefined); + if (unsupported.length === 0) return; + throw new AppError( + 'INVALID_ARGS', + `${commandLabel} does not support ${unsupported.map((key) => `--${toKebabCase(key)}`).join(', ')}`, + ); +} + +function toKebabCase(value: string): string { + return value.replace(/[A-Z]/g, (match) => `-${match.toLowerCase()}`); +} diff --git a/src/utils/__tests__/transition-summary.test.ts b/src/utils/__tests__/transition-summary.test.ts new file mode 100644 index 00000000..7072ca37 --- /dev/null +++ b/src/utils/__tests__/transition-summary.test.ts @@ -0,0 +1,89 @@ +import { test } from 'vitest'; +import assert from 'node:assert/strict'; +import fs from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; +import { PNG } from 'pngjs'; +import { summarizeFrameTransitions, type FrameSample } from '../transition-summary.ts'; + +function tmpDir(): string { + return fs.mkdtempSync(path.join(os.tmpdir(), 'agent-device-transition-summary-')); +} + +function writeSettingsFrame(filePath: string, xOffset: number): void { + const png = new PNG({ width: 120, height: 180 }); + paintRect(png, { x: 0, y: 0, width: 120, height: 180 }, { r: 242, g: 242, b: 247 }); + paintRect(png, { x: 0, y: 0, width: 120, height: 36 }, { r: 248, g: 248, b: 248 }); + paintRect(png, { x: 10 + xOffset, y: 54, width: 100, height: 38 }, { r: 255, g: 255, b: 255 }); + paintRect(png, { x: 18 + xOffset, y: 66, width: 36, height: 8 }, { r: 30, g: 30, b: 30 }); + paintRect(png, { x: 96 + xOffset, y: 66, width: 6, height: 10 }, { r: 130, g: 130, b: 130 }); + paintRect(png, { x: 10 + xOffset, y: 100, width: 100, height: 38 }, { r: 255, g: 255, b: 255 }); + 
paintRect(png, { x: 18 + xOffset, y: 112, width: 48, height: 8 }, { r: 30, g: 30, b: 30 }); + paintRect(png, { x: 96 + xOffset, y: 112, width: 6, height: 10 }, { r: 130, g: 130, b: 130 }); + fs.writeFileSync(filePath, PNG.sync.write(png)); +} + +function paintRect( + png: PNG, + rect: { x: number; y: number; width: number; height: number }, + color: { r: number; g: number; b: number }, +): void { + const startX = Math.max(0, rect.x); + const endX = Math.min(png.width, rect.x + rect.width); + const startY = Math.max(0, rect.y); + const endY = Math.min(png.height, rect.y + rect.height); + for (let y = startY; y < endY; y += 1) { + for (let x = startX; x < endX; x += 1) { + const index = (y * png.width + x) * 4; + png.data[index] = color.r; + png.data[index + 1] = color.g; + png.data[index + 2] = color.b; + png.data[index + 3] = 255; + } + } +} + +test('summarizes a Settings-like transition and anchors it to telemetry', async () => { + const dir = tmpDir(); + const outputDir = path.join(dir, 'out'); + const offsets = [0, -10, -24, -36, -36]; + const frames: FrameSample[] = offsets.map((offset, index) => { + const framePath = path.join(dir, `settings-${index}.png`); + writeSettingsFrame(framePath, offset); + return { index, path: framePath, timestampMs: index * 120 }; + }); + const telemetryPath = path.join(dir, 'settings.gesture-telemetry.json'); + fs.writeFileSync( + telemetryPath, + JSON.stringify({ + version: 1, + generatedAt: new Date(0).toISOString(), + events: [{ kind: 'tap', tMs: 20, x: 96, y: 66, referenceWidth: 120, referenceHeight: 180 }], + }), + ); + + try { + const result = await summarizeFrameTransitions({ + frames, + input: { + kind: 'frames', + frameCount: frames.length, + sampledFrameCount: frames.length, + telemetryPath, + }, + options: { + threshold: 0, + outputDir, + telemetryPath, + }, + }); + + assert.equal(result.transitions.length, 1); + assert.equal(result.transitions[0]?.trigger, 'after tap x=96 y=66'); + 
assert.ok(result.transitions[0]?.peakMismatchPercentage); + assert.ok(result.transitions[0]?.regions?.length); + assert.equal(fs.existsSync(result.transitions[0]?.keyframes.diff ?? ''), true); + } finally { + fs.rmSync(dir, { recursive: true, force: true }); + } +}); diff --git a/src/utils/command-schema.ts b/src/utils/command-schema.ts index b1f6f812..819c6820 100644 --- a/src/utils/command-schema.ts +++ b/src/utils/command-schema.ts @@ -60,6 +60,9 @@ export type CliFlags = { screenshotFullscreen?: boolean; baseline?: string; threshold?: string; + sampleFps?: number; + maxFrames?: number; + telemetry?: string; appsFilter?: 'user-installed' | 'all'; count?: number; fps?: number; @@ -924,7 +927,32 @@ const FLAG_DEFINITIONS: readonly FlagDefinition[] = [ names: ['--threshold'], type: 'string', usageLabel: '--threshold <0-1>', - usageDescription: 'Diff screenshot: color distance threshold (default 0.1)', + usageDescription: 'Diff screenshot/frames/video: color distance threshold (default 0.1)', + }, + { + key: 'sampleFps', + names: ['--sample-fps'], + type: 'int', + min: 1, + max: 60, + usageLabel: '--sample-fps ', + usageDescription: 'Diff video: frames per second to sample before transition analysis', + }, + { + key: 'maxFrames', + names: ['--max-frames'], + type: 'int', + min: 2, + max: 500, + usageLabel: '--max-frames ', + usageDescription: 'Diff video: maximum sampled frames to extract', + }, + { + key: 'telemetry', + names: ['--telemetry'], + type: 'string', + usageLabel: '--telemetry ', + usageDescription: 'Diff frames/video: gesture telemetry JSON sidecar for transition labels', }, ]; @@ -1053,11 +1081,21 @@ const COMMAND_SCHEMAS: Record = { }, diff: { usageOverride: - 'diff snapshot | diff screenshot --baseline [current.png] [--out ] [--threshold <0-1>] [--overlay-refs]', - helpDescription: 'Diff accessibility snapshot or compare screenshots pixel-by-pixel', - summary: 'Diff snapshot or screenshot', - positionalArgs: ['kind', 'current?'], - allowedFlags: 
[...SNAPSHOT_FLAGS, 'baseline', 'threshold', 'out', 'overlayRefs'], + 'diff snapshot | diff screenshot --baseline [current.png] [--out ] | diff frames [--out ] | diff video [--out ]', + helpDescription: 'Diff accessibility snapshots, screenshots, frame sequences, or videos', + summary: 'Diff snapshots, screenshots, frames, or videos', + positionalArgs: ['kind', 'input?'], + allowsExtraPositionals: true, + allowedFlags: [ + ...SNAPSHOT_FLAGS, + 'baseline', + 'threshold', + 'out', + 'overlayRefs', + 'sampleFps', + 'maxFrames', + 'telemetry', + ], }, 'ensure-simulator': { helpDescription: 'Ensure an iOS simulator exists in a device set (create if missing)', diff --git a/src/utils/output.ts b/src/utils/output.ts index 78e47e58..e0c6dd28 100644 --- a/src/utils/output.ts +++ b/src/utils/output.ts @@ -5,6 +5,7 @@ import type { SnapshotNode, SnapshotVisibility } from './snapshot.ts'; import { displayNodeLabel } from './snapshot-tree.ts'; import type { ScreenshotDiffResult } from './screenshot-diff.ts'; import type { ScreenshotDiffRegion } from './screenshot-diff-regions.ts'; +import type { TransitionSummaryResult } from './transition-summary.ts'; import { styleText } from 'node:util'; import { buildMobileSnapshotPresentation } from './mobile-snapshot-semantics.ts'; @@ -330,6 +331,66 @@ export function formatScreenshotDiffText(data: ScreenshotDiffResult): string { return `${lines.join('\n')}\n`; } +export function formatTransitionSummaryText(data: TransitionSummaryResult): string { + const useColor = supportsColor(); + const lines: string[] = []; + const inputLabel = + data.input.kind === 'video' ? 'Video transition summary' : 'Frame transition summary'; + const duration = data.input.durationMs ? ` from ${formatMs(data.input.durationMs)}` : ''; + const sampleFps = data.input.sampleFps ? ` at ${roundTwo(data.input.sampleFps)} fps` : ''; + lines.push( + `${inputLabel}: ${data.transitions.length} transition${data.transitions.length === 1 ? 
'' : 's'}, ` + + `sampled ${data.input.sampledFrameCount} frame${data.input.sampledFrameCount === 1 ? '' : 's'}${duration}${sampleFps}`, + ); + if (data.outputDir) { + lines.push(` ${formatMuted('Artifacts:', useColor)} ${toRelativePath(data.outputDir)}`); + } + if (data.omittedTransitions > 0) { + lines.push( + ` ${formatMuted(`Omitted ${data.omittedTransitions} lower-ranked transition(s).`, useColor)}`, + ); + } + if (data.transitions.length === 0) { + lines.push(' No significant frame-to-frame transitions detected.'); + return `${lines.join('\n')}\n`; + } + + for (const transition of data.transitions) { + const timing = `${formatMs(transition.startMs)}-${formatMs(transition.endMs)}`; + const trigger = transition.trigger ? ` ${transition.trigger}` : ''; + lines.push(`\n${transition.index}. ${timing}${trigger}`); + lines.push(` ${transition.summary}`); + lines.push( + ` peak=${transition.peakMismatchPercentage}% avg=${transition.averageMismatchPercentage}% ` + + `duration=${formatMs(transition.durationMs)}`, + ); + const region = transition.regions?.[0]; + if (region) { + lines.push( + ` changed: ${region.size} ${region.location} ${region.shape}, ` + + `${region.shareOfDiffPercentage}% of diff, ${region.dominantChange}`, + ); + } + const cluster = transition.ocr?.movementClusters?.[0]; + if (cluster) { + lines.push( + ` text: ${formatQuotedList(cluster.texts)} dx=${formatRange(cluster.xRange)}px ` + + `dy=${formatRange(cluster.yRange)}px`, + ); + } + lines.push( + ` keyframes: before=${toRelativePath(transition.keyframes.before)} ` + + `mid=${toRelativePath(transition.keyframes.mid)} ` + + `after=${toRelativePath(transition.keyframes.after)}`, + ); + if (transition.keyframes.diff) { + lines.push(` diff: ${toRelativePath(transition.keyframes.diff)}`); + } + } + + return `${lines.join('\n')}\n`; +} + function formatRect(rect: { x: number; y: number; width: number; height: number }): string { return `x=${rect.x},y=${rect.y},w=${rect.width},h=${rect.height}`; } @@ 
-415,6 +476,15 @@ function toNumber(value: unknown): number { return typeof value === 'number' && Number.isFinite(value) ? value : 0; } +function formatMs(value: number): string { + if (value >= 1_000) return `${roundTwo(value / 1_000)}s`; + return `${Math.round(value)}ms`; +} + +function roundTwo(value: number): number { + return Math.round(value * 100) / 100; +} + function applyContextWindow(lines: SnapshotDiffLine[], contextWindow: number): SnapshotDiffLine[] { if (lines.length === 0) return lines; const changedIndices = lines diff --git a/src/utils/screenshot-diff.ts b/src/utils/screenshot-diff.ts index b15fca23..fbe3b629 100644 --- a/src/utils/screenshot-diff.ts +++ b/src/utils/screenshot-diff.ts @@ -34,6 +34,7 @@ export type ScreenshotDiffOptions = { threshold?: number; outputPath?: string; maxRegions?: number; + includeOcr?: boolean; }; // Each pixel is a point in 3D RGB space (R, G, B each 0–255). @@ -136,7 +137,7 @@ export async function compareScreenshots( } const ocrAnalysis = - differentPixels > 0 + differentPixels > 0 && options.includeOcr !== false ? 
await summarizeScreenshotOcr({ baselinePath, currentPath, diff --git a/src/utils/transition-summary.ts b/src/utils/transition-summary.ts new file mode 100644 index 00000000..333e9776 --- /dev/null +++ b/src/utils/transition-summary.ts @@ -0,0 +1,355 @@ +import { promises as fs } from 'node:fs'; +import path from 'node:path'; +import type { RecordingGestureEvent } from '../daemon/types.ts'; +import { AppError } from './errors.ts'; +import { resolveUserPath } from './path-resolution.ts'; +import { compareScreenshots } from './screenshot-diff.ts'; +import type { ScreenshotDiffResult } from './screenshot-diff.ts'; +import type { ScreenshotDiffRegion } from './screenshot-diff-regions.ts'; + +export type FrameSample = { + index: number; + path: string; + timestampMs: number; +}; + +export type TransitionTelemetryEvent = RecordingGestureEvent; + +export type TransitionSummaryInput = { + kind: 'frames' | 'video'; + path?: string; + frameCount: number; + sampledFrameCount: number; + durationMs?: number; + sampleFps?: number; + telemetryPath?: string; +}; + +export type TransitionSummaryEvent = { + index: number; + startMs: number; + endMs: number; + durationMs: number; + classification: string; + trigger?: string; + summary: string; + peakMismatchPercentage: number; + averageMismatchPercentage: number; + keyframes: { + before: string; + mid: string; + after: string; + diff?: string; + }; + regions?: ScreenshotDiffRegion[]; + ocr?: ScreenshotDiffResult['ocr']; + nonTextDeltas?: ScreenshotDiffResult['nonTextDeltas']; +}; + +export type TransitionSummaryResult = { + input: TransitionSummaryInput; + outputDir?: string; + transitions: TransitionSummaryEvent[]; + omittedTransitions: number; +}; + +export type TransitionSummaryOptions = { + threshold?: number; + outputDir?: string; + telemetryPath?: string; + maxTransitions?: number; +}; + +type PairDiff = { + index: number; + mismatchPercentage: number; +}; + +type TransitionSegment = { + startPairIndex: number; + endPairIndex: 
number; + peakMismatchPercentage: number; + averageMismatchPercentage: number; +}; + +type TelemetryEnvelope = { + version?: unknown; + events?: unknown; +}; + +const MIN_SIGNIFICANT_MISMATCH_PERCENTAGE = 0.5; +const SEGMENT_GAP_TOLERANCE_PAIRS = 1; +const DEFAULT_MAX_TRANSITIONS = 5; + +export async function summarizeFrameTransitions(params: { + frames: FrameSample[]; + input: TransitionSummaryInput; + options?: TransitionSummaryOptions; +}): Promise { + const frames = [...params.frames].sort((left, right) => left.index - right.index); + if (frames.length < 2) { + throw new AppError('INVALID_ARGS', 'transition summary requires at least two frames'); + } + + const threshold = params.options?.threshold ?? 0.1; + const pairDiffs: PairDiff[] = []; + for (let index = 0; index < frames.length - 1; index += 1) { + const diff = await compareScreenshots(frames[index]!.path, frames[index + 1]!.path, { + threshold, + includeOcr: false, + maxRegions: 3, + }); + pairDiffs.push({ + index, + mismatchPercentage: diff.mismatchPercentage, + }); + } + + const telemetryEvents = params.options?.telemetryPath + ? await readTelemetryEvents(params.options.telemetryPath) + : []; + const segments = segmentPairDiffs(pairDiffs); + const selectedSegments = segments + .sort((left, right) => right.peakMismatchPercentage - left.peakMismatchPercentage) + .slice(0, params.options?.maxTransitions ?? DEFAULT_MAX_TRANSITIONS) + .sort((left, right) => left.startPairIndex - right.startPairIndex); + + const transitions: TransitionSummaryEvent[] = []; + for (const [index, segment] of selectedSegments.entries()) { + const before = frames[segment.startPairIndex]!; + const after = frames[segment.endPairIndex + 1]!; + const mid = frames[Math.round((segment.startPairIndex + segment.endPairIndex + 1) / 2)]!; + const diffPath = params.options?.outputDir + ? 
path.join(params.options.outputDir, `transition-${index + 1}.diff.png`) + : undefined; + const boundaryDiff = await compareScreenshots(before.path, after.path, { + threshold, + outputPath: diffPath, + maxRegions: 5, + }); + const trigger = findTrigger(telemetryEvents, before.timestampMs, after.timestampMs); + const classification = classifyTransition(boundaryDiff, trigger); + transitions.push({ + index: index + 1, + startMs: before.timestampMs, + endMs: after.timestampMs, + durationMs: Math.max(0, after.timestampMs - before.timestampMs), + classification, + ...(trigger + ? { trigger: formatTrigger(trigger, before.timestampMs, after.timestampMs) } + : {}), + summary: buildTransitionSummary(boundaryDiff, classification), + peakMismatchPercentage: roundTwo(segment.peakMismatchPercentage), + averageMismatchPercentage: roundTwo(segment.averageMismatchPercentage), + keyframes: { + before: before.path, + mid: mid.path, + after: after.path, + ...(boundaryDiff.diffPath ? { diff: boundaryDiff.diffPath } : {}), + }, + ...(boundaryDiff.regions && boundaryDiff.regions.length > 0 + ? { regions: boundaryDiff.regions.slice(0, 5) } + : {}), + ...(boundaryDiff.ocr ? { ocr: boundaryDiff.ocr } : {}), + ...(boundaryDiff.nonTextDeltas && boundaryDiff.nonTextDeltas.length > 0 + ? { nonTextDeltas: boundaryDiff.nonTextDeltas.slice(0, 5) } + : {}), + }); + } + + return { + input: params.input, + ...(params.options?.outputDir ? 
{ outputDir: params.options.outputDir } : {}), + transitions, + omittedTransitions: Math.max(0, segments.length - selectedSegments.length), + }; +} + +export async function collectFrameInputs(inputs: string[]): Promise { + if (inputs.length === 0) { + throw new AppError('INVALID_ARGS', 'diff frames requires a frame path or directory'); + } + + const framePaths: string[] = []; + if (inputs.length === 1) { + const resolved = resolveUserPath(inputs[0]!); + const stat = await fs.stat(resolved); + if (stat.isDirectory()) { + const entries = await fs.readdir(resolved); + framePaths.push( + ...entries + .filter((entry) => entry.toLowerCase().endsWith('.png')) + .sort(compareNatural) + .map((entry) => path.join(resolved, entry)), + ); + } else { + framePaths.push(resolved); + } + } else { + framePaths.push(...inputs.map((input) => resolveUserPath(input))); + } + + if (framePaths.length < 2) { + throw new AppError('INVALID_ARGS', 'diff frames requires at least two PNG frames'); + } + + return framePaths.map((framePath, index) => ({ + index, + path: framePath, + timestampMs: index * 100, + })); +} + +function segmentPairDiffs(pairDiffs: PairDiff[]): TransitionSegment[] { + const segments: TransitionSegment[] = []; + let activePairs: PairDiff[] = []; + let inactiveGap = 0; + + for (const pair of pairDiffs) { + if (pair.mismatchPercentage >= MIN_SIGNIFICANT_MISMATCH_PERCENTAGE) { + activePairs.push(pair); + inactiveGap = 0; + continue; + } + if (activePairs.length === 0) continue; + inactiveGap += 1; + if (inactiveGap <= SEGMENT_GAP_TOLERANCE_PAIRS) continue; + segments.push(toSegment(activePairs)); + activePairs = []; + inactiveGap = 0; + } + + if (activePairs.length > 0) { + segments.push(toSegment(activePairs)); + } + + return segments; +} + +function toSegment(pairs: PairDiff[]): TransitionSegment { + const peakMismatchPercentage = Math.max(...pairs.map((pair) => pair.mismatchPercentage)); + const averageMismatchPercentage = + pairs.reduce((sum, pair) => sum + 
pair.mismatchPercentage, 0) / pairs.length; + return { + startPairIndex: pairs[0]!.index, + endPairIndex: pairs[pairs.length - 1]!.index, + peakMismatchPercentage, + averageMismatchPercentage, + }; +} + +async function readTelemetryEvents(telemetryPath: string): Promise { + try { + const raw = JSON.parse( + await fs.readFile(resolveUserPath(telemetryPath), 'utf8'), + ) as TelemetryEnvelope; + if (!Array.isArray(raw.events)) return []; + return raw.events.filter(isTelemetryEvent); + } catch (error) { + throw new AppError( + 'INVALID_ARGS', + `invalid gesture telemetry JSON: ${telemetryPath}`, + undefined, + error, + ); + } +} + +function isTelemetryEvent(value: unknown): value is TransitionTelemetryEvent { + if (!value || typeof value !== 'object') return false; + const event = value as Partial; + return typeof event.kind === 'string' && typeof event.tMs === 'number'; +} + +function findTrigger( + events: TransitionTelemetryEvent[], + startMs: number, + endMs: number, +): TransitionTelemetryEvent | undefined { + let best: TransitionTelemetryEvent | undefined; + let bestDistance = Number.POSITIVE_INFINITY; + for (const event of events) { + const durationMs = + 'durationMs' in event && typeof event.durationMs === 'number' ? 
event.durationMs : 0; + const eventEndMs = event.tMs + durationMs; + const overlaps = event.tMs <= endMs + 250 && eventEndMs >= startMs - 250; + const before = event.tMs <= startMs && startMs - event.tMs <= 1_000; + if (!overlaps && !before) continue; + const distance = Math.abs(startMs - event.tMs); + if (distance >= bestDistance) continue; + best = event; + bestDistance = distance; + } + return best; +} + +function classifyTransition( + diff: ScreenshotDiffResult, + trigger: TransitionTelemetryEvent | undefined, +): string { + if (trigger?.kind === 'scroll') return `${trigger.contentDirection} scroll`; + if (trigger?.kind === 'swipe' || trigger?.kind === 'back-swipe') return 'gesture navigation'; + const cluster = diff.ocr?.movementClusters?.[0]; + if (cluster) { + const dx = maxAbs(cluster.xRange); + const dy = maxAbs(cluster.yRange); + if (dx > 24 && dx > dy * 1.5) return 'horizontal navigation'; + if (dy > 24 && dy > dx * 1.5) return 'vertical movement'; + } + const largestRegion = diff.regions?.[0]; + if (largestRegion?.size === 'large' && largestRegion.shape === 'large-area') { + if (largestRegion.dominantChange === 'brighter') return 'brightening transition'; + if (largestRegion.dominantChange === 'darker') return 'dimming transition'; + return 'screen replacement'; + } + return 'screen update'; +} + +function buildTransitionSummary(diff: ScreenshotDiffResult, classification: string): string { + const details: string[] = [classification]; + const cluster = diff.ocr?.movementClusters?.[0]; + if (cluster) { + details.push( + `text moved dx=${formatRange(cluster.xRange)}px dy=${formatRange(cluster.yRange)}px`, + ); + } + const region = diff.regions?.[0]; + if (region) { + details.push(`${region.size} ${region.location} ${region.shape} changed`); + } + return details.join('; '); +} + +function formatTrigger(event: TransitionTelemetryEvent, startMs: number, endMs: number): string { + const relation = + isContinuousGesture(event) && event.tMs >= startMs && 
event.tMs <= endMs ? 'during' : 'after'; + if (event.kind === 'scroll') return `${relation} ${event.contentDirection} scroll`; + if (event.kind === 'pinch') return `${relation} pinch scale=${event.scale}`; + return `${relation} ${event.kind} x=${Math.round(event.x)} y=${Math.round(event.y)}`; +} + +function isContinuousGesture(event: TransitionTelemetryEvent): boolean { + return ['scroll', 'swipe', 'back-swipe', 'pinch'].includes(event.kind); +} + +function maxAbs(range: { min: number; max: number }): number { + return Math.max(Math.abs(range.min), Math.abs(range.max)); +} + +function formatRange(range: { min: number; max: number }): string { + return range.min === range.max + ? formatSigned(range.min) + : `${formatSigned(range.min)}..${formatSigned(range.max)}`; +} + +function formatSigned(value: number): string { + return value > 0 ? `+${value}` : String(value); +} + +function compareNatural(left: string, right: string): number { + return left.localeCompare(right, undefined, { numeric: true, sensitivity: 'base' }); +} + +function roundTwo(value: number): number { + return Math.round(value * 100) / 100; +} diff --git a/src/utils/video-frames.ts b/src/utils/video-frames.ts new file mode 100644 index 00000000..7f18f603 --- /dev/null +++ b/src/utils/video-frames.ts @@ -0,0 +1,113 @@ +import { promises as fs } from 'node:fs'; +import path from 'node:path'; +import { AppError } from './errors.ts'; +import { runCmd, whichCmd } from './exec.ts'; +import type { FrameSample } from './transition-summary.ts'; + +export type ExtractedVideoFrames = { + frames: FrameSample[]; + durationMs?: number; + sampleFps: number; +}; + +type FfprobePayload = { + format?: { + duration?: string; + }; +}; + +const DEFAULT_SAMPLE_FPS = 6; +const DEFAULT_MAX_FRAMES = 80; +const VIDEO_PROBE_TIMEOUT_MS = 10_000; +const VIDEO_EXTRACT_TIMEOUT_MS = 60_000; + +export async function extractVideoFrames(params: { + videoPath: string; + outputDir: string; + sampleFps?: number; + maxFrames?: number; +}): 
Promise { + if (!(await whichCmd('ffmpeg')) || !(await whichCmd('ffprobe'))) { + throw new AppError('TOOL_MISSING', 'diff video requires ffmpeg and ffprobe in PATH', { + hint: 'Install FFmpeg, then retry diff video.', + }); + } + + await fs.mkdir(params.outputDir, { recursive: true }); + const maxFrames = params.maxFrames ?? DEFAULT_MAX_FRAMES; + const requestedFps = params.sampleFps ?? DEFAULT_SAMPLE_FPS; + const durationMs = await probeVideoDurationMs(params.videoPath); + const sampleFps = + durationMs && durationMs > 0 + ? Math.min(requestedFps, Math.max(1 / (durationMs / 1_000), maxFrames / (durationMs / 1_000))) + : requestedFps; + const pattern = path.join(params.outputDir, 'frame-%06d.png'); + const result = await runCmd( + 'ffmpeg', + [ + '-hide_banner', + '-loglevel', + 'info', + '-i', + params.videoPath, + '-vf', + `fps=${formatFps(sampleFps)},showinfo`, + '-frames:v', + String(maxFrames), + '-vsync', + '0', + pattern, + ], + { allowFailure: true, timeoutMs: VIDEO_EXTRACT_TIMEOUT_MS }, + ); + if (result.exitCode !== 0) { + throw new AppError('COMMAND_FAILED', 'ffmpeg failed to extract video frames', { + stderr: result.stderr, + }); + } + + const files = (await fs.readdir(params.outputDir)) + .filter((entry) => /^frame-\d+\.png$/i.test(entry)) + .sort((left, right) => left.localeCompare(right, undefined, { numeric: true })); + const timestamps = parseShowInfoTimestamps(result.stderr); + return { + frames: files.map((entry, index) => ({ + index, + path: path.join(params.outputDir, entry), + timestampMs: timestamps[index] ?? Math.round((index / sampleFps) * 1_000), + })), + ...(durationMs ? 
{ durationMs } : {}), + sampleFps, + }; +} + +async function probeVideoDurationMs(videoPath: string): Promise { + const result = await runCmd( + 'ffprobe', + ['-v', 'error', '-show_entries', 'format=duration', '-of', 'json', videoPath], + { allowFailure: true, timeoutMs: VIDEO_PROBE_TIMEOUT_MS }, + ); + if (result.exitCode !== 0) return undefined; + try { + const parsed = JSON.parse(result.stdout) as FfprobePayload; + const seconds = parsed.format?.duration ? Number(parsed.format.duration) : Number.NaN; + return Number.isFinite(seconds) && seconds > 0 ? Math.round(seconds * 1_000) : undefined; + } catch { + return undefined; + } +} + +function parseShowInfoTimestamps(stderr: string): number[] { + return stderr + .split(/\r?\n/) + .map((line) => /pts_time:([0-9.]+)/.exec(line)?.[1]) + .filter((value): value is string => value !== undefined) + .map((value) => Math.round(Number(value) * 1_000)) + .filter((value) => Number.isFinite(value)); +} + +function formatFps(value: number): string { + return Number.isInteger(value) + ? 
String(value) + : value.toFixed(3).replace(/0+$/, '').replace(/\.$/, ''); +} diff --git a/website/docs/docs/commands.md b/website/docs/docs/commands.md index 9a0345bb..4cf4c012 100644 --- a/website/docs/docs/commands.md +++ b/website/docs/docs/commands.md @@ -544,6 +544,8 @@ agent-device open --platform macos --surface desktop && agent-device screenshot agent-device diff screenshot --baseline baseline.png --out diff.png agent-device diff screenshot --baseline baseline.png current.png --out diff.png agent-device diff screenshot --baseline baseline.png --out diff.png --overlay-refs +agent-device diff frames ./frames --out transition-summary +agent-device diff video session.mp4 --out transition-summary --telemetry session.gesture-telemetry.json agent-device record start # Start screen recording to auto filename agent-device record start session.mp4 # Start recording to explicit path agent-device record start session.mp4 --fps 30 # Override iOS device runner FPS @@ -556,6 +558,9 @@ agent-device record stop # Stop active recording - If `tesseract` is installed, `diff screenshot` also adds best-effort OCR text deltas, movement clusters, and bbox size-change hints to the text and JSON output. OCR improves descriptions only; it does not change the pixel comparison or the diff PNG. - When OCR is available, `diff screenshot` also reports best-effort non-text visual deltas by masking OCR text boxes out of the diff and clustering remaining residuals. These are hints for icons, controls, and separators, not semantic icon recognition. - `diff screenshot --overlay-refs` additionally writes a separate current-screen overlay guide for live captures without using that annotated image for the pixel comparison. If current-screen refs intersect changed regions, the output lists the best ref matches under those regions. Saved-image comparisons do not have live accessibility refs, so `--overlay-refs` is unavailable when a `current.png` path is provided. 
+- `diff frames` summarizes transitions from a PNG frame directory or explicit PNG frame list. It prints a capped timeline with transition timing, keyframes, changed-region summaries, and optional OCR movement hints. +- `diff video` requires `ffmpeg` and `ffprobe` in `PATH`; it samples a recording into frames under the `--out` directory and runs the same transition summarizer. Use `--sample-fps <n>` and `--max-frames <n>` to tune extraction cost. +- Add `--telemetry <path>` to `diff frames` or `diff video` when you have a recording gesture sidecar. Transition labels can then include anchors such as `after tap` or `during up scroll`. - In `--json` mode, each overlay ref also includes a screenshot-space `center` point for coordinate fallback like `press <x> <y>`. - Burned-in touch overlays are exported only on macOS hosts, because the overlay pipeline depends on Swift + AVFoundation helpers. - On Linux or other non-macOS hosts, `record stop` still succeeds and returns the raw video plus telemetry sidecar, and includes `overlayWarning` when burn-in overlays were skipped. 
From 6569b558be706237f5d9ed19a5d3f4ae0f3f13e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pierzcha=C5=82a?= Date: Tue, 14 Apr 2026 09:38:56 +0200 Subject: [PATCH 2/3] fix: address review feedback --- .../agent-device/references/verification.md | 2 +- src/__tests__/cli-diff.test.ts | 40 +++++++++++++++++++ src/cli/commands/screenshot.ts | 18 +++++++-- src/utils/__tests__/video-frames.test.ts | 32 +++++++++++++++ src/utils/command-schema.ts | 12 ++++++ src/utils/transition-summary.ts | 18 ++++++--- src/utils/video-frames.ts | 7 ++-- website/docs/docs/commands.md | 2 +- 8 files changed, 116 insertions(+), 15 deletions(-) create mode 100644 src/utils/__tests__/video-frames.test.ts diff --git a/skills/agent-device/references/verification.md b/skills/agent-device/references/verification.md index 836c6e62..4406d719 100644 --- a/skills/agent-device/references/verification.md +++ b/skills/agent-device/references/verification.md @@ -78,7 +78,7 @@ agent-device diff video ./recordings/settings.mov --out /tmp/settings-transition agent-device diff video ./recordings/settings.mov --sample-fps 8 --max-frames 120 --json ``` -- `diff frames` accepts a directory of PNG frames or explicit PNG paths. It works without external video tools. +- `diff frames` accepts a directory of PNG frames or explicit PNG paths. It works without external video tools. Use `--frame-interval-ms ` when recording telemetry timestamps need to line up with a known frame cadence; otherwise frames are spaced at `100ms`. - `diff video` requires `ffmpeg` and `ffprobe` in `PATH`; it samples the recording into PNG frames, then runs the same transition summarizer. - Add `--telemetry ` with a recording gesture sidecar when available. The output can then anchor transitions to events such as `after tap` or `during up scroll`. - The text output stays capped to the top transitions, keyframes, changed-region summaries, and optional OCR movement hints. Use `--json` when you need the structured metrics. 
diff --git a/src/__tests__/cli-diff.test.ts b/src/__tests__/cli-diff.test.ts index 49ab1dda..ef4cd1e6 100644 --- a/src/__tests__/cli-diff.test.ts +++ b/src/__tests__/cli-diff.test.ts @@ -485,12 +485,15 @@ describe('cli diff commands', () => { outputDir, '--telemetry', telemetryPath, + '--frame-interval-ms', + '250', '--threshold', '0', ]); assert.equal(result.code, null); assert.equal(result.calls.length, 0); assert.match(result.stdout, /Frame transition summary: 1 transition/); + assert.match(result.stdout, /0ms-500ms after tap x=20 y=20/); assert.match(result.stdout, /after tap x=20 y=20/); assert.match(result.stdout, /keyframes:/); assert.equal(fs.existsSync(path.join(outputDir, 'transition-1.diff.png')), true); @@ -512,4 +515,41 @@ describe('cli diff commands', () => { assert.equal(result.calls.length, 0); assert.match(result.stderr, /diff frames does not support --overlay-refs/); }); + + test('diff frames rejects screenshot-only baseline flag', async () => { + const result = await runCliCapture([ + 'diff', + 'frames', + './frame-1.png', + './frame-2.png', + '--baseline', + './baseline.png', + ]); + + assert.equal(result.code, 1); + assert.equal(result.calls.length, 0); + assert.match(result.stderr, /diff frames does not support --baseline/); + }); + + test('diff video rejects extra positional paths before probing ffmpeg', async () => { + const result = await runCliCapture(['diff', 'video', './one.mp4', './two.mp4']); + + assert.equal(result.code, 1); + assert.equal(result.calls.length, 0); + assert.match(result.stderr, /diff video requires exactly one video path/); + }); + + test('diff video rejects screenshot-only baseline flag before probing ffmpeg', async () => { + const result = await runCliCapture([ + 'diff', + 'video', + './session.mp4', + '--baseline', + './baseline.png', + ]); + + assert.equal(result.code, 1); + assert.equal(result.calls.length, 0); + assert.match(result.stderr, /diff video does not support --baseline/); + }); }); diff --git 
a/src/cli/commands/screenshot.ts b/src/cli/commands/screenshot.ts index 1c0519a3..d0d4e4c9 100644 --- a/src/cli/commands/screenshot.ts +++ b/src/cli/commands/screenshot.ts @@ -50,9 +50,15 @@ export const diffCommand: ClientCommandHandler = async ({ positionals, flags, cl } if (positionals[0] === 'frames') { - rejectUnsupportedDiffFlags(flags, ['overlayRefs', 'sampleFps', 'maxFrames'], 'diff frames'); + rejectUnsupportedDiffFlags( + flags, + ['baseline', 'overlayRefs', 'sampleFps', 'maxFrames'], + 'diff frames', + ); const outputDir = resolveTransitionOutputDir(flags.out); - const frames = await collectFrameInputs(positionals.slice(1)); + const frames = await collectFrameInputs(positionals.slice(1), { + frameIntervalMs: flags.frameIntervalMs, + }); const result = await summarizeFrameTransitions({ frames, input: { @@ -72,7 +78,7 @@ export const diffCommand: ClientCommandHandler = async ({ positionals, flags, cl } if (positionals[0] === 'video') { - rejectUnsupportedDiffFlags(flags, ['overlayRefs'], 'diff video'); + rejectUnsupportedDiffFlags(flags, ['baseline', 'frameIntervalMs', 'overlayRefs'], 'diff video'); const videoRaw = positionals[1]; if (!videoRaw || positionals.length > 2) { throw new AppError('INVALID_ARGS', 'diff video requires exactly one video path'); @@ -108,7 +114,11 @@ export const diffCommand: ClientCommandHandler = async ({ positionals, flags, cl } if (positionals[0] !== 'screenshot') return false; - rejectUnsupportedDiffFlags(flags, ['sampleFps', 'maxFrames', 'telemetry'], 'diff screenshot'); + rejectUnsupportedDiffFlags( + flags, + ['sampleFps', 'maxFrames', 'frameIntervalMs', 'telemetry'], + 'diff screenshot', + ); const baselineRaw = flags.baseline; if (!baselineRaw || typeof baselineRaw !== 'string') { diff --git a/src/utils/__tests__/video-frames.test.ts b/src/utils/__tests__/video-frames.test.ts new file mode 100644 index 00000000..7622f14a --- /dev/null +++ b/src/utils/__tests__/video-frames.test.ts @@ -0,0 +1,32 @@ +import { beforeEach, 
test, vi } from 'vitest'; +import assert from 'node:assert/strict'; +import { extractVideoFrames } from '../video-frames.ts'; +import { runCmd, whichCmd } from '../exec.ts'; +import { AppError } from '../errors.ts'; + +vi.mock('../exec.ts', async (importOriginal) => { + const actual = await importOriginal(); + return { ...actual, runCmd: vi.fn(), whichCmd: vi.fn() }; +}); + +const mockRunCmd = vi.mocked(runCmd); +const mockWhichCmd = vi.mocked(whichCmd); + +beforeEach(() => { + mockRunCmd.mockReset(); + mockWhichCmd.mockReset(); +}); + +test('extractVideoFrames reports a TOOL_MISSING error when ffmpeg tooling is absent', async () => { + mockWhichCmd.mockResolvedValue(false); + + await assert.rejects( + () => extractVideoFrames({ videoPath: '/tmp/session.mp4', outputDir: '/tmp/frames' }), + (error) => + error instanceof AppError && + error.code === 'TOOL_MISSING' && + error.message === 'diff video requires ffmpeg and ffprobe in PATH' && + error.details?.hint === 'Install FFmpeg, then retry diff video.', + ); + assert.equal(mockRunCmd.mock.calls.length, 0); +}); diff --git a/src/utils/command-schema.ts b/src/utils/command-schema.ts index 819c6820..5d82deeb 100644 --- a/src/utils/command-schema.ts +++ b/src/utils/command-schema.ts @@ -62,6 +62,7 @@ export type CliFlags = { threshold?: string; sampleFps?: number; maxFrames?: number; + frameIntervalMs?: number; telemetry?: string; appsFilter?: 'user-installed' | 'all'; count?: number; @@ -947,6 +948,16 @@ const FLAG_DEFINITIONS: readonly FlagDefinition[] = [ usageLabel: '--max-frames ', usageDescription: 'Diff video: maximum sampled frames to extract', }, + { + key: 'frameIntervalMs', + names: ['--frame-interval-ms'], + type: 'int', + min: 1, + max: 60_000, + usageLabel: '--frame-interval-ms ', + usageDescription: + 'Diff frames: timestamp spacing for frame sequences when aligning telemetry (default 100)', + }, { key: 'telemetry', names: ['--telemetry'], @@ -1094,6 +1105,7 @@ const COMMAND_SCHEMAS: Record = { 
'overlayRefs', 'sampleFps', 'maxFrames', + 'frameIntervalMs', 'telemetry', ], }, diff --git a/src/utils/transition-summary.ts b/src/utils/transition-summary.ts index 333e9776..e5c303db 100644 --- a/src/utils/transition-summary.ts +++ b/src/utils/transition-summary.ts @@ -15,9 +15,7 @@ export type FrameSample = { export type TransitionTelemetryEvent = RecordingGestureEvent; -export type TransitionSummaryInput = { - kind: 'frames' | 'video'; - path?: string; +type BaseTransitionSummaryInput = { frameCount: number; sampledFrameCount: number; durationMs?: number; @@ -25,6 +23,10 @@ export type TransitionSummaryInput = { telemetryPath?: string; }; +export type TransitionSummaryInput = + | ({ kind: 'frames' } & BaseTransitionSummaryInput) + | ({ kind: 'video'; path: string } & BaseTransitionSummaryInput); + export type TransitionSummaryEvent = { index: number; startMs: number; @@ -165,7 +167,10 @@ export async function summarizeFrameTransitions(params: { }; } -export async function collectFrameInputs(inputs: string[]): Promise { +export async function collectFrameInputs( + inputs: string[], + options: { frameIntervalMs?: number } = {}, +): Promise { if (inputs.length === 0) { throw new AppError('INVALID_ARGS', 'diff frames requires a frame path or directory'); } @@ -193,10 +198,11 @@ export async function collectFrameInputs(inputs: string[]): Promise ({ index, path: framePath, - timestampMs: index * 100, + timestampMs: index * frameIntervalMs, })); } @@ -272,6 +278,8 @@ function findTrigger( const durationMs = 'durationMs' in event && typeof event.durationMs === 'number' ? event.durationMs : 0; const eventEndMs = event.tMs + durationMs; + // Continuous gestures should attach while overlapping the transition; discrete taps + // often land just before the first changed frame because UI animations start after input. 
const overlaps = event.tMs <= endMs + 250 && eventEndMs >= startMs - 250; const before = event.tMs <= startMs && startMs - event.tMs <= 1_000; if (!overlaps && !before) continue; diff --git a/src/utils/video-frames.ts b/src/utils/video-frames.ts index 7f18f603..68509f2c 100644 --- a/src/utils/video-frames.ts +++ b/src/utils/video-frames.ts @@ -27,7 +27,8 @@ export async function extractVideoFrames(params: { sampleFps?: number; maxFrames?: number; }): Promise { - if (!(await whichCmd('ffmpeg')) || !(await whichCmd('ffprobe'))) { + const [hasFfmpeg, hasFfprobe] = await Promise.all([whichCmd('ffmpeg'), whichCmd('ffprobe')]); + if (!hasFfmpeg || !hasFfprobe) { throw new AppError('TOOL_MISSING', 'diff video requires ffmpeg and ffprobe in PATH', { hint: 'Install FFmpeg, then retry diff video.', }); @@ -39,7 +40,7 @@ export async function extractVideoFrames(params: { const durationMs = await probeVideoDurationMs(params.videoPath); const sampleFps = durationMs && durationMs > 0 - ? Math.min(requestedFps, Math.max(1 / (durationMs / 1_000), maxFrames / (durationMs / 1_000))) + ? Math.min(requestedFps, maxFrames / (durationMs / 1_000)) : requestedFps; const pattern = path.join(params.outputDir, 'frame-%06d.png'); const result = await runCmd( @@ -54,8 +55,6 @@ export async function extractVideoFrames(params: { `fps=${formatFps(sampleFps)},showinfo`, '-frames:v', String(maxFrames), - '-vsync', - '0', pattern, ], { allowFailure: true, timeoutMs: VIDEO_EXTRACT_TIMEOUT_MS }, diff --git a/website/docs/docs/commands.md b/website/docs/docs/commands.md index 4cf4c012..4ab3859f 100644 --- a/website/docs/docs/commands.md +++ b/website/docs/docs/commands.md @@ -558,7 +558,7 @@ agent-device record stop # Stop active recording - If `tesseract` is installed, `diff screenshot` also adds best-effort OCR text deltas, movement clusters, and bbox size-change hints to the text and JSON output. OCR improves descriptions only; it does not change the pixel comparison or the diff PNG. 
- When OCR is available, `diff screenshot` also reports best-effort non-text visual deltas by masking OCR text boxes out of the diff and clustering remaining residuals. These are hints for icons, controls, and separators, not semantic icon recognition. - `diff screenshot --overlay-refs` additionally writes a separate current-screen overlay guide for live captures without using that annotated image for the pixel comparison. If current-screen refs intersect changed regions, the output lists the best ref matches under those regions. Saved-image comparisons do not have live accessibility refs, so `--overlay-refs` is unavailable when a `current.png` path is provided. -- `diff frames` summarizes transitions from a PNG frame directory or explicit PNG frame list. It prints a capped timeline with transition timing, keyframes, changed-region summaries, and optional OCR movement hints. +- `diff frames` summarizes transitions from a PNG frame directory or explicit PNG frame list. It prints a capped timeline with transition timing, keyframes, changed-region summaries, and optional OCR movement hints. Use `--frame-interval-ms <ms>` when aligning frame sequences with recording telemetry; the default is `100`. - `diff video` requires `ffmpeg` and `ffprobe` in `PATH`; it samples a recording into frames under the `--out` directory and runs the same transition summarizer. Use `--sample-fps <n>` and `--max-frames <n>` to tune extraction cost. - Add `--telemetry <path>` to `diff frames` or `diff video` when you have a recording gesture sidecar. Transition labels can then include anchors such as `after tap` or `during up scroll`. - In `--json` mode, each overlay ref also includes a screenshot-space `center` point for coordinate fallback like `press <x> <y>`. 
From ddeeb9fbdc26c713f4bf92d694654ad3336b1b95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pierzcha=C5=82a?= Date: Tue, 14 Apr 2026 10:32:38 +0200 Subject: [PATCH 3/3] fix: clear stale video frame artifacts --- src/utils/__tests__/video-frames.test.ts | 26 ++++++++++++++++++++++++ src/utils/video-frames.ts | 13 ++++++++++++ 2 files changed, 39 insertions(+) diff --git a/src/utils/__tests__/video-frames.test.ts b/src/utils/__tests__/video-frames.test.ts index 7622f14a..9d744106 100644 --- a/src/utils/__tests__/video-frames.test.ts +++ b/src/utils/__tests__/video-frames.test.ts @@ -1,5 +1,8 @@ import { beforeEach, test, vi } from 'vitest'; import assert from 'node:assert/strict'; +import fs from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; import { extractVideoFrames } from '../video-frames.ts'; import { runCmd, whichCmd } from '../exec.ts'; import { AppError } from '../errors.ts'; @@ -30,3 +33,26 @@ test('extractVideoFrames reports a TOOL_MISSING error when ffmpeg tooling is abs ); assert.equal(mockRunCmd.mock.calls.length, 0); }); + +test('extractVideoFrames clears stale frame PNGs before reading extracted frames', async () => { + const outputDir = fs.mkdtempSync(path.join(os.tmpdir(), 'agent-device-video-frames-')); + const stalePath = path.join(outputDir, 'frame-000002.png'); + fs.writeFileSync(stalePath, Buffer.from('stale')); + + mockWhichCmd.mockResolvedValue(true); + mockRunCmd.mockImplementation(async () => { + const freshPath = path.join(outputDir, 'frame-000001.png'); + fs.writeFileSync(freshPath, Buffer.from('fresh')); + return { stdout: '', stderr: 'pts_time:0.100', exitCode: 0 }; + }); + + const result = await extractVideoFrames({ + videoPath: '/tmp/session.mp4', + outputDir, + sampleFps: 2, + maxFrames: 2, + }); + + assert.equal(result.frames.length, 1); + assert.equal(fs.existsSync(stalePath), false); +}); diff --git a/src/utils/video-frames.ts b/src/utils/video-frames.ts index 68509f2c..f97a2db6 100644 --- 
a/src/utils/video-frames.ts +++ b/src/utils/video-frames.ts @@ -35,6 +35,7 @@ export async function extractVideoFrames(params: { } await fs.mkdir(params.outputDir, { recursive: true }); + await removeStaleFrames(params.outputDir); const maxFrames = params.maxFrames ?? DEFAULT_MAX_FRAMES; const requestedFps = params.sampleFps ?? DEFAULT_SAMPLE_FPS; const durationMs = await probeVideoDurationMs(params.videoPath); @@ -80,6 +81,18 @@ export async function extractVideoFrames(params: { }; } +async function removeStaleFrames(outputDir: string): Promise { + const entries = await fs.readdir(outputDir); + const stale = entries.filter((entry) => /^frame-\d+\.png$/i.test(entry)); + await Promise.all( + stale.map((entry) => + fs.rm(path.join(outputDir, entry), { + force: true, + }), + ), + ); +} + async function probeVideoDurationMs(videoPath: string): Promise { const result = await runCmd( 'ffprobe',