diff --git a/skills/agent-device/references/verification.md b/skills/agent-device/references/verification.md index 40500119..4406d719 100644 --- a/skills/agent-device/references/verification.md +++ b/skills/agent-device/references/verification.md @@ -9,6 +9,8 @@ Open this file when the task needs evidence, regression checks, replay maintenan - `screenshot` - `diff snapshot` - `diff screenshot` +- `diff frames` +- `diff video` - `record` - `replay -u` - `perf` @@ -65,6 +67,23 @@ agent-device diff screenshot --baseline ./baseline.png --out /tmp/diff.png --ove - When OCR is available, `diff screenshot` also reports best-effort non-text visual deltas by masking OCR text boxes out of the pixel diff and clustering the remaining residuals. Treat these as hints for icons, controls, and separators, not semantic icon recognition. - Add `--overlay-refs` to `diff screenshot` when you also want a separate current-screen overlay guide for a live capture. The raw screenshot is still used for pixel comparison; the overlay guide is only context for non-text controls, icons, and tappable regions. When overlay refs intersect changed regions, the output lists the best current-screen ref matches under the affected region. Saved-image comparisons do not have live accessibility refs, so omit `--overlay-refs` when passing a current image path. +## Transition summaries with diff frames/video + +Use `diff frames` or `diff video` when a screenshot pair is too static and you need a compact timeline for a transition, animation, or recorded interaction. 
+ +```bash +agent-device diff frames ./frames --out /tmp/settings-transition +agent-device diff frames ./frame-001.png ./frame-002.png ./frame-003.png --out /tmp/settings-transition +agent-device diff video ./recordings/settings.mov --out /tmp/settings-transition --telemetry ./recordings/settings.gesture-telemetry.json +agent-device diff video ./recordings/settings.mov --sample-fps 8 --max-frames 120 --json +``` + +- `diff frames` accepts a directory of PNG frames or explicit PNG paths. It works without external video tools. Use `--frame-interval-ms <ms>` when recording telemetry timestamps need to line up with a known frame cadence; otherwise frames are spaced at `100ms`. +- `diff video` requires `ffmpeg` and `ffprobe` in `PATH`; it samples the recording into PNG frames, then runs the same transition summarizer. +- Add `--telemetry <path>` with a recording gesture sidecar when available. The output can then anchor transitions to events such as `after tap` or `during up scroll`. +- The text output stays capped to the top transitions, keyframes, changed-region summaries, and optional OCR movement hints. Use `--json` when you need the structured metrics. +- Install `tesseract` for OCR movement hints on selected transition boundaries. OCR is optional and is not run for every sampled frame. + +## Session recording + +Use `record` for debugging, documentation, or shareable verification artifacts. 
diff --git a/src/__tests__/cli-diff.test.ts b/src/__tests__/cli-diff.test.ts index e76ad6f5..ef4cd1e6 100644 --- a/src/__tests__/cli-diff.test.ts +++ b/src/__tests__/cli-diff.test.ts @@ -43,6 +43,26 @@ function solidPngBuffer( return PNG.sync.write(png); } +function movingBlockPngBuffer(offset: number): Buffer { + const png = new PNG({ width: 40, height: 40 }); + for (let i = 0; i < png.data.length; i += 4) { + png.data[i] = 240; + png.data[i + 1] = 240; + png.data[i + 2] = 240; + png.data[i + 3] = 255; + } + for (let y = 12; y < 28; y += 1) { + for (let x = 8 + offset; x < 24 + offset; x += 1) { + const index = (y * png.width + x) * 4; + png.data[index] = 30; + png.data[index + 1] = 30; + png.data[index + 2] = 30; + png.data[index + 3] = 255; + } + } + return PNG.sync.write(png); +} + async function runCliCapture( argv: string[], options: RunCliCaptureOptions = {}, @@ -436,4 +456,100 @@ describe('cli diff commands', () => { fs.rmSync(dir, { recursive: true, force: true }); } }); + + test('diff frames summarizes a local PNG frame sequence without daemon calls', async () => { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'cli-diff-frames-test-')); + const outputDir = path.join(dir, 'out'); + const telemetryPath = path.join(dir, 'capture.gesture-telemetry.json'); + for (const [index, offset] of [0, 6, 12, 12].entries()) { + fs.writeFileSync(path.join(dir, `frame-${index}.png`), movingBlockPngBuffer(offset)); + } + fs.writeFileSync( + telemetryPath, + JSON.stringify({ + version: 1, + generatedAt: new Date(0).toISOString(), + events: [{ kind: 'tap', tMs: 10, x: 20, y: 20 }], + }), + ); + + try { + const result = await runCliCapture([ + 'diff', + 'frames', + path.join(dir, 'frame-0.png'), + path.join(dir, 'frame-1.png'), + path.join(dir, 'frame-2.png'), + path.join(dir, 'frame-3.png'), + '--out', + outputDir, + '--telemetry', + telemetryPath, + '--frame-interval-ms', + '250', + '--threshold', + '0', + ]); + assert.equal(result.code, null); + 
assert.equal(result.calls.length, 0); + assert.match(result.stdout, /Frame transition summary: 1 transition/); + assert.match(result.stdout, /0ms-500ms after tap x=20 y=20/); + assert.match(result.stdout, /after tap x=20 y=20/); + assert.match(result.stdout, /keyframes:/); + assert.equal(fs.existsSync(path.join(outputDir, 'transition-1.diff.png')), true); + } finally { + fs.rmSync(dir, { recursive: true, force: true }); + } + }); + + test('diff frames rejects screenshot-only overlay refs flag', async () => { + const result = await runCliCapture([ + 'diff', + 'frames', + './frame-1.png', + './frame-2.png', + '--overlay-refs', + ]); + + assert.equal(result.code, 1); + assert.equal(result.calls.length, 0); + assert.match(result.stderr, /diff frames does not support --overlay-refs/); + }); + + test('diff frames rejects screenshot-only baseline flag', async () => { + const result = await runCliCapture([ + 'diff', + 'frames', + './frame-1.png', + './frame-2.png', + '--baseline', + './baseline.png', + ]); + + assert.equal(result.code, 1); + assert.equal(result.calls.length, 0); + assert.match(result.stderr, /diff frames does not support --baseline/); + }); + + test('diff video rejects extra positional paths before probing ffmpeg', async () => { + const result = await runCliCapture(['diff', 'video', './one.mp4', './two.mp4']); + + assert.equal(result.code, 1); + assert.equal(result.calls.length, 0); + assert.match(result.stderr, /diff video requires exactly one video path/); + }); + + test('diff video rejects screenshot-only baseline flag before probing ffmpeg', async () => { + const result = await runCliCapture([ + 'diff', + 'video', + './session.mp4', + '--baseline', + './baseline.png', + ]); + + assert.equal(result.code, 1); + assert.equal(result.calls.length, 0); + assert.match(result.stderr, /diff video does not support --baseline/); + }); }); diff --git a/src/cli/commands/screenshot.ts b/src/cli/commands/screenshot.ts index 0f6077f1..d0d4e4c9 100644 --- 
a/src/cli/commands/screenshot.ts +++ b/src/cli/commands/screenshot.ts @@ -1,11 +1,17 @@ import fs from 'node:fs'; import os from 'node:os'; import path from 'node:path'; -import { formatScreenshotDiffText, formatSnapshotDiffText } from '../../utils/output.ts'; +import { + formatScreenshotDiffText, + formatSnapshotDiffText, + formatTransitionSummaryText, +} from '../../utils/output.ts'; import { AppError } from '../../utils/errors.ts'; import { compareScreenshots, type ScreenshotDiffResult } from '../../utils/screenshot-diff.ts'; import { attachCurrentOverlayMatches } from '../../utils/screenshot-diff-overlay-matches.ts'; import { resolveUserPath } from '../../utils/path-resolution.ts'; +import { collectFrameInputs, summarizeFrameTransitions } from '../../utils/transition-summary.ts'; +import { extractVideoFrames } from '../../utils/video-frames.ts'; import { buildSelectionOptions, writeCommandOutput } from './shared.ts'; import type { ClientCommandHandler } from './router.ts'; @@ -43,7 +49,76 @@ export const diffCommand: ClientCommandHandler = async ({ positionals, flags, cl return true; } + if (positionals[0] === 'frames') { + rejectUnsupportedDiffFlags( + flags, + ['baseline', 'overlayRefs', 'sampleFps', 'maxFrames'], + 'diff frames', + ); + const outputDir = resolveTransitionOutputDir(flags.out); + const frames = await collectFrameInputs(positionals.slice(1), { + frameIntervalMs: flags.frameIntervalMs, + }); + const result = await summarizeFrameTransitions({ + frames, + input: { + kind: 'frames', + frameCount: frames.length, + sampledFrameCount: frames.length, + ...(flags.telemetry ? { telemetryPath: resolveUserPath(flags.telemetry) } : {}), + }, + options: { + threshold: readDiffThreshold(flags.threshold), + outputDir, + ...(flags.telemetry ? 
{ telemetryPath: flags.telemetry } : {}), + }, + }); + writeCommandOutput(flags, result, () => formatTransitionSummaryText(result)); + return true; + } + + if (positionals[0] === 'video') { + rejectUnsupportedDiffFlags(flags, ['baseline', 'frameIntervalMs', 'overlayRefs'], 'diff video'); + const videoRaw = positionals[1]; + if (!videoRaw || positionals.length > 2) { + throw new AppError('INVALID_ARGS', 'diff video requires exactly one video path'); + } + const videoPath = resolveUserPath(videoRaw); + const outputDir = resolveTransitionOutputDir(flags.out); + const framesDir = path.join(outputDir, 'frames'); + const extracted = await extractVideoFrames({ + videoPath, + outputDir: framesDir, + sampleFps: flags.sampleFps, + maxFrames: flags.maxFrames, + }); + const result = await summarizeFrameTransitions({ + frames: extracted.frames, + input: { + kind: 'video', + path: videoPath, + frameCount: extracted.frames.length, + sampledFrameCount: extracted.frames.length, + sampleFps: extracted.sampleFps, + ...(extracted.durationMs ? { durationMs: extracted.durationMs } : {}), + ...(flags.telemetry ? { telemetryPath: resolveUserPath(flags.telemetry) } : {}), + }, + options: { + threshold: readDiffThreshold(flags.threshold), + outputDir, + ...(flags.telemetry ? 
{ telemetryPath: flags.telemetry } : {}), + }, + }); + writeCommandOutput(flags, result, () => formatTransitionSummaryText(result)); + return true; + } + if (positionals[0] !== 'screenshot') return false; + rejectUnsupportedDiffFlags( + flags, + ['sampleFps', 'maxFrames', 'frameIntervalMs', 'telemetry'], + 'diff screenshot', + ); const baselineRaw = flags.baseline; if (!baselineRaw || typeof baselineRaw !== 'string') { @@ -60,13 +135,7 @@ export const diffCommand: ClientCommandHandler = async ({ positionals, flags, cl ); } - let thresholdNum = 0.1; - if (flags.threshold != null && flags.threshold !== '') { - thresholdNum = Number(flags.threshold); - if (Number.isNaN(thresholdNum) || thresholdNum < 0 || thresholdNum > 1) { - throw new AppError('INVALID_ARGS', '--threshold must be a number between 0 and 1'); - } - } + const thresholdNum = readDiffThreshold(flags.threshold); if (currentRaw) { if (flags.overlayRefs) { @@ -144,3 +213,38 @@ function removeStaleCurrentOverlay(outputPath: string): void { function isFsError(error: unknown, code: string): error is NodeJS.ErrnoException { return typeof error === 'object' && error !== null && 'code' in error && error.code === code; } + +function readDiffThreshold(rawThreshold: unknown): number { + if (rawThreshold == null || rawThreshold === '') return 0.1; + const threshold = Number(rawThreshold); + if (Number.isNaN(threshold) || threshold < 0 || threshold > 1) { + throw new AppError('INVALID_ARGS', '--threshold must be a number between 0 and 1'); + } + return threshold; +} + +function resolveTransitionOutputDir(rawOut: unknown): string { + const outputDir = + typeof rawOut === 'string' + ? 
resolveUserPath(rawOut) + : fs.mkdtempSync(path.join(os.tmpdir(), 'agent-device-transition-diff-')); + fs.mkdirSync(outputDir, { recursive: true }); + return outputDir; +} + +function rejectUnsupportedDiffFlags( + flags: Record<string, unknown>, + flagKeys: string[], + commandLabel: string, +): void { + const unsupported = flagKeys.filter((key) => flags[key] !== undefined); + if (unsupported.length === 0) return; + throw new AppError( + 'INVALID_ARGS', + `${commandLabel} does not support ${unsupported.map((key) => `--${toKebabCase(key)}`).join(', ')}`, + ); +} + +function toKebabCase(value: string): string { + return value.replace(/[A-Z]/g, (match) => `-${match.toLowerCase()}`); +} diff --git a/src/utils/__tests__/transition-summary.test.ts b/src/utils/__tests__/transition-summary.test.ts new file mode 100644 index 00000000..7072ca37 --- /dev/null +++ b/src/utils/__tests__/transition-summary.test.ts @@ -0,0 +1,89 @@ +import { test } from 'vitest'; +import assert from 'node:assert/strict'; +import fs from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; +import { PNG } from 'pngjs'; +import { summarizeFrameTransitions, type FrameSample } from '../transition-summary.ts'; + +function tmpDir(): string { + return fs.mkdtempSync(path.join(os.tmpdir(), 'agent-device-transition-summary-')); +} + +function writeSettingsFrame(filePath: string, xOffset: number): void { + const png = new PNG({ width: 120, height: 180 }); + paintRect(png, { x: 0, y: 0, width: 120, height: 180 }, { r: 242, g: 242, b: 247 }); + paintRect(png, { x: 0, y: 0, width: 120, height: 36 }, { r: 248, g: 248, b: 248 }); + paintRect(png, { x: 10 + xOffset, y: 54, width: 100, height: 38 }, { r: 255, g: 255, b: 255 }); + paintRect(png, { x: 18 + xOffset, y: 66, width: 36, height: 8 }, { r: 30, g: 30, b: 30 }); + paintRect(png, { x: 96 + xOffset, y: 66, width: 6, height: 10 }, { r: 130, g: 130, b: 130 }); + paintRect(png, { x: 10 + xOffset, y: 100, width: 100, height: 38 }, { r: 255, g: 255, b: 255 }); + 
paintRect(png, { x: 18 + xOffset, y: 112, width: 48, height: 8 }, { r: 30, g: 30, b: 30 }); + paintRect(png, { x: 96 + xOffset, y: 112, width: 6, height: 10 }, { r: 130, g: 130, b: 130 }); + fs.writeFileSync(filePath, PNG.sync.write(png)); +} + +function paintRect( + png: PNG, + rect: { x: number; y: number; width: number; height: number }, + color: { r: number; g: number; b: number }, +): void { + const startX = Math.max(0, rect.x); + const endX = Math.min(png.width, rect.x + rect.width); + const startY = Math.max(0, rect.y); + const endY = Math.min(png.height, rect.y + rect.height); + for (let y = startY; y < endY; y += 1) { + for (let x = startX; x < endX; x += 1) { + const index = (y * png.width + x) * 4; + png.data[index] = color.r; + png.data[index + 1] = color.g; + png.data[index + 2] = color.b; + png.data[index + 3] = 255; + } + } +} + +test('summarizes a Settings-like transition and anchors it to telemetry', async () => { + const dir = tmpDir(); + const outputDir = path.join(dir, 'out'); + const offsets = [0, -10, -24, -36, -36]; + const frames: FrameSample[] = offsets.map((offset, index) => { + const framePath = path.join(dir, `settings-${index}.png`); + writeSettingsFrame(framePath, offset); + return { index, path: framePath, timestampMs: index * 120 }; + }); + const telemetryPath = path.join(dir, 'settings.gesture-telemetry.json'); + fs.writeFileSync( + telemetryPath, + JSON.stringify({ + version: 1, + generatedAt: new Date(0).toISOString(), + events: [{ kind: 'tap', tMs: 20, x: 96, y: 66, referenceWidth: 120, referenceHeight: 180 }], + }), + ); + + try { + const result = await summarizeFrameTransitions({ + frames, + input: { + kind: 'frames', + frameCount: frames.length, + sampledFrameCount: frames.length, + telemetryPath, + }, + options: { + threshold: 0, + outputDir, + telemetryPath, + }, + }); + + assert.equal(result.transitions.length, 1); + assert.equal(result.transitions[0]?.trigger, 'after tap x=96 y=66'); + 
assert.ok(result.transitions[0]?.peakMismatchPercentage); + assert.ok(result.transitions[0]?.regions?.length); + assert.equal(fs.existsSync(result.transitions[0]?.keyframes.diff ?? ''), true); + } finally { + fs.rmSync(dir, { recursive: true, force: true }); + } +}); diff --git a/src/utils/__tests__/video-frames.test.ts b/src/utils/__tests__/video-frames.test.ts new file mode 100644 index 00000000..9d744106 --- /dev/null +++ b/src/utils/__tests__/video-frames.test.ts @@ -0,0 +1,58 @@ +import { beforeEach, test, vi } from 'vitest'; +import assert from 'node:assert/strict'; +import fs from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; +import { extractVideoFrames } from '../video-frames.ts'; +import { runCmd, whichCmd } from '../exec.ts'; +import { AppError } from '../errors.ts'; + +vi.mock('../exec.ts', async (importOriginal) => { + const actual = await importOriginal<typeof import('../exec.ts')>(); + return { ...actual, runCmd: vi.fn(), whichCmd: vi.fn() }; +}); + +const mockRunCmd = vi.mocked(runCmd); +const mockWhichCmd = vi.mocked(whichCmd); + +beforeEach(() => { + mockRunCmd.mockReset(); + mockWhichCmd.mockReset(); +}); + +test('extractVideoFrames reports a TOOL_MISSING error when ffmpeg tooling is absent', async () => { + mockWhichCmd.mockResolvedValue(false); + + await assert.rejects( + () => extractVideoFrames({ videoPath: '/tmp/session.mp4', outputDir: '/tmp/frames' }), + (error) => + error instanceof AppError && + error.code === 'TOOL_MISSING' && + error.message === 'diff video requires ffmpeg and ffprobe in PATH' && + error.details?.hint === 'Install FFmpeg, then retry diff video.', + ); + assert.equal(mockRunCmd.mock.calls.length, 0); +}); + +test('extractVideoFrames clears stale frame PNGs before reading extracted frames', async () => { + const outputDir = fs.mkdtempSync(path.join(os.tmpdir(), 'agent-device-video-frames-')); + const stalePath = path.join(outputDir, 'frame-000002.png'); + fs.writeFileSync(stalePath, Buffer.from('stale')); + + 
mockWhichCmd.mockResolvedValue(true); + mockRunCmd.mockImplementation(async () => { + const freshPath = path.join(outputDir, 'frame-000001.png'); + fs.writeFileSync(freshPath, Buffer.from('fresh')); + return { stdout: '', stderr: 'pts_time:0.100', exitCode: 0 }; + }); + + const result = await extractVideoFrames({ + videoPath: '/tmp/session.mp4', + outputDir, + sampleFps: 2, + maxFrames: 2, + }); + + assert.equal(result.frames.length, 1); + assert.equal(fs.existsSync(stalePath), false); +}); diff --git a/src/utils/command-schema.ts b/src/utils/command-schema.ts index b1f6f812..5d82deeb 100644 --- a/src/utils/command-schema.ts +++ b/src/utils/command-schema.ts @@ -60,6 +60,10 @@ export type CliFlags = { screenshotFullscreen?: boolean; baseline?: string; threshold?: string; + sampleFps?: number; + maxFrames?: number; + frameIntervalMs?: number; + telemetry?: string; appsFilter?: 'user-installed' | 'all'; count?: number; fps?: number; @@ -924,7 +928,42 @@ const FLAG_DEFINITIONS: readonly FlagDefinition[] = [ names: ['--threshold'], type: 'string', usageLabel: '--threshold <0-1>', - usageDescription: 'Diff screenshot: color distance threshold (default 0.1)', + usageDescription: 'Diff screenshot/frames/video: color distance threshold (default 0.1)', + }, + { + key: 'sampleFps', + names: ['--sample-fps'], + type: 'int', + min: 1, + max: 60, + usageLabel: '--sample-fps <fps>', + usageDescription: 'Diff video: frames per second to sample before transition analysis', + }, + { + key: 'maxFrames', + names: ['--max-frames'], + type: 'int', + min: 2, + max: 500, + usageLabel: '--max-frames <n>', + usageDescription: 'Diff video: maximum sampled frames to extract', + }, + { + key: 'frameIntervalMs', + names: ['--frame-interval-ms'], + type: 'int', + min: 1, + max: 60_000, + usageLabel: '--frame-interval-ms <ms>', + usageDescription: + 'Diff frames: timestamp spacing for frame sequences when aligning telemetry (default 100)', + }, + { + key: 'telemetry', + names: ['--telemetry'], + type: 
'string', + usageLabel: '--telemetry ', + usageDescription: 'Diff frames/video: gesture telemetry JSON sidecar for transition labels', }, ]; @@ -1053,11 +1092,22 @@ const COMMAND_SCHEMAS: Record = { }, diff: { usageOverride: - 'diff snapshot | diff screenshot --baseline [current.png] [--out ] [--threshold <0-1>] [--overlay-refs]', - helpDescription: 'Diff accessibility snapshot or compare screenshots pixel-by-pixel', - summary: 'Diff snapshot or screenshot', - positionalArgs: ['kind', 'current?'], - allowedFlags: [...SNAPSHOT_FLAGS, 'baseline', 'threshold', 'out', 'overlayRefs'], + 'diff snapshot | diff screenshot --baseline [current.png] [--out ] | diff frames [--out ] | diff video [--out ]', + helpDescription: 'Diff accessibility snapshots, screenshots, frame sequences, or videos', + summary: 'Diff snapshots, screenshots, frames, or videos', + positionalArgs: ['kind', 'input?'], + allowsExtraPositionals: true, + allowedFlags: [ + ...SNAPSHOT_FLAGS, + 'baseline', + 'threshold', + 'out', + 'overlayRefs', + 'sampleFps', + 'maxFrames', + 'frameIntervalMs', + 'telemetry', + ], }, 'ensure-simulator': { helpDescription: 'Ensure an iOS simulator exists in a device set (create if missing)', diff --git a/src/utils/output.ts b/src/utils/output.ts index 78e47e58..e0c6dd28 100644 --- a/src/utils/output.ts +++ b/src/utils/output.ts @@ -5,6 +5,7 @@ import type { SnapshotNode, SnapshotVisibility } from './snapshot.ts'; import { displayNodeLabel } from './snapshot-tree.ts'; import type { ScreenshotDiffResult } from './screenshot-diff.ts'; import type { ScreenshotDiffRegion } from './screenshot-diff-regions.ts'; +import type { TransitionSummaryResult } from './transition-summary.ts'; import { styleText } from 'node:util'; import { buildMobileSnapshotPresentation } from './mobile-snapshot-semantics.ts'; @@ -330,6 +331,66 @@ export function formatScreenshotDiffText(data: ScreenshotDiffResult): string { return `${lines.join('\n')}\n`; } +export function 
formatTransitionSummaryText(data: TransitionSummaryResult): string { + const useColor = supportsColor(); + const lines: string[] = []; + const inputLabel = + data.input.kind === 'video' ? 'Video transition summary' : 'Frame transition summary'; + const duration = data.input.durationMs ? ` from ${formatMs(data.input.durationMs)}` : ''; + const sampleFps = data.input.sampleFps ? ` at ${roundTwo(data.input.sampleFps)} fps` : ''; + lines.push( + `${inputLabel}: ${data.transitions.length} transition${data.transitions.length === 1 ? '' : 's'}, ` + + `sampled ${data.input.sampledFrameCount} frame${data.input.sampledFrameCount === 1 ? '' : 's'}${duration}${sampleFps}`, + ); + if (data.outputDir) { + lines.push(` ${formatMuted('Artifacts:', useColor)} ${toRelativePath(data.outputDir)}`); + } + if (data.omittedTransitions > 0) { + lines.push( + ` ${formatMuted(`Omitted ${data.omittedTransitions} lower-ranked transition(s).`, useColor)}`, + ); + } + if (data.transitions.length === 0) { + lines.push(' No significant frame-to-frame transitions detected.'); + return `${lines.join('\n')}\n`; + } + + for (const transition of data.transitions) { + const timing = `${formatMs(transition.startMs)}-${formatMs(transition.endMs)}`; + const trigger = transition.trigger ? ` ${transition.trigger}` : ''; + lines.push(`\n${transition.index}. 
${timing}${trigger}`); + lines.push(` ${transition.summary}`); + lines.push( + ` peak=${transition.peakMismatchPercentage}% avg=${transition.averageMismatchPercentage}% ` + + `duration=${formatMs(transition.durationMs)}`, + ); + const region = transition.regions?.[0]; + if (region) { + lines.push( + ` changed: ${region.size} ${region.location} ${region.shape}, ` + + `${region.shareOfDiffPercentage}% of diff, ${region.dominantChange}`, + ); + } + const cluster = transition.ocr?.movementClusters?.[0]; + if (cluster) { + lines.push( + ` text: ${formatQuotedList(cluster.texts)} dx=${formatRange(cluster.xRange)}px ` + + `dy=${formatRange(cluster.yRange)}px`, + ); + } + lines.push( + ` keyframes: before=${toRelativePath(transition.keyframes.before)} ` + + `mid=${toRelativePath(transition.keyframes.mid)} ` + + `after=${toRelativePath(transition.keyframes.after)}`, + ); + if (transition.keyframes.diff) { + lines.push(` diff: ${toRelativePath(transition.keyframes.diff)}`); + } + } + + return `${lines.join('\n')}\n`; +} + function formatRect(rect: { x: number; y: number; width: number; height: number }): string { return `x=${rect.x},y=${rect.y},w=${rect.width},h=${rect.height}`; } @@ -415,6 +476,15 @@ function toNumber(value: unknown): number { return typeof value === 'number' && Number.isFinite(value) ? 
value : 0; } +function formatMs(value: number): string { + if (value >= 1_000) return `${roundTwo(value / 1_000)}s`; + return `${Math.round(value)}ms`; +} + +function roundTwo(value: number): number { + return Math.round(value * 100) / 100; +} + function applyContextWindow(lines: SnapshotDiffLine[], contextWindow: number): SnapshotDiffLine[] { if (lines.length === 0) return lines; const changedIndices = lines diff --git a/src/utils/screenshot-diff.ts b/src/utils/screenshot-diff.ts index b15fca23..fbe3b629 100644 --- a/src/utils/screenshot-diff.ts +++ b/src/utils/screenshot-diff.ts @@ -34,6 +34,7 @@ export type ScreenshotDiffOptions = { threshold?: number; outputPath?: string; maxRegions?: number; + includeOcr?: boolean; }; // Each pixel is a point in 3D RGB space (R, G, B each 0–255). @@ -136,7 +137,7 @@ export async function compareScreenshots( } const ocrAnalysis = - differentPixels > 0 + differentPixels > 0 && options.includeOcr !== false ? await summarizeScreenshotOcr({ baselinePath, currentPath, diff --git a/src/utils/transition-summary.ts b/src/utils/transition-summary.ts new file mode 100644 index 00000000..e5c303db --- /dev/null +++ b/src/utils/transition-summary.ts @@ -0,0 +1,363 @@ +import { promises as fs } from 'node:fs'; +import path from 'node:path'; +import type { RecordingGestureEvent } from '../daemon/types.ts'; +import { AppError } from './errors.ts'; +import { resolveUserPath } from './path-resolution.ts'; +import { compareScreenshots } from './screenshot-diff.ts'; +import type { ScreenshotDiffResult } from './screenshot-diff.ts'; +import type { ScreenshotDiffRegion } from './screenshot-diff-regions.ts'; + +export type FrameSample = { + index: number; + path: string; + timestampMs: number; +}; + +export type TransitionTelemetryEvent = RecordingGestureEvent; + +type BaseTransitionSummaryInput = { + frameCount: number; + sampledFrameCount: number; + durationMs?: number; + sampleFps?: number; + telemetryPath?: string; +}; + +export type 
TransitionSummaryInput = + | ({ kind: 'frames' } & BaseTransitionSummaryInput) + | ({ kind: 'video'; path: string } & BaseTransitionSummaryInput); + +export type TransitionSummaryEvent = { + index: number; + startMs: number; + endMs: number; + durationMs: number; + classification: string; + trigger?: string; + summary: string; + peakMismatchPercentage: number; + averageMismatchPercentage: number; + keyframes: { + before: string; + mid: string; + after: string; + diff?: string; + }; + regions?: ScreenshotDiffRegion[]; + ocr?: ScreenshotDiffResult['ocr']; + nonTextDeltas?: ScreenshotDiffResult['nonTextDeltas']; +}; + +export type TransitionSummaryResult = { + input: TransitionSummaryInput; + outputDir?: string; + transitions: TransitionSummaryEvent[]; + omittedTransitions: number; +}; + +export type TransitionSummaryOptions = { + threshold?: number; + outputDir?: string; + telemetryPath?: string; + maxTransitions?: number; +}; + +type PairDiff = { + index: number; + mismatchPercentage: number; +}; + +type TransitionSegment = { + startPairIndex: number; + endPairIndex: number; + peakMismatchPercentage: number; + averageMismatchPercentage: number; +}; + +type TelemetryEnvelope = { + version?: unknown; + events?: unknown; +}; + +const MIN_SIGNIFICANT_MISMATCH_PERCENTAGE = 0.5; +const SEGMENT_GAP_TOLERANCE_PAIRS = 1; +const DEFAULT_MAX_TRANSITIONS = 5; + +export async function summarizeFrameTransitions(params: { + frames: FrameSample[]; + input: TransitionSummaryInput; + options?: TransitionSummaryOptions; +}): Promise<TransitionSummaryResult> { + const frames = [...params.frames].sort((left, right) => left.index - right.index); + if (frames.length < 2) { + throw new AppError('INVALID_ARGS', 'transition summary requires at least two frames'); + } + + const threshold = params.options?.threshold ?? 
0.1; + const pairDiffs: PairDiff[] = []; + for (let index = 0; index < frames.length - 1; index += 1) { + const diff = await compareScreenshots(frames[index]!.path, frames[index + 1]!.path, { + threshold, + includeOcr: false, + maxRegions: 3, + }); + pairDiffs.push({ + index, + mismatchPercentage: diff.mismatchPercentage, + }); + } + + const telemetryEvents = params.options?.telemetryPath + ? await readTelemetryEvents(params.options.telemetryPath) + : []; + const segments = segmentPairDiffs(pairDiffs); + const selectedSegments = segments + .sort((left, right) => right.peakMismatchPercentage - left.peakMismatchPercentage) + .slice(0, params.options?.maxTransitions ?? DEFAULT_MAX_TRANSITIONS) + .sort((left, right) => left.startPairIndex - right.startPairIndex); + + const transitions: TransitionSummaryEvent[] = []; + for (const [index, segment] of selectedSegments.entries()) { + const before = frames[segment.startPairIndex]!; + const after = frames[segment.endPairIndex + 1]!; + const mid = frames[Math.round((segment.startPairIndex + segment.endPairIndex + 1) / 2)]!; + const diffPath = params.options?.outputDir + ? path.join(params.options.outputDir, `transition-${index + 1}.diff.png`) + : undefined; + const boundaryDiff = await compareScreenshots(before.path, after.path, { + threshold, + outputPath: diffPath, + maxRegions: 5, + }); + const trigger = findTrigger(telemetryEvents, before.timestampMs, after.timestampMs); + const classification = classifyTransition(boundaryDiff, trigger); + transitions.push({ + index: index + 1, + startMs: before.timestampMs, + endMs: after.timestampMs, + durationMs: Math.max(0, after.timestampMs - before.timestampMs), + classification, + ...(trigger + ? 
{ trigger: formatTrigger(trigger, before.timestampMs, after.timestampMs) } + : {}), + summary: buildTransitionSummary(boundaryDiff, classification), + peakMismatchPercentage: roundTwo(segment.peakMismatchPercentage), + averageMismatchPercentage: roundTwo(segment.averageMismatchPercentage), + keyframes: { + before: before.path, + mid: mid.path, + after: after.path, + ...(boundaryDiff.diffPath ? { diff: boundaryDiff.diffPath } : {}), + }, + ...(boundaryDiff.regions && boundaryDiff.regions.length > 0 + ? { regions: boundaryDiff.regions.slice(0, 5) } + : {}), + ...(boundaryDiff.ocr ? { ocr: boundaryDiff.ocr } : {}), + ...(boundaryDiff.nonTextDeltas && boundaryDiff.nonTextDeltas.length > 0 + ? { nonTextDeltas: boundaryDiff.nonTextDeltas.slice(0, 5) } + : {}), + }); + } + + return { + input: params.input, + ...(params.options?.outputDir ? { outputDir: params.options.outputDir } : {}), + transitions, + omittedTransitions: Math.max(0, segments.length - selectedSegments.length), + }; +} + +export async function collectFrameInputs( + inputs: string[], + options: { frameIntervalMs?: number } = {}, +): Promise<FrameSample[]> { + if (inputs.length === 0) { + throw new AppError('INVALID_ARGS', 'diff frames requires a frame path or directory'); + } + + const framePaths: string[] = []; + if (inputs.length === 1) { + const resolved = resolveUserPath(inputs[0]!); + const stat = await fs.stat(resolved); + if (stat.isDirectory()) { + const entries = await fs.readdir(resolved); + framePaths.push( + ...entries + .filter((entry) => entry.toLowerCase().endsWith('.png')) + .sort(compareNatural) + .map((entry) => path.join(resolved, entry)), + ); + } else { + framePaths.push(resolved); + } + } else { + framePaths.push(...inputs.map((input) => resolveUserPath(input))); + } + + if (framePaths.length < 2) { + throw new AppError('INVALID_ARGS', 'diff frames requires at least two PNG frames'); + } + + const frameIntervalMs = options.frameIntervalMs ?? 
100; + return framePaths.map((framePath, index) => ({ + index, + path: framePath, + timestampMs: index * frameIntervalMs, + })); +} + +function segmentPairDiffs(pairDiffs: PairDiff[]): TransitionSegment[] { + const segments: TransitionSegment[] = []; + let activePairs: PairDiff[] = []; + let inactiveGap = 0; + + for (const pair of pairDiffs) { + if (pair.mismatchPercentage >= MIN_SIGNIFICANT_MISMATCH_PERCENTAGE) { + activePairs.push(pair); + inactiveGap = 0; + continue; + } + if (activePairs.length === 0) continue; + inactiveGap += 1; + if (inactiveGap <= SEGMENT_GAP_TOLERANCE_PAIRS) continue; + segments.push(toSegment(activePairs)); + activePairs = []; + inactiveGap = 0; + } + + if (activePairs.length > 0) { + segments.push(toSegment(activePairs)); + } + + return segments; +} + +function toSegment(pairs: PairDiff[]): TransitionSegment { + const peakMismatchPercentage = Math.max(...pairs.map((pair) => pair.mismatchPercentage)); + const averageMismatchPercentage = + pairs.reduce((sum, pair) => sum + pair.mismatchPercentage, 0) / pairs.length; + return { + startPairIndex: pairs[0]!.index, + endPairIndex: pairs[pairs.length - 1]!.index, + peakMismatchPercentage, + averageMismatchPercentage, + }; +} + +async function readTelemetryEvents(telemetryPath: string): Promise<TransitionTelemetryEvent[]> { + try { + const raw = JSON.parse( + await fs.readFile(resolveUserPath(telemetryPath), 'utf8'), + ) as TelemetryEnvelope; + if (!Array.isArray(raw.events)) return []; + return raw.events.filter(isTelemetryEvent); + } catch (error) { + throw new AppError( + 'INVALID_ARGS', + `invalid gesture telemetry JSON: ${telemetryPath}`, + undefined, + error, + ); + } +} + +function isTelemetryEvent(value: unknown): value is TransitionTelemetryEvent { + if (!value || typeof value !== 'object') return false; + const event = value as Partial<TransitionTelemetryEvent>; + return typeof event.kind === 'string' && typeof event.tMs === 'number'; +} + +function findTrigger( + events: TransitionTelemetryEvent[], + startMs: number, + endMs: number, +): 
TransitionTelemetryEvent | undefined { + let best: TransitionTelemetryEvent | undefined; + let bestDistance = Number.POSITIVE_INFINITY; + for (const event of events) { + const durationMs = + 'durationMs' in event && typeof event.durationMs === 'number' ? event.durationMs : 0; + const eventEndMs = event.tMs + durationMs; + // Continuous gestures should attach while overlapping the transition; discrete taps + // often land just before the first changed frame because UI animations start after input. + const overlaps = event.tMs <= endMs + 250 && eventEndMs >= startMs - 250; + const before = event.tMs <= startMs && startMs - event.tMs <= 1_000; + if (!overlaps && !before) continue; + const distance = Math.abs(startMs - event.tMs); + if (distance >= bestDistance) continue; + best = event; + bestDistance = distance; + } + return best; +} + +function classifyTransition( + diff: ScreenshotDiffResult, + trigger: TransitionTelemetryEvent | undefined, +): string { + if (trigger?.kind === 'scroll') return `${trigger.contentDirection} scroll`; + if (trigger?.kind === 'swipe' || trigger?.kind === 'back-swipe') return 'gesture navigation'; + const cluster = diff.ocr?.movementClusters?.[0]; + if (cluster) { + const dx = maxAbs(cluster.xRange); + const dy = maxAbs(cluster.yRange); + if (dx > 24 && dx > dy * 1.5) return 'horizontal navigation'; + if (dy > 24 && dy > dx * 1.5) return 'vertical movement'; + } + const largestRegion = diff.regions?.[0]; + if (largestRegion?.size === 'large' && largestRegion.shape === 'large-area') { + if (largestRegion.dominantChange === 'brighter') return 'brightening transition'; + if (largestRegion.dominantChange === 'darker') return 'dimming transition'; + return 'screen replacement'; + } + return 'screen update'; +} + +function buildTransitionSummary(diff: ScreenshotDiffResult, classification: string): string { + const details: string[] = [classification]; + const cluster = diff.ocr?.movementClusters?.[0]; + if (cluster) { + details.push( + `text 
moved dx=${formatRange(cluster.xRange)}px dy=${formatRange(cluster.yRange)}px`, + ); + } + const region = diff.regions?.[0]; + if (region) { + details.push(`${region.size} ${region.location} ${region.shape} changed`); + } + return details.join('; '); +} + +function formatTrigger(event: TransitionTelemetryEvent, startMs: number, endMs: number): string { + const relation = + isContinuousGesture(event) && event.tMs >= startMs && event.tMs <= endMs ? 'during' : 'after'; + if (event.kind === 'scroll') return `${relation} ${event.contentDirection} scroll`; + if (event.kind === 'pinch') return `${relation} pinch scale=${event.scale}`; + return `${relation} ${event.kind} x=${Math.round(event.x)} y=${Math.round(event.y)}`; +} + +function isContinuousGesture(event: TransitionTelemetryEvent): boolean { + return ['scroll', 'swipe', 'back-swipe', 'pinch'].includes(event.kind); +} + +function maxAbs(range: { min: number; max: number }): number { + return Math.max(Math.abs(range.min), Math.abs(range.max)); +} + +function formatRange(range: { min: number; max: number }): string { + return range.min === range.max + ? formatSigned(range.min) + : `${formatSigned(range.min)}..${formatSigned(range.max)}`; +} + +function formatSigned(value: number): string { + return value > 0 ? 
`+${value}` : String(value); +} + +function compareNatural(left: string, right: string): number { + return left.localeCompare(right, undefined, { numeric: true, sensitivity: 'base' }); +} + +function roundTwo(value: number): number { + return Math.round(value * 100) / 100; +} diff --git a/src/utils/video-frames.ts b/src/utils/video-frames.ts new file mode 100644 index 00000000..f97a2db6 --- /dev/null +++ b/src/utils/video-frames.ts @@ -0,0 +1,125 @@ +import { promises as fs } from 'node:fs'; +import path from 'node:path'; +import { AppError } from './errors.ts'; +import { runCmd, whichCmd } from './exec.ts'; +import type { FrameSample } from './transition-summary.ts'; + +export type ExtractedVideoFrames = { + frames: FrameSample[]; + durationMs?: number; + sampleFps: number; +}; + +type FfprobePayload = { + format?: { + duration?: string; + }; +}; + +const DEFAULT_SAMPLE_FPS = 6; +const DEFAULT_MAX_FRAMES = 80; +const VIDEO_PROBE_TIMEOUT_MS = 10_000; +const VIDEO_EXTRACT_TIMEOUT_MS = 60_000; + +export async function extractVideoFrames(params: { + videoPath: string; + outputDir: string; + sampleFps?: number; + maxFrames?: number; +}): Promise<ExtractedVideoFrames> { + const [hasFfmpeg, hasFfprobe] = await Promise.all([whichCmd('ffmpeg'), whichCmd('ffprobe')]); + if (!hasFfmpeg || !hasFfprobe) { + throw new AppError('TOOL_MISSING', 'diff video requires ffmpeg and ffprobe in PATH', { + hint: 'Install FFmpeg, then retry diff video.', + }); + } + + await fs.mkdir(params.outputDir, { recursive: true }); + await removeStaleFrames(params.outputDir); + const maxFrames = params.maxFrames ?? DEFAULT_MAX_FRAMES; + const requestedFps = params.sampleFps ?? DEFAULT_SAMPLE_FPS; + const durationMs = await probeVideoDurationMs(params.videoPath); + const sampleFps = + durationMs && durationMs > 0 + ? 
Math.min(requestedFps, maxFrames / (durationMs / 1_000)) + : requestedFps; + const pattern = path.join(params.outputDir, 'frame-%06d.png'); + const result = await runCmd( + 'ffmpeg', + [ + '-hide_banner', + '-loglevel', + 'info', + '-i', + params.videoPath, + '-vf', + `fps=${formatFps(sampleFps)},showinfo`, + '-frames:v', + String(maxFrames), + pattern, + ], + { allowFailure: true, timeoutMs: VIDEO_EXTRACT_TIMEOUT_MS }, + ); + if (result.exitCode !== 0) { + throw new AppError('COMMAND_FAILED', 'ffmpeg failed to extract video frames', { + stderr: result.stderr, + }); + } + + const files = (await fs.readdir(params.outputDir)) + .filter((entry) => /^frame-\d+\.png$/i.test(entry)) + .sort((left, right) => left.localeCompare(right, undefined, { numeric: true })); + const timestamps = parseShowInfoTimestamps(result.stderr); + return { + frames: files.map((entry, index) => ({ + index, + path: path.join(params.outputDir, entry), + timestampMs: timestamps[index] ?? Math.round((index / sampleFps) * 1_000), + })), + ...(durationMs ? { durationMs } : {}), + sampleFps, + }; +} + +async function removeStaleFrames(outputDir: string): Promise<void> { + const entries = await fs.readdir(outputDir); + const stale = entries.filter((entry) => /^frame-\d+\.png$/i.test(entry)); + await Promise.all( + stale.map((entry) => + fs.rm(path.join(outputDir, entry), { + force: true, + }), + ), + ); +} + +async function probeVideoDurationMs(videoPath: string): Promise<number | undefined> { + const result = await runCmd( + 'ffprobe', + ['-v', 'error', '-show_entries', 'format=duration', '-of', 'json', videoPath], + { allowFailure: true, timeoutMs: VIDEO_PROBE_TIMEOUT_MS }, + ); + if (result.exitCode !== 0) return undefined; + try { + const parsed = JSON.parse(result.stdout) as FfprobePayload; + const seconds = parsed.format?.duration ? Number(parsed.format.duration) : Number.NaN; + return Number.isFinite(seconds) && seconds > 0 ? 
Math.round(seconds * 1_000) : undefined; + } catch { + return undefined; + } +} + +function parseShowInfoTimestamps(stderr: string): number[] { + return stderr + .split(/\r?\n/) + .map((line) => /pts_time:([0-9.]+)/.exec(line)?.[1]) + .filter((value): value is string => value !== undefined) + .map((value) => Math.round(Number(value) * 1_000)) + .filter((value) => Number.isFinite(value)); +} + +function formatFps(value: number): string { + return Number.isInteger(value) + ? String(value) + : value.toFixed(3).replace(/0+$/, '').replace(/\.$/, ''); +} diff --git a/website/docs/docs/commands.md b/website/docs/docs/commands.md index 9a0345bb..4ab3859f 100644 --- a/website/docs/docs/commands.md +++ b/website/docs/docs/commands.md @@ -544,6 +544,8 @@ agent-device open --platform macos --surface desktop && agent-device screenshot agent-device diff screenshot --baseline baseline.png --out diff.png agent-device diff screenshot --baseline baseline.png current.png --out diff.png agent-device diff screenshot --baseline baseline.png --out diff.png --overlay-refs +agent-device diff frames ./frames --out transition-summary +agent-device diff video session.mp4 --out transition-summary --telemetry session.gesture-telemetry.json agent-device record start # Start screen recording to auto filename agent-device record start session.mp4 # Start recording to explicit path agent-device record start session.mp4 --fps 30 # Override iOS device runner FPS @@ -556,6 +558,9 @@ agent-device record stop # Stop active recording - If `tesseract` is installed, `diff screenshot` also adds best-effort OCR text deltas, movement clusters, and bbox size-change hints to the text and JSON output. OCR improves descriptions only; it does not change the pixel comparison or the diff PNG. - When OCR is available, `diff screenshot` also reports best-effort non-text visual deltas by masking OCR text boxes out of the diff and clustering remaining residuals. 
These are hints for icons, controls, and separators, not semantic icon recognition. - `diff screenshot --overlay-refs` additionally writes a separate current-screen overlay guide for live captures without using that annotated image for the pixel comparison. If current-screen refs intersect changed regions, the output lists the best ref matches under those regions. Saved-image comparisons do not have live accessibility refs, so `--overlay-refs` is unavailable when a `current.png` path is provided. +- `diff frames` summarizes transitions from a PNG frame directory or explicit PNG frame list. It prints a capped timeline with transition timing, keyframes, changed-region summaries, and optional OCR movement hints. Use `--frame-interval-ms <ms>` when aligning frame sequences with recording telemetry; the default is `100`. +- `diff video` requires `ffmpeg` and `ffprobe` in `PATH`; it samples a recording into frames under the `--out` directory and runs the same transition summarizer. Use `--sample-fps <fps>` and `--max-frames <n>` to tune extraction cost. +- Add `--telemetry <path>` to `diff frames` or `diff video` when you have a recording gesture sidecar. Transition labels can then include anchors such as `after tap` or `during up scroll`. - In `--json` mode, each overlay ref also includes a screenshot-space `center` point for coordinate fallback like `press <x> <y>`. - Burned-in touch overlays are exported only on macOS hosts, because the overlay pipeline depends on Swift + AVFoundation helpers. - On Linux or other non-macOS hosts, `record stop` still succeeds and returns the raw video plus telemetry sidecar, and includes `overlayWarning` when burn-in overlays were skipped.