Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions skills/agent-device/references/verification.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ Open this file when the task needs evidence, regression checks, replay maintenan
- `screenshot`
- `diff snapshot`
- `diff screenshot`
- `diff frames`
- `diff video`
- `record`
- `replay -u`
- `perf`
Expand Down Expand Up @@ -65,6 +67,23 @@ agent-device diff screenshot --baseline ./baseline.png --out /tmp/diff.png --ove
- When OCR is available, `diff screenshot` also reports best-effort non-text visual deltas by masking OCR text boxes out of the pixel diff and clustering the remaining residuals. Treat these as hints for icons, controls, and separators, not semantic icon recognition.
- Add `--overlay-refs` to `diff screenshot` when you also want a separate current-screen overlay guide for a live capture. The raw screenshot is still used for pixel comparison; the overlay guide is only context for non-text controls, icons, and tappable regions. When overlay refs intersect changed regions, the output lists the best current-screen ref matches under the affected region. Saved-image comparisons do not have live accessibility refs, so omit `--overlay-refs` when passing a current image path.

## Transition summaries with diff frames/video

Use `diff frames` or `diff video` when a screenshot pair is too static and you need a compact timeline for a transition, animation, or recorded interaction.

```bash
agent-device diff frames ./frames --out /tmp/settings-transition
agent-device diff frames ./frame-001.png ./frame-002.png ./frame-003.png --out /tmp/settings-transition
agent-device diff video ./recordings/settings.mov --out /tmp/settings-transition --telemetry ./recordings/settings.gesture-telemetry.json
agent-device diff video ./recordings/settings.mov --sample-fps 8 --max-frames 120 --json
```

- `diff frames` accepts a directory of PNG frames or explicit PNG paths. It works without external video tools. Use `--frame-interval-ms <n>` when telemetry timestamps from a recording need to line up with a known frame cadence; otherwise frames are assumed to be spaced `100ms` apart.
- `diff video` requires `ffmpeg` and `ffprobe` in `PATH`; it samples the recording into PNG frames, then runs the same transition summarizer.
- Add `--telemetry <path>` with a recording gesture sidecar when available. The output can then anchor transitions to events such as `after tap` or `during up scroll`.
- The text output stays capped to the top transitions, keyframes, changed-region summaries, and optional OCR movement hints. Use `--json` when you need the structured metrics.
- Install `tesseract` for OCR movement hints on selected transition boundaries. OCR is optional and is not run for every sampled frame.

## Session recording

Use `record` for debugging, documentation, or shareable verification artifacts.
Expand Down
116 changes: 116 additions & 0 deletions src/__tests__/cli-diff.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,26 @@ function solidPngBuffer(
return PNG.sync.write(png);
}

/**
 * Builds an encoded 40x40 PNG: a light (240) opaque background with a dark (30)
 * 16x16 block whose left edge sits at `8 + offset` and whose rows span 12..27.
 *
 * @param offset Horizontal shift, in pixels, applied to the dark block.
 * @returns The PNG bytes produced by `PNG.sync.write`.
 */
function movingBlockPngBuffer(offset: number): Buffer {
  const png = new PNG({ width: 40, height: 40 });

  // Writes one opaque grey pixel starting at the given byte position.
  const paintPixel = (byteIndex: number, shade: number): void => {
    png.data[byteIndex] = shade;
    png.data[byteIndex + 1] = shade;
    png.data[byteIndex + 2] = shade;
    png.data[byteIndex + 3] = 255;
  };

  // Pass 1: flood the whole canvas with the background shade.
  for (let byteIndex = 0; byteIndex < png.data.length; byteIndex += 4) {
    paintPixel(byteIndex, 240);
  }

  // Pass 2: stamp the dark block on top of the background.
  const blockLeft = 8 + offset;
  const blockRight = 24 + offset;
  for (let row = 12; row < 28; row += 1) {
    for (let column = blockLeft; column < blockRight; column += 1) {
      paintPixel((row * png.width + column) * 4, 30);
    }
  }

  return PNG.sync.write(png);
}

async function runCliCapture(
argv: string[],
options: RunCliCaptureOptions = {},
Expand Down Expand Up @@ -436,4 +456,100 @@ describe('cli diff commands', () => {
fs.rmSync(dir, { recursive: true, force: true });
}
});

test('diff frames summarizes a local PNG frame sequence without daemon calls', async () => {
  // Everything lives in one scratch directory that is removed in `finally`.
  const workDir = fs.mkdtempSync(path.join(os.tmpdir(), 'cli-diff-frames-test-'));
  const transitionOut = path.join(workDir, 'out');
  const telemetryFile = path.join(workDir, 'capture.gesture-telemetry.json');

  // Four frames: the block shifts by 6px twice, then the last frame repeats the third.
  const framePaths = [0, 6, 12, 12].map((offset, index) => {
    const framePath = path.join(workDir, `frame-${index}.png`);
    fs.writeFileSync(framePath, movingBlockPngBuffer(offset));
    return framePath;
  });

  // Gesture sidecar with a single tap shortly after the first frame.
  const telemetry = {
    version: 1,
    generatedAt: new Date(0).toISOString(),
    events: [{ kind: 'tap', tMs: 10, x: 20, y: 20 }],
  };
  fs.writeFileSync(telemetryFile, JSON.stringify(telemetry));

  try {
    const argv = [
      'diff',
      'frames',
      ...framePaths,
      '--out',
      transitionOut,
      '--telemetry',
      telemetryFile,
      '--frame-interval-ms',
      '250',
      '--threshold',
      '0',
    ];
    const result = await runCliCapture(argv);

    // The command must complete locally: no exit code recorded and no daemon calls.
    assert.equal(result.code, null);
    assert.equal(result.calls.length, 0);

    // The summary reports one transition anchored to the tap event.
    assert.match(result.stdout, /Frame transition summary: 1 transition/);
    assert.match(result.stdout, /0ms-500ms after tap x=20 y=20/);
    assert.match(result.stdout, /after tap x=20 y=20/);
    assert.match(result.stdout, /keyframes:/);

    // A diff image for the transition is written into the requested output directory.
    const transitionDiffPath = path.join(transitionOut, 'transition-1.diff.png');
    assert.equal(fs.existsSync(transitionDiffPath), true);
  } finally {
    fs.rmSync(workDir, { recursive: true, force: true });
  }
});

test('diff frames rejects screenshot-only overlay refs flag', async () => {
  // `--overlay-refs` belongs to `diff screenshot`; `diff frames` must fail fast on it.
  const argv = ['diff', 'frames', './frame-1.png', './frame-2.png', '--overlay-refs'];
  const { code, calls, stderr } = await runCliCapture(argv);

  assert.equal(code, 1);
  assert.equal(calls.length, 0);
  assert.match(stderr, /diff frames does not support --overlay-refs/);
});

test('diff frames rejects screenshot-only baseline flag', async () => {
  // `--baseline` belongs to `diff screenshot`; `diff frames` must fail fast on it.
  const argv = [
    'diff',
    'frames',
    './frame-1.png',
    './frame-2.png',
    '--baseline',
    './baseline.png',
  ];
  const { code, calls, stderr } = await runCliCapture(argv);

  assert.equal(code, 1);
  assert.equal(calls.length, 0);
  assert.match(stderr, /diff frames does not support --baseline/);
});

test('diff video rejects extra positional paths before probing ffmpeg', async () => {
  // Two video paths are supplied where exactly one is allowed.
  const argv = ['diff', 'video', './one.mp4', './two.mp4'];
  const { code, calls, stderr } = await runCliCapture(argv);

  assert.equal(code, 1);
  assert.equal(calls.length, 0);
  assert.match(stderr, /diff video requires exactly one video path/);
});

test('diff video rejects screenshot-only baseline flag before probing ffmpeg', async () => {
  // `--baseline` belongs to `diff screenshot`; `diff video` must fail fast on it.
  const argv = ['diff', 'video', './session.mp4', '--baseline', './baseline.png'];
  const { code, calls, stderr } = await runCliCapture(argv);

  assert.equal(code, 1);
  assert.equal(calls.length, 0);
  assert.match(stderr, /diff video does not support --baseline/);
});
});
120 changes: 112 additions & 8 deletions src/cli/commands/screenshot.ts
Original file line number Diff line number Diff line change
@@ -1,11 +1,17 @@
import fs from 'node:fs';
import os from 'node:os';
import path from 'node:path';
import { formatScreenshotDiffText, formatSnapshotDiffText } from '../../utils/output.ts';
import {
formatScreenshotDiffText,
formatSnapshotDiffText,
formatTransitionSummaryText,
} from '../../utils/output.ts';
import { AppError } from '../../utils/errors.ts';
import { compareScreenshots, type ScreenshotDiffResult } from '../../utils/screenshot-diff.ts';
import { attachCurrentOverlayMatches } from '../../utils/screenshot-diff-overlay-matches.ts';
import { resolveUserPath } from '../../utils/path-resolution.ts';
import { collectFrameInputs, summarizeFrameTransitions } from '../../utils/transition-summary.ts';
import { extractVideoFrames } from '../../utils/video-frames.ts';
import { buildSelectionOptions, writeCommandOutput } from './shared.ts';
import type { ClientCommandHandler } from './router.ts';

Expand Down Expand Up @@ -43,7 +49,76 @@ export const diffCommand: ClientCommandHandler = async ({ positionals, flags, cl
return true;
}

if (positionals[0] === 'frames') {
rejectUnsupportedDiffFlags(
flags,
['baseline', 'overlayRefs', 'sampleFps', 'maxFrames'],
'diff frames',
);
const outputDir = resolveTransitionOutputDir(flags.out);
const frames = await collectFrameInputs(positionals.slice(1), {
frameIntervalMs: flags.frameIntervalMs,
});
const result = await summarizeFrameTransitions({
frames,
input: {
kind: 'frames',
frameCount: frames.length,
sampledFrameCount: frames.length,
...(flags.telemetry ? { telemetryPath: resolveUserPath(flags.telemetry) } : {}),
},
options: {
threshold: readDiffThreshold(flags.threshold),
outputDir,
...(flags.telemetry ? { telemetryPath: flags.telemetry } : {}),
},
});
writeCommandOutput(flags, result, () => formatTransitionSummaryText(result));
return true;
}

if (positionals[0] === 'video') {
rejectUnsupportedDiffFlags(flags, ['baseline', 'frameIntervalMs', 'overlayRefs'], 'diff video');
const videoRaw = positionals[1];
if (!videoRaw || positionals.length > 2) {
throw new AppError('INVALID_ARGS', 'diff video requires exactly one video path');
}
const videoPath = resolveUserPath(videoRaw);
const outputDir = resolveTransitionOutputDir(flags.out);
const framesDir = path.join(outputDir, 'frames');
const extracted = await extractVideoFrames({
videoPath,
outputDir: framesDir,
sampleFps: flags.sampleFps,
maxFrames: flags.maxFrames,
});
const result = await summarizeFrameTransitions({
frames: extracted.frames,
input: {
kind: 'video',
path: videoPath,
frameCount: extracted.frames.length,
sampledFrameCount: extracted.frames.length,
sampleFps: extracted.sampleFps,
...(extracted.durationMs ? { durationMs: extracted.durationMs } : {}),
...(flags.telemetry ? { telemetryPath: resolveUserPath(flags.telemetry) } : {}),
},
options: {
threshold: readDiffThreshold(flags.threshold),
outputDir,
...(flags.telemetry ? { telemetryPath: flags.telemetry } : {}),
},
});
writeCommandOutput(flags, result, () => formatTransitionSummaryText(result));
return true;
}

if (positionals[0] !== 'screenshot') return false;
rejectUnsupportedDiffFlags(
flags,
['sampleFps', 'maxFrames', 'frameIntervalMs', 'telemetry'],
'diff screenshot',
);

const baselineRaw = flags.baseline;
if (!baselineRaw || typeof baselineRaw !== 'string') {
Expand All @@ -60,13 +135,7 @@ export const diffCommand: ClientCommandHandler = async ({ positionals, flags, cl
);
}

let thresholdNum = 0.1;
if (flags.threshold != null && flags.threshold !== '') {
thresholdNum = Number(flags.threshold);
if (Number.isNaN(thresholdNum) || thresholdNum < 0 || thresholdNum > 1) {
throw new AppError('INVALID_ARGS', '--threshold must be a number between 0 and 1');
}
}
const thresholdNum = readDiffThreshold(flags.threshold);

if (currentRaw) {
if (flags.overlayRefs) {
Expand Down Expand Up @@ -144,3 +213,38 @@ function removeStaleCurrentOverlay(outputPath: string): void {
/**
 * Type guard for errno-style errors.
 *
 * @param error Any caught value.
 * @param code The error code to match (for example `'ENOENT'`).
 * @returns `true` only when `error` is a non-null object exposing a `code`
 *   property strictly equal to `code`.
 */
function isFsError(error: unknown, code: string): error is NodeJS.ErrnoException {
  if (typeof error !== 'object' || error === null) return false;
  if (!('code' in error)) return false;
  return error.code === code;
}

/**
 * Parses the `--threshold` flag value.
 *
 * @param rawThreshold Raw flag value; `null`, `undefined` and `''` mean "not provided".
 * @returns `0.1` when the flag is not provided, otherwise the value coerced with `Number`.
 * @throws AppError (`INVALID_ARGS`) when the coerced value is NaN or lies outside `[0, 1]`.
 */
function readDiffThreshold(rawThreshold: unknown): number {
  const isUnset = rawThreshold === null || rawThreshold === undefined || rawThreshold === '';
  if (isUnset) return 0.1;

  const parsed = Number(rawThreshold);
  // NaN fails both comparisons, so it is rejected together with out-of-range values.
  const withinUnitInterval = parsed >= 0 && parsed <= 1;
  if (!withinUnitInterval) {
    throw new AppError('INVALID_ARGS', '--threshold must be a number between 0 and 1');
  }
  return parsed;
}

/**
 * Picks the directory that receives transition-diff artifacts and makes sure it exists.
 *
 * @param rawOut Raw `--out` flag value. A string is resolved through `resolveUserPath`;
 *   any other value yields a fresh temporary directory under `os.tmpdir()` whose name
 *   starts with `agent-device-transition-diff-`.
 * @returns The chosen directory path, created recursively before returning.
 */
function resolveTransitionOutputDir(rawOut: unknown): string {
  let outputDir: string;
  if (typeof rawOut === 'string') {
    outputDir = resolveUserPath(rawOut);
  } else {
    const tempPrefix = path.join(os.tmpdir(), 'agent-device-transition-diff-');
    outputDir = fs.mkdtempSync(tempPrefix);
  }

  // Recursive mkdir is a no-op for the temp directory and creates missing parents otherwise.
  fs.mkdirSync(outputDir, { recursive: true });
  return outputDir;
}

/**
 * Fails the command when any of the listed flags has been set.
 *
 * @param flags Parsed flag values keyed by camelCase flag name.
 * @param flagKeys Flag keys that the current subcommand does not accept.
 * @param commandLabel Human-readable command name used as the error-message prefix.
 * @throws AppError (`INVALID_ARGS`) listing every offending flag in `--kebab-case`
 *   form, comma-separated, when at least one listed key is not `undefined` in `flags`.
 */
function rejectUnsupportedDiffFlags(
  flags: Record<string, unknown>,
  flagKeys: string[],
  commandLabel: string,
): void {
  const offendingFlags: string[] = [];
  for (const key of flagKeys) {
    // Only `undefined` counts as "not provided"; any other value is treated as set.
    if (flags[key] !== undefined) {
      offendingFlags.push(`--${toKebabCase(key)}`);
    }
  }

  if (offendingFlags.length > 0) {
    throw new AppError(
      'INVALID_ARGS',
      `${commandLabel} does not support ${offendingFlags.join(', ')}`,
    );
  }
}

/**
 * Converts a camelCase identifier to kebab-case by replacing every ASCII
 * uppercase letter with `-` followed by its lowercase form.
 *
 * @param value Identifier to convert, for example `frameIntervalMs`.
 * @returns The converted string, for example `frame-interval-ms`.
 */
function toKebabCase(value: string): string {
  let kebab = '';
  for (const char of value) {
    const isAsciiUpper = char >= 'A' && char <= 'Z';
    kebab += isAsciiUpper ? `-${char.toLowerCase()}` : char;
  }
  return kebab;
}
Loading
Loading