diff --git a/skills/agent-device/references/verification.md b/skills/agent-device/references/verification.md index 860120a4..40500119 100644 --- a/skills/agent-device/references/verification.md +++ b/skills/agent-device/references/verification.md @@ -54,14 +54,16 @@ Use `diff screenshot` when comparing the current rendered screen against a saved ```bash agent-device diff screenshot --baseline ./baseline.png --out /tmp/diff.png +agent-device diff screenshot --baseline ./baseline.png ./current.png --out /tmp/diff.png agent-device diff screenshot --baseline ./baseline.png --out /tmp/diff.png --overlay-refs ``` - Text output includes ranked changed regions with screen-space rectangles, shape, size, density, average color, and luminance. JSON also includes normalized bounds. - The diff PNG uses a light grayscale current-screen context with changed pixels tinted red and changed regions outlined. +- When a current image path is provided, `diff screenshot` compares the two saved files instead of capturing from the live device or requiring an active session. - Install `tesseract` when you want `diff screenshot` to add best-effort OCR text deltas, movement clusters, and bbox size-change hints. OCR improves the text/JSON descriptions only; it does not change the pixel comparison or the diff PNG. - When OCR is available, `diff screenshot` also reports best-effort non-text visual deltas by masking OCR text boxes out of the pixel diff and clustering the remaining residuals. Treat these as hints for icons, controls, and separators, not semantic icon recognition. -- Add `--overlay-refs` to `diff screenshot` when you also want a separate current-screen overlay guide. The raw screenshot is still used for pixel comparison; the overlay guide is only context for non-text controls, icons, and tappable regions. When overlay refs intersect changed regions, the output lists the best current-screen ref matches under the affected region. 
+- Add `--overlay-refs` to `diff screenshot` when you also want a separate current-screen overlay guide for a live capture. The raw screenshot is still used for pixel comparison; the overlay guide is only context for non-text controls, icons, and tappable regions. When overlay refs intersect changed regions, the output lists the best current-screen ref matches under the affected region. Saved-image comparisons do not have live accessibility refs, so omit `--overlay-refs` when passing a current image path. ## Session recording diff --git a/src/__tests__/cli-diff.test.ts b/src/__tests__/cli-diff.test.ts index 5dc33ea9..e76ad6f5 100644 --- a/src/__tests__/cli-diff.test.ts +++ b/src/__tests__/cli-diff.test.ts @@ -283,6 +283,57 @@ describe('cli diff commands', () => { } }); + test('diff screenshot uses supplied current image instead of capturing from daemon', async () => { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'cli-diff-test-')); + const baseline = path.join(dir, 'baseline.png'); + const current = path.join(dir, 'current.png'); + fs.writeFileSync(baseline, solidPngBuffer(10, 10, { r: 0, g: 0, b: 0 })); + fs.writeFileSync(current, solidPngBuffer(10, 10, { r: 255, g: 255, b: 255 })); + + try { + const result = await runCliCapture([ + 'diff', + 'screenshot', + '--baseline', + baseline, + current, + '--threshold', + '0', + ]); + assert.equal(result.code, null); + assert.equal(result.calls.length, 0); + assert.match(result.stdout, /100% pixels differ/); + assert.match(result.stdout, /100 different \/ 100 total pixels/); + assert.equal(result.stderr, ''); + } finally { + fs.rmSync(dir, { recursive: true, force: true }); + } + }); + + test('diff screenshot rejects overlay refs with supplied current image', async () => { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'cli-diff-test-')); + const baseline = path.join(dir, 'baseline.png'); + const current = path.join(dir, 'current.png'); + fs.writeFileSync(baseline, solidPngBuffer(10, 10, { r: 0, g: 0, b: 0 })); 
+ fs.writeFileSync(current, solidPngBuffer(10, 10, { r: 255, g: 255, b: 255 })); + + try { + const result = await runCliCapture([ + 'diff', + 'screenshot', + '--baseline', + baseline, + current, + '--overlay-refs', + ]); + assert.equal(result.code, 1); + assert.equal(result.calls.length, 0); + assert.match(result.stderr, /saved-image comparisons have no live accessibility refs/); + } finally { + fs.rmSync(dir, { recursive: true, force: true }); + } + }); + test('diff screenshot uses os.tmpdir for temporary current capture', async () => { const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'cli-diff-test-')); const baseline = path.join(dir, 'baseline.png'); diff --git a/src/cli/commands/screenshot.ts b/src/cli/commands/screenshot.ts index 49966dab..0f6077f1 100644 --- a/src/cli/commands/screenshot.ts +++ b/src/cli/commands/screenshot.ts @@ -52,6 +52,13 @@ export const diffCommand: ClientCommandHandler = async ({ positionals, flags, cl const baselinePath = resolveUserPath(baselineRaw); const outputPath = typeof flags.out === 'string' ? 
resolveUserPath(flags.out) : undefined; + const currentRaw = positionals[1]; + if (positionals.length > 2) { + throw new AppError( + 'INVALID_ARGS', + 'diff screenshot accepts at most one current screenshot path', + ); + } let thresholdNum = 0.1; if (flags.threshold != null && flags.threshold !== '') { @@ -61,6 +68,21 @@ export const diffCommand: ClientCommandHandler = async ({ positionals, flags, cl } } + if (currentRaw) { + if (flags.overlayRefs) { + throw new AppError( + 'INVALID_ARGS', + 'diff screenshot cannot use --overlay-refs because saved-image comparisons have no live accessibility refs', + ); + } + const result = await compareScreenshots(baselinePath, resolveUserPath(currentRaw), { + threshold: thresholdNum, + outputPath, + }); + writeCommandOutput(flags, result, () => formatScreenshotDiffText(result)); + return true; + } + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'agent-device-diff-current-')); const tmpScreenshotPath = path.join(tmpDir, `current-${Date.now()}.png`); const screenshotResult = await client.capture.screenshot({ path: tmpScreenshotPath }); diff --git a/src/utils/command-schema.ts b/src/utils/command-schema.ts index f74d25b5..e495978f 100644 --- a/src/utils/command-schema.ts +++ b/src/utils/command-schema.ts @@ -993,10 +993,10 @@ const COMMAND_SCHEMAS: Record = { }, diff: { usageOverride: - 'diff snapshot | diff screenshot --baseline [--out ] [--threshold <0-1>] [--overlay-refs]', + 'diff snapshot | diff screenshot --baseline [current.png] [--out ] [--threshold <0-1>] [--overlay-refs]', helpDescription: 'Diff accessibility snapshot or compare screenshots pixel-by-pixel', summary: 'Diff snapshot or screenshot', - positionalArgs: ['kind'], + positionalArgs: ['kind', 'current?'], allowedFlags: [...SNAPSHOT_FLAGS, 'baseline', 'threshold', 'out', 'overlayRefs'], }, 'ensure-simulator': { diff --git a/src/utils/output.ts b/src/utils/output.ts index 56fd7f6a..78e47e58 100644 --- a/src/utils/output.ts +++ b/src/utils/output.ts @@ -218,7 
+218,8 @@ export function formatScreenshotDiffText(data: ScreenshotDiffResult): string { const indicator = useColor ? colorize('✗', 'red') : '✗'; const pctLabel = mismatchPercentage === 0 && differentPixels > 0 ? '<0.01' : String(mismatchPercentage); - lines.push(`${indicator} ${pctLabel}% pixels differ`); + const summary = `${pctLabel}% pixels differ`; + lines.push(`${indicator} ${useColor ? colorize(summary, 'red') : summary}`); } if (diffPath && !match) { @@ -244,13 +245,13 @@ export function formatScreenshotDiffText(data: ScreenshotDiffResult): string { const hints = !match && !dimensionMismatch ? formatScreenshotDiffHints(data) : []; if (hints.length > 0) { - lines.push(' Hints:'); + lines.push(` ${formatMuted('Hints:', useColor)}`); for (const hint of hints) lines.push(` - ${hint}`); } const regions = Array.isArray(data.regions) ? data.regions : []; if (!match && !dimensionMismatch && regions.length > 0) { - lines.push(' Changed regions:'); + lines.push(` ${formatMuted('Changed regions:', useColor)}`); for (const region of regions.slice(0, 5)) { const share = region.shareOfDiffPercentage === 0 && region.differentPixels > 0 @@ -280,11 +281,17 @@ export function formatScreenshotDiffText(data: ScreenshotDiffResult): string { if (!match && !dimensionMismatch && ocrMatches.length > 0) { const shownOcrMatches = ocrMatches.slice(0, 8); lines.push( - ` OCR text deltas (${data.ocr?.provider}; baselineBlocks=${data.ocr?.baselineBlocks} ` + - `currentBlocks=${data.ocr?.currentBlocks}; showing ${shownOcrMatches.length}/${ocrMatches.length}; px):`, + ` ${formatMuted( + `OCR text deltas (${data.ocr?.provider}; baselineBlocks=${data.ocr?.baselineBlocks} ` + + `currentBlocks=${data.ocr?.currentBlocks}; showing ${shownOcrMatches.length}/${ocrMatches.length}; px):`, + useColor, + )}`, ); lines.push( - ' item | text | movePx | sizeDeltaPx | bboxBaseline | bboxCurrent | confidence | issueHint', + ` ${formatMuted( + 'item | text | movePx | sizeDeltaPx | bboxBaseline | bboxCurrent 
| confidence | issueHint', + useColor, + )}`, ); for (const [index, ocrMatch] of shownOcrMatches.entries()) { const delta = ocrMatch.delta; @@ -303,9 +310,14 @@ export function formatScreenshotDiffText(data: ScreenshotDiffResult): string { if (!match && !dimensionMismatch && nonTextDeltas.length > 0) { const shownNonTextDeltas = nonTextDeltas.slice(0, 8); lines.push( - ` Non-text visual deltas (showing ${shownNonTextDeltas.length}/${nonTextDeltas.length}; px):`, + ` ${formatMuted( + `Non-text visual deltas (showing ${shownNonTextDeltas.length}/${nonTextDeltas.length}; px):`, + useColor, + )}`, + ); + lines.push( + ` ${formatMuted('item | region | slot | kind | bboxCurrent | nearestText', useColor)}`, ); - lines.push(' item | region | slot | kind | bboxCurrent | nearestText'); for (const delta of shownNonTextDeltas) { lines.push( ` ${delta.index} | ${delta.regionIndex ? `r${delta.regionIndex}` : '-'} | ` + @@ -437,6 +449,10 @@ function colorize(text: string, format: Parameters[0]): string return styleText(format, text); } +function formatMuted(text: string, useColor: boolean): string { + return useColor ? 
colorize(text, 'dim') : text; +} + function buildSnapshotNotices( data: Record, nodes: SnapshotNode[], diff --git a/website/docs/docs/commands.md b/website/docs/docs/commands.md index 7ca28bb3..8f8cb9eb 100644 --- a/website/docs/docs/commands.md +++ b/website/docs/docs/commands.md @@ -544,6 +544,7 @@ agent-device screenshot textedit.png # App-session window capture on macOS agent-device screenshot --fullscreen # Force full-screen capture on macOS app sessions agent-device open --platform macos --surface desktop && agent-device screenshot desktop.png agent-device diff screenshot --baseline baseline.png --out diff.png +agent-device diff screenshot --baseline baseline.png current.png --out diff.png agent-device diff screenshot --baseline baseline.png --out diff.png --overlay-refs agent-device record start # Start screen recording to auto filename agent-device record start session.mp4 # Start recording to explicit path @@ -553,10 +554,10 @@ agent-device record stop # Stop active recording - Recordings always produce a video artifact. When touch visualization is enabled, they also produce a gesture telemetry sidecar that can be used for post-processing or inspection. - `screenshot --overlay-refs` captures a fresh full snapshot and burns visible `@eN` refs plus their target rectangles into the saved PNG. -- `diff screenshot` compares the current screenshot to `--baseline`, prints ranked changed regions with screen-space rectangles, shape, size, density, average color, and luminance, and writes a diff PNG with a light grayscale current-screen context, red-tinted changed pixels, and outlined changed regions when `--out` is provided. JSON also includes normalized bounds. 
+- `diff screenshot` compares the current live screenshot to `--baseline`, or compares `--baseline` to an optional saved `current.png` path without requiring an active session, then prints ranked changed regions with screen-space rectangles, shape, size, density, average color, and luminance, and writes a diff PNG with a light grayscale current-screen context, red-tinted changed pixels, and outlined changed regions when `--out` is provided. JSON also includes normalized bounds. - If `tesseract` is installed, `diff screenshot` also adds best-effort OCR text deltas, movement clusters, and bbox size-change hints to the text and JSON output. OCR improves descriptions only; it does not change the pixel comparison or the diff PNG. - When OCR is available, `diff screenshot` also reports best-effort non-text visual deltas by masking OCR text boxes out of the diff and clustering remaining residuals. These are hints for icons, controls, and separators, not semantic icon recognition. -- `diff screenshot --overlay-refs` additionally writes a separate current-screen overlay guide without using that annotated image for the pixel comparison. If current-screen refs intersect changed regions, the output lists the best ref matches under those regions. +- `diff screenshot --overlay-refs` additionally writes a separate current-screen overlay guide for live captures without using that annotated image for the pixel comparison. If current-screen refs intersect changed regions, the output lists the best ref matches under those regions. Saved-image comparisons do not have live accessibility refs, so `--overlay-refs` is unavailable when a `current.png` path is provided. - In `--json` mode, each overlay ref also includes a screenshot-space `center` point for coordinate fallback like `press <x> <y>`. - Burned-in touch overlays are exported only on macOS hosts, because the overlay pipeline depends on Swift + AVFoundation helpers. 
- On Linux or other non-macOS hosts, `record stop` still succeeds and returns the raw video plus telemetry sidecar, and includes `overlayWarning` when burn-in overlays were skipped.