diff --git a/src/backend.ts b/src/backend.ts index 25805a972..0ad0502f3 100644 --- a/src/backend.ts +++ b/src/backend.ts @@ -137,6 +137,12 @@ export type BackendTapOptions = RepeatedInput & { button?: ClickButton; }; +export type BackendRefTarget = { + kind: 'ref'; + ref: string; + fallbackLabel?: string; +}; + export type BackendFillOptions = { delayMs?: number; }; @@ -435,12 +441,23 @@ export type AgentDeviceBackend = { point: Point, options?: BackendTapOptions, ): Promise; + tapTarget?( + context: BackendCommandContext, + target: BackendRefTarget, + options?: BackendTapOptions, + ): Promise; fill?( context: BackendCommandContext, point: Point, text: string, options?: BackendFillOptions, ): Promise; + fillTarget?( + context: BackendCommandContext, + target: BackendRefTarget, + text: string, + options?: BackendFillOptions, + ): Promise; typeText?( context: BackendCommandContext, text: string, diff --git a/src/cli/commands/web.ts b/src/cli/commands/web.ts index e07e1ac4f..4f68e40e7 100644 --- a/src/cli/commands/web.ts +++ b/src/cli/commands/web.ts @@ -1,11 +1,14 @@ import { doctorManagedAgentBrowser, setupManagedAgentBrowser, + type AgentBrowserToolStatus, } from '../../platforms/web/agent-browser-tool.ts'; import { AppError } from '../../utils/errors.ts'; import type { CliFlags } from '../../utils/cli-flags.ts'; import { printJson } from '../../utils/output.ts'; +type PublicAgentBrowserToolStatus = Omit; + export async function runWebCommand( positionals: string[], options: { flags: CliFlags; stateDir: string }, @@ -46,7 +49,7 @@ function printWebSetupResult( status: Awaited>, ): void { if (json) { - printJson({ success: true, data: { status } }); + printJson({ success: true, data: { status: toPublicAgentBrowserToolStatus(status) } }); return; } process.stdout.write( @@ -56,8 +59,34 @@ function printWebSetupResult( function printWebResult(json: boolean | undefined, message: string, data: Record) { if (json) { - printJson({ success: true, data }); + printJson({ success: true, data: toPublicWebResult(data) }); return; } process.stdout.write(`${message}\n`); } + +function toPublicAgentBrowserToolStatus( + status: AgentBrowserToolStatus, +): PublicAgentBrowserToolStatus { + const { socketDir: _socketDir, ...publicStatus } = status; + return publicStatus; +} + +function toPublicWebResult(data: Record): Record { + const status = data.status; + if (!isAgentBrowserToolStatus(status)) return data; + return { + ...data, + status: toPublicAgentBrowserToolStatus(status), + }; +} + +function isAgentBrowserToolStatus(value: unknown): value is AgentBrowserToolStatus { + return ( + typeof value === 'object' && + value !== null && + 'socketDir' in value && + 'installDir' in value && + 'binaryPath' in value + ); +} diff --git a/src/commands/interaction/runtime/gestures.ts b/src/commands/interaction/runtime/gestures.ts index 479f541f5..ec7da2da9 100644 --- a/src/commands/interaction/runtime/gestures.ts +++ b/src/commands/interaction/runtime/gestures.ts @@ -133,15 +133,13 @@ export const focusCommand: RuntimeCommand await scrollBackend(toBackendContext(runtime, options), backendTarget, { @@ -381,7 +378,7 @@ async function resolveSwipeFrom( promoteToHittableAncestor: false, }, ); - return { point: target.point, target }; + return { point: requireResolvedPoint(target), target }; } if (!options.direction) { throw new AppError('INVALID_ARGS', 'swipe requires from+to or a direction'); @@ -428,10 +425,17 @@ function buildScrollEdgeTarget(resolved: ResolvedScrollTarget): ScrollEdgeTarget ? {} : { point: resolved.point, - nodeIndex: 'node' in resolved ? resolved.node.index : undefined, + nodeIndex: 'node' in resolved ? resolved.node?.index : undefined, }; } +function requireResolvedPoint(result: { point?: Point }): Point { + if (!result.point) { + throw new AppError('COMMAND_FAILED', 'Interaction target resolved without coordinates'); + } + return result.point; +} + async function captureRuntimeScrollEdgeState( runtime: AgentDeviceRuntime, options: ScrollCommandOptions, diff --git a/src/commands/interaction/runtime/interactions.test.ts b/src/commands/interaction/runtime/interactions.test.ts index 79f10c7d6..5d716aaa0 100644 --- a/src/commands/interaction/runtime/interactions.test.ts +++ b/src/commands/interaction/runtime/interactions.test.ts @@ -27,6 +27,56 @@ test('runtime click taps an explicit point without requiring a snapshot', async assert.deepEqual(result, { kind: 'point', point: { x: 10, y: 20 } }); }); +test('runtime click uses backend ref primitive without resolving snapshot geometry', async () => { + const calls: string[] = []; + const device = createInteractionDevice(selectorSnapshot(), { + platform: 'web', + captureSnapshot: async () => { + throw new Error('native ref click should not capture a snapshot'); + }, + tapTarget: async (_context, target) => { + calls.push(target.ref); + return { ref: target.ref.replace(/^@/, '') }; + }, + }); + + const result = await device.interactions.click(ref('@e2'), { session: 'default' }); + + assert.deepEqual(calls, ['@e2']); + assert.equal(result.kind, 'ref'); + assert.deepEqual(result.target, { kind: 'ref', ref: '@e2' }); + assert.equal(result.point, undefined); + assert.equal(result.node, undefined); + assert.deepEqual(result.backendResult, { ref: 'e2' }); +}); + +test('runtime fill uses backend ref primitive without resolving snapshot geometry', async () => { + const calls: Array<{ ref: string; text: string; delayMs?: number }> = []; + const device = createInteractionDevice(fillableSnapshot(), { + platform: 'web', + captureSnapshot: async () => { + throw new Error('native ref fill should not capture a snapshot'); + }, + fillTarget: async (_context, target, text, options) => { + calls.push({ ref: target.ref, text, delayMs: options?.delayMs }); + return { ref: target.ref.replace(/^@/, ''), text }; + }, + }); + + const result = await device.interactions.fill(ref('@e1'), 'hello', { + session: 'default', + delayMs: 25, + }); + + assert.deepEqual(calls, [{ ref: '@e1', text: 'hello', delayMs: 25 }]); + assert.equal(result.kind, 'ref'); + assert.equal(result.point, undefined); + assert.equal(result.node, undefined); + assert.deepEqual(result.target, { kind: 'ref', ref: '@e1' }); + assert.equal(result.text, 'hello'); + assert.deepEqual(result.backendResult, { ref: 'e1', text: 'hello' }); +}); + test('runtime interactions pass runtime signal to backend primitives', async () => { const controller = new AbortController(); let signal: AbortSignal | undefined; @@ -1063,7 +1113,9 @@ function createInteractionDevice( AgentDeviceBackend, | 'captureSnapshot' | 'tap' + | 'tapTarget' | 'fill' + | 'fillTarget' | 'typeText' | 'focus' | 'longPress' @@ -1082,7 +1134,13 @@ function createInteractionDevice( captureSnapshot: async (...args) => overrides.captureSnapshot ? await overrides.captureSnapshot(...args) : { snapshot }, tap: async (...args) => await overrides.tap?.(...args), + tapTarget: overrides.tapTarget + ? async (...args) => await overrides.tapTarget?.(...args) + : undefined, fill: async (...args) => await overrides.fill?.(...args), + fillTarget: overrides.fillTarget + ? async (...args) => await overrides.fillTarget?.(...args) + : undefined, typeText: async (...args) => await overrides.typeText?.(...args), focus: overrides.focus ? async (...args) => await overrides.focus?.(...args) : undefined, longPress: overrides.longPress diff --git a/src/commands/interaction/runtime/interactions.ts b/src/commands/interaction/runtime/interactions.ts index 9fdb4171e..823a04163 100644 --- a/src/commands/interaction/runtime/interactions.ts +++ b/src/commands/interaction/runtime/interactions.ts @@ -2,6 +2,7 @@ import { AppError } from '../../../utils/errors.ts'; import type { ClickButton } from '../../../core/click-button.ts'; import type { AgentDeviceRuntime, CommandContext } from '../../../runtime-contract.ts'; import { isFillableType } from '../../../utils/snapshot-processing.ts'; +import type { Point } from '../../../utils/snapshot.ts'; import { requireIntInRange } from '../../../utils/validation.ts'; import { successText } from '../../../utils/success-text.ts'; import { findMistargetedTypeRefToken } from '../../../utils/type-target-warning.ts'; @@ -85,6 +86,9 @@ export const fillCommand: RuntimeCommand options, ): Promise => { if (!options.text) throw new AppError('INVALID_ARGS', 'fill requires text'); + const nativeRefFill = await maybeFillRefTarget(runtime, options); + if (nativeRefFill) return nativeRefFill; + const resolved = await resolveInteractionTarget(runtime, options, { action: 'fill', requireInteractive: true, @@ -93,14 +97,15 @@ export const fillCommand: RuntimeCommand if (!runtime.backend.fill) { throw new AppError('UNSUPPORTED_OPERATION', 'fill is not supported by this backend'); } + const point = requireResolvedPoint(resolved); const backendResult = await runtime.backend.fill( toBackendContext(runtime, options), - resolved.point, + point, options.text, { delayMs: options.delayMs }, ); const formattedBackendResult = toBackendResult(backendResult); - const nodeType = 'node' in resolved ? (resolved.node.type ?? '') : ''; + const nodeType = 'node' in resolved ? (resolved.node?.type ?? '') : ''; const warning = nodeType && !isFillableType(nodeType, runtime.backend.platform) ? `fill target ${formatTargetForWarning(resolved)} resolved to "${nodeType}", attempting fill anyway.` @@ -151,6 +156,9 @@ async function tapCommand( options: PressCommandOptions, action: 'click' | 'press', ): Promise { + const nativeRefTap = await maybeTapRefTarget(runtime, options, action); + if (nativeRefTap) return nativeRefTap; + const resolved = await resolveInteractionTarget(runtime, options, { action, requireInteractive: true, @@ -159,25 +167,86 @@ async function tapCommand( if (!runtime.backend.tap) { throw new AppError('UNSUPPORTED_OPERATION', 'tap is not supported by this backend'); } - const backendResult = await runtime.backend.tap( + const point = requireResolvedPoint(resolved); + const backendResult = await runtime.backend.tap(toBackendContext(runtime, options), point, { + button: options.button, + count: options.count, + intervalMs: options.intervalMs, + holdMs: options.holdMs, + jitterPx: options.jitterPx, + doubleTap: options.doubleTap, + }); + const formattedBackendResult = toBackendResult(backendResult); + return { + ...resolved, + ...(formattedBackendResult ? { backendResult: formattedBackendResult } : {}), + }; +} + +function requireResolvedPoint(result: { point?: Point }): Point { + if (!result.point) { + throw new AppError('COMMAND_FAILED', 'Interaction target resolved without coordinates'); + } + return result.point; +} + +async function maybeTapRefTarget( + runtime: AgentDeviceRuntime, + options: PressCommandOptions, + action: 'click' | 'press', +): Promise { + if (action !== 'click' || options.target.kind !== 'ref' || !runtime.backend.tapTarget) { + return null; + } + if (hasNonDefaultTapOptions(options)) return null; + const backendResult = await runtime.backend.tapTarget(toBackendContext(runtime, options), { + kind: 'ref', + ref: options.target.ref, + ...(options.target.fallbackLabel ? { fallbackLabel: options.target.fallbackLabel } : {}), + }); + const formattedBackendResult = toBackendResult(backendResult); + return { + kind: 'ref', + target: { kind: 'ref', ref: options.target.ref }, + ...(formattedBackendResult ? { backendResult: formattedBackendResult } : {}), + }; +} + +async function maybeFillRefTarget( + runtime: AgentDeviceRuntime, + options: FillCommandOptions, +): Promise { + if (options.target.kind !== 'ref' || !runtime.backend.fillTarget) return null; + const backendResult = await runtime.backend.fillTarget( toBackendContext(runtime, options), - resolved.point, { - button: options.button, - count: options.count, - intervalMs: options.intervalMs, - holdMs: options.holdMs, - jitterPx: options.jitterPx, - doubleTap: options.doubleTap, + kind: 'ref', + ref: options.target.ref, + ...(options.target.fallbackLabel ? { fallbackLabel: options.target.fallbackLabel } : {}), }, + options.text, + { delayMs: options.delayMs }, ); const formattedBackendResult = toBackendResult(backendResult); return { - ...resolved, + kind: 'ref', + target: { kind: 'ref', ref: options.target.ref }, + text: options.text, ...(formattedBackendResult ? { backendResult: formattedBackendResult } : {}), }; } +function hasNonDefaultTapOptions(options: PressCommandOptions): boolean { + return Boolean( + options.count !== undefined || + options.intervalMs !== undefined || + options.holdMs !== undefined || + options.jitterPx !== undefined || + options.doubleTap !== undefined || + (options.button !== undefined && options.button !== 'primary'), + ); +} + function formatTargetForWarning(result: { kind: FillCommandResult['kind']; target?: ResolvedTarget; diff --git a/src/contracts/interaction.ts b/src/contracts/interaction.ts index e68b70d00..fc83f9017 100644 --- a/src/contracts/interaction.ts +++ b/src/contracts/interaction.ts @@ -38,10 +38,10 @@ export type ResolvedInteractionTarget = } | { kind: 'ref'; - point: Point; + point?: Point; target: Extract; - node: SnapshotNode; - selectorChain: string[]; + node?: SnapshotNode; + selectorChain?: string[]; refLabel?: string; } | { diff --git a/src/daemon/handlers/interaction-common.ts b/src/daemon/handlers/interaction-common.ts index 62d8bb869..db4b4654d 100644 --- a/src/daemon/handlers/interaction-common.ts +++ b/src/daemon/handlers/interaction-common.ts @@ -32,8 +32,8 @@ export type InteractionHandlerParams = { export function buildTouchVisualizationResult(params: { data: Record | undefined; - fallbackX: number; - fallbackY: number; + fallbackX?: number; + fallbackY?: number; referenceFrame?: GestureReferenceFrame; extra?: Record; }): Record { @@ -42,8 +42,7 @@ export function buildTouchVisualizationResult(params: { buildTouchMessage(extra, fallbackX, fallbackY) ?? (typeof data?.message === 'string' ? data.message : undefined); return { - x: fallbackX, - y: fallbackY, + ...(fallbackX === undefined || fallbackY === undefined ? {} : { x: fallbackX, y: fallbackY }), ...(referenceFrame ?? {}), ...(extra ?? {}), ...(data ?? {}), @@ -53,23 +52,24 @@ export function buildTouchVisualizationResult(params: { function buildTouchMessage( extra: Record | undefined, - x: number, - y: number, + x: number | undefined, + y: number | undefined, ): string | undefined { const ref = typeof extra?.ref === 'string' ? extra.ref : undefined; const button = typeof extra?.button === 'string' ? extra.button : undefined; const gesture = typeof extra?.gesture === 'string' ? extra.gesture : undefined; + const pointSuffix = x === undefined || y === undefined ? '' : ` (${x}, ${y})`; if (typeof extra?.text === 'string') { return `Filled ${Array.from(extra.text).length} chars`; } if (ref) { if (gesture === 'longpress') { - return `Long pressed @${ref} (${x}, ${y})`; + return `Long pressed @${ref}${pointSuffix}`; } if (button && button !== 'primary') { - return `Clicked ${button} @${ref} (${x}, ${y})`; + return `Clicked ${button} @${ref}${pointSuffix}`; } - return `Tapped @${ref} (${x}, ${y})`; + return `Tapped @${ref}${pointSuffix}`; } return undefined; } diff --git a/src/daemon/handlers/interaction-runtime.ts b/src/daemon/handlers/interaction-runtime.ts index 454353f12..b6dbed77f 100644 --- a/src/daemon/handlers/interaction-runtime.ts +++ b/src/daemon/handlers/interaction-runtime.ts @@ -12,6 +12,8 @@ import type { InteractionHandlerParams } from './interaction-common.ts'; import type { CaptureSnapshotForSession } from './interaction-snapshot.ts'; import { createDaemonRuntimePolicy } from '../runtime-policy.ts'; import { createDaemonRuntimeSessionStore } from '../runtime-session.ts'; +import { resolveWebProvider, type WebProvider } from '../../platforms/web/provider.ts'; +import { stripAtPrefix } from './interaction-touch-targets.ts'; export function createInteractionRuntime( params: InteractionHandlerParams & { @@ -42,6 +44,7 @@ function createInteractionBackend( }, ): AgentDeviceBackend { const { req, session } = params; + const webProvider = resolveNativeWebInteractionProvider(session); return { platform: session.device.platform, captureSnapshot: async (_context, options): Promise => ({ @@ -66,6 +69,12 @@ function createInteractionBackend( params.contextFromFlags(req.flags, session.appBundleId, session.trace?.outPath), ), ), + tapTarget: webProvider?.clickRef + ? async (_context, target): Promise => { + await webProvider.clickRef?.(target.ref); + return { ref: stripAtPrefix(target.ref) }; + } + : undefined, fill: async (_context, point, text): Promise => toBackendActionResult( await dispatchCommand( @@ -76,6 +85,16 @@ function createInteractionBackend( params.contextFromFlags(req.flags, session.appBundleId, session.trace?.outPath), ), ), + fillTarget: webProvider?.fillRef + ? async (_context, target, text, options): Promise => { + await webProvider.fillRef?.(target.ref, text, options); + return { + ref: stripAtPrefix(target.ref), + text, + delayMs: options?.delayMs ?? 0, + }; + } + : undefined, longPress: async (_context, point, options): Promise => toBackendActionResult( await dispatchCommand( @@ -103,6 +122,12 @@ function createInteractionBackend( }; } +function resolveNativeWebInteractionProvider(session: SessionState): WebProvider | undefined { + if (session.device.platform !== 'web') return undefined; + const provider = resolveWebProvider(); + return provider.clickRef || provider.fillRef ? provider : undefined; +} + function toBackendActionResult(data: unknown): BackendActionResult { return data && typeof data === 'object' ? (data as Record) : undefined; } diff --git a/src/daemon/handlers/interaction-touch.ts b/src/daemon/handlers/interaction-touch.ts index 7982210fe..0cec2d7c7 100644 --- a/src/daemon/handlers/interaction-touch.ts +++ b/src/daemon/handlers/interaction-touch.ts @@ -235,8 +235,8 @@ async function buildTargetedTouchResponseData(params: { : readSnapshotNodesReferenceFrame(session.snapshot?.nodes ?? []); return buildTouchVisualizationResult({ data: result.backendResult, - fallbackX: result.point.x, - fallbackY: result.point.y, + fallbackX: result.point?.x, + fallbackY: result.point?.y, referenceFrame, extra: { ...interactionResultExtra(result), @@ -452,8 +452,8 @@ async function dispatchFillViaRuntime( : readSnapshotNodesReferenceFrame(session.snapshot?.nodes ?? []); const recordedResult = buildTouchVisualizationResult({ data: result.backendResult, - fallbackX: result.point.x, - fallbackY: result.point.y, + fallbackX: result.point?.x, + fallbackY: result.point?.y, referenceFrame, extra: { ...interactionResultExtra(result), @@ -467,8 +467,7 @@ async function dispatchFillViaRuntime( ? { ...(result.backendResult ?? { ref: stripAtPrefix(result.target?.kind === 'ref' ? result.target.ref : undefined), - x: result.point.x, - y: result.point.y, + ...(result.point ? { x: result.point.x, y: result.point.y } : {}), }), } : recordedResult; @@ -610,7 +609,9 @@ function retryPositionalsForRuntimeResult( command: string, result: PressCommandResult | FillCommandResult | LongPressCommandResult, ): string[] | undefined { + if (result.kind === 'ref' && !result.node) return undefined; if (command === 'click' || command === 'press') { + if (!result.point) return undefined; return pointPositionals(result.point); } return undefined; diff --git a/src/platforms/web/agent-browser-provider.test.ts b/src/platforms/web/agent-browser-provider.test.ts index c8904836d..1584f9475 100644 --- a/src/platforms/web/agent-browser-provider.test.ts +++ b/src/platforms/web/agent-browser-provider.test.ts @@ -28,7 +28,9 @@ test('agent-browser provider maps supported operations to session-scoped JSON co await provider.open('https://example.test'); await provider.screenshot('/tmp/page.png', { fullscreen: true }); await provider.click(10.4, 20.6); + await provider.clickRef?.('@e3'); await provider.fill(11, 22, 'Ada'); + await provider.fillRef?.('@e2', 'Grace'); await provider.typeText('hello'); await provider.scroll('down', { pixels: 400 }); await provider.close(); @@ -42,11 +44,13 @@ test('agent-browser provider maps supported operations to session-scoped JSON co ['mouse', 'move', '10', '21', '--json', '--session', 'web-session'], ['mouse', 'down', '--json', '--session', 'web-session'], ['mouse', 'up', '--json', '--session', 'web-session'], + ['click', '@e3', '--json', '--session', 'web-session'], ['mouse', 'move', '11', '22', '--json', '--session', 'web-session'], ['mouse', 'down', '--json', '--session', 'web-session'], ['mouse', 'up', '--json', '--session', 'web-session'], ['press', expectedSelectAllShortcut(), '--json', '--session', 'web-session'], ['keyboard', 'type', 'Ada', '--json', '--session', 'web-session'], + ['fill', '@e2', 'Grace', '--json', '--session', 'web-session'], ['keyboard', 'type', 'hello', '--json', '--session', 'web-session'], ['scroll', 'down', '400', '--json', '--session', 'web-session'], ['close', '--json', '--session', 'web-session'], diff --git a/src/platforms/web/agent-browser-provider.ts b/src/platforms/web/agent-browser-provider.ts index 93f2a108e..5efe34cd7 100644 --- a/src/platforms/web/agent-browser-provider.ts +++ b/src/platforms/web/agent-browser-provider.ts @@ -45,6 +45,9 @@ export function createAgentBrowserWebProvider( async click(x, y) { await clickCoordinates(runJson, x, y); }, + async clickRef(ref) { + await runJson(['click', browserRefSelector(ref)]); + }, async fill(x, y, text) { // The shared web interactor is coordinate-first; bridge that to low-level // browser input until a future ref-targeted web path can call native fill. @@ -52,6 +55,9 @@ export function createAgentBrowserWebProvider( await runJson(['press', selectAllShortcut()]); await runJson(['keyboard', 'type', text]); }, + async fillRef(ref, text) { + await runJson(['fill', browserRefSelector(ref), text]); + }, async typeText(text) { await runJson(['keyboard', 'type', text]); }, @@ -258,7 +264,7 @@ function toErrorCode(value: unknown): 'COMMAND_FAILED' | (string & {}) { } function browserRefSelector(ref: string): string { - return `@${ref}`; + return ref.startsWith('@') ? ref : `@${ref}`; } function selectAllShortcut(): string { diff --git a/src/platforms/web/provider.ts b/src/platforms/web/provider.ts index 71d0cbfed..07077a0be 100644 --- a/src/platforms/web/provider.ts +++ b/src/platforms/web/provider.ts @@ -35,7 +35,9 @@ export type WebProvider = { snapshot(options?: WebSnapshotOptions): Promise; screenshot(outPath: string, options?: WebScreenshotOptions): Promise; click(x: number, y: number): Promise; + clickRef?(ref: string): Promise; fill(x: number, y: number, text: string, options?: { delayMs?: number }): Promise; + fillRef?(ref: string, text: string, options?: { delayMs?: number }): Promise; typeText(text: string, options?: { delayMs?: number }): Promise; scroll(direction: ScrollDirection, options?: { amount?: number; pixels?: number }): Promise; readText?(x: number, y: number): Promise; diff --git a/test/integration/provider-scenarios/web-desktop.test.ts b/test/integration/provider-scenarios/web-desktop.test.ts index 47e5472c2..5099c130c 100644 --- a/test/integration/provider-scenarios/web-desktop.test.ts +++ b/test/integration/provider-scenarios/web-desktop.test.ts @@ -76,7 +76,13 @@ test('Provider-backed integration web desktop flow uses semantic web provider ca name: 'click submit ref', command: 'click', positionals: ['@e4'], - expectData: { x: 84, y: 166 }, + expectData: { ref: 'e4' }, + assert: (response) => { + const data = response.json?.result?.data; + assert.equal(data?.x, undefined); + assert.equal(data?.y, undefined); + assert.equal(data?.message, 'Tapped @e4'); + }, }, { name: 'fill email ref', @@ -116,18 +122,24 @@ test('Provider-backed integration web desktop flow uses semantic web provider ca const actions = daemon.session()?.actions ?? []; assert.ok( actions.some( - (action) => action.command === 'click' && action.positionals.join(' ') === '@e4', + (action) => + action.command === 'click' && + action.positionals.join(' ') === '@e4' && + action.result?.x === undefined && + action.result?.y === undefined, ), - 'Expected ref click action to be recorded on the session', + 'Expected ref click action to be recorded on the session without fabricated coordinates', ); assert.ok( actions.some( (action) => action.command === 'fill' && action.positionals.join(' ') === '@e3 qa@example.test' && - action.flags.delayMs === 1, + action.flags.delayMs === 1 && + action.result?.x === undefined && + action.result?.y === undefined, ), - 'Expected ref fill action to be recorded on the session', + 'Expected ref fill action to be recorded on the session without fabricated coordinates', ); assert.ok( actions.some( @@ -141,8 +153,8 @@ test('Provider-backed integration web desktop flow uses semantic web provider ca assertFlatToolCall(semanticCalls, ['web', 'open', WEB_URL, '']); assertFlatToolCall(semanticCalls, ['web', 'snapshot', 'true', '']); - assertFlatToolCall(semanticCalls, ['web', 'click', '84', '166']); - assertFlatToolCall(semanticCalls, ['web', 'fill', '144', '114', 'qa@example.test', '1']); + assertFlatToolCall(semanticCalls, ['web', 'clickRef', '@e4']); + assertFlatToolCall(semanticCalls, ['web', 'fillRef', '@e3', 'qa@example.test', '1']); assertFlatToolCall(semanticCalls, ['web', 'type', ' ok', '0']); assertFlatToolCall(semanticCalls, ['web', 'scroll', 'down', '', '240']); assertFlatToolCall(semanticCalls, [ diff --git a/test/integration/provider-scenarios/web-world.ts b/test/integration/provider-scenarios/web-world.ts index 82c9b500c..d587c3c2d 100644 --- a/test/integration/provider-scenarios/web-world.ts +++ b/test/integration/provider-scenarios/web-world.ts @@ -66,6 +66,12 @@ export async function createWebDesktopWorld(): Promise { state.statusText = 'Submitted'; } }, + clickRef: async (ref) => { + semanticCalls.push(['web', 'clickRef', ref]); + if (ref === '@e4') { + state.statusText = 'Submitted'; + } + }, fill: async (x, y, text, options) => { semanticCalls.push([ 'web', @@ -79,6 +85,12 @@ export async function createWebDesktopWorld(): Promise { state.inputValue = text; } }, + fillRef: async (ref, text, options) => { + semanticCalls.push(['web', 'fillRef', ref, text, String(options?.delayMs ?? 0)]); + if (ref === '@e3') { + state.inputValue = text; + } + }, typeText: async (text, options) => { semanticCalls.push(['web', 'type', text, String(options?.delayMs ?? 0)]); state.inputValue += text; diff --git a/test/integration/smoke-web-platform.test.ts b/test/integration/smoke-web-platform.test.ts index 9499c618d..d339ac449 100644 --- a/test/integration/smoke-web-platform.test.ts +++ b/test/integration/smoke-web-platform.test.ts @@ -1,5 +1,5 @@ import assert from 'node:assert/strict'; -import { mkdirSync, rmSync, writeFileSync } from 'node:fs'; +import { mkdirSync, writeFileSync } from 'node:fs'; import { createServer, type Server } from 'node:http'; import path from 'node:path'; import test from 'node:test'; @@ -25,7 +25,6 @@ type WebSmokeContext = { lastSnapshot?: any; screenshotPath: string; server: Server; - socketDir: string; stepHistory: StepRecord[]; url: string; }; @@ -65,7 +64,6 @@ async function createWebSmokeContext(): Promise { const stateDir = path.join(artifactDir, 'agent-device-state'); const agentBrowserConfigPath = path.join(artifactDir, 'agent-browser.json'); const session = `ws-${process.pid.toString(36)}-${(Date.now() % 1_679_616).toString(36)}`; - const socketDir = path.join('/tmp', `adw-${process.pid.toString(36)}`); const fixture = await startFixtureServer(); const env = { ...process.env, @@ -73,11 +71,9 @@ async function createWebSmokeContext(): Promise { AGENT_BROWSER_CONFIG: agentBrowserConfigPath, AGENT_BROWSER_HEADED: 'false', AGENT_BROWSER_IDLE_TIMEOUT_MS: '30000', - AGENT_BROWSER_SOCKET_DIR: socketDir, }; mkdirSync(stateDir, { recursive: true }); - mkdirSync(socketDir, { recursive: true }); writeFileSync(agentBrowserConfigPath, JSON.stringify({ headed: false }, null, 2)); return { @@ -86,7 +82,6 @@ async function createWebSmokeContext(): Promise { env, screenshotPath: path.join(artifactDir, 'web-smoke.png'), server: fixture.server, - socketDir, stepHistory: [], url: fixture.url, }; @@ -226,11 +221,6 @@ async function cleanupWebSmoke(context: WebSmokeContext, opened: boolean): Promi } catch (error) { errors.push(error); } - try { - rmSync(context.socketDir, { recursive: true, force: true }); - } catch (error) { - errors.push(error); - } if (errors.length === 1) { throw errors[0]; }