Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions src/backend.ts
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,12 @@ export type BackendTapOptions = RepeatedInput & {
button?: ClickButton;
};

/**
 * Identifies an interaction target by an element reference string instead of
 * by a screen point. Consumed by the ref-based backend primitives
 * (`tapTarget`, `fillTarget`).
 */
export type BackendRefTarget = {
// Discriminator tag for this target shape.
kind: 'ref';
// Element reference string, passed to the backend unchanged.
ref: string;
// Optional label sent alongside the ref.
// NOTE(review): presumably lets the backend locate the element when the ref
// itself cannot be used — confirm against the backend implementations.
fallbackLabel?: string;
};

/** Optional settings accepted by the backend `fill` and `fillTarget` primitives. */
export type BackendFillOptions = {
// Delay in milliseconds.
// NOTE(review): presumably the pause between typed characters — confirm with the backend.
delayMs?: number;
};
Expand Down Expand Up @@ -435,12 +441,23 @@ export type AgentDeviceBackend = {
point: Point,
options?: BackendTapOptions,
): Promise<BackendActionResult>;
tapTarget?(
context: BackendCommandContext,
target: BackendRefTarget,
options?: BackendTapOptions,
): Promise<BackendActionResult>;
fill?(
context: BackendCommandContext,
point: Point,
text: string,
options?: BackendFillOptions,
): Promise<BackendActionResult>;
fillTarget?(
context: BackendCommandContext,
target: BackendRefTarget,
text: string,
options?: BackendFillOptions,
): Promise<BackendActionResult>;
typeText?(
context: BackendCommandContext,
text: string,
Expand Down
33 changes: 31 additions & 2 deletions src/cli/commands/web.ts
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
import {
doctorManagedAgentBrowser,
setupManagedAgentBrowser,
type AgentBrowserToolStatus,
} from '../../platforms/web/agent-browser-tool.ts';
import { AppError } from '../../utils/errors.ts';
import type { CliFlags } from '../../utils/cli-flags.ts';
import { printJson } from '../../utils/output.ts';

/** `AgentBrowserToolStatus` with the `socketDir` field removed; the shape emitted in JSON output. */
type PublicAgentBrowserToolStatus = Omit<AgentBrowserToolStatus, 'socketDir'>;

export async function runWebCommand(
positionals: string[],
options: { flags: CliFlags; stateDir: string },
Expand Down Expand Up @@ -46,7 +49,7 @@ function printWebSetupResult(
status: Awaited<ReturnType<typeof setupManagedAgentBrowser>>,
): void {
if (json) {
printJson({ success: true, data: { status } });
printJson({ success: true, data: { status: toPublicAgentBrowserToolStatus(status) } });
return;
}
process.stdout.write(
Expand All @@ -56,8 +59,34 @@ function printWebSetupResult(

/**
 * Reports the outcome of a web subcommand.
 *
 * In JSON mode exactly one `{ success: true, data }` envelope is printed, and
 * `data` is first passed through `toPublicWebResult` so that an embedded tool
 * status is emitted without its `socketDir`. In text mode only `message`
 * (followed by a newline) is written to stdout.
 *
 * @param json - Truthy to emit JSON instead of the human-readable message.
 * @param message - Line written to stdout in text mode.
 * @param data - Payload placed under `data` in the JSON envelope.
 */
function printWebResult(json: boolean | undefined, message: string, data: Record<string, unknown>) {
  if (json) {
    // Emit only the sanitized payload: printing the raw `data` as well would
    // duplicate the envelope and expose the field the sanitizer strips.
    printJson({ success: true, data: toPublicWebResult(data) });
    return;
  }
  process.stdout.write(`${message}\n`);
}

/**
 * Builds the externally visible form of a tool status by leaving out
 * `socketDir`. The argument is not modified; a shallow copy of the remaining
 * fields is returned.
 */
function toPublicAgentBrowserToolStatus(
  status: AgentBrowserToolStatus,
): PublicAgentBrowserToolStatus {
  // Pull the private field aside and keep everything else.
  const { socketDir: _omitted, ...visibleFields } = status;
  return visibleFields;
}

/**
 * Sanitizes a web command payload for output.
 *
 * When `data.status` looks like an agent-browser tool status, a copy of `data`
 * is returned whose `status` has been reduced to its public form. Any other
 * payload is returned as-is (same object reference).
 */
function toPublicWebResult(data: Record<string, unknown>): Record<string, unknown> {
  const { status } = data;
  return isAgentBrowserToolStatus(status)
    ? { ...data, status: toPublicAgentBrowserToolStatus(status) }
    : data;
}

/**
 * Structural type guard for `AgentBrowserToolStatus`.
 *
 * Accepts any non-null object on which `socketDir`, `installDir` and
 * `binaryPath` are all present according to the `in` operator. The values of
 * those properties are not inspected.
 */
function isAgentBrowserToolStatus(value: unknown): value is AgentBrowserToolStatus {
  if (typeof value !== 'object' || value === null) return false;
  const requiredKeys = ['socketDir', 'installDir', 'binaryPath'];
  return requiredKeys.every((key) => key in value);
}
32 changes: 18 additions & 14 deletions src/commands/interaction/runtime/gestures.ts
Original file line number Diff line number Diff line change
Expand Up @@ -133,15 +133,13 @@ export const focusCommand: RuntimeCommand<FocusCommandOptions, FocusCommandResul
if (!runtime.backend.focus) {
throw new AppError('UNSUPPORTED_OPERATION', 'focus is not supported by this backend');
}
const backendResult = await runtime.backend.focus(
toBackendContext(runtime, options),
resolved.point,
);
const point = requireResolvedPoint(resolved);
const backendResult = await runtime.backend.focus(toBackendContext(runtime, options), point);
const formattedBackendResult = toBackendResult(backendResult);
return {
...resolved,
...(formattedBackendResult ? { backendResult: formattedBackendResult } : {}),
...successText(`Focused (${resolved.point.x}, ${resolved.point.y})`),
...successText(`Focused (${point.x}, ${point.y})`),
};
};

Expand All @@ -161,17 +159,16 @@ export const longPressCommand: RuntimeCommand<
options.durationMs === undefined
? undefined
: requireIntInRange(options.durationMs, 'durationMs', 0, 120_000);
const backendResult = await runtime.backend.longPress(
toBackendContext(runtime, options),
resolved.point,
{ durationMs },
);
const point = requireResolvedPoint(resolved);
const backendResult = await runtime.backend.longPress(toBackendContext(runtime, options), point, {
durationMs,
});
const formattedBackendResult = toBackendResult(backendResult);
return {
...resolved,
...(durationMs !== undefined ? { durationMs } : {}),
...(formattedBackendResult ? { backendResult: formattedBackendResult } : {}),
...successText(`Long pressed (${resolved.point.x}, ${resolved.point.y})`),
...successText(`Long pressed (${point.x}, ${point.y})`),
};
};

Expand All @@ -193,7 +190,7 @@ export const scrollCommand: RuntimeCommand<ScrollCommandOptions, ScrollCommandRe
const backendTarget =
resolved.kind === 'viewport'
? { kind: 'viewport' as const }
: { kind: 'point' as const, point: resolved.point };
: { kind: 'point' as const, point: requireResolvedPoint(resolved) };
const scrollBackend = runtime.backend.scroll;
const runScroll = async () =>
await scrollBackend(toBackendContext(runtime, options), backendTarget, {
Expand Down Expand Up @@ -381,7 +378,7 @@ async function resolveSwipeFrom(
promoteToHittableAncestor: false,
},
);
return { point: target.point, target };
return { point: requireResolvedPoint(target), target };
}
if (!options.direction) {
throw new AppError('INVALID_ARGS', 'swipe requires from+to or a direction');
Expand Down Expand Up @@ -428,10 +425,17 @@ function buildScrollEdgeTarget(resolved: ResolvedScrollTarget): ScrollEdgeTarget
? {}
: {
point: resolved.point,
nodeIndex: 'node' in resolved ? resolved.node.index : undefined,
nodeIndex: 'node' in resolved ? resolved.node?.index : undefined,
};
}

/**
 * Extracts the coordinates from a resolved interaction target.
 *
 * @returns the target's `point`.
 * @throws AppError with code `COMMAND_FAILED` when `point` is absent.
 */
function requireResolvedPoint(result: { point?: Point }): Point {
  const { point } = result;
  if (point) return point;
  throw new AppError('COMMAND_FAILED', 'Interaction target resolved without coordinates');
}

async function captureRuntimeScrollEdgeState(
runtime: AgentDeviceRuntime,
options: ScrollCommandOptions,
Expand Down
58 changes: 58 additions & 0 deletions src/commands/interaction/runtime/interactions.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,56 @@ test('runtime click taps an explicit point without requiring a snapshot', async
assert.deepEqual(result, { kind: 'point', point: { x: 10, y: 20 } });
});

// A ref click on a backend that implements `tapTarget` must go straight to that
// primitive: no snapshot capture, and no point/node in the reported result.
test('runtime click uses backend ref primitive without resolving snapshot geometry', async () => {
  const tappedRefs: string[] = [];
  const device = createInteractionDevice(selectorSnapshot(), {
    platform: 'web',
    // Any snapshot capture means the coordinate-based path was taken.
    captureSnapshot: async () => {
      throw new Error('native ref click should not capture a snapshot');
    },
    tapTarget: async (_context, target) => {
      tappedRefs.push(target.ref);
      return { ref: target.ref.replace(/^@/, '') };
    },
  });

  const outcome = await device.interactions.click(ref('@e2'), { session: 'default' });

  // The backend received the ref exactly as supplied.
  assert.deepEqual(tappedRefs, ['@e2']);

  // The result describes a ref target and carries no geometry.
  assert.equal(outcome.kind, 'ref');
  assert.deepEqual(outcome.target, { kind: 'ref', ref: '@e2' });
  assert.equal(outcome.point, undefined);
  assert.equal(outcome.node, undefined);

  // Whatever the backend returned is surfaced as `backendResult`.
  assert.deepEqual(outcome.backendResult, { ref: 'e2' });
});

// A ref fill on a backend that implements `fillTarget` must go straight to that
// primitive, forwarding the text and delay, without capturing a snapshot.
test('runtime fill uses backend ref primitive without resolving snapshot geometry', async () => {
  const fillCalls: Array<{ ref: string; text: string; delayMs?: number }> = [];
  const device = createInteractionDevice(fillableSnapshot(), {
    platform: 'web',
    // Any snapshot capture means the coordinate-based path was taken.
    captureSnapshot: async () => {
      throw new Error('native ref fill should not capture a snapshot');
    },
    fillTarget: async (_context, target, text, options) => {
      fillCalls.push({ ref: target.ref, text, delayMs: options?.delayMs });
      return { ref: target.ref.replace(/^@/, ''), text };
    },
  });

  const outcome = await device.interactions.fill(ref('@e1'), 'hello', {
    session: 'default',
    delayMs: 25,
  });

  // Ref, text and delay all reach the backend unchanged.
  assert.deepEqual(fillCalls, [{ ref: '@e1', text: 'hello', delayMs: 25 }]);

  // The result describes a ref target and carries no geometry.
  assert.equal(outcome.kind, 'ref');
  assert.equal(outcome.point, undefined);
  assert.equal(outcome.node, undefined);
  assert.deepEqual(outcome.target, { kind: 'ref', ref: '@e1' });

  // The filled text and the backend payload are both echoed back.
  assert.equal(outcome.text, 'hello');
  assert.deepEqual(outcome.backendResult, { ref: 'e1', text: 'hello' });
});

test('runtime interactions pass runtime signal to backend primitives', async () => {
const controller = new AbortController();
let signal: AbortSignal | undefined;
Expand Down Expand Up @@ -1063,7 +1113,9 @@ function createInteractionDevice(
AgentDeviceBackend,
| 'captureSnapshot'
| 'tap'
| 'tapTarget'
| 'fill'
| 'fillTarget'
| 'typeText'
| 'focus'
| 'longPress'
Expand All @@ -1082,7 +1134,13 @@ function createInteractionDevice(
captureSnapshot: async (...args) =>
overrides.captureSnapshot ? await overrides.captureSnapshot(...args) : { snapshot },
tap: async (...args) => await overrides.tap?.(...args),
tapTarget: overrides.tapTarget
? async (...args) => await overrides.tapTarget?.(...args)
: undefined,
fill: async (...args) => await overrides.fill?.(...args),
fillTarget: overrides.fillTarget
? async (...args) => await overrides.fillTarget?.(...args)
: undefined,
typeText: async (...args) => await overrides.typeText?.(...args),
focus: overrides.focus ? async (...args) => await overrides.focus?.(...args) : undefined,
longPress: overrides.longPress
Expand Down
91 changes: 80 additions & 11 deletions src/commands/interaction/runtime/interactions.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import { AppError } from '../../../utils/errors.ts';
import type { ClickButton } from '../../../core/click-button.ts';
import type { AgentDeviceRuntime, CommandContext } from '../../../runtime-contract.ts';
import { isFillableType } from '../../../utils/snapshot-processing.ts';
import type { Point } from '../../../utils/snapshot.ts';
import { requireIntInRange } from '../../../utils/validation.ts';
import { successText } from '../../../utils/success-text.ts';
import { findMistargetedTypeRefToken } from '../../../utils/type-target-warning.ts';
Expand Down Expand Up @@ -85,6 +86,9 @@ export const fillCommand: RuntimeCommand<FillCommandOptions, FillCommandResult>
options,
): Promise<FillCommandResult> => {
if (!options.text) throw new AppError('INVALID_ARGS', 'fill requires text');
const nativeRefFill = await maybeFillRefTarget(runtime, options);
if (nativeRefFill) return nativeRefFill;

const resolved = await resolveInteractionTarget(runtime, options, {
action: 'fill',
requireInteractive: true,
Expand All @@ -93,14 +97,15 @@ export const fillCommand: RuntimeCommand<FillCommandOptions, FillCommandResult>
if (!runtime.backend.fill) {
throw new AppError('UNSUPPORTED_OPERATION', 'fill is not supported by this backend');
}
const point = requireResolvedPoint(resolved);
const backendResult = await runtime.backend.fill(
toBackendContext(runtime, options),
resolved.point,
point,
options.text,
{ delayMs: options.delayMs },
);
const formattedBackendResult = toBackendResult(backendResult);
const nodeType = 'node' in resolved ? (resolved.node.type ?? '') : '';
const nodeType = 'node' in resolved ? (resolved.node?.type ?? '') : '';
const warning =
nodeType && !isFillableType(nodeType, runtime.backend.platform)
? `fill target ${formatTargetForWarning(resolved)} resolved to "${nodeType}", attempting fill anyway.`
Expand Down Expand Up @@ -151,6 +156,9 @@ async function tapCommand(
options: PressCommandOptions,
action: 'click' | 'press',
): Promise<PressCommandResult> {
const nativeRefTap = await maybeTapRefTarget(runtime, options, action);
if (nativeRefTap) return nativeRefTap;

const resolved = await resolveInteractionTarget(runtime, options, {
action,
requireInteractive: true,
Expand All @@ -159,25 +167,86 @@ async function tapCommand(
if (!runtime.backend.tap) {
throw new AppError('UNSUPPORTED_OPERATION', 'tap is not supported by this backend');
}
const backendResult = await runtime.backend.tap(
const point = requireResolvedPoint(resolved);
const backendResult = await runtime.backend.tap(toBackendContext(runtime, options), point, {
button: options.button,
count: options.count,
intervalMs: options.intervalMs,
holdMs: options.holdMs,
jitterPx: options.jitterPx,
doubleTap: options.doubleTap,
});
const formattedBackendResult = toBackendResult(backendResult);
return {
...resolved,
...(formattedBackendResult ? { backendResult: formattedBackendResult } : {}),
};
}

/**
 * Returns the coordinates of a resolved target, failing loudly when the
 * resolution produced none.
 *
 * @throws AppError with code `COMMAND_FAILED` when `result.point` is missing.
 */
function requireResolvedPoint(result: { point?: Point }): Point {
  const coordinates = result.point;
  if (coordinates) {
    return coordinates;
  }
  throw new AppError('COMMAND_FAILED', 'Interaction target resolved without coordinates');
}

/**
 * Tries to perform a click through the backend's ref-based tap primitive,
 * without resolving the target to coordinates.
 *
 * The ref path is taken only when all of the following hold:
 *  - the action is `click`,
 *  - the target is a ref,
 *  - the backend implements `tapTarget`,
 *  - no non-default tap option was requested.
 *
 * @returns the ref-shaped result, or `null` when the ref path does not apply.
 */
async function maybeTapRefTarget(
  runtime: AgentDeviceRuntime,
  options: PressCommandOptions,
  action: 'click' | 'press',
): Promise<PressCommandResult | null> {
  if (action !== 'click') return null;

  const { target } = options;
  if (target.kind !== 'ref') return null;
  if (!runtime.backend.tapTarget) return null;
  if (hasNonDefaultTapOptions(options)) return null;

  const context = toBackendContext(runtime, options);
  // Forward the fallback label only when one was actually provided.
  const refTarget = target.fallbackLabel
    ? { kind: 'ref' as const, ref: target.ref, fallbackLabel: target.fallbackLabel }
    : { kind: 'ref' as const, ref: target.ref };

  const rawResult = await runtime.backend.tapTarget(context, refTarget);
  const backendPayload = toBackendResult(rawResult);

  return {
    kind: 'ref',
    target: { kind: 'ref', ref: target.ref },
    ...(backendPayload ? { backendResult: backendPayload } : {}),
  };
}

/**
 * Tries to perform a fill through the backend's ref-based fill primitive,
 * without resolving the target to coordinates.
 *
 * @param runtime - Runtime whose backend may implement `fillTarget`.
 * @param options - Fill options; `target`, `text` and `delayMs` are used here.
 * @returns a ref-shaped fill result, or `null` when the target is not a ref or
 *   the backend has no `fillTarget`, so the caller can fall back to another path.
 */
async function maybeFillRefTarget(
  runtime: AgentDeviceRuntime,
  options: FillCommandOptions,
): Promise<FillCommandResult | null> {
  if (options.target.kind !== 'ref' || !runtime.backend.fillTarget) return null;
  // `fillTarget` takes (context, target, text, options). Nothing has been
  // resolved on this path, so there is no point to pass and no tap option applies.
  const backendResult = await runtime.backend.fillTarget(
    toBackendContext(runtime, options),
    {
      kind: 'ref',
      ref: options.target.ref,
      // Forward the fallback label only when one was actually provided.
      ...(options.target.fallbackLabel ? { fallbackLabel: options.target.fallbackLabel } : {}),
    },
    options.text,
    { delayMs: options.delayMs },
  );
  const formattedBackendResult = toBackendResult(backendResult);
  return {
    kind: 'ref',
    target: { kind: 'ref', ref: options.target.ref },
    text: options.text,
    ...(formattedBackendResult ? { backendResult: formattedBackendResult } : {}),
  };
}

/**
 * Tells whether the caller asked for any tap behavior beyond a plain primary
 * click.
 *
 * Any explicitly set `count`, `intervalMs`, `holdMs`, `jitterPx` or `doubleTap`
 * counts (even when set to a falsy value such as `0` or `false`), as does a
 * `button` other than `'primary'`.
 */
function hasNonDefaultTapOptions(options: PressCommandOptions): boolean {
  const { count, intervalMs, holdMs, jitterPx, doubleTap, button } = options;
  const anyRepeatOrTimingSet = [count, intervalMs, holdMs, jitterPx, doubleTap].some(
    (value) => value !== undefined,
  );
  if (anyRepeatOrTimingSet) return true;
  return button !== undefined && button !== 'primary';
}

function formatTargetForWarning(result: {
kind: FillCommandResult['kind'];
target?: ResolvedTarget;
Expand Down
Loading
Loading