Skip to content

Commit feb6133

Browse files
committed
fix: use native web ref interactions
1 parent a285328 commit feb6133

13 files changed

Lines changed: 261 additions & 51 deletions

File tree

src/backend.ts

Lines changed: 17 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -137,6 +137,12 @@ export type BackendTapOptions = RepeatedInput & {
137137
button?: ClickButton;
138138
};
139139

140+
/**
 * Target descriptor accepted by the backend's ref-based primitives
 * (`tapTarget` / `fillTarget`), which take this object where the
 * point-based primitives (`tap` / `fill`) take a `Point`.
 */
export type BackendRefTarget = {
  /** Discriminant; `'ref'` is the only kind this type declares. */
  kind: 'ref';
  /** Element reference string identifying the target. */
  ref: string;
  /**
   * Optional label forwarded alongside the ref.
   * NOTE(review): presumably used by the backend when the ref no longer
   * matches an element; confirm against a backend implementation.
   */
  fallbackLabel?: string;
};
145+
140146
export type BackendFillOptions = {
141147
delayMs?: number;
142148
};
@@ -435,12 +441,23 @@ export type AgentDeviceBackend = {
435441
point: Point,
436442
options?: BackendTapOptions,
437443
): Promise<BackendActionResult>;
444+
tapTarget?(
445+
context: BackendCommandContext,
446+
target: BackendRefTarget,
447+
options?: BackendTapOptions,
448+
): Promise<BackendActionResult>;
438449
fill?(
439450
context: BackendCommandContext,
440451
point: Point,
441452
text: string,
442453
options?: BackendFillOptions,
443454
): Promise<BackendActionResult>;
455+
fillTarget?(
456+
context: BackendCommandContext,
457+
target: BackendRefTarget,
458+
text: string,
459+
options?: BackendFillOptions,
460+
): Promise<BackendActionResult>;
444461
typeText?(
445462
context: BackendCommandContext,
446463
text: string,

src/commands/interaction/runtime/gestures.ts

Lines changed: 18 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -133,15 +133,13 @@ export const focusCommand: RuntimeCommand<FocusCommandOptions, FocusCommandResul
133133
if (!runtime.backend.focus) {
134134
throw new AppError('UNSUPPORTED_OPERATION', 'focus is not supported by this backend');
135135
}
136-
const backendResult = await runtime.backend.focus(
137-
toBackendContext(runtime, options),
138-
resolved.point,
139-
);
136+
const point = requireResolvedPoint(resolved);
137+
const backendResult = await runtime.backend.focus(toBackendContext(runtime, options), point);
140138
const formattedBackendResult = toBackendResult(backendResult);
141139
return {
142140
...resolved,
143141
...(formattedBackendResult ? { backendResult: formattedBackendResult } : {}),
144-
...successText(`Focused (${resolved.point.x}, ${resolved.point.y})`),
142+
...successText(`Focused (${point.x}, ${point.y})`),
145143
};
146144
};
147145

@@ -161,17 +159,16 @@ export const longPressCommand: RuntimeCommand<
161159
options.durationMs === undefined
162160
? undefined
163161
: requireIntInRange(options.durationMs, 'durationMs', 0, 120_000);
164-
const backendResult = await runtime.backend.longPress(
165-
toBackendContext(runtime, options),
166-
resolved.point,
167-
{ durationMs },
168-
);
162+
const point = requireResolvedPoint(resolved);
163+
const backendResult = await runtime.backend.longPress(toBackendContext(runtime, options), point, {
164+
durationMs,
165+
});
169166
const formattedBackendResult = toBackendResult(backendResult);
170167
return {
171168
...resolved,
172169
...(durationMs !== undefined ? { durationMs } : {}),
173170
...(formattedBackendResult ? { backendResult: formattedBackendResult } : {}),
174-
...successText(`Long pressed (${resolved.point.x}, ${resolved.point.y})`),
171+
...successText(`Long pressed (${point.x}, ${point.y})`),
175172
};
176173
};
177174

@@ -193,7 +190,7 @@ export const scrollCommand: RuntimeCommand<ScrollCommandOptions, ScrollCommandRe
193190
const backendTarget =
194191
resolved.kind === 'viewport'
195192
? { kind: 'viewport' as const }
196-
: { kind: 'point' as const, point: resolved.point };
193+
: { kind: 'point' as const, point: requireResolvedPoint(resolved) };
197194
const scrollBackend = runtime.backend.scroll;
198195
const runScroll = async () =>
199196
await scrollBackend(toBackendContext(runtime, options), backendTarget, {
@@ -381,7 +378,7 @@ async function resolveSwipeFrom(
381378
promoteToHittableAncestor: false,
382379
},
383380
);
384-
return { point: target.point, target };
381+
return { point: requireResolvedPoint(target), target };
385382
}
386383
if (!options.direction) {
387384
throw new AppError('INVALID_ARGS', 'swipe requires from+to or a direction');
@@ -428,10 +425,17 @@ function buildScrollEdgeTarget(resolved: ResolvedScrollTarget): ScrollEdgeTarget
428425
? {}
429426
: {
430427
point: resolved.point,
431-
nodeIndex: 'node' in resolved ? resolved.node.index : undefined,
428+
nodeIndex: 'node' in resolved ? resolved.node?.index : undefined,
432429
};
433430
}
434431

432+
/**
 * Returns the coordinates carried by a resolved interaction target.
 *
 * Resolved targets may omit `point`, so every gesture that needs screen
 * coordinates goes through this guard before calling the backend.
 *
 * NOTE(review): an identical helper is defined in the sibling
 * `interactions.ts` runtime module; consider sharing one implementation.
 *
 * @param result - Any resolved target that may carry a `point`.
 * @returns The target's `point`.
 * @throws AppError with code `COMMAND_FAILED` when `point` is absent.
 */
function requireResolvedPoint(result: { point?: Point }): Point {
  if (!result.point) {
    throw new AppError('COMMAND_FAILED', 'Interaction target resolved without coordinates');
  }
  return result.point;
}
438+
435439
async function captureRuntimeScrollEdgeState(
436440
runtime: AgentDeviceRuntime,
437441
options: ScrollCommandOptions,

src/commands/interaction/runtime/interactions.test.ts

Lines changed: 58 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,56 @@ test('runtime click taps an explicit point without requiring a snapshot', async
2727
assert.deepEqual(result, { kind: 'point', point: { x: 10, y: 20 } });
2828
});
2929

30+
// A ref click must go straight to `backend.tapTarget` when the backend
// provides it, without capturing a snapshot or producing point/node geometry.
test('runtime click uses backend ref primitive without resolving snapshot geometry', async () => {
  const calls: string[] = [];
  const device = createInteractionDevice(selectorSnapshot(), {
    platform: 'web',
    // Throwing here makes the test fail if the runtime falls back to
    // snapshot-based resolution.
    captureSnapshot: async () => {
      throw new Error('native ref click should not capture a snapshot');
    },
    tapTarget: async (_context, target) => {
      calls.push(target.ref);
      return { ref: target.ref.replace(/^@/, '') };
    },
  });

  const result = await device.interactions.click(ref('@e2'), { session: 'default' });

  // The backend receives the ref exactly as given (with the leading '@').
  assert.deepEqual(calls, ['@e2']);
  assert.equal(result.kind, 'ref');
  assert.deepEqual(result.target, { kind: 'ref', ref: '@e2' });
  // No geometry is reported on the native ref path.
  assert.equal(result.point, undefined);
  assert.equal(result.node, undefined);
  assert.deepEqual(result.backendResult, { ref: 'e2' });
});
52+
53+
// A ref fill must go straight to `backend.fillTarget` when the backend
// provides it, forwarding the text and `delayMs` without capturing a snapshot.
test('runtime fill uses backend ref primitive without resolving snapshot geometry', async () => {
  const calls: Array<{ ref: string; text: string; delayMs?: number }> = [];
  const device = createInteractionDevice(fillableSnapshot(), {
    platform: 'web',
    // Throwing here makes the test fail if the runtime falls back to
    // snapshot-based resolution.
    captureSnapshot: async () => {
      throw new Error('native ref fill should not capture a snapshot');
    },
    fillTarget: async (_context, target, text, options) => {
      calls.push({ ref: target.ref, text, delayMs: options?.delayMs });
      return { ref: target.ref.replace(/^@/, ''), text };
    },
  });

  const result = await device.interactions.fill(ref('@e1'), 'hello', {
    session: 'default',
    delayMs: 25,
  });

  // Ref, text and delay all reach the backend unchanged.
  assert.deepEqual(calls, [{ ref: '@e1', text: 'hello', delayMs: 25 }]);
  assert.equal(result.kind, 'ref');
  // No geometry is reported on the native ref path.
  assert.equal(result.point, undefined);
  assert.equal(result.node, undefined);
  assert.deepEqual(result.target, { kind: 'ref', ref: '@e1' });
  assert.equal(result.text, 'hello');
  assert.deepEqual(result.backendResult, { ref: 'e1', text: 'hello' });
});
79+
3080
test('runtime interactions pass runtime signal to backend primitives', async () => {
3181
const controller = new AbortController();
3282
let signal: AbortSignal | undefined;
@@ -1063,7 +1113,9 @@ function createInteractionDevice(
10631113
AgentDeviceBackend,
10641114
| 'captureSnapshot'
10651115
| 'tap'
1116+
| 'tapTarget'
10661117
| 'fill'
1118+
| 'fillTarget'
10671119
| 'typeText'
10681120
| 'focus'
10691121
| 'longPress'
@@ -1082,7 +1134,13 @@ function createInteractionDevice(
10821134
captureSnapshot: async (...args) =>
10831135
overrides.captureSnapshot ? await overrides.captureSnapshot(...args) : { snapshot },
10841136
tap: async (...args) => await overrides.tap?.(...args),
1137+
tapTarget: overrides.tapTarget
1138+
? async (...args) => await overrides.tapTarget?.(...args)
1139+
: undefined,
10851140
fill: async (...args) => await overrides.fill?.(...args),
1141+
fillTarget: overrides.fillTarget
1142+
? async (...args) => await overrides.fillTarget?.(...args)
1143+
: undefined,
10861144
typeText: async (...args) => await overrides.typeText?.(...args),
10871145
focus: overrides.focus ? async (...args) => await overrides.focus?.(...args) : undefined,
10881146
longPress: overrides.longPress

src/commands/interaction/runtime/interactions.ts

Lines changed: 80 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@ import { AppError } from '../../../utils/errors.ts';
22
import type { ClickButton } from '../../../core/click-button.ts';
33
import type { AgentDeviceRuntime, CommandContext } from '../../../runtime-contract.ts';
44
import { isFillableType } from '../../../utils/snapshot-processing.ts';
5+
import type { Point } from '../../../utils/snapshot.ts';
56
import { requireIntInRange } from '../../../utils/validation.ts';
67
import { successText } from '../../../utils/success-text.ts';
78
import { findMistargetedTypeRefToken } from '../../../utils/type-target-warning.ts';
@@ -85,6 +86,9 @@ export const fillCommand: RuntimeCommand<FillCommandOptions, FillCommandResult>
8586
options,
8687
): Promise<FillCommandResult> => {
8788
if (!options.text) throw new AppError('INVALID_ARGS', 'fill requires text');
89+
const nativeRefFill = await maybeFillRefTarget(runtime, options);
90+
if (nativeRefFill) return nativeRefFill;
91+
8892
const resolved = await resolveInteractionTarget(runtime, options, {
8993
action: 'fill',
9094
requireInteractive: true,
@@ -93,14 +97,15 @@ export const fillCommand: RuntimeCommand<FillCommandOptions, FillCommandResult>
9397
if (!runtime.backend.fill) {
9498
throw new AppError('UNSUPPORTED_OPERATION', 'fill is not supported by this backend');
9599
}
100+
const point = requireResolvedPoint(resolved);
96101
const backendResult = await runtime.backend.fill(
97102
toBackendContext(runtime, options),
98-
resolved.point,
103+
point,
99104
options.text,
100105
{ delayMs: options.delayMs },
101106
);
102107
const formattedBackendResult = toBackendResult(backendResult);
103-
const nodeType = 'node' in resolved ? (resolved.node.type ?? '') : '';
108+
const nodeType = 'node' in resolved ? (resolved.node?.type ?? '') : '';
104109
const warning =
105110
nodeType && !isFillableType(nodeType, runtime.backend.platform)
106111
? `fill target ${formatTargetForWarning(resolved)} resolved to "${nodeType}", attempting fill anyway.`
@@ -151,6 +156,9 @@ async function tapCommand(
151156
options: PressCommandOptions,
152157
action: 'click' | 'press',
153158
): Promise<PressCommandResult> {
159+
const nativeRefTap = await maybeTapRefTarget(runtime, options, action);
160+
if (nativeRefTap) return nativeRefTap;
161+
154162
const resolved = await resolveInteractionTarget(runtime, options, {
155163
action,
156164
requireInteractive: true,
@@ -159,25 +167,86 @@ async function tapCommand(
159167
if (!runtime.backend.tap) {
160168
throw new AppError('UNSUPPORTED_OPERATION', 'tap is not supported by this backend');
161169
}
162-
const backendResult = await runtime.backend.tap(
170+
const point = requireResolvedPoint(resolved);
171+
const backendResult = await runtime.backend.tap(toBackendContext(runtime, options), point, {
172+
button: options.button,
173+
count: options.count,
174+
intervalMs: options.intervalMs,
175+
holdMs: options.holdMs,
176+
jitterPx: options.jitterPx,
177+
doubleTap: options.doubleTap,
178+
});
179+
const formattedBackendResult = toBackendResult(backendResult);
180+
return {
181+
...resolved,
182+
...(formattedBackendResult ? { backendResult: formattedBackendResult } : {}),
183+
};
184+
}
185+
186+
/**
 * Returns the coordinates carried by a resolved interaction target.
 *
 * Resolved targets may omit `point`, so the point-based `tap` / `fill`
 * paths go through this guard before calling the backend.
 *
 * NOTE(review): an identical helper is defined in the sibling
 * `gestures.ts` runtime module; consider sharing one implementation.
 *
 * @param result - Any resolved target that may carry a `point`.
 * @returns The target's `point`.
 * @throws AppError with code `COMMAND_FAILED` when `point` is absent.
 */
function requireResolvedPoint(result: { point?: Point }): Point {
  if (!result.point) {
    throw new AppError('COMMAND_FAILED', 'Interaction target resolved without coordinates');
  }
  return result.point;
}
192+
193+
/**
 * Attempts a click through the backend's ref-based `tapTarget` primitive,
 * bypassing snapshot/point resolution.
 *
 * Returns `null` (so the caller continues with the point-based path) when
 * any of the following holds:
 *  - `action` is not `'click'`;
 *  - the command target is not a ref;
 *  - the backend does not implement `tapTarget`;
 *  - the caller supplied tap options other than the primary button.
 *
 * @param runtime - Runtime whose backend is asked to perform the tap.
 * @param options - Press/click options, including the command target.
 * @param action - Which command is running; only `'click'` uses this path.
 * @returns A `kind: 'ref'` result without `point`/`node`, or `null` when the
 *   ref path does not apply.
 */
async function maybeTapRefTarget(
  runtime: AgentDeviceRuntime,
  options: PressCommandOptions,
  action: 'click' | 'press',
): Promise<PressCommandResult | null> {
  if (action !== 'click' || options.target.kind !== 'ref' || !runtime.backend.tapTarget) {
    return null;
  }
  // No tap options are forwarded to `tapTarget` below, so any customised
  // tap must take the point-based path instead.
  if (hasNonDefaultTapOptions(options)) return null;
  const backendResult = await runtime.backend.tapTarget(toBackendContext(runtime, options), {
    kind: 'ref',
    ref: options.target.ref,
    // Only include the label when it is set, keeping the payload minimal.
    ...(options.target.fallbackLabel ? { fallbackLabel: options.target.fallbackLabel } : {}),
  });
  const formattedBackendResult = toBackendResult(backendResult);
  return {
    kind: 'ref',
    target: { kind: 'ref', ref: options.target.ref },
    ...(formattedBackendResult ? { backendResult: formattedBackendResult } : {}),
  };
}
214+
215+
async function maybeFillRefTarget(
216+
runtime: AgentDeviceRuntime,
217+
options: FillCommandOptions,
218+
): Promise<FillCommandResult | null> {
219+
if (options.target.kind !== 'ref' || !runtime.backend.fillTarget) return null;
220+
const backendResult = await runtime.backend.fillTarget(
163221
toBackendContext(runtime, options),
164-
resolved.point,
165222
{
166-
button: options.button,
167-
count: options.count,
168-
intervalMs: options.intervalMs,
169-
holdMs: options.holdMs,
170-
jitterPx: options.jitterPx,
171-
doubleTap: options.doubleTap,
223+
kind: 'ref',
224+
ref: options.target.ref,
225+
...(options.target.fallbackLabel ? { fallbackLabel: options.target.fallbackLabel } : {}),
172226
},
227+
options.text,
228+
{ delayMs: options.delayMs },
173229
);
174230
const formattedBackendResult = toBackendResult(backendResult);
175231
return {
176-
...resolved,
232+
kind: 'ref',
233+
target: { kind: 'ref', ref: options.target.ref },
234+
text: options.text,
177235
...(formattedBackendResult ? { backendResult: formattedBackendResult } : {}),
178236
};
179237
}
180238

239+
/**
 * Reports whether the caller customised the tap beyond a plain primary click.
 *
 * Only presence is checked for `count`, `intervalMs`, `holdMs`, `jitterPx`
 * and `doubleTap`: any value other than `undefined` (including `0` or
 * `false`) counts as customised. `button` counts only when it is set to
 * something other than `'primary'`.
 *
 * @param options - Press/click options supplied by the caller.
 * @returns `true` when at least one tap option is customised.
 */
function hasNonDefaultTapOptions(options: PressCommandOptions): boolean {
  // Every operand is already a boolean, so no `Boolean(...)` coercion is needed.
  return (
    options.count !== undefined ||
    options.intervalMs !== undefined ||
    options.holdMs !== undefined ||
    options.jitterPx !== undefined ||
    options.doubleTap !== undefined ||
    (options.button !== undefined && options.button !== 'primary')
  );
}
249+
181250
function formatTargetForWarning(result: {
182251
kind: FillCommandResult['kind'];
183252
target?: ResolvedTarget;

src/contracts/interaction.ts

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -38,10 +38,10 @@ export type ResolvedInteractionTarget =
3838
}
3939
| {
4040
kind: 'ref';
41-
point: Point;
41+
point?: Point;
4242
target: Extract<ResolvedTarget, { kind: 'ref' }>;
43-
node: SnapshotNode;
44-
selectorChain: string[];
43+
node?: SnapshotNode;
44+
selectorChain?: string[];
4545
refLabel?: string;
4646
}
4747
| {

0 commit comments

Comments
 (0)