From 58410db76848f52c500042bc2466aabd4393463f Mon Sep 17 00:00:00 2001 From: Les Orchard Date: Tue, 26 May 2026 15:15:57 -0700 Subject: [PATCH] fix(core): make repetition detection survive ref churn (#430) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The repetition detector hashed action signatures as action:ref:value, but refs are regenerated on every ARIA snapshot. A logical "click Submit" got a different ref each turn, so the detector treated each attempt as a distinct action and the warn/abort thresholds never tripped on the bug they were designed to catch. Drop ref from the signature, normalize value (lowercase + trim), and exempt scroll and wait from detection entirely — their same-value repetition is legitimate workflow (traversing an infinite feed, polling a slow page), not a stuck-loop signal. To keep ref-only actions (click, hover, check, etc.) from collapsing every call onto the bare `action:` signature, also thread element identity into the detector. ariaSnapshot.ts populates a new globalThis.__piloIdentityMap alongside __piloRefMap, and a new AriaBrowser.getRefIdentity() method (implemented in PlaywrightBrowser and ExtensionBrowser) looks it up. performActionWithValidation captures identity BEFORE the action runs (while the ref is still valid) and attaches it to ActionResult as targetIdentity, which the signature folds in when present. Now click:button:Next: and click:button:Submit: hash distinctly, while click:button:Submit: remains constant across ref churn. Defers the larger pieces from #430 (config rename, stagnant-page detector, decoupling warn from forced snapshot) to follow-up PRs. Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/core/src/browser/ariaBrowser.ts | 8 + .../core/src/browser/ariaTree/ariaSnapshot.ts | 12 + .../core/src/browser/playwrightBrowser.ts | 19 + packages/core/src/tools/webActionTools.ts | 17 + packages/core/src/webAgent.ts | 47 +- .../core/test/tools/webActionTools.test.ts | 32 ++ packages/core/test/webAgent.test.ts | 424 ++++++++++++++++++ .../src/background/ExtensionBrowser.ts | 22 + 8 files changed, 576 insertions(+), 5 deletions(-) diff --git a/packages/core/src/browser/ariaBrowser.ts b/packages/core/src/browser/ariaBrowser.ts index ca1abacd..06fbf382 100644 --- a/packages/core/src/browser/ariaBrowser.ts +++ b/packages/core/src/browser/ariaBrowser.ts @@ -99,6 +99,14 @@ export interface AriaBrowser { */ performAction(ref: string, action: PageAction, value?: string): Promise; + /** + * Look up element identity (role + accessible name) for a ref from the + * most recent ariaTree snapshot. Returns null if the ref is unknown. + * Used by the repetition detector to distinguish logical targets when + * the ref string itself churns between snapshots. + */ + getRefIdentity(ref: string): Promise<{ role: string; name: string } | null>; + /** * Waits for a specific load state of the page * @param state The load state to wait for diff --git a/packages/core/src/browser/ariaTree/ariaSnapshot.ts b/packages/core/src/browser/ariaTree/ariaSnapshot.ts index cff71e8c..738fb2de 100644 --- a/packages/core/src/browser/ariaTree/ariaSnapshot.ts +++ b/packages/core/src/browser/ariaTree/ariaSnapshot.ts @@ -48,6 +48,11 @@ export function generateAndRenderAriaTree(root: Element, counter?: RefCounter): // that strip data-pilo-ref attributes, e.g. React reconciliation) (globalThis as any).__piloRefMap = new Map(); + // Initialize identity map for ref -> {role, name} lookup. Used by the + // repetition detector to distinguish logical click targets when refs + // themselves churn between snapshots. + (globalThis as any).__piloIdentityMap = new Map(); + const ariaTree = generateAriaTree(root); return renderAriaTree(ariaTree, refCounter); } @@ -493,6 +498,13 @@ function renderAriaTree(root: AriaNode, counter: RefCounter): string { if (ariaNode.element) { (globalThis as any).__piloRefMap?.set(ref, ariaNode.element); } + // Capture role + accessible name for the repetition detector. Stored + // separately from refMap so consumers that only need identity don't + // have to re-walk the DOM. + (globalThis as any).__piloIdentityMap?.set(ref, { + role: String(ariaNode.role), + name: ariaNode.name ?? "", + }); } const escapedKey = indent + "- " + yamlEscapeKeyIfNeeded(key); diff --git a/packages/core/src/browser/playwrightBrowser.ts b/packages/core/src/browser/playwrightBrowser.ts index 510f2a71..9faffcbf 100644 --- a/packages/core/src/browser/playwrightBrowser.ts +++ b/packages/core/src/browser/playwrightBrowser.ts @@ -788,6 +788,25 @@ export class PlaywrightBrowser implements AriaBrowser { return locator; } + async getRefIdentity(ref: string): Promise<{ role: string; name: string } | null> { + if (!this.page) return null; + try { + return await this.page.evaluate( + (refArg) => + ( + globalThis as unknown as { + __piloIdentityMap?: Map; + } + ).__piloIdentityMap?.get(refArg) ?? null, + ref, + ); + } catch { + // The page may have navigated or torn down the identity map. Identity + // is advisory for repetition detection — log nothing, just bail out. + return null; + } + } + async performAction(ref: string, action: PageAction, value?: string): Promise { if (!this.page) throw new Error("Browser not started"); return withSpan( diff --git a/packages/core/src/tools/webActionTools.ts b/packages/core/src/tools/webActionTools.ts index f1f415de..b6c8e04b 100644 --- a/packages/core/src/tools/webActionTools.ts +++ b/packages/core/src/tools/webActionTools.ts @@ -43,6 +43,11 @@ type ActionResult = { value?: string | number; error?: string; isRecoverable?: boolean; + // Snapshot-time role + accessible name for the targeted element. Optional + // (only populated when a ref is provided and the browser can resolve it). + // Used by the repetition detector to distinguish logical targets even + // when ref strings churn between snapshots. + targetIdentity?: { role: string; name: string }; }; /** @@ -82,6 +87,17 @@ async function performActionWithValidation( value, }); + // Capture target identity BEFORE the action runs — the snapshot's + // identity map is the basis for the ref the LLM emitted, and the + // action itself may invalidate the ref by causing navigation or DOM + // teardown. Identity is advisory; null means we just won't fold it + // into the repetition signature for this turn. + let targetIdentity: { role: string; name: string } | undefined; + if (ref) { + const id = await context.browser.getRefIdentity(ref); + if (id) targetIdentity = id; + } + // Perform the action await context.browser.performAction(ref || "", action, value); @@ -97,6 +113,7 @@ async function performActionWithValidation( action, ...(ref && { ref }), ...(value !== undefined && { value }), + ...(targetIdentity && { targetIdentity }), }; } catch (error) { // For browser exceptions, emit failure with error details and return error info diff --git a/packages/core/src/webAgent.ts b/packages/core/src/webAgent.ts index 05d68a56..187ff38c 100644 --- a/packages/core/src/webAgent.ts +++ b/packages/core/src/webAgent.ts @@ -9,7 +9,7 @@ import { streamText, ModelMessage, StreamTextResult } from "ai"; import type { ProviderConfig } from "./provider.js"; -import { AriaBrowser } from "./browser/ariaBrowser.js"; +import { AriaBrowser, PageAction } from "./browser/ariaBrowser.js"; import { BrowserReconnectedEventData, CdpEndpointConnectedEventData, @@ -230,6 +230,15 @@ export class WebAgent { private readonly onUserDataRequired: UserDataCallback | undefined; private readonly taskId: string | undefined; + // Actions where same-action-same-value repetition is legitimate workflow + // (e.g. scrolling an infinite feed, waiting for a slow page) rather than a + // stuck-loop signal. The detector skips these entirely and resets state so + // a real loop interrupted by a scroll doesn't compound across the gap. + private static readonly REPETITION_EXEMPT_ACTIONS: ReadonlySet = new Set([ + PageAction.Scroll, + PageAction.Wait, + ]); + constructor( private browser: AriaBrowser, options: WebAgentOptions, @@ -1154,6 +1163,15 @@ export class WebAgent { actionExecuted: boolean; error?: TaskError; } | null { + // Skip exempt actions entirely, and reset state so an in-progress repeat + // count doesn't carry across them (an exempt action between two clicks + // is treated as progress, not as a transparent gap). + if (WebAgent.REPETITION_EXEMPT_ACTIONS.has(actionOutput.action)) { + executionState.actionRepeatCount = 0; + executionState.lastAction = undefined; + return null; + } + // Define explicit thresholds for warning and abort const REPETITION_WARNING_THRESHOLD = this.maxRepeatedActions + 1; const REPETITION_ABORT_THRESHOLD = this.maxRepeatedActions + 2; @@ -1161,8 +1179,8 @@ export class WebAgent { // Create signature for current action const currentActionSignature = this.createActionSignature( actionOutput.action, - actionOutput.ref, actionOutput.value, + actionOutput.targetIdentity, ); // Check if this is the same action as the last one @@ -1505,10 +1523,29 @@ export class WebAgent { // === Helper Methods === /** - * Create a signature for an action to track repetitions + * Build a repetition-signature for an action. The element ref is + * deliberately excluded because refs are regenerated every snapshot — a + * logical "click Submit" gets a new ref each turn, so including it would + * make near-duplicate actions hash differently and bypass the detector. + * Element identity (role + accessible name) is folded in when present so + * ref-only actions like `click` don't collapse different logical targets + * (e.g. "click Next" vs "click Submit") onto the same signature. + * Value is normalized (lowercase + trim) so cosmetic input variation + * doesn't reset the counter on the same logical action. */ - private createActionSignature(action: string, ref?: string, value?: string | number): string { - return `${action}:${ref || ""}:${value || ""}`; + private createActionSignature( + action: string, + value?: string | number, + identity?: { role: string; name: string }, + ): string { + const normalizedValue = String(value ?? "") + .toLowerCase() + .trim(); + if (identity) { + const normalizedName = identity.name.toLowerCase().trim(); + return `${action}:${identity.role}:${normalizedName}:${normalizedValue}`; + } + return `${action}:${normalizedValue}`; } /** diff --git a/packages/core/test/tools/webActionTools.test.ts b/packages/core/test/tools/webActionTools.test.ts index 34374e74..2254bd87 100644 --- a/packages/core/test/tools/webActionTools.test.ts +++ b/packages/core/test/tools/webActionTools.test.ts @@ -73,6 +73,10 @@ class MockBrowser implements AriaBrowser { // Mock implementation - can be configured to throw errors for testing } + async getRefIdentity(_ref: string): Promise<{ role: string; name: string } | null> { + return null; + } + async waitForLoadState(): Promise {} async runInTemporaryTab(fn: (tab: any) => Promise): Promise { @@ -256,6 +260,34 @@ describe("Web Action Tools", () => { await expect(tools.click.execute({ ref: "btn1" })).rejects.toThrow("Network error"); }); + + it("should capture target identity from the browser and include it in the result", async () => { + vi.spyOn(mockBrowser, "getRefIdentity").mockResolvedValueOnce({ + role: "button", + name: "Submit", + }); + + const result = await tools.click.execute({ ref: "btn1" }); + + expect(result).toEqual({ + success: true, + action: "click", + ref: "btn1", + targetIdentity: { role: "button", name: "Submit" }, + }); + }); + + it("should omit targetIdentity when the browser returns null", async () => { + vi.spyOn(mockBrowser, "getRefIdentity").mockResolvedValueOnce(null); + + const result = await tools.click.execute({ ref: "btn1" }); + + expect(result).toEqual({ + success: true, + action: "click", + ref: "btn1", + }); + }); }); describe("Fill Action", () => { diff --git a/packages/core/test/webAgent.test.ts b/packages/core/test/webAgent.test.ts index 421456b2..a3237dcf 100644 --- a/packages/core/test/webAgent.test.ts +++ b/packages/core/test/webAgent.test.ts @@ -191,6 +191,10 @@ class MockBrowser implements AriaBrowser { async performAction(_ref: string, _action: PageAction, _value?: string): Promise {} + async getRefIdentity(_ref: string): Promise<{ role: string; name: string } | null> { + return null; + } + async waitForLoadState(): Promise {} async runInTemporaryTab(fn: (tab: any) => Promise): Promise { @@ -3867,5 +3871,425 @@ describe("WebAgent", () => { expect(result.finalAnswer).toBe("Completed"); expect(result.stats.actions).toBe(4); // 2 clicks + 1 fill + 1 done }); + + it("should detect repeated clicks across changing element refs", async () => { + // Regression test for #430: refs change every snapshot, so the agent + // calling click on the "same" logical button gets a different ref each + // turn. Pre-fix, this slipped past the detector entirely. + mockGenerateTextWithRetry.mockResolvedValueOnce({ + text: "Plan", + toolResults: [ + { + type: "tool-result", + toolCallId: "plan_1", + toolName: "create_plan", + output: { + successCriteria: "Test task", + plan: "1. Click submit", + }, + }, + ], + } as any); + + // Five clicks on the "same" logical button, but ref churns each turn. + // Defaults: warning at count 3 (maxRepeatedActions+1), abort at count 4. + const churningRefs = ["E1", "E5", "E12", "E3", "E8"]; + for (const ref of churningRefs) { + mockStreamText.mockReturnValueOnce( + createMockStreamResponse({ + text: "Click", + toolResults: [ + { + type: "tool-result", + toolCallId: `click_${ref}`, + toolName: "click", + input: { ref }, + output: { + success: true, + action: "click", + ref, + }, + }, + ], + response: { + messages: [{ role: "assistant", content: "Click" }], + }, + }) as any, + ); + } + + const result = await webAgent.execute("Test task", { + startingUrl: "https://example.com", + }); + + expect(result.success).toBe(false); + expect(result.finalAnswer).toContain("Aborted: Excessive repetition"); + expect(result.finalAnswer).toContain("click"); + }); + + it("should not flag legitimate repeated scroll actions", async () => { + // Scroll repeats with the same direction are legitimate workflow + // (traversing an infinite-scroll feed). The detector must exempt them. + mockGenerateTextWithRetry.mockResolvedValueOnce({ + text: "Plan", + toolResults: [ + { + type: "tool-result", + toolCallId: "plan_1", + toolName: "create_plan", + output: { + successCriteria: "Reach the bottom of the feed", + plan: "1. Scroll down repeatedly", + }, + }, + ], + } as any); + + // Five scrolls in a row would trip the detector under the old logic + // (same action, same value, same ref). They must not now. + for (let i = 0; i < 5; i++) { + mockStreamText.mockReturnValueOnce( + createMockStreamResponse({ + text: "Scroll", + toolResults: [ + { + type: "tool-result", + toolCallId: `scroll_${i}`, + toolName: "scroll", + input: { direction: "down" }, + output: { + success: true, + action: "scroll", + value: "down", + }, + }, + ], + response: { + messages: [{ role: "assistant", content: "Scroll" }], + }, + }) as any, + ); + } + + // Then finish cleanly. + mockStreamText.mockReturnValueOnce( + createMockStreamResponse({ + text: "Done", + toolResults: [ + { + type: "tool-result", + toolCallId: "done_1", + toolName: "done", + input: { result: "Reached bottom" }, + output: { + action: "done", + result: "Reached bottom", + isTerminal: true, + }, + }, + ], + response: { + messages: [{ role: "assistant", content: "Done" }], + }, + }) as any, + ); + + mockGenerateTextWithRetry.mockResolvedValueOnce(mockValidationResponse("complete")); + + const result = await webAgent.execute("Test task", { + startingUrl: "https://example.com", + }); + + expect(result.success).toBe(true); + expect(result.finalAnswer).toBe("Reached bottom"); + + // No "repeated the same action" warning should have been injected. + const messages = mockStreamText.mock.calls.flatMap((call) => call[0]?.messages || []); + const warningMessage = messages.find( + (msg: any) => msg.role === "user" && msg.content?.includes("repeated the same action"), + ); + expect(warningMessage).toBeUndefined(); + }); + + it("should not flag legitimate repeated wait actions", async () => { + // Same shape as the scroll exemption test: identical wait calls must + // not be flagged as repetition (waiting for a slow page legitimately + // repeats with the same duration). + mockGenerateTextWithRetry.mockResolvedValueOnce({ + text: "Plan", + toolResults: [ + { + type: "tool-result", + toolCallId: "plan_1", + toolName: "create_plan", + output: { + successCriteria: "Wait for page to load", + plan: "1. Wait for content", + }, + }, + ], + } as any); + + for (let i = 0; i < 5; i++) { + mockStreamText.mockReturnValueOnce( + createMockStreamResponse({ + text: "Wait", + toolResults: [ + { + type: "tool-result", + toolCallId: `wait_${i}`, + toolName: "wait", + input: { seconds: 2 }, + output: { + success: true, + action: "wait", + value: "2", + }, + }, + ], + response: { + messages: [{ role: "assistant", content: "Wait" }], + }, + }) as any, + ); + } + + mockStreamText.mockReturnValueOnce( + createMockStreamResponse({ + text: "Done", + toolResults: [ + { + type: "tool-result", + toolCallId: "done_1", + toolName: "done", + input: { result: "Page loaded" }, + output: { + action: "done", + result: "Page loaded", + isTerminal: true, + }, + }, + ], + response: { + messages: [{ role: "assistant", content: "Done" }], + }, + }) as any, + ); + + mockGenerateTextWithRetry.mockResolvedValueOnce(mockValidationResponse("complete")); + + const result = await webAgent.execute("Test task", { + startingUrl: "https://example.com", + }); + + expect(result.success).toBe(true); + expect(result.finalAnswer).toBe("Page loaded"); + + const messages = mockStreamText.mock.calls.flatMap((call) => call[0]?.messages || []); + const warningMessage = messages.find( + (msg: any) => msg.role === "user" && msg.content?.includes("repeated the same action"), + ); + expect(warningMessage).toBeUndefined(); + }); + + it("should treat cosmetically different fill values as repetition", async () => { + // Value normalization (trim + lowercase) folds near-duplicates. An + // agent that fills "hello", then "HELLO ", then "hello" again is in a + // loop even though the strings aren't byte-identical. + mockGenerateTextWithRetry.mockResolvedValueOnce({ + text: "Plan", + toolResults: [ + { + type: "tool-result", + toolCallId: "plan_1", + toolName: "create_plan", + output: { + successCriteria: "Test task", + plan: "1. Fill input", + }, + }, + ], + } as any); + + const variants = ["hello", "HELLO ", " Hello", "hello", "HELLO"]; + for (let i = 0; i < variants.length; i++) { + mockStreamText.mockReturnValueOnce( + createMockStreamResponse({ + text: "Fill", + toolResults: [ + { + type: "tool-result", + toolCallId: `fill_${i}`, + toolName: "fill", + input: { ref: `input_${i}`, value: variants[i] }, + output: { + success: true, + action: "fill", + ref: `input_${i}`, + value: variants[i], + }, + }, + ], + response: { + messages: [{ role: "assistant", content: "Fill" }], + }, + }) as any, + ); + } + + const result = await webAgent.execute("Test task", { + startingUrl: "https://example.com", + }); + + expect(result.success).toBe(false); + expect(result.finalAnswer).toContain("Aborted: Excessive repetition"); + expect(result.finalAnswer).toContain("fill"); + }); + + it("should not flag clicks on different logical targets", async () => { + // Pass 2: ref-only actions (click, hover, check, etc.) all carry no + // `value`, so without element identity the bare `click:` signature + // collapses every click together — a multi-step wizard or pagination + // run trips the detector falsely. The targetIdentity field on the + // tool output distinguishes "click button:Next" from "click button:Submit". + mockGenerateTextWithRetry.mockResolvedValueOnce({ + text: "Plan", + toolResults: [ + { + type: "tool-result", + toolCallId: "plan_1", + toolName: "create_plan", + output: { + successCriteria: "Walk through a wizard", + plan: "1. Click through wizard steps", + }, + }, + ], + } as any); + + const targets = [ + { ref: "E1", role: "button", name: "Next" }, + { ref: "E5", role: "button", name: "Continue" }, + { ref: "E12", role: "link", name: "Skip" }, + { ref: "E3", role: "button", name: "Confirm" }, + { ref: "E8", role: "button", name: "Finish" }, + ]; + for (const { ref, role, name } of targets) { + mockStreamText.mockReturnValueOnce( + createMockStreamResponse({ + text: "Click", + toolResults: [ + { + type: "tool-result", + toolCallId: `click_${ref}`, + toolName: "click", + input: { ref }, + output: { + success: true, + action: "click", + ref, + targetIdentity: { role, name }, + }, + }, + ], + response: { + messages: [{ role: "assistant", content: "Click" }], + }, + }) as any, + ); + } + + mockStreamText.mockReturnValueOnce( + createMockStreamResponse({ + text: "Done", + toolResults: [ + { + type: "tool-result", + toolCallId: "done_1", + toolName: "done", + input: { result: "Wizard completed" }, + output: { + action: "done", + result: "Wizard completed", + isTerminal: true, + }, + }, + ], + response: { + messages: [{ role: "assistant", content: "Done" }], + }, + }) as any, + ); + + mockGenerateTextWithRetry.mockResolvedValueOnce(mockValidationResponse("complete")); + + const result = await webAgent.execute("Test task", { + startingUrl: "https://example.com", + }); + + expect(result.success).toBe(true); + expect(result.finalAnswer).toBe("Wizard completed"); + + const messages = mockStreamText.mock.calls.flatMap((call) => call[0]?.messages || []); + const warningMessage = messages.find( + (msg: any) => msg.role === "user" && msg.content?.includes("repeated the same action"), + ); + expect(warningMessage).toBeUndefined(); + }); + + it("should detect repeated clicks on the same logical target across ref churn", async () => { + // Hardened version of the Pass 1 regression test: same logical button + // (role + name) across changing refs still trips the detector once + // identity is in the signature. Distinguishes the bug case (one button + // repeatedly clicked) from the wizard case (different buttons clicked). + mockGenerateTextWithRetry.mockResolvedValueOnce({ + text: "Plan", + toolResults: [ + { + type: "tool-result", + toolCallId: "plan_1", + toolName: "create_plan", + output: { + successCriteria: "Test task", + plan: "1. Click submit", + }, + }, + ], + } as any); + + const churningRefs = ["E1", "E5", "E12", "E3", "E8"]; + for (const ref of churningRefs) { + mockStreamText.mockReturnValueOnce( + createMockStreamResponse({ + text: "Click", + toolResults: [ + { + type: "tool-result", + toolCallId: `click_${ref}`, + toolName: "click", + input: { ref }, + output: { + success: true, + action: "click", + ref, + targetIdentity: { role: "button", name: "Submit" }, + }, + }, + ], + response: { + messages: [{ role: "assistant", content: "Click" }], + }, + }) as any, + ); + } + + const result = await webAgent.execute("Test task", { + startingUrl: "https://example.com", + }); + + expect(result.success).toBe(false); + expect(result.finalAnswer).toContain("Aborted: Excessive repetition"); + expect(result.finalAnswer).toContain("click"); + }); }); }); diff --git a/packages/extension/src/background/ExtensionBrowser.ts b/packages/extension/src/background/ExtensionBrowser.ts index a3ad81c5..eff4d846 100644 --- a/packages/extension/src/background/ExtensionBrowser.ts +++ b/packages/extension/src/background/ExtensionBrowser.ts @@ -302,6 +302,28 @@ export class ExtensionBrowser implements AriaBrowser { } } + async getRefIdentity(ref: string): Promise<{ role: string; name: string } | null> { + try { + const tab = await this.getActiveTab(); + const [{ result }] = await browser.scripting.executeScript({ + target: { tabId: tab.id! }, + func: (refArg: string) => + ( + globalThis as unknown as { + __piloIdentityMap?: Map; + } + ).__piloIdentityMap?.get(refArg) ?? null, + args: [ref], + }); + return (result as { role: string; name: string } | null) ?? null; + } catch { + // Identity is advisory for repetition detection — if the page or + // identity map isn't available, bail out rather than failing the + // surrounding action. + return null; + } + } + async performAction(ref: string, action: PageAction, value?: string): Promise { console.log( `ExtensionBrowser: performAction() called with ref: ${ref}, action: ${action}, value: ${value}`,