From 768773774f26f306030f065ba3429765ac3fa084 Mon Sep 17 00:00:00 2001 From: Stephane Segning Lambou Date: Sun, 14 Jun 2026 17:40:24 +0200 Subject: [PATCH 1/9] docs(browser): design doc for human-in-the-loop UI feedback Plan for a browser_request_feedback tool that paints an annotation overlay and blocks on the user, returning structured annotations (resolved to element refs). Phased: 0) CancelFrame + per-command timeout infra, 1) confirm/choose/point overlay, 2) region/element/ comment + screenshot + side-panel fallback. Also index multi-client-routing.md in the plans list (was only linked from the browser table row). Co-Authored-By: Claude Opus 4.8 (1M context) --- CLAUDE.md | 2 + plans/ui-feedback.md | 235 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 237 insertions(+) create mode 100644 plans/ui-feedback.md diff --git a/CLAUDE.md b/CLAUDE.md index 43c2dfa..311d618 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -138,5 +138,7 @@ Versions are bumped **manually** — there are no changesets and no release scri - [`plans/prd.md`](plans/prd.md) — original oauth2 PRD with the phased roadmap. - [`plans/models-info-plan.md`](plans/models-info-plan.md) — design doc for the metadata plugin, including the OpenRouter→OpenCode field mapping table. +- [`plans/multi-client-routing.md`](plans/multi-client-routing.md) — design (final) for the browser bridge's auto-elect broker: multi-executor + multi-agent routing by group ownership, host-or-guest election, failover. +- [`plans/ui-feedback.md`](plans/ui-feedback.md) — design (draft) for human-in-the-loop browser feedback: a `browser_request_feedback` tool that paints an annotation overlay and blocks on the user; needs a `CancelFrame` + per-command timeout first. - [`docs/`](docs/) — the architecture doc is canonical for hook behavior. Also: [`well-known.md`](docs/well-known.md) (`.well-known/opencode` distribution), [`models-info.md`](docs/models-info.md) (enrichment composition + caching), [`ratelimit.md`](docs/ratelimit.md) (rate-limit policy/tiers), [`browser.md`](docs/browser.md) (browser-automation dual plugin — topology, wire protocol, tool reference, executors, security), [`troubleshooting.md`](docs/troubleshooting.md) (symptom-keyed fixes), plus GitHub Actions / Kubernetes cookbooks and local-dev setup. - [`docs/adr/`](docs/adr/) — Architecture Decision Records: load-bearing, non-obvious decisions and *why* (e.g. [ADR-0001](docs/adr/0001-bridge-transport-ws-not-bun-serve-or-socketio.md) — the browser bridge uses `ws`, not `Bun.serve` or socket.io). Add one when a choice closes off alternatives someone would reasonably reach for. diff --git a/plans/ui-feedback.md b/plans/ui-feedback.md new file mode 100644 index 0000000..1d63623 --- /dev/null +++ b/plans/ui-feedback.md @@ -0,0 +1,235 @@ +# UI feedback / human-in-the-loop (design) + +Status: **design / not yet implemented.** Greenfield follow-up to the browser plugin + MCP server. +Builds on the routing in [`multi-client-routing.md`](multi-client-routing.md) and the ws transport in +[`docs/adr/0001-bridge-transport-ws-not-bun-serve-or-socketio.md`](../docs/adr/0001-bridge-transport-ws-not-bun-serve-or-socketio.md). + +## Context + +Today the bridge is **fully automated**: the agent issues `command` frames, the extension executes +them programmatically (CDP or content-script), and answers with a `result`. There is **no** human in +the loop anywhere — no overlay, no prompt, no `chrome.notifications` (verified across `background/` +and `entrypoints/`). The popup/options panels are read-only dashboards. + +We want a second, *complementary* mode: when the agent is unsure what the user means, it can ask the +**user** to mark up the live page — click the element they meant, box a region, drop a comment — and +get that feedback back as structured data it can act on. "Show me which list" instead of guessing. + +## The key insight: the control direction does **not** invert + +This is tempting to model as "the UI drives the agent," but it isn't. The agent still **initiates** +(a normal tool call inside its turn). The only difference from a `click` is that the executor, instead +of acting programmatically, **renders an overlay and blocks on a human**, then returns the human's +input in the `ResultFrame`. So the whole feature lives inside the existing +`agent → command → executor → result` pipeline and is exposed to **both** the OpenCode plugin and the +MCP server for free. + +``` +agent: "which list did you mean?" + → browser_request_feedback(group, mode:"element", prompt:"Click the list you meant") + → command{action:"request_feedback"} ──▶ extension paints overlay, waits for the human + ← human clicks an element + ← result{ ok, data:{ annotations:[{ kind:"element", ref:"e42", selector:"…", text:"Inbox" }] } } + → agent now has a concrete ref and proceeds +``` + +What is genuinely new is **two infra gaps** a blocking, human-paced command exposes — neither +specific to this feature, both worth fixing cleanly first (Phase 0). + +## The two infra prerequisites (Phase 0) + +### 1. A command-cancellation frame (the blocker) + +Today, when an agent's `AbortSignal` fires (turn cancelled) or its timeout elapses, the broker +**rejects the local pending promise and sends nothing to the executor** — `broker.ts:426-429`: + +```ts +if (signal) { + const onAbort = () => this.settleReject(reqId, new BrokerError("aborted", "aborted")); + signal.addEventListener("abort", onAbort, { once: true }); + detachAbort = () => signal.removeEventListener("abort", onAbort); +} +``` + +For a 50 ms `click` that's invisible. For a blocking overlay it means an **orphaned overlay stuck on +the user's page** after the agent has given up. So we add a broker→executor frame: + +```ts +/** Server → extension: abandon an in-flight command; tear down any UI for it. */ +export interface CancelFrame { + v: number; + type: "cancel"; + /** The command id being abandoned (matches a prior CommandFrame.id). */ + id: string; +} +``` + +The broker already records `executorId` in each `pending` entry, so on abort **or** timeout it can +send `cancel{id}` to that executor before clearing the pending entry. The executor's command-router +keeps a small `Map void>` of teardown handlers for in-flight interactive commands and runs +the matching one. Non-interactive commands ignore `cancel` (idempotent no-op). This is generally +useful beyond this feature (any future long-running tool benefits), which is why it's Phase 0 and not +folded into the tool. + +### 2. Per-command timeout override + +Timeout is a single global `BrokerOptions.timeoutMs` (`broker.ts:411-423`); a tool call can't ask for +longer. A "wait for human" tool needs minutes while a `click` must still fail fast. Fix: + +- `ToolSpec` gains optional `timeoutMs?: number` (catalog declares the tool's natural ceiling). +- The adapter (`tools.ts` / MCP `server.ts`) passes it through `send(...)`; the broker's + `sendToExecutor` uses `cmd.timeoutMs ?? this.opts.timeoutMs`, **clamped to a hard max** + (`maxCommandMs`, default ~10 min) so a tool can't hang the bridge forever. +- The interactive tool also accepts a user-facing `timeoutMs` arg (bounded by the same clamp) so the + agent can say "wait up to 2 min." + +These two together: the overlay either gets human input, self-times-out and returns a clean +"no response" result, or is torn down by `cancel` when the agent's turn ends. No orphans, no hangs. + +## The tool: `browser_request_feedback` + +One tool with a `mode`, not a zoo of tools (keeps the catalog and the model's choice simple). + +``` +browser_request_feedback(group, mode, prompt?, options?, timeoutMs?, target?, tabId?) +``` + +`mode` (enum): + +| mode | UI | returns per annotation | +| --------- | ------------------------------------ | ---------------------------------------------- | +| `confirm` | branded yes/no bar | `{ kind:"confirm", value:boolean }` | +| `choose` | pick from agent-supplied `options[]` | `{ kind:"choice", value:string }` | +| `point` | click one spot | `{ kind:"point", x, y, ref?, selector? }` | +| `region` | drag a box | `{ kind:"region", rect, refs[] }` | +| `element` | hover-highlight → click an element | `{ kind:"element", ref, selector, text }` | +| `comment` | any of the above + a free-text note | the above shape plus `text` | + +Result is a neutral `{ annotations: Annotation[]; screenshot? }` rendered via the existing +`NeutralResult` (`kind:"json"`, plus optionally `kind:"image"` — see below). The `Annotation` union +mirrors the table; every spatial annotation carries **both pixels and a resolved element `ref`**. + +**Why refs matter.** Coordinates alone aren't actionable — the agent can't reconnect a pixel to the +DOM. By resolving the picked spot/region/element against the existing `data-ocb-ref` snapshot +machinery (the same refs `browser_snapshot` already emits), the agent gets back `ref:"e42"` and can +immediately `browser_click ref:e42`. This is the crux that makes feedback *useful* rather than just +informative. + +**Optional marker-annotated screenshot.** Because `NeutralResult` already supports `kind:"image"` +(written to disk by the OpenCode adapter, inline image content for MCP), the tool can burn the user's +markers into a screenshot and return it — so the agent literally *sees* what was circled, not just +coordinates. + +### Group: new opt-in `interactive` group + +This changes UX semantics (the model can now block waiting on a human), so it should be **opt-in**, +like `debug` — not silently on. Add a fourth `ToolGroup` `"interactive"`, excluded from +`DEFAULT_GROUPS`. Operators enable it via the existing `groups` option. (Alternative: fold into +`control`; rejected because `control` is on by default and a tool that blocks on a human shouldn't be +implicitly available.) + +## Where the overlay lives + +**Primary: in-page overlay** — matches the vision (annotate the *real* UI in context). But it can +**not** reuse the one-shot `pageDispatch` (`page-actions.ts:32-53`): that function is serialized, +re-evaluated per call, and `await`ed to completion by `executeScript`. Blocking it for minutes is +fragile (page navigation tears down the execution context; the MV3 service worker can be killed). So: + +- Inject a **persistent overlay content script** (its own self-contained injected module) that owns + the overlay DOM and input capture. +- It talks back to the background worker over a **`chrome.runtime` port** (not a return value). The + command-router registers a teardown handler keyed by command id, and **resolves the command when + the port delivers the annotations** (or on `cancel`/timeout). +- The open port + the bridge WebSocket keep the MV3 worker alive for the wait. Document this as the + one place the worker must not be allowed to idle out mid-command. + +**Fallback: side-panel over a screenshot** — for hostile pages (CSP, shadow DOM, z-index wars) where +an injected overlay can't be trusted to render. Capture a screenshot, show it in the extension's own +chrome with annotation tools, map annotations back to page coords. The new daisyUI panel pattern + +[`guide-panel.tsx`](../apps/browser-extension/src/components/panels/guide-panel.tsx) (nord/aqua theme) +is the template — compose with it rather than hand-rolling. + +**Getting the user's attention.** The agent and the human may be in different windows/tabs, so the +extension must signal a feedback request: `chrome.notifications`, a badge, a sound, and focusing the +target tab. Without this the agent silently blocks until timeout. + +**"Is a human even there?"** The feature assumes a present human at that executor's browser — +meaningless in headless/CI routing. The tool must time out gracefully into a +`{ annotations: [], timedOut: true }` result the agent can reason about ("no human responded"), +never hang. + +## Protocol changes + +- New `CancelFrame` (`{ v, type:"cancel", id }`), broker→executor; add to the `Frame` union, to + `decodeFrame`, and mirror into the extension's `shared/protocol.ts` copy. +- New `BrowserAction` `"request_feedback"`. +- `CommandFrame` is unchanged on the wire; the per-command timeout is a broker-internal `send` + argument derived from the `ToolSpec`/args, **not** a wire field (executors don't need it — they + rely on `cancel`). +- Optional: a `feedback_*` `EventFrame` name for progress (e.g. "overlay shown") if we want the agent + to log that the human was prompted; not required for v1. + +## Component changes + +- **`protocol.ts`** (+ extension mirror): `CancelFrame`, `"request_feedback"` action. +- **`broker.ts`**: send `cancel` to the recorded `executorId` on abort/timeout before clearing + `pending`; thread a per-command timeout through `sendToExecutor` with a `maxCommandMs` clamp. +- **`catalog.ts`**: `interactive` group; `browser_request_feedback` spec (`input`, `params`, + `result`); `ToolSpec.timeoutMs?`; the `Annotation` neutral types. +- **`tools.ts` / MCP `server.ts`**: pass `timeoutMs` into `send`; render `{annotations, screenshot}` + (json + optional image) — both adapters already handle json/image, so this is wiring, not new + rendering. +- **Extension `command-router.ts`**: `request_feedback` case → open overlay port, register teardown, + resolve on annotations; handle inbound `cancel` frames via the teardown map. +- **Extension (new)**: persistent overlay content script (point/region/element/confirm/choose/comment + capture, ref resolution against `data-ocb-ref`); attention signals (notification/badge/focus); + side-panel fallback UI composing the daisyUI panels. + +## Security + +This is a **trust upgrade**, not a risk: a deliberate human gate in an otherwise autonomous loop. +Caveats to honor: + +- The overlay must be **unmistakably branded** as opencode-browser and **un-spoofable** by the page + (so a malicious page can't fake the prompt to harvest a click), and always dismissible. +- The `interactive` group is **opt-in** (off by default) so blocking-on-human is never a surprise. +- Trust model otherwise unchanged: loopback bind + shared token; `cancel` is a trusted broker frame. + +## Testing + +- **Broker (pure, DI):** on abort and on timeout, a `cancel{id}` is sent to the owning executor and + `pending` is cleared; per-command `timeoutMs` overrides the global and is clamped to `maxCommandMs`. +- **Protocol:** `decodeFrame` round-trips `CancelFrame`; unknown future frames still rejected. +- **Catalog:** `request_feedback` only appears when `interactive` is enabled; `result` shapes each + `mode` into the right `Annotation`; ref-resolution mapping. +- **Extension (where harnessable):** teardown handler runs on `cancel`; overlay self-times-out into a + `timedOut` result; port-delivered annotations resolve the command; worker stays alive across the + wait. + +## Phased rollout (each independently shippable & green) + +0. **Infra:** `CancelFrame` + abort/timeout teardown; per-command `timeoutMs` with `maxCommandMs` + clamp. No user-visible feature yet — pure protocol/broker, fully unit-testable. +1. **Minimal HITL:** `interactive` group + `browser_request_feedback` with `confirm` / `choose` / + `point`; in-page overlay (port-based); ref resolution; attention signal. Delivers the core + "click the thing you meant" scenario. +2. **Rich annotation:** `region` / `element` / `comment` modes; marker-annotated screenshot result; + side-panel-over-screenshot fallback for hostile pages. + +## Open decisions (need a call before Phase 1) + +- **Overlay-primary vs side-panel-primary.** Recommended: in-page overlay primary, side-panel as + hostile-page fallback (matches the "annotate the real UI" vision). Could flip to side-panel-primary + if injected-overlay reliability across arbitrary sites proves too painful. +- **One tool with `mode` vs several tools.** Recommended: one `browser_request_feedback` with a + `mode` enum (simpler catalog, simpler model choice). +- **New `interactive` group vs reuse `control`.** Recommended: new opt-in group (blocking-on-human is + a UX change that shouldn't be on by default). +- **`maxCommandMs` ceiling value.** Proposed ~10 min. Needs a number. + +## Status + +Design **draft** — infra prerequisites (Phase 0) are well-specified and low-risk; the tool surface +and overlay mechanism are specified with the open decisions above flagged. Recommend landing Phase 0 +as its own PR (protocol + broker + tests, no UI) so the `cancel`-frame change to the shared protocol +gets focused review, then Phase 1. From 7700f83d760482fa11f6b3f44d3fa8b3329bd8f5 Mon Sep 17 00:00:00 2001 From: Stephane Segning Lambou Date: Sun, 14 Jun 2026 17:52:15 +0200 Subject: [PATCH 2/9] =?UTF-8?q?feat(browser):=20Phase=200=20=E2=80=94=20ca?= =?UTF-8?q?ncel=20frame=20+=20per-command=20timeout?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Infra for human-in-the-loop tools that block on a user. Two gaps a long-running command exposes, fixed bridge-wide: 1. CancelFrame (broker→executor): on agent abort, per-command timeout, or the requesting agent disconnecting, the broker now tells the still-connected executor to tear down via a `cancel` frame before rejecting — so a blocking command (e.g. an open overlay) is never orphaned in the page. New `abandon()` helper; wired into the abort handler, timeout handler, and agent-disconnect path. 2. Per-command timeout: ToolSpec.timeoutMs threads through send() → CommandFrame.timeoutMs → broker, overriding the global default and clamped to maxCommandMs (default 10 min). Guest agents forward it on the wire and keep a local backstop timer at requested + grace so the broker (which can cancel the executor) always fires first. Extension: bridge-client handles inbound `cancel` → CommandRouter.cancel(id), which runs a teardown registered by cancellable commands (empty until Phase 1). Protocol mirror updated. Tests: broker cancel-on-abort / on-agent-disconnect / no-cancel-on-success, per-command override + maxCommandMs clamp; protocol cancel round-trip; agent-client timeoutMs forwarding + backstop grace. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../src/background/bridge-client.ts | 8 ++ .../src/background/command-router.ts | 26 ++++ .../src/entrypoints/background.ts | 1 + apps/browser-extension/src/shared/protocol.ts | 15 +++ packages/opencode-browser-mcp/src/server.ts | 2 +- packages/opencode-browser/src/agent-client.ts | 24 +++- packages/opencode-browser/src/broker.ts | 80 ++++++++++-- packages/opencode-browser/src/catalog.ts | 6 + packages/opencode-browser/src/endpoint.ts | 12 +- packages/opencode-browser/src/protocol.ts | 29 +++++ packages/opencode-browser/src/tools.ts | 2 +- .../test/agent-client.test.ts | 119 ++++++++++++++++++ packages/opencode-browser/test/broker.test.ts | 98 +++++++++++++++ .../opencode-browser/test/protocol.test.ts | 24 ++++ packages/opencode-browser/test/tools.test.ts | 4 +- 15 files changed, 426 insertions(+), 24 deletions(-) create mode 100644 packages/opencode-browser/test/agent-client.test.ts diff --git a/apps/browser-extension/src/background/bridge-client.ts b/apps/browser-extension/src/background/bridge-client.ts index 73aba15..a705931 100644 --- a/apps/browser-extension/src/background/bridge-client.ts +++ b/apps/browser-extension/src/background/bridge-client.ts @@ -30,6 +30,11 @@ export interface BridgeClientDeps { getConfig: () => Promise; /** Execute one command and resolve with its result data. */ onCommand: (frame: CommandFrame) => Promise; + /** + * The broker abandoned an in-flight command (agent abort / timeout / agent + * gone) — tear down any UI/work for that command id. No result is sent. + */ + onCancel?: (id: string) => void; /** Executor kind to publish in the status row, for the UI. */ executorKind: () => ExecutorKind; /** @@ -169,6 +174,9 @@ export class BridgeClient { case "release": await this.deps.onRelease?.(); return; + case "cancel": + this.deps.onCancel?.(frame.id); + return; case "ping": ws.send(encodeFrame({ v: PROTOCOL_VERSION, type: "pong" })); return; diff --git a/apps/browser-extension/src/background/command-router.ts b/apps/browser-extension/src/background/command-router.ts index c2a1810..c0976d1 100644 --- a/apps/browser-extension/src/background/command-router.ts +++ b/apps/browser-extension/src/background/command-router.ts @@ -19,11 +19,34 @@ function target(params: Record): Target { * (and screenshot) to IndexedDB for the dashboard — including failures. */ export class CommandRouter { + /** + * Teardown callbacks for in-flight cancellable commands, keyed by command id. + * Long-running interactive commands (e.g. a feedback overlay) register here so + * a broker `cancel` can abort them; ordinary commands never register. + */ + private readonly cancellers = new Map void>(); + constructor( private readonly registry: GroupRegistry, private readonly executor: Executor ) {} + /** Register a teardown for a cancellable command; returns a disposer. */ + registerCanceller(id: string, teardown: () => void): () => void { + this.cancellers.set(id, teardown); + return () => this.cancellers.delete(id); + } + + /** Broker abandoned command `id` — run and drop its teardown if present. */ + cancel(id: string): void { + const teardown = this.cancellers.get(id); + if (!teardown) { + return; + } + this.cancellers.delete(id); + teardown(); + } + async handle(frame: CommandFrame): Promise { const start = Date.now(); try { @@ -48,6 +71,9 @@ export class CommandRouter { durationMs: Date.now() - start }); throw err; + } finally { + // The command settled on its own; drop any teardown it registered. + this.cancellers.delete(frame.id); } } diff --git a/apps/browser-extension/src/entrypoints/background.ts b/apps/browser-extension/src/entrypoints/background.ts index 6970266..39be4a3 100644 --- a/apps/browser-extension/src/entrypoints/background.ts +++ b/apps/browser-extension/src/entrypoints/background.ts @@ -44,6 +44,7 @@ export default defineBackground(() => { }; }, onCommand: (frame) => router.handle(frame), + onCancel: (id) => router.cancel(id), executorKind: () => executorKind, // The plugin-side `executor` option (when set) wins over the dashboard // choice on each connect — rebuild the stack if it differs. diff --git a/apps/browser-extension/src/shared/protocol.ts b/apps/browser-extension/src/shared/protocol.ts index 52ecc72..0be154c 100644 --- a/apps/browser-extension/src/shared/protocol.ts +++ b/apps/browser-extension/src/shared/protocol.ts @@ -122,6 +122,8 @@ export interface CommandFrame { params: Record; /** Broker-only executor selector; ignored by executors. */ target?: string; + /** Broker-only per-command timeout (ms); ignored by executors. */ + timeoutMs?: number; } /** Extension → server: response to a `command`. */ @@ -151,6 +153,16 @@ export interface ReleaseFrame { type: "release"; } +/** + * Server → extension: abandon the in-flight command with this `id` (tear down + * any UI/work; no `result` expected). Unknown/completed id ⇒ harmless no-op. + */ +export interface CancelFrame { + v: number; + type: "cancel"; + id: string; +} + export interface PingFrame { v: number; type: "ping"; @@ -168,6 +180,7 @@ export type Frame = | ResultFrame | EventFrame | ReleaseFrame + | CancelFrame | PingFrame | PongFrame; @@ -208,6 +221,8 @@ export function decodeFrame(raw: string): Frame | null { return typeof parsed.name === "string" ? (parsed as unknown as EventFrame) : null; case "release": return { v: PROTOCOL_VERSION, type: "release" }; + case "cancel": + return typeof parsed.id === "string" ? (parsed as unknown as CancelFrame) : null; case "ping": return { v: PROTOCOL_VERSION, type: "ping" }; case "pong": diff --git a/packages/opencode-browser-mcp/src/server.ts b/packages/opencode-browser-mcp/src/server.ts index b12ddd4..d611c43 100644 --- a/packages/opencode-browser-mcp/src/server.ts +++ b/packages/opencode-browser-mcp/src/server.ts @@ -54,7 +54,7 @@ export function createMcpServer(send: SendFn, groups: ToolGroup[]): Server { const params = spec.params ? spec.params(args) : args; const group = typeof args.group === "string" ? args.group : ""; const target = typeof args.target === "string" ? args.target : undefined; - const data = await send(spec.action, group, params, undefined, target); + const data = await send(spec.action, group, params, undefined, target, spec.timeoutMs); const result: NeutralResult = spec.result ? spec.result(data, args) : { kind: "text", text: `${spec.name} ok` }; diff --git a/packages/opencode-browser/src/agent-client.ts b/packages/opencode-browser/src/agent-client.ts index 8dfc759..1ca2ffd 100644 --- a/packages/opencode-browser/src/agent-client.ts +++ b/packages/opencode-browser/src/agent-client.ts @@ -29,6 +29,15 @@ export interface AgentClientOptions { timeoutMs: number; } +/** + * Grace added to the guest's local per-command timer over the requested + * deadline. The host broker enforces the real timeout and can tell the executor + * to tear down (a `cancel` frame); the guest's timer is only a backstop for a + * silently unresponsive broker, so it must fire *after* the broker's, never + * before — otherwise the guest rejects locally while the executor keeps running. + */ +const AGENT_TIMEOUT_GRACE_MS = 5_000; + export interface AgentClientDeps { logger: Logger; createSocket: AgentSocketFactory; @@ -130,7 +139,8 @@ export class AgentClient implements AgentEndpoint { group: string, params: Record, signal?: AbortSignal, - target?: string + target?: string, + timeoutMs?: number ): Promise { if (signal?.aborted) { throw new AgentClientError("aborted", "aborted"); @@ -148,18 +158,24 @@ export class AgentClient implements AgentEndpoint { action, group, params, - target + target, + // Forward the override so the host broker applies (and clamps) it. + timeoutMs }; + // Backstop only — let the broker's deadline fire first (it can cancel the + // executor); fall back to the global timeout when no override is given. + const requested = timeoutMs !== undefined ? timeoutMs : this.opts.timeoutMs; + const localTimeout = requested > 0 ? requested + AGENT_TIMEOUT_GRACE_MS : 0; return new Promise((resolve, reject) => { const timer = - this.opts.timeoutMs > 0 + localTimeout > 0 ? setTimeout( () => this.settleReject( id, new AgentClientError(`command '${action}' timed out`, "timeout") ), - this.opts.timeoutMs + localTimeout ) : null; let detachAbort: (() => void) | null = null; diff --git a/packages/opencode-browser/src/broker.ts b/packages/opencode-browser/src/broker.ts index b7aa342..3957393 100644 --- a/packages/opencode-browser/src/broker.ts +++ b/packages/opencode-browser/src/broker.ts @@ -1,6 +1,7 @@ import type { Logger } from "./logging.js"; import { type BrowserAction, + cancelFrame, type CommandFrame, decodeFrame, encodeFrame, @@ -23,6 +24,12 @@ export class BrokerError extends Error { } } +/** + * Hard ceiling for any per-command timeout, so a tool requesting a long + * human-paced deadline can't pin a command open indefinitely. 10 minutes. + */ +export const DEFAULT_MAX_COMMAND_MS = 600_000; + /** What an agent (local or remote) uses to drive browsers through the broker. */ export interface AgentEndpoint { send( @@ -30,7 +37,9 @@ export interface AgentEndpoint { group: string, params: Record, signal?: AbortSignal, - target?: string + target?: string, + /** Per-command timeout override (ms), clamped to the broker's `maxCommandMs`. */ + timeoutMs?: number ): Promise; /** Release this agent's browsers (detach the debugger) without closing tabs. */ release(): void; @@ -42,8 +51,10 @@ export interface BrokerOptions { token: string; /** Executor preference forwarded to extensions in `ready`. */ executor?: "auto" | "cdp" | "content"; - /** Per-command timeout in ms; `<= 0` disables. */ + /** Default per-command timeout in ms; `<= 0` disables. */ timeoutMs: number; + /** Ceiling a per-command override can request (default `DEFAULT_MAX_COMMAND_MS`). */ + maxCommandMs?: number; } export interface BrokerDeps { @@ -141,8 +152,8 @@ export class Broker { createLocalAgent(): AgentEndpoint { const id = `local:${++this.agentSeq}`; return { - send: (action, group, params, signal, target) => - this.route(id, { action, group, params, target }, signal), + send: (action, group, params, signal, target, timeoutMs) => + this.route(id, { action, group, params, target, timeoutMs }, signal), release: () => this.releaseForAgent(id) }; } @@ -289,7 +300,9 @@ export class Broker { this.releaseForAgent(agent.id); for (const [reqId, p] of this.pending) { if (p.agentId === agent.id) { - this.settleReject(reqId, new BrokerError("agent disconnected", "agent_gone")); + // The executor may still be running this command — cancel it so any + // open UI (e.g. a feedback overlay) is torn down, not just orphaned. + this.abandon(reqId, new BrokerError("agent disconnected", "agent_gone")); } } this.deps.logger.info("browser_agent_disconnected", { id: agent.id }); @@ -299,7 +312,13 @@ export class Broker { // ─── routing ─────────────────────────────────────────────────────────────── private route( agentId: string, - cmd: { action: BrowserAction; group: string; params: Record; target?: string }, + cmd: { + action: BrowserAction; + group: string; + params: Record; + target?: string; + timeoutMs?: number; + }, signal?: AbortSignal ): Promise { if (cmd.action === "targets") { @@ -389,10 +408,25 @@ export class Broker { throw new BrokerError("no browser extension is connected", "not_connected"); } + /** Effective per-command timeout: the override (or global), clamped to the ceiling. */ + private timeoutFor(override?: number): number { + const requested = override !== undefined ? override : this.opts.timeoutMs; + if (requested <= 0) { + return 0; + } + const ceiling = this.opts.maxCommandMs ?? DEFAULT_MAX_COMMAND_MS; + return Math.min(requested, ceiling); + } + private sendToExecutor( executor: ExecutorInfo, agentId: string, - cmd: { action: BrowserAction; group: string; params: Record }, + cmd: { + action: BrowserAction; + group: string; + params: Record; + timeoutMs?: number; + }, signal?: AbortSignal ): Promise { if (signal?.aborted) { @@ -407,24 +441,25 @@ export class Broker { group: cmd.group, params: cmd.params }; + const timeoutMs = this.timeoutFor(cmd.timeoutMs); return new Promise((resolve, reject) => { const timer = - this.opts.timeoutMs > 0 + timeoutMs > 0 ? setTimeout( () => - this.settleReject( + this.abandon( reqId, new BrokerError( - `command '${cmd.action}' timed out after ${this.opts.timeoutMs}ms`, + `command '${cmd.action}' timed out after ${timeoutMs}ms`, "timeout" ) ), - this.opts.timeoutMs + timeoutMs ) : null; let detachAbort: (() => void) | null = null; if (signal) { - const onAbort = () => this.settleReject(reqId, new BrokerError("aborted", "aborted")); + const onAbort = () => this.abandon(reqId, new BrokerError("aborted", "aborted")); signal.addEventListener("abort", onAbort, { once: true }); detachAbort = () => signal.removeEventListener("abort", onAbort); } @@ -473,7 +508,8 @@ export class Broker { action: frame.action, group: frame.group, params: frame.params, - target: frame.target + target: frame.target, + timeoutMs: frame.timeoutMs }); this.safeSend(conn, resultFrame(frame.id, data)); } catch (err) { @@ -576,6 +612,24 @@ export class Broker { p.reject(err); } + /** + * Give up on a command the executor may still be running (abort, timeout, or + * the requesting agent vanishing): tell the still-connected executor to tear + * down via a `cancel` frame, then reject the pending promise. For an executor + * that's already gone the cancel is skipped — there's nothing to tear down. + */ + private abandon(reqId: string, err: Error): void { + const p = this.pending.get(reqId); + if (!p) { + return; + } + const exec = this.executorById.get(p.executorId); + if (exec) { + this.safeSend(exec.conn, cancelFrame(reqId)); + } + this.settleReject(reqId, err); + } + private rejectAllPending(err: Error): void { for (const [reqId, p] of this.pending) { if (p.timer) { diff --git a/packages/opencode-browser/src/catalog.ts b/packages/opencode-browser/src/catalog.ts index 2c00081..4ea660e 100644 --- a/packages/opencode-browser/src/catalog.ts +++ b/packages/opencode-browser/src/catalog.ts @@ -36,6 +36,12 @@ export interface ToolSpec { params?: (args: Record) => Record; /** Shape the extension's reply into a neutral result (default: a short ack). */ result?: (data: unknown, args: Record) => NeutralResult; + /** + * Per-command timeout (ms) for this tool, overriding the bridge's global + * default. Used by long-running, human-paced tools (e.g. feedback prompts). + * Clamped broker-side to `maxCommandMs`. Omit for the normal fast-fail default. + */ + timeoutMs?: number; } // ─── reusable fields ───────────────────────────────────────────────────────── diff --git a/packages/opencode-browser/src/endpoint.ts b/packages/opencode-browser/src/endpoint.ts index b342346..1ab1503 100644 --- a/packages/opencode-browser/src/endpoint.ts +++ b/packages/opencode-browser/src/endpoint.ts @@ -12,6 +12,8 @@ export interface EndpointOptions { token: string; executor?: "auto" | "cdp" | "content"; timeoutMs: number; + /** Ceiling for a per-command timeout override (host mode). */ + maxCommandMs?: number; /** Descriptor sent in the agent hello (guest mode). */ label?: string; /** Delay before retrying election after a drop. */ @@ -41,7 +43,8 @@ export interface Endpoint { group: string, params: Record, signal?: AbortSignal, - target?: string + target?: string, + timeoutMs?: number ): Promise; release(): void; shutdown(): void; @@ -99,7 +102,8 @@ export async function createEndpoint(opts: EndpointOptions, deps: EndpointDeps): port: opts.port, token: opts.token, executor: opts.executor, - timeoutMs: opts.timeoutMs + timeoutMs: opts.timeoutMs, + maxCommandMs: opts.maxCommandMs }, { logger: deps.logger, transport } ); @@ -160,9 +164,9 @@ export async function createEndpoint(opts: EndpointOptions, deps: EndpointDeps): await elect(); return { - send: (action, group, params, signal, target) => + send: (action, group, params, signal, target, timeoutMs) => current - ? current.send(action, group, params, signal, target) + ? current.send(action, group, params, signal, target, timeoutMs) : Promise.reject(new Error("bridge is re-electing — retry shortly")), release: () => current?.release(), shutdown: () => { diff --git a/packages/opencode-browser/src/protocol.ts b/packages/opencode-browser/src/protocol.ts index 5f599f5..6a71f97 100644 --- a/packages/opencode-browser/src/protocol.ts +++ b/packages/opencode-browser/src/protocol.ts @@ -142,6 +142,13 @@ export interface CommandFrame { * command creates a new group; ignored by executors. */ target?: string; + /** + * Optional per-command timeout in ms. Broker-only: a guest agent puts it here + * so the host broker honors a longer (or shorter) deadline than its global + * `timeoutMs` for this one command (clamped to the broker's `maxCommandMs`). + * Executors ignore it — they rely on a `cancel` frame for early teardown. + */ + timeoutMs?: number; } /** Extension → server: response to a `command`. */ @@ -178,6 +185,21 @@ export interface ReleaseFrame { type: "release"; } +/** + * Server → extension: abandon an in-flight command. The `id` matches a prior + * `command`; the executor should tear down any UI/work for it (e.g. an open + * feedback overlay) and need not send a `result`. Sent when the broker gives up + * on a command early — agent abort, per-command timeout, or the requesting + * agent disconnecting — so a blocking command never orphans state in the page. + * A `cancel` for an unknown/completed id is a harmless no-op. + */ +export interface CancelFrame { + v: number; + type: "cancel"; + /** The id of the command being abandoned. */ + id: string; +} + export interface PingFrame { v: number; type: "ping"; @@ -195,6 +217,7 @@ export type Frame = | ResultFrame | EventFrame | ReleaseFrame + | CancelFrame | PingFrame | PongFrame; @@ -243,6 +266,8 @@ export function decodeFrame(raw: string): Frame | null { return typeof parsed.name === "string" ? (parsed as unknown as EventFrame) : null; case "release": return { v: PROTOCOL_VERSION, type: "release" }; + case "cancel": + return typeof parsed.id === "string" ? (parsed as unknown as CancelFrame) : null; case "ping": return { v: PROTOCOL_VERSION, type: "ping" }; case "pong": @@ -278,3 +303,7 @@ export function resultFrame(id: string, data: unknown): ResultFrame { export function errorFrame(id: string, message: string, code?: string): ResultFrame { return { v: PROTOCOL_VERSION, type: "result", id, ok: false, error: { message, code } }; } + +export function cancelFrame(id: string): CancelFrame { + return { v: PROTOCOL_VERSION, type: "cancel", id }; +} diff --git a/packages/opencode-browser/src/tools.ts b/packages/opencode-browser/src/tools.ts index c103954..5318cbb 100644 --- a/packages/opencode-browser/src/tools.ts +++ b/packages/opencode-browser/src/tools.ts @@ -172,7 +172,7 @@ export function createBrowserTools(deps: ToolDeps): Record { + handlers = h; + return { send: (d) => sent.push(d), close: () => {} }; + }; + return { + sent, + factory, + handlers: () => { + if (!handlers) { + throw new Error("socket not created"); + } + return handlers; + } + }; +} + +function lastCommand(sent: string[]): CommandFrame { + for (let i = sent.length - 1; i >= 0; i--) { + const f = decodeFrame(sent[i]); + if (f?.type === "command") { + return f; + } + } + throw new Error("no command frame"); +} + +/** Build a connected AgentClient over a fake socket. */ +async function connected() { + const fs = fakeSocket(); + const client = new AgentClient( + { url: "ws://x", token: "t", timeoutMs: 1000 }, + { logger: noopLogger, createSocket: fs.factory } + ); + const p = client.connect(); + fs.handlers().onOpen(); + fs.handlers().onMessage( + encodeFrame({ + v: PROTOCOL_VERSION, + type: "ready", + server: "opencode-browser", + protocol: PROTOCOL_VERSION, + role: "agent", + clientId: "a1" + }) + ); + await p; + return { client, fs }; +} + +describe("AgentClient per-command timeout (Phase 0)", () => { + it("forwards a per-command timeoutMs on the command frame for the broker to honor", async () => { + const { client, fs } = await connected(); + void client.send("snapshot", "g", {}, undefined, undefined, 120_000).catch(() => {}); + await Promise.resolve(); + await Promise.resolve(); + expect(lastCommand(fs.sent)).toMatchObject({ action: "snapshot", timeoutMs: 120_000 }); + }); + + it("leaves timeoutMs undefined when no override is given", async () => { + const { client, fs } = await connected(); + void client.send("snapshot", "g", {}).catch(() => {}); + await Promise.resolve(); + await Promise.resolve(); + expect(lastCommand(fs.sent).timeoutMs).toBeUndefined(); + }); +}); + +describe("AgentClient local backstop timer (Phase 0)", () => { + beforeEach(() => vi.useFakeTimers()); + afterEach(() => vi.useRealTimers()); + + it("fires the local backstop only after the requested deadline plus grace", async () => { + const fs = fakeSocket(); + const client = new AgentClient( + { url: "ws://x", token: "t", timeoutMs: 1000 }, + { logger: noopLogger, createSocket: fs.factory } + ); + const cp = client.connect(); + fs.handlers().onOpen(); + fs.handlers().onMessage( + encodeFrame({ + v: PROTOCOL_VERSION, + type: "ready", + server: "opencode-browser", + protocol: PROTOCOL_VERSION, + role: "agent", + clientId: "a1" + }) + ); + await cp; + + let settled = false; + void client.send("snapshot", "g", {}, undefined, undefined, 5000).catch(() => { + settled = true; + }); + // Past the requested 5s but before requested + 5s grace (10s) — broker + // should win, so the local backstop must still be pending. + await vi.advanceTimersByTimeAsync(9000); + expect(settled).toBe(false); + // Past requested + grace — the backstop fires. + await vi.advanceTimersByTimeAsync(1500); + expect(settled).toBe(true); + }); +}); diff --git a/packages/opencode-browser/test/broker.test.ts b/packages/opencode-browser/test/broker.test.ts index 9d85e95..358d384 100644 --- a/packages/opencode-browser/test/broker.test.ts +++ b/packages/opencode-browser/test/broker.test.ts @@ -320,3 +320,101 @@ describe("Broker timeout", () => { await expectation; }); }); + +/** Does `sent` contain a `cancel` frame for command `id`? */ +function hasCancel(sent: string[], id: string): boolean { + return sent.some((s) => { + const f = decodeFrame(s); + return f?.type === "cancel" && f.id === id; + }); +} + +describe("Broker cancellation (Phase 0)", () => { + it("sends a cancel frame to the executor when a command is aborted", async () => { + const { broker, h } = await setup(); + const exec = connectExecutor(h, { id: "e1" }); + const agent = broker.createLocalAgent(); + const ac = new AbortController(); + const open = agent.send("open", "g", {}, ac.signal); + const cmd = lastCommand(exec.sent); + ac.abort(); + await expect(open).rejects.toThrow(/aborted/); + expect(hasCancel(exec.sent, cmd.id)).toBe(true); + }); + + it("cancels in-flight commands on the executor when the agent disconnects", async () => { + const { h } = await setup(); + const exec = connectExecutor(h, { id: "e1" }); + const agent = connectAgent(h); + // Remote agent issues an open; the broker forwards it to the executor. + h().onMessage( + agent.conn, + encodeFrame({ + v: PROTOCOL_VERSION, + type: "command", + id: "a1", + action: "open", + group: "g", + params: {} + }) + ); + const cmd = lastCommand(exec.sent); + h().onClose(agent.conn); + expect(hasCancel(exec.sent, cmd.id)).toBe(true); + }); + + it("does not emit a cancel when a command completes normally", async () => { + const { broker, h } = await setup(); + const exec = connectExecutor(h, { id: "e1" }); + const agent = broker.createLocalAgent(); + const open = agent.send("open", "g", {}); + const cmd = replyOk(h, exec, { url: "u" }); + await open; + expect(hasCancel(exec.sent, cmd.id)).toBe(false); + }); +}); + +describe("Broker per-command timeout (Phase 0)", () => { + beforeEach(() => vi.useFakeTimers()); + afterEach(() => vi.useRealTimers()); + + it("honors a per-command override longer than the global timeout", async () => { + const { broker, h } = await setup(); // global timeoutMs: 1000 + connectExecutor(h, { id: "e1" }); + const agent = broker.createLocalAgent(); + let settled = false; + const open = agent.send("open", "g", {}, undefined, undefined, 5000).catch(() => { + settled = true; + }); + // Past the global deadline but inside the override — still pending. + await vi.advanceTimersByTimeAsync(1500); + expect(settled).toBe(false); + // Past the override — now it times out. + await vi.advanceTimersByTimeAsync(4000); + await open; + expect(settled).toBe(true); + }); + + it("clamps a per-command override to maxCommandMs", async () => { + const transport = new FakeTransport(); + const broker = new Broker( + { host: "127.0.0.1", port: 4517, token: "secret", timeoutMs: 1000, maxCommandMs: 2000 }, + { logger: noopLogger, transport } + ); + await broker.start(); + const h = () => { + if (!transport.handlers) { + throw new Error("no handlers"); + } + return transport.handlers; + }; + const exec = connectExecutor(h, { id: "e1" }); + const agent = broker.createLocalAgent(); + const open = agent.send("open", "g", {}, undefined, undefined, 999_999); + const cmd = lastCommand(exec.sent); + const expectation = expect(open).rejects.toThrow(/timed out after 2000ms/); + await vi.advanceTimersByTimeAsync(2001); + await expectation; + expect(hasCancel(exec.sent, cmd.id)).toBe(true); + }); +}); diff --git a/packages/opencode-browser/test/protocol.test.ts b/packages/opencode-browser/test/protocol.test.ts index a4b7282..b64fefc 100644 --- a/packages/opencode-browser/test/protocol.test.ts +++ b/packages/opencode-browser/test/protocol.test.ts @@ -1,6 +1,7 @@ import { describe, expect, it } from "vitest"; import { + cancelFrame, type CommandFrame, decodeFrame, encodeFrame, @@ -67,6 +68,29 @@ describe("protocol encode/decode", () => { type: "release" }); }); + + it("round-trips a cancel frame", () => { + const frame = cancelFrame("c7"); + expect(frame).toEqual({ v: PROTOCOL_VERSION, type: "cancel", id: "c7" }); + expect(decodeFrame(encodeFrame(frame))).toEqual(frame); + }); + + it("rejects a cancel frame without an id", () => { + expect(decodeFrame(JSON.stringify({ v: 1, type: "cancel" }))).toBeNull(); + }); + + it("preserves a per-command timeoutMs on a command frame", () => { + const frame: CommandFrame = { + v: PROTOCOL_VERSION, + type: "command", + id: "c2", + action: "snapshot", + group: "g", + params: {}, + timeoutMs: 120_000 + }; + expect(decodeFrame(encodeFrame(frame))).toEqual(frame); + }); }); describe("nextId", () => { diff --git a/packages/opencode-browser/test/tools.test.ts b/packages/opencode-browser/test/tools.test.ts index 62316a3..17f8f9f 100644 --- a/packages/opencode-browser/test/tools.test.ts +++ b/packages/opencode-browser/test/tools.test.ts @@ -113,7 +113,8 @@ describe("tool → bridge action mapping", () => { "research", expect.objectContaining({ url: "https://example.com" }), c.abort, - "work-chrome" + "work-chrome", + undefined ); expect(typeof res === "object" ? res.output : res).toContain("research"); }); @@ -127,6 +128,7 @@ describe("tool → bridge action mapping", () => { "g", expect.objectContaining({ ref: "e7" }), expect.anything(), + undefined, undefined ); }); From cb77c8e54f074403649156814feda3ec6428b464 Mon Sep 17 00:00:00 2001 From: Stephane Segning Lambou Date: Sun, 14 Jun 2026 18:01:58 +0200 Subject: [PATCH 3/9] =?UTF-8?q?feat(browser):=20Phase=201=20=E2=80=94=20in?= =?UTF-8?q?teractive=20feedback=20(confirm/choose/point)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the opt-in `interactive` tool group and `browser_request_feedback`: the agent asks the human at the browser to respond on the page and blocks until they do (or it times out). Plugin/catalog: - new ToolGroup "interactive" (opt-in, like debug); request_feedback action. - browser_request_feedback spec with mode confirm|choose|point, prompt, options, user timeoutMs (default 120s, clamped 290s). timeoutMs: 300s so the broker deadline backstops the overlay's own countdown. - Annotation / FeedbackResult neutral types; result summarizes the response (confirm → "confirmed", choose → chosen value, point → resolved element ref). Extension: - feedback-overlay.ts: self-contained injected overlay (branded, dismissible, Esc-to-skip). point mode resolves the clicked element to its data-ocb-ref (+ CSS selector + coords). Teardown is a second tiny injection. - feedback.ts: background orchestrator — owns the timeout, correlates the page→bg result message, raises the toolbar badge + focuses the tab, and exposes cancel() for the broker's cancel frame (wired via CommandRouter). - command-router request_feedback case registers the canceller before awaiting. Docs: interactive group + tool reference in browser.md; group/count updates in CLAUDE.md. Tests: gating, action+timeout forwarding, user-timeout clamp, result summaries (confirm/timeout/point-ref). Co-Authored-By: Claude Opus 4.8 (1M context) --- CLAUDE.md | 2 +- .../src/background/command-router.ts | 21 ++ .../src/background/feedback-overlay.ts | 280 ++++++++++++++++++ .../src/background/feedback.ts | 121 ++++++++ apps/browser-extension/src/shared/protocol.ts | 6 +- docs/browser.md | 33 ++- packages/opencode-browser/src/catalog.ts | 108 ++++++- packages/opencode-browser/src/lib.ts | 2 + packages/opencode-browser/src/protocol.ts | 6 +- packages/opencode-browser/src/schema.ts | 2 +- packages/opencode-browser/src/types.ts | 7 +- packages/opencode-browser/test/tools.test.ts | 104 ++++++- 12 files changed, 675 insertions(+), 17 deletions(-) create mode 100644 apps/browser-extension/src/background/feedback-overlay.ts create mode 100644 apps/browser-extension/src/background/feedback.ts diff --git a/CLAUDE.md b/CLAUDE.md index 311d618..f74793d 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -11,7 +11,7 @@ A **pnpm workspace** of OpenCode plugins under the `@vymalo` npm scope. There ar | `packages/opencode-oauth2` → `@vymalo/opencode-oauth2` | OAuth2 / OIDC auth + dynamic model discovery for OpenAI-compatible providers. The mature plugin; five auth flows (`authorization_code`, `device_code`, `client_credentials`, `jwt_bearer`, `token_exchange`), persistent token cache, periodic sync scheduler. PKCE is on by default for the two interactive flows (`pkce: false` opts out per server). | | `packages/opencode-models-info` → `@vymalo/opencode-models-info` | **Auth-agnostic** metadata enrichment plugin: fetches OpenRouter-shaped `/models` JSON and merges `limit` / `cost` / `modalities` / capability flags onto existing provider model entries. Runs as a `Hooks.config` hook *after* other plugins. | | `packages/opencode-ratelimit` → `@vymalo/opencode-ratelimit` | **Auth-agnostic** rate-limit awareness plugin: in its `Hooks.config` hook it injects a custom `fetch` onto opted-in providers (`options.meta.rateLimit`) that reads Envoy Gateway / IETF draft-03 rate-limit headers (`x-ratelimit-limit/remaining/reset`), proactively throttles when `remaining` hits 0, and backs off + retries on `429`. Supports `tiers` (reset-magnitude policy bands with `wait`/`error` actions, so a 60s burst waits but a multi-day budget reset errors fast) and `scope: "model"\|"provider"` (per-model cooldown buckets for per-model gateway limits). The only response-observing plugin — OpenCode has no post-response hook, so wrapping `options.fetch` is the sole interception point. In-memory state only (no `cache.ts`). See [`docs/ratelimit.md`](docs/ratelimit.md). | -| `packages/opencode-browser` → `@vymalo/opencode-browser` | **Auth-agnostic** browser-automation plugin: registers `browser_*` **tools** (`Hooks.tool`) the model calls (open, click, type, scroll, screenshot, snapshot, …) and hosts a localhost WebSocket **bridge** (via the Node `ws` package, so it runs under both Bun and Node) that the companion extension dials. **33 tools** in three groups (`page`/`control`/`debug`, gated by the `groups` option); tabs are organized into **named groups**. The single source of truth for the tool surface is `catalog.ts` (shared with the MCP server). The bridge is an **auto-elect broker** (`broker.ts`) routing between **agents** (plugin/MCP/sessions) and **executors** (extensions) by named-group ownership — so multiple browsers and multiple agents can share one bridge. The only tool-registering plugin. Screenshots are written to disk (tool output is text-only). See [`docs/browser.md`](docs/browser.md) and [`plans/multi-client-routing.md`](plans/multi-client-routing.md). | +| `packages/opencode-browser` → `@vymalo/opencode-browser` | **Auth-agnostic** browser-automation plugin: registers `browser_*` **tools** (`Hooks.tool`) the model calls (open, click, type, scroll, screenshot, snapshot, …) and hosts a localhost WebSocket **bridge** (via the Node `ws` package, so it runs under both Bun and Node) that the companion extension dials. **34 tools** in four groups (`page`/`control`/`debug`/`interactive`, gated by the `groups` option; `debug` and `interactive` are opt-in); tabs are organized into **named groups**. The single source of truth for the tool surface is `catalog.ts` (shared with the MCP server). The bridge is an **auto-elect broker** (`broker.ts`) routing between **agents** (plugin/MCP/sessions) and **executors** (extensions) by named-group ownership — so multiple browsers and multiple agents can share one bridge. The only tool-registering plugin. Screenshots are written to disk (tool output is text-only). The `interactive` group adds **human-in-the-loop** feedback (`browser_request_feedback`): a blocking, branded in-page overlay (point/confirm/choose) that the broker can tear down via a `cancel` frame on abort/timeout — see [`plans/ui-feedback.md`](plans/ui-feedback.md). See [`docs/browser.md`](docs/browser.md) and [`plans/multi-client-routing.md`](plans/multi-client-routing.md). | | `packages/opencode-browser-mcp` → `@vymalo/opencode-browser-mcp` | **MCP stdio server** (a `bin`) that hosts the same bridge (Node `ws` transport) and exposes the same group-filtered `browser_*` catalog over the Model Context Protocol — so non-OpenCode agents (Claude Code, Cursor, Cline, …) can drive the extension. Reuses `@vymalo/opencode-browser`'s catalog + JSON-Schema via `./lib`; returns screenshots as inline MCP image content. | | `apps/browser-extension` → `@vymalo/opencode-browser-extension` (private) | Companion Chromium MV3 + Firefox extension for the browser plugin/MCP server. WXT + React + Tailwind + shadcn-style UI + TanStack Query + Dexie/IndexedDB. Its background worker connects out to the bridge and drives tabs via CDP (`chrome.debugger`) or a content-script fallback. **Not** published to npm. | | `packages/plugin-bundle` → `@vymalo/opencode-oauth2-bundle` (private) | Rolldown build that ships a single-file distribution of the oauth2 plugin. | diff --git a/apps/browser-extension/src/background/command-router.ts b/apps/browser-extension/src/background/command-router.ts index c0976d1..aa38783 100644 --- a/apps/browser-extension/src/background/command-router.ts +++ b/apps/browser-extension/src/background/command-router.ts @@ -1,6 +1,8 @@ import { recordAction, recordScreenshot } from "../shared/db"; import type { CommandFrame } from "../shared/protocol"; import type { Executor, Viewport } from "./executor"; +import { startFeedback } from "./feedback"; +import type { FeedbackMode, FeedbackRequest } from "./feedback-overlay"; import type { GroupRegistry } from "./group-registry"; import { runPageAction, type Target } from "./page-actions"; @@ -317,6 +319,25 @@ export class CommandRouter { await this.executor.releaseAll(); return { data: { ok: true }, summary: "released control" }; } + case "request_feedback": { + const tabId = this.tab(group, params); + const req: FeedbackRequest = { + mode: (params.mode as FeedbackMode) ?? "confirm", + prompt: params.prompt as string | undefined, + options: Array.isArray(params.options) ? (params.options as string[]) : undefined, + timeoutMs: typeof params.timeoutMs === "number" ? params.timeoutMs : 120_000 + }; + const handle = startFeedback(tabId, frame.id, req); + // Register before awaiting so a broker `cancel` mid-wait can tear it down. + this.registerCanceller(frame.id, handle.cancel); + const result = await handle.result; + const summary = result.timedOut + ? "feedback timed out" + : result.responded + ? `feedback: ${result.annotations.map((a) => a.kind).join(",") || "none"}` + : "feedback dismissed"; + return { data: result, summary }; + } default: throw new Error(`unknown action: ${action}`); } diff --git a/apps/browser-extension/src/background/feedback-overlay.ts b/apps/browser-extension/src/background/feedback-overlay.ts new file mode 100644 index 0000000..4118ff6 --- /dev/null +++ b/apps/browser-extension/src/background/feedback-overlay.ts @@ -0,0 +1,280 @@ +/** + * In-page feedback overlay for `interactive` requests. Like `page-actions`, the + * overlay function is injected via `chrome.scripting.executeScript({ func })`, + * so it is serialized and re-evaluated in the page with **no module scope** — + * everything it needs lives inside the single self-contained `feedbackOverlay`. + * + * It does NOT block: it paints the overlay, wires listeners, and returns. The + * user's response (or a dismissal) comes back to the background worker via + * `chrome.runtime.sendMessage` (the background owns the timeout + correlation). + * Teardown is a second tiny injection that removes the overlay element, so the + * background can cancel a request without a page-side message channel. + */ + +export type FeedbackMode = "confirm" | "choose" | "point"; + +export interface FeedbackRequest { + mode: FeedbackMode; + prompt?: string; + options?: string[]; + /** Background-owned deadline (ms); the overlay itself does not self-expire. */ + timeoutMs: number; +} + +/** One mark the user made; mirrors the plugin's `Annotation` wire shape. */ +export type FeedbackAnnotation = + | { kind: "confirm"; value: boolean } + | { kind: "choice"; value: string } + | { kind: "point"; x: number; y: number; ref?: string; selector?: string; text?: string }; + +/** Message the injected overlay posts back to the background worker. */ +export interface FeedbackMessage { + type: "ocb-feedback-result"; + id: string; + /** False when the user dismissed/skipped rather than answering. */ + responded: boolean; + annotations: FeedbackAnnotation[]; +} + +/** Paint the overlay in the page. Resolves once injected (does not wait for input). */ +export async function showFeedbackOverlay( + tabId: number, + id: string, + req: FeedbackRequest +): Promise { + await chrome.scripting.executeScript({ + target: { tabId }, + func: feedbackOverlay, + args: [id, req.mode, req.prompt ?? "", req.options ?? []] + }); +} + +/** Remove the overlay for `id` from the page (best-effort). */ +export async function hideFeedbackOverlay(tabId: number, id: string): Promise { + try { + await chrome.scripting.executeScript({ + target: { tabId }, + func: (overlayId: string) => { + document.getElementById(`ocb-feedback-${overlayId}`)?.remove(); + }, + args: [id] + }); + } catch { + /* tab navigated/closed — nothing to remove */ + } +} + +/** + * THE injected overlay. Self-contained (all helpers inside). Runs in the page's + * isolated content-script world, so `chrome.runtime.sendMessage` is available. + */ +function feedbackOverlay(id: string, mode: string, prompt: string, options: string[]): void { + const elementId = `ocb-feedback-${id}`; + document.getElementById(elementId)?.remove(); + + const send = (responded: boolean, annotations: unknown[]): void => { + chrome.runtime.sendMessage({ type: "ocb-feedback-result", id, responded, annotations }); + }; + + const ACCENT = "#3b82f6"; + const root = document.createElement("div"); + root.id = elementId; + root.style.cssText = [ + "position:fixed", + "inset:0", + "z-index:2147483647", + "font-family:ui-sans-serif,system-ui,-apple-system,Segoe UI,Roboto,sans-serif", + "color:#0f172a" + ].join(";"); + + // Brand bar (anti-spoof: clearly identifies the source of the prompt). + const bar = document.createElement("div"); + bar.style.cssText = [ + "position:absolute", + "top:0", + "left:0", + "right:0", + "display:flex", + "align-items:center", + "gap:8px", + "padding:8px 12px", + "background:#0f172a", + "color:#f8fafc", + "font-size:13px", + "box-shadow:0 1px 6px rgba(0,0,0,.25)" + ].join(";"); + const dot = document.createElement("span"); + dot.style.cssText = `width:8px;height:8px;border-radius:9999px;background:${ACCENT};flex:none`; + const brand = document.createElement("span"); + brand.style.cssText = "font-weight:600"; + brand.textContent = "opencode-browser"; + const msg = document.createElement("span"); + msg.style.cssText = "opacity:.85;overflow:hidden;text-overflow:ellipsis;white-space:nowrap"; + msg.textContent = prompt || "is asking for your input"; + bar.append(dot, brand, msg); + root.appendChild(bar); + + const skip = (): void => { + cleanup(); + send(false, []); + }; + + const finish = (annotations: unknown[]): void => { + cleanup(); + send(true, annotations); + }; + + const onKey = (e: KeyboardEvent): void => { + if (e.key === "Escape") { + e.preventDefault(); + skip(); + } + }; + function cleanup(): void { + document.removeEventListener("keydown", onKey, true); + root.remove(); + } + document.addEventListener("keydown", onKey, true); + + const btn = (label: string, primary: boolean): HTMLButtonElement => { + const b = document.createElement("button"); + b.type = "button"; + b.textContent = label; + b.style.cssText = [ + "padding:6px 14px", + "border-radius:8px", + "font-size:13px", + "font-weight:600", + "cursor:pointer", + `border:1px solid ${primary ? ACCENT : "#cbd5e1"}`, + `background:${primary ? ACCENT : "#fff"}`, + `color:${primary ? "#fff" : "#0f172a"}` + ].join(";"); + return b; + }; + + if (mode === "point") { + // Transparent capture layer below the bar; click picks a page element. + const capture = document.createElement("div"); + capture.style.cssText = [ + "position:absolute", + "inset:36px 0 0 0", + "cursor:crosshair", + "background:rgba(59,130,246,.06)" + ].join(";"); + const hint = document.createElement("div"); + hint.style.cssText = [ + "position:absolute", + "bottom:16px", + "left:50%", + "transform:translateX(-50%)", + "background:#0f172a", + "color:#f8fafc", + "padding:6px 12px", + "border-radius:9999px", + "font-size:12px" + ].join(";"); + hint.textContent = "Click the element you mean — Esc to skip"; + capture.appendChild(hint); + capture.addEventListener("click", (e: MouseEvent) => { + const x = e.clientX; + const y = e.clientY; + root.style.display = "none"; + const el = document.elementFromPoint(x, y) as HTMLElement | null; + root.style.display = ""; + const annotation: Record = { kind: "point", x, y }; + if (el) { + let node: HTMLElement | null = el; + while (node && !node.getAttribute("data-ocb-ref")) { + node = node.parentElement; + } + const ref = node?.getAttribute("data-ocb-ref") ?? undefined; + if (ref) { + annotation.ref = ref; + } + annotation.selector = cssPath(el); + const label = (el.textContent || "").trim().slice(0, 80); + if (label) { + annotation.text = label; + } + } + finish([annotation]); + }); + root.appendChild(capture); + } else { + const panel = document.createElement("div"); + panel.style.cssText = [ + "position:absolute", + "top:50%", + "left:50%", + "transform:translate(-50%,-50%)", + "min-width:280px", + "max-width:420px", + "background:#fff", + "border:1px solid #e2e8f0", + "border-radius:14px", + "box-shadow:0 12px 40px rgba(2,6,23,.28)", + "padding:18px" + ].join(";"); + const q = document.createElement("div"); + q.style.cssText = "font-size:14px;line-height:1.4;margin-bottom:14px"; + q.textContent = prompt || (mode === "confirm" ? "Confirm?" : "Choose one:"); + panel.appendChild(q); + + const actions = document.createElement("div"); + actions.style.cssText = "display:flex;flex-wrap:wrap;gap:8px;justify-content:flex-end"; + + if (mode === "choose") { + for (const opt of options.length ? options : ["OK"]) { + const b = btn(opt, false); + b.addEventListener("click", () => finish([{ kind: "choice", value: opt }])); + actions.appendChild(b); + } + const skipBtn = btn("Skip", false); + skipBtn.style.marginLeft = "auto"; + skipBtn.addEventListener("click", skip); + actions.appendChild(skipBtn); + } else { + const no = btn("No", false); + no.addEventListener("click", () => finish([{ kind: "confirm", value: false }])); + const yes = btn("Yes", true); + yes.addEventListener("click", () => finish([{ kind: "confirm", value: true }])); + actions.append(no, yes); + } + panel.appendChild(actions); + root.appendChild(panel); + } + + document.documentElement.appendChild(root); + + /** Minimal stable-ish CSS selector for an element (id → nth-of-type path). */ + function cssPath(el: HTMLElement): string { + if (el.id) { + return `#${CSS.escape(el.id)}`; + } + const parts: string[] = []; + let node: HTMLElement | null = el; + let depth = 0; + while (node && node.nodeType === 1 && depth < 4) { + const current: HTMLElement = node; + let part = current.tagName.toLowerCase(); + const parent: HTMLElement | null = current.parentElement; + if (parent) { + const siblings = Array.from(parent.children).filter( + (c): c is Element => c.tagName === current.tagName + ); + if (siblings.length > 1) { + part += `:nth-of-type(${siblings.indexOf(current) + 1})`; + } + } + parts.unshift(part); + if (current.id) { + parts[0] = `#${CSS.escape(current.id)}`; + break; + } + node = parent; + depth++; + } + return parts.join(" > "); + } +} diff --git a/apps/browser-extension/src/background/feedback.ts b/apps/browser-extension/src/background/feedback.ts new file mode 100644 index 0000000..5a95078 --- /dev/null +++ b/apps/browser-extension/src/background/feedback.ts @@ -0,0 +1,121 @@ +/** + * Background-side orchestration for `interactive` feedback requests. Owns the + * timeout, correlation, and attention signal; the page-side overlay (injected + * by `feedback-overlay`) reports the user's answer via `chrome.runtime.sendMessage`. + * + * Flow: paint overlay → flag attention (badge + focus the tab) → await either a + * result message, the timeout, or a `cancel()` (broker abandoned the command). + * Whatever ends it, the overlay is torn down and the attention flag cleared. + */ +import { + type FeedbackAnnotation, + type FeedbackMessage, + type FeedbackRequest, + hideFeedbackOverlay, + showFeedbackOverlay +} from "./feedback-overlay"; + +export interface FeedbackResult { + responded: boolean; + timedOut?: boolean; + annotations: FeedbackAnnotation[]; +} + +export interface FeedbackHandle { + /** Resolves when the user answers, the request times out, or it's cancelled. */ + result: Promise; + /** Broker abandoned the command — tear down the overlay and settle. */ + cancel: () => void; +} + +function isFeedbackMessage(msg: unknown, id: string): msg is FeedbackMessage { + return ( + typeof msg === "object" && + msg !== null && + (msg as { type?: unknown }).type === "ocb-feedback-result" && + (msg as { id?: unknown }).id === id + ); +} + +let attentionCount = 0; + +/** Raise the toolbar badge and bring the driven tab forward. */ +async function flagAttention(tabId: number): Promise { + attentionCount++; + try { + await chrome.action.setBadgeBackgroundColor({ color: "#3b82f6" }); + await chrome.action.setBadgeText({ text: "?" }); + } catch { + /* action API unavailable */ + } + try { + const tab = await chrome.tabs.get(tabId); + await chrome.tabs.update(tabId, { active: true }); + if (tab.windowId !== undefined) { + await chrome.windows.update(tab.windowId, { focused: true }); + } + } catch { + /* tab/window gone */ + } +} + +/** Clear the badge once no feedback requests are outstanding. */ +async function clearAttention(): Promise { + attentionCount = Math.max(0, attentionCount - 1); + if (attentionCount > 0) { + return; + } + try { + await chrome.action.setBadgeText({ text: "" }); + } catch { + /* action API unavailable */ + } +} + +/** + * Start a feedback request: paint the overlay, signal attention, and return a + * handle whose `result` settles exactly once. + */ +export function startFeedback(tabId: number, id: string, req: FeedbackRequest): FeedbackHandle { + let settle: (r: FeedbackResult) => void = () => {}; + const result = new Promise((resolve) => { + settle = resolve; + }); + + let done = false; + const timer = setTimeout( + () => finish({ responded: false, timedOut: true, annotations: [] }), + req.timeoutMs + ); + + const onMessage = (msg: unknown): void => { + if (isFeedbackMessage(msg, id)) { + finish({ responded: msg.responded, annotations: msg.annotations ?? [] }); + } + }; + chrome.runtime.onMessage.addListener(onMessage); + + function finish(r: FeedbackResult): void { + if (done) { + return; + } + done = true; + clearTimeout(timer); + chrome.runtime.onMessage.removeListener(onMessage); + void hideFeedbackOverlay(tabId, id); + void clearAttention(); + settle(r); + } + + void flagAttention(tabId); + // Injection failure (e.g. a restricted page) ends the request as "no response" + // rather than hanging until the timeout. + void showFeedbackOverlay(tabId, id, req).catch(() => { + finish({ responded: false, timedOut: true, annotations: [] }); + }); + + return { + result, + cancel: () => finish({ responded: false, annotations: [] }) + }; +} diff --git a/apps/browser-extension/src/shared/protocol.ts b/apps/browser-extension/src/shared/protocol.ts index 0be154c..293d0c2 100644 --- a/apps/browser-extension/src/shared/protocol.ts +++ b/apps/browser-extension/src/shared/protocol.ts @@ -45,7 +45,8 @@ export type BrowserAction = | "set_viewport" | "cookies" | "targets" - | "release"; + | "release" + | "request_feedback"; export const BROWSER_ACTIONS: readonly BrowserAction[] = [ "open", @@ -80,7 +81,8 @@ export const BROWSER_ACTIONS: readonly BrowserAction[] = [ "set_viewport", "cookies", "targets", - "release" + "release", + "request_feedback" ] as const; /** A connection's role on the bridge. Absent → "extension" (back-compat). */ diff --git a/docs/browser.md b/docs/browser.md index 5fd32f0..370fa33 100644 --- a/docs/browser.md +++ b/docs/browser.md @@ -64,8 +64,8 @@ Options (all optional): | `port` | `4517` | WebSocket bridge port. | | `token` | _generated_ | Shared secret the extension must present. If omitted, a random token is generated and **logged once** (`browser_bridge_token_generated`) — copy it into the extension. | | `executor` | `"auto"` | Forwarded executor preference: `auto` \| `cdp` \| `content`. | -| `groups` | `["page","control"]` | Which tool groups to register (`page` \| `control` \| `debug`). `debug` is opt-in. | -| `timeoutMs` | `30000` | Per-command timeout before the tool call rejects. | +| `groups` | `["page","control"]` | Which tool groups to register (`page` \| `control` \| `debug` \| `interactive`). `debug` and `interactive` are opt-in. | +| `timeoutMs` | `30000` | Default per-command timeout before the tool call rejects. A tool may request a longer per-command deadline (e.g. `browser_request_feedback`), capped at 10 min. | | `screenshotDir` | `".opencode/browser"` | Where screenshots are written (relative paths resolve against the session worktree). | On first run with no `token`, watch the log for: @@ -99,9 +99,9 @@ and dashboard show **Connected** once the handshake succeeds. `group` is the primary handle on every tool — it names the tab group the action targets, and the extension creates it on the first `browser_open`. -The 33 tools are organized into three **groups**, gated by the `groups` option (plugin) / -`OCB_GROUPS` (MCP). Default: `page` + `control` (`debug` is opt-in). The `browser_*` names are -stable, so OpenCode's per-agent tool allow/deny works on them directly too. +The 34 tools are organized into four **groups**, gated by the `groups` option (plugin) / +`OCB_GROUPS` (MCP). Default: `page` + `control` (`debug` and `interactive` are opt-in). The +`browser_*` names are stable, so OpenCode's per-agent tool allow/deny works on them directly too. **`page`** — observe (read-mostly): @@ -149,6 +149,29 @@ stable, so OpenCode's per-agent tool allow/deny works on them directly too. | `browser_set_viewport` | `group, width, height, mobile?, deviceScaleFactor?` | Emulate a viewport (CDP only). | | `browser_cookies` | `op, url?, name?, value?` | Read/modify cookies. | +**`interactive`** — ask the human on the page and block on them (**off by default**): + +| Tool | Key args | Result | +| --- | --- | --- | +| `browser_request_feedback` | `group, mode, prompt?, options?, timeoutMs?` | Paints a branded overlay and blocks until the user responds. | + +This is the one tool that **waits on a human** rather than acting autonomously — use it when a +screenshot or snapshot isn't enough to know what the user means ("which of these did you mean?"). +`mode`: + +- `confirm` — a yes/no bar; returns `{ kind: "confirm", value }`. +- `choose` — buttons for each `options[]` entry; returns `{ kind: "choice", value }`. +- `point` — the user clicks one spot; the reply resolves to the **element ref** under the click + (plus a CSS selector and pixel coords) so you can immediately `browser_click ref:…` it. + +The overlay is clearly branded as opencode-browser (so a page can't spoof the prompt), dismissible +(Esc / Skip), and raises the toolbar badge + focuses the tab to get the user's attention. On no +response it returns `{ responded: false, timedOut: true }` so the agent can fall back rather than +hang. The wait uses a long per-command timeout (default 120 s, max ~290 s, capped broker-side at +10 min); if the agent's turn is aborted or times out, the broker sends a `cancel` frame that tears +the overlay down — a blocking prompt never orphans state in the page. Meaningless in headless/CI +routing (no human at the browser), where it simply times out. + ### Scoping tools and token cost Two composable levers: **`groups` / `OCB_GROUPS`** (global — what's *registered*; the ceiling) and diff --git a/packages/opencode-browser/src/catalog.ts b/packages/opencode-browser/src/catalog.ts index 4ea660e..63cfef2 100644 --- a/packages/opencode-browser/src/catalog.ts +++ b/packages/opencode-browser/src/catalog.ts @@ -3,10 +3,35 @@ import type { Field, JsonInput, ToolGroup } from "./schema.js"; export type { ToolGroup } from "./schema.js"; -export const TOOL_GROUPS: readonly ToolGroup[] = ["page", "control", "debug"] as const; -/** Groups registered when the operator doesn't specify — `debug` is opt-in. */ +export const TOOL_GROUPS: readonly ToolGroup[] = [ + "page", + "control", + "debug", + "interactive" +] as const; +/** Groups registered when the operator doesn't specify — `debug` and `interactive` are opt-in. */ export const DEFAULT_GROUPS: readonly ToolGroup[] = ["page", "control"] as const; +/** + * Structured result of an `interactive` feedback request — what the user marked + * on the page. Spatial annotations resolve to a `data-ocb-ref` element ref where + * possible (so the agent can act with `browser_click ref:…`), plus pixels as a + * fallback. `comment` carries an optional free-text note attached to the mark. + */ +export type Annotation = + | { kind: "confirm"; value: boolean } + | { kind: "choice"; value: string } + | { kind: "point"; x: number; y: number; ref?: string; selector?: string; text?: string }; + +export interface FeedbackResult { + /** Whether a human responded before the timeout. */ + responded: boolean; + /** True when the request timed out with no response. */ + timedOut?: boolean; + /** The marks the user made (empty when `timedOut`). */ + annotations: Annotation[]; +} + /** * Adapter-neutral result of a tool call. The OpenCode adapter renders these to * its text-only `{ output, metadata }`; the MCP adapter renders text/json to a @@ -614,5 +639,84 @@ export const BROWSER_TOOLS: readonly ToolSpec[] = [ path: args.path }), result: (data) => json(rec(data), JSON.stringify(rec(data), null, 2)) + }, + { + name: "browser_request_feedback", + group: "interactive", + action: "request_feedback", + description: + "Ask the human at the browser to respond on the page, and block until they do (or it times out). Use when you're unsure what the user means and a screenshot or snapshot isn't enough — e.g. 'which of these did you mean?'. `mode`: confirm (yes/no bar), choose (pick one of `options`), point (click one spot on the page; the reply resolves to an element ref you can then click/snapshot). Returns the user's response; on timeout returns `{ timedOut: true }` so you can fall back. interactive group (opt-in).", + timeoutMs: 300_000, + input: { + group, + mode: { + type: "string", + enum: ["confirm", "choose", "point"], + description: "Kind of prompt: confirm | choose | point." + }, + prompt: { + type: "string", + optional: true, + description: "Question/instruction shown to the user above the controls." + }, + options: { + type: "array", + optional: true, + items: { type: "string" }, + description: "Choices for `choose` mode (ignored otherwise)." + }, + timeoutMs: { + type: "number", + optional: true, + description: "How long to wait for the user, in ms (default 120000, max 290000)." + }, + tabId + }, + params: (args) => { + const requested = typeof args.timeoutMs === "number" ? args.timeoutMs : 120_000; + return { + mode: args.mode, + prompt: args.prompt, + options: Array.isArray(args.options) ? args.options : undefined, + // Keep the overlay's own countdown below the broker deadline (300s) so it + // self-resolves into a clean `timedOut` result rather than being cancelled. + timeoutMs: Math.min(Math.max(requested, 1_000), 290_000), + tabId: args.tabId + }; + }, + result: (data) => { + const d = data as Partial; + if (!d || d.responded === false || d.timedOut) { + return json( + { responded: false, timedOut: true, annotations: [] } satisfies FeedbackResult, + "No response from the user within the time limit." + ); + } + const annotations = Array.isArray(d.annotations) ? d.annotations : []; + return json( + { responded: true, annotations } satisfies FeedbackResult, + summarizeAnnotations(annotations) + ); + } } ]; + +/** A short human-readable line describing what the user marked. */ +function summarizeAnnotations(annotations: Annotation[]): string { + if (annotations.length === 0) { + return "User responded with no marks."; + } + return annotations + .map((a) => { + if (a.kind === "confirm") { + return `User ${a.value ? "confirmed" : "declined"}.`; + } + if (a.kind === "choice") { + return `User chose "${a.value}".`; + } + const where = a.ref ? `ref ${a.ref}` : `(${Math.round(a.x)}, ${Math.round(a.y)})`; + const label = a.text ? ` — "${a.text}"` : ""; + return `User pointed at ${where}${label}.`; + }) + .join(" "); +} diff --git a/packages/opencode-browser/src/lib.ts b/packages/opencode-browser/src/lib.ts index d05d54d..d4da140 100644 --- a/packages/opencode-browser/src/lib.ts +++ b/packages/opencode-browser/src/lib.ts @@ -50,8 +50,10 @@ export { export { createBrowserTools, type SaveScreenshot, type SendFn, type ToolDeps } from "./tools.js"; export { + type Annotation, BROWSER_TOOLS, DEFAULT_GROUPS, + type FeedbackResult, type NeutralResult, TOOL_GROUPS, type ToolGroup, diff --git a/packages/opencode-browser/src/protocol.ts b/packages/opencode-browser/src/protocol.ts index 6a71f97..98c6103 100644 --- a/packages/opencode-browser/src/protocol.ts +++ b/packages/opencode-browser/src/protocol.ts @@ -53,7 +53,8 @@ export type BrowserAction = | "set_viewport" | "cookies" | "targets" - | "release"; + | "release" + | "request_feedback"; export const BROWSER_ACTIONS: readonly BrowserAction[] = [ "open", @@ -88,7 +89,8 @@ export const BROWSER_ACTIONS: readonly BrowserAction[] = [ "set_viewport", "cookies", "targets", - "release" + "release", + "request_feedback" ] as const; /** A connection's role on the bridge. Absent → "extension" (back-compat). */ diff --git a/packages/opencode-browser/src/schema.ts b/packages/opencode-browser/src/schema.ts index a242f4a..3224afd 100644 --- a/packages/opencode-browser/src/schema.ts +++ b/packages/opencode-browser/src/schema.ts @@ -10,7 +10,7 @@ * package boundary while still covering every tool's args. */ -export type ToolGroup = "page" | "control" | "debug"; +export type ToolGroup = "page" | "control" | "debug" | "interactive"; export interface StringField { type: "string"; diff --git a/packages/opencode-browser/src/types.ts b/packages/opencode-browser/src/types.ts index 27e9d1e..1ae163f 100644 --- a/packages/opencode-browser/src/types.ts +++ b/packages/opencode-browser/src/types.ts @@ -36,9 +36,10 @@ export interface BrowserPluginOptions { executor?: ExecutorMode; /** * Which tool groups to register: `page` (observe), `control` (drive), - * `debug` (powerful/sensitive). Defaults to `["page","control"]` — `debug` - * is opt-in. Per-agent control is also possible via OpenCode's tool - * allow/deny on the `browser_*` names. + * `debug` (powerful/sensitive), `interactive` (ask the human on the page and + * block on them). Defaults to `["page","control"]` — `debug` and + * `interactive` are opt-in. Per-agent control is also possible via OpenCode's + * tool allow/deny on the `browser_*` names. */ groups?: ToolGroup[]; /** Per-command timeout in ms before the tool call rejects. */ diff --git a/packages/opencode-browser/test/tools.test.ts b/packages/opencode-browser/test/tools.test.ts index 17f8f9f..84c883d 100644 --- a/packages/opencode-browser/test/tools.test.ts +++ b/packages/opencode-browser/test/tools.test.ts @@ -19,7 +19,7 @@ function baseOptions(overrides: Partial = {}): ResolvedB port: 4517, token: "secret", executor: "auto", - groups: ["page", "control", "debug"], + groups: ["page", "control", "debug", "interactive"], timeoutMs: 30_000, screenshotDir: ".opencode/browser", ...overrides @@ -171,3 +171,105 @@ describe("browser_screenshot disk write", () => { expect(written.equals(png)).toBe(true); }); }); + +describe("browser_request_feedback (interactive group)", () => { + it("is gated behind the opt-in interactive group", () => { + const off = createBrowserTools({ + send: fakeSend(), + options: baseOptions({ groups: ["page", "control", "debug"] }), + logger: noopLogger + }); + expect(Object.keys(off)).not.toContain("browser_request_feedback"); + + const on = createBrowserTools({ + send: fakeSend(), + options: baseOptions({ groups: ["page", "control", "interactive"] }), + logger: noopLogger + }); + expect(Object.keys(on)).toContain("browser_request_feedback"); + }); + + it("sends the request_feedback action with a long per-command timeout", async () => { + const send = fakeSend({ responded: true, annotations: [{ kind: "confirm", value: true }] }); + const tools = createBrowserTools({ + send, + options: baseOptions({ groups: ["interactive"] }), + logger: noopLogger + }); + await tools.browser_request_feedback.execute( + { group: "g", mode: "confirm", prompt: "ok?" }, + ctx() + ); + expect(send).toHaveBeenCalledWith( + "request_feedback", + "g", + expect.objectContaining({ mode: "confirm", prompt: "ok?", timeoutMs: 120_000 }), + expect.anything(), + undefined, + 300_000 + ); + }); + + it("clamps a user timeout to the overlay ceiling", async () => { + const send = fakeSend({ responded: true, annotations: [] }); + const tools = createBrowserTools({ + send, + options: baseOptions({ groups: ["interactive"] }), + logger: noopLogger + }); + await tools.browser_request_feedback.execute( + { group: "g", mode: "confirm", timeoutMs: 999_999 }, + ctx() + ); + expect(send).toHaveBeenCalledWith( + "request_feedback", + "g", + expect.objectContaining({ timeoutMs: 290_000 }), + expect.anything(), + undefined, + 300_000 + ); + }); + + it("summarizes a confirm response", async () => { + const send = fakeSend({ responded: true, annotations: [{ kind: "confirm", value: true }] }); + const tools = createBrowserTools({ + send, + options: baseOptions({ groups: ["interactive"] }), + logger: noopLogger + }); + const res = await tools.browser_request_feedback.execute( + { group: "g", mode: "confirm" }, + ctx() + ); + expect(typeof res === "object" ? res.output : res).toMatch(/confirmed/i); + }); + + it("reports a timeout as no response", async () => { + const send = fakeSend({ responded: false, timedOut: true, annotations: [] }); + const tools = createBrowserTools({ + send, + options: baseOptions({ groups: ["interactive"] }), + logger: noopLogger + }); + const res = await tools.browser_request_feedback.execute( + { group: "g", mode: "confirm" }, + ctx() + ); + expect(typeof res === "object" ? res.output : res).toMatch(/no response/i); + }); + + it("resolves a point response to its element ref in the summary", async () => { + const send = fakeSend({ + responded: true, + annotations: [{ kind: "point", x: 10, y: 20, ref: "e42", selector: "#x" }] + }); + const tools = createBrowserTools({ + send, + options: baseOptions({ groups: ["interactive"] }), + logger: noopLogger + }); + const res = await tools.browser_request_feedback.execute({ group: "g", mode: "point" }, ctx()); + expect(typeof res === "object" ? res.output : res).toMatch(/ref e42/); + }); +}); From 5c21c98e1a858763bc1e98d2d9b6083f1a0934fe Mon Sep 17 00:00:00 2001 From: Stephane Segning Lambou Date: Sun, 14 Jun 2026 18:07:51 +0200 Subject: [PATCH 4/9] =?UTF-8?q?feat(browser):=20Phase=202=20=E2=80=94=20el?= =?UTF-8?q?ement/region/comment=20modes=20+=20overlay-blocked=20fallback?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extends browser_request_feedback beyond confirm/choose/point: - element — hover highlights the element under the cursor; click picks it → { kind:"element", ref, selector, text }. - region — drag a box → { kind:"region", rect, refs } where refs are every data-ocb-ref element inside the box. - comment — click a spot, then type a free-text note attached to the point. Overlay (feedback-overlay.ts): adds hover-highlight, rubber-band region select, and a note panel; shared ref/selector resolution against data-ocb-ref so every spatial mark stays actionable as a ref. Overlay-blocked pages (restricted / CSP) now return { responded:false, error } distinct from a timeout, and the tool result tells the agent to fall back to a screenshot/snapshot instead of re-asking. Deferred, deliberately (documented in plans/ui-feedback.md): marker-annotated screenshot return (NeutralResult can't carry json+image; refs > pixels) and the side-panel-over-screenshot fallback (large separate UI; the error path covers the hostile-page case for now). Docs: new modes + fallback in browser.md; plan doc marked Phases 0–2 done with the deferred-decisions rationale. Tests: element/region/comment summaries + error-vs-timeout shaping. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../src/background/command-router.ts | 12 +- .../src/background/feedback-overlay.ts | 271 +++++++++++++++--- .../src/background/feedback.ts | 12 +- docs/browser.md | 28 +- packages/opencode-browser/src/catalog.ts | 41 ++- packages/opencode-browser/test/tools.test.ts | 52 ++++ plans/ui-feedback.md | 24 +- 7 files changed, 376 insertions(+), 64 deletions(-) diff --git a/apps/browser-extension/src/background/command-router.ts b/apps/browser-extension/src/background/command-router.ts index aa38783..5d87c25 100644 --- a/apps/browser-extension/src/background/command-router.ts +++ b/apps/browser-extension/src/background/command-router.ts @@ -331,11 +331,13 @@ export class CommandRouter { // Register before awaiting so a broker `cancel` mid-wait can tear it down. this.registerCanceller(frame.id, handle.cancel); const result = await handle.result; - const summary = result.timedOut - ? "feedback timed out" - : result.responded - ? `feedback: ${result.annotations.map((a) => a.kind).join(",") || "none"}` - : "feedback dismissed"; + const summary = result.error + ? `feedback unavailable: ${result.error}` + : result.timedOut + ? "feedback timed out" + : result.responded + ? `feedback: ${result.annotations.map((a) => a.kind).join(",") || "none"}` + : "feedback dismissed"; return { data: result, summary }; } default: diff --git a/apps/browser-extension/src/background/feedback-overlay.ts b/apps/browser-extension/src/background/feedback-overlay.ts index 4118ff6..21564f7 100644 --- a/apps/browser-extension/src/background/feedback-overlay.ts +++ b/apps/browser-extension/src/background/feedback-overlay.ts @@ -11,7 +11,7 @@ * background can cancel a request without a page-side message channel. */ -export type FeedbackMode = "confirm" | "choose" | "point"; +export type FeedbackMode = "confirm" | "choose" | "point" | "element" | "region" | "comment"; export interface FeedbackRequest { mode: FeedbackMode; @@ -25,7 +25,14 @@ export interface FeedbackRequest { export type FeedbackAnnotation = | { kind: "confirm"; value: boolean } | { kind: "choice"; value: string } - | { kind: "point"; x: number; y: number; ref?: string; selector?: string; text?: string }; + | { kind: "point"; x: number; y: number; ref?: string; selector?: string; text?: string } + | { kind: "element"; ref?: string; selector?: string; text?: string } + | { + kind: "region"; + rect: { x: number; y: number; width: number; height: number }; + refs: string[]; + text?: string; + }; /** Message the injected overlay posts back to the background worker. */ export interface FeedbackMessage { @@ -153,8 +160,38 @@ function feedbackOverlay(id: string, mode: string, prompt: string, options: stri return b; }; - if (mode === "point") { - // Transparent capture layer below the bar; click picks a page element. + // Resolve the nearest ancestor carrying a snapshot ref (so the agent can act). + const resolveRef = (el: HTMLElement | null): string | undefined => { + let node: HTMLElement | null = el; + while (node && !node.getAttribute("data-ocb-ref")) { + node = node.parentElement; + } + return node?.getAttribute("data-ocb-ref") ?? undefined; + }; + // The page element under a point — overlay momentarily hidden so it doesn't win. + const elementUnder = (x: number, y: number): HTMLElement | null => { + root.style.display = "none"; + const el = document.elementFromPoint(x, y) as HTMLElement | null; + root.style.display = ""; + return el; + }; + const describe = (el: HTMLElement | null, a: Record): void => { + if (!el) { + return; + } + const ref = resolveRef(el); + if (ref) { + a.ref = ref; + } + a.selector = cssPath(el); + const label = (el.textContent || "").trim().slice(0, 80); + if (label) { + a.text = label; + } + }; + + const PAGE_MODES = ["point", "element", "region", "comment"]; + if (PAGE_MODES.includes(mode)) { const capture = document.createElement("div"); capture.style.cssText = [ "position:absolute", @@ -162,45 +199,86 @@ function feedbackOverlay(id: string, mode: string, prompt: string, options: stri "cursor:crosshair", "background:rgba(59,130,246,.06)" ].join(";"); - const hint = document.createElement("div"); - hint.style.cssText = [ - "position:absolute", - "bottom:16px", - "left:50%", - "transform:translateX(-50%)", - "background:#0f172a", - "color:#f8fafc", - "padding:6px 12px", - "border-radius:9999px", - "font-size:12px" - ].join(";"); - hint.textContent = "Click the element you mean — Esc to skip"; - capture.appendChild(hint); - capture.addEventListener("click", (e: MouseEvent) => { - const x = e.clientX; - const y = e.clientY; - root.style.display = "none"; - const el = document.elementFromPoint(x, y) as HTMLElement | null; - root.style.display = ""; - const annotation: Record = { kind: "point", x, y }; - if (el) { - let node: HTMLElement | null = el; - while (node && !node.getAttribute("data-ocb-ref")) { - node = node.parentElement; + root.appendChild(capture); + + if (mode === "element") { + capture.appendChild(hintBar("Hover and click the element — Esc to skip")); + const hl = marker("2px solid"); + root.appendChild(hl); + capture.addEventListener("mousemove", (e: MouseEvent) => { + const el = elementUnder(e.clientX, e.clientY); + if (!el) { + hl.style.display = "none"; + return; } - const ref = node?.getAttribute("data-ocb-ref") ?? undefined; - if (ref) { - annotation.ref = ref; + place(hl, el.getBoundingClientRect()); + }); + capture.addEventListener("click", (e: MouseEvent) => { + const a: Record = { kind: "element" }; + describe(elementUnder(e.clientX, e.clientY), a); + finish([a]); + }); + } else if (mode === "region") { + capture.appendChild(hintBar("Drag a box over the area — Esc to skip")); + const band = marker("2px dashed"); + root.appendChild(band); + let sx = 0; + let sy = 0; + let dragging = false; + capture.addEventListener("mousedown", (e: MouseEvent) => { + dragging = true; + sx = e.clientX; + sy = e.clientY; + place(band, rectOf(sx, sy, sx, sy)); + }); + capture.addEventListener("mousemove", (e: MouseEvent) => { + if (dragging) { + place(band, rectOf(sx, sy, e.clientX, e.clientY)); } - annotation.selector = cssPath(el); - const label = (el.textContent || "").trim().slice(0, 80); - if (label) { - annotation.text = label; + }); + capture.addEventListener("mouseup", (e: MouseEvent) => { + if (!dragging) { + return; } - } - finish([annotation]); - }); - root.appendChild(capture); + dragging = false; + const rect = rectOf(sx, sy, e.clientX, e.clientY); + if (rect.width < 4 || rect.height < 4) { + band.style.display = "none"; + return; + } + const refs: string[] = []; + root.style.display = "none"; + for (const node of Array.from(document.querySelectorAll("[data-ocb-ref]"))) { + if (intersects(rect, node.getBoundingClientRect())) { + const ref = node.getAttribute("data-ocb-ref"); + if (ref) { + refs.push(ref); + } + } + } + root.style.display = ""; + finish([{ kind: "region", rect, refs }]); + }); + } else { + // point | comment — click a spot; comment then asks for a note. + capture.appendChild( + hintBar( + mode === "comment" + ? "Click a spot, then add a note — Esc to skip" + : "Click the element you mean — Esc to skip" + ) + ); + capture.addEventListener("click", (e: MouseEvent) => { + const a: Record = { kind: "point", x: e.clientX, y: e.clientY }; + describe(elementUnder(e.clientX, e.clientY), a); + if (mode === "comment") { + capture.remove(); + promptComment(a); + } else { + finish([a]); + } + }); + } } else { const panel = document.createElement("div"); panel.style.cssText = [ @@ -277,4 +355,117 @@ function feedbackOverlay(id: string, mode: string, prompt: string, options: stri } return parts.join(" > "); } + + /** A floating instruction pill near the bottom of the capture layer. */ + function hintBar(textStr: string): HTMLDivElement { + const hint = document.createElement("div"); + hint.style.cssText = [ + "position:absolute", + "bottom:16px", + "left:50%", + "transform:translateX(-50%)", + "background:#0f172a", + "color:#f8fafc", + "padding:6px 12px", + "border-radius:9999px", + "font-size:12px", + "pointer-events:none" + ].join(";"); + hint.textContent = textStr; + return hint; + } + + /** A non-interactive highlight/selection box overlaid on the page. */ + function marker(border: string): HTMLDivElement { + const m = document.createElement("div"); + m.style.cssText = [ + "position:fixed", + "pointer-events:none", + `border:${border} ${ACCENT}`, + "background:rgba(59,130,246,.12)", + "z-index:2147483646", + "display:none" + ].join(";"); + return m; + } + + /** Position a marker over a viewport rectangle. */ + function place(m: HTMLElement, r: { x: number; y: number; width: number; height: number }): void { + m.style.display = "block"; + m.style.left = `${r.x}px`; + m.style.top = `${r.y}px`; + m.style.width = `${r.width}px`; + m.style.height = `${r.height}px`; + } + + /** Normalize two corners into a positive-size rect. */ + function rectOf( + ax: number, + ay: number, + bx: number, + by: number + ): { x: number; y: number; width: number; height: number } { + return { + x: Math.min(ax, bx), + y: Math.min(ay, by), + width: Math.abs(bx - ax), + height: Math.abs(by - ay) + }; + } + + function intersects( + a: { x: number; y: number; width: number; height: number }, + b: DOMRect + ): boolean { + return !(b.right < a.x || b.left > a.x + a.width || b.bottom < a.y || b.top > a.y + a.height); + } + + /** After a point click in `comment` mode, collect an optional free-text note. */ + function promptComment(annotation: Record): void { + const panel = document.createElement("div"); + panel.style.cssText = [ + "position:absolute", + "top:50%", + "left:50%", + "transform:translate(-50%,-50%)", + "min-width:300px", + "max-width:440px", + "background:#fff", + "border:1px solid #e2e8f0", + "border-radius:14px", + "box-shadow:0 12px 40px rgba(2,6,23,.28)", + "padding:18px" + ].join(";"); + const label = document.createElement("div"); + label.style.cssText = "font-size:13px;margin-bottom:8px"; + label.textContent = "Add a note about what you pointed at:"; + const ta = document.createElement("textarea"); + ta.style.cssText = [ + "width:100%", + "box-sizing:border-box", + "min-height:72px", + "padding:8px", + "border:1px solid #cbd5e1", + "border-radius:8px", + "font:inherit", + "font-size:13px", + "resize:vertical" + ].join(";"); + const actions = document.createElement("div"); + actions.style.cssText = "display:flex;gap:8px;justify-content:flex-end;margin-top:12px"; + const skipBtn = btn("No note", false); + skipBtn.addEventListener("click", () => finish([annotation])); + const add = btn("Add", true); + add.addEventListener("click", () => { + const t = ta.value.trim(); + if (t) { + annotation.text = t; + } + finish([annotation]); + }); + actions.append(skipBtn, add); + panel.append(label, ta, actions); + root.appendChild(panel); + ta.focus(); + } } diff --git a/apps/browser-extension/src/background/feedback.ts b/apps/browser-extension/src/background/feedback.ts index 5a95078..21e6c24 100644 --- a/apps/browser-extension/src/background/feedback.ts +++ b/apps/browser-extension/src/background/feedback.ts @@ -18,6 +18,8 @@ import { export interface FeedbackResult { responded: boolean; timedOut?: boolean; + /** The overlay couldn't be shown (restricted/CSP page) — distinct from timeout. */ + error?: string; annotations: FeedbackAnnotation[]; } @@ -108,10 +110,12 @@ export function startFeedback(tabId: number, id: string, req: FeedbackRequest): } void flagAttention(tabId); - // Injection failure (e.g. a restricted page) ends the request as "no response" - // rather than hanging until the timeout. - void showFeedbackOverlay(tabId, id, req).catch(() => { - finish({ responded: false, timedOut: true, annotations: [] }); + // Injection failure (e.g. a restricted or CSP-locked page) ends the request + // immediately with a distinct error rather than hanging until the timeout, so + // the agent can fall back to a screenshot instead of silently re-asking. + void showFeedbackOverlay(tabId, id, req).catch((err: unknown) => { + const reason = err instanceof Error ? err.message : "the overlay could not be shown"; + finish({ responded: false, error: reason, annotations: [] }); }); return { diff --git a/docs/browser.md b/docs/browser.md index 370fa33..24fa7c7 100644 --- a/docs/browser.md +++ b/docs/browser.md @@ -161,16 +161,30 @@ screenshot or snapshot isn't enough to know what the user means ("which of these - `confirm` — a yes/no bar; returns `{ kind: "confirm", value }`. - `choose` — buttons for each `options[]` entry; returns `{ kind: "choice", value }`. -- `point` — the user clicks one spot; the reply resolves to the **element ref** under the click - (plus a CSS selector and pixel coords) so you can immediately `browser_click ref:…` it. +- `point` — the user clicks one spot; resolves to the **element ref** under the click (plus a CSS + selector and pixel coords). +- `element` — hover highlights the element under the cursor; a click picks it → + `{ kind: "element", ref, selector, text }`. +- `region` — the user drags a box; returns `{ kind: "region", rect, refs }` where `refs` are the + snapshot refs of every element inside the box. +- `comment` — click a spot, then type a free-text note; the point annotation carries `text`. + +point/element/region resolve to element **refs** so you can immediately `browser_click ref:…` or +`browser_snapshot` them — refs, not pixels, are what the agent acts on. The overlay is clearly branded as opencode-browser (so a page can't spoof the prompt), dismissible (Esc / Skip), and raises the toolbar badge + focuses the tab to get the user's attention. On no -response it returns `{ responded: false, timedOut: true }` so the agent can fall back rather than -hang. The wait uses a long per-command timeout (default 120 s, max ~290 s, capped broker-side at -10 min); if the agent's turn is aborted or times out, the broker sends a `cancel` frame that tears -the overlay down — a blocking prompt never orphans state in the page. Meaningless in headless/CI -routing (no human at the browser), where it simply times out. +response it returns `{ responded: false, timedOut: true }`; on a page that blocks the overlay +(restricted / CSP) it returns `{ responded: false, error }` — distinct from a timeout — so the +agent falls back to a screenshot/snapshot instead of re-asking. The wait uses a long per-command +timeout (default 120 s, max ~290 s, capped broker-side at 10 min); if the agent's turn is aborted +or times out, the broker sends a `cancel` frame that tears the overlay down — a blocking prompt +never orphans state in the page. Meaningless in headless/CI routing (no human at the browser), +where it simply times out. + +> Not yet implemented: a marker-annotated **screenshot** return (deferred — the structured refs are +> more actionable than pixels, and a single tool result can't carry both) and a **side-panel** +> annotation fallback for overlay-blocked pages (today such pages return the `error` above). ### Scoping tools and token cost diff --git a/packages/opencode-browser/src/catalog.ts b/packages/opencode-browser/src/catalog.ts index 63cfef2..34e09c1 100644 --- a/packages/opencode-browser/src/catalog.ts +++ b/packages/opencode-browser/src/catalog.ts @@ -21,14 +21,27 @@ export const DEFAULT_GROUPS: readonly ToolGroup[] = ["page", "control"] as const export type Annotation = | { kind: "confirm"; value: boolean } | { kind: "choice"; value: string } - | { kind: "point"; x: number; y: number; ref?: string; selector?: string; text?: string }; + | { kind: "point"; x: number; y: number; ref?: string; selector?: string; text?: string } + | { kind: "element"; ref?: string; selector?: string; text?: string } + | { + kind: "region"; + rect: { x: number; y: number; width: number; height: number }; + refs: string[]; + text?: string; + }; export interface FeedbackResult { /** Whether a human responded before the timeout. */ responded: boolean; /** True when the request timed out with no response. */ timedOut?: boolean; - /** The marks the user made (empty when `timedOut`). */ + /** + * Set when the overlay couldn't be shown at all (e.g. a restricted or + * CSP-locked page) — distinct from a plain timeout, so the agent can fall back + * to screenshot/snapshot reasoning rather than re-asking. + */ + error?: string; + /** The marks the user made (empty when `timedOut`/`error`). */ annotations: Annotation[]; } @@ -645,14 +658,14 @@ export const BROWSER_TOOLS: readonly ToolSpec[] = [ group: "interactive", action: "request_feedback", description: - "Ask the human at the browser to respond on the page, and block until they do (or it times out). Use when you're unsure what the user means and a screenshot or snapshot isn't enough — e.g. 'which of these did you mean?'. `mode`: confirm (yes/no bar), choose (pick one of `options`), point (click one spot on the page; the reply resolves to an element ref you can then click/snapshot). Returns the user's response; on timeout returns `{ timedOut: true }` so you can fall back. interactive group (opt-in).", + "Ask the human at the browser to respond on the page, and block until they do (or it times out). Use when you're unsure what the user means and a screenshot or snapshot isn't enough — e.g. 'which of these did you mean?'. `mode`: confirm (yes/no bar), choose (pick one of `options`), point (click one spot), element (hover-highlight then click one element), region (drag a box over an area), comment (point + a free-text note). point/element/region resolve to element ref(s) you can then click/snapshot. Returns the user's response; on timeout returns `{ timedOut: true }` so you can fall back. interactive group (opt-in).", timeoutMs: 300_000, input: { group, mode: { type: "string", - enum: ["confirm", "choose", "point"], - description: "Kind of prompt: confirm | choose | point." + enum: ["confirm", "choose", "point", "element", "region", "comment"], + description: "Kind of prompt: confirm | choose | point | element | region | comment." }, prompt: { type: "string", @@ -686,6 +699,12 @@ export const BROWSER_TOOLS: readonly ToolSpec[] = [ }, result: (data) => { const d = data as Partial; + if (d?.error) { + return json( + { responded: false, error: d.error, annotations: [] } satisfies FeedbackResult, + `Couldn't ask the user on this page: ${d.error}. Fall back to a screenshot/snapshot.` + ); + } if (!d || d.responded === false || d.timedOut) { return json( { responded: false, timedOut: true, annotations: [] } satisfies FeedbackResult, @@ -708,15 +727,23 @@ function summarizeAnnotations(annotations: Annotation[]): string { } return annotations .map((a) => { + const note = "text" in a && a.text ? ` — "${a.text}"` : ""; if (a.kind === "confirm") { return `User ${a.value ? "confirmed" : "declined"}.`; } if (a.kind === "choice") { return `User chose "${a.value}".`; } + if (a.kind === "element") { + const what = a.ref ? `ref ${a.ref}` : (a.selector ?? "an element"); + return `User selected ${what}${note}.`; + } + if (a.kind === "region") { + const refs = a.refs.length ? ` covering ${a.refs.join(", ")}` : ""; + return `User boxed a ${Math.round(a.rect.width)}×${Math.round(a.rect.height)} region${refs}${note}.`; + } const where = a.ref ? `ref ${a.ref}` : `(${Math.round(a.x)}, ${Math.round(a.y)})`; - const label = a.text ? ` — "${a.text}"` : ""; - return `User pointed at ${where}${label}.`; + return `User pointed at ${where}${note}.`; }) .join(" "); } diff --git a/packages/opencode-browser/test/tools.test.ts b/packages/opencode-browser/test/tools.test.ts index 84c883d..ee8d2b3 100644 --- a/packages/opencode-browser/test/tools.test.ts +++ b/packages/opencode-browser/test/tools.test.ts @@ -273,3 +273,55 @@ describe("browser_request_feedback (interactive group)", () => { expect(typeof res === "object" ? res.output : res).toMatch(/ref e42/); }); }); + +describe("browser_request_feedback rich modes (Phase 2)", () => { + const interactive = (result: unknown) => + createBrowserTools({ + send: fakeSend(result), + options: baseOptions({ groups: ["interactive"] }), + logger: noopLogger + }); + + it("summarizes an element selection by ref", async () => { + const tools = interactive({ + responded: true, + annotations: [{ kind: "element", ref: "e9", selector: "#x", text: "Inbox" }] + }); + const res = await tools.browser_request_feedback.execute( + { group: "g", mode: "element" }, + ctx() + ); + expect(typeof res === "object" ? res.output : res).toMatch(/selected ref e9/); + }); + + it("summarizes a region with covered refs", async () => { + const tools = interactive({ + responded: true, + annotations: [ + { kind: "region", rect: { x: 0, y: 0, width: 120, height: 60 }, refs: ["e1", "e2"] } + ] + }); + const res = await tools.browser_request_feedback.execute({ group: "g", mode: "region" }, ctx()); + expect(typeof res === "object" ? res.output : res).toMatch(/120×60 region covering e1, e2/); + }); + + it("includes a comment note in the summary", async () => { + const tools = interactive({ + responded: true, + annotations: [{ kind: "point", x: 1, y: 2, ref: "e3", text: "this label is wrong" }] + }); + const res = await tools.browser_request_feedback.execute( + { group: "g", mode: "comment" }, + ctx() + ); + expect(typeof res === "object" ? res.output : res).toMatch(/this label is wrong/); + }); + + it("surfaces an overlay error distinctly from a timeout", async () => { + const tools = interactive({ responded: false, error: "page blocked the overlay" }); + const res = await tools.browser_request_feedback.execute({ group: "g", mode: "point" }, ctx()); + const output = typeof res === "object" ? res.output : res; + expect(output).toMatch(/page blocked the overlay/); + expect(output).toMatch(/screenshot\/snapshot/); + }); +}); diff --git a/plans/ui-feedback.md b/plans/ui-feedback.md index 1d63623..fd31c3b 100644 --- a/plans/ui-feedback.md +++ b/plans/ui-feedback.md @@ -1,6 +1,28 @@ # UI feedback / human-in-the-loop (design) -Status: **design / not yet implemented.** Greenfield follow-up to the browser plugin + MCP server. +Status: **Phases 0–2 implemented** (PR #49). Greenfield follow-up to the browser plugin + MCP server. + +## What shipped vs. the original plan + +- **Phase 0** — `CancelFrame` + per-command timeout: shipped as designed. The per-command timeout + rides on `CommandFrame.timeoutMs` (broker-only, like `target`) so guest agents get it too; guest + local timer = `requested + grace` so the broker (which can cancel the executor) always fires first. +- **Phase 1** — `interactive` group + `browser_request_feedback` (confirm/choose/point), port-free + in-page overlay (background owns the timeout; the page reports back via `chrome.runtime.sendMessage` + rather than the originally-sketched long-lived port — simpler and survives the same MV3 lifetime as + the bridge's heartbeat), ref resolution, badge + tab-focus attention signal: shipped. +- **Phase 2** — element/region/comment modes: shipped. **Deferred, deliberately:** + - *Marker-annotated screenshot return* — `NeutralResult` carries text **or** json **or** image, not + a combination; the resolved element refs are more actionable to the agent than burned-in pixels, + so the json-with-refs result wins. Revisit only if a real need to return both appears (would mean + widening `NeutralResult` + both adapter renderers). + - *Side-panel-over-screenshot fallback* — a sizeable separate UI feature. Today an overlay-blocked + page (restricted / CSP) returns `{ responded: false, error }`, distinct from a timeout, so the + agent falls back to screenshot/snapshot reasoning. The side panel remains a future enhancement. + +--- + +(Original design below.) Builds on the routing in [`multi-client-routing.md`](multi-client-routing.md) and the ws transport in [`docs/adr/0001-bridge-transport-ws-not-bun-serve-or-socketio.md`](../docs/adr/0001-bridge-transport-ws-not-bun-serve-or-socketio.md). From 0f36ef43af03710f347f399e3705afd13aa9752a Mon Sep 17 00:00:00 2001 From: Stephane Segning Lambou Date: Sun, 14 Jun 2026 18:39:18 +0200 Subject: [PATCH 5/9] feat(browser): side-panel fallback for overlay-blocked feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When browser_request_feedback can't inject its in-page overlay (chrome://, Web Store, CSP-locked origins), the extension now captures a screenshot and routes the request to its own side panel instead of returning an error. - feedback-side-panel.ts: captures the tab via chrome.tabs.captureVisibleTab, stores the pending session, and enables the panel cross-browser (Chromium chrome.sidePanel / Firefox sidebar_action). Neither browser allows force-open, so it raises the badge and — only while a request is pending — flips openPanelOnActionClick so the toolbar click opens the panel, restoring the normal popup behavior on resolve. - new sidepanel entrypoint (React + daisyUI): renders the screenshot with the mode's controls; point/region map clicks to screenshot-pixel coords (no live DOM → no refs on these pages); confirm/choose work fully. Posts the same ocb-feedback-result message as the overlay, so the background's correlation / timeout / cancel path is unchanged. - feedback.ts overlay-failure path now tries the side panel first; only if a screenshot can't be taken does it settle with { error }. - background routes feedback:get-pending; wxt adds the sidePanel permission (Chromium) — WXT maps the entrypoint to side_panel / sidebar_action per target. Docs: side-panel fallback documented in browser.md; plan doc marks it shipped (fallback-only, docked); CLAUDE.md entrypoint/layout note updated. Marker- annotated screenshot remains deliberately deferred (NeutralResult is single-kind; refs > pixels). Co-Authored-By: Claude Opus 4.8 (1M context) --- CLAUDE.md | 2 +- .../src/background/feedback-side-panel.ts | 105 +++++++ .../src/background/feedback.ts | 22 +- .../src/entrypoints/background.ts | 40 ++- .../src/entrypoints/sidepanel/App.tsx | 279 ++++++++++++++++++ .../src/entrypoints/sidepanel/index.html | 12 + .../src/entrypoints/sidepanel/main.tsx | 14 + apps/browser-extension/src/shared/messages.ts | 35 +++ apps/browser-extension/wxt.config.ts | 4 +- docs/browser.md | 17 +- plans/ui-feedback.md | 23 +- 11 files changed, 518 insertions(+), 35 deletions(-) create mode 100644 apps/browser-extension/src/background/feedback-side-panel.ts create mode 100644 apps/browser-extension/src/entrypoints/sidepanel/App.tsx create mode 100644 apps/browser-extension/src/entrypoints/sidepanel/index.html create mode 100644 apps/browser-extension/src/entrypoints/sidepanel/main.tsx diff --git a/CLAUDE.md b/CLAUDE.md index f74793d..e6abfee 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -95,7 +95,7 @@ packages// `opencode-ratelimit` follows the same shape **minus `cache.ts`** (its rate-limit state is in-memory only — a reset window is seconds, so persisting it would only serve stale data) and **plus `headers.ts`** (a pure parser for the `x-ratelimit-*` triple). It is also the only plugin that injects a custom `fetch` into `provider.options.fetch` (the sole way to observe response status/headers, since OpenCode has no post-response hook) rather than only reading/merging config. -`opencode-browser` follows the same shape **minus `cache.ts`** (broker state is in-memory) and **plus**: `protocol.ts` (the dependency-free wire-frame contract, mirrored into the extension), `transport.ts` (the `BridgeTransport` seam + `isAddrInUse`) and `node-transport.ts` (the `ws`-backed host transport + guest socket, runs under Bun *and* Node — shared with the MCP server via `./lib`; async `listen` for bind-based election), `broker.ts` (role-aware broker: executors + agents + group-ownership routing, DI-tested), `agent-client.ts` (guest-agent WS client), `endpoint.ts` (try-bind → host-or-guest auto-election with failover), `token-file.ts` (shared `bridge.json`), `catalog.ts` + `schema.ts` (the neutral tool surface), and `tools.ts` (the OpenCode `Hooks.tool` adapter over the catalog). It is the only plugin that **registers tools** and **hosts a server**. The companion extension under `apps/browser-extension` is a WXT project (not the per-package `src/` layout) — its engine lives in `src/background/` (bridge client, command router, group registry, CDP + content executors, `page-actions` injected via `chrome.scripting.executeScript`) and its UI in `src/entrypoints/{popup,options}` over Dexie. +`opencode-browser` follows the same shape **minus `cache.ts`** (broker state is in-memory) and **plus**: `protocol.ts` (the dependency-free wire-frame contract, mirrored into the extension), `transport.ts` (the `BridgeTransport` seam + `isAddrInUse`) and `node-transport.ts` (the `ws`-backed host transport + guest socket, runs under Bun *and* Node — shared with the MCP server via `./lib`; async `listen` for bind-based election), `broker.ts` (role-aware broker: executors + agents + group-ownership routing, DI-tested), `agent-client.ts` (guest-agent WS client), `endpoint.ts` (try-bind → host-or-guest auto-election with failover), `token-file.ts` (shared `bridge.json`), `catalog.ts` + `schema.ts` (the neutral tool surface), and `tools.ts` (the OpenCode `Hooks.tool` adapter over the catalog). It is the only plugin that **registers tools** and **hosts a server**. The companion extension under `apps/browser-extension` is a WXT project (not the per-package `src/` layout) — its engine lives in `src/background/` (bridge client, command router, group registry, CDP + content executors, `page-actions` injected via `chrome.scripting.executeScript`, `feedback`/`feedback-overlay`/`feedback-side-panel` for the `interactive` HITL flow) and its UI in `src/entrypoints/{popup,options,sidepanel}` over Dexie (the `sidepanel` is the docked annotation fallback for overlay-blocked pages). **Important — two entry points per published package:** diff --git a/apps/browser-extension/src/background/feedback-side-panel.ts b/apps/browser-extension/src/background/feedback-side-panel.ts new file mode 100644 index 0000000..d759aa9 --- /dev/null +++ b/apps/browser-extension/src/background/feedback-side-panel.ts @@ -0,0 +1,105 @@ +/** + * Side-panel fallback for `interactive` feedback. When the in-page overlay can't + * be injected (restricted / CSP page), we capture a screenshot of the tab and + * route the request to the extension's own side panel instead — the panel can't + * be blocked by the page. Cross-browser: Chromium `chrome.sidePanel`, Firefox + * `sidebarAction`; neither can be force-opened without a user gesture, so we + * enable it + raise the badge and the user opens it from the toolbar. + * + * The panel reads the pending request via `feedback:get-pending` and posts the + * answer with the same `ocb-feedback-result` shape the overlay uses, so the + * background's per-request listener correlates both surfaces identically. + */ +import type { FeedbackSession } from "../shared/messages"; + +/** The single in-flight side-panel request (one at a time keeps the UX simple). */ +let activeSession: FeedbackSession | null = null; +let activeTabId: number | null = null; + +/** The request the side panel should render right now, if any. */ +export function getActiveFeedbackSession(): FeedbackSession | null { + return activeSession; +} + +// Minimal structural views of the two browsers' panel APIs (the @types differ +// per target, so we narrow through `unknown` rather than depend on either). +interface SidePanelApi { + setOptions(opts: { tabId?: number; path?: string; enabled?: boolean }): Promise; + setPanelBehavior?(behavior: { openPanelOnActionClick: boolean }): Promise; +} +interface SidebarActionApi { + setPanel(details: { panel: string }): Promise | void; +} +const sidePanelApi = (chrome as unknown as { sidePanel?: SidePanelApi }).sidePanel; +const sidebarActionApi = (chrome as unknown as { sidebarAction?: SidebarActionApi }).sidebarAction; + +const SIDE_PANEL_PATH = "sidepanel.html"; + +function broadcastPendingChanged(): void { + // No receiver (panel closed) rejects — that's fine, the panel queries on open. + chrome.runtime.sendMessage({ type: "feedback:pending-changed" }).catch(() => {}); +} + +/** + * Set up the side-panel fallback for a request: activate + screenshot the tab, + * stash the session, and enable the panel. Returns true if the panel path is + * ready (caller keeps waiting for the user); false if it couldn't be set up + * (caller should settle with an error). + */ +export async function openSidePanelFallback( + tabId: number, + id: string, + req: { mode: string; prompt?: string; options?: string[] } +): Promise { + if (!sidePanelApi && !sidebarActionApi) { + return false; + } + let screenshot: string; + try { + const tab = await chrome.tabs.get(tabId); + await chrome.tabs.update(tabId, { active: true }); + screenshot = await chrome.tabs.captureVisibleTab(tab.windowId, { format: "png" }); + } catch { + return false; // can't even screenshot (e.g. a blocked page) — let caller error out + } + + activeSession = { id, mode: req.mode, prompt: req.prompt, options: req.options, screenshot }; + activeTabId = tabId; + + try { + if (sidePanelApi) { + await sidePanelApi.setOptions({ tabId, path: SIDE_PANEL_PATH, enabled: true }); + // While a request is pending, clicking the toolbar icon opens the panel + // (restored to the normal popup behavior in clearFeedbackSession). We + // can't open the panel ourselves — it needs the user's click. + await sidePanelApi.setPanelBehavior?.({ openPanelOnActionClick: true }); + } else if (sidebarActionApi) { + await sidebarActionApi.setPanel({ panel: chrome.runtime.getURL(SIDE_PANEL_PATH) }); + } + } catch { + activeSession = null; + activeTabId = null; + return false; + } + + broadcastPendingChanged(); + return true; +} + +/** Clear the session for `id` (on answer / timeout / cancel) and notify the panel. */ +export function clearFeedbackSession(id: string): void { + if (activeSession?.id !== id) { + return; + } + const tabId = activeTabId; + activeSession = null; + activeTabId = null; + // Restore the toolbar's normal popup behavior and retire the panel. + if (sidePanelApi) { + void sidePanelApi.setPanelBehavior?.({ openPanelOnActionClick: false }).catch(() => {}); + if (tabId !== null) { + void sidePanelApi.setOptions({ tabId, enabled: false }).catch(() => {}); + } + } + broadcastPendingChanged(); +} diff --git a/apps/browser-extension/src/background/feedback.ts b/apps/browser-extension/src/background/feedback.ts index 21e6c24..22e6845 100644 --- a/apps/browser-extension/src/background/feedback.ts +++ b/apps/browser-extension/src/background/feedback.ts @@ -14,6 +14,7 @@ import { hideFeedbackOverlay, showFeedbackOverlay } from "./feedback-overlay"; +import { clearFeedbackSession, openSidePanelFallback } from "./feedback-side-panel"; export interface FeedbackResult { responded: boolean; @@ -105,17 +106,26 @@ export function startFeedback(tabId: number, id: string, req: FeedbackRequest): clearTimeout(timer); chrome.runtime.onMessage.removeListener(onMessage); void hideFeedbackOverlay(tabId, id); + clearFeedbackSession(id); void clearAttention(); settle(r); } void flagAttention(tabId); - // Injection failure (e.g. a restricted or CSP-locked page) ends the request - // immediately with a distinct error rather than hanging until the timeout, so - // the agent can fall back to a screenshot instead of silently re-asking. - void showFeedbackOverlay(tabId, id, req).catch((err: unknown) => { - const reason = err instanceof Error ? err.message : "the overlay could not be shown"; - finish({ responded: false, error: reason, annotations: [] }); + // Overlay injection failed (restricted / CSP page). Fall back to the side + // panel over a screenshot; only if that can't be set up either do we settle + // with a distinct error so the agent falls back to a screenshot/snapshot. + void showFeedbackOverlay(tabId, id, req).catch(async (err: unknown) => { + if (done) { + return; + } + const ok = await openSidePanelFallback(tabId, id, req); + if (!ok && !done) { + const reason = err instanceof Error ? err.message : "the overlay could not be shown"; + finish({ responded: false, error: reason, annotations: [] }); + } + // When ok: keep waiting — the user opens the panel and the existing message + // listener / timeout settle the request. }); return { diff --git a/apps/browser-extension/src/entrypoints/background.ts b/apps/browser-extension/src/entrypoints/background.ts index 39be4a3..7f5de59 100644 --- a/apps/browser-extension/src/entrypoints/background.ts +++ b/apps/browser-extension/src/entrypoints/background.ts @@ -5,9 +5,10 @@ import { CdpExecutor } from "../background/cdp-executor"; import { CommandRouter } from "../background/command-router"; import { ContentExecutor } from "../background/content-executor"; import { detectCapabilities, type Executor, resolveExecutorKind } from "../background/executor"; +import { getActiveFeedbackSession } from "../background/feedback-side-panel"; import { GroupRegistry } from "../background/group-registry"; import { getSettings, getStatus, saveSettings, setStatus } from "../shared/db"; -import type { UiMessage, UiMessageResponse } from "../shared/messages"; +import type { FeedbackPendingResponse, UiMessage, UiMessageResponse } from "../shared/messages"; import type { ExecutorKind, ExecutorMode } from "../shared/types"; export default defineBackground(() => { @@ -71,19 +72,30 @@ export default defineBackground(() => { await client.connect(); })(); - // Control channel from the popup / dashboard. - chrome.runtime.onMessage.addListener((message: UiMessage, _sender, sendResponse) => { - void (async () => { - if (message.type === "reconnect") { - await rebuild(); - await client.reconnect(); - } else if (message.type === "disconnect") { - client.disconnect(); - } - const status = await getStatus(); - sendResponse({ status } satisfies UiMessageResponse); - })(); - return true; // keep the message channel open for the async response + // Control channel from the popup / dashboard / side panel. + chrome.runtime.onMessage.addListener((message: unknown, _sender, sendResponse) => { + const type = (message as { type?: string })?.type; + // Side panel asks what request (if any) it should render. + if (type === "feedback:get-pending") { + sendResponse({ session: getActiveFeedbackSession() } satisfies FeedbackPendingResponse); + return true; + } + if (type === "reconnect" || type === "disconnect" || type === "get_status") { + void (async () => { + const m = message as UiMessage; + if (m.type === "reconnect") { + await rebuild(); + await client.reconnect(); + } else if (m.type === "disconnect") { + client.disconnect(); + } + const status = await getStatus(); + sendResponse({ status } satisfies UiMessageResponse); + })(); + return true; // keep the message channel open for the async response + } + // Not ours (e.g. ocb-feedback-result) — handled by the per-request listener. + return false; }); // Keep the registry honest when the user closes a driven tab by hand. diff --git a/apps/browser-extension/src/entrypoints/sidepanel/App.tsx b/apps/browser-extension/src/entrypoints/sidepanel/App.tsx new file mode 100644 index 0000000..23615d0 --- /dev/null +++ b/apps/browser-extension/src/entrypoints/sidepanel/App.tsx @@ -0,0 +1,279 @@ +import { useCallback, useEffect, useRef, useState } from "react"; + +import { Button } from "../../components/ui/button"; +import type { + FeedbackPendingResponse, + FeedbackResultMessage, + FeedbackSession +} from "../../shared/messages"; + +/** Modes that annotate over the screenshot (vs. the button-only confirm/choose). */ +const SPATIAL = new Set(["point", "element", "comment", "region"]); + +export function App() { + const session = usePendingSession(); + + return ( +
+
+ + OpenCode Browser + feedback +
+ {session ? ( + + ) : ( +

+ No feedback request right now. When an agent asks for input on a page it can't draw on, it + appears here. +

+ )} +
+ ); +} + +/** Track the request the background says is pending, re-querying on change. */ +function usePendingSession(): FeedbackSession | null { + const [session, setSession] = useState(null); + useEffect(() => { + let alive = true; + const load = (): void => { + void chrome.runtime + .sendMessage({ type: "feedback:get-pending" }) + .then((r: FeedbackPendingResponse | undefined) => { + if (alive) { + setSession(r?.session ?? null); + } + }) + .catch(() => {}); + }; + load(); + const onMsg = (m: unknown): void => { + if ((m as { type?: string })?.type === "feedback:pending-changed") { + load(); + } + }; + chrome.runtime.onMessage.addListener(onMsg); + return () => { + alive = false; + chrome.runtime.onMessage.removeListener(onMsg); + }; + }, []); + return session; +} + +function respond(id: string, responded: boolean, annotations: unknown[]): void { + void chrome.runtime + .sendMessage({ + type: "ocb-feedback-result", + id, + responded, + annotations + } satisfies FeedbackResultMessage) + .catch(() => {}); +} + +function Request({ session }: { session: FeedbackSession }) { + const [done, setDone] = useState(false); + const finish = useCallback( + (responded: boolean, annotations: unknown[]) => { + respond(session.id, responded, annotations); + setDone(true); + }, + [session.id] + ); + + if (done) { + return ( +

Thanks — you can close this panel.

+ ); + } + + return ( +
+

{session.prompt || defaultPrompt(session.mode)}

+ {session.mode === "confirm" ? ( +
+ + +
+ ) : session.mode === "choose" ? ( +
+ {(session.options ?? []).map((opt) => ( + + ))} + +
+ ) : SPATIAL.has(session.mode) ? ( + + ) : ( +
+ +
+ )} +
+ ); +} + +function defaultPrompt(mode: string): string { + if (mode === "region") { + return "Drag a box over the area you mean."; + } + if (mode === "comment") { + return "Click a spot, then add a note."; + } + return "Click the element you mean."; +} + +interface Rect { + x: number; + y: number; + width: number; + height: number; +} + +/** Click/drag over the captured screenshot; coordinates map to screenshot pixels. */ +function ScreenshotAnnotator({ + session, + onFinish +}: { + session: FeedbackSession; + onFinish: (responded: boolean, annotations: unknown[]) => void; +}) { + const imgRef = useRef(null); + const isRegion = session.mode === "region"; + const [point, setPoint] = useState<{ dx: number; dy: number } | null>(null); // display px + const [band, setBand] = useState(null); // display px + const drag = useRef<{ x: number; y: number } | null>(null); + const [note, setNote] = useState(""); + + // Display px (relative to the image box) → natural screenshot px. + const toNatural = (dx: number, dy: number): { x: number; y: number } => { + const img = imgRef.current; + if (!img) { + return { x: dx, y: dy }; + } + const r = img.getBoundingClientRect(); + return { + x: Math.round((dx / r.width) * img.naturalWidth), + y: Math.round((dy / r.height) * img.naturalHeight) + }; + }; + const local = (e: { clientX: number; clientY: number }): { dx: number; dy: number } => { + const r = imgRef.current?.getBoundingClientRect(); + return { dx: e.clientX - (r?.left ?? 0), dy: e.clientY - (r?.top ?? 0) }; + }; + + const onDown = (e: React.MouseEvent): void => { + const { dx, dy } = local(e); + if (isRegion) { + drag.current = { x: dx, y: dy }; + setBand({ x: dx, y: dy, width: 0, height: 0 }); + } else { + setPoint({ dx, dy }); + } + }; + const onMove = (e: React.MouseEvent): void => { + if (!isRegion || !drag.current) { + return; + } + const { dx, dy } = local(e); + const s = drag.current; + setBand({ + x: Math.min(s.x, dx), + y: Math.min(s.y, dy), + width: Math.abs(dx - s.x), + height: Math.abs(dy - s.y) + }); + }; + const onUp = (): void => { + drag.current = null; + }; + + const canSubmit = isRegion ? band !== null && band.width > 3 : point !== null; + + const submit = (): void => { + const text = note.trim(); + if (isRegion && band) { + const tl = toNatural(band.x, band.y); + const br = toNatural(band.x + band.width, band.y + band.height); + onFinish(true, [ + { + kind: "region", + rect: { x: tl.x, y: tl.y, width: br.x - tl.x, height: br.y - tl.y }, + refs: [], + ...(text ? { text } : {}) + } + ]); + } else if (point) { + const p = toNatural(point.dx, point.dy); + onFinish(true, [{ kind: "point", x: p.x, y: p.y, ...(text ? { text } : {}) }]); + } + }; + + return ( +
+
+ {/* biome-ignore lint/a11y/noStaticElementInteractions: an annotation canvas over a screenshot */} +
+ Page screenshot — click or drag to mark what you mean + {point && !isRegion ? ( + + ) : null} + {band && isRegion ? ( + + ) : null} +
+
+ + {session.mode === "comment" ? ( +