From 1f9ee889ec9da0b1707f0836c678f950725dc05a Mon Sep 17 00:00:00 2001 From: JeffreyChen Date: Fri, 19 Jun 2026 16:51:14 +0800 Subject: [PATCH] Add Set-of-Marks overlay for VLM element grounding --- README.md | 8 + README/README_zh-CN.md | 8 + README/README_zh-TW.md | 8 + .../Eng/doc/new_features/v22_features_doc.rst | 51 +++++++ docs/source/Eng/eng_index.rst | 1 + .../Zh/doc/new_features/v22_features_doc.rst | 47 ++++++ docs/source/Zh/zh_index.rst | 1 + je_auto_control/__init__.py | 6 + .../gui/script_builder/command_schema.py | 18 +++ .../utils/executor/action_executor.py | 15 ++ .../utils/mcp_server/tools/_factories.py | 28 +++- .../utils/mcp_server/tools/_handlers.py | 10 ++ .../utils/set_of_marks/__init__.py | 10 ++ .../utils/set_of_marks/set_of_marks.py | 140 ++++++++++++++++++ .../headless/test_set_of_marks_batch.py | 72 +++++++++ 15 files changed, 422 insertions(+), 1 deletion(-) create mode 100644 docs/source/Eng/doc/new_features/v22_features_doc.rst create mode 100644 docs/source/Zh/doc/new_features/v22_features_doc.rst create mode 100644 je_auto_control/utils/set_of_marks/__init__.py create mode 100644 je_auto_control/utils/set_of_marks/set_of_marks.py create mode 100644 test/unit_test/headless/test_set_of_marks_batch.py diff --git a/README.md b/README.md index e6ab0c1f..e9a8a9fc 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,7 @@ ## Table of Contents +- [What's new (2026-06-19) — Set-of-Marks Overlay](#whats-new-2026-06-19--set-of-marks-overlay) - [What's new (2026-06-19) — Checkpoint & Resume](#whats-new-2026-06-19--checkpoint--resume) - [What's new (2026-06-19) — i18n / l10n Testing](#whats-new-2026-06-19--i18n--l10n-testing) - [What's new (2026-06-19) — Data Quality](#whats-new-2026-06-19--data-quality) @@ -74,6 +75,13 @@ --- +## What's new (2026-06-19) — Set-of-Marks Overlay + +The standard VLM-grounding format, full stack. 
Full reference: [`docs/source/Eng/doc/new_features/v22_features_doc.rst`](docs/source/Eng/doc/new_features/v22_features_doc.rst). + +- **Number elements** — `mark_elements` / `render_marks` / `resolve_mark` (pure functions + Pillow): assign `1..N` to interactable elements (with centre/role/text), draw numbered red boxes on a screenshot, and map a chosen number back to its element — so a VLM picks a *number* instead of guessing pixels (directly strengthens the existing VLM locator). +- **Mark-then-click loop** — `mark_screen(render_path=...)` / `mark_click(n)` (`AC_mark_screen` / `AC_mark_click`, `ac_*`): number the live a11y tree (+ optional overlay screenshot), feed marks+image to a model, then click mark `n`. + ## What's new (2026-06-19) — Checkpoint & Resume Durable execution for long flows + a `py.typed` marker, full stack. Full reference: [`docs/source/Eng/doc/new_features/v21_features_doc.rst`](docs/source/Eng/doc/new_features/v21_features_doc.rst). diff --git a/README/README_zh-CN.md b/README/README_zh-CN.md index 95630b48..3c82946d 100644 --- a/README/README_zh-CN.md +++ b/README/README_zh-CN.md @@ -12,6 +12,7 @@ ## 目录 +- [本次更新 (2026-06-19) — Set-of-Marks 叠图](#本次更新-2026-06-19--set-of-marks-叠图) - [本次更新 (2026-06-19) — 检查点与续跑](#本次更新-2026-06-19--检查点与续跑) - [本次更新 (2026-06-19) — i18n / l10n 测试](#本次更新-2026-06-19--i18n--l10n-测试) - [本次更新 (2026-06-19) — 数据质量](#本次更新-2026-06-19--数据质量) @@ -73,6 +74,13 @@ --- +## 本次更新 (2026-06-19) — Set-of-Marks 叠图 + +VLM 定位的标准格式,走完整五层。完整参考:[`docs/source/Zh/doc/new_features/v22_features_doc.rst`](../docs/source/Zh/doc/new_features/v22_features_doc.rst)。 + +- **元素标号** — `mark_elements` / `render_marks` / `resolve_mark`(纯函数 + Pillow):为可交互元素指派 `1..N`(含中心/role/text),在截图上画编号红框,并把选到的编号对应回元素——让 VLM 挑*编号*而非猜像素(直接强化既有 VLM locator)。 +- **标号后点击循环** — `mark_screen(render_path=...)` / `mark_click(n)`(`AC_mark_screen` / `AC_mark_click`、`ac_*`):为实时 a11y 树标号(+可选叠图截图),把 marks+图像喂给模型,再点击第 `n` 号。 + ## 本次更新 (2026-06-19) — 检查点与续跑 长流程的耐久执行 + `py.typed` 
标记,走完整五层。完整参考:[`docs/source/Zh/doc/new_features/v21_features_doc.rst`](../docs/source/Zh/doc/new_features/v21_features_doc.rst)。 diff --git a/README/README_zh-TW.md b/README/README_zh-TW.md index d8993bbe..abda423c 100644 --- a/README/README_zh-TW.md +++ b/README/README_zh-TW.md @@ -12,6 +12,7 @@ ## 目錄 +- [本次更新 (2026-06-19) — Set-of-Marks 疊圖](#本次更新-2026-06-19--set-of-marks-疊圖) - [本次更新 (2026-06-19) — 檢查點與續跑](#本次更新-2026-06-19--檢查點與續跑) - [本次更新 (2026-06-19) — i18n / l10n 測試](#本次更新-2026-06-19--i18n--l10n-測試) - [本次更新 (2026-06-19) — 資料品質](#本次更新-2026-06-19--資料品質) @@ -73,6 +74,13 @@ --- +## 本次更新 (2026-06-19) — Set-of-Marks 疊圖 + +VLM 定位的標準格式,走完整五層。完整參考:[`docs/source/Zh/doc/new_features/v22_features_doc.rst`](../docs/source/Zh/doc/new_features/v22_features_doc.rst)。 + +- **元素標號** — `mark_elements` / `render_marks` / `resolve_mark`(純函式 + Pillow):為可互動元素指派 `1..N`(含中心/role/text),在截圖上畫編號紅框,並把選到的編號對應回元素——讓 VLM 挑*編號*而非猜像素(直接強化既有 VLM locator)。 +- **標號後點擊迴圈** — `mark_screen(render_path=...)` / `mark_click(n)`(`AC_mark_screen` / `AC_mark_click`、`ac_*`):為即時 a11y 樹標號(+可選疊圖截圖),把 marks+影像餵給模型,再點擊第 `n` 號。 + ## 本次更新 (2026-06-19) — 檢查點與續跑 長流程的耐久執行 + `py.typed` 標記,走完整五層。完整參考:[`docs/source/Zh/doc/new_features/v21_features_doc.rst`](../docs/source/Zh/doc/new_features/v21_features_doc.rst)。 diff --git a/docs/source/Eng/doc/new_features/v22_features_doc.rst b/docs/source/Eng/doc/new_features/v22_features_doc.rst new file mode 100644 index 00000000..9b160723 --- /dev/null +++ b/docs/source/Eng/doc/new_features/v22_features_doc.rst @@ -0,0 +1,51 @@ +================================================== +New Features (2026-06-19) — Set-of-Marks Overlay +================================================== + +Modern GUI agents ground far more reliably when shown a screenshot with +**numbered boxes** over the interactable elements plus an ``id -> bbox`` +legend ("Set-of-Marks" prompting): the model picks a *number* instead of +guessing pixel coordinates. 
This turns AutoControl's existing element +sources into that two-stage "mark then pick a number" loop and resolves the +chosen number back to a click. Pure standard library + Pillow (already a +dependency); wired through the full stack. + +.. contents:: + :local: + :depth: 2 + + +Numbering and the legend +======================= + +:: + + from je_auto_control import mark_elements, render_marks, resolve_mark + + marks = mark_elements(elements) # [{id, bbox, center, role, text}, ...] + legend = [(m["id"], m["text"]) for m in marks] + annotated_png = render_marks(screenshot_png_bytes, marks) + chosen = resolve_mark(marks, 3) # the element the model picked + +``mark_elements`` assigns ``1..N`` to every element with valid bounds and +records its centre; ``render_marks`` draws numbered red boxes on a PNG; +``resolve_mark`` maps a number back to its mark. These are pure and +unit-testable with synthetic elements. + + +Live "mark then click" loop +========================== + +:: + + from je_auto_control import mark_screen, mark_click + + result = mark_screen(render_path="marked.png") # numbers the live a11y tree + # ... feed result["marks"] + marked.png to a VLM, get back a number ... + mark_click(3) # click mark #3 + +``mark_screen`` numbers the live accessibility elements (and optionally +saves a numbered-box overlay screenshot), caching the marks; ``mark_click`` +resolves a number from that cache and clicks the element's centre. Exposed +as ``AC_mark_screen`` / ``AC_mark_click`` (and ``ac_mark_screen`` / +``ac_mark_click``). diff --git a/docs/source/Eng/eng_index.rst b/docs/source/Eng/eng_index.rst index 48989a48..44c04b96 100644 --- a/docs/source/Eng/eng_index.rst +++ b/docs/source/Eng/eng_index.rst @@ -44,6 +44,7 @@ Comprehensive guides for all AutoControl features. 
doc/new_features/v19_features_doc doc/new_features/v20_features_doc doc/new_features/v21_features_doc + doc/new_features/v22_features_doc doc/ocr_backends/ocr_backends_doc doc/observability/observability_doc doc/operations_layer/operations_layer_doc diff --git a/docs/source/Zh/doc/new_features/v22_features_doc.rst b/docs/source/Zh/doc/new_features/v22_features_doc.rst new file mode 100644 index 00000000..b4b4c7f9 --- /dev/null +++ b/docs/source/Zh/doc/new_features/v22_features_doc.rst @@ -0,0 +1,47 @@ +========================================== +新功能 (2026-06-19) — Set-of-Marks 疊圖 +========================================== + +現代 GUI agent 在看到「畫上**編號方框**的截圖 + ``id -> bbox`` 圖例」時 +定位會可靠得多(Set-of-Marks prompting):模型挑一個*編號*,而不是猜 +像素座標。本功能把 AutoControl 既有的元素來源轉成這種「先標號、再挑號」 +的兩階段流程,並把選到的編號解析回一次點擊。純標準庫 + Pillow(已是相依); +走完整五層。 + +.. contents:: + :local: + :depth: 2 + + +標號與圖例 +========== + +:: + + from je_auto_control import mark_elements, render_marks, resolve_mark + + marks = mark_elements(elements) # [{id, bbox, center, role, text}, ...] + legend = [(m["id"], m["text"]) for m in marks] + annotated_png = render_marks(screenshot_png_bytes, marks) + chosen = resolve_mark(marks, 3) # 模型挑中的元素 + +``mark_elements`` 會為每個有有效 bounds 的元素指派 ``1..N`` 並記錄中心點; +``render_marks`` 在 PNG 上畫出編號紅框;``resolve_mark`` 把編號對應回該 +標記。這些都是純函式,可用合成元素做單元測試。 + + +即時「標號後點擊」迴圈 +====================== + +:: + + from je_auto_control import mark_screen, mark_click + + result = mark_screen(render_path="marked.png") # 為即時 a11y 樹標號 + # ... 把 result["marks"] + marked.png 餵給 VLM,取回一個編號 ... 
+ mark_click(3) # 點擊第 3 號標記 + +``mark_screen`` 為即時 accessibility 元素標號(並可另存編號方框疊圖截圖), +並快取這些標記;``mark_click`` 從快取解析編號並點擊該元素中心。對應 +``AC_mark_screen`` / ``AC_mark_click``(以及 ``ac_mark_screen`` / +``ac_mark_click``)。 diff --git a/docs/source/Zh/zh_index.rst b/docs/source/Zh/zh_index.rst index cf3cb61b..28ec678a 100644 --- a/docs/source/Zh/zh_index.rst +++ b/docs/source/Zh/zh_index.rst @@ -44,6 +44,7 @@ AutoControl 所有功能的完整使用指南。 doc/new_features/v19_features_doc doc/new_features/v20_features_doc doc/new_features/v21_features_doc + doc/new_features/v22_features_doc doc/ocr_backends/ocr_backends_doc doc/observability/observability_doc doc/operations_layer/operations_layer_doc diff --git a/je_auto_control/__init__.py b/je_auto_control/__init__.py index 11abc90b..c087b231 100644 --- a/je_auto_control/__init__.py +++ b/je_auto_control/__init__.py @@ -165,6 +165,10 @@ from je_auto_control.utils.checkpoint import ( Checkpoint, CheckpointStore, run_resumable, ) +# Set-of-Marks overlay (number elements for VLM grounding) +from je_auto_control.utils.set_of_marks import ( + mark_click, mark_elements, mark_screen, render_marks, resolve_mark, +) # Background popup/interrupt watchdog (unattended automation) from je_auto_control.utils.watchdog import ( PopupWatchdog, WatchdogRule, default_popup_watchdog, @@ -588,6 +592,8 @@ def start_autocontrol_gui(*args, **kwargs): "check_catalog", "check_overflow", "pseudo_localize", "pseudo_localize_catalog", "Checkpoint", "CheckpointStore", "run_resumable", + "mark_click", "mark_elements", "mark_screen", "render_marks", + "resolve_mark", # MCP server "AuditLogger", "HttpMCPServer", "MCPContent", "MCPPrompt", "MCPPromptArgument", "MCPResource", "MCPServer", "MCPTool", diff --git a/je_auto_control/gui/script_builder/command_schema.py b/je_auto_control/gui/script_builder/command_schema.py index 0c51ad84..6e84d913 100644 --- a/je_auto_control/gui/script_builder/command_schema.py +++ b/je_auto_control/gui/script_builder/command_schema.py @@ -661,6 
+661,24 @@ def _add_misc_specs(specs: List[CommandSpec]) -> None: _add_data_quality_specs(specs) _add_i18n_specs(specs) _add_checkpoint_specs(specs) + _add_set_of_marks_specs(specs) + + +def _add_set_of_marks_specs(specs: List[CommandSpec]) -> None: + specs.append(CommandSpec( + "AC_mark_screen", "Native UI", "Set-of-Marks: Number Elements", + fields=( + FieldSpec("app_name", FieldType.STRING, optional=True), + FieldSpec("render_path", FieldType.FILE_PATH, optional=True), + ), + description="Number live UI elements (id->bbox legend) for VLM " + "grounding; optional numbered-box overlay screenshot.", + )) + specs.append(CommandSpec( + "AC_mark_click", "Native UI", "Set-of-Marks: Click Number", + fields=(FieldSpec("mark_id", FieldType.INT),), + description="Click the element behind a numbered mark.", + )) def _add_checkpoint_specs(specs: List[CommandSpec]) -> None: diff --git a/je_auto_control/utils/executor/action_executor.py b/je_auto_control/utils/executor/action_executor.py index d8fb2c98..4bc97df6 100644 --- a/je_auto_control/utils/executor/action_executor.py +++ b/je_auto_control/utils/executor/action_executor.py @@ -2743,6 +2743,19 @@ def _checkpoint_clear(run_id: str, db: str) -> Dict[str, Any]: return {"cleared": CheckpointStore(db).clear(run_id)} +def _mark_screen(app_name: Optional[str] = None, + render_path: Optional[str] = None) -> Dict[str, Any]: + """Adapter: number live UI elements (Set-of-Marks) for VLM grounding.""" + from je_auto_control.utils.set_of_marks import mark_screen + return mark_screen(app_name=app_name, render_path=render_path) + + +def _mark_click(mark_id: int) -> Dict[str, Any]: + """Adapter: click the element behind a numbered mark.""" + from je_auto_control.utils.set_of_marks import mark_click + return {"clicked": mark_click(int(mark_id))} + + class Executor: """ Executor @@ -2953,6 +2966,8 @@ def __init__(self): "AC_run_resumable": _run_resumable, "AC_checkpoint_status": _checkpoint_status, "AC_checkpoint_clear": _checkpoint_clear, 
+ "AC_mark_screen": _mark_screen, + "AC_mark_click": _mark_click, "AC_a11y_record_start": _a11y_record_start, "AC_a11y_record_stop": _a11y_record_stop, "AC_a11y_record_events": _a11y_record_events, diff --git a/je_auto_control/utils/mcp_server/tools/_factories.py b/je_auto_control/utils/mcp_server/tools/_factories.py index 2c88ff6e..96c9bda2 100644 --- a/je_auto_control/utils/mcp_server/tools/_factories.py +++ b/je_auto_control/utils/mcp_server/tools/_factories.py @@ -2303,6 +2303,32 @@ def checkpoint_tools() -> List[MCPTool]: ] +def set_of_marks_tools() -> List[MCPTool]: + return [ + MCPTool( + name="ac_mark_screen", + description=("Set-of-Marks: number the live UI elements (a11y " + "tree) and return an id->bbox/center/role/text " + "legend for VLM grounding — the model picks a number " + "instead of pixels. Optionally render a numbered-box " + "overlay screenshot to 'render_path'."), + input_schema=schema({"app_name": {"type": "string"}, + "render_path": {"type": "string"}}), + handler=h.mark_screen, + annotations=SIDE_EFFECT_ONLY, + ), + MCPTool( + name="ac_mark_click", + description=("Click the element behind a numbered mark from the " + "last ac_mark_screen. 
Returns {clicked}."), + input_schema=schema({"mark_id": {"type": "integer"}}, + required=["mark_id"]), + handler=h.mark_click, + annotations=SIDE_EFFECT_ONLY, + ), + ] + + def unattended_tools() -> List[MCPTool]: return [ MCPTool( @@ -3357,7 +3383,7 @@ def media_assert_tools() -> List[MCPTool]: skill_library_tools, guardrail_tools, a2a_tools, office_tools, agent_memory_tools, determinism_tools, observer_tools, sbom_tools, sharding_tools, data_quality_tools, i18n_tools, - checkpoint_tools, + checkpoint_tools, set_of_marks_tools, screen_record_tools, process_and_shell_tools, remote_desktop_tools, gamepad_tools, usb_passthrough_tools, assertion_tools, data_source_tools, diff --git a/je_auto_control/utils/mcp_server/tools/_handlers.py b/je_auto_control/utils/mcp_server/tools/_handlers.py index ab3e4328..a802b3c9 100644 --- a/je_auto_control/utils/mcp_server/tools/_handlers.py +++ b/je_auto_control/utils/mcp_server/tools/_handlers.py @@ -1133,6 +1133,16 @@ def checkpoint_clear(run_id, db): return {"cleared": CheckpointStore(db).clear(run_id)} +def mark_screen(app_name=None, render_path=None): + from je_auto_control.utils.set_of_marks import mark_screen as _ms + return _ms(app_name=app_name, render_path=render_path) + + +def mark_click(mark_id): + from je_auto_control.utils.set_of_marks import mark_click as _mc + return {"clicked": _mc(int(mark_id))} + + def vlm_locate(description: str, screen_region: Optional[List[int]] = None, model: Optional[str] = None) -> Optional[List[int]]: diff --git a/je_auto_control/utils/set_of_marks/__init__.py b/je_auto_control/utils/set_of_marks/__init__.py new file mode 100644 index 00000000..c8d2026f --- /dev/null +++ b/je_auto_control/utils/set_of_marks/__init__.py @@ -0,0 +1,10 @@ +"""Set-of-Marks overlay — number on-screen elements for VLM grounding.""" +from je_auto_control.utils.set_of_marks.set_of_marks import ( + last_marks, mark_click, mark_elements, mark_screen, render_marks, + resolve_mark, +) + +__all__ = [ + "last_marks", 
"mark_click", "mark_elements", "mark_screen", + "render_marks", "resolve_mark", +] diff --git a/je_auto_control/utils/set_of_marks/set_of_marks.py b/je_auto_control/utils/set_of_marks/set_of_marks.py new file mode 100644 index 00000000..c7ffe1fe --- /dev/null +++ b/je_auto_control/utils/set_of_marks/set_of_marks.py @@ -0,0 +1,140 @@ +"""Set-of-Marks overlay — number on-screen elements for VLM grounding. + +Modern GUI agents ground far more reliably when shown a screenshot with +numbered boxes drawn over the interactable elements plus an ``id -> bbox`` +legend ("Set-of-Marks" prompting): the model picks a *number* instead of +guessing pixel coordinates. This turns AutoControl's existing element +sources (accessibility tree / OCR / template hits) into that two-stage +"mark then pick a number" loop, and resolves a chosen number back to a +click. + +The legend computation (:func:`mark_elements` / :func:`resolve_mark`) is +pure Python and unit-testable with synthetic elements; rendering uses +Pillow (already a dependency). Imports no ``PySide6``. +""" +import io +from pathlib import Path +from typing import Any, Dict, List, Optional + +_OUTLINE = (255, 0, 0) +_last_marks: List[Dict[str, Any]] = [] + + +def _bbox_of(element: Any) -> List[int]: + if isinstance(element, dict): + raw = element.get("bbox") or element.get("bounds") or [] + else: + raw = getattr(element, "bounds", []) or [] + return list(raw) + + +def _text_of(element: Any) -> str: + if isinstance(element, dict): + return str(element.get("text") or element.get("name") or "") + return str(getattr(element, "name", "") or "") + + +def _role_of(element: Any) -> str: + if isinstance(element, dict): + return str(element.get("role") or "") + return str(getattr(element, "role", "") or "") + + +def mark_elements(elements: List[Any]) -> List[Dict[str, Any]]: + """Assign a numeric mark to each element with a valid bounds. + + Returns ``[{id, bbox, center, role, text}]`` (ids start at 1). 
+ """ + marks: List[Dict[str, Any]] = [] + next_id = 1 + for element in elements: + bbox = _bbox_of(element) + if len(bbox) < 4: + continue + left, top, width, height = (int(bbox[0]), int(bbox[1]), + int(bbox[2]), int(bbox[3])) + marks.append({ + "id": next_id, "bbox": [left, top, width, height], + "center": [left + width // 2, top + height // 2], + "role": _role_of(element), "text": _text_of(element)}) + next_id += 1 + return marks + + +def resolve_mark(marks: List[Dict[str, Any]], + mark_id: int) -> Optional[Dict[str, Any]]: + """Return the mark whose ``id`` equals ``mark_id`` (or ``None``).""" + for mark in marks: + if mark["id"] == int(mark_id): + return mark + return None + + +def _draw_marks(image: Any, marks: List[Dict[str, Any]]) -> Any: + from PIL import ImageDraw + draw = ImageDraw.Draw(image) + for mark in marks: + left, top, width, height = mark["bbox"] + draw.rectangle([left, top, left + width, top + height], + outline=_OUTLINE, width=2) + label = str(mark["id"]) + draw.rectangle([left, top, left + 8 + 6 * len(label), top + 12], + fill=_OUTLINE) + draw.text((left + 2, top + 1), label, fill=(255, 255, 255)) + return image + + +def render_marks(image_bytes: bytes, + marks: List[Dict[str, Any]]) -> bytes: + """Draw numbered boxes for ``marks`` on a PNG; return annotated PNG bytes.""" + from PIL import Image + image = Image.open(io.BytesIO(image_bytes)).convert("RGB") + _draw_marks(image, marks) + out = io.BytesIO() + image.save(out, format="PNG") + return out.getvalue() + + +def last_marks() -> List[Dict[str, Any]]: + """Return a copy of the marks from the most recent :func:`mark_screen`.""" + return [dict(mark) for mark in _last_marks] + + +def mark_screen(app_name: Optional[str] = None, + render_path: Optional[str] = None) -> Dict[str, Any]: + """Number the live accessibility elements; optionally render an overlay. + + Stores the marks for a later :func:`mark_click`. When ``render_path`` is + given, a screenshot is captured, annotated, and saved there. 
+ """ + from je_auto_control.utils.accessibility.accessibility_api import ( + list_accessibility_elements) + marks = mark_elements(list_accessibility_elements(app_name=app_name)) + _last_marks.clear() + _last_marks.extend(marks) + result: Dict[str, Any] = {"marks": marks} + if render_path: + from je_auto_control.utils.cv2_utils.screenshot import pil_screenshot + image = _draw_marks(pil_screenshot().convert("RGB"), marks) + target = Path(render_path) + image.save(str(target), format="PNG") + result["image_path"] = str(target.resolve()) + return result + + +def mark_click(mark_id: int, + marks: Optional[List[Dict[str, Any]]] = None) -> bool: + """Click the centre of the marked element; return whether it matched. + + Uses ``marks`` if supplied, else the marks from the last + :func:`mark_screen`. + """ + mark = resolve_mark(marks if marks is not None else _last_marks, mark_id) + if mark is None: + return False + center_x, center_y = mark["center"] + from je_auto_control.wrapper.auto_control_mouse import ( + click_mouse, set_mouse_position) + set_mouse_position(center_x, center_y) + click_mouse("mouse_left", center_x, center_y) + return True diff --git a/test/unit_test/headless/test_set_of_marks_batch.py b/test/unit_test/headless/test_set_of_marks_batch.py new file mode 100644 index 00000000..94932aa2 --- /dev/null +++ b/test/unit_test/headless/test_set_of_marks_batch.py @@ -0,0 +1,72 @@ +"""Headless tests for the Set-of-Marks overlay (number elements for VLM +grounding). 
Pure stdlib + Pillow; no Qt imports, no live screen needed.""" +import io + +from types import SimpleNamespace + +import je_auto_control as ac +from je_auto_control.utils.set_of_marks import ( + mark_click, mark_elements, render_marks, resolve_mark) + + +def test_mark_elements_numbers_and_centers(): + elements = [ + {"bbox": [0, 0, 100, 20], "role": "button", "text": "OK"}, + SimpleNamespace(bounds=[10, 40, 60, 20], role="edit", name="user"), + {"bbox": [0, 0], "text": "bad"}, # invalid bounds -> skipped + ] + marks = mark_elements(elements) + assert [m["id"] for m in marks] == [1, 2] + assert marks[0]["center"] == [50, 10] + assert marks[1]["role"] == "edit" and marks[1]["text"] == "user" + + +def test_resolve_mark(): + marks = mark_elements([{"bbox": [0, 0, 10, 10]}, + {"bbox": [0, 0, 20, 20]}]) + assert resolve_mark(marks, 2)["bbox"] == [0, 0, 20, 20] + assert resolve_mark(marks, 99) is None + + +def test_render_marks_returns_png(): + from PIL import Image + buf = io.BytesIO() + Image.new("RGB", (80, 60), (255, 255, 255)).save(buf, format="PNG") + out = render_marks(buf.getvalue(), + [{"id": 1, "bbox": [5, 5, 30, 12], "center": [20, 11]}]) + assert out[:8] == b"\x89PNG\r\n\x1a\n" # valid PNG signature + assert len(out) > 0 + + +def test_mark_click_uses_supplied_marks(monkeypatch): + import je_auto_control.wrapper.auto_control_mouse as mouse + calls = {} + monkeypatch.setattr(mouse, "set_mouse_position", + lambda x, y: calls.setdefault("pos", (x, y))) + monkeypatch.setattr(mouse, "click_mouse", + lambda btn, x, y: calls.setdefault("click", (x, y))) + marks = [{"id": 7, "bbox": [0, 0, 40, 20], "center": [20, 10]}] + assert mark_click(7, marks) is True + assert calls["click"] == (20, 10) + assert mark_click(99, marks) is False # unknown id -> no click + + +# --- wiring --------------------------------------------------------------- + +def test_wiring(): + known = ac.executor.known_commands() + assert {"AC_mark_screen", "AC_mark_click"} <= known + from 
je_auto_control.utils.mcp_server.tools import ( + build_default_tool_registry) + names = {t.name for t in build_default_tool_registry()} + assert {"ac_mark_screen", "ac_mark_click"} <= names + from je_auto_control.gui.script_builder.command_schema import _build_specs + cmds = {s.command for s in _build_specs()} + assert {"AC_mark_screen", "AC_mark_click"} <= cmds + + +def test_facade_exports(): + for attr in ("mark_elements", "resolve_mark", "render_marks", + "mark_screen", "mark_click"): + assert hasattr(ac, attr) + assert attr in ac.__all__