From 1f9ee889ec9da0b1707f0836c678f950725dc05a Mon Sep 17 00:00:00 2001
From: JeffreyChen <zenxcvwait@gmail.com>
Date: Fri, 19 Jun 2026 16:51:14 +0800
Subject: [PATCH] Add Set-of-Marks overlay for VLM element grounding

---
 README.md                                     |   8 +
 README/README_zh-CN.md                        |   8 +
 README/README_zh-TW.md                        |   8 +
 .../Eng/doc/new_features/v22_features_doc.rst |  51 +++++++
 docs/source/Eng/eng_index.rst                 |   1 +
 .../Zh/doc/new_features/v22_features_doc.rst  |  47 ++++++
 docs/source/Zh/zh_index.rst                   |   1 +
 je_auto_control/__init__.py                   |   6 +
 .../gui/script_builder/command_schema.py      |  18 +++
 .../utils/executor/action_executor.py         |  15 ++
 .../utils/mcp_server/tools/_factories.py      |  28 +++-
 .../utils/mcp_server/tools/_handlers.py       |  10 ++
 .../utils/set_of_marks/__init__.py            |  10 ++
 .../utils/set_of_marks/set_of_marks.py        | 140 ++++++++++++++++++
 .../headless/test_set_of_marks_batch.py       |  72 +++++++++
 15 files changed, 422 insertions(+), 1 deletion(-)
 create mode 100644 docs/source/Eng/doc/new_features/v22_features_doc.rst
 create mode 100644 docs/source/Zh/doc/new_features/v22_features_doc.rst
 create mode 100644 je_auto_control/utils/set_of_marks/__init__.py
 create mode 100644 je_auto_control/utils/set_of_marks/set_of_marks.py
 create mode 100644 test/unit_test/headless/test_set_of_marks_batch.py

diff --git a/README.md b/README.md
index e6ab0c1f..e9a8a9fc 100644
--- a/README.md
+++ b/README.md
@@ -13,6 +13,7 @@
 
 ## Table of Contents
 
+- [What's new (2026-06-19) — Set-of-Marks Overlay](#whats-new-2026-06-19--set-of-marks-overlay)
 - [What's new (2026-06-19) — Checkpoint & Resume](#whats-new-2026-06-19--checkpoint--resume)
 - [What's new (2026-06-19) — i18n / l10n Testing](#whats-new-2026-06-19--i18n--l10n-testing)
 - [What's new (2026-06-19) — Data Quality](#whats-new-2026-06-19--data-quality)
@@ -74,6 +75,13 @@
 
 ---
 
+## What's new (2026-06-19) — Set-of-Marks Overlay
+
+The standard VLM-grounding format, full stack. Full reference: [`docs/source/Eng/doc/new_features/v22_features_doc.rst`](docs/source/Eng/doc/new_features/v22_features_doc.rst).
+
+- **Number elements** — `mark_elements` / `render_marks` / `resolve_mark` (pure + Pillow): assign `1..N` to interactable elements (with centre/role/text), draw numbered red boxes on a screenshot, and map a chosen number back to its element — so a VLM picks a *number* instead of guessing pixels (directly strengthens the existing VLM locator).
+- **Mark-then-click loop** — `mark_screen(render_path=...)` / `mark_click(n)` (`AC_mark_screen` / `AC_mark_click`, `ac_*`): number the live a11y tree (+ optional overlay screenshot), feed marks+image to a model, then click mark `n`.
+
 ## What's new (2026-06-19) — Checkpoint & Resume
 
 Durable execution for long flows + a `py.typed` marker, full stack. Full reference: [`docs/source/Eng/doc/new_features/v21_features_doc.rst`](docs/source/Eng/doc/new_features/v21_features_doc.rst).
diff --git a/README/README_zh-CN.md b/README/README_zh-CN.md
index 95630b48..3c82946d 100644
--- a/README/README_zh-CN.md
+++ b/README/README_zh-CN.md
@@ -12,6 +12,7 @@
 
 ## 目录
 
+- [本次更新 (2026-06-19) — Set-of-Marks 叠图](#本次更新-2026-06-19--set-of-marks-叠图)
 - [本次更新 (2026-06-19) — 检查点与续跑](#本次更新-2026-06-19--检查点与续跑)
 - [本次更新 (2026-06-19) — i18n / l10n 测试](#本次更新-2026-06-19--i18n--l10n-测试)
 - [本次更新 (2026-06-19) — 数据质量](#本次更新-2026-06-19--数据质量)
@@ -73,6 +74,13 @@
 
 ---
 
+## 本次更新 (2026-06-19) — Set-of-Marks 叠图
+
+VLM 定位的标准格式,走完整五层。完整参考:[`docs/source/Zh/doc/new_features/v22_features_doc.rst`](../docs/source/Zh/doc/new_features/v22_features_doc.rst)。
+
+- **元素标号** — `mark_elements` / `render_marks` / `resolve_mark`(纯函数 + Pillow):为可交互元素指派 `1..N`(含中心/role/text),在截图上画编号红框,并把选到的编号对应回元素——让 VLM 挑*编号*而非猜像素(直接强化既有 VLM locator)。
+- **标号后点击循环** — `mark_screen(render_path=...)` / `mark_click(n)`(`AC_mark_screen` / `AC_mark_click`、`ac_*`):为实时 a11y 树标号(+可选叠图截图),把 marks+图像喂给模型,再点击第 `n` 号。
+
 ## 本次更新 (2026-06-19) — 检查点与续跑
 
 长流程的耐久执行 + `py.typed` 标记,走完整五层。完整参考:[`docs/source/Zh/doc/new_features/v21_features_doc.rst`](../docs/source/Zh/doc/new_features/v21_features_doc.rst)。
diff --git a/README/README_zh-TW.md b/README/README_zh-TW.md
index d8993bbe..abda423c 100644
--- a/README/README_zh-TW.md
+++ b/README/README_zh-TW.md
@@ -12,6 +12,7 @@
 
 ## 目錄
 
+- [本次更新 (2026-06-19) — Set-of-Marks 疊圖](#本次更新-2026-06-19--set-of-marks-疊圖)
 - [本次更新 (2026-06-19) — 檢查點與續跑](#本次更新-2026-06-19--檢查點與續跑)
 - [本次更新 (2026-06-19) — i18n / l10n 測試](#本次更新-2026-06-19--i18n--l10n-測試)
 - [本次更新 (2026-06-19) — 資料品質](#本次更新-2026-06-19--資料品質)
@@ -73,6 +74,13 @@
 
 ---
 
+## 本次更新 (2026-06-19) — Set-of-Marks 疊圖
+
+VLM 定位的標準格式,走完整五層。完整參考:[`docs/source/Zh/doc/new_features/v22_features_doc.rst`](../docs/source/Zh/doc/new_features/v22_features_doc.rst)。
+
+- **元素標號** — `mark_elements` / `render_marks` / `resolve_mark`(純函式 + Pillow):為可互動元素指派 `1..N`(含中心/role/text),在截圖上畫編號紅框,並把選到的編號對應回元素——讓 VLM 挑*編號*而非猜像素(直接強化既有 VLM locator)。
+- **標號後點擊迴圈** — `mark_screen(render_path=...)` / `mark_click(n)`(`AC_mark_screen` / `AC_mark_click`、`ac_*`):為即時 a11y 樹標號(+可選疊圖截圖),把 marks+影像餵給模型,再點擊第 `n` 號。
+
 ## 本次更新 (2026-06-19) — 檢查點與續跑
 
 長流程的耐久執行 + `py.typed` 標記,走完整五層。完整參考:[`docs/source/Zh/doc/new_features/v21_features_doc.rst`](../docs/source/Zh/doc/new_features/v21_features_doc.rst)。
diff --git a/docs/source/Eng/doc/new_features/v22_features_doc.rst b/docs/source/Eng/doc/new_features/v22_features_doc.rst
new file mode 100644
index 00000000..9b160723
--- /dev/null
+++ b/docs/source/Eng/doc/new_features/v22_features_doc.rst
@@ -0,0 +1,51 @@
+==================================================
+New Features (2026-06-19) — Set-of-Marks Overlay
+==================================================
+
+Modern GUI agents ground far more reliably when shown a screenshot with
+**numbered boxes** over the interactable elements plus an ``id -> bbox``
+legend ("Set-of-Marks" prompting): the model picks a *number* instead of
+guessing pixel coordinates. This turns AutoControl's existing element
+sources into that two-stage "mark then pick a number" loop and resolves the
+chosen number back to a click. Pure standard library + Pillow (already a
+dependency); wired through the full stack.
+
+.. contents::
+   :local:
+   :depth: 2
+
+
+Numbering and the legend
+=======================
+
+::
+
+    from je_auto_control import mark_elements, render_marks, resolve_mark
+
+    marks = mark_elements(elements)   # [{id, bbox, center, role, text}, ...]
+    legend = [(m["id"], m["text"]) for m in marks]
+    annotated_png = render_marks(screenshot_png_bytes, marks)
+    chosen = resolve_mark(marks, 3)   # the element the model picked
+
+``mark_elements`` assigns ``1..N`` to every element with valid bounds and
+records its centre; ``render_marks`` draws numbered red boxes on a PNG;
+``resolve_mark`` maps a number back to its mark. These are pure and
+unit-testable with synthetic elements.
+
+
+Live "mark then click" loop
+==========================
+
+::
+
+    from je_auto_control import mark_screen, mark_click
+
+    result = mark_screen(render_path="marked.png")   # numbers the live a11y tree
+    # ... feed result["marks"] + marked.png to a VLM, get back a number ...
+    mark_click(3)                                     # click mark #3
+
+``mark_screen`` numbers the live accessibility elements (and optionally
+saves a numbered-box overlay screenshot), caching the marks; ``mark_click``
+resolves a number from that cache and clicks the element's centre. Exposed
+as ``AC_mark_screen`` / ``AC_mark_click`` (and ``ac_mark_screen`` /
+``ac_mark_click``).
diff --git a/docs/source/Eng/eng_index.rst b/docs/source/Eng/eng_index.rst
index 48989a48..44c04b96 100644
--- a/docs/source/Eng/eng_index.rst
+++ b/docs/source/Eng/eng_index.rst
@@ -44,6 +44,7 @@ Comprehensive guides for all AutoControl features.
    doc/new_features/v19_features_doc
    doc/new_features/v20_features_doc
    doc/new_features/v21_features_doc
+   doc/new_features/v22_features_doc
    doc/ocr_backends/ocr_backends_doc
    doc/observability/observability_doc
    doc/operations_layer/operations_layer_doc
diff --git a/docs/source/Zh/doc/new_features/v22_features_doc.rst b/docs/source/Zh/doc/new_features/v22_features_doc.rst
new file mode 100644
index 00000000..b4b4c7f9
--- /dev/null
+++ b/docs/source/Zh/doc/new_features/v22_features_doc.rst
@@ -0,0 +1,47 @@
+==========================================
+新功能 (2026-06-19) — Set-of-Marks 疊圖
+==========================================
+
+現代 GUI agent 在看到「畫上**編號方框**的截圖 + ``id -> bbox`` 圖例」時
+定位會可靠得多(Set-of-Marks prompting):模型挑一個*編號*,而不是猜
+像素座標。本功能把 AutoControl 既有的元素來源轉成這種「先標號、再挑號」
+的兩階段流程,並把選到的編號解析回一次點擊。純標準庫 + Pillow(已是相依);
+走完整五層。
+
+.. contents::
+   :local:
+   :depth: 2
+
+
+標號與圖例
+==========
+
+::
+
+    from je_auto_control import mark_elements, render_marks, resolve_mark
+
+    marks = mark_elements(elements)   # [{id, bbox, center, role, text}, ...]
+    legend = [(m["id"], m["text"]) for m in marks]
+    annotated_png = render_marks(screenshot_png_bytes, marks)
+    chosen = resolve_mark(marks, 3)   # 模型挑中的元素
+
+``mark_elements`` 會為每個有有效 bounds 的元素指派 ``1..N`` 並記錄中心點;
+``render_marks`` 在 PNG 上畫出編號紅框;``resolve_mark`` 把編號對應回該
+標記。這些都是純函式,可用合成元素做單元測試。
+
+
+即時「標號後點擊」迴圈
+======================
+
+::
+
+    from je_auto_control import mark_screen, mark_click
+
+    result = mark_screen(render_path="marked.png")   # 為即時 a11y 樹標號
+    # ... 把 result["marks"] + marked.png 餵給 VLM,取回一個編號 ...
+    mark_click(3)                                     # 點擊第 3 號標記
+
+``mark_screen`` 為即時 accessibility 元素標號(並可另存編號方框疊圖截圖),
+並快取這些標記;``mark_click`` 從快取解析編號並點擊該元素中心。對應
+``AC_mark_screen`` / ``AC_mark_click``(以及 ``ac_mark_screen`` /
+``ac_mark_click``)。
diff --git a/docs/source/Zh/zh_index.rst b/docs/source/Zh/zh_index.rst
index cf3cb61b..28ec678a 100644
--- a/docs/source/Zh/zh_index.rst
+++ b/docs/source/Zh/zh_index.rst
@@ -44,6 +44,7 @@ AutoControl 所有功能的完整使用指南。
    doc/new_features/v19_features_doc
    doc/new_features/v20_features_doc
    doc/new_features/v21_features_doc
+   doc/new_features/v22_features_doc
    doc/ocr_backends/ocr_backends_doc
    doc/observability/observability_doc
    doc/operations_layer/operations_layer_doc
diff --git a/je_auto_control/__init__.py b/je_auto_control/__init__.py
index 11abc90b..c087b231 100644
--- a/je_auto_control/__init__.py
+++ b/je_auto_control/__init__.py
@@ -165,6 +165,10 @@
 from je_auto_control.utils.checkpoint import (
     Checkpoint, CheckpointStore, run_resumable,
 )
+# Set-of-Marks overlay (number elements for VLM grounding)
+from je_auto_control.utils.set_of_marks import (
+    mark_click, mark_elements, mark_screen, render_marks, resolve_mark,
+)
 # Background popup/interrupt watchdog (unattended automation)
 from je_auto_control.utils.watchdog import (
     PopupWatchdog, WatchdogRule, default_popup_watchdog,
@@ -588,6 +592,8 @@ def start_autocontrol_gui(*args, **kwargs):
     "check_catalog", "check_overflow", "pseudo_localize",
     "pseudo_localize_catalog",
     "Checkpoint", "CheckpointStore", "run_resumable",
+    "mark_click", "mark_elements", "mark_screen", "render_marks",
+    "resolve_mark",
     # MCP server
     "AuditLogger", "HttpMCPServer", "MCPContent", "MCPPrompt",
     "MCPPromptArgument", "MCPResource", "MCPServer", "MCPTool",
diff --git a/je_auto_control/gui/script_builder/command_schema.py b/je_auto_control/gui/script_builder/command_schema.py
index 0c51ad84..6e84d913 100644
--- a/je_auto_control/gui/script_builder/command_schema.py
+++ b/je_auto_control/gui/script_builder/command_schema.py
@@ -661,6 +661,24 @@ def _add_misc_specs(specs: List[CommandSpec]) -> None:
     _add_data_quality_specs(specs)
     _add_i18n_specs(specs)
     _add_checkpoint_specs(specs)
+    _add_set_of_marks_specs(specs)
+
+
+def _add_set_of_marks_specs(specs: List[CommandSpec]) -> None:
+    """Append the two Set-of-Marks commands ("Native UI") to ``specs``."""
+    mark_screen_fields = (
+        FieldSpec("app_name", FieldType.STRING, optional=True),
+        FieldSpec("render_path", FieldType.FILE_PATH, optional=True),
+    )
+    specs.append(CommandSpec(
+        "AC_mark_screen", "Native UI", "Set-of-Marks: Number Elements",
+        fields=mark_screen_fields,
+        description="Number live UI elements (id->bbox legend) for VLM "
+                    "grounding; optional numbered-box overlay screenshot."))
+    specs.append(CommandSpec(
+        "AC_mark_click", "Native UI", "Set-of-Marks: Click Number",
+        fields=(FieldSpec("mark_id", FieldType.INT),),
+        description="Click the element behind a numbered mark."))
 
 
 def _add_checkpoint_specs(specs: List[CommandSpec]) -> None:
diff --git a/je_auto_control/utils/executor/action_executor.py b/je_auto_control/utils/executor/action_executor.py
index d8fb2c98..4bc97df6 100644
--- a/je_auto_control/utils/executor/action_executor.py
+++ b/je_auto_control/utils/executor/action_executor.py
@@ -2743,6 +2743,19 @@ def _checkpoint_clear(run_id: str, db: str) -> Dict[str, Any]:
     return {"cleared": CheckpointStore(db).clear(run_id)}
 
 
+def _mark_screen(app_name: Optional[str] = None,
+                 render_path: Optional[str] = None) -> Dict[str, Any]:
+    """Executor adapter for ``AC_mark_screen``; forwards both arguments."""
+    from je_auto_control.utils import set_of_marks
+    return set_of_marks.mark_screen(app_name=app_name, render_path=render_path)
+
+
+def _mark_click(mark_id: int) -> Dict[str, Any]:
+    """Adapter for ``AC_mark_click``; returns ``{"clicked": bool}``."""
+    from je_auto_control.utils import set_of_marks
+    return {"clicked": set_of_marks.mark_click(int(mark_id))}
+
+
 class Executor:
     """
     Executor
@@ -2953,6 +2966,8 @@ def __init__(self):
             "AC_run_resumable": _run_resumable,
             "AC_checkpoint_status": _checkpoint_status,
             "AC_checkpoint_clear": _checkpoint_clear,
+            "AC_mark_screen": _mark_screen,
+            "AC_mark_click": _mark_click,
             "AC_a11y_record_start": _a11y_record_start,
             "AC_a11y_record_stop": _a11y_record_stop,
             "AC_a11y_record_events": _a11y_record_events,
diff --git a/je_auto_control/utils/mcp_server/tools/_factories.py b/je_auto_control/utils/mcp_server/tools/_factories.py
index 2c88ff6e..96c9bda2 100644
--- a/je_auto_control/utils/mcp_server/tools/_factories.py
+++ b/je_auto_control/utils/mcp_server/tools/_factories.py
@@ -2303,6 +2303,32 @@ def checkpoint_tools() -> List[MCPTool]:
     ]
 
 
+def set_of_marks_tools() -> List[MCPTool]:
+    """Build the ``ac_mark_screen`` and ``ac_mark_click`` MCP tools."""
+    mark_screen_tool = MCPTool(
+        name="ac_mark_screen",
+        description=("Set-of-Marks: number the live UI elements (a11y "
+                     "tree) and return an id->bbox/center/role/text "
+                     "legend for VLM grounding — the model picks a number "
+                     "instead of pixels. Optionally render a numbered-box "
+                     "overlay screenshot to 'render_path'."),
+        input_schema=schema({"app_name": {"type": "string"},
+                             "render_path": {"type": "string"}}),
+        handler=h.mark_screen,
+        annotations=SIDE_EFFECT_ONLY,
+    )
+    mark_click_tool = MCPTool(
+        name="ac_mark_click",
+        description=("Click the element behind a numbered mark from the "
+                     "last ac_mark_screen. Returns {clicked}."),
+        input_schema=schema({"mark_id": {"type": "integer"}},
+                            required=["mark_id"]),
+        handler=h.mark_click,
+        annotations=SIDE_EFFECT_ONLY,
+    )
+    return [mark_screen_tool, mark_click_tool]
+
+
 def unattended_tools() -> List[MCPTool]:
     return [
         MCPTool(
@@ -3357,7 +3383,7 @@ def media_assert_tools() -> List[MCPTool]:
     skill_library_tools, guardrail_tools, a2a_tools, office_tools,
     agent_memory_tools, determinism_tools, observer_tools,
     sbom_tools, sharding_tools, data_quality_tools, i18n_tools,
-    checkpoint_tools,
+    checkpoint_tools, set_of_marks_tools,
     screen_record_tools,
     process_and_shell_tools, remote_desktop_tools, gamepad_tools,
     usb_passthrough_tools, assertion_tools, data_source_tools,
diff --git a/je_auto_control/utils/mcp_server/tools/_handlers.py b/je_auto_control/utils/mcp_server/tools/_handlers.py
index ab3e4328..a802b3c9 100644
--- a/je_auto_control/utils/mcp_server/tools/_handlers.py
+++ b/je_auto_control/utils/mcp_server/tools/_handlers.py
@@ -1133,6 +1133,16 @@ def checkpoint_clear(run_id, db):
     return {"cleared": CheckpointStore(db).clear(run_id)}
 
 
+def mark_screen(app_name=None, render_path=None):
+    from je_auto_control.utils import set_of_marks  # resolved at call time
+    return set_of_marks.mark_screen(app_name=app_name, render_path=render_path)
+
+
+def mark_click(mark_id):
+    from je_auto_control.utils import set_of_marks  # resolved at call time
+    return {"clicked": set_of_marks.mark_click(int(mark_id))}
+
+
 def vlm_locate(description: str,
                screen_region: Optional[List[int]] = None,
                model: Optional[str] = None) -> Optional[List[int]]:
diff --git a/je_auto_control/utils/set_of_marks/__init__.py b/je_auto_control/utils/set_of_marks/__init__.py
new file mode 100644
index 00000000..c8d2026f
--- /dev/null
+++ b/je_auto_control/utils/set_of_marks/__init__.py
@@ -0,0 +1,10 @@
+"""Set-of-Marks overlay — number on-screen elements for VLM grounding."""
+from je_auto_control.utils.set_of_marks.set_of_marks import (
+    last_marks, mark_click, mark_elements, mark_screen, render_marks,
+    resolve_mark,
+)
+
+__all__ = [
+    "last_marks", "mark_click", "mark_elements", "mark_screen",
+    "render_marks", "resolve_mark",
+]
diff --git a/je_auto_control/utils/set_of_marks/set_of_marks.py b/je_auto_control/utils/set_of_marks/set_of_marks.py
new file mode 100644
index 00000000..c7ffe1fe
--- /dev/null
+++ b/je_auto_control/utils/set_of_marks/set_of_marks.py
@@ -0,0 +1,140 @@
+"""Set-of-Marks overlay — number on-screen elements for VLM grounding.
+
+Modern GUI agents ground far more reliably when shown a screenshot with
+numbered boxes drawn over the interactable elements plus an ``id -> bbox``
+legend ("Set-of-Marks" prompting): the model picks a *number* instead of
+guessing pixel coordinates. This turns AutoControl's existing element
+sources (accessibility tree / OCR / template hits) into that two-stage
+"mark then pick a number" loop, and resolves a chosen number back to a
+click.
+
+The legend computation (:func:`mark_elements` / :func:`resolve_mark`) is
+pure Python and unit-testable with synthetic elements; rendering uses
+Pillow (already a dependency). Imports no ``PySide6``.
+"""
+import io
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+_OUTLINE = (255, 0, 0)
+_last_marks: List[Dict[str, Any]] = []
+
+
+def _bbox_of(element: Any) -> List[int]:
+    """Return the element's raw bounds as a list (empty when absent)."""
+    # Mappings may use "bbox" or "bounds"; other objects expose ``.bounds``.
+    if isinstance(element, dict):
+        return list(element.get("bbox") or element.get("bounds") or [])
+    return list(getattr(element, "bounds", []) or [])
+
+
+def _text_of(element: Any) -> str:
+    if not isinstance(element, dict):  # objects expose the label as ``.name``
+        return str(getattr(element, "name", "") or "")
+    return str(element.get("text") or element.get("name") or "")
+
+
+def _role_of(element: Any) -> str:
+    if not isinstance(element, dict):  # objects expose the role as ``.role``
+        return str(getattr(element, "role", "") or "")
+    return str(element.get("role") or "")
+
+
+def mark_elements(elements: List[Any]) -> List[Dict[str, Any]]:
+    """Assign a numeric mark to each element with a usable bounding box.
+
+    Boxes with fewer than four values or a non-positive width/height are
+    skipped. Returns ``[{id, bbox, center, role, text}]`` (ids start at 1).
+    """
+    marks: List[Dict[str, Any]] = []
+    for element in elements:
+        bbox = _bbox_of(element)
+        if len(bbox) < 4:
+            continue
+        left, top, width, height = (int(value) for value in bbox[:4])
+        if width <= 0 or height <= 0:  # empty box: nothing to draw or click
+            continue
+        marks.append({
+            "id": len(marks) + 1, "bbox": [left, top, width, height],
+            "center": [left + width // 2, top + height // 2],
+            "role": _role_of(element), "text": _text_of(element)})
+    return marks
+
+
+def resolve_mark(marks: List[Dict[str, Any]],
+                 mark_id: int) -> Optional[Dict[str, Any]]:
+    """Look up ``mark_id`` in ``marks``.
+
+    Returns the first mark whose ``id`` matches, or ``None`` if none does.
+    """
+    return next((m for m in marks if m["id"] == int(mark_id)), None)
+
+
+def _draw_marks(image: Any, marks: List[Dict[str, Any]]) -> Any:
+    """Draw each mark's outline and id tag onto ``image``; return it."""
+    from PIL import ImageDraw
+    canvas = ImageDraw.Draw(image)
+    for mark in marks:
+        x, y, w, h = mark["bbox"]
+        canvas.rectangle([x, y, x + w, y + h], outline=_OUTLINE, width=2)
+        tag = str(mark["id"])
+        # Filled tag at the box's top-left corner, widened per id character.
+        canvas.rectangle([x, y, x + 8 + 6 * len(tag), y + 12], fill=_OUTLINE)
+        canvas.text((x + 2, y + 1), tag, fill=(255, 255, 255))
+    return image
+
+
+def render_marks(image_bytes: bytes,
+                 marks: List[Dict[str, Any]]) -> bytes:
+    """Return PNG bytes of ``image_bytes`` annotated with ``marks``."""
+    from PIL import Image
+    source = io.BytesIO(image_bytes)
+    annotated = _draw_marks(Image.open(source).convert("RGB"), marks)
+    with io.BytesIO() as buffer:
+        annotated.save(buffer, format="PNG")
+        return buffer.getvalue()
+
+
+def last_marks() -> List[Dict[str, Any]]:
+    """Return shallow copies of the marks cached by :func:`mark_screen`."""
+    return list(map(dict, _last_marks))
+
+
+def mark_screen(app_name: Optional[str] = None,
+                render_path: Optional[str] = None) -> Dict[str, Any]:
+    """Number the live accessibility elements and cache the marks.
+
+    The cache feeds a later :func:`mark_click`. A truthy ``render_path``
+    also saves an annotated screenshot there as PNG and adds its resolved
+    path to the result under ``"image_path"``.
+    """
+    from je_auto_control.utils.accessibility.accessibility_api import (
+        list_accessibility_elements)
+    marks = mark_elements(list_accessibility_elements(app_name=app_name))
+    _last_marks[:] = marks  # swap the cached marks in place
+    if not render_path:
+        return {"marks": marks}
+    # Imported only on this path, so legend-only calls skip the module.
+    from je_auto_control.utils.cv2_utils.screenshot import pil_screenshot
+    overlay = _draw_marks(pil_screenshot().convert("RGB"), marks)
+    destination = Path(render_path)
+    overlay.save(str(destination), format="PNG")
+    return {"marks": marks, "image_path": str(destination.resolve())}
+
+
+def mark_click(mark_id: int,
+               marks: Optional[List[Dict[str, Any]]] = None) -> bool:
+    """Left-click the centre of mark ``mark_id``; return whether it matched.
+
+    Looks in ``marks`` when given, else in the last :func:`mark_screen` set.
+    """
+    pool = _last_marks if marks is None else marks
+    target = resolve_mark(pool, mark_id)
+    if target is None:
+        return False
+    x, y = target["center"]
+    from je_auto_control.wrapper.auto_control_mouse import (
+        click_mouse, set_mouse_position)
+    set_mouse_position(x, y)
+    click_mouse("mouse_left", x, y)
+    return True
diff --git a/test/unit_test/headless/test_set_of_marks_batch.py b/test/unit_test/headless/test_set_of_marks_batch.py
new file mode 100644
index 00000000..94932aa2
--- /dev/null
+++ b/test/unit_test/headless/test_set_of_marks_batch.py
@@ -0,0 +1,72 @@
+"""Headless tests for the Set-of-Marks overlay (number elements for VLM
+grounding). Pure stdlib + Pillow; no Qt imports, no live screen needed."""
+import io
+
+from types import SimpleNamespace
+
+import je_auto_control as ac
+from je_auto_control.utils.set_of_marks import (
+    mark_click, mark_elements, render_marks, resolve_mark)
+
+
+def test_mark_elements_numbers_and_centers():
+    """Dict and object elements are numbered; short bounds are dropped."""
+    ok_button = {"bbox": [0, 0, 100, 20], "role": "button", "text": "OK"}
+    user_edit = SimpleNamespace(bounds=[10, 40, 60, 20], role="edit",
+                                name="user")
+    truncated = {"bbox": [0, 0], "text": "bad"}   # only two bounds values
+    first, second = mark_elements([ok_button, user_edit, truncated])
+    assert (first["id"], second["id"]) == (1, 2)
+    assert first["center"] == [50, 10]
+    assert (second["role"], second["text"]) == ("edit", "user")
+
+
+def test_resolve_mark():
+    boxes = [[0, 0, 10, 10], [0, 0, 20, 20]]
+    marks = mark_elements([{"bbox": box} for box in boxes])
+    assert resolve_mark(marks, 2)["bbox"] == boxes[1]
+    assert resolve_mark(marks, 99) is None
+
+
+def test_render_marks_returns_png():
+    """The output is a PNG whose box corner pixel is painted red."""
+    from PIL import Image
+    buf = io.BytesIO()
+    Image.new("RGB", (80, 60), (255, 255, 255)).save(buf, format="PNG")
+    out = render_marks(buf.getvalue(), [{"id": 1, "bbox": [5, 5, 30, 12]}])
+    assert out[:8] == b"\x89PNG\r\n\x1a\n"   # valid PNG signature
+    assert Image.open(io.BytesIO(out)).getpixel((5, 5)) == (255, 0, 0)
+
+
+def test_mark_click_uses_supplied_marks(monkeypatch):
+    import je_auto_control.wrapper.auto_control_mouse as mouse
+    moves, clicks = [], []   # lists (not setdefault) expose any extra call
+    monkeypatch.setattr(mouse, "set_mouse_position",
+                        lambda x, y: moves.append((x, y)))
+    monkeypatch.setattr(mouse, "click_mouse",
+                        lambda btn, x, y: clicks.append((btn, x, y)))
+    marks = [{"id": 7, "bbox": [0, 0, 40, 20], "center": [20, 10]}]
+    assert mark_click(7, marks) is True
+    assert mark_click(99, marks) is False    # unknown id -> no click
+    assert moves == [(20, 10)] and clicks == [("mouse_left", 20, 10)]
+
+
+# --- wiring ---------------------------------------------------------------
+
+def test_wiring():
+    from je_auto_control.gui.script_builder.command_schema import _build_specs
+    from je_auto_control.utils.mcp_server.tools import (
+        build_default_tool_registry)
+    commands = {"AC_mark_screen", "AC_mark_click"}
+    tool_names = {tool.name for tool in build_default_tool_registry()}
+    spec_commands = {spec.command for spec in _build_specs()}
+    assert commands <= ac.executor.known_commands()    # executor table
+    assert {"ac_mark_screen", "ac_mark_click"} <= tool_names  # MCP registry
+    assert commands <= spec_commands                   # script-builder schema
+
+
+def test_facade_exports():
+    exported = ("mark_elements", "resolve_mark", "render_marks",
+                "mark_screen", "mark_click")
+    assert all(hasattr(ac, name) for name in exported)
+    assert set(exported) <= set(ac.__all__)