From a7c23f187c8fc603fb8133301fb0d98a4067120c Mon Sep 17 00:00:00 2001 From: JeffreyChen Date: Fri, 19 Jun 2026 12:00:06 +0800 Subject: [PATCH] Add data-quality helpers: row schema validation, field extraction, masking --- README.md | 9 + README/README_zh-CN.md | 9 + README/README_zh-TW.md | 9 + .../Eng/doc/new_features/v19_features_doc.rst | 69 +++++++ docs/source/Eng/eng_index.rst | 1 + .../Zh/doc/new_features/v19_features_doc.rst | 67 +++++++ docs/source/Zh/zh_index.rst | 1 + je_auto_control/__init__.py | 5 + .../gui/script_builder/command_schema.py | 19 ++ .../utils/data_quality/__init__.py | 6 + .../utils/data_quality/data_quality.py | 179 ++++++++++++++++++ .../utils/executor/action_executor.py | 25 +++ .../utils/mcp_server/tools/_factories.py | 46 ++++- .../utils/mcp_server/tools/_handlers.py | 15 ++ .../headless/test_data_quality_batch.py | 93 +++++++++ 15 files changed, 552 insertions(+), 1 deletion(-) create mode 100644 docs/source/Eng/doc/new_features/v19_features_doc.rst create mode 100644 docs/source/Zh/doc/new_features/v19_features_doc.rst create mode 100644 je_auto_control/utils/data_quality/__init__.py create mode 100644 je_auto_control/utils/data_quality/data_quality.py create mode 100644 test/unit_test/headless/test_data_quality_batch.py diff --git a/README.md b/README.md index 063b9621..a9252bc0 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,7 @@ ## Table of Contents +- [What's new (2026-06-19) — Data Quality](#whats-new-2026-06-19--data-quality) - [What's new (2026-06-19) — SBOM & Suite Sharding](#whats-new-2026-06-19--sbom--suite-sharding) - [What's new (2026-06-19) — Reactive Observer](#whats-new-2026-06-19--reactive-observer) - [What's new (2026-06-19) — WCAG 2.2 Audit](#whats-new-2026-06-19--wcag-22-audit) @@ -71,6 +72,14 @@ --- +## What's new (2026-06-19) — Data Quality + +Three pure-stdlib data-quality helpers (the gate between `load_rows`/OCR and downstream entry), full stack. 
Full reference: [`docs/source/Eng/doc/new_features/v19_features_doc.rst`](docs/source/Eng/doc/new_features/v19_features_doc.rst). + +- **Row schema validation** — `validate_rows(rows, schema)` (`AC_validate_rows`, `ac_validate_rows`): declarative per-field rules (type/required/regex/min/max/min_len/max_len/allowed/unique); returns `{ok, valid, invalid, errors}` so bad scraped/OCR data is caught before it corrupts an ERP/form. +- **Field extraction** — `extract_fields(text, fields, patterns)` (`AC_extract_fields`, `ac_extract_fields`): named regex presets (email/url/ipv4/phone/date_iso/amount/hashtag) + custom patterns over free text / OCR blobs. +- **Row masking** — `mask_rows(rows, rules)` (`AC_mask_rows`, `ac_mask_rows`): mask columns before export — `redact` / `hash` (SHA-256) / `partial` (keep last 4); complements the screenshot-only redaction. + ## What's new (2026-06-19) — SBOM & Suite Sharding Two pure-stdlib ops tools (security + scale research angles), full stack. Full reference: [`docs/source/Eng/doc/new_features/v18_features_doc.rst`](docs/source/Eng/doc/new_features/v18_features_doc.rst). 
diff --git a/README/README_zh-CN.md b/README/README_zh-CN.md index f0f3473e..42e6ddd5 100644 --- a/README/README_zh-CN.md +++ b/README/README_zh-CN.md @@ -12,6 +12,7 @@ ## 目录 +- [本次更新 (2026-06-19) — 数据质量](#本次更新-2026-06-19--数据质量) - [本次更新 (2026-06-19) — SBOM 与测试分片](#本次更新-2026-06-19--sbom-与测试分片) - [本次更新 (2026-06-19) — 反应式观察器](#本次更新-2026-06-19--反应式观察器) - [本次更新 (2026-06-19) — WCAG 2.2 审计](#本次更新-2026-06-19--wcag-22-审计) @@ -70,6 +71,14 @@ --- +## 本次更新 (2026-06-19) — 数据质量 + +三项纯标准库的数据质量辅助工具(介于 `load_rows`/OCR 与下游输入之间的闸),走完整五层。完整参考:[`docs/source/Zh/doc/new_features/v19_features_doc.rst`](../docs/source/Zh/doc/new_features/v19_features_doc.rst)。 + +- **数据行 schema 验证** — `validate_rows(rows, schema)`(`AC_validate_rows`、`ac_validate_rows`):声明式逐字段规则(type/required/regex/min/max/min_len/max_len/allowed/unique);返回 `{ok, valid, invalid, errors}`,在坏掉的抓取/OCR 数据污染 ERP/表单前拦下。 +- **字段提取** — `extract_fields(text, fields, patterns)`(`AC_extract_fields`、`ac_extract_fields`):具名 regex 预设(email/url/ipv4/phone/date_iso/amount/hashtag)+自定义 patterns,作用于自由文本 / OCR 文本块。 +- **数据行掩码** — `mask_rows(rows, rules)`(`AC_mask_rows`、`ac_mask_rows`):导出前掩码字段——`redact` / `hash`(SHA-256)/ `partial`(保留末 4 字);补足仅针对截图的脱敏。 + ## 本次更新 (2026-06-19) — SBOM 与测试分片 来自安全与规模研究角度的两项纯标准库运维工具,走完整五层。完整参考:[`docs/source/Zh/doc/new_features/v18_features_doc.rst`](../docs/source/Zh/doc/new_features/v18_features_doc.rst)。 diff --git a/README/README_zh-TW.md b/README/README_zh-TW.md index a41fceda..21bc79f6 100644 --- a/README/README_zh-TW.md +++ b/README/README_zh-TW.md @@ -12,6 +12,7 @@ ## 目錄 +- [本次更新 (2026-06-19) — 資料品質](#本次更新-2026-06-19--資料品質) - [本次更新 (2026-06-19) — SBOM 與測試分片](#本次更新-2026-06-19--sbom-與測試分片) - [本次更新 (2026-06-19) — 反應式觀察器](#本次更新-2026-06-19--反應式觀察器) - [本次更新 (2026-06-19) — WCAG 2.2 稽核](#本次更新-2026-06-19--wcag-22-稽核) @@ -70,6 +71,14 @@ --- +## 本次更新 (2026-06-19) — 資料品質 + +三項純標準庫的資料品質輔助工具(介於 `load_rows`/OCR 
與下游輸入之間的閘),走完整五層。完整參考:[`docs/source/Zh/doc/new_features/v19_features_doc.rst`](../docs/source/Zh/doc/new_features/v19_features_doc.rst)。 + +- **資料列 schema 驗證** — `validate_rows(rows, schema)`(`AC_validate_rows`、`ac_validate_rows`):宣告式逐欄規則(type/required/regex/min/max/min_len/max_len/allowed/unique);回傳 `{ok, valid, invalid, errors}`,在壞掉的抓取/OCR 資料汙染 ERP/表單前攔下。 +- **欄位擷取** — `extract_fields(text, fields, patterns)`(`AC_extract_fields`、`ac_extract_fields`):具名 regex 預設(email/url/ipv4/phone/date_iso/amount/hashtag)+自訂 patterns,作用於自由文字 / OCR 文字塊。 +- **資料列遮罩** — `mask_rows(rows, rules)`(`AC_mask_rows`、`ac_mask_rows`):匯出前遮罩欄位——`redact` / `hash`(SHA-256)/ `partial`(保留末 4 字);補足僅針對截圖的遮罩。 + ## 本次更新 (2026-06-19) — SBOM 與測試分片 來自安全與規模研究角度的兩項純標準庫維運工具,走完整五層。完整參考:[`docs/source/Zh/doc/new_features/v18_features_doc.rst`](../docs/source/Zh/doc/new_features/v18_features_doc.rst)。 diff --git a/docs/source/Eng/doc/new_features/v19_features_doc.rst b/docs/source/Eng/doc/new_features/v19_features_doc.rst new file mode 100644 index 00000000..3fa48900 --- /dev/null +++ b/docs/source/Eng/doc/new_features/v19_features_doc.rst @@ -0,0 +1,69 @@ +================================================== +New Features (2026-06-19) — Data Quality +================================================== + +Three pure-standard-library data-quality helpers from the data/validation +research angle — the quality gate between ingestion (``load_rows`` / OCR) +and downstream entry. Wired through the full stack (facade, ``AC_*`` +executor commands, MCP tools, Script Builder). + +.. 
contents:: + :local: + :depth: 2 + + +Row schema validation +===================== + +Validate scraped / loaded rows against a declarative schema before they +reach an ERP or form — bad data caught here doesn't corrupt downstream:: + + from je_auto_control import validate_rows + + report = validate_rows(rows, { + "name": {"type": "str", "required": True}, + "age": {"type": "int", "min": 0, "max": 130}, + "email": {"regex": r".+@.+\..+"}, + "id": {"unique": True}, + "tier": {"allowed": ["gold", "silver"]}, + }) + report["ok"] # False if any row failed + report["valid"] # rows that passed + report["errors"] # [{"row": 1, "field": "age", "error": "above max 130"}] + +Rules: ``type`` / ``required`` / ``regex`` / ``min`` / ``max`` / +``min_len`` / ``max_len`` / ``allowed`` / ``unique``. Exposed as +``AC_validate_rows`` / ``ac_validate_rows``. + + +Field extraction +================ + +Pull structured values out of free text / OCR blobs with named regex +presets (plus your own ``patterns``):: + + from je_auto_control import extract_fields + + out = extract_fields("Mail ada@x.io on 2026-06-19", + fields=["email", "date_iso"]) + # {"email": ["ada@x.io"], "date_iso": ["2026-06-19"]} + +Presets: ``email`` / ``url`` / ``ipv4`` / ``phone`` / ``date_iso`` / +``amount`` / ``hashtag``. Exposed as ``AC_extract_fields`` / +``ac_extract_fields``. + + +Row masking +=========== + +Mask sensitive columns before exporting rows / reports (the existing +redaction is screenshot-only):: + + from je_auto_control import mask_rows + + safe = mask_rows(rows, {"ssn": "partial", "token": "redact", + "name": "hash"}) + # ssn -> "*****6789", token -> "***", name -> sha256 hex + +Modes: ``redact`` (``***``), ``hash`` (SHA-256 hex), ``partial`` (keep the +last 4 chars). Exposed as ``AC_mask_rows`` / ``ac_mask_rows``. 
diff --git a/docs/source/Eng/eng_index.rst b/docs/source/Eng/eng_index.rst index 1dcedf1b..65203719 100644 --- a/docs/source/Eng/eng_index.rst +++ b/docs/source/Eng/eng_index.rst @@ -41,6 +41,7 @@ Comprehensive guides for all AutoControl features. doc/new_features/v16_features_doc doc/new_features/v17_features_doc doc/new_features/v18_features_doc + doc/new_features/v19_features_doc doc/ocr_backends/ocr_backends_doc doc/observability/observability_doc doc/operations_layer/operations_layer_doc diff --git a/docs/source/Zh/doc/new_features/v19_features_doc.rst b/docs/source/Zh/doc/new_features/v19_features_doc.rst new file mode 100644 index 00000000..ff73a5f6 --- /dev/null +++ b/docs/source/Zh/doc/new_features/v19_features_doc.rst @@ -0,0 +1,67 @@ +======================================== +新功能 (2026-06-19) — 資料品質 +======================================== + +來自資料/驗證研究角度的三項純標準庫資料品質輔助工具——介於資料匯入 +(``load_rows`` / OCR)與下游輸入之間的品質閘。走完整五層(facade、 +``AC_*`` 執行器指令、MCP 工具、Script Builder)。 + +.. contents:: + :local: + :depth: 2 + + +資料列 schema 驗證 +================== + +在抓取/載入的資料列進入 ERP 或表單前,依宣告式 schema 驗證——在此攔下的 +壞資料就不會汙染下游:: + + from je_auto_control import validate_rows + + report = validate_rows(rows, { + "name": {"type": "str", "required": True}, + "age": {"type": "int", "min": 0, "max": 130}, + "email": {"regex": r".+@.+\..+"}, + "id": {"unique": True}, + "tier": {"allowed": ["gold", "silver"]}, + }) + report["ok"] # 任何列失敗則為 False + report["valid"] # 通過的資料列 + report["errors"] # [{"row": 1, "field": "age", "error": "above max 130"}] + +規則:``type`` / ``required`` / ``regex`` / ``min`` / ``max`` / +``min_len`` / ``max_len`` / ``allowed`` / ``unique``。對應 +``AC_validate_rows`` / ``ac_validate_rows``。 + + +欄位擷取 +======== + +用具名的 regex 預設(也可加自訂 ``patterns``)從自由文字 / OCR 文字塊中 +擷取結構化值:: + + from je_auto_control import extract_fields + + out = extract_fields("Mail ada@x.io on 2026-06-19", + fields=["email", "date_iso"]) + # {"email": ["ada@x.io"], "date_iso": ["2026-06-19"]} + 
+預設:``email`` / ``url`` / ``ipv4`` / ``phone`` / ``date_iso`` / +``amount`` / ``hashtag``。對應 ``AC_extract_fields`` / +``ac_extract_fields``。 + + +資料列遮罩 +========== + +在匯出資料列 / 報告前遮罩敏感欄位(既有的遮罩僅針對截圖):: + + from je_auto_control import mask_rows + + safe = mask_rows(rows, {"ssn": "partial", "token": "redact", + "name": "hash"}) + # ssn -> "*****6789"、token -> "***"、name -> sha256 hex + +模式:``redact``(``***``)、``hash``(SHA-256 hex)、``partial``(保留末 4 +字)。對應 ``AC_mask_rows`` / ``ac_mask_rows``。 diff --git a/docs/source/Zh/zh_index.rst b/docs/source/Zh/zh_index.rst index 08b67c5c..bf01290f 100644 --- a/docs/source/Zh/zh_index.rst +++ b/docs/source/Zh/zh_index.rst @@ -41,6 +41,7 @@ AutoControl 所有功能的完整使用指南。 doc/new_features/v16_features_doc doc/new_features/v17_features_doc doc/new_features/v18_features_doc + doc/new_features/v19_features_doc doc/ocr_backends/ocr_backends_doc doc/observability/observability_doc doc/operations_layer/operations_layer_doc diff --git a/je_auto_control/__init__.py b/je_auto_control/__init__.py index 1f2600f2..639011af 100644 --- a/je_auto_control/__init__.py +++ b/je_auto_control/__init__.py @@ -153,6 +153,10 @@ from je_auto_control.utils.sbom import build_sbom, write_sbom # Duration-aware suite sharding + shard-result merge from je_auto_control.utils.test_shard import merge_results, shard_flows +# Data-quality: row schema validation, field extraction, masking +from je_auto_control.utils.data_quality import ( + extract_fields, mask_rows, validate_rows, +) # Background popup/interrupt watchdog (unattended automation) from je_auto_control.utils.watchdog import ( PopupWatchdog, WatchdogRule, default_popup_watchdog, @@ -572,6 +576,7 @@ def start_autocontrol_gui(*args, **kwargs): "image_predicate", "pixel_predicate", "text_predicate", "build_sbom", "write_sbom", "merge_results", "shard_flows", + "extract_fields", "mask_rows", "validate_rows", # MCP server "AuditLogger", "HttpMCPServer", "MCPContent", "MCPPrompt", "MCPPromptArgument", "MCPResource", 
"MCPServer", "MCPTool", diff --git a/je_auto_control/gui/script_builder/command_schema.py b/je_auto_control/gui/script_builder/command_schema.py index c7553999..3252f867 100644 --- a/je_auto_control/gui/script_builder/command_schema.py +++ b/je_auto_control/gui/script_builder/command_schema.py @@ -658,6 +658,7 @@ def _add_misc_specs(specs: List[CommandSpec]) -> None: _add_agent_specs(specs) _add_office_specs(specs) _add_memory_specs(specs) + _add_data_quality_specs(specs) specs.append(CommandSpec( "AC_wcag_audit", "Accessibility", "WCAG 2.2 Conformance Audit", fields=( @@ -738,6 +739,24 @@ def _add_observer_specs(specs: List[CommandSpec]) -> None: description="Stop the background observer thread.")) +def _add_data_quality_specs(specs: List[CommandSpec]) -> None: + specs.append(CommandSpec( + "AC_validate_rows", "Data", "Validate Rows (schema)", + description="Validate 'rows' against a 'schema' (both via JSON view).", + )) + specs.append(CommandSpec( + "AC_extract_fields", "Data", "Extract Fields (regex)", + fields=(FieldSpec("text", FieldType.STRING),), + description="Pull email/url/phone/amount/... 
"""Data-quality helpers for scraped / loaded rows (pure standard library).

``load_rows`` and OCR bring data *in*; this module is the quality gate that
sits between ingestion and downstream entry: validate rows against a
declarative schema, pull structured fields out of free text with named
regex presets, and mask sensitive columns before export.

Pure standard library (``re`` / ``hashlib``); imports no ``PySide6``.
"""
import hashlib
import re
from typing import Any, Dict, Hashable, List, Optional

# Schema ``type`` names mapped to the classes handed to ``isinstance``.
# "float" deliberately accepts ints as well; bools are rejected for the
# numeric kinds in ``_matches_type`` because ``bool`` subclasses ``int``.
_TYPES = {
    "int": int, "float": (int, float), "number": (int, float),
    "str": str, "bool": bool,
}

# Named extraction presets (no capturing groups, so findall returns matches).
_PRESETS = {
    "email": r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}",
    "url": r"https?://[^\s,)]+",
    "ipv4": r"\b(?:\d{1,3}\.){3}\d{1,3}\b",
    "phone": r"\+?\d[\d\s().-]{6,}\d",
    "date_iso": r"\b\d{4}-\d{2}-\d{2}\b",
    "amount": r"[$€£]\s?\d[\d,]*(?:\.\d+)?",
    "hashtag": r"#\w+",
}

# Number of trailing characters the ``partial`` mask mode leaves readable.
_PARTIAL_KEEP = 4


# --- row schema validation -----------------------------------------------

def _matches_type(value: Any, kind: str) -> bool:
    """Return whether ``value`` satisfies the schema type name ``kind``.

    Type names missing from ``_TYPES`` are not enforced (always ``True``).
    """
    expected = _TYPES.get(kind)
    if expected is None:
        return True
    # bool is a subclass of int, so exclude it explicitly for numeric kinds.
    if kind in ("int", "number", "float") and isinstance(value, bool):
        return False
    return isinstance(value, expected)


def _number_range_error(value: Any, rule: Dict[str, Any]) -> Optional[str]:
    """Check a numeric ``value`` against the rule's ``min`` / ``max``."""
    if "min" in rule and value < rule["min"]:
        return f"below min {rule['min']}"
    if "max" in rule and value > rule["max"]:
        return f"above max {rule['max']}"
    return None


def _length_range_error(value: str, rule: Dict[str, Any]) -> Optional[str]:
    """Check a string's length against the rule's ``min_len`` / ``max_len``."""
    if "min_len" in rule and len(value) < rule["min_len"]:
        return f"shorter than {rule['min_len']}"
    if "max_len" in rule and len(value) > rule["max_len"]:
        return f"longer than {rule['max_len']}"
    return None


def _range_error(value: Any, rule: Dict[str, Any]) -> Optional[str]:
    """Apply numeric bounds to numbers and length bounds to strings.

    Values of any other type (and bools) are not range-checked.
    """
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return _number_range_error(value, rule)
    if isinstance(value, str):
        return _length_range_error(value, rule)
    return None


def _field_error(value: Any, rule: Dict[str, Any]) -> Optional[str]:
    """Return the first rule violation for ``value``, or ``None`` if it passes.

    Checks run in a fixed order: ``type``, ``regex`` (searched against
    ``str(value)``), range / length bounds, then ``allowed`` membership.
    """
    kind = rule.get("type")
    if kind and not _matches_type(value, kind):
        return f"expected {kind}"
    if "regex" in rule and not re.search(rule["regex"], str(value)):
        return f"does not match {rule['regex']}"
    range_msg = _range_error(value, rule)
    if range_msg:
        return range_msg
    allowed = rule.get("allowed")
    if allowed is not None and value not in allowed:
        return "not in allowed set"
    return None


def _unique_key(value: Any) -> Hashable:
    """Return a hashable stand-in for ``value`` for duplicate tracking.

    Hashable values are used as-is. Unhashable ones (lists / dicts, which
    are common in JSON-sourced rows) fall back to a key built from their
    ``repr`` so a ``unique`` rule reports duplicates instead of raising
    ``TypeError``. The fallback is best effort: equal dicts whose keys were
    inserted in a different order have different reprs.
    """
    try:
        hash(value)
    except TypeError:
        return ("__unhashable__", repr(value))
    return value


def _validate_row(index: int, row: Dict[str, Any], schema: Dict[str, Any],
                  seen_unique: Dict[str, set]) -> List[Dict[str, Any]]:
    """Validate one row and return its ``{row, field, error}`` entries.

    A field that is absent, ``None`` or ``""`` counts as missing: it is an
    error only when the rule sets ``required`` and is otherwise skipped.
    Uniqueness is checked only for values that passed every other rule;
    ``seen_unique`` is updated in place with the values accepted so far.
    """
    errors: List[Dict[str, Any]] = []
    for field, rule in schema.items():
        if field not in row or row[field] in (None, ""):
            if rule.get("required"):
                errors.append({"row": index, "field": field,
                               "error": "required"})
            continue
        value = row[field]
        message = _field_error(value, rule)
        if message:
            errors.append({"row": index, "field": field, "error": message})
        elif field in seen_unique:
            key = _unique_key(value)
            if key in seen_unique[field]:
                errors.append({"row": index, "field": field,
                               "error": "duplicate"})
            else:
                seen_unique[field].add(key)
    return errors


def validate_rows(rows: List[Dict[str, Any]],
                  schema: Dict[str, Any]) -> Dict[str, Any]:
    """Validate ``rows`` against ``schema``; return a pass/fail report.

    Each schema rule supports ``type`` / ``required`` / ``regex`` /
    ``min`` / ``max`` / ``min_len`` / ``max_len`` / ``allowed`` / ``unique``.
    The report holds ``ok``, ``total``, ``valid_count`` / ``invalid_count``,
    the ``valid`` / ``invalid`` row lists, and per-row ``errors``
    (``{row, field, error}`` with a zero-based ``row`` index).
    """
    rows = list(rows)  # materialise once so any iterable can be counted
    seen_unique = {field: set() for field, rule in schema.items()
                   if rule.get("unique")}
    errors: List[Dict[str, Any]] = []
    valid: List[Dict[str, Any]] = []
    invalid: List[Dict[str, Any]] = []
    for index, row in enumerate(rows):
        row_errors = _validate_row(index, row, schema, seen_unique)
        errors.extend(row_errors)
        (invalid if row_errors else valid).append(row)
    return {"ok": not errors, "total": len(rows),
            "valid_count": len(valid), "invalid_count": len(invalid),
            "valid": valid, "invalid": invalid, "errors": errors}


# --- field extraction -----------------------------------------------------

def preset_names() -> List[str]:
    """Return the names of the built-in extraction presets."""
    return sorted(_PRESETS)


def extract_fields(text: str, fields: Optional[List[str]] = None,
                   patterns: Optional[Dict[str, str]] = None
                   ) -> Dict[str, List[str]]:
    """Extract structured values from free text.

    ``fields`` selects built-in presets (default: all); names that are not
    presets are ignored and do not appear in the result. ``patterns`` adds
    or overrides named custom regexes. Returns ``{name: [matches]}``.

    Matching uses ``re.findall``, so a custom pattern that contains
    capturing groups yields the group contents rather than whole matches.
    """
    haystack = text or ""
    chosen: Dict[str, str] = {}
    for name in (fields if fields is not None else list(_PRESETS)):
        if name in _PRESETS:
            chosen[name] = _PRESETS[name]
    for name, pattern in (patterns or {}).items():
        chosen[name] = pattern
    return {name: re.findall(pattern, haystack)
            for name, pattern in chosen.items()}


# --- masking --------------------------------------------------------------

def _mask_value(value: str, mode: str) -> str:
    """Mask a single string according to ``mode``.

    ``redact`` returns ``***``, ``hash`` the SHA-256 hex digest of the
    UTF-8 bytes, and ``partial`` stars everything except the last four
    characters. Values of four characters or fewer are starred completely
    in ``partial`` mode; keeping "the last four" would otherwise return
    them unmasked.

    Raises:
        ValueError: if ``mode`` is not one of the three known modes.
    """
    if mode == "redact":
        return "***"
    if mode == "hash":
        return hashlib.sha256(value.encode("utf-8")).hexdigest()
    if mode == "partial":
        if len(value) <= _PARTIAL_KEEP:
            return "*" * len(value)
        return "*" * (len(value) - _PARTIAL_KEEP) + value[-_PARTIAL_KEEP:]
    raise ValueError(f"unknown mask mode: {mode!r}")


def mask_rows(rows: List[Dict[str, Any]],
              rules: Dict[str, str]) -> List[Dict[str, Any]]:
    """Return ``rows`` with masked columns.

    ``rules`` maps a field to a mode: ``redact`` (``***``), ``hash``
    (SHA-256 hex), or ``partial`` (keep last 4 chars; shorter values are
    fully starred). Each row is shallow-copied, so the input rows are not
    modified. Values are converted with ``str`` before masking; fields that
    are absent or ``None`` are left untouched.

    Raises:
        ValueError: if a rule that applies to a present value names an
            unknown mode.
    """
    masked: List[Dict[str, Any]] = []
    for row in rows:
        out = dict(row)
        for field, mode in rules.items():
            if field in out and out[field] is not None:
                out[field] = _mask_value(str(out[field]), mode)
        masked.append(out)
    return masked
def data_quality_tools() -> List[MCPTool]:
    """Return the MCP tool descriptors for the data-quality helpers.

    Three tools are produced, in this order: ``ac_validate_rows``,
    ``ac_extract_fields`` and ``ac_mask_rows``. Each one is annotated
    ``READ_ONLY`` and dispatches to the handler of the same name on ``h``.
    """
    validate_tool = MCPTool(
        name="ac_validate_rows",
        description=("Validate rows against a declarative schema "
                     "(type/required/regex/min/max/min_len/max_len/"
                     "allowed/unique). Returns {ok, valid, invalid, "
                     "errors} — the data-quality gate after load_rows."),
        input_schema=schema(
            {
                "rows": {"type": "array", "items": {"type": "object"}},
                "schema": {"type": "object"},
            },
            required=["rows", "schema"],
        ),
        handler=h.validate_rows,
        annotations=READ_ONLY,
    )
    # Only 'text' is mandatory; 'fields' and 'patterns' are optional inputs.
    extract_tool = MCPTool(
        name="ac_extract_fields",
        description=("Extract structured values from free text using "
                     "named presets (email/url/ipv4/phone/date_iso/"
                     "amount/hashtag) and/or custom 'patterns'. Returns "
                     "{fields: {name: [matches]}}."),
        input_schema=schema(
            {
                "text": {"type": "string"},
                "fields": {"type": "array", "items": {"type": "string"}},
                "patterns": {"type": "object"},
            },
            required=["text"],
        ),
        handler=h.extract_fields,
        annotations=READ_ONLY,
    )
    mask_tool = MCPTool(
        name="ac_mask_rows",
        description=("Mask sensitive columns in rows before export. "
                     "'rules' maps a field to redact / hash / partial. "
                     "Returns {rows}."),
        input_schema=schema(
            {
                "rows": {"type": "array", "items": {"type": "object"}},
                "rules": {"type": "object"},
            },
            required=["rows", "rules"],
        ),
        handler=h.mask_rows,
        annotations=READ_ONLY,
    )
    return [validate_tool, extract_tool, mask_tool]
batch: row schema validation, +field extraction, and masking. Pure stdlib; no Qt imports.""" +import je_auto_control as ac +from je_auto_control.utils.data_quality import ( + extract_fields, mask_rows, validate_rows) + + +# --- validate_rows -------------------------------------------------------- + +def test_validate_rows_reports_errors(): + rows = [ + {"name": "Ada", "age": 36, "email": "ada@x.io"}, + {"name": "", "age": 200, "email": "nope"}, + {"name": "Bo", "age": 41, "email": "bo@y.io"}, + ] + schema = { + "name": {"type": "str", "required": True}, + "age": {"type": "int", "min": 0, "max": 130}, + "email": {"regex": r".+@.+\..+"}, + } + report = validate_rows(rows, schema) + assert report["ok"] is False + assert report["valid_count"] == 2 and report["invalid_count"] == 1 + fields = {e["field"] for e in report["errors"] if e["row"] == 1} + assert {"name", "age", "email"} <= fields + + +def test_validate_rows_unique_and_allowed(): + rows = [{"id": "a", "tier": "gold"}, {"id": "a", "tier": "bronze"}] + schema = {"id": {"unique": True}, + "tier": {"allowed": ["gold", "silver"]}} + report = validate_rows(rows, schema) + errors = {(e["row"], e["field"], e["error"]) for e in report["errors"]} + assert (1, "id", "duplicate") in errors + assert (1, "tier", "not in allowed set") in errors + + +def test_validate_rows_all_valid(): + report = validate_rows([{"n": 5}], {"n": {"type": "int", "min": 1}}) + assert report["ok"] is True and report["invalid_count"] == 0 + + +# --- extract_fields ------------------------------------------------------- + +def test_extract_presets_and_custom(): + text = "Mail ada@x.io or see https://x.io — ref #A12 on 2026-06-19." 
+ out = extract_fields(text, fields=["email", "url", "date_iso"]) + assert out["email"] == ["ada@x.io"] + assert out["url"] == ["https://x.io"] + assert out["date_iso"] == ["2026-06-19"] + custom = extract_fields(text, fields=[], patterns={"ref": r"#[A-Z]\d+"}) + assert custom["ref"] == ["#A12"] + + +# --- mask_rows ------------------------------------------------------------ + +def test_mask_modes(): + rows = [{"name": "Ada", "ssn": "123456789", "tok": "secret"}] + masked = mask_rows(rows, {"ssn": "partial", "tok": "redact", + "name": "hash"}) + assert masked[0]["ssn"] == "*****6789" + assert masked[0]["tok"] == "***" + assert len(masked[0]["name"]) == 64 # sha256 hex + assert rows[0]["name"] == "Ada" # original untouched + + +# --- wiring --------------------------------------------------------------- + +def test_executor_wiring(): + rec = ac.execute_action([["AC_validate_rows", { + "rows": [{"n": 1}], "schema": {"n": {"type": "int"}}}]]) + assert any("'ok': True" in str(v) for v in rec.values()) + ex = ac.execute_action([["AC_extract_fields", { + "text": "a@b.co", "fields": ["email"]}]]) + assert any("a@b.co" in str(v) for v in ex.values()) + known = ac.executor.known_commands() + assert {"AC_validate_rows", "AC_extract_fields", "AC_mask_rows"} <= known + + +def test_mcp_and_builder_wiring(): + from je_auto_control.utils.mcp_server.tools import ( + build_default_tool_registry) + names = {t.name for t in build_default_tool_registry()} + assert {"ac_validate_rows", "ac_extract_fields", "ac_mask_rows"} <= names + from je_auto_control.gui.script_builder.command_schema import _build_specs + cmds = {s.command for s in _build_specs()} + assert {"AC_validate_rows", "AC_extract_fields", "AC_mask_rows"} <= cmds + + +def test_facade_exports(): + for attr in ("validate_rows", "extract_fields", "mask_rows"): + assert hasattr(ac, attr) + assert attr in ac.__all__