From 386638f842a5bc6084c4ebc8ff52f6235e12bbd5 Mon Sep 17 00:00:00 2001 From: JeffreyChen Date: Sat, 20 Jun 2026 00:03:00 +0800 Subject: [PATCH 1/2] Add fuzzy string matching and dedupe (difflib default, optional rapidfuzz) --- README.md | 7 ++ README/README_zh-CN.md | 7 ++ README/README_zh-TW.md | 7 ++ .../Eng/doc/new_features/v40_features_doc.rst | 51 ++++++++++ docs/source/Eng/eng_index.rst | 1 + .../Zh/doc/new_features/v40_features_doc.rst | 48 ++++++++++ docs/source/Zh/zh_index.rst | 1 + je_auto_control/__init__.py | 5 + .../gui/script_builder/command_schema.py | 35 +++++++ .../utils/executor/action_executor.py | 34 +++++++ je_auto_control/utils/fuzzy/__init__.py | 9 ++ je_auto_control/utils/fuzzy/fuzzy_match.py | 85 ++++++++++++++++ .../utils/mcp_server/tools/_factories.py | 43 ++++++++- .../utils/mcp_server/tools/_handlers.py | 20 ++++ pyproject.toml | 1 + .../headless/test_fuzzy_match_batch.py | 96 +++++++++++++++++++ 16 files changed, 449 insertions(+), 1 deletion(-) create mode 100644 docs/source/Eng/doc/new_features/v40_features_doc.rst create mode 100644 docs/source/Zh/doc/new_features/v40_features_doc.rst create mode 100644 je_auto_control/utils/fuzzy/__init__.py create mode 100644 je_auto_control/utils/fuzzy/fuzzy_match.py create mode 100644 test/unit_test/headless/test_fuzzy_match_batch.py diff --git a/README.md b/README.md index 154ca942..b232208d 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,7 @@ ## Table of Contents +- [What's new (2026-06-20) — Fuzzy String Matching & Dedupe](#whats-new-2026-06-20--fuzzy-string-matching--dedupe) - [What's new (2026-06-19) — Video Step-Overlay Report](#whats-new-2026-06-19--video-step-overlay-report) - [What's new (2026-06-19) — Agent Observability (GenAI OpenTelemetry Spans)](#whats-new-2026-06-19--agent-observability-genai-opentelemetry-spans) - [What's new (2026-06-19) — Compliance Control Report (SOC2 / ISO 27001)](#whats-new-2026-06-19--compliance-control-report-soc2--iso-27001) @@ -92,6 +93,12 @@ --- 
+## What's new (2026-06-20) — Fuzzy String Matching & Dedupe + +Match noisy OCR/UI text robustly. Full reference: [`docs/source/Eng/doc/new_features/v40_features_doc.rst`](docs/source/Eng/doc/new_features/v40_features_doc.rst). + +- **`fuzzy_ratio` / `fuzzy_best_match` / `fuzzy_matches` / `fuzzy_dedupe`** (`AC_fuzzy_ratio` / `AC_fuzzy_best_match` / `AC_fuzzy_dedupe`, `ac_*`): score similarity (0..1), pick the closest candidate from a list, or collapse near-duplicates — so a flow can act on "the button that *looks like* Submit" rather than an exact label. The default backend is stdlib `difflib` (**zero extra deps**); the optional `[fuzzy]` extra adds `rapidfuzz` for speed, with scores normalised either way. `ignore_case` and `score_cutoff` supported. + ## What's new (2026-06-19) — Video Step-Overlay Report Caption screenshots into a walkthrough video. Full reference: [`docs/source/Eng/doc/new_features/v39_features_doc.rst`](docs/source/Eng/doc/new_features/v39_features_doc.rst). diff --git a/README/README_zh-CN.md b/README/README_zh-CN.md index 4888eb50..de8a39ab 100644 --- a/README/README_zh-CN.md +++ b/README/README_zh-CN.md @@ -12,6 +12,7 @@ ## 目录 +- [本次更新 (2026-06-20) — 模糊字符串匹配与去重](#本次更新-2026-06-20--模糊字符串匹配与去重) - [本次更新 (2026-06-19) — 视频步骤叠加报告](#本次更新-2026-06-19--视频步骤叠加报告) - [本次更新 (2026-06-19) — Agent 可观测性(GenAI OpenTelemetry Spans)](#本次更新-2026-06-19--agent-可观测性genai-opentelemetry-spans) - [本次更新 (2026-06-19) — 合规控制报告(SOC2 / ISO 27001)](#本次更新-2026-06-19--合规控制报告soc2--iso-27001) @@ -91,6 +92,12 @@ --- +## 本次更新 (2026-06-20) — 模糊字符串匹配与去重 + +稳健匹配含噪声的 OCR/UI 文本。完整参考:[`docs/source/Zh/doc/new_features/v40_features_doc.rst`](../docs/source/Zh/doc/new_features/v40_features_doc.rst)。 + +- **`fuzzy_ratio` / `fuzzy_best_match` / `fuzzy_matches` / `fuzzy_dedupe`**(`AC_fuzzy_ratio` / `AC_fuzzy_best_match` / `AC_fuzzy_dedupe`、`ac_*`):为相似度评分(0..1)、从列表挑最接近的候选,或收合近似重复 —— 让流程可针对「*看起来像* Submit 的按钮」动作,而非精确标签。默认后端为标准库 `difflib`(**无额外依赖**);可选的 `[fuzzy]` extra 加入 `rapidfuzz` 
以加速,两者分数皆归一化。支持 `ignore_case` 与 `score_cutoff`。 + ## 本次更新 (2026-06-19) — 视频步骤叠加报告 将屏幕截图加上字幕制成走查视频。完整参考:[`docs/source/Zh/doc/new_features/v39_features_doc.rst`](../docs/source/Zh/doc/new_features/v39_features_doc.rst)。 diff --git a/README/README_zh-TW.md b/README/README_zh-TW.md index 6a158c53..4a5c8570 100644 --- a/README/README_zh-TW.md +++ b/README/README_zh-TW.md @@ -12,6 +12,7 @@ ## 目錄 +- [本次更新 (2026-06-20) — 模糊字串比對與去重](#本次更新-2026-06-20--模糊字串比對與去重) - [本次更新 (2026-06-19) — 影片步驟疊加報告](#本次更新-2026-06-19--影片步驟疊加報告) - [本次更新 (2026-06-19) — Agent 可觀測性(GenAI OpenTelemetry Spans)](#本次更新-2026-06-19--agent-可觀測性genai-opentelemetry-spans) - [本次更新 (2026-06-19) — 合規控制報告(SOC2 / ISO 27001)](#本次更新-2026-06-19--合規控制報告soc2--iso-27001) @@ -91,6 +92,12 @@ --- +## 本次更新 (2026-06-20) — 模糊字串比對與去重 + +穩健比對含雜訊的 OCR/UI 文字。完整參考:[`docs/source/Zh/doc/new_features/v40_features_doc.rst`](../docs/source/Zh/doc/new_features/v40_features_doc.rst)。 + +- **`fuzzy_ratio` / `fuzzy_best_match` / `fuzzy_matches` / `fuzzy_dedupe`**(`AC_fuzzy_ratio` / `AC_fuzzy_best_match` / `AC_fuzzy_dedupe`、`ac_*`):為相似度評分(0..1)、從清單挑最接近的候選,或收合近似重複 —— 讓流程可針對「*看起來像* Submit 的按鈕」動作,而非精確標籤。預設後端為標準函式庫 `difflib`(**無額外相依**);選用的 `[fuzzy]` extra 加入 `rapidfuzz` 以加速,兩者分數皆正規化。支援 `ignore_case` 與 `score_cutoff`。 + ## 本次更新 (2026-06-19) — 影片步驟疊加報告 將螢幕截圖加上字幕製成走查影片。完整參考:[`docs/source/Zh/doc/new_features/v39_features_doc.rst`](../docs/source/Zh/doc/new_features/v39_features_doc.rst)。 diff --git a/docs/source/Eng/doc/new_features/v40_features_doc.rst b/docs/source/Eng/doc/new_features/v40_features_doc.rst new file mode 100644 index 00000000..8822f52d --- /dev/null +++ b/docs/source/Eng/doc/new_features/v40_features_doc.rst @@ -0,0 +1,51 @@ +Fuzzy String Matching & Dedupe +============================== + +Exact string comparison is brittle when text comes from OCR or shifting UI copy. 
+These helpers score similarity, pick the best candidate from a list, and collapse +near-duplicates — so a flow can act on "the button that *looks like* Submit" +rather than an exact label. + +The default backend is the standard library :mod:`difflib`, so the feature works +with **zero extra dependencies**. If the optional ``rapidfuzz`` package is +installed (``pip install je_auto_control[fuzzy]``) it is used instead for speed; +scores are normalised to ``0.0..1.0`` either way, but exact values can differ +slightly between backends. ``BACKEND`` names the active one. Imports no ``PySide6``. + +Headless API +------------ + +.. code-block:: python + + from je_auto_control import ( + fuzzy_ratio, fuzzy_best_match, fuzzy_matches, fuzzy_dedupe) + + fuzzy_ratio("Sumbit", "Submit") # ~0.83 (case-insensitive default) + + fuzzy_best_match("Sve", ["Cancel", "Save", "Submit"]) + # -> ("Save", 0.86, 1) (choice, score, index) — or None below score_cutoff + + fuzzy_matches("login", ["login", "logon", "logout"], limit=2) + # -> [("login", 1.0, 0), ("logon", 0.8, 1)] sorted best-first + + fuzzy_dedupe(["Invoice", "invoice ", "Receipt"], threshold=0.85) + # -> ["Invoice", "Receipt"] near-duplicates collapse, first kept + +All functions take ``ignore_case`` (default ``True``); ``fuzzy_best_match`` / +``fuzzy_matches`` take ``score_cutoff`` to drop weak candidates. + +Executor commands +----------------- + +================================ =================================================== +Command Effect +================================ =================================================== +``AC_fuzzy_ratio`` ``{score}`` similarity between two strings. +``AC_fuzzy_best_match`` ``{match, score, index}`` (or null) from choices. +``AC_fuzzy_dedupe`` ``{unique}`` with near-duplicates collapsed. +================================ =================================================== + +``choices`` / ``items`` accept a list or a JSON-string list (so the visual +builder works). 
The same operations are exposed as MCP tools (``ac_fuzzy_ratio`` +/ ``ac_fuzzy_best_match`` / ``ac_fuzzy_dedupe``) and as Script Builder commands +under **Data**. diff --git a/docs/source/Eng/eng_index.rst b/docs/source/Eng/eng_index.rst index dafd14d0..27b3fa9c 100644 --- a/docs/source/Eng/eng_index.rst +++ b/docs/source/Eng/eng_index.rst @@ -62,6 +62,7 @@ Comprehensive guides for all AutoControl features. doc/new_features/v37_features_doc doc/new_features/v38_features_doc doc/new_features/v39_features_doc + doc/new_features/v40_features_doc doc/ocr_backends/ocr_backends_doc doc/observability/observability_doc doc/operations_layer/operations_layer_doc diff --git a/docs/source/Zh/doc/new_features/v40_features_doc.rst b/docs/source/Zh/doc/new_features/v40_features_doc.rst new file mode 100644 index 00000000..49eec3a1 --- /dev/null +++ b/docs/source/Zh/doc/new_features/v40_features_doc.rst @@ -0,0 +1,48 @@ +模糊字串比對與去重 +================== + +當文字來自 OCR 或時常變動的 UI 文案時,精確字串比對很脆弱。這些輔助函式為相似度評分、 +從清單中挑出最佳候選,並收合近似重複項 —— 讓流程可以針對「*看起來像* Submit 的按鈕」 +動作,而非精確標籤。 + +預設後端為標準函式庫 :mod:`difflib`,因此本功能**無需任何額外相依**即可運作。若安裝了 +選用的 ``rapidfuzz`` 套件(``pip install je_auto_control[fuzzy]``)則改用其以加速;無論 +何者,分數皆正規化為 ``0.0..1.0``,但確切數值可能因後端而略有差異。``BACKEND`` 標示目 +前作用中的後端。不匯入 ``PySide6``。 + +無頭 API +-------- + +.. 
code-block:: python + + from je_auto_control import ( + fuzzy_ratio, fuzzy_best_match, fuzzy_matches, fuzzy_dedupe) + + fuzzy_ratio("Sumbit", "Submit") # ~0.83(預設不分大小寫) + + fuzzy_best_match("Sve", ["Cancel", "Save", "Submit"]) + # -> ("Save", 0.86, 1) (choice, score, index) —— 低於 score_cutoff 則為 None + + fuzzy_matches("login", ["login", "logon", "logout"], limit=2) + # -> [("login", 1.0, 0), ("logon", 0.8, 1)] 由高分至低分排序 + + fuzzy_dedupe(["Invoice", "invoice ", "Receipt"], threshold=0.85) + # -> ["Invoice", "Receipt"] 近似重複收合,保留第一個 + +所有函式皆接受 ``ignore_case``(預設 ``True``);``fuzzy_best_match`` / +``fuzzy_matches`` 接受 ``score_cutoff`` 以濾除弱候選。 + +執行器指令 +---------- + +================================ =================================================== +指令 效果 +================================ =================================================== +``AC_fuzzy_ratio`` 兩字串相似度的 ``{score}``。 +``AC_fuzzy_best_match`` 從候選中取 ``{match, score, index}``(或 null)。 +``AC_fuzzy_dedupe`` 收合近似重複後的 ``{unique}``。 +================================ =================================================== + +``choices`` / ``items`` 接受清單或 JSON 字串清單(因此視覺化建構器可用)。相同操作亦提供 +為 MCP 工具(``ac_fuzzy_ratio`` / ``ac_fuzzy_best_match`` / ``ac_fuzzy_dedupe``),以及 +Script Builder 中 **Data** 分類下的指令。 diff --git a/docs/source/Zh/zh_index.rst b/docs/source/Zh/zh_index.rst index b9a8ccb1..eb68bf66 100644 --- a/docs/source/Zh/zh_index.rst +++ b/docs/source/Zh/zh_index.rst @@ -62,6 +62,7 @@ AutoControl 所有功能的完整使用指南。 doc/new_features/v37_features_doc doc/new_features/v38_features_doc doc/new_features/v39_features_doc + doc/new_features/v40_features_doc doc/ocr_backends/ocr_backends_doc doc/observability/observability_doc doc/operations_layer/operations_layer_doc diff --git a/je_auto_control/__init__.py b/je_auto_control/__init__.py index e8f8eadc..f8c18a8f 100644 --- a/je_auto_control/__init__.py +++ b/je_auto_control/__init__.py @@ -230,6 +230,10 @@ from je_auto_control.utils.video_report import ( VideoStep, build_overlay_plan, 
render_overlay_frame, write_step_video, ) +# Fuzzy string matching / dedupe (difflib default, optional rapidfuzz) +from je_auto_control.utils.fuzzy import ( + fuzzy_best_match, fuzzy_dedupe, fuzzy_matches, fuzzy_ratio, +) # Background popup/interrupt watchdog (unattended automation) from je_auto_control.utils.watchdog import ( PopupWatchdog, WatchdogRule, default_popup_watchdog, @@ -676,6 +680,7 @@ def start_autocontrol_gui(*args, **kwargs): "AgentTrace", "default_trace", "reset_trace", "VideoStep", "build_overlay_plan", "render_overlay_frame", "write_step_video", + "fuzzy_best_match", "fuzzy_dedupe", "fuzzy_matches", "fuzzy_ratio", # MCP server "AuditLogger", "HttpMCPServer", "MCPContent", "MCPPrompt", "MCPPromptArgument", "MCPResource", "MCPServer", "MCPTool", diff --git a/je_auto_control/gui/script_builder/command_schema.py b/je_auto_control/gui/script_builder/command_schema.py index a12ef96e..a726cbfe 100644 --- a/je_auto_control/gui/script_builder/command_schema.py +++ b/je_auto_control/gui/script_builder/command_schema.py @@ -867,6 +867,41 @@ def _add_misc_specs(specs: List[CommandSpec]) -> None: ), description="Render captioned screenshots into a walkthrough video.", )) + specs.append(CommandSpec( + "AC_fuzzy_ratio", "Data", "Fuzzy: Similarity Ratio", + fields=( + FieldSpec("left", FieldType.STRING), + FieldSpec("right", FieldType.STRING), + FieldSpec("ignore_case", FieldType.BOOL, optional=True, + default=True), + ), + description="Similarity score (0..1) between two strings.", + )) + specs.append(CommandSpec( + "AC_fuzzy_best_match", "Data", "Fuzzy: Best Match", + fields=( + FieldSpec("query", FieldType.STRING), + FieldSpec("choices", FieldType.STRING, + placeholder='["Save", "Cancel", "Submit"]'), + FieldSpec("score_cutoff", FieldType.FLOAT, optional=True, + default=0.0), + FieldSpec("ignore_case", FieldType.BOOL, optional=True, + default=True), + ), + description="Best fuzzy match of query within choices (JSON list).", + )) + specs.append(CommandSpec( + 
"AC_fuzzy_dedupe", "Data", "Fuzzy: Dedupe", + fields=( + FieldSpec("items", FieldType.STRING, + placeholder='["foo", "foo ", "bar"]'), + FieldSpec("threshold", FieldType.FLOAT, optional=True, + default=0.9), + FieldSpec("ignore_case", FieldType.BOOL, optional=True, + default=True), + ), + description="Collapse near-duplicate strings (JSON list).", + )) specs.append(CommandSpec( "AC_generate_sop", "Report", "Generate SOP Document", fields=( diff --git a/je_auto_control/utils/executor/action_executor.py b/je_auto_control/utils/executor/action_executor.py index 23f6555c..41d6f0a0 100644 --- a/je_auto_control/utils/executor/action_executor.py +++ b/je_auto_control/utils/executor/action_executor.py @@ -3067,6 +3067,37 @@ def _write_step_video(steps: Any, output: str, fps: int = 10, seconds_per_step=seconds_per_step) +def _coerce_list(value: Any) -> List[Any]: + import json + return json.loads(value) if isinstance(value, str) else list(value) + + +def _fuzzy_ratio(left: Any, right: Any, + ignore_case: bool = True) -> Dict[str, Any]: + """Adapter: similarity score (0..1) between two values.""" + from je_auto_control.utils.fuzzy import fuzzy_ratio + return {"score": fuzzy_ratio(left, right, ignore_case=ignore_case)} + + +def _fuzzy_best_match(query: Any, choices: Any, score_cutoff: float = 0.0, + ignore_case: bool = True) -> Dict[str, Any]: + """Adapter: best fuzzy match from choices, or a null match.""" + from je_auto_control.utils.fuzzy import fuzzy_best_match + best = fuzzy_best_match(query, _coerce_list(choices), + score_cutoff=score_cutoff, ignore_case=ignore_case) + if best is None: + return {"match": None, "score": 0.0, "index": -1} + return {"match": best[0], "score": best[1], "index": best[2]} + + +def _fuzzy_dedupe(items: Any, threshold: float = 0.9, + ignore_case: bool = True) -> Dict[str, Any]: + """Adapter: drop near-duplicate items, keeping the first of each cluster.""" + from je_auto_control.utils.fuzzy import fuzzy_dedupe + return {"unique": 
fuzzy_dedupe(_coerce_list(items), threshold=threshold, + ignore_case=ignore_case)} + + class Executor: """ Executor @@ -3319,6 +3350,9 @@ def __init__(self): "AC_trace_export": _trace_export, "AC_trace_reset": _trace_reset, "AC_write_step_video": _write_step_video, + "AC_fuzzy_ratio": _fuzzy_ratio, + "AC_fuzzy_best_match": _fuzzy_best_match, + "AC_fuzzy_dedupe": _fuzzy_dedupe, "AC_a11y_record_start": _a11y_record_start, "AC_a11y_record_stop": _a11y_record_stop, "AC_a11y_record_events": _a11y_record_events, diff --git a/je_auto_control/utils/fuzzy/__init__.py b/je_auto_control/utils/fuzzy/__init__.py new file mode 100644 index 00000000..e7c2b0fd --- /dev/null +++ b/je_auto_control/utils/fuzzy/__init__.py @@ -0,0 +1,9 @@ +"""Fuzzy string matching and dedupe (difflib by default, rapidfuzz if present).""" +from je_auto_control.utils.fuzzy.fuzzy_match import ( + BACKEND, fuzzy_best_match, fuzzy_dedupe, fuzzy_matches, fuzzy_ratio, +) + +__all__ = [ + "BACKEND", "fuzzy_best_match", "fuzzy_dedupe", "fuzzy_matches", + "fuzzy_ratio", +] diff --git a/je_auto_control/utils/fuzzy/fuzzy_match.py b/je_auto_control/utils/fuzzy/fuzzy_match.py new file mode 100644 index 00000000..026ffc7b --- /dev/null +++ b/je_auto_control/utils/fuzzy/fuzzy_match.py @@ -0,0 +1,85 @@ +"""Fuzzy string matching for noisy automation text (OCR labels, table cells). + +Exact string comparison is brittle when text comes from OCR or shifting UI +copy. These helpers score similarity, pick the best candidate from a list, and +collapse near-duplicates. The default backend is the standard library +``difflib`` (so the feature works with **zero** extra dependencies); if the +optional ``rapidfuzz`` package is installed it is used instead for speed — the +scores are normalised to ``0.0..1.0`` either way, so callers don't care which +backend ran. :data:`BACKEND` names the active one. + +Pure Python; imports no ``PySide6``. 
+""" +from typing import Any, List, Optional, Sequence, Tuple + +try: # optional acceleration; the difflib fallback is always correct + from rapidfuzz import fuzz as _rf + + BACKEND = "rapidfuzz" + + def _similarity(left: str, right: str) -> float: + return _rf.ratio(left, right) / 100.0 +except ImportError: # pragma: no cover - exercised wherever rapidfuzz is absent + from difflib import SequenceMatcher + + BACKEND = "difflib" + + def _similarity(left: str, right: str) -> float: + return SequenceMatcher(None, left, right).ratio() + + +def _prepare(value: Any, ignore_case: bool) -> str: + text = str(value) + return text.lower() if ignore_case else text + + +def fuzzy_ratio(left: Any, right: Any, *, ignore_case: bool = True) -> float: + """Return a similarity score in ``0.0..1.0`` for two values.""" + return _similarity(_prepare(left, ignore_case), + _prepare(right, ignore_case)) + + +def fuzzy_matches(query: Any, choices: Sequence[Any], *, limit: int = 5, + score_cutoff: float = 0.0, ignore_case: bool = True + ) -> List[Tuple[Any, float, int]]: + """Return up to ``limit`` ``(choice, score, index)`` tuples, best first. + + Only choices scoring at least ``score_cutoff`` are returned. 
+ """ + prepared_query = _prepare(query, ignore_case) + scored = [ + (choice, _similarity(prepared_query, _prepare(choice, ignore_case)), + index) + for index, choice in enumerate(choices) + ] + scored = [item for item in scored if item[1] >= score_cutoff] + scored.sort(key=lambda item: item[1], reverse=True) + return scored[:limit] if limit >= 0 else scored + + +def fuzzy_best_match(query: Any, choices: Sequence[Any], *, + score_cutoff: float = 0.0, ignore_case: bool = True + ) -> Optional[Tuple[Any, float, int]]: + """Return the single best ``(choice, score, index)`` or ``None``.""" + ranked = fuzzy_matches(query, choices, limit=1, score_cutoff=score_cutoff, + ignore_case=ignore_case) + return ranked[0] if ranked else None + + +def fuzzy_dedupe(items: Sequence[Any], *, threshold: float = 0.9, + ignore_case: bool = True) -> List[Any]: + """Collapse near-duplicate items, keeping the first of each cluster. + + An item is dropped when it scores at least ``threshold`` against an item + already kept. + """ + kept: List[Any] = [] + kept_prepared: List[str] = [] + for item in items: + prepared = _prepare(item, ignore_case) + if any(_similarity(prepared, seen) >= threshold + for seen in kept_prepared): + continue + kept.append(item) + kept_prepared.append(prepared) + return kept diff --git a/je_auto_control/utils/mcp_server/tools/_factories.py b/je_auto_control/utils/mcp_server/tools/_factories.py index 091683ff..0069d947 100644 --- a/je_auto_control/utils/mcp_server/tools/_factories.py +++ b/je_auto_control/utils/mcp_server/tools/_factories.py @@ -2844,6 +2844,47 @@ def video_report_tools() -> List[MCPTool]: ] +def fuzzy_tools() -> List[MCPTool]: + _CHOICES = {"type": "array", "items": {"type": "string"}} + return [ + MCPTool( + name="ac_fuzzy_ratio", + description=("Similarity score (0..1) between two strings, robust " + "to OCR/UI noise (difflib, or rapidfuzz if " + "installed). 'ignore_case' defaults true. 
Returns " + "{score}."), + input_schema=schema( + {"left": {"type": "string"}, "right": {"type": "string"}, + "ignore_case": {"type": "boolean"}}, ["left", "right"]), + handler=h.fuzzy_ratio, + annotations=READ_ONLY, + ), + MCPTool( + name="ac_fuzzy_best_match", + description=("Best fuzzy match of 'query' within 'choices' scoring " + ">= 'score_cutoff'. Returns {match, score, index} or " + "{match: null} when nothing qualifies."), + input_schema=schema( + {"query": {"type": "string"}, "choices": _CHOICES, + "score_cutoff": {"type": "number"}, + "ignore_case": {"type": "boolean"}}, ["query", "choices"]), + handler=h.fuzzy_best_match, + annotations=READ_ONLY, + ), + MCPTool( + name="ac_fuzzy_dedupe", + description=("Collapse near-duplicate strings, keeping the first " + "of each cluster (items >= 'threshold' similar are " + "dropped). Returns {unique}."), + input_schema=schema( + {"items": _CHOICES, "threshold": {"type": "number"}, + "ignore_case": {"type": "boolean"}}, ["items"]), + handler=h.fuzzy_dedupe, + annotations=READ_ONLY, + ), + ] + + def unattended_tools() -> List[MCPTool]: return [ MCPTool( @@ -3904,7 +3945,7 @@ def media_assert_tools() -> List[MCPTool]: process_doc_tools, tween_drag_tools, plugin_sdk_tools, governance_tools, credential_lease_tools, egress_tools, approval_testing_tools, trajectory_eval_tools, compliance_tools, agent_trace_tools, - video_report_tools, + video_report_tools, fuzzy_tools, screen_record_tools, process_and_shell_tools, remote_desktop_tools, gamepad_tools, usb_passthrough_tools, assertion_tools, data_source_tools, diff --git a/je_auto_control/utils/mcp_server/tools/_handlers.py b/je_auto_control/utils/mcp_server/tools/_handlers.py index d099003f..2621783a 100644 --- a/je_auto_control/utils/mcp_server/tools/_handlers.py +++ b/je_auto_control/utils/mcp_server/tools/_handlers.py @@ -1372,6 +1372,26 @@ def write_step_video(steps, output, fps=10, seconds_per_step=2.0): return _write(steps, output, fps=fps, 
seconds_per_step=seconds_per_step) +def fuzzy_ratio(left, right, ignore_case=True): + from je_auto_control.utils.fuzzy import fuzzy_ratio as _ratio + return {"score": _ratio(left, right, ignore_case=ignore_case)} + + +def fuzzy_best_match(query, choices, score_cutoff=0.0, ignore_case=True): + from je_auto_control.utils.fuzzy import fuzzy_best_match as _best + best = _best(query, choices, score_cutoff=score_cutoff, + ignore_case=ignore_case) + if best is None: + return {"match": None, "score": 0.0, "index": -1} + return {"match": best[0], "score": best[1], "index": best[2]} + + +def fuzzy_dedupe(items, threshold=0.9, ignore_case=True): + from je_auto_control.utils.fuzzy import fuzzy_dedupe as _dedupe + return {"unique": _dedupe(items, threshold=threshold, + ignore_case=ignore_case)} + + def vlm_locate(description: str, screen_region: Optional[List[int]] = None, model: Optional[str] = None) -> Optional[List[int]]: diff --git a/pyproject.toml b/pyproject.toml index e35619e8..a7136a51 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -72,6 +72,7 @@ signaling = ["fastapi>=0.115", "uvicorn>=0.32"] discovery = ["zeroconf>=0.130"] pdf = ["pypdf>=4.0"] office = ["openpyxl>=3.1", "python-docx>=1.1", "python-pptx>=0.6"] +fuzzy = ["rapidfuzz>=3.0"] [tool.bandit] exclude_dirs = [ diff --git a/test/unit_test/headless/test_fuzzy_match_batch.py b/test/unit_test/headless/test_fuzzy_match_batch.py new file mode 100644 index 00000000..c561f884 --- /dev/null +++ b/test/unit_test/headless/test_fuzzy_match_batch.py @@ -0,0 +1,96 @@ +"""Headless tests for fuzzy matching/dedupe. The difflib backend is always +present, so these run with no extra dependency; assertions check ordering and +thresholds, not exact backend-specific float values. 
Pure stdlib, no Qt.""" +import je_auto_control as ac +from je_auto_control.utils.fuzzy import ( + BACKEND, fuzzy_best_match, fuzzy_dedupe, fuzzy_matches, fuzzy_ratio) + + +def test_backend_is_known(): + assert BACKEND in ("difflib", "rapidfuzz") + + +def test_ratio_bounds_and_identity(): + assert fuzzy_ratio("hello", "hello") == 1.0 + assert fuzzy_ratio("hello", "xxxxx") < 0.5 + assert 0.0 <= fuzzy_ratio("Save", "save", ignore_case=False) <= 1.0 + + +def test_ratio_ignore_case(): + assert fuzzy_ratio("SAVE", "save", ignore_case=True) == 1.0 + assert fuzzy_ratio("SAVE", "save", ignore_case=False) < 1.0 + + +def test_best_match_picks_closest(): + choices = ["Cancel", "Save As...", "Save", "Submit"] + match, score, index = fuzzy_best_match("Sve", choices) + assert match == "Save" + assert choices[index] == "Save" + assert score > 0.5 + + +def test_best_match_cutoff_returns_none(): + assert fuzzy_best_match("zzzzz", ["alpha", "beta"], + score_cutoff=0.8) is None + + +def test_matches_sorted_and_limited(): + choices = ["login", "logon", "logout", "register"] + ranked = fuzzy_matches("login", choices, limit=2) + assert len(ranked) == 2 + scores = [score for _, score, _ in ranked] + assert scores == sorted(scores, reverse=True) + assert ranked[0][0] == "login" + + +def test_dedupe_collapses_near_duplicates(): + items = ["Invoice", "invoice ", "Receipt", "INVOICE"] + unique = fuzzy_dedupe(items, threshold=0.85) + assert "Invoice" in unique + assert "Receipt" in unique + assert len(unique) == 2 # the invoice variants collapse + + +def test_dedupe_keeps_distinct(): + items = ["apple", "banana", "cherry"] + assert fuzzy_dedupe(items) == items + + +# --- wiring --------------------------------------------------------------- + +def test_executor_round_trip(): + rec = ac.execute_action([[ + "AC_fuzzy_best_match", + {"query": "Sumbit", "choices": ["Cancel", "Submit", "Save"]}, + ]]) + result = next(v for v in rec.values() if isinstance(v, dict)) + assert result["match"] == 
"Submit" + + rec2 = ac.execute_action([[ + "AC_fuzzy_dedupe", + {"items": ["a", "a", "b"], "threshold": 0.95}, + ]]) + deduped = next(v for v in rec2.values() if isinstance(v, dict)) + assert deduped["unique"] == ["a", "b"] + + +def test_wiring(): + known = ac.executor.known_commands() + assert {"AC_fuzzy_ratio", "AC_fuzzy_best_match", + "AC_fuzzy_dedupe"} <= known + from je_auto_control.utils.mcp_server.tools import ( + build_default_tool_registry) + names = {t.name for t in build_default_tool_registry()} + assert {"ac_fuzzy_ratio", "ac_fuzzy_best_match", + "ac_fuzzy_dedupe"} <= names + from je_auto_control.gui.script_builder.command_schema import _build_specs + cmds = {s.command for s in _build_specs()} + assert {"AC_fuzzy_ratio", "AC_fuzzy_best_match", + "AC_fuzzy_dedupe"} <= cmds + + +def test_facade_exports(): + for attr in ("fuzzy_ratio", "fuzzy_best_match", "fuzzy_matches", + "fuzzy_dedupe"): + assert hasattr(ac, attr) + assert attr in ac.__all__ From 81394ca611df011b4d3f9e255adfa1fae27a3705 Mon Sep 17 00:00:00 2001 From: JeffreyChen Date: Sat, 20 Jun 2026 00:09:33 +0800 Subject: [PATCH 2/2] Use pytest.approx for fuzzy ratio comparisons (Sonar S1244) --- test/unit_test/headless/test_fuzzy_match_batch.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/unit_test/headless/test_fuzzy_match_batch.py b/test/unit_test/headless/test_fuzzy_match_batch.py index c561f884..6f3ffde0 100644 --- a/test/unit_test/headless/test_fuzzy_match_batch.py +++ b/test/unit_test/headless/test_fuzzy_match_batch.py @@ -1,6 +1,8 @@ """Headless tests for fuzzy matching/dedupe. The difflib backend is always present, so these run with no extra dependency; assertions check ordering and thresholds, not exact backend-specific float values. 
Pure stdlib, no Qt.""" +import pytest + import je_auto_control as ac from je_auto_control.utils.fuzzy import ( BACKEND, fuzzy_best_match, fuzzy_dedupe, fuzzy_matches, fuzzy_ratio) @@ -11,13 +13,13 @@ def test_backend_is_known(): def test_ratio_bounds_and_identity(): - assert fuzzy_ratio("hello", "hello") == 1.0 + assert fuzzy_ratio("hello", "hello") == pytest.approx(1.0) assert fuzzy_ratio("hello", "xxxxx") < 0.5 assert 0.0 <= fuzzy_ratio("Save", "save", ignore_case=False) <= 1.0 def test_ratio_ignore_case(): - assert fuzzy_ratio("SAVE", "save", ignore_case=True) == 1.0 + assert fuzzy_ratio("SAVE", "save", ignore_case=True) == pytest.approx(1.0) assert fuzzy_ratio("SAVE", "save", ignore_case=False) < 1.0