From 44ca1b72460e2622d9f7430f622d62f8503d9f0b Mon Sep 17 00:00:00 2001 From: JeffreyChen Date: Mon, 22 Jun 2026 11:41:25 +0800 Subject: [PATCH] Add bidirectional-text QA with Trojan-source detection --- README.md | 7 ++ README/README_zh-CN.md | 7 ++ README/README_zh-TW.md | 7 ++ .../doc/new_features/v111_features_doc.rst | 51 +++++++++ docs/source/Eng/eng_index.rst | 1 + .../Zh/doc/new_features/v111_features_doc.rst | 42 +++++++ docs/source/Zh/zh_index.rst | 1 + je_auto_control/__init__.py | 13 +++ .../gui/script_builder/command_schema.py | 14 +++ je_auto_control/utils/bidi_check/__init__.py | 11 ++ .../utils/bidi_check/bidi_check.py | 105 ++++++++++++++++++ .../utils/executor/action_executor.py | 14 +++ .../utils/mcp_server/tools/_factories.py | 22 ++++ .../utils/mcp_server/tools/_handlers.py | 10 ++ .../headless/test_bidi_check_batch.py | 85 ++++++++++++++ 15 files changed, 390 insertions(+) create mode 100644 docs/source/Eng/doc/new_features/v111_features_doc.rst create mode 100644 docs/source/Zh/doc/new_features/v111_features_doc.rst create mode 100644 je_auto_control/utils/bidi_check/__init__.py create mode 100644 je_auto_control/utils/bidi_check/bidi_check.py create mode 100644 test/unit_test/headless/test_bidi_check_batch.py diff --git a/README.md b/README.md index af0bc82e..5c6b4926 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,7 @@ ## Table of Contents +- [What's new (2026-06-22) — Bidirectional-Text QA (Trojan-Source Scan)](#whats-new-2026-06-22--bidirectional-text-qa-trojan-source-scan) - [What's new (2026-06-22) — Readability Scoring](#whats-new-2026-06-22--readability-scoring) - [What's new (2026-06-22) — Confusable / Homoglyph Detection](#whats-new-2026-06-22--confusable--homoglyph-detection) - [What's new (2026-06-22) — Locale-Aware String Collation](#whats-new-2026-06-22--locale-aware-string-collation) @@ -163,6 +164,12 @@ --- +## What's new (2026-06-22) — Bidirectional-Text QA (Trojan-Source Scan) + +Catch invisible Unicode directional 
formatting (RTL QA + Trojan-source). Full reference: [`docs/source/Eng/doc/new_features/v111_features_doc.rst`](docs/source/Eng/doc/new_features/v111_features_doc.rst). + +- **`detect_bidi_issues` / `bidi_controls` / `is_bidi_balanced` / `base_direction` / `is_trojan_source` / `strip_bidi_controls` / `has_bidi_controls`** (`AC_bidi_check`, `AC_bidi_strip`): `confusables` catches lookalike characters, but bidi controls (LRO/RLO/PDF, isolates, marks) can silently reorder rendered text — an RTL-QA gap and the "Trojan Source" attack (CVE-2021-42574). This lists the controls, checks nesting balance, infers base direction, and flags reordering formatting. Pure-stdlib (`unicodedata`), deterministic. + ## What's new (2026-06-22) — Readability Scoring Score how hard text is to read; gate generated copy on a reading grade. Full reference: [`docs/source/Eng/doc/new_features/v110_features_doc.rst`](docs/source/Eng/doc/new_features/v110_features_doc.rst). diff --git a/README/README_zh-CN.md b/README/README_zh-CN.md index 356cbed1..e1ce571f 100644 --- a/README/README_zh-CN.md +++ b/README/README_zh-CN.md @@ -12,6 +12,7 @@ ## 目录 +- [本次更新 (2026-06-22) — 双向文字 QA(Trojan-Source 扫描)](#本次更新-2026-06-22--双向文字-qatrojan-source-扫描) - [本次更新 (2026-06-22) — 可读性评分](#本次更新-2026-06-22--可读性评分) - [本次更新 (2026-06-22) — 易混淆字符 / 同形异义字检测](#本次更新-2026-06-22--易混淆字符--同形异义字检测) - [本次更新 (2026-06-22) — 区域感知字符串排序](#本次更新-2026-06-22--区域感知字符串排序) @@ -166,6 +167,12 @@ 平滑噪声值序列。完整参考:[`docs/source/Zh/doc/new_features/v102_features_doc.rst`](../docs/source/Zh/doc/new_features/v102_features_doc.rst)。 +## 本次更新 (2026-06-22) — 双向文字 QA(Trojan-Source 扫描) + +抓出隐形的 Unicode 方向格式控制(RTL QA + Trojan-source)。完整参考:[`docs/source/Zh/doc/new_features/v111_features_doc.rst`](../docs/source/Zh/doc/new_features/v111_features_doc.rst)。 + +- **`detect_bidi_issues` / `bidi_controls` / `is_bidi_balanced` / `base_direction` / `is_trojan_source` / `strip_bidi_controls` / `has_bidi_controls`**(`AC_bidi_check`、`AC_bidi_strip`):`confusables` 
抓相似字符,但双向控制(LRO/RLO/PDF、隔离、标记)可悄悄改变呈现顺序——既是 RTL QA 缺口,也是「Trojan Source」攻击(CVE-2021-42574)。本功能列出控制字符、检查嵌套平衡、推断基底方向,并标记重排格式。纯标准库(`unicodedata`)、确定。 + ## 本次更新 (2026-06-22) — 可读性评分 评估文字有多难读;以阅读年级把关生成的文案。完整参考:[`docs/source/Zh/doc/new_features/v110_features_doc.rst`](../docs/source/Zh/doc/new_features/v110_features_doc.rst)。 diff --git a/README/README_zh-TW.md b/README/README_zh-TW.md index e93080ac..a45fbe28 100644 --- a/README/README_zh-TW.md +++ b/README/README_zh-TW.md @@ -12,6 +12,7 @@ ## 目錄 +- [本次更新 (2026-06-22) — 雙向文字 QA(Trojan-Source 掃描)](#本次更新-2026-06-22--雙向文字-qatrojan-source-掃描) - [本次更新 (2026-06-22) — 可讀性評分](#本次更新-2026-06-22--可讀性評分) - [本次更新 (2026-06-22) — 易混淆字元 / 同形異義字偵測](#本次更新-2026-06-22--易混淆字元--同形異義字偵測) - [本次更新 (2026-06-22) — 地區感知字串排序](#本次更新-2026-06-22--地區感知字串排序) @@ -166,6 +167,12 @@ 平滑雜訊值序列。完整參考:[`docs/source/Zh/doc/new_features/v102_features_doc.rst`](../docs/source/Zh/doc/new_features/v102_features_doc.rst)。 +## 本次更新 (2026-06-22) — 雙向文字 QA(Trojan-Source 掃描) + +抓出隱形的 Unicode 方向格式控制(RTL QA + Trojan-source)。完整參考:[`docs/source/Zh/doc/new_features/v111_features_doc.rst`](../docs/source/Zh/doc/new_features/v111_features_doc.rst)。 + +- **`detect_bidi_issues` / `bidi_controls` / `is_bidi_balanced` / `base_direction` / `is_trojan_source` / `strip_bidi_controls` / `has_bidi_controls`**(`AC_bidi_check`、`AC_bidi_strip`):`confusables` 抓相似字元,但雙向控制(LRO/RLO/PDF、隔離、標記)可悄悄改變呈現順序——既是 RTL QA 缺口,也是「Trojan Source」攻擊(CVE-2021-42574)。本功能列出控制字元、檢查巢狀平衡、推斷基底方向,並標記重排格式。純標準函式庫(`unicodedata`)、具決定性。 + ## 本次更新 (2026-06-22) — 可讀性評分 評估文字有多難讀;以閱讀年級把關產生的文案。完整參考:[`docs/source/Zh/doc/new_features/v110_features_doc.rst`](../docs/source/Zh/doc/new_features/v110_features_doc.rst)。 diff --git a/docs/source/Eng/doc/new_features/v111_features_doc.rst b/docs/source/Eng/doc/new_features/v111_features_doc.rst new file mode 100644 index 00000000..560daff3 --- /dev/null +++ b/docs/source/Eng/doc/new_features/v111_features_doc.rst @@ -0,0 +1,51 @@ +Bidirectional-Text QA (Trojan-Source Scan) 
+========================================== + +``confusables`` catches lookalike *characters*, but invisible Unicode +*directional formatting* is a separate hazard. The embeddings/overrides +(LRE/RLE/LRO/RLO/PDF), isolates (LRI/RLI/FSI/PDI) and marks (LRM/RLM/ALM) can +silently reorder how text renders. That is both an RTL localisation-QA gap and +the basis of the "Trojan Source" attack (CVE-2021-42574), where override controls +make source read differently than it runs. + +This reports the bidi controls in a string, checks that embeddings/isolates are +balanced, infers the base direction, and flags Trojan-source-style formatting. +Pure standard library (``unicodedata``); imports no ``PySide6``. Every function is +pure, so it is fully deterministic in CI. + +Headless API +------------ + +.. code-block:: python + + from je_auto_control import ( + detect_bidi_issues, bidi_controls, has_bidi_controls, + is_bidi_balanced, base_direction, is_trojan_source, + strip_bidi_controls, + ) + + rlo, pdf = chr(0x202E), chr(0x202C) # RLO override, PDF terminator + sneaky = f"value = {rlo}admin{pdf}" + detect_bidi_issues(sneaky) + # {'controls': [{'index': 8, 'char': '\u202e', 'name': 'RLO'}, ...], + # 'has_controls': True, 'balanced': True, 'base_direction': 'LTR', + # 'trojan_source': True} + + is_trojan_source(sneaky) # True + strip_bidi_controls(sneaky) # 'value = admin' + base_direction("אב") # 'RTL' (Hebrew alef bet) + +``bidi_controls`` lists every control as ``{index, char, name}``. ``is_bidi_balanced`` +checks that PDF closes an embedding/override and PDI closes an isolate, properly +nested. ``base_direction`` returns ``LTR`` / ``RTL`` / ``NEUTRAL`` from the first +strong character. ``is_trojan_source`` is true when any non-mark formatting +control is present or the nesting is unbalanced. ``strip_bidi_controls`` returns a +clean copy. ``detect_bidi_issues`` bundles it all into one report. 
+ +Executor commands +----------------- + +``AC_bidi_check`` returns the full report; ``AC_bidi_strip`` returns +``{text}`` with controls removed. Both are exposed as MCP tools +(``ac_bidi_check`` / ``ac_bidi_strip``) and as Script Builder commands under +**Data**. diff --git a/docs/source/Eng/eng_index.rst b/docs/source/Eng/eng_index.rst index 34e7c122..9adac878 100644 --- a/docs/source/Eng/eng_index.rst +++ b/docs/source/Eng/eng_index.rst @@ -133,6 +133,7 @@ Comprehensive guides for all AutoControl features. doc/new_features/v108_features_doc doc/new_features/v109_features_doc doc/new_features/v110_features_doc + doc/new_features/v111_features_doc doc/ocr_backends/ocr_backends_doc doc/observability/observability_doc doc/operations_layer/operations_layer_doc diff --git a/docs/source/Zh/doc/new_features/v111_features_doc.rst b/docs/source/Zh/doc/new_features/v111_features_doc.rst new file mode 100644 index 00000000..9555eb7b --- /dev/null +++ b/docs/source/Zh/doc/new_features/v111_features_doc.rst @@ -0,0 +1,42 @@ +雙向文字 QA(Trojan-Source 掃描) +================================ + +``confusables`` 抓出相似的*字元*,但隱形的 Unicode *方向格式控制*是另一種危害。嵌入/覆寫 +(LRE/RLE/LRO/RLO/PDF)、隔離(LRI/RLI/FSI/PDI)與標記(LRM/RLM/ALM)可以悄悄改變文字的呈現順序。這既是 +RTL 在地化 QA 的缺口,也是「Trojan Source」攻擊(CVE-2021-42574)的根源——覆寫控制讓原始碼讀起來與實際執行 +不同。 + +本功能回報字串中的雙向控制字元、檢查嵌入/隔離是否平衡、推斷基底方向,並標記 Trojan-source 式的格式。 +純標準函式庫(``unicodedata``);不匯入 ``PySide6``。每個函式皆為純函式,因此在 CI 中完全具決定性。 + +無頭 API +-------- + +.. code-block:: python + + from je_auto_control import ( + detect_bidi_issues, bidi_controls, has_bidi_controls, + is_bidi_balanced, base_direction, is_trojan_source, + strip_bidi_controls, + ) + + sneaky = f"value = {chr(0x202E)}admin{chr(0x202C)}" # RLO ... 
PDF + detect_bidi_issues(sneaky) + # {'controls': [{'index': 8, 'char': '\u202e', 'name': 'RLO'}, ...], + # 'has_controls': True, 'balanced': True, 'base_direction': 'LTR', + # 'trojan_source': True} + + is_trojan_source(sneaky) # True + strip_bidi_controls(sneaky) # 'value = admin' + base_direction("אב") # 'RTL' + +``bidi_controls`` 將每個控制字元列為 ``{index, char, name}``。``is_bidi_balanced`` 檢查 PDF 關閉一個嵌入/覆寫、 +PDI 關閉一個隔離,且正確巢狀。``base_direction`` 依第一個強方向字元回傳 ``LTR`` / ``RTL`` / ``NEUTRAL``。 +``is_trojan_source`` 在出現任何非標記格式控制或巢狀不平衡時為真。``strip_bidi_controls`` 回傳乾淨副本。 +``detect_bidi_issues`` 將全部打包為一份報告。 + +執行器命令 +---------- + +``AC_bidi_check`` 回傳完整報告;``AC_bidi_strip`` 回傳移除控制字元後的 ``{text}``。兩者皆以 MCP 工具 +(``ac_bidi_check`` / ``ac_bidi_strip``)以及 Script Builder 中 **Data** 分類下的命令提供。 diff --git a/docs/source/Zh/zh_index.rst b/docs/source/Zh/zh_index.rst index d0630382..f604f80e 100644 --- a/docs/source/Zh/zh_index.rst +++ b/docs/source/Zh/zh_index.rst @@ -133,6 +133,7 @@ AutoControl 所有功能的完整使用指南。 doc/new_features/v108_features_doc doc/new_features/v109_features_doc doc/new_features/v110_features_doc + doc/new_features/v111_features_doc doc/ocr_backends/ocr_backends_doc doc/observability/observability_doc doc/operations_layer/operations_layer_doc diff --git a/je_auto_control/__init__.py b/je_auto_control/__init__.py index 95991bf5..b627d1a4 100644 --- a/je_auto_control/__init__.py +++ b/je_auto_control/__init__.py @@ -229,6 +229,12 @@ flesch_reading_ease, gunning_fog, readability_report, readability_stats, smog_index, ) +# Bidirectional-text QA (bidi controls, nesting balance, Trojan-source scan) +from je_auto_control.utils.bidi_check import ( + base_direction, bidi_controls, detect_bidi_issues, has_bidi_controls, + is_trojan_source, strip_bidi_controls, +) +from je_auto_control.utils.bidi_check import is_balanced as is_bidi_balanced # CI workflow annotations (GitHub Actions) from je_auto_control.utils.ci_annotations import ( emit_annotations, format_annotation, @@ -975,6 +981,13 @@ 
def start_autocontrol_gui(*args, **kwargs): "readability_report", "readability_stats", "smog_index", + "base_direction", + "bidi_controls", + "detect_bidi_issues", + "has_bidi_controls", + "is_bidi_balanced", + "is_trojan_source", + "strip_bidi_controls", "emit_annotations", "format_annotation", "ClipboardHistory", "default_clipboard_history", "analyze_heal_log", "heal_stats", "scan_secrets", diff --git a/je_auto_control/gui/script_builder/command_schema.py b/je_auto_control/gui/script_builder/command_schema.py index 24e269ab..caf33d5b 100644 --- a/je_auto_control/gui/script_builder/command_schema.py +++ b/je_auto_control/gui/script_builder/command_schema.py @@ -2113,6 +2113,20 @@ def _add_resilience_specs(specs: List[CommandSpec]) -> None: ), description="Flesch / Flesch-Kincaid / Fog / SMOG / ARI scores + counts.", )) + specs.append(CommandSpec( + "AC_bidi_check", "Data", "Text: Bidi / Trojan-Source Check", + fields=( + FieldSpec("text", FieldType.STRING, placeholder="value = admin"), + ), + description="Bidi controls, nesting balance, base dir, Trojan-source flag.", + )) + specs.append(CommandSpec( + "AC_bidi_strip", "Data", "Text: Strip Bidi Controls", + fields=( + FieldSpec("text", FieldType.STRING, placeholder="value = admin"), + ), + description="Remove all bidirectional control characters from a string.", + )) specs.append(CommandSpec( "AC_diff_rows", "Data", "Dataset Diff: Rows by Key", fields=( diff --git a/je_auto_control/utils/bidi_check/__init__.py b/je_auto_control/utils/bidi_check/__init__.py new file mode 100644 index 00000000..b31f4d19 --- /dev/null +++ b/je_auto_control/utils/bidi_check/__init__.py @@ -0,0 +1,11 @@ +"""Bidirectional-text QA (bidi controls, nesting balance, Trojan-source scan).""" +from je_auto_control.utils.bidi_check.bidi_check import ( + base_direction, bidi_controls, detect_bidi_issues, has_bidi_controls, + is_balanced, is_trojan_source, strip_bidi_controls, +) + +__all__ = [ + "base_direction", "bidi_controls", 
"detect_bidi_issues", + "has_bidi_controls", "is_balanced", "is_trojan_source", + "strip_bidi_controls", +] diff --git a/je_auto_control/utils/bidi_check/bidi_check.py b/je_auto_control/utils/bidi_check/bidi_check.py new file mode 100644 index 00000000..f38e6423 --- /dev/null +++ b/je_auto_control/utils/bidi_check/bidi_check.py @@ -0,0 +1,105 @@ +"""Bidirectional-text QA: bidi controls, nesting balance, Trojan-source scan. + +``confusables`` catches lookalike *characters*, but invisible Unicode *directional +formatting* (LRE/RLE/LRO/RLO/PDF, the isolates LRI/RLI/FSI/PDI, and the marks +LRM/RLM/ALM) is a separate hazard: it can silently reorder how text renders. That +is both an RTL localisation-QA gap and the basis of the "Trojan Source" attack +(CVE-2021-42574), where override controls make source read differently than it +runs. + +This reports the bidi controls in a string, checks that embeddings/isolates are +balanced, infers the base direction, and flags Trojan-source-style formatting. +Pure standard library (``unicodedata``); imports no ``PySide6``. Every function is +pure, so it is fully deterministic in CI. +""" +import unicodedata +from typing import Dict, List + +# Code points are built with ``chr`` rather than literal characters so this +# source file itself contains no bidi controls (which would trip Trojan-source +# scanners such as Bandit B613 on the module that detects them). +_NAME_TO_CP = { + "LRE": 0x202A, "RLE": 0x202B, "PDF": 0x202C, "LRO": 0x202D, "RLO": 0x202E, + "LRI": 0x2066, "RLI": 0x2067, "FSI": 0x2068, "PDI": 0x2069, + "LRM": 0x200E, "RLM": 0x200F, "ALM": 0x061C, +} +_BIDI_CONTROLS: Dict[str, str] = {chr(cp): name + for name, cp in _NAME_TO_CP.items()} +# Opening controls mapped to their kind: "E" embedding/override, "I" isolate. 
+_OPEN_KIND = {chr(_NAME_TO_CP[name]): "E" + for name in ("LRE", "RLE", "LRO", "RLO")} +_OPEN_KIND.update({chr(_NAME_TO_CP[name]): "I" + for name in ("LRI", "RLI", "FSI")}) +_CLOSE_EMBED = chr(_NAME_TO_CP["PDF"]) +_CLOSE_ISOLATE = chr(_NAME_TO_CP["PDI"]) +# Marks do not nest; formatting (non-mark) controls are the reordering hazard. +_MARKS = {chr(_NAME_TO_CP[name]) for name in ("LRM", "RLM", "ALM")} + + +def bidi_controls(text: str) -> List[Dict[str, object]]: + """List the bidi control characters in ``text`` as ``{index, char, name}``.""" + return [{"index": index, "char": char, "name": _BIDI_CONTROLS[char]} + for index, char in enumerate(text or "") + if char in _BIDI_CONTROLS] + + +def has_bidi_controls(text: str) -> bool: + """Whether ``text`` contains any bidi control character.""" + return any(char in _BIDI_CONTROLS for char in text or "") + + +def _pop_matches(stack: List[str], expected: str) -> bool: + """Pop ``stack`` and report whether the popped kind was ``expected``.""" + return bool(stack) and stack.pop() == expected + + +def is_balanced(text: str) -> bool: + """Whether embeddings/overrides (PDF) and isolates (PDI) are well nested.""" + stack: List[str] = [] + for char in text or "": + kind = _OPEN_KIND.get(char) + if kind is not None: + stack.append(kind) + elif char == _CLOSE_EMBED and not _pop_matches(stack, "E"): + return False + elif char == _CLOSE_ISOLATE and not _pop_matches(stack, "I"): + return False + return not stack + + +def base_direction(text: str) -> str: + """Infer the paragraph base direction from the first strong character.""" + for char in text or "": + bidi = unicodedata.bidirectional(char) + if bidi == "L": + return "LTR" + if bidi in ("R", "AL"): + return "RTL" + return "NEUTRAL" + + +def strip_bidi_controls(text: str) -> str: + """Return ``text`` with every bidi control character removed.""" + return "".join(char for char in text or "" if char not in _BIDI_CONTROLS) + + +def is_trojan_source(text: str) -> bool: + """Whether 
``text`` carries reordering formatting (Trojan-source hazard). + + True when any non-mark formatting control (embedding/override/isolate) is + present, or when the bidi nesting is unbalanced. + """ + has_formatting = any(char in _BIDI_CONTROLS and char not in _MARKS + for char in text or "") + return has_formatting or not is_balanced(text) + + +def detect_bidi_issues(text: str) -> Dict[str, object]: + """Full bidi report: controls, balance, base direction, Trojan-source flag.""" + return { + "controls": bidi_controls(text), + "has_controls": has_bidi_controls(text), + "balanced": is_balanced(text), + "base_direction": base_direction(text), + "trojan_source": is_trojan_source(text), + } diff --git a/je_auto_control/utils/executor/action_executor.py b/je_auto_control/utils/executor/action_executor.py index e51795c4..57ab0c70 100644 --- a/je_auto_control/utils/executor/action_executor.py +++ b/je_auto_control/utils/executor/action_executor.py @@ -2999,6 +2999,18 @@ def _readability_report(text: str) -> Dict[str, Any]: return readability_report(text) +def _bidi_check(text: str) -> Dict[str, Any]: + """Adapter: bidirectional-text QA report (controls/balance/Trojan-source).""" + from je_auto_control.utils.bidi_check import detect_bidi_issues + return detect_bidi_issues(text) + + +def _bidi_strip(text: str) -> Dict[str, Any]: + """Adapter: remove all bidi control characters from a string.""" + from je_auto_control.utils.bidi_check import strip_bidi_controls + return {"text": strip_bidi_controls(text)} + + def _cas_put(name: str, key: str, value: Any, expected_version: Any = None) -> Dict[str, Any]: """Adapter: optimistic put into a named versioned store.""" @@ -4686,6 +4698,8 @@ def __init__(self): "AC_confusable_scan": _confusable_scan, "AC_confusable_compare": _confusable_compare, "AC_readability_report": _readability_report, + "AC_bidi_check": _bidi_check, + "AC_bidi_strip": _bidi_strip, "AC_detect_drift": _detect_drift, "AC_categorical_drift": _categorical_drift, 
"AC_diff_rows": _diff_rows, diff --git a/je_auto_control/utils/mcp_server/tools/_factories.py b/je_auto_control/utils/mcp_server/tools/_factories.py index b54223d8..da5426df 100644 --- a/je_auto_control/utils/mcp_server/tools/_factories.py +++ b/je_auto_control/utils/mcp_server/tools/_factories.py @@ -3666,6 +3666,27 @@ def readability_tools() -> List[MCPTool]: ] +def bidi_check_tools() -> List[MCPTool]: + return [ + MCPTool( + name="ac_bidi_check", + description=("Bidirectional-text QA for 'text': bidi controls, " + "nesting balance, base direction, Trojan-source flag."), + input_schema=schema({"text": {"type": "string"}}, ["text"]), + handler=h.bidi_check, + annotations=READ_ONLY, + ), + MCPTool( + name="ac_bidi_strip", + description=("Remove every bidi control character from 'text'. " + "Returns {text}."), + input_schema=schema({"text": {"type": "string"}}, ["text"]), + handler=h.bidi_strip, + annotations=NON_DESTRUCTIVE, + ), + ] + + def sequence_gap_tools() -> List[MCPTool]: return [ MCPTool( @@ -5707,6 +5728,7 @@ def media_assert_tools() -> List[MCPTool]: timeseries_tools, anomaly_tools, smoothing_tools, idempotency_tools, dedup_window_tools, sequence_gap_tools, optimistic_tools, outbox_tools, locale_collation_tools, confusables_tools, readability_tools, + bidi_check_tools, dataset_diff_tools, referential_tools, link_header_tools, multipart_tools, http_content_tools, cookie_jar_tools, http_conditional_tools, saga_tools, decision_table_tools, locator_repair_tools, diff --git a/je_auto_control/utils/mcp_server/tools/_handlers.py b/je_auto_control/utils/mcp_server/tools/_handlers.py index 1ecf6eb2..4be2e362 100644 --- a/je_auto_control/utils/mcp_server/tools/_handlers.py +++ b/je_auto_control/utils/mcp_server/tools/_handlers.py @@ -1987,6 +1987,16 @@ def readability_report(text): return _readability_report(text) +def bidi_check(text): + from je_auto_control.utils.executor.action_executor import _bidi_check + return _bidi_check(text) + + +def bidi_strip(text): + 
from je_auto_control.utils.executor.action_executor import _bidi_strip + return _bidi_strip(text) + + def detect_drift(reference, current, threshold=0.25, bins=10): from je_auto_control.utils.executor.action_executor import _detect_drift return _detect_drift(reference, current, threshold, bins) diff --git a/test/unit_test/headless/test_bidi_check_batch.py b/test/unit_test/headless/test_bidi_check_batch.py new file mode 100644 index 00000000..2e4db77f --- /dev/null +++ b/test/unit_test/headless/test_bidi_check_batch.py @@ -0,0 +1,85 @@ +"""Headless tests for bidirectional-text QA. No Qt.""" +import je_auto_control as ac +from je_auto_control.utils.bidi_check import ( + base_direction, bidi_controls, detect_bidi_issues, has_bidi_controls, + is_balanced, is_trojan_source, strip_bidi_controls, +) + +# "value = admin" — a Trojan-source style reorder. Controls are built +# with chr() so this test file holds no literal bidi characters. +_RLO = chr(0x202E) +_PDF = chr(0x202C) +_LRI = chr(0x2066) +_PDI = chr(0x2069) +_LRM = chr(0x200E) +_TROJAN = f"value = {_RLO}admin{_PDF}" +_PLAIN = "value = admin" + + +def test_bidi_controls_lists_positions(): + found = bidi_controls(_TROJAN) + assert [entry["name"] for entry in found] == ["RLO", "PDF"] + assert bidi_controls(_PLAIN) == [] + + +def test_has_controls(): + assert has_bidi_controls(_TROJAN) is True + assert has_bidi_controls(_PLAIN) is False + + +def test_balance_detection(): + assert is_balanced(_TROJAN) is True # RLO ... PDF closes + assert is_balanced(f"{_RLO}no close") is False # embed never closed + assert is_balanced(_PDF) is False # stray close + assert is_balanced(f"{_LRI}{_PDI}") is True # LRI ... 
PDI + assert is_balanced(f"{_RLO}{_PDI}") is False # PDI cannot close embed + + +def test_base_direction(): + assert base_direction("hello") == "LTR" + assert base_direction("אב") == "RTL" # Hebrew alef bet + assert base_direction("123 ...") == "NEUTRAL" + + +def test_strip_controls(): + assert strip_bidi_controls(_TROJAN) == _PLAIN + assert strip_bidi_controls(_PLAIN) == _PLAIN + + +def test_trojan_source_flag(): + assert is_trojan_source(_TROJAN) is True + assert is_trojan_source(_PLAIN) is False + assert is_trojan_source(_LRM) is False # a bare mark is benign + assert is_trojan_source(f"{_RLO}oops") is True # unbalanced override + + +# --- wiring --------------------------------------------------------------- + +def test_executor_round_trip(): + rec = ac.execute_action([["AC_bidi_check", {"text": _TROJAN}]]) + out = next(v for v in rec.values() if isinstance(v, dict)) + assert out["trojan_source"] is True and out["has_controls"] is True + rec2 = ac.execute_action([["AC_bidi_strip", {"text": _TROJAN}]]) + assert next(v for v in rec2.values() if isinstance(v, dict))["text"] == _PLAIN + + +def test_wiring(): + known = ac.executor.known_commands() + assert {"AC_bidi_check", "AC_bidi_strip"} <= set(known) + from je_auto_control.utils.mcp_server.tools import build_default_tool_registry + names = {t.name for t in build_default_tool_registry()} + assert {"ac_bidi_check", "ac_bidi_strip"} <= names + from je_auto_control.gui.script_builder.command_schema import _build_specs + specs = {s.command for s in _build_specs()} + assert {"AC_bidi_check", "AC_bidi_strip"} <= specs + + +def test_facade_exports(): + for attr in ("base_direction", "bidi_controls", "detect_bidi_issues", + "has_bidi_controls", "is_bidi_balanced", "is_trojan_source", + "strip_bidi_controls"): + assert hasattr(ac, attr) and attr in ac.__all__ + # the report bundles everything + assert set(detect_bidi_issues(_TROJAN)) == { + "controls", "has_controls", "balanced", "base_direction", + "trojan_source"}