From b2526d570bf74802b27c65a3a265a41ee1e3a9aa Mon Sep 17 00:00:00 2001 From: JeffreyChen Date: Mon, 22 Jun 2026 11:03:14 +0800 Subject: [PATCH] Add locale-aware string collation with multi-level sort keys --- README.md | 7 + README/README_zh-CN.md | 7 + README/README_zh-TW.md | 7 + .../doc/new_features/v108_features_doc.rst | 47 +++++++ docs/source/Eng/eng_index.rst | 1 + .../Zh/doc/new_features/v108_features_doc.rst | 39 ++++++ docs/source/Zh/zh_index.rst | 1 + je_auto_control/__init__.py | 8 ++ .../gui/script_builder/command_schema.py | 24 ++++ .../utils/executor/action_executor.py | 22 ++++ .../utils/locale_collation/__init__.py | 6 + .../locale_collation/locale_collation.py | 122 ++++++++++++++++++ .../utils/mcp_server/tools/_factories.py | 32 +++++ .../utils/mcp_server/tools/_handlers.py | 10 ++ .../headless/test_locale_collation_batch.py | 81 ++++++++++++ 15 files changed, 414 insertions(+) create mode 100644 docs/source/Eng/doc/new_features/v108_features_doc.rst create mode 100644 docs/source/Zh/doc/new_features/v108_features_doc.rst create mode 100644 je_auto_control/utils/locale_collation/__init__.py create mode 100644 je_auto_control/utils/locale_collation/locale_collation.py create mode 100644 test/unit_test/headless/test_locale_collation_batch.py diff --git a/README.md b/README.md index a5a6c5bc..505abf4b 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,7 @@ ## Table of Contents +- [What's new (2026-06-22) — Locale-Aware String Collation](#whats-new-2026-06-22--locale-aware-string-collation) - [What's new (2026-06-22) — Transactional Outbox](#whats-new-2026-06-22--transactional-outbox) - [What's new (2026-06-22) — Optimistic-Concurrency Versioned Store](#whats-new-2026-06-22--optimistic-concurrency-versioned-store) - [What's new (2026-06-22) — Per-Stream Sequence-Gap Detection](#whats-new-2026-06-22--per-stream-sequence-gap-detection) @@ -160,6 +161,12 @@ --- +## What's new (2026-06-22) — Locale-Aware String Collation + +Sort strings the way a reader 
of the language expects. Full reference: [`docs/source/Eng/doc/new_features/v108_features_doc.rst`](docs/source/Eng/doc/new_features/v108_features_doc.rst). + +- **`sort_strings` / `collation_compare` / `collation_key`** (`AC_collation_sort`, `AC_collation_compare`): Python's default `sorted` is codepoint order, so `"Z" < "a"` and `"ä"` lands far from `"a"`. This Unicode-Collation-lite key orders by base letter, then accent (secondary), then case (tertiary), with an optional `tailoring` alphabet so Swedish puts `å ä ö` after `z`. Pure-stdlib (`unicodedata`), deterministic across platforms — unlike `locale.strxfrm`. + ## What's new (2026-06-22) — Transactional Outbox Durably buffer events and drain them at-least-once. Full reference: [`docs/source/Eng/doc/new_features/v107_features_doc.rst`](docs/source/Eng/doc/new_features/v107_features_doc.rst). diff --git a/README/README_zh-CN.md b/README/README_zh-CN.md index 1b77da7e..100a175e 100644 --- a/README/README_zh-CN.md +++ b/README/README_zh-CN.md @@ -12,6 +12,7 @@ ## 目录 +- [本次更新 (2026-06-22) — 区域感知字符串排序](#本次更新-2026-06-22--区域感知字符串排序) - [本次更新 (2026-06-22) — 事务型 Outbox](#本次更新-2026-06-22--事务型-outbox) - [本次更新 (2026-06-22) — 乐观并发版本存储](#本次更新-2026-06-22--乐观并发版本存储) - [本次更新 (2026-06-22) — 逐流序号间隙检测](#本次更新-2026-06-22--逐流序号间隙检测) @@ -163,6 +164,12 @@ 平滑噪声值序列。完整参考:[`docs/source/Zh/doc/new_features/v102_features_doc.rst`](../docs/source/Zh/doc/new_features/v102_features_doc.rst)。 +## 本次更新 (2026-06-22) — 区域感知字符串排序 + +依某语言读者的期望排序字符串。完整参考:[`docs/source/Zh/doc/new_features/v108_features_doc.rst`](../docs/source/Zh/doc/new_features/v108_features_doc.rst)。 + +- **`sort_strings` / `collation_compare` / `collation_key`**(`AC_collation_sort`、`AC_collation_compare`):Python 默认的 `sorted` 是码位顺序,因此 `"Z" < "a"`,而 `"ä"` 离 `"a"` 很远。本 Unicode-Collation-lite 键先依基底字母、再依变音符号(次层)、再依大小写(三层)排序,并可用 `tailoring` 字母表让瑞典文将 `å ä ö` 排在 `z` 之后。纯标准库(`unicodedata`)、跨平台确定——不像 `locale.strxfrm`。 + ## 本次更新 (2026-06-22) — 事务型 Outbox 
持久化缓冲事件并以至少一次传递排空。完整参考:[`docs/source/Zh/doc/new_features/v107_features_doc.rst`](../docs/source/Zh/doc/new_features/v107_features_doc.rst)。 diff --git a/README/README_zh-TW.md b/README/README_zh-TW.md index 8b3cdc60..1106d8cc 100644 --- a/README/README_zh-TW.md +++ b/README/README_zh-TW.md @@ -12,6 +12,7 @@ ## 目錄 +- [本次更新 (2026-06-22) — 地區感知字串排序](#本次更新-2026-06-22--地區感知字串排序) - [本次更新 (2026-06-22) — 交易型 Outbox](#本次更新-2026-06-22--交易型-outbox) - [本次更新 (2026-06-22) — 樂觀並行版本儲存](#本次更新-2026-06-22--樂觀並行版本儲存) - [本次更新 (2026-06-22) — 逐串流序號間隙偵測](#本次更新-2026-06-22--逐串流序號間隙偵測) @@ -163,6 +164,12 @@ 平滑雜訊值序列。完整參考:[`docs/source/Zh/doc/new_features/v102_features_doc.rst`](../docs/source/Zh/doc/new_features/v102_features_doc.rst)。 +## 本次更新 (2026-06-22) — 地區感知字串排序 + +依某語言讀者的期望排序字串。完整參考:[`docs/source/Zh/doc/new_features/v108_features_doc.rst`](../docs/source/Zh/doc/new_features/v108_features_doc.rst)。 + +- **`sort_strings` / `collation_compare` / `collation_key`**(`AC_collation_sort`、`AC_collation_compare`):Python 預設的 `sorted` 是碼位順序,因此 `"Z" < "a"`,而 `"ä"` 離 `"a"` 很遠。本 Unicode-Collation-lite 鍵先依基底字母、再依變音符號(次層)、再依大小寫(三層)排序,並可用 `tailoring` 字母表讓瑞典文將 `å ä ö` 排在 `z` 之後。純標準函式庫(`unicodedata`)、跨平台具決定性——不像 `locale.strxfrm`。 + ## 本次更新 (2026-06-22) — 交易型 Outbox 持久化緩衝事件並以至少一次傳遞排空。完整參考:[`docs/source/Zh/doc/new_features/v107_features_doc.rst`](../docs/source/Zh/doc/new_features/v107_features_doc.rst)。 diff --git a/docs/source/Eng/doc/new_features/v108_features_doc.rst b/docs/source/Eng/doc/new_features/v108_features_doc.rst new file mode 100644 index 00000000..1f5f20b4 --- /dev/null +++ b/docs/source/Eng/doc/new_features/v108_features_doc.rst @@ -0,0 +1,47 @@ +Locale-Aware String Collation +============================= + +``text_normalize`` canonicalises text and ``locale_parse`` formats numbers, but +nothing sorts strings the way a reader of a given language expects. Python's +default ``sorted`` is codepoint order, so ``"Z" < "a"`` and ``"ä"`` lands far +from ``"a"``. 
A real collation orders by *base letter* first, then *accent*, +then *case*, and lets a locale tailor the alphabet (Swedish sorts ``å ä ö`` after +``z``). + +This builds a Unicode-Collation-lite sort key with three levels — primary (base +letter), secondary (diacritics), tertiary (case) — plus an optional alphabet +``tailoring``. Pure standard library (``unicodedata``); imports no ``PySide6``. +Every function is pure, so it is fully deterministic across platforms (unlike +``locale.strxfrm``, which depends on the host's installed locales). + +Headless API +------------ + +.. code-block:: python + + from je_auto_control import sort_strings, collation_compare, collation_key + + sort_strings(["résumé", "rest", "resume"]) + # ['rest', 'resume', 'résumé'] (accent is a secondary difference) + + swedish = "abcdefghijklmnopqrstuvwxyzåäö" + sort_strings(["zebra", "äpple", "apple"], tailoring=swedish) + # ['apple', 'zebra', 'äpple'] (å ä ö sort after z) + + collation_compare("apple", "Apple") # -1 (lowercase before uppercase) + sort_strings(rows, key=lambda r: r["name"]) # sort dicts by a field + +``strength`` (``primary`` / ``secondary`` / ``tertiary``) caps the levels +compared, so ``strength="primary"`` is accent- and case-insensitive. +``tailoring`` is an ordered alphabet whose characters sort in the given order and +before any unlisted character; a precomposed letter such as ``"å"`` keeps its +alphabet rank instead of decomposing to ``a`` + ring above. ``collation_key`` +returns the raw comparable tuple for use as a ``sorted`` key. + +Executor commands +----------------- + +``AC_collation_sort`` takes a JSON list and returns ``{sorted}``; +``AC_collation_compare`` returns ``{order: -1|0|1}``. Both accept ``strength`` +and ``tailoring``, are exposed as MCP tools (``ac_collation_sort`` / +``ac_collation_compare``) and as Script Builder commands under **Data**. 
diff --git a/docs/source/Eng/eng_index.rst b/docs/source/Eng/eng_index.rst index 6466dc30..03f73773 100644 --- a/docs/source/Eng/eng_index.rst +++ b/docs/source/Eng/eng_index.rst @@ -130,6 +130,7 @@ Comprehensive guides for all AutoControl features. doc/new_features/v105_features_doc doc/new_features/v106_features_doc doc/new_features/v107_features_doc + doc/new_features/v108_features_doc doc/ocr_backends/ocr_backends_doc doc/observability/observability_doc doc/operations_layer/operations_layer_doc diff --git a/docs/source/Zh/doc/new_features/v108_features_doc.rst b/docs/source/Zh/doc/new_features/v108_features_doc.rst new file mode 100644 index 00000000..d8973792 --- /dev/null +++ b/docs/source/Zh/doc/new_features/v108_features_doc.rst @@ -0,0 +1,39 @@ +地區感知字串排序(Collation) +============================ + +``text_normalize`` 正規化文字、``locale_parse`` 格式化數字,但沒有任何功能能依某語言讀者的期望排序字串。 +Python 預設的 ``sorted`` 是碼位順序,因此 ``"Z" < "a"``,而 ``"ä"`` 會離 ``"a"`` 很遠。真正的排序會先依 +*基底字母*、再依*變音符號*、再依*大小寫*,並讓地區得以調整字母表(瑞典文將 ``å ä ö`` 排在 ``z`` 之後)。 + +本功能建立一個 Unicode-Collation-lite 排序鍵,含三個層級——主層(基底字母)、次層(變音符號)、三層(大小寫) +——以及選用的字母表 ``tailoring``。純標準函式庫(``unicodedata``);不匯入 ``PySide6``。每個函式皆為純函式, +因此跨平台完全具決定性(不像 ``locale.strxfrm`` 取決於主機已安裝的地區設定)。 + +無頭 API +-------- + +.. 
code-block:: python + + from je_auto_control import sort_strings, collation_compare, collation_key + + sort_strings(["résumé", "rest", "resume"]) + # ['rest', 'resume', 'résumé'] (變音符號為次層差異) + + swedish = "abcdefghijklmnopqrstuvwxyzåäö" + sort_strings(["zebra", "äpple", "apple"], tailoring=swedish) + # ['apple', 'zebra', 'äpple'] (å ä ö 排在 z 之後) + + collation_compare("apple", "Apple") # -1 (小寫在大寫之前) + sort_strings(rows, key=lambda r: r["name"]) # 依欄位排序字典 + +``strength``(``primary`` / ``secondary`` / ``tertiary``)限制比較的層級,因此 ``strength="primary"`` 為 +不分變音符號與大小寫。``tailoring`` 是有序字母表,所列字元依給定順序排序,且排在任何未列字元之前;像 ``"å"`` +這類預組字元會保有其字母表排名,而非分解為 ``a`` + 上圓圈符號。``collation_key`` 回傳可比較的原始 tuple,供作 +``sorted`` 的 key 使用。 + +執行器命令 +---------- + +``AC_collation_sort`` 接受 JSON 列表並回傳 ``{sorted}``;``AC_collation_compare`` 回傳 ``{order: -1|0|1}``。 +兩者皆接受 ``strength`` 與 ``tailoring``,並以 MCP 工具(``ac_collation_sort`` / ``ac_collation_compare``) +以及 Script Builder 中 **Data** 分類下的命令提供。 diff --git a/docs/source/Zh/zh_index.rst b/docs/source/Zh/zh_index.rst index eed2fa78..3600e172 100644 --- a/docs/source/Zh/zh_index.rst +++ b/docs/source/Zh/zh_index.rst @@ -130,6 +130,7 @@ AutoControl 所有功能的完整使用指南。 doc/new_features/v105_features_doc doc/new_features/v106_features_doc doc/new_features/v107_features_doc + doc/new_features/v108_features_doc doc/ocr_backends/ocr_backends_doc doc/observability/observability_doc doc/operations_layer/operations_layer_doc diff --git a/je_auto_control/__init__.py b/je_auto_control/__init__.py index b48e5d28..7d206198 100644 --- a/je_auto_control/__init__.py +++ b/je_auto_control/__init__.py @@ -213,6 +213,11 @@ ) # Transactional outbox (durable at-least-once event delivery) from je_auto_control.utils.outbox import Outbox +# Locale-aware string collation (deterministic multi-level sort keys) +from je_auto_control.utils.locale_collation import ( + collation_key, sort_strings, +) +from je_auto_control.utils.locale_collation import compare as collation_compare # CI workflow 
annotations (GitHub Actions) from je_auto_control.utils.ci_annotations import ( emit_annotations, format_annotation, @@ -943,6 +948,9 @@ def start_autocontrol_gui(*args, **kwargs): "DedupWindow", "SequenceTracker", "VersionConflict", "VersionedStore", "check_if_match", "if_match_header", "Outbox", + "collation_key", + "collation_compare", + "sort_strings", "emit_annotations", "format_annotation", "ClipboardHistory", "default_clipboard_history", "analyze_heal_log", "heal_stats", "scan_secrets", diff --git a/je_auto_control/gui/script_builder/command_schema.py b/je_auto_control/gui/script_builder/command_schema.py index cb672e2b..87c5c2ca 100644 --- a/je_auto_control/gui/script_builder/command_schema.py +++ b/je_auto_control/gui/script_builder/command_schema.py @@ -2066,6 +2066,30 @@ def _add_resilience_specs(specs: List[CommandSpec]) -> None: ), description="List events still awaiting successful delivery.", )) + specs.append(CommandSpec( + "AC_collation_sort", "Data", "Text: Collation Sort", + fields=( + FieldSpec("items", FieldType.STRING, + placeholder='["zebra", "apple", "Äpple"]'), + FieldSpec("strength", FieldType.STRING, optional=True, + placeholder="tertiary"), + FieldSpec("tailoring", FieldType.STRING, optional=True, + placeholder="abc...xyzåäö"), + FieldSpec("reverse", FieldType.BOOL, optional=True), + ), + description="Locale-aware sort (base letter, then accent, then case).", + )) + specs.append(CommandSpec( + "AC_collation_compare", "Data", "Text: Collation Compare", + fields=( + FieldSpec("first", FieldType.STRING, placeholder="apple"), + FieldSpec("second", FieldType.STRING, placeholder="Äpple"), + FieldSpec("strength", FieldType.STRING, optional=True, + placeholder="tertiary"), + FieldSpec("tailoring", FieldType.STRING, optional=True), + ), + description="Locale-aware compare; returns order -1/0/1.", + )) specs.append(CommandSpec( "AC_diff_rows", "Data", "Dataset Diff: Rows by Key", fields=( diff --git 
def _collation_sort(items: Any, strength: str = "tertiary",
                    tailoring: Any = None, reverse: Any = False) -> Dict[str, Any]:
    """Adapter: locale-aware sort of a list of strings.

    :param items: the strings to sort, either an iterable or a JSON-encoded
        list (a ``str`` is always decoded with ``json.loads``).
    :param strength: ``primary`` / ``secondary`` / ``tertiary``; an empty or
        ``None`` value falls back to ``"tertiary"``, mirroring how an empty
        ``tailoring`` is treated as "not supplied".
    :param tailoring: optional ordered alphabet; falsy means no tailoring.
    :param reverse: truthy to sort descending.
    :return: ``{"sorted": [...]}``.
    :raises ValueError: for an unknown ``strength`` or an ``items`` string that
        is not valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
    """
    import json
    from je_auto_control.utils.locale_collation import sort_strings
    if isinstance(items, str):
        items = json.loads(items)
    ordered = sort_strings(list(items), strength=strength or "tertiary",
                           tailoring=tailoring or None, reverse=bool(reverse))
    return {"sorted": ordered}


def _collation_compare(first: str, second: str, strength: str = "tertiary",
                       tailoring: Any = None) -> Dict[str, Any]:
    """Adapter: locale-aware comparison of two strings.

    :param first: left-hand string.
    :param second: right-hand string.
    :param strength: ``primary`` / ``secondary`` / ``tertiary``; an empty or
        ``None`` value falls back to ``"tertiary"``.
    :param tailoring: optional ordered alphabet; falsy means no tailoring.
    :return: ``{"order": -1 | 0 | 1}``.
    :raises ValueError: for an unknown ``strength``.
    """
    from je_auto_control.utils.locale_collation import compare
    return {"order": compare(first, second, strength=strength or "tertiary",
                             tailoring=tailoring or None)}
"""Locale-aware string collation (deterministic multi-level sort keys).

``text_normalize`` canonicalises text and ``locale_parse`` formats numbers, but
nothing sorts strings the way a human reading a given language expects: Python's
default ``sorted`` is codepoint order, so ``"Z" < "a"`` and ``"ä"`` lands far
from ``"a"``. A real collation orders by base letter first, then accent, then
case, and lets a locale tailor the alphabet (Swedish sorts ``å ä ö`` after
``z``).

This builds a Unicode-Collation-lite sort key with three levels — primary (base
letter), secondary (diacritics), tertiary (case) — plus an optional alphabet
``tailoring``. Pure standard library (``unicodedata``); imports no ``PySide6``.
Every function is pure (text in, key/order out), so it is fully deterministic in
CI and across platforms (unlike ``locale.strxfrm``).
"""
import unicodedata
from typing import Callable, Dict, List, Optional, Sequence, Tuple

_STRENGTHS = {"primary": 1, "secondary": 2, "tertiary": 3}

CollationKey = Tuple[Tuple[int, ...], ...]


def _resolve_level(strength: str) -> int:
    """Translate a strength name into the number of key levels to keep."""
    level = _STRENGTHS.get(strength)
    if level is None:
        raise ValueError(f"unknown strength: {strength!r}")
    return level


def _build_tailoring(tailoring: Optional[str]) -> Optional[Dict[str, int]]:
    """Map each character of an ordered alphabet to its primary rank.

    The alphabet is NFC-normalised first so a letter typed as base + combining
    mark occupies a single slot. Keys are case-folded; the first occurrence of
    a folded character wins.
    """
    if not tailoring:
        return None
    ranks: Dict[str, int] = {}
    for index, char in enumerate(unicodedata.normalize("NFC", tailoring)):
        ranks.setdefault(char.casefold(), index)
    return ranks


def _prepare_tailoring(
        tailoring: Optional[str]) -> Tuple[Optional[Dict[str, int]], int]:
    """Return ``(ranks, offset)``; ``offset`` lifts unlisted letters past the alphabet."""
    if not tailoring:
        return None, 0
    alphabet = unicodedata.normalize("NFC", tailoring)
    return _build_tailoring(alphabet), len(alphabet)


def _untailored_weight(base: str, ranks: Optional[Dict[str, int]],
                       offset: int) -> int:
    """Primary weight of a folded base character outside any tailoring."""
    codepoint = ord(base[0]) if base else 0
    return offset + codepoint if ranks is not None else codepoint


def _base_weight(unit: str, ranks: Optional[Dict[str, int]],
                 offset: int) -> int:
    """Primary weight of one folded base letter, honouring the tailoring."""
    if ranks is not None and unit in ranks:
        return ranks[unit]
    return _untailored_weight(unit, ranks, offset)


def _char_weights(char: str, ranks: Optional[Dict[str, int]],
                  offset: int) -> Tuple[List[int], List[int], List[int]]:
    """Primary/secondary/tertiary weight contributions of one character.

    A tailored character is treated atomically (no decomposition) so a
    precomposed letter like ``"å"`` keeps its alphabet rank; everything else is
    NFKD-decomposed so diacritics fall to the secondary level.

    Every base letter contributes a ``0`` marker to the secondary level and
    every combining mark contributes its codepoint right after the marker of
    the letter it follows. That keeps accents positionally aligned, so
    ``"ée"`` and ``"eé"`` do not collapse to the same key.
    """
    folded = char.casefold()
    if ranks is not None and folded in ranks:
        return [ranks[folded]], [0], [1 if char != folded else 0]
    primary: List[int] = []
    secondary: List[int] = []
    tertiary: List[int] = []
    for sub in unicodedata.normalize("NFKD", char):
        if unicodedata.combining(sub):
            secondary.append(ord(sub))
            continue
        subfold = sub.casefold()
        case_flag = 1 if sub != subfold else 0
        # A case fold may expand to several letters ("ß" -> "ss"); weigh each
        # one instead of silently keeping only the first.
        for unit in subfold:
            primary.append(_base_weight(unit, ranks, offset))
            secondary.append(0)
            tertiary.append(case_flag)
    return primary, secondary, tertiary


def _key_levels(text: Optional[str], level: int,
                ranks: Optional[Dict[str, int]], offset: int) -> CollationKey:
    """Build the key for ``text`` from an already-prepared tailoring."""
    primary: List[int] = []
    secondary: List[int] = []
    tertiary: List[int] = []
    # NFC first: canonically equivalent spellings (precomposed vs. base +
    # combining mark) must reach the tailoring lookup as the same character.
    for char in unicodedata.normalize("NFC", text or ""):
        char_primary, char_secondary, char_tertiary = _char_weights(
            char, ranks, offset)
        primary.extend(char_primary)
        secondary.extend(char_secondary)
        tertiary.extend(char_tertiary)
    return (tuple(primary), tuple(secondary), tuple(tertiary))[:level]


def collation_key(text: str, *, strength: str = "tertiary",
                  tailoring: Optional[str] = None) -> CollationKey:
    """Return a comparable multi-level sort key for ``text``.

    Levels: primary (base letter), secondary (diacritics), tertiary (case,
    lowercase before uppercase). ``strength`` (``primary`` / ``secondary`` /
    ``tertiary``) caps the levels compared. ``tailoring`` is an ordered alphabet
    whose characters sort in the given order and before any unlisted character
    (so a Swedish ``"...xyzåäö"`` puts ``å`` after ``z``). An accented letter
    that is not itself listed sorts with its listed base letter (``é`` with
    ``e``).

    :raises ValueError: if ``strength`` is not one of the three known names.
    """
    level = _resolve_level(strength)
    ranks, offset = _prepare_tailoring(tailoring)
    return _key_levels(text, level, ranks, offset)


def compare(first: str, second: str, *, strength: str = "tertiary",
            tailoring: Optional[str] = None) -> int:
    """Return ``-1`` / ``0`` / ``1`` ordering ``first`` against ``second``.

    :raises ValueError: if ``strength`` is not one of the three known names.
    """
    level = _resolve_level(strength)
    ranks, offset = _prepare_tailoring(tailoring)
    key_first = _key_levels(first, level, ranks, offset)
    key_second = _key_levels(second, level, ranks, offset)
    return (key_first > key_second) - (key_first < key_second)


def sort_strings(items: Sequence[str], *, strength: str = "tertiary",
                 tailoring: Optional[str] = None, reverse: bool = False,
                 key: Optional[Callable[[object], str]] = None) -> List[object]:
    """Return ``items`` sorted by collation key.

    ``key`` extracts the string from each item (default: the item itself), so
    dicts or tuples can be sorted by one of their fields. The extracted value
    is passed through ``str()`` before keying.

    :raises ValueError: if ``strength`` is not one of the three known names
        (checked up front, so it is reported even for an empty ``items``).
    """
    level = _resolve_level(strength)
    # Prepare the tailoring once rather than once per item.
    ranks, offset = _prepare_tailoring(tailoring)
    extract = key or (lambda item: item)

    def sort_key(item: object) -> CollationKey:
        return _key_levels(str(extract(item)), level, ranks, offset)

    return sorted(items, key=sort_key, reverse=reverse)
def collation_sort(items, strength="tertiary", tailoring=None, reverse=False):
    """MCP handler: delegate ``ac_collation_sort`` to the executor adapter."""
    from je_auto_control.utils.executor.action_executor import (
        _collation_sort as run_sort,
    )
    result = run_sort(items, strength, tailoring, reverse)
    return result


def collation_compare(first, second, strength="tertiary", tailoring=None):
    """MCP handler: delegate ``ac_collation_compare`` to the executor adapter."""
    from je_auto_control.utils.executor.action_executor import (
        _collation_compare as run_compare,
    )
    result = run_compare(first, second, strength, tailoring)
    return result
"""Headless tests for locale-aware string collation. No Qt."""
import json

import pytest

import je_auto_control as ac
from je_auto_control.utils.locale_collation import (
    collation_key, compare, sort_strings,
)

_SWEDISH = "abcdefghijklmnopqrstuvwxyzåäö"


def _first_dict(record):
    """Return the first dict-valued entry of an executor result record."""
    return next(value for value in record.values() if isinstance(value, dict))


def test_default_case_insensitive_primary():
    # Same base letters stay adjacent; lowercase wins the tertiary tie-break.
    words = ["banana", "Apple", "apple", "cherry"]
    expected = ["apple", "Apple", "banana", "cherry"]
    assert sort_strings(words) == expected


def test_accent_is_secondary():
    # Base-letter order decides first; the accented spelling follows the plain one.
    ordered = sort_strings(["résumé", "rest", "resume"])
    assert ordered == ["rest", "resume", "résumé"]
    assert compare("resume", "résumé") == -1
    assert compare("resume", "résumé", strength="primary") == 0


def test_case_is_tertiary():
    # Case only matters once the tertiary level is included.
    assert compare("apple", "Apple") == -1
    assert compare("apple", "Apple", strength="secondary") == 0


def test_tailoring_orders_alphabet():
    # Swedish alphabet: å ä ö come after z rather than beside a / o.
    ordered = sort_strings(["zebra", "äpple", "apple", "örn"],
                           tailoring=_SWEDISH)
    assert ordered == ["apple", "zebra", "äpple", "örn"]
    assert compare("zebra", "åa", tailoring=_SWEDISH) == -1


def test_sort_by_key_and_reverse():
    rows = [{"n": "béta"}, {"n": "alpha"}, {"n": "Gamma"}]
    by_name = sort_strings(rows, key=lambda r: r["n"])
    assert [row["n"] for row in by_name] == ["alpha", "béta", "Gamma"]
    assert sort_strings(["a", "b", "c"], reverse=True) == ["c", "b", "a"]


def test_collation_key_levels_and_validation():
    assert collation_key("Ab", strength="primary") == ((97, 98),)
    assert len(collation_key("Ab", strength="tertiary")) == 3
    with pytest.raises(ValueError):
        collation_key("x", strength="quaternary")


# --- wiring ---------------------------------------------------------------

def test_executor_round_trip():
    sort_action = [
        "AC_collation_sort",
        {"items": json.dumps(["zebra", "äpple", "apple"]),
         "tailoring": _SWEDISH},
    ]
    sort_result = _first_dict(ac.execute_action([sort_action]))
    assert sort_result["sorted"] == ["apple", "zebra", "äpple"]

    compare_action = [
        "AC_collation_compare", {"first": "resume", "second": "résumé"}]
    compare_result = _first_dict(ac.execute_action([compare_action]))
    assert compare_result["order"] == -1


def test_wiring():
    executor_commands = {"AC_collation_sort", "AC_collation_compare"}
    assert executor_commands <= set(ac.executor.known_commands())

    from je_auto_control.utils.mcp_server.tools import build_default_tool_registry
    tool_names = {tool.name for tool in build_default_tool_registry()}
    assert {"ac_collation_sort", "ac_collation_compare"} <= tool_names

    from je_auto_control.gui.script_builder.command_schema import _build_specs
    spec_commands = {spec.command for spec in _build_specs()}
    assert executor_commands <= spec_commands


def test_facade_exports():
    for attr in ("collation_key", "collation_compare", "sort_strings"):
        assert hasattr(ac, attr) and attr in ac.__all__