From e31580ea0fcc271958d0b594e0686b22685e79b5 Mon Sep 17 00:00:00 2001 From: JeffreyChen Date: Mon, 22 Jun 2026 07:18:32 +0800 Subject: [PATCH 1/2] Add near-duplicate text detection (SimHash / MinHash) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fuzzy_dedupe is O(n²) pairwise with no stable fingerprint and image_dedup only hashes pixels. Add the text analog: simhash + hamming_distance + near_duplicates clustering, and minhash_signature + minhash_similarity for estimated Jaccard. Uses a fixed blake2b hash for deterministic fingerprints. hamming_distance is shared with image_dedup (not re-exported from the facade to avoid a name clash). Wired through facade, executor (AC_simhash / AC_near_duplicates), MCP, and the Script Builder with a headless test batch and EN/Zh docs. --- README.md | 7 ++ README/README_zh-CN.md | 7 ++ README/README_zh-TW.md | 7 ++ .../doc/new_features/v100_features_doc.rst | 46 +++++++++ docs/source/Eng/eng_index.rst | 1 + .../Zh/doc/new_features/v100_features_doc.rst | 39 ++++++++ docs/source/Zh/zh_index.rst | 1 + je_auto_control/__init__.py | 6 ++ .../gui/script_builder/command_schema.py | 17 ++++ .../utils/executor/action_executor.py | 17 ++++ .../utils/mcp_server/tools/_factories.py | 28 +++++- .../utils/mcp_server/tools/_handlers.py | 10 ++ je_auto_control/utils/near_dup/__init__.py | 10 ++ je_auto_control/utils/near_dup/near_dup.py | 95 +++++++++++++++++++ .../unit_test/headless/test_near_dup_batch.py | 79 +++++++++++++++ 15 files changed, 369 insertions(+), 1 deletion(-) create mode 100644 docs/source/Eng/doc/new_features/v100_features_doc.rst create mode 100644 docs/source/Zh/doc/new_features/v100_features_doc.rst create mode 100644 je_auto_control/utils/near_dup/__init__.py create mode 100644 je_auto_control/utils/near_dup/near_dup.py create mode 100644 test/unit_test/headless/test_near_dup_batch.py diff --git a/README.md b/README.md index 23afba30..73ef5f29 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,7 @@ ## Table of Contents +- [What's new (2026-06-22) — Near-Duplicate Text Detection (SimHash / MinHash)](#whats-new-2026-06-22--near-duplicate-text-detection-simhash--minhash) - [What's new (2026-06-22) — String-Distance Similarity Metrics](#whats-new-2026-06-22--string-distance-similarity-metrics) - [What's new (2026-06-22) — Time-Series Transforms](#whats-new-2026-06-22--time-series-transforms) - [What's new (2026-06-22) — Unicode Text Normalisation & Slugify](#whats-new-2026-06-22--unicode-text-normalisation--slugify) @@ -152,6 +153,12 @@ --- +## What's new (2026-06-22) — Near-Duplicate Text Detection (SimHash / MinHash) + +Fingerprint text to find near-dups at scale. Full reference: [`docs/source/Eng/doc/new_features/v100_features_doc.rst`](docs/source/Eng/doc/new_features/v100_features_doc.rst). + +- **`simhash` / `near_duplicates` / `minhash_signature` / `minhash_similarity`** (`AC_simhash`, `AC_near_duplicates`): `fuzzy_dedupe` is O(n²) pairwise with no stable fingerprint and `image_dedup` only hashes pixels. This adds the text analog — SimHash (Hamming-distance near-dup clustering) and MinHash (estimated Jaccard) using a fixed `blake2b` hash for deterministic fingerprints. Pairs with `normalize_text`. Pure-stdlib. + ## What's new (2026-06-22) — String-Distance Similarity Metrics Match typos and reordered tokens. Full reference: [`docs/source/Eng/doc/new_features/v99_features_doc.rst`](docs/source/Eng/doc/new_features/v99_features_doc.rst). diff --git a/README/README_zh-CN.md b/README/README_zh-CN.md index ccdb98a1..81f4abff 100644 --- a/README/README_zh-CN.md +++ b/README/README_zh-CN.md @@ -12,6 +12,7 @@ ## 目录 +- [本次更新 (2026-06-22) — 近似重复文本检测(SimHash / MinHash)](#本次更新-2026-06-22--近似重复文本检测simhash--minhash) - [本次更新 (2026-06-22) — 字符串距离相似度量](#本次更新-2026-06-22--字符串距离相似度量) - [本次更新 (2026-06-22) — 时间序列变换](#本次更新-2026-06-22--时间序列变换) - [本次更新 (2026-06-22) — Unicode 文本规范化与 Slug](#本次更新-2026-06-22--unicode-文本规范化与-slug) @@ -151,6 +152,12 @@ --- +## 本次更新 (2026-06-22) — 近似重复文本检测(SimHash / MinHash) + +为文本生成指纹以大规模找近似重复。完整参考:[`docs/source/Zh/doc/new_features/v100_features_doc.rst`](../docs/source/Zh/doc/new_features/v100_features_doc.rst)。 + +- **`simhash` / `near_duplicates` / `minhash_signature` / `minhash_similarity`**(`AC_simhash`、`AC_near_duplicates`):`fuzzy_dedupe` 是 O(n²) 成对且无稳定指纹,`image_dedup` 只哈希像素。本功能加入文本对应 —— SimHash(Hamming 距离近似重复聚类)与 MinHash(估计 Jaccard),使用固定 `blake2b` 哈希取得确定的指纹。可搭配 `normalize_text`。纯标准库。 + ## 本次更新 (2026-06-22) — 字符串距离相似度量 匹配打字错误与重排 token。完整参考:[`docs/source/Zh/doc/new_features/v99_features_doc.rst`](../docs/source/Zh/doc/new_features/v99_features_doc.rst)。 diff --git a/README/README_zh-TW.md b/README/README_zh-TW.md index f4b5086b..97715983 100644 --- a/README/README_zh-TW.md +++ b/README/README_zh-TW.md @@ -12,6 +12,7 @@ ## 目錄 +- [本次更新 (2026-06-22) — 近似重複文字偵測(SimHash / MinHash)](#本次更新-2026-06-22--近似重複文字偵測simhash--minhash) - [本次更新 (2026-06-22) — 字串距離相似度量](#本次更新-2026-06-22--字串距離相似度量) - [本次更新 (2026-06-22) — 時間序列轉換](#本次更新-2026-06-22--時間序列轉換) - [本次更新 (2026-06-22) — Unicode 文字正規化與 Slug](#本次更新-2026-06-22--unicode-文字正規化與-slug) @@ -151,6 +152,12 @@ --- +## 本次更新 (2026-06-22) — 近似重複文字偵測(SimHash / MinHash) + +為文字產生指紋以大規模找近似重複。完整參考:[`docs/source/Zh/doc/new_features/v100_features_doc.rst`](../docs/source/Zh/doc/new_features/v100_features_doc.rst)。 + +- **`simhash` / `near_duplicates` / `minhash_signature` / `minhash_similarity`**(`AC_simhash`、`AC_near_duplicates`):`fuzzy_dedupe` 是 O(n²) 成對且無穩定指紋,`image_dedup` 只雜湊像素。本功能加入文字對應 —— SimHash(Hamming 距離近似重複分群)與 MinHash(估計 Jaccard),使用固定 `blake2b` 雜湊取得具決定性的指紋。可搭配 `normalize_text`。純標準函式庫。 + ## 本次更新 (2026-06-22) — 字串距離相似度量 比對打字錯誤與重排 token。完整參考:[`docs/source/Zh/doc/new_features/v99_features_doc.rst`](../docs/source/Zh/doc/new_features/v99_features_doc.rst)。 diff --git a/docs/source/Eng/doc/new_features/v100_features_doc.rst b/docs/source/Eng/doc/new_features/v100_features_doc.rst new file mode 100644 index 00000000..dbe084e8 --- /dev/null +++ b/docs/source/Eng/doc/new_features/v100_features_doc.rst @@ -0,0 +1,46 @@ +Near-Duplicate Text Detection (SimHash / MinHash) +================================================= + +``fuzzy.fuzzy_dedupe`` is O(n²) pairwise ``SequenceMatcher`` with no stable +fingerprint, and ``image_dedup`` only hashes pixels. This adds text +fingerprints — SimHash (Hamming-distance near-dup) and MinHash (estimated +Jaccard) — that scale and give a reusable signature, the text analog of the +perceptual image hash. + +Pure standard library (``hashlib`` / ``re``); imports no ``PySide6``. A fixed +hash (``blake2b``, not the salted built-in ``hash()``) keeps fingerprints +deterministic across runs and CI. + +Headless API +------------ + +.. code-block:: python + + from je_auto_control import ( + simhash, near_duplicates, minhash_signature, minhash_similarity, + ) + + h1 = simhash("the quick brown fox jumps over the lazy dog") + h2 = simhash("the quick brown fox jumps over the lazy dogs") + # small Hamming distance ⇒ near-duplicate + + clusters = near_duplicates(docs, max_distance=12) # groups of indices + + sig_a = minhash_signature(text_a) + minhash_similarity(sig_a, minhash_signature(text_b)) # ~ Jaccard + +``simhash`` returns a ``bits``-wide fingerprint from word shingles; +``hamming_distance`` (shared with ``image_dedup``) measures bit difference. +``near_duplicates`` clusters texts whose SimHashes are within ``max_distance`` +bits, returning a partition of indices (singletons included). +``minhash_signature`` / ``minhash_similarity`` give a MinHash signature and a +Jaccard estimate for set-overlap style dedup. Run ``normalize_text`` first for +accent/form-insensitive fingerprints. + +Executor commands +----------------- + +``AC_simhash`` returns ``{simhash}`` for a ``text``; ``AC_near_duplicates`` +returns ``{clusters}`` for ``texts`` within ``max_distance``. Both are exposed +as MCP tools (``ac_simhash`` / ``ac_near_duplicates``) and as Script Builder +commands under **Data**. diff --git a/docs/source/Eng/eng_index.rst b/docs/source/Eng/eng_index.rst index 7680301d..c8f8465b 100644 --- a/docs/source/Eng/eng_index.rst +++ b/docs/source/Eng/eng_index.rst @@ -122,6 +122,7 @@ Comprehensive guides for all AutoControl features. doc/new_features/v97_features_doc doc/new_features/v98_features_doc doc/new_features/v99_features_doc + doc/new_features/v100_features_doc doc/ocr_backends/ocr_backends_doc doc/observability/observability_doc doc/operations_layer/operations_layer_doc diff --git a/docs/source/Zh/doc/new_features/v100_features_doc.rst b/docs/source/Zh/doc/new_features/v100_features_doc.rst new file mode 100644 index 00000000..f56cc247 --- /dev/null +++ b/docs/source/Zh/doc/new_features/v100_features_doc.rst @@ -0,0 +1,39 @@ +近似重複文字偵測(SimHash / MinHash) +==================================== + +``fuzzy.fuzzy_dedupe`` 是 O(n²) 的成對 ``SequenceMatcher``,沒有穩定指紋,而 ``image_dedup`` 只雜湊 +像素。本功能加入文字指紋 —— SimHash(以 Hamming 距離找近似重複)與 MinHash(估計 Jaccard)—— 可擴展且 +提供可重用的簽章,是感知式影像雜湊的文字對應。 + +純標準函式庫(``hashlib`` / ``re``);不匯入 ``PySide6``。使用固定雜湊(``blake2b``,而非加鹽的內建 +``hash()``)讓指紋在不同執行與 CI 間具決定性。 + +無頭 API +-------- + +.. code-block:: python + + from je_auto_control import ( + simhash, near_duplicates, minhash_signature, minhash_similarity, + ) + + h1 = simhash("the quick brown fox jumps over the lazy dog") + h2 = simhash("the quick brown fox jumps over the lazy dogs") + # Hamming 距離小 ⇒ 近似重複 + + clusters = near_duplicates(docs, max_distance=12) # 索引分群 + + sig_a = minhash_signature(text_a) + minhash_similarity(sig_a, minhash_signature(text_b)) # ~ Jaccard + +``simhash`` 從詞 shingle 產生 ``bits`` 寬的指紋;``hamming_distance``(與 ``image_dedup`` 共用)量測位元 +差異。``near_duplicates`` 把 SimHash 在 ``max_distance`` 位元內的文字分群,回傳索引的分割(含單例)。 +``minhash_signature`` / ``minhash_similarity`` 提供 MinHash 簽章與 Jaccard 估計以做集合重疊式去重。可先執行 +``normalize_text`` 取得對重音/形式不敏感的指紋。 + +執行器命令 +---------- + +``AC_simhash`` 對 ``text`` 回傳 ``{simhash}``;``AC_near_duplicates`` 對 ``texts`` 在 ``max_distance`` 內 +回傳 ``{clusters}``。兩者皆以 MCP 工具(``ac_simhash`` / ``ac_near_duplicates``)以及 Script Builder 中 +**Data** 分類下的命令提供。 diff --git a/docs/source/Zh/zh_index.rst b/docs/source/Zh/zh_index.rst index 033341ef..eea510ec 100644 --- a/docs/source/Zh/zh_index.rst +++ b/docs/source/Zh/zh_index.rst @@ -122,6 +122,7 @@ AutoControl 所有功能的完整使用指南。 doc/new_features/v97_features_doc doc/new_features/v98_features_doc doc/new_features/v99_features_doc + doc/new_features/v100_features_doc doc/ocr_backends/ocr_backends_doc doc/observability/observability_doc doc/operations_layer/operations_layer_doc diff --git a/je_auto_control/__init__.py b/je_auto_control/__init__.py index f8ff0b87..75b0ba17 100644 --- a/je_auto_control/__init__.py +++ b/je_auto_control/__init__.py @@ -262,6 +262,11 @@ damerau_levenshtein, dice, jaccard, jaro, jaro_winkler, levenshtein, similarity, ) +# Near-duplicate text detection (SimHash / MinHash fingerprints) +# (hamming_distance is already exported by image_dedup with identical semantics) +from je_auto_control.utils.near_dup import ( + minhash_signature, minhash_similarity, near_duplicates, simhash, +) # S3-compatible artifact store (optional boto3, injectable client) from je_auto_control.utils.artifact_store import ( S3ArtifactStore, configure_default_store, get_default_store, @@ -935,6 +940,7 @@ def start_autocontrol_gui(*args, **kwargs): "slugify", "damerau_levenshtein", "dice", "jaccard", "jaro", "jaro_winkler", "levenshtein", "similarity", + "minhash_signature", "minhash_similarity", "near_duplicates", "simhash", "S3ArtifactStore", "configure_default_store", "get_default_store", "set_default_store", "average_hash", "dedupe_images", "dhash", "hamming_distance", diff --git a/je_auto_control/gui/script_builder/command_schema.py b/je_auto_control/gui/script_builder/command_schema.py index 8fd416a9..3fbb54e4 100644 --- a/je_auto_control/gui/script_builder/command_schema.py +++ b/je_auto_control/gui/script_builder/command_schema.py @@ -1678,6 +1678,23 @@ def _add_resilience_specs(specs: List[CommandSpec]) -> None: ), description="Normalised string similarity (Jaro-Winkler / edit / Jaccard).", )) + specs.append(CommandSpec( + "AC_simhash", "Data", "Near-Dup: SimHash", + fields=( + FieldSpec("text", FieldType.STRING, placeholder="some text"), + FieldSpec("bits", FieldType.INT, optional=True, default=64), + ), + description="SimHash fingerprint (int) of text.", + )) + specs.append(CommandSpec( + "AC_near_duplicates", "Data", "Near-Dup: Cluster Texts", + fields=( + FieldSpec("texts", FieldType.STRING, + placeholder='["the cat sat", "the cat sat down", "dog"]'), + FieldSpec("max_distance", FieldType.INT, optional=True, default=3), + ), + description="Cluster near-duplicate texts by SimHash distance.", + )) specs.append(CommandSpec( "AC_spans_to_otlp", "Report", "OTLP: Export Spans", fields=( diff --git a/je_auto_control/utils/executor/action_executor.py b/je_auto_control/utils/executor/action_executor.py index 4f8441d7..cf859378 100644 --- a/je_auto_control/utils/executor/action_executor.py +++ b/je_auto_control/utils/executor/action_executor.py @@ -3159,6 +3159,21 @@ def _text_similarity(a: str, b: str, return {"score": similarity(a, b, metric=metric)} +def _simhash(text: str, bits: Any = 64) -> Dict[str, Any]: + """Adapter: SimHash fingerprint of text (as int).""" + from je_auto_control.utils.near_dup import simhash + return {"simhash": simhash(text, bits=int(bits))} + + +def _near_duplicates(texts: Any, max_distance: Any = 3) -> Dict[str, Any]: + """Adapter: cluster near-duplicate texts by SimHash distance.""" + import json + from je_auto_control.utils.near_dup import near_duplicates + if isinstance(texts, str): + texts = json.loads(texts) + return {"clusters": near_duplicates(texts, max_distance=int(max_distance))} + + def _canonical_log(fields: Any) -> Dict[str, Any]: """Adapter: build a canonical log line from a fields dict.""" import json @@ -4469,6 +4484,8 @@ def __init__(self): "AC_normalize_text": _normalize_text, "AC_slugify": _slugify, "AC_text_similarity": _text_similarity, + "AC_simhash": _simhash, + "AC_near_duplicates": _near_duplicates, "AC_validate_config": _validate_config, "AC_resolve_ref": _resolve_ref, "AC_resolve_refs": _resolve_refs, diff --git a/je_auto_control/utils/mcp_server/tools/_factories.py b/je_auto_control/utils/mcp_server/tools/_factories.py index b15c02c5..c5e39ae8 100644 --- a/je_auto_control/utils/mcp_server/tools/_factories.py +++ b/je_auto_control/utils/mcp_server/tools/_factories.py @@ -3810,6 +3810,32 @@ def otlp_export_tools() -> List[MCPTool]: ] +def near_dup_tools() -> List[MCPTool]: + return [ + MCPTool( + name="ac_simhash", + description=("SimHash fingerprint (int) of 'text' (optional 'bits'). " + "Returns {simhash}."), + input_schema=schema( + {"text": {"type": "string"}, "bits": {"type": "integer"}}, + ["text"]), + handler=h.simhash, + annotations=READ_ONLY, + ), + MCPTool( + name="ac_near_duplicates", + description=("Cluster near-duplicate 'texts' within 'max_distance' " + "SimHash bits. Returns {clusters} of index lists."), + input_schema=schema( + {"texts": {"type": "array"}, + "max_distance": {"type": "integer"}}, + ["texts"]), + handler=h.near_duplicates, + annotations=READ_ONLY, + ), + ] + + def text_similarity_tools() -> List[MCPTool]: return [ MCPTool( @@ -5452,7 +5478,7 @@ def media_assert_tools() -> List[MCPTool]: feature_flag_tools, provenance_tools, json_contract_tools, chaos_tools, slo_tools, percentiles_tools, bulkhead_tools, http_cassette_tools, trace_context_tools, baggage_tools, canonical_log_tools, otlp_export_tools, - text_normalize_tools, text_similarity_tools, + text_normalize_tools, text_similarity_tools, near_dup_tools, secret_ref_tools, config_schema_tools, config_redaction_tools, data_profile_tools, http_problem_tools, dotenv_tools, sse_client_tools, layered_config_tools, data_drift_tools, schema_compat_tools, diff --git a/je_auto_control/utils/mcp_server/tools/_handlers.py b/je_auto_control/utils/mcp_server/tools/_handlers.py index a2326013..0d991f85 100644 --- a/je_auto_control/utils/mcp_server/tools/_handlers.py +++ b/je_auto_control/utils/mcp_server/tools/_handlers.py @@ -1751,6 +1751,16 @@ def text_similarity(a, b, metric="jaro_winkler"): return _text_similarity(a, b, metric) +def simhash(text, bits=64): + from je_auto_control.utils.executor.action_executor import _simhash + return _simhash(text, bits) + + +def near_duplicates(texts, max_distance=3): + from je_auto_control.utils.executor.action_executor import _near_duplicates + return _near_duplicates(texts, max_distance) + + def canonical_log(fields): from je_auto_control.utils.executor.action_executor import _canonical_log return _canonical_log(fields) diff --git a/je_auto_control/utils/near_dup/__init__.py b/je_auto_control/utils/near_dup/__init__.py new file mode 100644 index 00000000..1345c56a --- /dev/null +++ b/je_auto_control/utils/near_dup/__init__.py @@ -0,0 +1,10 @@ +"""Near-duplicate text detection (SimHash / MinHash) for AutoControl.""" +from je_auto_control.utils.near_dup.near_dup import ( + hamming_distance, minhash_signature, minhash_similarity, near_duplicates, + simhash, +) + +__all__ = [ + "hamming_distance", "minhash_signature", "minhash_similarity", + "near_duplicates", "simhash", +] diff --git a/je_auto_control/utils/near_dup/near_dup.py b/je_auto_control/utils/near_dup/near_dup.py new file mode 100644 index 00000000..a2626a7f --- /dev/null +++ b/je_auto_control/utils/near_dup/near_dup.py @@ -0,0 +1,95 @@ +"""Near-duplicate text detection via SimHash and MinHash fingerprints. + +``fuzzy.fuzzy_dedupe`` is O(n²) pairwise ``SequenceMatcher`` with no stable +fingerprint, and ``image_dedup`` only hashes pixels. This adds text +fingerprints — SimHash (Hamming-distance near-dup) and MinHash (estimated +Jaccard) — that scale and give a reusable signature, the text analog of the +perceptual image hash. + +Pure standard library (``hashlib`` / ``re``); imports no ``PySide6``. A fixed +hash (``blake2b``, not the salted built-in ``hash()``) keeps fingerprints +deterministic across runs and CI. +""" +import hashlib +import re +from typing import List, Sequence + +_TOKEN_RE = re.compile(r"\w+") + + +def _tokens(text: str) -> List[str]: + return _TOKEN_RE.findall((text or "").lower()) + + +def _shingles(text: str, k: int = 2) -> List[str]: + tokens = _tokens(text) + if len(tokens) < k: + return [" ".join(tokens)] if tokens else [""] + return [" ".join(tokens[i:i + k]) for i in range(len(tokens) - k + 1)] + + +def _hash64(value: str, salt: bytes = b"") -> int: + digest = hashlib.blake2b(value.encode("utf-8"), digest_size=8, + salt=salt).digest() + return int.from_bytes(digest, "big") + + +def simhash(text: str, *, bits: int = 64) -> int: + """Return a ``bits``-wide SimHash fingerprint of ``text``.""" + vector = [0] * bits + for shingle in _shingles(text): + value = _hash64(shingle) + for index in range(bits): + vector[index] += 1 if (value >> index) & 1 else -1 + result = 0 + for index in range(bits): + if vector[index] > 0: + result |= (1 << index) + return result + + +def hamming_distance(a: int, b: int) -> int: + """Number of differing bits between two fingerprints.""" + return bin(a ^ b).count("1") + + +def near_duplicates(texts: Sequence[str], *, max_distance: int = 3, + bits: int = 64) -> List[List[int]]: + """Cluster ``texts`` whose SimHashes are within ``max_distance`` bits. + + Returns a list of clusters, each a list of indices into ``texts`` + (singletons included, so the result partitions every input). + """ + hashes = [simhash(text, bits=bits) for text in texts] + assigned = [False] * len(texts) + clusters: List[List[int]] = [] + for i in range(len(texts)): + if assigned[i]: + continue + group = [i] + assigned[i] = True + for j in range(i + 1, len(texts)): + if not assigned[j] and hamming_distance(hashes[i], + hashes[j]) <= max_distance: + group.append(j) + assigned[j] = True + clusters.append(group) + return clusters + + +def minhash_signature(text: str, *, num_perm: int = 64) -> List[int]: + """Return a MinHash signature (``num_perm`` minima) of ``text``.""" + shingles = set(_shingles(text)) + signature: List[int] = [] + for seed in range(num_perm): + salt = seed.to_bytes(2, "big") + signature.append(min(_hash64(shingle, salt) for shingle in shingles)) + return signature + + +def minhash_similarity(sig_a: Sequence[int], sig_b: Sequence[int]) -> float: + """Estimate the Jaccard similarity from two MinHash signatures.""" + if not sig_a or len(sig_a) != len(sig_b): + return 0.0 + equal = sum(1 for left, right in zip(sig_a, sig_b) if left == right) + return equal / len(sig_a) diff --git a/test/unit_test/headless/test_near_dup_batch.py b/test/unit_test/headless/test_near_dup_batch.py new file mode 100644 index 00000000..a8994ce4 --- /dev/null +++ b/test/unit_test/headless/test_near_dup_batch.py @@ -0,0 +1,79 @@ +"""Headless tests for near-duplicate text detection. Pure stdlib, no Qt.""" +import json + +import je_auto_control as ac +from je_auto_control.utils.near_dup import ( + hamming_distance, minhash_signature, minhash_similarity, near_duplicates, + simhash, +) + + +def test_simhash_deterministic_and_identical(): + a = simhash("the quick brown fox jumps") + assert a == simhash("the quick brown fox jumps") # deterministic + assert hamming_distance(a, a) == 0 + + +def test_near_texts_have_small_distance(): + a = simhash("the quick brown fox jumps over the lazy dog") + b = simhash("the quick brown fox jumps over the lazy dogs") + far = simhash("completely unrelated sentence about databases") + assert hamming_distance(a, b) < hamming_distance(a, far) + + +def test_near_duplicates_clusters(): + texts = [ + "the cat sat on the mat", + "the cat sat on the mat today", # near-dup of #0 + "quantum chromodynamics lecture", # unrelated + ] + clusters = near_duplicates(texts, max_distance=12) + # every index appears exactly once across clusters (partition) + assert sorted(i for group in clusters for i in group) == [0, 1, 2] + # 0 and 1 land together, 2 separate + group_of = {i: gi for gi, group in enumerate(clusters) for i in group} + assert group_of[0] == group_of[1] and group_of[2] != group_of[0] + + +def test_minhash_similarity(): + sig_a = minhash_signature("the quick brown fox") + sig_b = minhash_signature("the quick brown fox") + assert minhash_similarity(sig_a, sig_b) == 1.0 + sig_c = minhash_signature("entirely different words here now") + assert minhash_similarity(sig_a, sig_c) < 1.0 + assert minhash_similarity([], [1]) == 0.0 + + +def test_hamming_distance(): + assert hamming_distance(0b1010, 0b1001) == 2 + + +# --- wiring --------------------------------------------------------------- + +def test_executor_round_trip(): + rec = ac.execute_action([["AC_simhash", {"text": "hello world"}]]) + value = next(v for v in rec.values() if isinstance(v, dict))["simhash"] + assert isinstance(value, int) + texts = json.dumps(["the cat sat", "the cat sat now", "zzz"]) + rec2 = ac.execute_action([[ + "AC_near_duplicates", {"texts": texts, "max_distance": 8}]]) + clusters = next(v for v in rec2.values() if isinstance(v, dict))["clusters"] + assert sorted(i for g in clusters for i in g) == [0, 1, 2] + + +def test_wiring(): + known = ac.executor.known_commands() + assert {"AC_simhash", "AC_near_duplicates"} <= set(known) + from je_auto_control.utils.mcp_server.tools import build_default_tool_registry + names = {t.name for t in build_default_tool_registry()} + assert {"ac_simhash", "ac_near_duplicates"} <= names + from je_auto_control.gui.script_builder.command_schema import _build_specs + specs = {s.command for s in _build_specs()} + assert {"AC_simhash", "AC_near_duplicates"} <= specs + + +def test_facade_exports(): + # hamming_distance is exported by image_dedup (identical int semantics) + for attr in ("simhash", "near_duplicates", "minhash_signature", + "minhash_similarity"): + assert hasattr(ac, attr) and attr in ac.__all__ From 991ec6abaff270422de45935c151ca00de022658 Mon Sep 17 00:00:00 2001 From: JeffreyChen Date: Mon, 22 Jun 2026 07:24:25 +0800 Subject: [PATCH 2/2] Use pytest.approx for minhash float comparisons (Sonar S1244) --- test/unit_test/headless/test_near_dup_batch.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/unit_test/headless/test_near_dup_batch.py b/test/unit_test/headless/test_near_dup_batch.py index a8994ce4..2849ee3f 100644 --- a/test/unit_test/headless/test_near_dup_batch.py +++ b/test/unit_test/headless/test_near_dup_batch.py @@ -1,6 +1,8 @@ """Headless tests for near-duplicate text detection. Pure stdlib, no Qt.""" import json +import pytest + import je_auto_control as ac from je_auto_control.utils.near_dup import ( hamming_distance, minhash_signature, minhash_similarity, near_duplicates, @@ -38,10 +40,10 @@ def test_near_duplicates_clusters(): def test_minhash_similarity(): sig_a = minhash_signature("the quick brown fox") sig_b = minhash_signature("the quick brown fox") - assert minhash_similarity(sig_a, sig_b) == 1.0 + assert minhash_similarity(sig_a, sig_b) == pytest.approx(1.0) sig_c = minhash_signature("entirely different words here now") assert minhash_similarity(sig_a, sig_c) < 1.0 - assert minhash_similarity([], [1]) == 0.0 + assert minhash_similarity([], [1]) == pytest.approx(0.0) def test_hamming_distance():