ParzivalHack · ParzivalHack · May 27, 2026 · May 27, 2026
diff --git a/src/pyspector/config.py b/src/pyspector/config.py
@@ -1,3 +1,4 @@
+import re
 from pathlib import Path
 import toml # type: ignore
 import click # type: ignore
@@ -8,6 +9,16 @@
     # Fallback for older Python versions
     import importlib_resources as pkg_resources # type: ignore
 
+# Sentinel a rule's `exclude_pattern` can embed to inherit the shared regex at
+# [defaults].exclude_pattern_placeholder; `get_default_rules` substitutes it in
+# the TOML text before that goes to the Rust core. _PLACEHOLDER_KEY_RX grabs
+# the key's double-quoted value raw (escapes intact), whatever table it is in.
+_PLACEHOLDER_SENTINEL = "__SHARED_PLACEHOLDERS__"
+_PLACEHOLDER_KEY_RX = re.compile(
+    r'^\s*exclude_pattern_placeholder\s*=\s*"((?:[^"\\]|\\.)*)"',
+    re.MULTILINE,
+)
+
 DEFAULT_CONFIG = {
     "exclude": [
         ".venv", "venv", ".git", "__pycache__", "build", "dist", "*.egg-info",
@@ -37,14 +48,26 @@ def load_config(config_path: Path) -> dict:
     return DEFAULT_CONFIG
 
 def get_default_rules(ai_scan: bool = False) -> str:
-    """Loads the built-in TOML rules file from package resources."""
+    """Return the built-in rules as TOML text, loaded from package resources.
+
+    If `ai_scan` is true the AI/LLM rules file is appended. Each
+    `__SHARED_PLACEHOLDERS__` sentinel is then replaced by the value of the
+    first `exclude_pattern_placeholder = "..."` line, keeping that regex in one
+    place; without one the text is unchanged. Any error raises FileNotFoundError.
+    """
     try:
         base_rules = pkg_resources.files('pyspector.rules').joinpath('built-in-rules.toml').read_text(encoding='utf-8')
         if ai_scan:
             click.echo("[*] AI scanning enabled. Loading additional AI/LLM rules.")
             ai_rules = pkg_resources.files('pyspector.rules').joinpath('built-in-rules-ai.toml').read_text(encoding='utf-8')
-            # Combine the two rulesets
-            return base_rules + "\n" + ai_rules
-        return base_rules
+            text = base_rules + "\n" + ai_rules
+        else:
+            text = base_rules
+
+        # Inline the shared regex; group(1) is the raw quoted body, escapes untouched
+        m = _PLACEHOLDER_KEY_RX.search(text)
+        if m and _PLACEHOLDER_SENTINEL in text:
+            text = text.replace(_PLACEHOLDER_SENTINEL, m.group(1))
+        return text
     except Exception as e:
         raise FileNotFoundError(f"Could not load built-in-rules.toml from package data! Error: {e}")
diff --git a/src/pyspector/rules/built-in-rules-ai.toml b/src/pyspector/rules/built-in-rules-ai.toml
@@ -314,8 +314,13 @@ id = "AI404"
 description = "Hugging Face authentication token is hardcoded in the source file."
 severity = "Critical"
 remediation = "Store Hugging Face tokens and other secrets in environment variables or a secrets management tool, not in source code."
-pattern = "token\\s*=\\s*[\"']hf_"
+# Real HF tokens are hf_ + ~34 alphanumeric chars. Require at least 16 consecutive
+# alphanumeric characters after `hf_` to drop placeholders like "hf_token", "hf_X",
+# "hf_xxx_your_token", and docstring examples like 'hf_....'.
+pattern = "token\\s*=\\s*[\"']hf_[A-Za-z0-9]{16,}"
 file_pattern = "*.py"
+# Doctest examples (>>> / ...) shouldn't fire even if they happen to use a long fake token.
+exclude_pattern = "^\\s*(>>>|\\.\\.\\.)\\s"
 
 [[rule]]
 id = "AI405"