Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 27 additions & 4 deletions src/pyspector/config.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import re
from pathlib import Path
import toml # type: ignore
import click # type: ignore
Expand All @@ -8,6 +9,16 @@
# Fallback for older Python versions
import importlib_resources as pkg_resources # type: ignore

# Marker a rule may embed in its `exclude_pattern` to pull in the shared
# placeholder regex declared at [defaults].exclude_pattern_placeholder.
# `get_default_rules` swaps the marker for that regex via plain string
# substitution before the TOML text is handed to the Rust core.
_PLACEHOLDER_SENTINEL = "__SHARED_PLACEHOLDERS__"

# Finds the `exclude_pattern_placeholder = "..."` assignment and captures the
# text between the double quotes exactly as written (backslash escapes are
# skipped over, not decoded). MULTILINE lets `^` anchor at any line start.
_PLACEHOLDER_KEY_RX = re.compile(
    pattern=r'^\s*exclude_pattern_placeholder\s*=\s*"((?:[^"\\]|\\.)*)"',
    flags=re.MULTILINE,
)

DEFAULT_CONFIG = {
"exclude": [
".venv", "venv", ".git", "__pycache__", "build", "dist", "*.egg-info",
Expand Down Expand Up @@ -37,14 +48,26 @@ def load_config(config_path: Path) -> dict:
return DEFAULT_CONFIG

def get_default_rules(ai_scan: bool = False) -> str:
    """Load the built-in TOML rules text from package resources.

    Every occurrence of the `__SHARED_PLACEHOLDERS__` sentinel is replaced
    with the value of `[defaults].exclude_pattern_placeholder`, so the
    placeholder/dummy-secret regex lives in one place rather than being
    copy-pasted across every format-specific rule.

    Args:
        ai_scan: When true, the AI/LLM ruleset is appended after the base
            ruleset (separated by a newline).

    Returns:
        The combined rules as TOML text with the sentinel inlined.

    Raises:
        FileNotFoundError: If reading the packaged rule files fails for any
            reason; the underlying exception is chained as the cause.
    """
    try:
        rules_pkg = pkg_resources.files('pyspector.rules')
        text = rules_pkg.joinpath('built-in-rules.toml').read_text(encoding='utf-8')
        if ai_scan:
            click.echo("[*] AI scanning enabled. Loading additional AI/LLM rules.")
            ai_rules = rules_pkg.joinpath('built-in-rules-ai.toml').read_text(encoding='utf-8')
            # Combine the two rulesets
            text = text + "\n" + ai_rules
    except Exception as e:
        # Keep the exception type callers already handle, but chain the
        # original error so its traceback is not lost.
        raise FileNotFoundError(f"Could not load built-in-rules.toml from package data! Error: {e}") from e

    # Inline shared placeholder regex into rule-level exclude_patterns.
    if _PLACEHOLDER_SENTINEL in text:
        placeholder = _PLACEHOLDER_KEY_RX.search(text)
        if placeholder:
            # The captured text is the raw (still TOML-escaped) string body,
            # which is pasted verbatim wherever the sentinel appears.
            text = text.replace(_PLACEHOLDER_SENTINEL, placeholder.group(1))
        else:
            # Without the shared definition the sentinel stays in the text
            # and is returned as-is; say so instead of failing silently.
            click.echo(
                f"[!] Warning: rules reference {_PLACEHOLDER_SENTINEL} but no "
                "exclude_pattern_placeholder is defined; sentinel left unexpanded.",
                err=True,
            )
    return text
7 changes: 6 additions & 1 deletion src/pyspector/rules/built-in-rules-ai.toml
Original file line number Diff line number Diff line change
Expand Up @@ -314,8 +314,13 @@ id = "AI404"
description = "Hugging Face authentication token is hardcoded in the source file."
severity = "Critical"
remediation = "Store Hugging Face tokens and other secrets in environment variables or a secrets management tool, not in source code."
pattern = "token\\s*=\\s*[\"']hf_"
# Real HF tokens are hf_ + ~34 alphanumeric chars. Require at least 16 consecutive
# alphanumeric characters after `hf_` to drop placeholders like "hf_token", "hf_X",
# "hf_xxx_your_token", and docstring examples like 'hf_....'.
pattern = "token\\s*=\\s*[\"']hf_[A-Za-z0-9]{16,}"
file_pattern = "*.py"
# Doctest examples (>>> / ...) shouldn't fire even if they happen to use a long fake token.
exclude_pattern = "^\\s*(>>>|\\.\\.\\.)\\s"

[[rule]]
id = "AI405"
Expand Down
Loading
Loading