diff --git a/README.md b/README.md index 061e94e..a165a36 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ Unified telemetry and error tracking for OpenAdapt packages. - **Unified Error Tracking**: Consistent error reporting across all OpenAdapt packages - **Privacy-First Design**: Automatic PII scrubbing and path sanitization - **Configurable Opt-Out**: Respects `DO_NOT_TRACK` and custom environment variables -- **CI/Dev Mode Detection**: Automatically tags internal usage for filtering +- **Internal Usage Tagging**: Explicit flags + CI detection with optional git heuristic - **GlitchTip/Sentry Compatible**: Uses the Sentry SDK for maximum compatibility ## Installation @@ -98,10 +98,12 @@ with TelemetrySpan("indexing", "build_faiss_index") as span: | `OPENADAPT_TELEMETRY_ENABLED` | `true` | Enable/disable telemetry | | `OPENADAPT_INTERNAL` | `false` | Tag as internal usage | | `OPENADAPT_DEV` | `false` | Development mode | +| `OPENADAPT_INTERNAL_FROM_GIT` | `false` | Optional: tag as internal when running from a git checkout | | `OPENADAPT_TELEMETRY_DSN` | - | GlitchTip/Sentry DSN | | `OPENADAPT_TELEMETRY_ENVIRONMENT` | `production` | Environment name | | `OPENADAPT_TELEMETRY_SAMPLE_RATE` | `1.0` | Error sampling rate (0.0-1.0) | | `OPENADAPT_TELEMETRY_TRACES_SAMPLE_RATE` | `0.01` | Performance sampling rate | +| `OPENADAPT_TELEMETRY_ANON_SALT` | generated | Optional anonymization salt override (advanced use only) | ### Configuration File @@ -160,7 +162,9 @@ export OPENADAPT_TELEMETRY_ENABLED=false - File paths have usernames replaced with `` - Sensitive fields (password, token, api_key, etc.) are redacted - Email addresses and phone numbers are scrubbed from messages -- User IDs are hashed before upload (`anon:`) +- Top-level event messages/logentry strings are scrubbed +- Tag keys are validated, sensitive/invalid keys are dropped, and values are scrubbed before upload +- User IDs are HMAC-anonymized before upload (`anon:v2:`) - `send_default_pii` is enforced to `false` by the client ## Internal Usage Tagging @@ -169,8 +173,8 @@ Internal/developer usage is automatically detected via: 1. `OPENADAPT_INTERNAL=true` environment variable 2. `OPENADAPT_DEV=true` environment variable -3. Git repository present in working directory -4. CI environment detected (GitHub Actions, GitLab CI, etc.) +3. CI environment detected (GitHub Actions, GitLab CI, etc.) +4. Optional git repository heuristic when `OPENADAPT_INTERNAL_FROM_GIT=true` Filter in GlitchTip: ``` diff --git a/src/openadapt_telemetry/client.py b/src/openadapt_telemetry/client.py index d3e7aa7..d25da41 100644 --- a/src/openadapt_telemetry/client.py +++ b/src/openadapt_telemetry/client.py @@ -60,11 +60,11 @@ def is_ci_environment() -> bool: def is_internal_user() -> bool: """Determine if current usage is from internal team. - Uses multiple heuristics to detect internal/developer usage: + Uses multiple signals to detect internal/developer usage: 1. Explicit OPENADAPT_INTERNAL environment variable 2. OPENADAPT_DEV environment variable - 3. Git repository present in current directory - 4. CI environment detected + 3. CI environment detected + 4. Optional git repository heuristic when OPENADAPT_INTERNAL_FROM_GIT=true Returns: True if this appears to be internal usage. @@ -77,25 +77,26 @@ def is_internal_user() -> bool: if os.getenv("OPENADAPT_DEV", "").lower() in ("true", "1", "yes"): return True - # Method 3: Git repository present (development checkout) - if Path(".git").exists() or Path("../.git").exists(): - return True - - # Method 4: CI/CD environment + # Method 3: CI/CD environment if is_ci_environment(): return True + # Method 4: optional git heuristic + if os.getenv("OPENADAPT_INTERNAL_FROM_GIT", "").lower() in ("true", "1", "yes"): + if Path(".git").exists() or Path("../.git").exists(): + return True + return False def _compose_before_send(base: BeforeSendFn, extra: BeforeSendFn) -> BeforeSendFn: - """Compose custom before_send after privacy filtering.""" + """Compose custom before_send before final privacy filtering.""" def composed(event: Event, hint: Hint) -> Optional[Event]: - sanitized = base(event, hint) - if sanitized is None: + modified = extra(event, hint) + if modified is None: return None - return extra(sanitized, hint) + return base(modified, hint) return composed @@ -219,7 +220,7 @@ def initialize( if not callable(custom_before_send): raise TypeError("before_send must be callable") warnings.warn( - "Custom before_send is composed after OpenAdapt privacy filtering and cannot bypass scrubbing.", + "Custom before_send runs before OpenAdapt privacy filtering; final payload is always scrubbed.", stacklevel=2, ) before_send = _compose_before_send(base_before_send, custom_before_send) diff --git a/src/openadapt_telemetry/config.py b/src/openadapt_telemetry/config.py index a7c047a..8129f79 100644 --- a/src/openadapt_telemetry/config.py +++ b/src/openadapt_telemetry/config.py @@ -11,6 +11,8 @@ import json import os +import secrets +import warnings from dataclasses import dataclass, field from pathlib import Path from typing import Any, Optional @@ -27,11 +29,13 @@ "performance_tracking": True, "feature_usage": True, "send_default_pii": False, + "anon_salt": None, } # Config file location CONFIG_DIR = Path.home() / ".config" / "openadapt" CONFIG_FILE = CONFIG_DIR / "telemetry.json" +_INVALID_ANON_SALT_WARNED = False @dataclass @@ -48,6 +52,7 @@ class TelemetryConfig: performance_tracking: bool = True feature_usage: bool = True send_default_pii: bool = False + anon_salt: Optional[str] = None _loaded: bool = field(default=False, repr=False) @@ -73,7 +78,8 @@ def _load_config_file() -> dict[str, Any]: try: with open(CONFIG_FILE) as f: - return json.load(f) + data = json.load(f) + return data if isinstance(data, dict) else {} except (json.JSONDecodeError, OSError): return {} @@ -122,9 +128,72 @@ def _get_env_config() -> dict[str, Any]: except ValueError: pass + # Optional override for deterministic anonymization in controlled environments. + anon_salt = os.getenv("OPENADAPT_TELEMETRY_ANON_SALT") + if anon_salt: + if _is_valid_anon_salt(anon_salt): + config["anon_salt"] = anon_salt.strip() + else: + _warn_invalid_anon_salt_once() + return config +def _is_valid_anon_salt(value: Any) -> bool: + """Check whether a salt value is valid for HMAC anonymization.""" + return isinstance(value, str) and len(value.strip()) >= 32 + + +def _warn_invalid_anon_salt_once() -> None: + """Warn once per process when OPENADAPT_TELEMETRY_ANON_SALT is invalid.""" + global _INVALID_ANON_SALT_WARNED + if _INVALID_ANON_SALT_WARNED: + return + warnings.warn( + "Ignoring invalid OPENADAPT_TELEMETRY_ANON_SALT; must be >= 32 chars.", + stacklevel=2, + ) + _INVALID_ANON_SALT_WARNED = True + + +def _generate_anon_salt() -> str: + """Generate a high-entropy random salt.""" + return secrets.token_hex(32) + + +def get_or_create_anon_salt() -> str: + """Get anonymization salt from env/config, creating one if missing. + + Priority: + 1. OPENADAPT_TELEMETRY_ANON_SALT (if valid) + 2. telemetry config file `anon_salt` (if valid) + 3. generated and persisted random salt + """ + env_salt = os.getenv("OPENADAPT_TELEMETRY_ANON_SALT") + if env_salt: + if _is_valid_anon_salt(env_salt): + return env_salt.strip() + _warn_invalid_anon_salt_once() + + config_data = _load_config_file() + file_salt = config_data.get("anon_salt") + if _is_valid_anon_salt(file_salt): + return str(file_salt).strip() + + generated = _generate_anon_salt() + config_data["anon_salt"] = generated + try: + CONFIG_DIR.mkdir(parents=True, exist_ok=True) + with open(CONFIG_FILE, "w") as f: + json.dump(config_data, f, indent=2) + except OSError: + warnings.warn( + "Failed to persist telemetry anonymization salt; using ephemeral salt for this process.", + stacklevel=2, + ) + return generated + + def load_config() -> TelemetryConfig: """Load telemetry configuration from all sources. @@ -148,7 +217,7 @@ def load_config() -> TelemetryConfig: merged.update(env_config) # Remove None values for fields that should use defaults - config_dict = {k: v for k, v in merged.items() if v is not None or k == "dsn"} + config_dict = {k: v for k, v in merged.items() if v is not None or k in {"dsn", "anon_salt"}} return TelemetryConfig(**config_dict, _loaded=True) @@ -172,6 +241,7 @@ def save_config(config: TelemetryConfig) -> None: "performance_tracking": config.performance_tracking, "feature_usage": config.feature_usage, "send_default_pii": config.send_default_pii, + "anon_salt": config.anon_salt, } with open(CONFIG_FILE, "w") as f: diff --git a/src/openadapt_telemetry/decorators.py b/src/openadapt_telemetry/decorators.py index 7925f34..1e6d714 100644 --- a/src/openadapt_telemetry/decorators.py +++ b/src/openadapt_telemetry/decorators.py @@ -10,7 +10,7 @@ import functools import time -from typing import Any, Callable, Optional, TypeVar, Union +from typing import Any, Callable, Optional, TypeVar import sentry_sdk @@ -57,7 +57,7 @@ def wrapper(*args: Any, **kwargs: Any) -> Any: result = func(*args, **kwargs) transaction.set_status("ok") return result - except Exception as e: + except Exception: transaction.set_status("internal_error") raise finally: diff --git a/src/openadapt_telemetry/privacy.py b/src/openadapt_telemetry/privacy.py index 48a1f0f..027d10b 100644 --- a/src/openadapt_telemetry/privacy.py +++ b/src/openadapt_telemetry/privacy.py @@ -10,9 +10,13 @@ from __future__ import annotations import hashlib +import hmac import re +from functools import lru_cache from typing import Any, Dict, List, Optional, Set +from .config import get_or_create_anon_salt + # Sensitive field names that should have their values redacted PII_DENYLIST: Set[str] = { # Authentication @@ -95,6 +99,97 @@ re.compile(r"[A-Za-z0-9+/]{40,}={0,2}"), ] +# Whitelisted tags that retain observability value and are safe to keep. +ALLOWED_OBSERVABILITY_KEYS: Set[str] = { + "package", + "package_version", + "python_version", + "os", + "os_version", + "ci", + "internal", +} + +ANON_VERSION = "v2" +TAG_KEY_PATTERN = re.compile(r"^[a-zA-Z0-9._:-]{1,64}$") +MAX_TAGS = 64 +MAX_TAG_VALUE_CHARS = 256 + + +@lru_cache(maxsize=1) +def _get_anon_salt_cached() -> str: + """Memoize anon salt lookup to avoid repeated filesystem reads.""" + return get_or_create_anon_salt() + + +def _is_already_anonymized(value: str, prefix: str = "anon") -> bool: + """Return True only for canonical anonymous ID formats.""" + escaped_prefix = re.escape(prefix) + patterns = ( + rf"^{escaped_prefix}:v2:[0-9a-f]{{16}}$", + rf"^{escaped_prefix}:v1:[0-9a-f]{{16}}$", + rf"^{escaped_prefix}:[0-9a-f]{{16}}$", + rf"^{escaped_prefix}:v2:unknown$", + ) + return any(re.match(pattern, value) for pattern in patterns) + + +def _scrub_top_level_messages(event: Dict[str, Any]) -> None: + """Scrub top-level message fields that may contain free-form text.""" + if isinstance(event.get("message"), str): + event["message"] = scrub_string(event["message"]) + if isinstance(event.get("logentry"), dict): + logentry = event["logentry"] + if isinstance(logentry.get("message"), str): + logentry["message"] = scrub_string(logentry["message"]) + if isinstance(logentry.get("formatted"), str): + logentry["formatted"] = scrub_string(logentry["formatted"]) + + +def _is_safe_tag_key(key: Any) -> bool: + """Return True when a tag key is safe to keep.""" + if not isinstance(key, str): + return False + if not TAG_KEY_PATTERN.match(key): + return False + if is_sensitive_key(key): + return False + return True + + +def _normalize_tag_value(value: Any) -> str: + """Convert a tag value to a scrubbed bounded string.""" + scrubbed = scrub_string(str(value)) + return scrubbed[:MAX_TAG_VALUE_CHARS] + + +def _scrub_tags(tags: Dict[str, Any]) -> Dict[str, Any]: + """Keep safe tags, scrub values, and bound payload size.""" + sanitized: Dict[str, Any] = {} + dropped_count = 0 + + # Preserve explicitly listed observability keys first so they survive capping. + for key in ALLOWED_OBSERVABILITY_KEYS: + if key in tags and _is_safe_tag_key(key): + sanitized[key] = _normalize_tag_value(tags[key]) + + for key, value in tags.items(): + if key in sanitized: + continue + if len(sanitized) >= MAX_TAGS: + dropped_count += 1 + continue + if not _is_safe_tag_key(key): + dropped_count += 1 + continue + sanitized[key] = _normalize_tag_value(value) + + # Keep hard cap strict, including metadata markers. + if dropped_count > 0 and len(sanitized) < MAX_TAGS: + sanitized["_dropped_tag_count"] = str(dropped_count) + + return sanitized + def sanitize_path(path: str) -> str: """Remove username from file paths. @@ -159,14 +254,20 @@ def scrub_string(value: str) -> str: def anonymize_identifier(value: str, prefix: str = "anon") -> str: - """Deterministically anonymize an identifier using SHA-256.""" + """Deterministically anonymize an identifier using versioned HMAC-SHA256.""" normalized = str(value or "").strip() if not normalized: - return f"{prefix}:unknown" - if normalized.startswith(f"{prefix}:"): + return f"{prefix}:{ANON_VERSION}:unknown" + if _is_already_anonymized(normalized, prefix=prefix): return normalized - digest = hashlib.sha256(normalized.encode("utf-8")).hexdigest()[:16] - return f"{prefix}:{digest}" + + salt = _get_anon_salt_cached() + digest = hmac.new( + salt.encode("utf-8"), + normalized.encode("utf-8"), + hashlib.sha256, + ).hexdigest()[:16] + return f"{prefix}:{ANON_VERSION}:{digest}" def scrub_dict( @@ -193,6 +294,8 @@ def scrub_dict( result[key] = scrub_dict(value, deep=True, scrub_values=scrub_values) elif isinstance(value, list) and deep: result[key] = scrub_list(value, scrub_values=scrub_values) + elif isinstance(value, tuple) and deep: + result[key] = tuple(scrub_list(list(value), scrub_values=scrub_values)) elif isinstance(value, str): if scrub_values: result[key] = scrub_string(value) @@ -221,6 +324,8 @@ def scrub_list(data: List[Any], scrub_values: bool = False) -> List[Any]: result.append(scrub_dict(item, deep=True, scrub_values=scrub_values)) elif isinstance(item, list): result.append(scrub_list(item, scrub_values=scrub_values)) + elif isinstance(item, tuple): + result.append(tuple(scrub_list(list(item), scrub_values=scrub_values))) elif isinstance(item, str) and scrub_values: result.append(scrub_string(item)) else: @@ -270,48 +375,63 @@ def create_before_send_filter(): def before_send(event: Event, hint: Hint) -> Optional[Event]: """Filter and sanitize events before sending to Sentry/GlitchTip.""" - # Scrub exception data - if "exception" in event: - scrub_exception_data(event["exception"]) - - # Scrub breadcrumbs - if "breadcrumbs" in event and "values" in event["breadcrumbs"]: - for breadcrumb in event["breadcrumbs"]["values"]: - if "message" in breadcrumb and isinstance(breadcrumb["message"], str): - breadcrumb["message"] = scrub_string(breadcrumb["message"]) - if "data" in breadcrumb and isinstance(breadcrumb["data"], dict): - breadcrumb["data"] = scrub_dict( - breadcrumb["data"], deep=True, scrub_values=True - ) - - # Scrub extra data - if "extra" in event and isinstance(event["extra"], dict): - event["extra"] = scrub_dict(event["extra"], deep=True, scrub_values=True) - - # Scrub contexts - if "contexts" in event and isinstance(event["contexts"], dict): - event["contexts"] = scrub_dict(event["contexts"], deep=True, scrub_values=False) - - # Scrub tags - if "tags" in event and isinstance(event["tags"], dict): - event["tags"] = scrub_dict(event["tags"], deep=False, scrub_values=False) - - # Scrub request data (if present) - if "request" in event: - request = event["request"] - if "headers" in request: - request["headers"] = scrub_dict(request["headers"], deep=False, scrub_values=False) - if "data" in request: - if isinstance(request["data"], dict): - request["data"] = scrub_dict(request["data"], deep=True, scrub_values=True) - elif isinstance(request["data"], str): - request["data"] = scrub_string(request["data"]) - - # Keep only anonymous user IDs (drop all other user attributes). - if "user" in event and isinstance(event["user"], dict): - user_id = event["user"].get("id") - event["user"] = {"id": anonymize_identifier(str(user_id))} if user_id else {} - - return event + try: + _scrub_top_level_messages(event) + + # Scrub exception data + if "exception" in event: + scrub_exception_data(event["exception"]) + + # Scrub breadcrumbs + if "breadcrumbs" in event and "values" in event["breadcrumbs"]: + for breadcrumb in event["breadcrumbs"]["values"]: + if "message" in breadcrumb and isinstance(breadcrumb["message"], str): + breadcrumb["message"] = scrub_string(breadcrumb["message"]) + if "data" in breadcrumb and isinstance(breadcrumb["data"], dict): + breadcrumb["data"] = scrub_dict( + breadcrumb["data"], deep=True, scrub_values=True + ) + + # Scrub extra data + if "extra" in event and isinstance(event["extra"], dict): + event["extra"] = scrub_dict(event["extra"], deep=True, scrub_values=True) + + # Scrub contexts + if "contexts" in event and isinstance(event["contexts"], dict): + event["contexts"] = scrub_dict(event["contexts"], deep=True, scrub_values=True) + + # Scrub tags (allowlist + scrub values) + if "tags" in event and isinstance(event["tags"], dict): + event["tags"] = _scrub_tags(event["tags"]) + + # Scrub request data (if present) + if "request" in event and isinstance(event["request"], dict): + request = event["request"] + if "headers" in request: + if isinstance(request["headers"], dict): + request["headers"] = scrub_dict( + request["headers"], deep=False, scrub_values=True + ) + elif isinstance(request["headers"], list): + request["headers"] = scrub_list(request["headers"], scrub_values=True) + elif isinstance(request["headers"], str): + request["headers"] = scrub_string(request["headers"]) + if "data" in request: + if isinstance(request["data"], dict): + request["data"] = scrub_dict(request["data"], deep=True, scrub_values=True) + elif isinstance(request["data"], list): + request["data"] = scrub_list(request["data"], scrub_values=True) + elif isinstance(request["data"], str): + request["data"] = scrub_string(request["data"]) + + # Keep only anonymous user IDs (drop all other user attributes). + if "user" in event and isinstance(event["user"], dict): + user_id = event["user"].get("id") + event["user"] = {"id": anonymize_identifier(str(user_id))} if user_id else {} + + return event + except Exception: + # Fail closed on privacy filter errors. + return None return before_send diff --git a/tests/test_client.py b/tests/test_client.py index 53386e9..cc2606f 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -105,6 +105,32 @@ def test_no_signals_indicates_external(self, _mock_ci, _mock_exists): with patch.dict(os.environ, {"OPENADAPT_INTERNAL": "", "OPENADAPT_DEV": ""}, clear=False): assert is_internal_user() is False + @patch("openadapt_telemetry.client.Path.exists", return_value=True) + @patch("openadapt_telemetry.client.is_ci_environment", return_value=False) + def test_git_repo_not_internal_by_default(self, _mock_ci, _mock_exists): + """Git repos should not be internal unless explicitly enabled.""" + with patch.dict( + os.environ, + { + "OPENADAPT_INTERNAL": "", + "OPENADAPT_DEV": "", + "OPENADAPT_INTERNAL_FROM_GIT": "", + }, + clear=False, + ): + assert is_internal_user() is False + + @patch("openadapt_telemetry.client.Path.exists", return_value=True) + @patch("openadapt_telemetry.client.is_ci_environment", return_value=False) + def test_git_repo_internal_when_opted_in(self, _mock_ci, _mock_exists): + """Git repos should be internal when OPENADAPT_INTERNAL_FROM_GIT is enabled.""" + with patch.dict( + os.environ, + {"OPENADAPT_INTERNAL_FROM_GIT": "true", "OPENADAPT_INTERNAL": "", "OPENADAPT_DEV": ""}, + clear=False, + ): + assert is_internal_user() is True + class TestTelemetryClient: """Tests for TelemetryClient class.""" @@ -255,21 +281,25 @@ def test_set_user_hashes_identifier_and_drops_extra_fields(self, mock_sentry): mock_sentry.set_user.assert_called_once() payload = mock_sentry.set_user.call_args.args[0] assert set(payload.keys()) == {"id"} - assert payload["id"].startswith("anon:") + assert payload["id"].startswith("anon:v2:") assert payload["id"] != "user@example.com" @patch("openadapt_telemetry.client.sentry_sdk") def test_custom_before_send_is_composed_after_privacy_filter(self, mock_sentry): - """Custom before_send should run after built-in scrubbing.""" + """Custom before_send should run before built-in scrubbing.""" def custom_before_send(event, hint): - event.setdefault("tags", {})["custom"] = "true" + event.setdefault("logentry", {})["message"] = "secret user@example.com" + event.setdefault("tags", {})["package"] = "openadapt-evals" return event with patch.dict(os.environ, {"DO_NOT_TRACK": ""}, clear=False): TelemetryClient.reset_instance() client = TelemetryClient.get_instance() - with pytest.warns(UserWarning, match="Custom before_send is composed after OpenAdapt privacy filtering"): + with pytest.warns( + UserWarning, + match="Custom before_send runs before OpenAdapt privacy filtering", + ): client.initialize( dsn="https://test@example.com/1", before_send=custom_before_send, @@ -279,9 +309,9 @@ def custom_before_send(event, hint): event = {"user": {"id": "user@example.com", "email": "user@example.com"}} output = before_send(event, hint={}) assert output is not None - assert output["user"]["id"].startswith("anon:") + assert output["user"]["id"].startswith("anon:v2:") assert "email" not in output["user"] - assert output["tags"]["custom"] == "true" + assert output["logentry"]["message"] == "secret [REDACTED]" @patch("openadapt_telemetry.client.sentry_sdk") def test_send_default_pii_override_is_ignored(self, mock_sentry): diff --git a/tests/test_config.py b/tests/test_config.py index 0d8f9d0..9be445f 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -3,6 +3,7 @@ import json import os import tempfile +import warnings from pathlib import Path from unittest.mock import patch @@ -10,9 +11,12 @@ from openadapt_telemetry.config import ( TelemetryConfig, + _generate_anon_salt, _get_env_config, + _is_valid_anon_salt, _load_config_file, _parse_bool, + get_or_create_anon_salt, load_config, save_config, ) @@ -157,6 +161,19 @@ def test_invalid_sample_rate_ignored(self): config = _get_env_config() assert "sample_rate" not in config + def test_anon_salt_env_override(self): + """OPENADAPT_TELEMETRY_ANON_SALT should be loaded when set.""" + with patch.dict(os.environ, {"OPENADAPT_TELEMETRY_ANON_SALT": "x" * 32}, clear=False): + config = _get_env_config() + assert config.get("anon_salt") == "x" * 32 + + def test_invalid_anon_salt_env_ignored(self): + """Invalid OPENADAPT_TELEMETRY_ANON_SALT should be ignored in env config.""" + with patch.dict(os.environ, {"OPENADAPT_TELEMETRY_ANON_SALT": "short"}, clear=False): + with pytest.warns(UserWarning, match="Ignoring invalid OPENADAPT_TELEMETRY_ANON_SALT"): + config = _get_env_config() + assert "anon_salt" not in config + class TestConfigFile: """Tests for configuration file loading.""" @@ -193,6 +210,18 @@ def test_load_invalid_json(self): os.unlink(f.name) + def test_load_non_dict_json_returns_empty(self): + """Loading valid JSON that is not an object should return empty dict.""" + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + json.dump(["not", "a", "dict"], f) + f.flush() + + with patch("openadapt_telemetry.config.CONFIG_FILE", Path(f.name)): + config = _load_config_file() + assert config == {} + + os.unlink(f.name) + class TestLoadConfig: """Tests for full configuration loading.""" @@ -249,3 +278,61 @@ def test_save_and_load(self): assert loaded.enabled is False assert loaded.environment == "test" assert loaded.sample_rate == 0.5 + + +class TestAnonSalt: + """Tests for per-install anonymization salt handling.""" + + def test_validate_anon_salt(self): + assert _is_valid_anon_salt("x" * 32) is True + assert _is_valid_anon_salt("short") is False + assert _is_valid_anon_salt(None) is False + + def test_generate_anon_salt(self): + salt = _generate_anon_salt() + assert isinstance(salt, str) + assert len(salt) >= 32 + + def test_get_or_create_anon_salt_from_env(self): + with patch.dict(os.environ, {"OPENADAPT_TELEMETRY_ANON_SALT": "y" * 32}, clear=False): + assert get_or_create_anon_salt() == "y" * 32 + + def test_get_or_create_anon_salt_persists_when_missing(self): + with tempfile.TemporaryDirectory() as tmpdir: + config_dir = Path(tmpdir) / "openadapt" + config_file = config_dir / "telemetry.json" + with patch("openadapt_telemetry.config.CONFIG_DIR", config_dir): + with patch("openadapt_telemetry.config.CONFIG_FILE", config_file): + with patch.dict(os.environ, {"OPENADAPT_TELEMETRY_ANON_SALT": ""}, clear=False): + first = get_or_create_anon_salt() + second = get_or_create_anon_salt() + assert first == second + with open(config_file) as f: + data = json.load(f) + assert data["anon_salt"] == first + + def test_get_or_create_anon_salt_recovers_from_corrupt_file(self): + with tempfile.TemporaryDirectory() as tmpdir: + config_dir = Path(tmpdir) / "openadapt" + config_dir.mkdir(parents=True, exist_ok=True) + config_file = config_dir / "telemetry.json" + config_file.write_text("{not-valid-json") + with patch("openadapt_telemetry.config.CONFIG_DIR", config_dir): + with patch("openadapt_telemetry.config.CONFIG_FILE", config_file): + salt = get_or_create_anon_salt() + assert isinstance(salt, str) + assert len(salt) >= 32 + + def test_invalid_anon_salt_warns_once_across_config_paths(self): + with patch("openadapt_telemetry.config._INVALID_ANON_SALT_WARNED", False): + with patch.dict(os.environ, {"OPENADAPT_TELEMETRY_ANON_SALT": "short"}, clear=False): + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + _get_env_config() + get_or_create_anon_salt() + matches = [ + warning + for warning in caught + if "Ignoring invalid OPENADAPT_TELEMETRY_ANON_SALT" in str(warning.message) + ] + assert len(matches) == 1 diff --git a/tests/test_decorators.py b/tests/test_decorators.py index 00346e7..e5698b8 100644 --- a/tests/test_decorators.py +++ b/tests/test_decorators.py @@ -1,6 +1,5 @@ """Tests for telemetry decorators.""" -import os from unittest.mock import MagicMock, patch import pytest @@ -171,20 +170,20 @@ def teardown_method(self): def test_basic_usage(self): """Basic span usage should work.""" - with TelemetrySpan("test_op", "test_span") as span: + with TelemetrySpan("test_op", "test_span"): result = 1 + 1 assert result == 2 def test_with_description(self): """Span with description should work.""" - with TelemetrySpan("op", "name", description="Test description") as span: + with TelemetrySpan("op", "name", description="Test description"): pass def test_exception_handling(self): """Exceptions should propagate through span.""" with pytest.raises(ValueError, match="test"): - with TelemetrySpan("op", "name") as span: + with TelemetrySpan("op", "name"): raise ValueError("test") def test_set_tag(self): diff --git a/tests/test_privacy.py b/tests/test_privacy.py index 817350e..67f4f31 100644 --- a/tests/test_privacy.py +++ b/tests/test_privacy.py @@ -1,7 +1,10 @@ """Tests for privacy filtering and PII scrubbing.""" +from unittest.mock import patch from openadapt_telemetry.privacy import ( + ALLOWED_OBSERVABILITY_KEYS, + MAX_TAGS, anonymize_identifier, create_before_send_filter, is_sensitive_key, @@ -280,19 +283,36 @@ def test_local_variables_scrubbing(self): class TestAnonymizeIdentifier: """Tests for deterministic identifier anonymization.""" - def test_stable_hash(self): - first = anonymize_identifier("user@example.com") - second = anonymize_identifier("user@example.com") + def test_stable_hash_with_same_salt(self): + with patch("openadapt_telemetry.privacy._get_anon_salt_cached", return_value="a" * 32): + first = anonymize_identifier("user@example.com") + second = anonymize_identifier("user@example.com") assert first == second - assert first.startswith("anon:") + assert first.startswith("anon:v2:") assert first != "user@example.com" + def test_different_salts_produce_different_ids(self): + with patch("openadapt_telemetry.privacy._get_anon_salt_cached", return_value="a" * 32): + first = anonymize_identifier("user@example.com") + with patch("openadapt_telemetry.privacy._get_anon_salt_cached", return_value="b" * 32): + second = anonymize_identifier("user@example.com") + assert first != second + def test_empty_value(self): - assert anonymize_identifier("") == "anon:unknown" + assert anonymize_identifier("") == "anon:v2:unknown" def test_already_anonymized_id_is_not_rehashed(self): - anon_id = anonymize_identifier("user@example.com") + with patch("openadapt_telemetry.privacy._get_anon_salt_cached", return_value="a" * 32): + anon_id = anonymize_identifier("user@example.com") assert anonymize_identifier(anon_id) == anon_id + assert anonymize_identifier("anon:v1:1234567890abcdef") == "anon:v1:1234567890abcdef" + assert anonymize_identifier("anon:1234567890abcdef") == "anon:1234567890abcdef" + + def test_non_canonical_anon_prefix_is_rehashed(self): + with patch("openadapt_telemetry.privacy._get_anon_salt_cached", return_value="a" * 32): + value = anonymize_identifier("anon:user@example.com") + assert value.startswith("anon:v2:") + assert value != "anon:user@example.com" class TestBeforeSendUserScrubbing: @@ -311,5 +331,90 @@ def test_only_anonymous_user_id_remains(self): assert sanitized is not None assert "user" in sanitized assert set(sanitized["user"].keys()) == {"id"} - assert sanitized["user"]["id"].startswith("anon:") + assert sanitized["user"]["id"].startswith("anon:v2:") assert sanitized["user"]["id"] != "user@example.com" + + +class TestBeforeSendEventScrubbing: + """Tests for event-level before_send filtering behavior.""" + + def test_top_level_message_is_scrubbed(self): + before_send = create_before_send_filter() + event = {"message": "Failure for user@example.com"} + sanitized = before_send(event, hint={}) + assert sanitized is not None + assert "user@example.com" not in sanitized["message"] + + def test_logentry_message_is_scrubbed(self): + before_send = create_before_send_filter() + event = {"logentry": {"message": "token=Bearer abc123", "formatted": "email user@example.com"}} + sanitized = before_send(event, hint={}) + assert sanitized is not None + assert "abc123" not in sanitized["logentry"]["message"] + assert "user@example.com" not in sanitized["logentry"]["formatted"] + + def test_context_values_are_scrubbed(self): + before_send = create_before_send_filter() + event = {"contexts": {"runtime": {"note": "contact me at user@example.com"}}} + sanitized = before_send(event, hint={}) + assert sanitized is not None + assert sanitized["contexts"]["runtime"]["note"] == "contact me at [REDACTED]" + + def test_tags_are_allowlisted_and_scrubbed(self): + before_send = create_before_send_filter() + event = { + "tags": { + "package": "openadapt-evals", + "internal": "false", + "task_id": "abc123", + "email": "user@example.com", + "not valid": "drop-me", + }, + } + sanitized = before_send(event, hint={}) + assert sanitized is not None + assert set(ALLOWED_OBSERVABILITY_KEYS).intersection(set(sanitized["tags"].keys())) + assert sanitized["tags"]["task_id"] == "abc123" + assert "email" not in sanitized["tags"] + assert "not valid" not in sanitized["tags"] + assert sanitized["tags"]["_dropped_tag_count"] == "2" + + def test_filter_fails_closed_on_unexpected_error(self): + before_send = create_before_send_filter() + with patch("openadapt_telemetry.privacy.scrub_exception_data", side_effect=RuntimeError("boom")): + event = {"exception": {"values": []}} + assert before_send(event, hint={}) is None + + def test_request_non_dict_does_not_drop_event(self): + before_send = create_before_send_filter() + event = {"request": "unexpected-request-shape", "message": "ok"} + sanitized = before_send(event, hint={}) + assert sanitized is not None + assert sanitized["request"] == "unexpected-request-shape" + + def test_request_headers_tuple_list_is_scrubbed(self): + before_send = create_before_send_filter() + event = { + "request": { + "headers": [ + ("Authorization", "Bearer abc123"), + ("X-User", "user@example.com"), + ], + }, + } + sanitized = before_send(event, hint={}) + assert sanitized is not None + assert sanitized["request"]["headers"][0][1] == "[REDACTED]" + assert sanitized["request"]["headers"][1][1] == "[REDACTED]" + + def test_tag_cap_is_hard_limited(self): + before_send = create_before_send_filter() + many_tags = {f"k{i}": f"v{i}" for i in range(MAX_TAGS + 20)} + many_tags.update({"package": "openadapt-evals", "internal": "false"}) + event = {"tags": many_tags} + sanitized = before_send(event, hint={}) + assert sanitized is not None + tags = sanitized["tags"] + assert "package" in tags + assert "internal" in tags + assert len(tags) <= MAX_TAGS