diff --git a/README.md b/README.md index 172da1b..54b7eb7 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ Unified telemetry and error tracking for OpenAdapt packages. - **Usage Counters (PostHog)**: Lightweight product usage events for adoption metrics - **Privacy-First Design**: Automatic PII scrubbing and path sanitization - **Configurable Opt-Out**: Respects `DO_NOT_TRACK` and custom environment variables -- **CI/Dev Mode Detection**: Automatically tags internal usage for filtering +- **Internal Usage Tagging**: Explicit flags + CI detection with optional git heuristic - **GlitchTip/Sentry Compatible**: Uses the Sentry SDK for maximum compatibility ## Installation @@ -111,6 +111,7 @@ with TelemetrySpan("indexing", "build_faiss_index") as span: | `OPENADAPT_TELEMETRY_ENABLED` | `true` | Enable/disable telemetry | | `OPENADAPT_INTERNAL` | `false` | Tag as internal usage | | `OPENADAPT_DEV` | `false` | Development mode | +| `OPENADAPT_INTERNAL_FROM_GIT` | `false` | Optional: tag as internal when running from a git checkout | | `OPENADAPT_TELEMETRY_DSN` | - | GlitchTip/Sentry DSN | | `OPENADAPT_POSTHOG_PROJECT_API_KEY` | embedded default | PostHog ingestion project token (`phc_...`) | | `OPENADAPT_POSTHOG_HOST` | `https://us.i.posthog.com` | PostHog ingestion host | @@ -120,6 +121,7 @@ with TelemetrySpan("indexing", "build_faiss_index") as span: | `OPENADAPT_TELEMETRY_ENVIRONMENT` | `production` | Environment name | | `OPENADAPT_TELEMETRY_SAMPLE_RATE` | `1.0` | Error sampling rate (0.0-1.0) | | `OPENADAPT_TELEMETRY_TRACES_SAMPLE_RATE` | `0.01` | Performance sampling rate | +| `OPENADAPT_TELEMETRY_ANON_SALT` | generated | Optional anonymization salt override (advanced use only) | ### Configuration File @@ -178,6 +180,10 @@ export OPENADAPT_TELEMETRY_ENABLED=false - File paths have usernames replaced with `` - Sensitive fields (password, token, api_key, etc.) are redacted - Email addresses and phone numbers are scrubbed from messages +- Top-level event messages/logentry strings are scrubbed +- Tag keys are validated, sensitive/invalid keys are dropped, and values are scrubbed before upload +- User IDs are HMAC-anonymized before upload (`anon:v2:`) +- `send_default_pii` is enforced to `false` by the client ## Internal Usage Tagging @@ -185,9 +191,8 @@ Internal/developer usage is automatically detected via: 1. `OPENADAPT_INTERNAL=true` environment variable 2. `OPENADAPT_DEV=true` environment variable -3. Running from source (not frozen executable) -4. Git repository present in working directory -5. CI environment detected (GitHub Actions, GitLab CI, etc.) +3. CI environment detected (GitHub Actions, GitLab CI, etc.) +4. Optional git repository heuristic when `OPENADAPT_INTERNAL_FROM_GIT=true` Filter in GlitchTip: ``` diff --git a/src/openadapt_telemetry/client.py b/src/openadapt_telemetry/client.py index 7dfb81a..d25da41 100644 --- a/src/openadapt_telemetry/client.py +++ b/src/openadapt_telemetry/client.py @@ -9,13 +9,17 @@ import os import platform import sys +import warnings from pathlib import Path -from typing import Any, Dict, Optional +from typing import Any, Callable, Dict, Optional import sentry_sdk +from sentry_sdk.types import Event, Hint from .config import TelemetryConfig, load_config -from .privacy import create_before_send_filter +from .privacy import anonymize_identifier, create_before_send_filter + +BeforeSendFn = Callable[[Event, Hint], Optional[Event]] def is_running_from_executable() -> bool: @@ -56,12 +60,11 @@ def is_ci_environment() -> bool: def is_internal_user() -> bool: """Determine if current usage is from internal team. - Uses multiple heuristics to detect internal/developer usage: + Uses multiple signals to detect internal/developer usage: 1. Explicit OPENADAPT_INTERNAL environment variable 2. OPENADAPT_DEV environment variable - 3. Not running from frozen executable - 4. Git repository present in current directory - 5. CI environment detected + 3. CI environment detected + 4. Optional git repository heuristic when OPENADAPT_INTERNAL_FROM_GIT=true Returns: True if this appears to be internal usage. @@ -74,21 +77,30 @@ def is_internal_user() -> bool: if os.getenv("OPENADAPT_DEV", "").lower() in ("true", "1", "yes"): return True - # Method 3: Not running from executable (indicates dev mode) - if not is_running_from_executable(): - return True - - # Method 4: Git repository present (development checkout) - if Path(".git").exists() or Path("../.git").exists(): - return True - - # Method 5: CI/CD environment + # Method 3: CI/CD environment if is_ci_environment(): return True + # Method 4: optional git heuristic + if os.getenv("OPENADAPT_INTERNAL_FROM_GIT", "").lower() in ("true", "1", "yes"): + if Path(".git").exists() or Path("../.git").exists(): + return True + return False +def _compose_before_send(base: BeforeSendFn, extra: BeforeSendFn) -> BeforeSendFn: + """Compose custom before_send before final privacy filtering.""" + + def composed(event: Event, hint: Hint) -> Optional[Event]: + modified = extra(event, hint) + if modified is None: + return None + return base(modified, hint) + + return composed + + class TelemetryClient: """Unified telemetry client for all OpenAdapt packages. @@ -128,20 +140,13 @@ def reset_instance(cls) -> None: def _check_enabled(self) -> bool: """Check if telemetry should be enabled. - Checks environment variables for opt-out signals. + Uses merged config with defaults/env/file precedence. Returns: True if telemetry should be enabled. """ - # Universal opt-out (DO_NOT_TRACK standard) - if os.getenv("DO_NOT_TRACK", "").lower() in ("1", "true"): - return False - - # Package-specific opt-out - if os.getenv("OPENADAPT_TELEMETRY_ENABLED", "").lower() in ("false", "0", "no"): - return False - - return True + self._config = load_config() + return bool(self._config.enabled) @property def enabled(self) -> bool: @@ -187,10 +192,8 @@ def initialize( Returns: True if initialization succeeded, False if disabled or already initialized. """ - if not self._enabled: - return False - - if self._initialized and not kwargs.get("force", False): + force = bool(kwargs.pop("force", False)) + if self._initialized and not force: return True # Load configuration @@ -201,13 +204,35 @@ def initialize( self._config.dsn = dsn if environment: self._config.environment = environment + self._enabled = bool(self._config.enabled) + + if not self._enabled: + return False # Skip if no DSN configured if not self._config.dsn: return False - # Create privacy filter - before_send = create_before_send_filter() + # Always enforce privacy scrubber first; optional custom filter can run afterward. + base_before_send = create_before_send_filter() + custom_before_send = kwargs.pop("before_send", None) + if custom_before_send is not None: + if not callable(custom_before_send): + raise TypeError("before_send must be callable") + warnings.warn( + "Custom before_send runs before OpenAdapt privacy filtering; final payload is always scrubbed.", + stacklevel=2, + ) + before_send = _compose_before_send(base_before_send, custom_before_send) + else: + before_send = base_before_send + + if "send_default_pii" in kwargs: + kwargs.pop("send_default_pii") + warnings.warn( + "Ignoring sentry init override for send_default_pii; OpenAdapt telemetry enforces send_default_pii=False.", + stacklevel=2, + ) # Initialize Sentry SDK sentry_kwargs = { @@ -215,14 +240,13 @@ def initialize( "environment": self._config.environment, "sample_rate": self._config.sample_rate, "traces_sample_rate": self._config.traces_sample_rate, - "send_default_pii": self._config.send_default_pii, + # Enforced for privacy safety across all callers/configs. + "send_default_pii": False, "before_send": before_send, } # Merge in any additional kwargs sentry_kwargs.update(kwargs) - # Remove our internal kwargs - sentry_kwargs.pop("force", None) sentry_sdk.init(**sentry_kwargs) @@ -314,12 +338,13 @@ def set_user( Note: Only sets anonymous user ID. Never set email, name, or other PII. Args: - user_id: Anonymous user identifier. - **kwargs: Additional user properties (id only recommended). + user_id: User identifier to hash before sending. + **kwargs: Ignored. Additional user fields are dropped. """ if not self._enabled or not self._initialized: return - sentry_sdk.set_user({"id": user_id, **kwargs}) + _ = kwargs + sentry_sdk.set_user({"id": anonymize_identifier(user_id)}) def set_tag(self, key: str, value: str) -> None: """Set a custom tag for all subsequent events. diff --git a/src/openadapt_telemetry/config.py b/src/openadapt_telemetry/config.py index 6bbf4c1..8129f79 100644 --- a/src/openadapt_telemetry/config.py +++ b/src/openadapt_telemetry/config.py @@ -11,6 +11,8 @@ import json import os +import secrets +import warnings from dataclasses import dataclass, field from pathlib import Path from typing import Any, Optional @@ -27,11 +29,13 @@ "performance_tracking": True, "feature_usage": True, "send_default_pii": False, + "anon_salt": None, } # Config file location CONFIG_DIR = Path.home() / ".config" / "openadapt" CONFIG_FILE = CONFIG_DIR / "telemetry.json" +_INVALID_ANON_SALT_WARNED = False @dataclass @@ -48,6 +52,7 @@ class TelemetryConfig: performance_tracking: bool = True feature_usage: bool = True send_default_pii: bool = False + anon_salt: Optional[str] = None _loaded: bool = field(default=False, repr=False) @@ -73,7 +78,8 @@ def _load_config_file() -> dict[str, Any]: try: with open(CONFIG_FILE) as f: - return json.load(f) + data = json.load(f) + return data if isinstance(data, dict) else {} except (json.JSONDecodeError, OSError): return {} @@ -82,15 +88,15 @@ def _get_env_config() -> dict[str, Any]: """Get configuration from environment variables.""" config: dict[str, Any] = {} - # Universal opt-out (DO_NOT_TRACK standard) - if os.getenv("DO_NOT_TRACK", "").lower() in ("1", "true"): - config["enabled"] = False - - # Package-specific opt-out + # Package-specific toggle enabled_env = os.getenv("OPENADAPT_TELEMETRY_ENABLED", "") if enabled_env: config["enabled"] = _parse_bool(enabled_env) + # Universal opt-out (DO_NOT_TRACK standard) always wins. + if os.getenv("DO_NOT_TRACK", "").lower() in ("1", "true"): + config["enabled"] = False + # Internal/developer flags if os.getenv("OPENADAPT_INTERNAL", "").lower() in ("true", "1", "yes"): config["internal"] = True @@ -122,9 +128,72 @@ def _get_env_config() -> dict[str, Any]: except ValueError: pass + # Optional override for deterministic anonymization in controlled environments. + anon_salt = os.getenv("OPENADAPT_TELEMETRY_ANON_SALT") + if anon_salt: + if _is_valid_anon_salt(anon_salt): + config["anon_salt"] = anon_salt.strip() + else: + _warn_invalid_anon_salt_once() + return config +def _is_valid_anon_salt(value: Any) -> bool: + """Check whether a salt value is valid for HMAC anonymization.""" + return isinstance(value, str) and len(value.strip()) >= 32 + + +def _warn_invalid_anon_salt_once() -> None: + """Warn once per process when OPENADAPT_TELEMETRY_ANON_SALT is invalid.""" + global _INVALID_ANON_SALT_WARNED + if _INVALID_ANON_SALT_WARNED: + return + warnings.warn( + "Ignoring invalid OPENADAPT_TELEMETRY_ANON_SALT; must be >= 32 chars.", + stacklevel=2, + ) + _INVALID_ANON_SALT_WARNED = True + + +def _generate_anon_salt() -> str: + """Generate a high-entropy random salt.""" + return secrets.token_hex(32) + + +def get_or_create_anon_salt() -> str: + """Get anonymization salt from env/config, creating one if missing. + + Priority: + 1. OPENADAPT_TELEMETRY_ANON_SALT (if valid) + 2. telemetry config file `anon_salt` (if valid) + 3. generated and persisted random salt + """ + env_salt = os.getenv("OPENADAPT_TELEMETRY_ANON_SALT") + if env_salt: + if _is_valid_anon_salt(env_salt): + return env_salt.strip() + _warn_invalid_anon_salt_once() + + config_data = _load_config_file() + file_salt = config_data.get("anon_salt") + if _is_valid_anon_salt(file_salt): + return str(file_salt).strip() + + generated = _generate_anon_salt() + config_data["anon_salt"] = generated + try: + CONFIG_DIR.mkdir(parents=True, exist_ok=True) + with open(CONFIG_FILE, "w") as f: + json.dump(config_data, f, indent=2) + except OSError: + warnings.warn( + "Failed to persist telemetry anonymization salt; using ephemeral salt for this process.", + stacklevel=2, + ) + return generated + + def load_config() -> TelemetryConfig: """Load telemetry configuration from all sources. @@ -148,7 +217,7 @@ def load_config() -> TelemetryConfig: merged.update(env_config) # Remove None values for fields that should use defaults - config_dict = {k: v for k, v in merged.items() if v is not None or k == "dsn"} + config_dict = {k: v for k, v in merged.items() if v is not None or k in {"dsn", "anon_salt"}} return TelemetryConfig(**config_dict, _loaded=True) @@ -172,6 +241,7 @@ def save_config(config: TelemetryConfig) -> None: "performance_tracking": config.performance_tracking, "feature_usage": config.feature_usage, "send_default_pii": config.send_default_pii, + "anon_salt": config.anon_salt, } with open(CONFIG_FILE, "w") as f: diff --git a/src/openadapt_telemetry/privacy.py b/src/openadapt_telemetry/privacy.py index 8117581..027d10b 100644 --- a/src/openadapt_telemetry/privacy.py +++ b/src/openadapt_telemetry/privacy.py @@ -9,9 +9,14 @@ from __future__ import annotations +import hashlib +import hmac import re +from functools import lru_cache from typing import Any, Dict, List, Optional, Set +from .config import get_or_create_anon_salt + # Sensitive field names that should have their values redacted PII_DENYLIST: Set[str] = { # Authentication @@ -94,6 +99,97 @@ re.compile(r"[A-Za-z0-9+/]{40,}={0,2}"), ] +# Whitelisted tags that retain observability value and are safe to keep. +ALLOWED_OBSERVABILITY_KEYS: Set[str] = { + "package", + "package_version", + "python_version", + "os", + "os_version", + "ci", + "internal", +} + +ANON_VERSION = "v2" +TAG_KEY_PATTERN = re.compile(r"^[a-zA-Z0-9._:-]{1,64}$") +MAX_TAGS = 64 +MAX_TAG_VALUE_CHARS = 256 + + +@lru_cache(maxsize=1) +def _get_anon_salt_cached() -> str: + """Memoize anon salt lookup to avoid repeated filesystem reads.""" + return get_or_create_anon_salt() + + +def _is_already_anonymized(value: str, prefix: str = "anon") -> bool: + """Return True only for canonical anonymous ID formats.""" + escaped_prefix = re.escape(prefix) + patterns = ( + rf"^{escaped_prefix}:v2:[0-9a-f]{{16}}$", + rf"^{escaped_prefix}:v1:[0-9a-f]{{16}}$", + rf"^{escaped_prefix}:[0-9a-f]{{16}}$", + rf"^{escaped_prefix}:v2:unknown$", + ) + return any(re.match(pattern, value) for pattern in patterns) + + +def _scrub_top_level_messages(event: Dict[str, Any]) -> None: + """Scrub top-level message fields that may contain free-form text.""" + if isinstance(event.get("message"), str): + event["message"] = scrub_string(event["message"]) + if isinstance(event.get("logentry"), dict): + logentry = event["logentry"] + if isinstance(logentry.get("message"), str): + logentry["message"] = scrub_string(logentry["message"]) + if isinstance(logentry.get("formatted"), str): + logentry["formatted"] = scrub_string(logentry["formatted"]) + + +def _is_safe_tag_key(key: Any) -> bool: + """Return True when a tag key is safe to keep.""" + if not isinstance(key, str): + return False + if not TAG_KEY_PATTERN.match(key): + return False + if is_sensitive_key(key): + return False + return True + + +def _normalize_tag_value(value: Any) -> str: + """Convert a tag value to a scrubbed bounded string.""" + scrubbed = scrub_string(str(value)) + return scrubbed[:MAX_TAG_VALUE_CHARS] + + +def _scrub_tags(tags: Dict[str, Any]) -> Dict[str, Any]: + """Keep safe tags, scrub values, and bound payload size.""" + sanitized: Dict[str, Any] = {} + dropped_count = 0 + + # Preserve explicitly listed observability keys first so they survive capping. + for key in ALLOWED_OBSERVABILITY_KEYS: + if key in tags and _is_safe_tag_key(key): + sanitized[key] = _normalize_tag_value(tags[key]) + + for key, value in tags.items(): + if key in sanitized: + continue + if len(sanitized) >= MAX_TAGS: + dropped_count += 1 + continue + if not _is_safe_tag_key(key): + dropped_count += 1 + continue + sanitized[key] = _normalize_tag_value(value) + + # Keep hard cap strict, including metadata markers. + if dropped_count > 0 and len(sanitized) < MAX_TAGS: + sanitized["_dropped_tag_count"] = str(dropped_count) + + return sanitized + def sanitize_path(path: str) -> str: """Remove username from file paths. @@ -157,6 +253,23 @@ def scrub_string(value: str) -> str: return value +def anonymize_identifier(value: str, prefix: str = "anon") -> str: + """Deterministically anonymize an identifier using versioned HMAC-SHA256.""" + normalized = str(value or "").strip() + if not normalized: + return f"{prefix}:{ANON_VERSION}:unknown" + if _is_already_anonymized(normalized, prefix=prefix): + return normalized + + salt = _get_anon_salt_cached() + digest = hmac.new( + salt.encode("utf-8"), + normalized.encode("utf-8"), + hashlib.sha256, + ).hexdigest()[:16] + return f"{prefix}:{ANON_VERSION}:{digest}" + + def scrub_dict( data: Dict[str, Any], deep: bool = True, @@ -181,6 +294,8 @@ def scrub_dict( result[key] = scrub_dict(value, deep=True, scrub_values=scrub_values) elif isinstance(value, list) and deep: result[key] = scrub_list(value, scrub_values=scrub_values) + elif isinstance(value, tuple) and deep: + result[key] = tuple(scrub_list(list(value), scrub_values=scrub_values)) elif isinstance(value, str): if scrub_values: result[key] = scrub_string(value) @@ -209,6 +324,8 @@ def scrub_list(data: List[Any], scrub_values: bool = False) -> List[Any]: result.append(scrub_dict(item, deep=True, scrub_values=scrub_values)) elif isinstance(item, list): result.append(scrub_list(item, scrub_values=scrub_values)) + elif isinstance(item, tuple): + result.append(tuple(scrub_list(list(item), scrub_values=scrub_values))) elif isinstance(item, str) and scrub_values: result.append(scrub_string(item)) else: @@ -258,43 +375,63 @@ def create_before_send_filter(): def before_send(event: Event, hint: Hint) -> Optional[Event]: """Filter and sanitize events before sending to Sentry/GlitchTip.""" - # Scrub exception data - if "exception" in event: - scrub_exception_data(event["exception"]) - - # Scrub breadcrumbs - if "breadcrumbs" in event and "values" in event["breadcrumbs"]: - for breadcrumb in event["breadcrumbs"]["values"]: - if "message" in breadcrumb and isinstance(breadcrumb["message"], str): - breadcrumb["message"] = scrub_string(breadcrumb["message"]) - if "data" in breadcrumb and isinstance(breadcrumb["data"], dict): - breadcrumb["data"] = scrub_dict( - breadcrumb["data"], deep=True, scrub_values=True - ) - - # Scrub extra data - if "extra" in event and isinstance(event["extra"], dict): - event["extra"] = scrub_dict(event["extra"], deep=True, scrub_values=True) - - # Scrub contexts - if "contexts" in event and isinstance(event["contexts"], dict): - event["contexts"] = scrub_dict(event["contexts"], deep=True, scrub_values=False) - - # Scrub tags - if "tags" in event and isinstance(event["tags"], dict): - event["tags"] = scrub_dict(event["tags"], deep=False, scrub_values=False) - - # Scrub request data (if present) - if "request" in event: - request = event["request"] - if "headers" in request: - request["headers"] = scrub_dict(request["headers"], deep=False, scrub_values=False) - if "data" in request: - if isinstance(request["data"], dict): - request["data"] = scrub_dict(request["data"], deep=True, scrub_values=True) - elif isinstance(request["data"], str): - request["data"] = scrub_string(request["data"]) - - return event + try: + _scrub_top_level_messages(event) + + # Scrub exception data + if "exception" in event: + scrub_exception_data(event["exception"]) + + # Scrub breadcrumbs + if "breadcrumbs" in event and "values" in event["breadcrumbs"]: + for breadcrumb in event["breadcrumbs"]["values"]: + if "message" in breadcrumb and isinstance(breadcrumb["message"], str): + breadcrumb["message"] = scrub_string(breadcrumb["message"]) + if "data" in breadcrumb and isinstance(breadcrumb["data"], dict): + breadcrumb["data"] = scrub_dict( + breadcrumb["data"], deep=True, scrub_values=True + ) + + # Scrub extra data + if "extra" in event and isinstance(event["extra"], dict): + event["extra"] = scrub_dict(event["extra"], deep=True, scrub_values=True) + + # Scrub contexts + if "contexts" in event and isinstance(event["contexts"], dict): + event["contexts"] = scrub_dict(event["contexts"], deep=True, scrub_values=True) + + # Scrub tags (allowlist + scrub values) + if "tags" in event and isinstance(event["tags"], dict): + event["tags"] = _scrub_tags(event["tags"]) + + # Scrub request data (if present) + if "request" in event and isinstance(event["request"], dict): + request = event["request"] + if "headers" in request: + if isinstance(request["headers"], dict): + request["headers"] = scrub_dict( + request["headers"], deep=False, scrub_values=True + ) + elif isinstance(request["headers"], list): + request["headers"] = scrub_list(request["headers"], scrub_values=True) + elif isinstance(request["headers"], str): + request["headers"] = scrub_string(request["headers"]) + if "data" in request: + if isinstance(request["data"], dict): + request["data"] = scrub_dict(request["data"], deep=True, scrub_values=True) + elif isinstance(request["data"], list): + request["data"] = scrub_list(request["data"], scrub_values=True) + elif isinstance(request["data"], str): + request["data"] = scrub_string(request["data"]) + + # Keep only anonymous user IDs (drop all other user attributes). + if "user" in event and isinstance(event["user"], dict): + user_id = event["user"].get("id") + event["user"] = {"id": anonymize_identifier(str(user_id))} if user_id else {} + + return event + except Exception: + # Fail closed on privacy filter errors. + return None return before_send diff --git a/tests/test_client.py b/tests/test_client.py index cebcdca..cc2606f 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -3,6 +3,8 @@ import os from unittest.mock import patch +import pytest + from openadapt_telemetry.client import ( TelemetryClient, get_telemetry, @@ -96,6 +98,39 @@ def test_ci_indicates_internal(self): with patch.dict(os.environ, {"CI": "true"}, clear=False): assert is_internal_user() is True + @patch("openadapt_telemetry.client.Path.exists", return_value=False) + @patch("openadapt_telemetry.client.is_ci_environment", return_value=False) + def test_no_signals_indicates_external(self, _mock_ci, _mock_exists): + """Without explicit internal signals, usage should be treated as external.""" + with patch.dict(os.environ, {"OPENADAPT_INTERNAL": "", "OPENADAPT_DEV": ""}, clear=False): + assert is_internal_user() is False + + @patch("openadapt_telemetry.client.Path.exists", return_value=True) + @patch("openadapt_telemetry.client.is_ci_environment", return_value=False) + def test_git_repo_not_internal_by_default(self, _mock_ci, _mock_exists): + """Git repos should not be internal unless explicitly enabled.""" + with patch.dict( + os.environ, + { + "OPENADAPT_INTERNAL": "", + "OPENADAPT_DEV": "", + "OPENADAPT_INTERNAL_FROM_GIT": "", + }, + clear=False, + ): + assert is_internal_user() is False + + @patch("openadapt_telemetry.client.Path.exists", return_value=True) + @patch("openadapt_telemetry.client.is_ci_environment", return_value=False) + def test_git_repo_internal_when_opted_in(self, _mock_ci, _mock_exists): + """Git repos should be internal when OPENADAPT_INTERNAL_FROM_GIT is enabled.""" + with patch.dict( + os.environ, + {"OPENADAPT_INTERNAL_FROM_GIT": "true", "OPENADAPT_INTERNAL": "", "OPENADAPT_DEV": ""}, + clear=False, + ): + assert is_internal_user() is True + class TestTelemetryClient: """Tests for TelemetryClient class.""" @@ -181,6 +216,8 @@ def test_initialize_with_dsn(self, mock_sentry): assert result is True assert client.initialized is True mock_sentry.init.assert_called_once() + init_kwargs = mock_sentry.init.call_args.kwargs + assert init_kwargs["send_default_pii"] is False @patch("openadapt_telemetry.client.sentry_sdk") def test_capture_exception_when_enabled(self, mock_sentry): @@ -231,6 +268,74 @@ def test_set_tag(self, mock_sentry): mock_sentry.set_tag.assert_called_with("test_key", "test_value") + @patch("openadapt_telemetry.client.sentry_sdk") + def test_set_user_hashes_identifier_and_drops_extra_fields(self, mock_sentry): + """set_user should only send an anonymized ID.""" + with patch.dict(os.environ, {"DO_NOT_TRACK": ""}, clear=False): + TelemetryClient.reset_instance() + client = TelemetryClient.get_instance() + client.initialize(dsn="https://test@example.com/1") + + client.set_user("user@example.com", email="user@example.com", name="User") + + mock_sentry.set_user.assert_called_once() + payload = mock_sentry.set_user.call_args.args[0] + assert set(payload.keys()) == {"id"} + assert payload["id"].startswith("anon:v2:") + assert payload["id"] != "user@example.com" + + @patch("openadapt_telemetry.client.sentry_sdk") + def test_custom_before_send_is_composed_after_privacy_filter(self, mock_sentry): + """Custom before_send should run before built-in scrubbing.""" + + def custom_before_send(event, hint): + event.setdefault("logentry", {})["message"] = "secret user@example.com" + event.setdefault("tags", {})["package"] = "openadapt-evals" + return event + + with patch.dict(os.environ, {"DO_NOT_TRACK": ""}, clear=False): + TelemetryClient.reset_instance() + client = TelemetryClient.get_instance() + with pytest.warns( + UserWarning, + match="Custom before_send runs before OpenAdapt privacy filtering", + ): + client.initialize( + dsn="https://test@example.com/1", + before_send=custom_before_send, + ) + + before_send = mock_sentry.init.call_args.kwargs["before_send"] + event = {"user": {"id": "user@example.com", "email": "user@example.com"}} + output = before_send(event, hint={}) + assert output is not None + assert output["user"]["id"].startswith("anon:v2:") + assert "email" not in output["user"] + assert output["logentry"]["message"] == "secret [REDACTED]" + + @patch("openadapt_telemetry.client.sentry_sdk") + def test_send_default_pii_override_is_ignored(self, mock_sentry): + """send_default_pii should always be enforced to False.""" + with patch.dict(os.environ, {"DO_NOT_TRACK": ""}, clear=False): + TelemetryClient.reset_instance() + client = TelemetryClient.get_instance() + with pytest.warns(UserWarning, match="Ignoring sentry init override for send_default_pii"): + client.initialize( + dsn="https://test@example.com/1", + send_default_pii=True, + ) + + assert mock_sentry.init.call_args.kwargs["send_default_pii"] is False + + def test_initialize_rejects_non_callable_before_send(self): + """Non-callable before_send should raise TypeError.""" + with patch.dict(os.environ, {"DO_NOT_TRACK": ""}, clear=False): + TelemetryClient.reset_instance() + client = TelemetryClient.get_instance() + with patch("openadapt_telemetry.client.sentry_sdk"): + with pytest.raises(TypeError, match="before_send must be callable"): + client.initialize(dsn="https://test@example.com/1", before_send="invalid") + @patch("openadapt_telemetry.client.sentry_sdk") def test_add_breadcrumb(self, mock_sentry): """add_breadcrumb should call sentry when enabled.""" diff --git a/tests/test_config.py b/tests/test_config.py index 4809daa..9be445f 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -3,6 +3,7 @@ import json import os import tempfile +import warnings from pathlib import Path from unittest.mock import patch @@ -10,9 +11,12 @@ from openadapt_telemetry.config import ( TelemetryConfig, + _generate_anon_salt, _get_env_config, + _is_valid_anon_salt, _load_config_file, _parse_bool, + get_or_create_anon_salt, load_config, save_config, ) @@ -110,6 +114,16 @@ def test_explicit_disable(self): config = _get_env_config() assert config.get("enabled") is False + def test_do_not_track_overrides_explicit_enable(self): + """DO_NOT_TRACK should disable telemetry even if explicitly enabled.""" + with patch.dict( + os.environ, + {"DO_NOT_TRACK": "1", "OPENADAPT_TELEMETRY_ENABLED": "true"}, + clear=False, + ): + config = _get_env_config() + assert config.get("enabled") is False + def test_internal_flag(self): """OPENADAPT_INTERNAL should set internal flag.""" with patch.dict(os.environ, {"OPENADAPT_INTERNAL": "true"}, clear=False): @@ -147,6 +161,19 @@ def test_invalid_sample_rate_ignored(self): config = _get_env_config() assert "sample_rate" not in config + def test_anon_salt_env_override(self): + """OPENADAPT_TELEMETRY_ANON_SALT should be loaded when set.""" + with patch.dict(os.environ, {"OPENADAPT_TELEMETRY_ANON_SALT": "x" * 32}, clear=False): + config = _get_env_config() + assert config.get("anon_salt") == "x" * 32 + + def test_invalid_anon_salt_env_ignored(self): + """Invalid OPENADAPT_TELEMETRY_ANON_SALT should be ignored in env config.""" + with patch.dict(os.environ, {"OPENADAPT_TELEMETRY_ANON_SALT": "short"}, clear=False): + with pytest.warns(UserWarning, match="Ignoring invalid OPENADAPT_TELEMETRY_ANON_SALT"): + config = _get_env_config() + assert "anon_salt" not in config + class TestConfigFile: """Tests for configuration file loading.""" @@ -183,6 +210,18 @@ def test_load_invalid_json(self): os.unlink(f.name) + def test_load_non_dict_json_returns_empty(self): + """Loading valid JSON that is not an object should return empty dict.""" + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + json.dump(["not", "a", "dict"], f) + f.flush() + + with patch("openadapt_telemetry.config.CONFIG_FILE", Path(f.name)): + config = _load_config_file() + assert config == {} + + os.unlink(f.name) + class TestLoadConfig: """Tests for full configuration loading.""" @@ -239,3 +278,61 @@ def test_save_and_load(self): assert loaded.enabled is False assert loaded.environment == "test" assert loaded.sample_rate == 0.5 + + +class TestAnonSalt: + """Tests for per-install anonymization salt handling.""" + + def test_validate_anon_salt(self): + assert _is_valid_anon_salt("x" * 32) is True + assert _is_valid_anon_salt("short") is False + assert _is_valid_anon_salt(None) is False + + def test_generate_anon_salt(self): + salt = _generate_anon_salt() + assert isinstance(salt, str) + assert len(salt) >= 32 + + def test_get_or_create_anon_salt_from_env(self): + with patch.dict(os.environ, {"OPENADAPT_TELEMETRY_ANON_SALT": "y" * 32}, clear=False): + assert get_or_create_anon_salt() == "y" * 32 + + def test_get_or_create_anon_salt_persists_when_missing(self): + with tempfile.TemporaryDirectory() as tmpdir: + config_dir = Path(tmpdir) / "openadapt" + config_file = config_dir / "telemetry.json" + with patch("openadapt_telemetry.config.CONFIG_DIR", config_dir): + with patch("openadapt_telemetry.config.CONFIG_FILE", config_file): + with patch.dict(os.environ, {"OPENADAPT_TELEMETRY_ANON_SALT": ""}, clear=False): + first = get_or_create_anon_salt() + second = get_or_create_anon_salt() + assert first == second + with open(config_file) as f: + data = json.load(f) + assert data["anon_salt"] == first + + def test_get_or_create_anon_salt_recovers_from_corrupt_file(self): + with tempfile.TemporaryDirectory() as tmpdir: + config_dir = Path(tmpdir) / "openadapt" + config_dir.mkdir(parents=True, exist_ok=True) + config_file = config_dir / "telemetry.json" + config_file.write_text("{not-valid-json") + with patch("openadapt_telemetry.config.CONFIG_DIR", config_dir): + with patch("openadapt_telemetry.config.CONFIG_FILE", config_file): + salt = get_or_create_anon_salt() + assert isinstance(salt, str) + assert len(salt) >= 32 + + def test_invalid_anon_salt_warns_once_across_config_paths(self): + with patch("openadapt_telemetry.config._INVALID_ANON_SALT_WARNED", False): + with patch.dict(os.environ, {"OPENADAPT_TELEMETRY_ANON_SALT": "short"}, clear=False): + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + _get_env_config() + get_or_create_anon_salt() + matches = [ + warning + for warning in caught + if "Ignoring invalid OPENADAPT_TELEMETRY_ANON_SALT" in str(warning.message) + ] + assert len(matches) == 1 diff --git a/tests/test_privacy.py b/tests/test_privacy.py index af37ae0..67f4f31 100644 --- a/tests/test_privacy.py +++ b/tests/test_privacy.py @@ -1,7 +1,12 @@ """Tests for privacy filtering and PII scrubbing.""" +from unittest.mock import patch from openadapt_telemetry.privacy import ( + ALLOWED_OBSERVABILITY_KEYS, + MAX_TAGS, + anonymize_identifier, + create_before_send_filter, is_sensitive_key, sanitize_path, scrub_dict, @@ -273,3 +278,143 @@ def test_local_variables_scrubbing(self): frame = exception_data["values"][0]["stacktrace"]["frames"][0] assert frame["vars"]["password"] == "[REDACTED]" assert frame["vars"]["username"] == "john" + + +class TestAnonymizeIdentifier: + """Tests for deterministic identifier anonymization.""" + + def test_stable_hash_with_same_salt(self): + with patch("openadapt_telemetry.privacy._get_anon_salt_cached", return_value="a" * 32): + first = anonymize_identifier("user@example.com") + second = anonymize_identifier("user@example.com") + assert first == second + assert first.startswith("anon:v2:") + assert first != "user@example.com" + + def test_different_salts_produce_different_ids(self): + with patch("openadapt_telemetry.privacy._get_anon_salt_cached", return_value="a" * 32): + first = anonymize_identifier("user@example.com") + with patch("openadapt_telemetry.privacy._get_anon_salt_cached", return_value="b" * 32): + second = anonymize_identifier("user@example.com") + assert first != second + + def test_empty_value(self): + assert anonymize_identifier("") == "anon:v2:unknown" + + def test_already_anonymized_id_is_not_rehashed(self): + with patch("openadapt_telemetry.privacy._get_anon_salt_cached", return_value="a" * 32): + anon_id = anonymize_identifier("user@example.com") + assert anonymize_identifier(anon_id) == anon_id + assert anonymize_identifier("anon:v1:1234567890abcdef") == "anon:v1:1234567890abcdef" + assert anonymize_identifier("anon:1234567890abcdef") == "anon:1234567890abcdef" + + def test_non_canonical_anon_prefix_is_rehashed(self): + with patch("openadapt_telemetry.privacy._get_anon_salt_cached", return_value="a" * 32): + value = anonymize_identifier("anon:user@example.com") + assert value.startswith("anon:v2:") + assert value != "anon:user@example.com" + + +class TestBeforeSendUserScrubbing: + """Tests for user context anonymization in before_send.""" + + def test_only_anonymous_user_id_remains(self): + before_send = create_before_send_filter() + event = { + "user": { + "id": "user@example.com", + "email": "user@example.com", + "username": "alice", + }, + } + sanitized = before_send(event, hint={}) + assert sanitized is not None + assert "user" in sanitized + assert set(sanitized["user"].keys()) == {"id"} + assert sanitized["user"]["id"].startswith("anon:v2:") + assert sanitized["user"]["id"] != "user@example.com" + + +class TestBeforeSendEventScrubbing: + """Tests for event-level before_send filtering behavior.""" + + def test_top_level_message_is_scrubbed(self): + before_send = create_before_send_filter() + event = {"message": "Failure for user@example.com"} + sanitized = before_send(event, hint={}) + assert sanitized is not None + assert "user@example.com" not in sanitized["message"] + + def test_logentry_message_is_scrubbed(self): + before_send = create_before_send_filter() + event = {"logentry": {"message": "token=Bearer abc123", "formatted": "email user@example.com"}} + sanitized = before_send(event, hint={}) + assert sanitized is not None + assert "abc123" not in sanitized["logentry"]["message"] + assert "user@example.com" not in sanitized["logentry"]["formatted"] + + def test_context_values_are_scrubbed(self): + before_send = create_before_send_filter() + event = {"contexts": {"runtime": {"note": "contact me at user@example.com"}}} + sanitized = before_send(event, hint={}) + assert sanitized is not None + assert sanitized["contexts"]["runtime"]["note"] == "contact me at [REDACTED]" + + def test_tags_are_allowlisted_and_scrubbed(self): + before_send = create_before_send_filter() + event = { + "tags": { + "package": "openadapt-evals", + "internal": "false", + "task_id": "abc123", + "email": "user@example.com", + "not valid": "drop-me", + }, + } + sanitized = before_send(event, hint={}) + assert sanitized is not None + assert set(ALLOWED_OBSERVABILITY_KEYS).intersection(set(sanitized["tags"].keys())) + assert sanitized["tags"]["task_id"] == "abc123" + assert "email" not in sanitized["tags"] + assert "not valid" not in sanitized["tags"] + assert sanitized["tags"]["_dropped_tag_count"] == "2" + + def test_filter_fails_closed_on_unexpected_error(self): + before_send = create_before_send_filter() + with patch("openadapt_telemetry.privacy.scrub_exception_data", side_effect=RuntimeError("boom")): + event = {"exception": {"values": []}} + assert before_send(event, hint={}) is None + + def test_request_non_dict_does_not_drop_event(self): + before_send = create_before_send_filter() + event = {"request": "unexpected-request-shape", "message": "ok"} + sanitized = before_send(event, hint={}) + assert sanitized is not None + assert sanitized["request"] == "unexpected-request-shape" + + def test_request_headers_tuple_list_is_scrubbed(self): + before_send = create_before_send_filter() + event = { + "request": { + "headers": [ + ("Authorization", "Bearer abc123"), + ("X-User", "user@example.com"), + ], + }, + } + sanitized = before_send(event, hint={}) + assert sanitized is not None + assert sanitized["request"]["headers"][0][1] == "[REDACTED]" + assert sanitized["request"]["headers"][1][1] == "[REDACTED]" + + def test_tag_cap_is_hard_limited(self): + before_send = create_before_send_filter() + many_tags = {f"k{i}": f"v{i}" for i in range(MAX_TAGS + 20)} + many_tags.update({"package": "openadapt-evals", "internal": "false"}) + event = {"tags": many_tags} + sanitized = before_send(event, hint={}) + assert sanitized is not None + tags = sanitized["tags"] + assert "package" in tags + assert "internal" in tags + assert len(tags) <= MAX_TAGS