From afb480b9f71dc488c875da87bdab54b99451f109 Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Sat, 17 Jan 2026 00:54:14 -0500 Subject: [PATCH 01/11] fix: add README badges for license and Python version Add standard badges for license and Python version. PyPI badges are commented out until the package is published. Co-Authored-By: Claude Sonnet 4.5 --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index 5781a7d..dcb4774 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,13 @@ # openadapt-telemetry +[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) +[![Python 3.10+](https://img.shields.io/badge/python-3.10%2B-blue)](https://www.python.org/downloads/) + + + Unified telemetry and error tracking for OpenAdapt packages. ## Features From 278e091b480190b4201e5faed3bca13f6faf11cd Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Fri, 6 Mar 2026 15:38:27 -0500 Subject: [PATCH 02/11] feat: make telemetry opt-out and enforce anonymized user IDs --- src/openadapt_telemetry/client.py | 32 +++++++++++--------------- src/openadapt_telemetry/config.py | 1 - src/openadapt_telemetry/privacy.py | 16 ++++++++++++- tests/test_client.py | 21 +++++++++++++---- tests/test_privacy.py | 37 +++++++++++++++++++++++++++++- 5 files changed, 81 insertions(+), 26 deletions(-) diff --git a/src/openadapt_telemetry/client.py b/src/openadapt_telemetry/client.py index 2db0c43..0d526d1 100644 --- a/src/openadapt_telemetry/client.py +++ b/src/openadapt_telemetry/client.py @@ -10,13 +10,12 @@ import platform import sys from pathlib import Path -from typing import Any, Callable, Dict, Optional +from typing import Any, Dict, Optional import sentry_sdk -from sentry_sdk.types import Event, Hint from .config import TelemetryConfig, load_config -from .privacy import create_before_send_filter, sanitize_path +from .privacy import anonymize_identifier, create_before_send_filter def is_running_from_executable() -> bool: @@ -129,20 +128,13 @@ def reset_instance(cls) -> None: def _check_enabled(self) -> bool: """Check if telemetry should be enabled. - Checks environment variables for opt-out signals. + Uses merged config with defaults/env/file precedence. Returns: True if telemetry should be enabled. """ - # Universal opt-out (DO_NOT_TRACK standard) - if os.getenv("DO_NOT_TRACK", "").lower() in ("1", "true"): - return False - - # Package-specific opt-out - if os.getenv("OPENADAPT_TELEMETRY_ENABLED", "").lower() in ("false", "0", "no"): - return False - - return True + self._config = load_config() + return bool(self._config.enabled) @property def enabled(self) -> bool: @@ -188,9 +180,6 @@ def initialize( Returns: True if initialization succeeded, False if disabled or already initialized. """ - if not self._enabled: - return False - if self._initialized and not kwargs.get("force", False): return True @@ -202,6 +191,10 @@ def initialize( self._config.dsn = dsn if environment: self._config.environment = environment + self._enabled = bool(self._config.enabled) + + if not self._enabled: + return False # Skip if no DSN configured if not self._config.dsn: @@ -315,12 +308,13 @@ def set_user( Note: Only sets anonymous user ID. Never set email, name, or other PII. Args: - user_id: Anonymous user identifier. - **kwargs: Additional user properties (id only recommended). + user_id: User identifier to hash before sending. + **kwargs: Ignored. Additional user fields are dropped. """ if not self._enabled or not self._initialized: return - sentry_sdk.set_user({"id": user_id, **kwargs}) + _ = kwargs + sentry_sdk.set_user({"id": anonymize_identifier(user_id)}) def set_tag(self, key: str, value: str) -> None: """Set a custom tag for all subsequent events. diff --git a/src/openadapt_telemetry/config.py b/src/openadapt_telemetry/config.py index 39ac6f8..6bbf4c1 100644 --- a/src/openadapt_telemetry/config.py +++ b/src/openadapt_telemetry/config.py @@ -15,7 +15,6 @@ from pathlib import Path from typing import Any, Optional - # Default configuration values DEFAULTS = { "enabled": True, diff --git a/src/openadapt_telemetry/privacy.py b/src/openadapt_telemetry/privacy.py index 3e62c18..ba0d277 100644 --- a/src/openadapt_telemetry/privacy.py +++ b/src/openadapt_telemetry/privacy.py @@ -9,10 +9,10 @@ from __future__ import annotations +import hashlib import re from typing import Any, Dict, List, Optional, Set - # Sensitive field names that should have their values redacted PII_DENYLIST: Set[str] = { # Authentication @@ -158,6 +158,15 @@ def scrub_string(value: str) -> str: return value +def anonymize_identifier(value: str, prefix: str = "anon") -> str: + """Deterministically anonymize an identifier using SHA-256.""" + normalized = str(value or "").strip() + if not normalized: + return f"{prefix}:unknown" + digest = hashlib.sha256(normalized.encode("utf-8")).hexdigest()[:16] + return f"{prefix}:{digest}" + + def scrub_dict( data: Dict[str, Any], deep: bool = True, @@ -296,6 +305,11 @@ def before_send(event: Event, hint: Hint) -> Optional[Event]: elif isinstance(request["data"], str): request["data"] = scrub_string(request["data"]) + # Keep only anonymous user IDs (drop all other user attributes). + if "user" in event and isinstance(event["user"], dict): + user_id = event["user"].get("id") + event["user"] = {"id": anonymize_identifier(str(user_id))} if user_id else {} + return event return before_send diff --git a/tests/test_client.py b/tests/test_client.py index c74ece7..d34396d 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -1,10 +1,7 @@ """Tests for telemetry client.""" import os -from pathlib import Path -from unittest.mock import MagicMock, patch - -import pytest +from unittest.mock import patch from openadapt_telemetry.client import ( TelemetryClient, @@ -234,6 +231,22 @@ def test_set_tag(self, mock_sentry): mock_sentry.set_tag.assert_called_with("test_key", "test_value") + @patch("openadapt_telemetry.client.sentry_sdk") + def test_set_user_hashes_identifier_and_drops_extra_fields(self, mock_sentry): + """set_user should only send an anonymized ID.""" + with patch.dict(os.environ, {"DO_NOT_TRACK": ""}, clear=False): + TelemetryClient.reset_instance() + client = TelemetryClient.get_instance() + client.initialize(dsn="https://test@example.com/1") + + client.set_user("user@example.com", email="user@example.com", name="User") + + mock_sentry.set_user.assert_called_once() + payload = mock_sentry.set_user.call_args.args[0] + assert set(payload.keys()) == {"id"} + assert payload["id"].startswith("anon:") + assert payload["id"] != "user@example.com" + @patch("openadapt_telemetry.client.sentry_sdk") def test_add_breadcrumb(self, mock_sentry): """add_breadcrumb should call sentry when enabled.""" diff --git a/tests/test_privacy.py b/tests/test_privacy.py index e33dbbc..b0158b7 100644 --- a/tests/test_privacy.py +++ b/tests/test_privacy.py @@ -1,8 +1,9 @@ """Tests for privacy filtering and PII scrubbing.""" -import pytest from openadapt_telemetry.privacy import ( + anonymize_identifier, + create_before_send_filter, is_sensitive_key, sanitize_path, scrub_dict, @@ -274,3 +275,37 @@ def test_local_variables_scrubbing(self): frame = exception_data["values"][0]["stacktrace"]["frames"][0] assert frame["vars"]["password"] == "[REDACTED]" assert frame["vars"]["username"] == "john" + + +class TestAnonymizeIdentifier: + """Tests for deterministic identifier anonymization.""" + + def test_stable_hash(self): + first = anonymize_identifier("user@example.com") + second = anonymize_identifier("user@example.com") + assert first == second + assert first.startswith("anon:") + assert first != "user@example.com" + + def test_empty_value(self): + assert anonymize_identifier("") == "anon:unknown" + + +class TestBeforeSendUserScrubbing: + """Tests for user context anonymization in before_send.""" + + def test_only_anonymous_user_id_remains(self): + before_send = create_before_send_filter() + event = { + "user": { + "id": "user@example.com", + "email": "user@example.com", + "username": "alice", + }, + } + sanitized = before_send(event, hint={}) + assert sanitized is not None + assert "user" in sanitized + assert set(sanitized["user"].keys()) == {"id"} + assert sanitized["user"]["id"].startswith("anon:") + assert sanitized["user"]["id"] != "user@example.com" From 6dce201e2f1cd2be05606ef1fc71d22c1cead16c Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Fri, 6 Mar 2026 15:48:16 -0500 Subject: [PATCH 03/11] fix: enforce do-not-track precedence and avoid rehashing anon IDs --- src/openadapt_telemetry/config.py | 10 +++++----- src/openadapt_telemetry/privacy.py | 2 ++ tests/test_config.py | 11 ++++++++++- tests/test_privacy.py | 4 ++++ 4 files changed, 21 insertions(+), 6 deletions(-) diff --git a/src/openadapt_telemetry/config.py b/src/openadapt_telemetry/config.py index 6bbf4c1..a7c047a 100644 --- a/src/openadapt_telemetry/config.py +++ b/src/openadapt_telemetry/config.py @@ -82,15 +82,15 @@ def _get_env_config() -> dict[str, Any]: """Get configuration from environment variables.""" config: dict[str, Any] = {} - # Universal opt-out (DO_NOT_TRACK standard) - if os.getenv("DO_NOT_TRACK", "").lower() in ("1", "true"): - config["enabled"] = False - - # Package-specific opt-out + # Package-specific toggle enabled_env = os.getenv("OPENADAPT_TELEMETRY_ENABLED", "") if enabled_env: config["enabled"] = _parse_bool(enabled_env) + # Universal opt-out (DO_NOT_TRACK standard) always wins. + if os.getenv("DO_NOT_TRACK", "").lower() in ("1", "true"): + config["enabled"] = False + # Internal/developer flags if os.getenv("OPENADAPT_INTERNAL", "").lower() in ("true", "1", "yes"): config["internal"] = True diff --git a/src/openadapt_telemetry/privacy.py b/src/openadapt_telemetry/privacy.py index ba0d277..48a1f0f 100644 --- a/src/openadapt_telemetry/privacy.py +++ b/src/openadapt_telemetry/privacy.py @@ -163,6 +163,8 @@ def anonymize_identifier(value: str, prefix: str = "anon") -> str: normalized = str(value or "").strip() if not normalized: return f"{prefix}:unknown" + if normalized.startswith(f"{prefix}:"): + return normalized digest = hashlib.sha256(normalized.encode("utf-8")).hexdigest()[:16] return f"{prefix}:{digest}" diff --git a/tests/test_config.py b/tests/test_config.py index a196895..0d8f9d0 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -9,7 +9,6 @@ import pytest from openadapt_telemetry.config import ( - CONFIG_FILE, TelemetryConfig, _get_env_config, _load_config_file, @@ -111,6 +110,16 @@ def test_explicit_disable(self): config = _get_env_config() assert config.get("enabled") is False + def test_do_not_track_overrides_explicit_enable(self): + """DO_NOT_TRACK should disable telemetry even if explicitly enabled.""" + with patch.dict( + os.environ, + {"DO_NOT_TRACK": "1", "OPENADAPT_TELEMETRY_ENABLED": "true"}, + clear=False, + ): + config = _get_env_config() + assert config.get("enabled") is False + def test_internal_flag(self): """OPENADAPT_INTERNAL should set internal flag.""" with patch.dict(os.environ, {"OPENADAPT_INTERNAL": "true"}, clear=False): diff --git a/tests/test_privacy.py b/tests/test_privacy.py index b0158b7..817350e 100644 --- a/tests/test_privacy.py +++ b/tests/test_privacy.py @@ -290,6 +290,10 @@ def test_stable_hash(self): def test_empty_value(self): assert anonymize_identifier("") == "anon:unknown" + def test_already_anonymized_id_is_not_rehashed(self): + anon_id = anonymize_identifier("user@example.com") + assert anonymize_identifier(anon_id) == anon_id + class TestBeforeSendUserScrubbing: """Tests for user context anonymization in before_send.""" From 7d64d77423a970c12be06a79ec157a2ab2e6e145 Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Fri, 6 Mar 2026 15:48:48 -0500 Subject: [PATCH 04/11] fix: avoid misclassifying non-binary users as internal --- src/openadapt_telemetry/client.py | 13 ++++--------- tests/test_client.py | 7 +++++++ 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/openadapt_telemetry/client.py b/src/openadapt_telemetry/client.py index 0d526d1..b0630a7 100644 --- a/src/openadapt_telemetry/client.py +++ b/src/openadapt_telemetry/client.py @@ -59,9 +59,8 @@ def is_internal_user() -> bool: Uses multiple heuristics to detect internal/developer usage: 1. Explicit OPENADAPT_INTERNAL environment variable 2. OPENADAPT_DEV environment variable - 3. Not running from frozen executable - 4. Git repository present in current directory - 5. CI environment detected + 3. Git repository present in current directory + 4. CI environment detected Returns: True if this appears to be internal usage. @@ -74,15 +73,11 @@ def is_internal_user() -> bool: if os.getenv("OPENADAPT_DEV", "").lower() in ("true", "1", "yes"): return True - # Method 3: Not running from executable (indicates dev mode) - if not is_running_from_executable(): - return True - - # Method 4: Git repository present (development checkout) + # Method 3: Git repository present (development checkout) if Path(".git").exists() or Path("../.git").exists(): return True - # Method 5: CI/CD environment + # Method 4: CI/CD environment if is_ci_environment(): return True diff --git a/tests/test_client.py b/tests/test_client.py index d34396d..663f283 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -96,6 +96,13 @@ def test_ci_indicates_internal(self): with patch.dict(os.environ, {"CI": "true"}, clear=False): assert is_internal_user() is True + @patch("openadapt_telemetry.client.Path.exists", return_value=False) + @patch("openadapt_telemetry.client.is_ci_environment", return_value=False) + def test_no_signals_indicates_external(self, _mock_ci, _mock_exists): + """Without explicit internal signals, usage should be treated as external.""" + with patch.dict(os.environ, {"OPENADAPT_INTERNAL": "", "OPENADAPT_DEV": ""}, clear=False): + assert is_internal_user() is False + class TestTelemetryClient: """Tests for TelemetryClient class.""" From a8f2a57e1f0287154d98c1cd4c2d7cd00d55e4ac Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Fri, 6 Mar 2026 15:56:08 -0500 Subject: [PATCH 05/11] fix: enforce privacy filter precedence for sentry init overrides --- README.md | 7 +++-- src/openadapt_telemetry/client.py | 48 +++++++++++++++++++++++----- tests/test_client.py | 52 +++++++++++++++++++++++++++++++ 3 files changed, 97 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index dcb4774..061e94e 100644 --- a/README.md +++ b/README.md @@ -160,6 +160,8 @@ export OPENADAPT_TELEMETRY_ENABLED=false - File paths have usernames replaced with `` - Sensitive fields (password, token, api_key, etc.) are redacted - Email addresses and phone numbers are scrubbed from messages +- User IDs are hashed before upload (`anon:`) +- `send_default_pii` is enforced to `false` by the client ## Internal Usage Tagging @@ -167,9 +169,8 @@ Internal/developer usage is automatically detected via: 1. `OPENADAPT_INTERNAL=true` environment variable 2. `OPENADAPT_DEV=true` environment variable -3. Running from source (not frozen executable) -4. Git repository present in working directory -5. CI environment detected (GitHub Actions, GitLab CI, etc.) +3. Git repository present in working directory +4. CI environment detected (GitHub Actions, GitLab CI, etc.) Filter in GlitchTip: ``` diff --git a/src/openadapt_telemetry/client.py b/src/openadapt_telemetry/client.py index b0630a7..d3e7aa7 100644 --- a/src/openadapt_telemetry/client.py +++ b/src/openadapt_telemetry/client.py @@ -9,14 +9,18 @@ import os import platform import sys +import warnings from pathlib import Path -from typing import Any, Dict, Optional +from typing import Any, Callable, Dict, Optional import sentry_sdk +from sentry_sdk.types import Event, Hint from .config import TelemetryConfig, load_config from .privacy import anonymize_identifier, create_before_send_filter +BeforeSendFn = Callable[[Event, Hint], Optional[Event]] + def is_running_from_executable() -> bool: """Check if running from a frozen/bundled executable. @@ -84,6 +88,18 @@ def is_internal_user() -> bool: return False +def _compose_before_send(base: BeforeSendFn, extra: BeforeSendFn) -> BeforeSendFn: + """Compose custom before_send after privacy filtering.""" + + def composed(event: Event, hint: Hint) -> Optional[Event]: + sanitized = base(event, hint) + if sanitized is None: + return None + return extra(sanitized, hint) + + return composed + + class TelemetryClient: """Unified telemetry client for all OpenAdapt packages. @@ -175,7 +191,8 @@ def initialize( Returns: True if initialization succeeded, False if disabled or already initialized. """ - if self._initialized and not kwargs.get("force", False): + force = bool(kwargs.pop("force", False)) + if self._initialized and not force: return True # Load configuration @@ -195,8 +212,26 @@ def initialize( if not self._config.dsn: return False - # Create privacy filter - before_send = create_before_send_filter() + # Always enforce privacy scrubber first; optional custom filter can run afterward. + base_before_send = create_before_send_filter() + custom_before_send = kwargs.pop("before_send", None) + if custom_before_send is not None: + if not callable(custom_before_send): + raise TypeError("before_send must be callable") + warnings.warn( + "Custom before_send is composed after OpenAdapt privacy filtering and cannot bypass scrubbing.", + stacklevel=2, + ) + before_send = _compose_before_send(base_before_send, custom_before_send) + else: + before_send = base_before_send + + if "send_default_pii" in kwargs: + kwargs.pop("send_default_pii") + warnings.warn( + "Ignoring sentry init override for send_default_pii; OpenAdapt telemetry enforces send_default_pii=False.", + stacklevel=2, + ) # Initialize Sentry SDK sentry_kwargs = { @@ -204,14 +239,13 @@ def initialize( "environment": self._config.environment, "sample_rate": self._config.sample_rate, "traces_sample_rate": self._config.traces_sample_rate, - "send_default_pii": self._config.send_default_pii, + # Enforced for privacy safety across all callers/configs. + "send_default_pii": False, "before_send": before_send, } # Merge in any additional kwargs sentry_kwargs.update(kwargs) - # Remove our internal kwargs - sentry_kwargs.pop("force", None) sentry_sdk.init(**sentry_kwargs) diff --git a/tests/test_client.py b/tests/test_client.py index 663f283..53386e9 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -3,6 +3,8 @@ import os from unittest.mock import patch +import pytest + from openadapt_telemetry.client import ( TelemetryClient, get_telemetry, @@ -188,6 +190,8 @@ def test_initialize_with_dsn(self, mock_sentry): assert result is True assert client.initialized is True mock_sentry.init.assert_called_once() + init_kwargs = mock_sentry.init.call_args.kwargs + assert init_kwargs["send_default_pii"] is False @patch("openadapt_telemetry.client.sentry_sdk") def test_capture_exception_when_enabled(self, mock_sentry): @@ -254,6 +258,54 @@ def test_set_user_hashes_identifier_and_drops_extra_fields(self, mock_sentry): assert payload["id"].startswith("anon:") assert payload["id"] != "user@example.com" + @patch("openadapt_telemetry.client.sentry_sdk") + def test_custom_before_send_is_composed_after_privacy_filter(self, mock_sentry): + """Custom before_send should run after built-in scrubbing.""" + + def custom_before_send(event, hint): + event.setdefault("tags", {})["custom"] = "true" + return event + + with patch.dict(os.environ, {"DO_NOT_TRACK": ""}, clear=False): + TelemetryClient.reset_instance() + client = TelemetryClient.get_instance() + with pytest.warns(UserWarning, match="Custom before_send is composed after OpenAdapt privacy filtering"): + client.initialize( + dsn="https://test@example.com/1", + before_send=custom_before_send, + ) + + before_send = mock_sentry.init.call_args.kwargs["before_send"] + event = {"user": {"id": "user@example.com", "email": "user@example.com"}} + output = before_send(event, hint={}) + assert output is not None + assert output["user"]["id"].startswith("anon:") + assert "email" not in output["user"] + assert output["tags"]["custom"] == "true" + + @patch("openadapt_telemetry.client.sentry_sdk") + def test_send_default_pii_override_is_ignored(self, mock_sentry): + """send_default_pii should always be enforced to False.""" + with patch.dict(os.environ, {"DO_NOT_TRACK": ""}, clear=False): + TelemetryClient.reset_instance() + client = TelemetryClient.get_instance() + with pytest.warns(UserWarning, match="Ignoring sentry init override for send_default_pii"): + client.initialize( + dsn="https://test@example.com/1", + send_default_pii=True, + ) + + assert mock_sentry.init.call_args.kwargs["send_default_pii"] is False + + def test_initialize_rejects_non_callable_before_send(self): + """Non-callable before_send should raise TypeError.""" + with patch.dict(os.environ, {"DO_NOT_TRACK": ""}, clear=False): + TelemetryClient.reset_instance() + client = TelemetryClient.get_instance() + with patch("openadapt_telemetry.client.sentry_sdk"): + with pytest.raises(TypeError, match="before_send must be callable"): + client.initialize(dsn="https://test@example.com/1", before_send="invalid") + @patch("openadapt_telemetry.client.sentry_sdk") def test_add_breadcrumb(self, mock_sentry): """add_breadcrumb should call sentry when enabled.""" From 2d9dfa64253219a640f100a28111879f281fd31a Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Mon, 16 Mar 2026 16:58:01 -0400 Subject: [PATCH 06/11] feat: harden telemetry privacy filters and anon ID policy --- README.md | 12 ++- src/openadapt_telemetry/client.py | 27 ++--- src/openadapt_telemetry/config.py | 58 ++++++++++- src/openadapt_telemetry/privacy.py | 155 ++++++++++++++++++++--------- tests/test_client.py | 42 ++++++-- tests/test_config.py | 53 ++++++++++ tests/test_privacy.py | 73 ++++++++++++-- 7 files changed, 341 insertions(+), 79 deletions(-) diff --git a/README.md b/README.md index 061e94e..b1b1f4b 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ Unified telemetry and error tracking for OpenAdapt packages. - **Unified Error Tracking**: Consistent error reporting across all OpenAdapt packages - **Privacy-First Design**: Automatic PII scrubbing and path sanitization - **Configurable Opt-Out**: Respects `DO_NOT_TRACK` and custom environment variables -- **CI/Dev Mode Detection**: Automatically tags internal usage for filtering +- **Internal Usage Tagging**: Explicit flags + CI detection with optional git heuristic - **GlitchTip/Sentry Compatible**: Uses the Sentry SDK for maximum compatibility ## Installation @@ -98,10 +98,12 @@ with TelemetrySpan("indexing", "build_faiss_index") as span: | `OPENADAPT_TELEMETRY_ENABLED` | `true` | Enable/disable telemetry | | `OPENADAPT_INTERNAL` | `false` | Tag as internal usage | | `OPENADAPT_DEV` | `false` | Development mode | +| `OPENADAPT_INTERNAL_FROM_GIT` | `false` | Optional: tag as internal when running from a git checkout | | `OPENADAPT_TELEMETRY_DSN` | - | GlitchTip/Sentry DSN | | `OPENADAPT_TELEMETRY_ENVIRONMENT` | `production` | Environment name | | `OPENADAPT_TELEMETRY_SAMPLE_RATE` | `1.0` | Error sampling rate (0.0-1.0) | | `OPENADAPT_TELEMETRY_TRACES_SAMPLE_RATE` | `0.01` | Performance sampling rate | +| `OPENADAPT_TELEMETRY_ANON_SALT` | generated | Optional anonymization salt override (advanced use only) | ### Configuration File @@ -160,7 +162,9 @@ export OPENADAPT_TELEMETRY_ENABLED=false - File paths have usernames replaced with `` - Sensitive fields (password, token, api_key, etc.) are redacted - Email addresses and phone numbers are scrubbed from messages -- User IDs are hashed before upload (`anon:`) +- Top-level event messages/logentry strings are scrubbed +- Tag keys are allowlisted and values are scrubbed before upload +- User IDs are HMAC-anonymized before upload (`anon:v2:`) - `send_default_pii` is enforced to `false` by the client ## Internal Usage Tagging @@ -169,8 +173,8 @@ Internal/developer usage is automatically detected via: 1. `OPENADAPT_INTERNAL=true` environment variable 2. `OPENADAPT_DEV=true` environment variable -3. Git repository present in working directory -4. CI environment detected (GitHub Actions, GitLab CI, etc.) +3. CI environment detected (GitHub Actions, GitLab CI, etc.) +4. Optional git repository heuristic when `OPENADAPT_INTERNAL_FROM_GIT=true` Filter in GlitchTip: ``` diff --git a/src/openadapt_telemetry/client.py b/src/openadapt_telemetry/client.py index d3e7aa7..d25da41 100644 --- a/src/openadapt_telemetry/client.py +++ b/src/openadapt_telemetry/client.py @@ -60,11 +60,11 @@ def is_ci_environment() -> bool: def is_internal_user() -> bool: """Determine if current usage is from internal team. - Uses multiple heuristics to detect internal/developer usage: + Uses multiple signals to detect internal/developer usage: 1. Explicit OPENADAPT_INTERNAL environment variable 2. OPENADAPT_DEV environment variable - 3. Git repository present in current directory - 4. CI environment detected + 3. CI environment detected + 4. Optional git repository heuristic when OPENADAPT_INTERNAL_FROM_GIT=true Returns: True if this appears to be internal usage. @@ -77,25 +77,26 @@ def is_internal_user() -> bool: if os.getenv("OPENADAPT_DEV", "").lower() in ("true", "1", "yes"): return True - # Method 3: Git repository present (development checkout) - if Path(".git").exists() or Path("../.git").exists(): - return True - - # Method 4: CI/CD environment + # Method 3: CI/CD environment if is_ci_environment(): return True + # Method 4: optional git heuristic + if os.getenv("OPENADAPT_INTERNAL_FROM_GIT", "").lower() in ("true", "1", "yes"): + if Path(".git").exists() or Path("../.git").exists(): + return True + return False def _compose_before_send(base: BeforeSendFn, extra: BeforeSendFn) -> BeforeSendFn: - """Compose custom before_send after privacy filtering.""" + """Compose custom before_send before final privacy filtering.""" def composed(event: Event, hint: Hint) -> Optional[Event]: - sanitized = base(event, hint) - if sanitized is None: + modified = extra(event, hint) + if modified is None: return None - return extra(sanitized, hint) + return base(modified, hint) return composed @@ -219,7 +220,7 @@ def initialize( if not callable(custom_before_send): raise TypeError("before_send must be callable") warnings.warn( - "Custom before_send is composed after OpenAdapt privacy filtering and cannot bypass scrubbing.", + "Custom before_send runs before OpenAdapt privacy filtering; final payload is always scrubbed.", stacklevel=2, ) before_send = _compose_before_send(base_before_send, custom_before_send) diff --git a/src/openadapt_telemetry/config.py b/src/openadapt_telemetry/config.py index a7c047a..63410c1 100644 --- a/src/openadapt_telemetry/config.py +++ b/src/openadapt_telemetry/config.py @@ -11,6 +11,8 @@ import json import os +import secrets +import warnings from dataclasses import dataclass, field from pathlib import Path from typing import Any, Optional @@ -27,6 +29,7 @@ "performance_tracking": True, "feature_usage": True, "send_default_pii": False, + "anon_salt": None, } # Config file location @@ -48,6 +51,7 @@ class TelemetryConfig: performance_tracking: bool = True feature_usage: bool = True send_default_pii: bool = False + anon_salt: Optional[str] = None _loaded: bool = field(default=False, repr=False) @@ -122,9 +126,60 @@ def _get_env_config() -> dict[str, Any]: except ValueError: pass + # Optional override for deterministic anonymization in controlled environments. + anon_salt = os.getenv("OPENADAPT_TELEMETRY_ANON_SALT") + if anon_salt: + config["anon_salt"] = anon_salt + return config +def _is_valid_anon_salt(value: Any) -> bool: + """Check whether a salt value is valid for HMAC anonymization.""" + return isinstance(value, str) and len(value.strip()) >= 32 + + +def _generate_anon_salt() -> str: + """Generate a high-entropy random salt.""" + return secrets.token_hex(32) + + +def get_or_create_anon_salt() -> str: + """Get anonymization salt from env/config, creating one if missing. + + Priority: + 1. OPENADAPT_TELEMETRY_ANON_SALT (if valid) + 2. telemetry config file `anon_salt` (if valid) + 3. generated and persisted random salt + """ + env_salt = os.getenv("OPENADAPT_TELEMETRY_ANON_SALT") + if env_salt: + if _is_valid_anon_salt(env_salt): + return env_salt.strip() + warnings.warn( + "Ignoring invalid OPENADAPT_TELEMETRY_ANON_SALT; must be >= 32 chars.", + stacklevel=2, + ) + + config_data = _load_config_file() + file_salt = config_data.get("anon_salt") + if _is_valid_anon_salt(file_salt): + return str(file_salt).strip() + + generated = _generate_anon_salt() + config_data["anon_salt"] = generated + try: + CONFIG_DIR.mkdir(parents=True, exist_ok=True) + with open(CONFIG_FILE, "w") as f: + json.dump(config_data, f, indent=2) + except OSError: + warnings.warn( + "Failed to persist telemetry anonymization salt; using ephemeral salt for this process.", + stacklevel=2, + ) + return generated + + def load_config() -> TelemetryConfig: """Load telemetry configuration from all sources. @@ -148,7 +203,7 @@ def load_config() -> TelemetryConfig: merged.update(env_config) # Remove None values for fields that should use defaults - config_dict = {k: v for k, v in merged.items() if v is not None or k == "dsn"} + config_dict = {k: v for k, v in merged.items() if v is not None or k in {"dsn", "anon_salt"}} return TelemetryConfig(**config_dict, _loaded=True) @@ -172,6 +227,7 @@ def save_config(config: TelemetryConfig) -> None: "performance_tracking": config.performance_tracking, "feature_usage": config.feature_usage, "send_default_pii": config.send_default_pii, + "anon_salt": config.anon_salt, } with open(CONFIG_FILE, "w") as f: diff --git a/src/openadapt_telemetry/privacy.py b/src/openadapt_telemetry/privacy.py index 48a1f0f..32a1d05 100644 --- a/src/openadapt_telemetry/privacy.py +++ b/src/openadapt_telemetry/privacy.py @@ -10,9 +10,13 @@ from __future__ import annotations import hashlib +import hmac import re +from functools import lru_cache from typing import Any, Dict, List, Optional, Set +from .config import get_or_create_anon_salt + # Sensitive field names that should have their values redacted PII_DENYLIST: Set[str] = { # Authentication @@ -95,6 +99,49 @@ re.compile(r"[A-Za-z0-9+/]{40,}={0,2}"), ] +# Whitelisted tags that retain observability value and are safe to keep. +ALLOWED_OBSERVABILITY_KEYS: Set[str] = { + "package", + "package_version", + "python_version", + "os", + "os_version", + "ci", + "internal", +} + +ANON_VERSION = "v2" + + +@lru_cache(maxsize=1) +def _get_anon_salt_cached() -> str: + """Memoize anon salt lookup to avoid repeated filesystem reads.""" + return get_or_create_anon_salt() + + +def _is_already_anonymized(value: str, prefix: str = "anon") -> bool: + """Return True when value already matches anon id formats.""" + legacy_prefix = f"{prefix}:" + return value.startswith(legacy_prefix) + + +def _scrub_top_level_messages(event: Dict[str, Any]) -> None: + """Scrub top-level message fields that may contain free-form text.""" + if isinstance(event.get("message"), str): + event["message"] = scrub_string(event["message"]) + if isinstance(event.get("logentry"), dict): + logentry = event["logentry"] + if isinstance(logentry.get("message"), str): + logentry["message"] = scrub_string(logentry["message"]) + if isinstance(logentry.get("formatted"), str): + logentry["formatted"] = scrub_string(logentry["formatted"]) + + +def _scrub_tags(tags: Dict[str, Any]) -> Dict[str, Any]: + """Keep allowed tag keys and scrub their values.""" + filtered = {k: v for k, v in tags.items() if k in ALLOWED_OBSERVABILITY_KEYS} + return scrub_dict(filtered, deep=False, scrub_values=True) + def sanitize_path(path: str) -> str: """Remove username from file paths. @@ -159,14 +206,20 @@ def scrub_string(value: str) -> str: def anonymize_identifier(value: str, prefix: str = "anon") -> str: - """Deterministically anonymize an identifier using SHA-256.""" + """Deterministically anonymize an identifier using versioned HMAC-SHA256.""" normalized = str(value or "").strip() if not normalized: - return f"{prefix}:unknown" - if normalized.startswith(f"{prefix}:"): + return f"{prefix}:{ANON_VERSION}:unknown" + if _is_already_anonymized(normalized, prefix=prefix): return normalized - digest = hashlib.sha256(normalized.encode("utf-8")).hexdigest()[:16] - return f"{prefix}:{digest}" + + salt = _get_anon_salt_cached() + digest = hmac.new( + salt.encode("utf-8"), + normalized.encode("utf-8"), + hashlib.sha256, + ).hexdigest()[:16] + return f"{prefix}:{ANON_VERSION}:{digest}" def scrub_dict( @@ -270,48 +323,54 @@ def create_before_send_filter(): def before_send(event: Event, hint: Hint) -> Optional[Event]: """Filter and sanitize events before sending to Sentry/GlitchTip.""" - # Scrub exception data - if "exception" in event: - scrub_exception_data(event["exception"]) - - # Scrub breadcrumbs - if "breadcrumbs" in event and "values" in event["breadcrumbs"]: - for breadcrumb in event["breadcrumbs"]["values"]: - if "message" in breadcrumb and isinstance(breadcrumb["message"], str): - breadcrumb["message"] = scrub_string(breadcrumb["message"]) - if "data" in breadcrumb and isinstance(breadcrumb["data"], dict): - breadcrumb["data"] = scrub_dict( - breadcrumb["data"], deep=True, scrub_values=True - ) - - # Scrub extra data - if "extra" in event and isinstance(event["extra"], dict): - event["extra"] = scrub_dict(event["extra"], deep=True, scrub_values=True) - - # Scrub contexts - if "contexts" in event and isinstance(event["contexts"], dict): - event["contexts"] = scrub_dict(event["contexts"], deep=True, scrub_values=False) - - # Scrub tags - if "tags" in event and isinstance(event["tags"], dict): - event["tags"] = scrub_dict(event["tags"], deep=False, scrub_values=False) - - # Scrub request data (if present) - if "request" in event: - request = event["request"] - if "headers" in request: - request["headers"] = scrub_dict(request["headers"], deep=False, scrub_values=False) - if "data" in request: - if isinstance(request["data"], dict): - request["data"] = scrub_dict(request["data"], deep=True, scrub_values=True) - elif isinstance(request["data"], str): - request["data"] = scrub_string(request["data"]) - - # Keep only anonymous user IDs (drop all other user attributes). - if "user" in event and isinstance(event["user"], dict): - user_id = event["user"].get("id") - event["user"] = {"id": anonymize_identifier(str(user_id))} if user_id else {} - - return event + try: + _scrub_top_level_messages(event) + + # Scrub exception data + if "exception" in event: + scrub_exception_data(event["exception"]) + + # Scrub breadcrumbs + if "breadcrumbs" in event and "values" in event["breadcrumbs"]: + for breadcrumb in event["breadcrumbs"]["values"]: + if "message" in breadcrumb and isinstance(breadcrumb["message"], str): + breadcrumb["message"] = scrub_string(breadcrumb["message"]) + if "data" in breadcrumb and isinstance(breadcrumb["data"], dict): + breadcrumb["data"] = scrub_dict( + breadcrumb["data"], deep=True, scrub_values=True + ) + + # Scrub extra data + if "extra" in event and isinstance(event["extra"], dict): + event["extra"] = scrub_dict(event["extra"], deep=True, scrub_values=True) + + # Scrub contexts + if "contexts" in event and isinstance(event["contexts"], dict): + event["contexts"] = scrub_dict(event["contexts"], deep=True, scrub_values=True) + + # Scrub tags (allowlist + scrub values) + if "tags" in event and isinstance(event["tags"], dict): + event["tags"] = _scrub_tags(event["tags"]) + + # Scrub request data (if present) + if "request" in event: + request = event["request"] + if "headers" in request: + request["headers"] = scrub_dict(request["headers"], deep=False, scrub_values=True) + if "data" in request: + if isinstance(request["data"], dict): + request["data"] = scrub_dict(request["data"], deep=True, scrub_values=True) + elif isinstance(request["data"], str): + request["data"] = scrub_string(request["data"]) + + # Keep only anonymous user IDs (drop all other user attributes). + if "user" in event and isinstance(event["user"], dict): + user_id = event["user"].get("id") + event["user"] = {"id": anonymize_identifier(str(user_id))} if user_id else {} + + return event + except Exception: + # Fail closed on privacy filter errors. + return None return before_send diff --git a/tests/test_client.py b/tests/test_client.py index 53386e9..cc2606f 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -105,6 +105,32 @@ def test_no_signals_indicates_external(self, _mock_ci, _mock_exists): with patch.dict(os.environ, {"OPENADAPT_INTERNAL": "", "OPENADAPT_DEV": ""}, clear=False): assert is_internal_user() is False + @patch("openadapt_telemetry.client.Path.exists", return_value=True) + @patch("openadapt_telemetry.client.is_ci_environment", return_value=False) + def test_git_repo_not_internal_by_default(self, _mock_ci, _mock_exists): + """Git repos should not be internal unless explicitly enabled.""" + with patch.dict( + os.environ, + { + "OPENADAPT_INTERNAL": "", + "OPENADAPT_DEV": "", + "OPENADAPT_INTERNAL_FROM_GIT": "", + }, + clear=False, + ): + assert is_internal_user() is False + + @patch("openadapt_telemetry.client.Path.exists", return_value=True) + @patch("openadapt_telemetry.client.is_ci_environment", return_value=False) + def test_git_repo_internal_when_opted_in(self, _mock_ci, _mock_exists): + """Git repos should be internal when OPENADAPT_INTERNAL_FROM_GIT is enabled.""" + with patch.dict( + os.environ, + {"OPENADAPT_INTERNAL_FROM_GIT": "true", "OPENADAPT_INTERNAL": "", "OPENADAPT_DEV": ""}, + clear=False, + ): + assert is_internal_user() is True + class TestTelemetryClient: """Tests for TelemetryClient class.""" @@ -255,21 +281,25 @@ def test_set_user_hashes_identifier_and_drops_extra_fields(self, mock_sentry): mock_sentry.set_user.assert_called_once() payload = mock_sentry.set_user.call_args.args[0] assert set(payload.keys()) == {"id"} - assert payload["id"].startswith("anon:") + assert payload["id"].startswith("anon:v2:") assert payload["id"] != "user@example.com" @patch("openadapt_telemetry.client.sentry_sdk") def test_custom_before_send_is_composed_after_privacy_filter(self, mock_sentry): - """Custom before_send should run after built-in scrubbing.""" + """Custom before_send should run before built-in scrubbing.""" def custom_before_send(event, hint): - event.setdefault("tags", {})["custom"] = "true" + event.setdefault("logentry", {})["message"] = "secret user@example.com" + event.setdefault("tags", {})["package"] = "openadapt-evals" return event with patch.dict(os.environ, {"DO_NOT_TRACK": ""}, clear=False): TelemetryClient.reset_instance() client = TelemetryClient.get_instance() - with pytest.warns(UserWarning, match="Custom before_send is composed after OpenAdapt privacy filtering"): + with pytest.warns( + UserWarning, + match="Custom before_send runs before OpenAdapt privacy filtering", + ): client.initialize( dsn="https://test@example.com/1", before_send=custom_before_send, @@ -279,9 +309,9 @@ def custom_before_send(event, hint): event = {"user": {"id": "user@example.com", "email": "user@example.com"}} output = before_send(event, hint={}) assert output is not None - assert output["user"]["id"].startswith("anon:") + assert output["user"]["id"].startswith("anon:v2:") assert "email" not in output["user"] - assert output["tags"]["custom"] == "true" + assert output["logentry"]["message"] == "secret [REDACTED]" @patch("openadapt_telemetry.client.sentry_sdk") def test_send_default_pii_override_is_ignored(self, mock_sentry): diff --git a/tests/test_config.py b/tests/test_config.py index 0d8f9d0..bc97443 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -10,9 +10,12 @@ from openadapt_telemetry.config import ( TelemetryConfig, + _generate_anon_salt, _get_env_config, + _is_valid_anon_salt, _load_config_file, _parse_bool, + get_or_create_anon_salt, load_config, save_config, ) @@ -157,6 +160,12 @@ def test_invalid_sample_rate_ignored(self): config = _get_env_config() assert "sample_rate" not in config + def test_anon_salt_env_override(self): + """OPENADAPT_TELEMETRY_ANON_SALT should be loaded when set.""" + with patch.dict(os.environ, {"OPENADAPT_TELEMETRY_ANON_SALT": "x" * 32}, clear=False): + config = _get_env_config() + assert config.get("anon_salt") == "x" * 32 + class TestConfigFile: """Tests for configuration file loading.""" @@ -249,3 +258,47 @@ def test_save_and_load(self): assert loaded.enabled is False assert loaded.environment == "test" assert loaded.sample_rate == 0.5 + + +class TestAnonSalt: + """Tests for per-install anonymization salt handling.""" + + def test_validate_anon_salt(self): + assert _is_valid_anon_salt("x" * 32) is True + assert _is_valid_anon_salt("short") is False + assert _is_valid_anon_salt(None) is False + + def test_generate_anon_salt(self): + salt = _generate_anon_salt() + assert isinstance(salt, str) + assert len(salt) >= 32 + + def test_get_or_create_anon_salt_from_env(self): + with patch.dict(os.environ, {"OPENADAPT_TELEMETRY_ANON_SALT": "y" * 32}, clear=False): + assert get_or_create_anon_salt() == "y" * 32 + + def test_get_or_create_anon_salt_persists_when_missing(self): + with tempfile.TemporaryDirectory() as tmpdir: + config_dir = Path(tmpdir) / "openadapt" + config_file = config_dir / "telemetry.json" + with patch("openadapt_telemetry.config.CONFIG_DIR", config_dir): + with patch("openadapt_telemetry.config.CONFIG_FILE", config_file): + with patch.dict(os.environ, {"OPENADAPT_TELEMETRY_ANON_SALT": ""}, clear=False): + first = get_or_create_anon_salt() + second = get_or_create_anon_salt() + assert first == second + with open(config_file) as f: + data = json.load(f) + assert data["anon_salt"] == first + + def test_get_or_create_anon_salt_recovers_from_corrupt_file(self): + with tempfile.TemporaryDirectory() as tmpdir: + config_dir = Path(tmpdir) / "openadapt" + config_dir.mkdir(parents=True, exist_ok=True) + config_file = config_dir / "telemetry.json" + config_file.write_text("{not-valid-json") + with patch("openadapt_telemetry.config.CONFIG_DIR", config_dir): + with patch("openadapt_telemetry.config.CONFIG_FILE", config_file): + salt = get_or_create_anon_salt() + assert isinstance(salt, str) + assert len(salt) >= 32 diff --git a/tests/test_privacy.py b/tests/test_privacy.py index 817350e..466a740 100644 --- a/tests/test_privacy.py +++ b/tests/test_privacy.py @@ -1,7 +1,9 @@ """Tests for privacy filtering and PII scrubbing.""" +from unittest.mock import patch from openadapt_telemetry.privacy import ( + ALLOWED_OBSERVABILITY_KEYS, anonymize_identifier, create_before_send_filter, is_sensitive_key, @@ -280,19 +282,30 @@ def test_local_variables_scrubbing(self): class TestAnonymizeIdentifier: """Tests for deterministic identifier anonymization.""" - def test_stable_hash(self): - first = anonymize_identifier("user@example.com") - second = anonymize_identifier("user@example.com") + def test_stable_hash_with_same_salt(self): + with patch("openadapt_telemetry.privacy._get_anon_salt_cached", return_value="a" * 32): + first = anonymize_identifier("user@example.com") + second = anonymize_identifier("user@example.com") assert first == second - assert first.startswith("anon:") + assert first.startswith("anon:v2:") assert first != "user@example.com" + def test_different_salts_produce_different_ids(self): + with patch("openadapt_telemetry.privacy._get_anon_salt_cached", return_value="a" * 32): + first = anonymize_identifier("user@example.com") + with patch("openadapt_telemetry.privacy._get_anon_salt_cached", return_value="b" * 32): + second = anonymize_identifier("user@example.com") + assert first != second + def test_empty_value(self): - assert anonymize_identifier("") == "anon:unknown" + assert anonymize_identifier("") == "anon:v2:unknown" def test_already_anonymized_id_is_not_rehashed(self): - anon_id = anonymize_identifier("user@example.com") + with patch("openadapt_telemetry.privacy._get_anon_salt_cached", return_value="a" * 32): + anon_id = anonymize_identifier("user@example.com") assert anonymize_identifier(anon_id) == anon_id + assert anonymize_identifier("anon:v1:1234567890abcdef") == "anon:v1:1234567890abcdef" + assert anonymize_identifier("anon:1234567890abcdef") == "anon:1234567890abcdef" class TestBeforeSendUserScrubbing: @@ -311,5 +324,51 @@ def test_only_anonymous_user_id_remains(self): assert sanitized is not None assert "user" in sanitized assert set(sanitized["user"].keys()) == {"id"} - assert sanitized["user"]["id"].startswith("anon:") + assert sanitized["user"]["id"].startswith("anon:v2:") assert sanitized["user"]["id"] != "user@example.com" + + +class TestBeforeSendEventScrubbing: + """Tests for event-level before_send filtering behavior.""" + + def test_top_level_message_is_scrubbed(self): + before_send = create_before_send_filter() + event = {"message": "Failure for user@example.com"} + sanitized = before_send(event, hint={}) + assert sanitized is not None + assert "user@example.com" not in sanitized["message"] + + def test_logentry_message_is_scrubbed(self): + before_send = create_before_send_filter() + event = {"logentry": {"message": "token=Bearer abc123", "formatted": "email user@example.com"}} + sanitized = before_send(event, hint={}) + assert sanitized is not None + assert "abc123" not in sanitized["logentry"]["message"] + assert "user@example.com" not in sanitized["logentry"]["formatted"] + + def test_context_values_are_scrubbed(self): + before_send = create_before_send_filter() + event = {"contexts": {"runtime": {"note": "contact me at user@example.com"}}} + sanitized = before_send(event, hint={}) + assert sanitized is not None + assert sanitized["contexts"]["runtime"]["note"] == "contact me at [REDACTED]" + + def test_tags_are_allowlisted_and_scrubbed(self): + before_send = create_before_send_filter() + event = { + "tags": { + "package": "openadapt-evals", + "internal": "false", + "custom_tag": "user@example.com", + }, + } + sanitized = before_send(event, hint={}) + assert sanitized is not None + assert set(sanitized["tags"].keys()).issubset(ALLOWED_OBSERVABILITY_KEYS) + assert "custom_tag" not in sanitized["tags"] + + def test_filter_fails_closed_on_unexpected_error(self): + before_send = create_before_send_filter() + with patch("openadapt_telemetry.privacy.scrub_exception_data", side_effect=RuntimeError("boom")): + event = {"exception": {"values": []}} + assert before_send(event, hint={}) is None From 8e9a3e10d3d9eca92dfbfaef8570c66226e5f0cf Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Mon, 16 Mar 2026 17:34:39 -0400 Subject: [PATCH 07/11] fix: guard non-dict telemetry config payloads --- src/openadapt_telemetry/config.py | 3 ++- tests/test_config.py | 12 ++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/src/openadapt_telemetry/config.py b/src/openadapt_telemetry/config.py index 63410c1..c7f1853 100644 --- a/src/openadapt_telemetry/config.py +++ b/src/openadapt_telemetry/config.py @@ -77,7 +77,8 @@ def _load_config_file() -> dict[str, Any]: try: with open(CONFIG_FILE) as f: - return json.load(f) + data = json.load(f) + return data if isinstance(data, dict) else {} except (json.JSONDecodeError, OSError): return {} diff --git a/tests/test_config.py b/tests/test_config.py index bc97443..3542585 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -202,6 +202,18 @@ def test_load_invalid_json(self): os.unlink(f.name) + def test_load_non_dict_json_returns_empty(self): + """Loading valid JSON that is not an object should return empty dict.""" + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + json.dump(["not", "a", "dict"], f) + f.flush() + + with patch("openadapt_telemetry.config.CONFIG_FILE", Path(f.name)): + config = _load_config_file() + assert config == {} + + os.unlink(f.name) + class TestLoadConfig: """Tests for full configuration loading.""" From a8c752dba04ff3dc550d7998d990d47f2e9bb785 Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Mon, 16 Mar 2026 18:17:31 -0400 Subject: [PATCH 08/11] fix: harden anon id validation and preserve safe custom tags --- README.md | 2 +- src/openadapt_telemetry/privacy.py | 57 ++++++++++++++++++++++++++---- tests/test_privacy.py | 17 +++++++-- 3 files changed, 66 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index b1b1f4b..a165a36 100644 --- a/README.md +++ b/README.md @@ -163,7 +163,7 @@ export OPENADAPT_TELEMETRY_ENABLED=false - Sensitive fields (password, token, api_key, etc.) are redacted - Email addresses and phone numbers are scrubbed from messages - Top-level event messages/logentry strings are scrubbed -- Tag keys are allowlisted and values are scrubbed before upload +- Tag keys are validated, sensitive/invalid keys are dropped, and values are scrubbed before upload - User IDs are HMAC-anonymized before upload (`anon:v2:`) - `send_default_pii` is enforced to `false` by the client diff --git a/src/openadapt_telemetry/privacy.py b/src/openadapt_telemetry/privacy.py index 32a1d05..8b109e6 100644 --- a/src/openadapt_telemetry/privacy.py +++ b/src/openadapt_telemetry/privacy.py @@ -111,6 +111,9 @@ } ANON_VERSION = "v2" +TAG_KEY_PATTERN = re.compile(r"^[a-zA-Z0-9._:-]{1,64}$") +MAX_TAGS = 64 +MAX_TAG_VALUE_CHARS = 256 @lru_cache(maxsize=1) @@ -120,9 +123,15 @@ def _get_anon_salt_cached() -> str: def _is_already_anonymized(value: str, prefix: str = "anon") -> bool: - """Return True when value already matches anon id formats.""" - legacy_prefix = f"{prefix}:" - return value.startswith(legacy_prefix) + """Return True only for canonical anonymous ID formats.""" + escaped_prefix = re.escape(prefix) + patterns = ( + rf"^{escaped_prefix}:v2:[0-9a-f]{{16}}$", + rf"^{escaped_prefix}:v1:[0-9a-f]{{16}}$", + rf"^{escaped_prefix}:[0-9a-f]{{16}}$", + rf"^{escaped_prefix}:v2:unknown$", + ) + return any(re.match(pattern, value) for pattern in patterns) def _scrub_top_level_messages(event: Dict[str, Any]) -> None: @@ -137,10 +146,46 @@ def _scrub_top_level_messages(event: Dict[str, Any]) -> None: logentry["formatted"] = scrub_string(logentry["formatted"]) +def _is_safe_tag_key(key: Any) -> bool: + """Return True when a tag key is safe to keep.""" + if not isinstance(key, str): + return False + if not TAG_KEY_PATTERN.match(key): + return False + if is_sensitive_key(key): + return False + return True + + +def _normalize_tag_value(value: Any) -> str: + """Convert a tag value to a scrubbed bounded string.""" + scrubbed = scrub_string(str(value)) + return scrubbed[:MAX_TAG_VALUE_CHARS] + + def _scrub_tags(tags: Dict[str, Any]) -> Dict[str, Any]: - """Keep allowed tag keys and scrub their values.""" - filtered = {k: v for k, v in tags.items() if k in ALLOWED_OBSERVABILITY_KEYS} - return scrub_dict(filtered, deep=False, scrub_values=True) + """Keep safe tags, scrub values, and bound payload size.""" + sanitized: Dict[str, Any] = {} + dropped_count = 0 + + for key, value in tags.items(): + if len(sanitized) >= MAX_TAGS: + dropped_count += 1 + continue + if not _is_safe_tag_key(key): + dropped_count += 1 + continue + sanitized[key] = _normalize_tag_value(value) + + if dropped_count > 0: + sanitized["_dropped_tag_count"] = str(dropped_count) + + # Preserve explicitly listed observability keys whenever present. + for key in ALLOWED_OBSERVABILITY_KEYS: + if key in tags and key not in sanitized: + sanitized[key] = _normalize_tag_value(tags[key]) + + return sanitized def sanitize_path(path: str) -> str: diff --git a/tests/test_privacy.py b/tests/test_privacy.py index 466a740..1d029ce 100644 --- a/tests/test_privacy.py +++ b/tests/test_privacy.py @@ -307,6 +307,12 @@ def test_already_anonymized_id_is_not_rehashed(self): assert anonymize_identifier("anon:v1:1234567890abcdef") == "anon:v1:1234567890abcdef" assert anonymize_identifier("anon:1234567890abcdef") == "anon:1234567890abcdef" + def test_non_canonical_anon_prefix_is_rehashed(self): + with patch("openadapt_telemetry.privacy._get_anon_salt_cached", return_value="a" * 32): + value = anonymize_identifier("anon:user@example.com") + assert value.startswith("anon:v2:") + assert value != "anon:user@example.com" + class TestBeforeSendUserScrubbing: """Tests for user context anonymization in before_send.""" @@ -359,13 +365,18 @@ def test_tags_are_allowlisted_and_scrubbed(self): "tags": { "package": "openadapt-evals", "internal": "false", - "custom_tag": "user@example.com", + "task_id": "abc123", + "email": "user@example.com", + "not valid": "drop-me", }, } sanitized = before_send(event, hint={}) assert sanitized is not None - assert set(sanitized["tags"].keys()).issubset(ALLOWED_OBSERVABILITY_KEYS) - assert "custom_tag" not in sanitized["tags"] + assert set(ALLOWED_OBSERVABILITY_KEYS).intersection(set(sanitized["tags"].keys())) + assert sanitized["tags"]["task_id"] == "abc123" + assert "email" not in sanitized["tags"] + assert "not valid" not in sanitized["tags"] + assert sanitized["tags"]["_dropped_tag_count"] == "2" def test_filter_fails_closed_on_unexpected_error(self): before_send = create_before_send_filter() From 5620452cfc56336042b3c74d3dc4e52cf2ae0318 Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Mon, 16 Mar 2026 18:52:17 -0400 Subject: [PATCH 09/11] fix: guard request shape and enforce tag cap semantics --- src/openadapt_telemetry/config.py | 8 +++++++- src/openadapt_telemetry/privacy.py | 17 ++++++++++------- tests/test_config.py | 7 +++++++ tests/test_privacy.py | 20 ++++++++++++++++++++ 4 files changed, 44 insertions(+), 8 deletions(-) diff --git a/src/openadapt_telemetry/config.py b/src/openadapt_telemetry/config.py index c7f1853..f2ccfee 100644 --- a/src/openadapt_telemetry/config.py +++ b/src/openadapt_telemetry/config.py @@ -130,7 +130,13 @@ def _get_env_config() -> dict[str, Any]: # Optional override for deterministic anonymization in controlled environments. anon_salt = os.getenv("OPENADAPT_TELEMETRY_ANON_SALT") if anon_salt: - config["anon_salt"] = anon_salt + if _is_valid_anon_salt(anon_salt): + config["anon_salt"] = anon_salt.strip() + else: + warnings.warn( + "Ignoring invalid OPENADAPT_TELEMETRY_ANON_SALT; must be >= 32 chars.", + stacklevel=2, + ) return config diff --git a/src/openadapt_telemetry/privacy.py b/src/openadapt_telemetry/privacy.py index 8b109e6..3a94684 100644 --- a/src/openadapt_telemetry/privacy.py +++ b/src/openadapt_telemetry/privacy.py @@ -168,7 +168,14 @@ def _scrub_tags(tags: Dict[str, Any]) -> Dict[str, Any]: sanitized: Dict[str, Any] = {} dropped_count = 0 + # Preserve explicitly listed observability keys first so they survive capping. + for key in ALLOWED_OBSERVABILITY_KEYS: + if key in tags and _is_safe_tag_key(key): + sanitized[key] = _normalize_tag_value(tags[key]) + for key, value in tags.items(): + if key in sanitized: + continue if len(sanitized) >= MAX_TAGS: dropped_count += 1 continue @@ -177,14 +184,10 @@ def _scrub_tags(tags: Dict[str, Any]) -> Dict[str, Any]: continue sanitized[key] = _normalize_tag_value(value) - if dropped_count > 0: + # Keep hard cap strict, including metadata markers. + if dropped_count > 0 and len(sanitized) < MAX_TAGS: sanitized["_dropped_tag_count"] = str(dropped_count) - # Preserve explicitly listed observability keys whenever present. - for key in ALLOWED_OBSERVABILITY_KEYS: - if key in tags and key not in sanitized: - sanitized[key] = _normalize_tag_value(tags[key]) - return sanitized @@ -398,7 +401,7 @@ def before_send(event: Event, hint: Hint) -> Optional[Event]: event["tags"] = _scrub_tags(event["tags"]) # Scrub request data (if present) - if "request" in event: + if "request" in event and isinstance(event["request"], dict): request = event["request"] if "headers" in request: request["headers"] = scrub_dict(request["headers"], deep=False, scrub_values=True) diff --git a/tests/test_config.py b/tests/test_config.py index 3542585..cfea5de 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -166,6 +166,13 @@ def test_anon_salt_env_override(self): config = _get_env_config() assert config.get("anon_salt") == "x" * 32 + def test_invalid_anon_salt_env_ignored(self): + """Invalid OPENADAPT_TELEMETRY_ANON_SALT should be ignored in env config.""" + with patch.dict(os.environ, {"OPENADAPT_TELEMETRY_ANON_SALT": "short"}, clear=False): + with pytest.warns(UserWarning, match="Ignoring invalid OPENADAPT_TELEMETRY_ANON_SALT"): + config = _get_env_config() + assert "anon_salt" not in config + class TestConfigFile: """Tests for configuration file loading.""" diff --git a/tests/test_privacy.py b/tests/test_privacy.py index 1d029ce..3dfd08e 100644 --- a/tests/test_privacy.py +++ b/tests/test_privacy.py @@ -4,6 +4,7 @@ from openadapt_telemetry.privacy import ( ALLOWED_OBSERVABILITY_KEYS, + MAX_TAGS, anonymize_identifier, create_before_send_filter, is_sensitive_key, @@ -383,3 +384,22 @@ def test_filter_fails_closed_on_unexpected_error(self): with patch("openadapt_telemetry.privacy.scrub_exception_data", side_effect=RuntimeError("boom")): event = {"exception": {"values": []}} assert before_send(event, hint={}) is None + + def test_request_non_dict_does_not_drop_event(self): + before_send = create_before_send_filter() + event = {"request": "unexpected-request-shape", "message": "ok"} + sanitized = before_send(event, hint={}) + assert sanitized is not None + assert sanitized["request"] == "unexpected-request-shape" + + def test_tag_cap_is_hard_limited(self): + before_send = create_before_send_filter() + many_tags = {f"k{i}": f"v{i}" for i in range(MAX_TAGS + 20)} + many_tags.update({"package": "openadapt-evals", "internal": "false"}) + event = {"tags": many_tags} + sanitized = before_send(event, hint={}) + assert sanitized is not None + tags = sanitized["tags"] + assert "package" in tags + assert "internal" in tags + assert len(tags) <= MAX_TAGS From 2c0b0072882227184ed12108da102e70f074f94f Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Mon, 16 Mar 2026 21:42:33 -0400 Subject: [PATCH 10/11] fix: scrub request header variants and dedupe salt warnings --- src/openadapt_telemetry/config.py | 23 +++++++++++++++-------- src/openadapt_telemetry/privacy.py | 15 ++++++++++++++- tests/test_config.py | 15 +++++++++++++++ tests/test_privacy.py | 15 +++++++++++++++ 4 files changed, 59 insertions(+), 9 deletions(-) diff --git a/src/openadapt_telemetry/config.py b/src/openadapt_telemetry/config.py index f2ccfee..8129f79 100644 --- a/src/openadapt_telemetry/config.py +++ b/src/openadapt_telemetry/config.py @@ -35,6 +35,7 @@ # Config file location CONFIG_DIR = Path.home() / ".config" / "openadapt" CONFIG_FILE = CONFIG_DIR / "telemetry.json" +_INVALID_ANON_SALT_WARNED = False @dataclass @@ -133,10 +134,7 @@ def _get_env_config() -> dict[str, Any]: if _is_valid_anon_salt(anon_salt): config["anon_salt"] = anon_salt.strip() else: - warnings.warn( - "Ignoring invalid OPENADAPT_TELEMETRY_ANON_SALT; must be >= 32 chars.", - stacklevel=2, - ) + _warn_invalid_anon_salt_once() return config @@ -146,6 +144,18 @@ def _is_valid_anon_salt(value: Any) -> bool: return isinstance(value, str) and len(value.strip()) >= 32 +def _warn_invalid_anon_salt_once() -> None: + """Warn once per process when OPENADAPT_TELEMETRY_ANON_SALT is invalid.""" + global _INVALID_ANON_SALT_WARNED + if _INVALID_ANON_SALT_WARNED: + return + warnings.warn( + "Ignoring invalid OPENADAPT_TELEMETRY_ANON_SALT; must be >= 32 chars.", + stacklevel=2, + ) + _INVALID_ANON_SALT_WARNED = True + + def _generate_anon_salt() -> str: """Generate a high-entropy random salt.""" return secrets.token_hex(32) @@ -163,10 +173,7 @@ def get_or_create_anon_salt() -> str: if env_salt: if _is_valid_anon_salt(env_salt): return env_salt.strip() - warnings.warn( - "Ignoring invalid OPENADAPT_TELEMETRY_ANON_SALT; must be >= 32 chars.", - stacklevel=2, - ) + _warn_invalid_anon_salt_once() config_data = _load_config_file() file_salt = config_data.get("anon_salt") diff --git a/src/openadapt_telemetry/privacy.py b/src/openadapt_telemetry/privacy.py index 3a94684..027d10b 100644 --- a/src/openadapt_telemetry/privacy.py +++ b/src/openadapt_telemetry/privacy.py @@ -294,6 +294,8 @@ def scrub_dict( result[key] = scrub_dict(value, deep=True, scrub_values=scrub_values) elif isinstance(value, list) and deep: result[key] = scrub_list(value, scrub_values=scrub_values) + elif isinstance(value, tuple) and deep: + result[key] = tuple(scrub_list(list(value), scrub_values=scrub_values)) elif isinstance(value, str): if scrub_values: result[key] = scrub_string(value) @@ -322,6 +324,8 @@ def scrub_list(data: List[Any], scrub_values: bool = False) -> List[Any]: result.append(scrub_dict(item, deep=True, scrub_values=scrub_values)) elif isinstance(item, list): result.append(scrub_list(item, scrub_values=scrub_values)) + elif isinstance(item, tuple): + result.append(tuple(scrub_list(list(item), scrub_values=scrub_values))) elif isinstance(item, str) and scrub_values: result.append(scrub_string(item)) else: @@ -404,10 +408,19 @@ def before_send(event: Event, hint: Hint) -> Optional[Event]: if "request" in event and isinstance(event["request"], dict): request = event["request"] if "headers" in request: - request["headers"] = scrub_dict(request["headers"], deep=False, scrub_values=True) + if isinstance(request["headers"], dict): + request["headers"] = scrub_dict( + request["headers"], deep=False, scrub_values=True + ) + elif isinstance(request["headers"], list): + request["headers"] = scrub_list(request["headers"], scrub_values=True) + elif isinstance(request["headers"], str): + request["headers"] = scrub_string(request["headers"]) if "data" in request: if isinstance(request["data"], dict): request["data"] = scrub_dict(request["data"], deep=True, scrub_values=True) + elif isinstance(request["data"], list): + request["data"] = scrub_list(request["data"], scrub_values=True) elif isinstance(request["data"], str): request["data"] = scrub_string(request["data"]) diff --git a/tests/test_config.py b/tests/test_config.py index cfea5de..9be445f 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -3,6 +3,7 @@ import json import os import tempfile +import warnings from pathlib import Path from unittest.mock import patch @@ -321,3 +322,17 @@ def test_get_or_create_anon_salt_recovers_from_corrupt_file(self): salt = get_or_create_anon_salt() assert isinstance(salt, str) assert len(salt) >= 32 + + def test_invalid_anon_salt_warns_once_across_config_paths(self): + with patch("openadapt_telemetry.config._INVALID_ANON_SALT_WARNED", False): + with patch.dict(os.environ, {"OPENADAPT_TELEMETRY_ANON_SALT": "short"}, clear=False): + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + _get_env_config() + get_or_create_anon_salt() + matches = [ + warning + for warning in caught + if "Ignoring invalid OPENADAPT_TELEMETRY_ANON_SALT" in str(warning.message) + ] + assert len(matches) == 1 diff --git a/tests/test_privacy.py b/tests/test_privacy.py index 3dfd08e..67f4f31 100644 --- a/tests/test_privacy.py +++ b/tests/test_privacy.py @@ -392,6 +392,21 @@ def test_request_non_dict_does_not_drop_event(self): assert sanitized is not None assert sanitized["request"] == "unexpected-request-shape" + def test_request_headers_tuple_list_is_scrubbed(self): + before_send = create_before_send_filter() + event = { + "request": { + "headers": [ + ("Authorization", "Bearer abc123"), + ("X-User", "user@example.com"), + ], + }, + } + sanitized = before_send(event, hint={}) + assert sanitized is not None + assert sanitized["request"]["headers"][0][1] == "[REDACTED]" + assert sanitized["request"]["headers"][1][1] == "[REDACTED]" + def test_tag_cap_is_hard_limited(self): before_send = create_before_send_filter() many_tags = {f"k{i}": f"v{i}" for i in range(MAX_TAGS + 20)} From 216f6d09337149b2aeec5c50b1e29ff4a940c05c Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Mon, 16 Mar 2026 22:12:29 -0400 Subject: [PATCH 11/11] chore: fix existing ruff violations in decorators --- src/openadapt_telemetry/decorators.py | 4 ++-- tests/test_decorators.py | 7 +++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/openadapt_telemetry/decorators.py b/src/openadapt_telemetry/decorators.py index 7925f34..1e6d714 100644 --- a/src/openadapt_telemetry/decorators.py +++ b/src/openadapt_telemetry/decorators.py @@ -10,7 +10,7 @@ import functools import time -from typing import Any, Callable, Optional, TypeVar, Union +from typing import Any, Callable, Optional, TypeVar import sentry_sdk @@ -57,7 +57,7 @@ def wrapper(*args: Any, **kwargs: Any) -> Any: result = func(*args, **kwargs) transaction.set_status("ok") return result - except Exception as e: + except Exception: transaction.set_status("internal_error") raise finally: diff --git a/tests/test_decorators.py b/tests/test_decorators.py index 00346e7..e5698b8 100644 --- a/tests/test_decorators.py +++ b/tests/test_decorators.py @@ -1,6 +1,5 @@ """Tests for telemetry decorators.""" -import os from unittest.mock import MagicMock, patch import pytest @@ -171,20 +170,20 @@ def teardown_method(self): def test_basic_usage(self): """Basic span usage should work.""" - with TelemetrySpan("test_op", "test_span") as span: + with TelemetrySpan("test_op", "test_span"): result = 1 + 1 assert result == 2 def test_with_description(self): """Span with description should work.""" - with TelemetrySpan("op", "name", description="Test description") as span: + with TelemetrySpan("op", "name", description="Test description"): pass def test_exception_handling(self): """Exceptions should propagate through span.""" with pytest.raises(ValueError, match="test"): - with TelemetrySpan("op", "name") as span: + with TelemetrySpan("op", "name"): raise ValueError("test") def test_set_tag(self):