Skip to content

Commit 2134b53

Browse files
authored
feat: Integrate OpenTelemetry (#189)
1 parent 275cf59 commit 2134b53

12 files changed

Lines changed: 1426 additions & 6 deletions

File tree

README.md

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,54 @@ Use this mark to auto-use the `saas_mode` fixture.
9696

9797
Use this mark to auto-use the `enterprise_mode` fixture.
9898

99+
### OpenTelemetry
100+
101+
Flagsmith supports exporting traces and structured logs over OTLP.
102+
103+
#### Configuration
104+
105+
OTel instrumentation is opt-in, controlled by environment variables:
106+
107+
| Variable | Description | Default |
108+
| --------------------------------- | --------------------------------------------------------------------------------------------------------------------- | --------------- |
109+
| `OTEL_EXPORTER_OTLP_ENDPOINT` | Base OTLP endpoint (e.g. `http://collector:4318`). If unset, no OTel setup occurs. | _(disabled)_ |
110+
| `OTEL_SERVICE_NAME` | The `service.name` resource attribute. | `flagsmith-api` |
111+
| `OTEL_TRACING_EXCLUDED_URL_PATHS` | Comma-separated URL paths to exclude from tracing (e.g. `health/liveness,health/readiness`). | _(none)_ |
112+
113+
Standard `OTEL_*` env vars (e.g. `OTEL_RESOURCE_ATTRIBUTES`, `OTEL_EXPORTER_OTLP_HEADERS`) are also respected by the OTel SDK.
114+
115+
#### What gets configured
116+
117+
When `OTEL_EXPORTER_OTLP_ENDPOINT` is set, `ensure_cli_env()` sets up:
118+
119+
- **Tracing**: `TracerProvider` with OTLP/HTTP span export, W3C `TraceContext` + `Baggage` propagation, and auto-instrumentation for:
120+
- **Django** (`DjangoInstrumentor`): creates a root span per HTTP request with span names formatted as `{METHOD} {route_template}` (e.g. `GET /api/v1/projects/{pk}/`).
121+
- **psycopg2** (`Psycopg2Instrumentor`): creates child spans for each SQL query with `db.system`, `db.statement`, and `db.name` attributes. SQL commenter is enabled, adding trace context as SQL comments for database-side correlation.
122+
- **Redis** (`RedisInstrumentor`): creates child spans for each Redis command with `db.system` and `db.statement` attributes.
123+
- **Structured log export**: A structlog processor that emits each log event as both an OTLP log record and a span event (when an active span exists).
124+
125+
#### Emitting OTel log events via structlog
126+
127+
Use structlog as usual. The OTel processor captures events and maps them to OTLP log records:
128+
129+
```python
130+
import structlog
131+
132+
log = structlog.get_logger("code_references")
133+
log.info("scan-created", code_references__count=3, feature__count=2)
134+
```
135+
136+
This produces:
137+
138+
1. An **OTLP log record** with:
139+
- `Body: scan-created`
140+
- `EventName: code_references.scan_created` (logger name + `inflection.underscore` of the event)
141+
- `Severity: INFO`
142+
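- `Attributes: code_references.count=3, feature.count=2, flagsmith.event=code_references.scan_created` (double underscores in keys are converted to dots; `flagsmith.event` duplicates `EventName` for platforms that do not surface it)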
143+
- W3C Baggage entries from the current OTel context are copied into log attributes (e.g. `amplitude.device_id`, `amplitude.session_id`).
144+
145+
2. A **span event** on the active span (if one exists) with the same name and attributes. This makes structlog events visible in trace backends (e.g. SigNoz's "Events" tab) without requiring separate log correlation. When no span is active (e.g. during startup or management commands), only the OTLP log record is emitted.
146+
99147
### Metrics
100148

101149
Flagsmith uses Prometheus to track performance metrics.

pyproject.toml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,14 @@ optional-dependencies = { test-tools = [
1616
"drf-writable-nested",
1717
"environs (<15)",
1818
"gunicorn (>=19.1)",
19+
"inflection",
20+
"opentelemetry-api (>=1.25,<2)",
21+
"opentelemetry-sdk (>=1.25,<2)",
22+
"opentelemetry-exporter-otlp-proto-http (>=1.25,<2)",
23+
"opentelemetry-instrumentation-django (>=0.46b0,<1)",
24+
"opentelemetry-instrumentation-psycopg2 (>=0.46b0,<1)",
25+
"opentelemetry-instrumentation-redis (>=0.46b0,<1)",
26+
"redis (>=5,<6)",
1927
"prometheus-client (>=0.0.16)",
2028
"psycopg2-binary (>=2.9,<3)",
2129
"requests",

src/common/core/logging.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ def setup_logging(
3636
logging_configuration_file: str | None = None,
3737
application_loggers: list[str] | None = None,
3838
extra_foreign_processors: list[Processor] | None = None,
39+
otel_processors: list[Processor] | None = None,
3940
) -> None:
4041
"""
4142
Set up logging for the application.
@@ -91,7 +92,9 @@ def setup_logging(
9192
logging.config.dictConfig(dict_config)
9293

9394
setup_structlog(
94-
log_format=log_format, extra_foreign_processors=extra_foreign_processors
95+
log_format=log_format,
96+
extra_foreign_processors=extra_foreign_processors,
97+
otel_processors=otel_processors,
9598
)
9699

97100

@@ -122,6 +125,7 @@ def map_event_to_json_record(
122125
def setup_structlog(
123126
log_format: str,
124127
extra_foreign_processors: list[Processor] | None = None,
128+
otel_processors: list[Processor] | None = None,
125129
) -> None:
126130
"""Configure structlog to route through stdlib logging."""
127131

@@ -172,6 +176,7 @@ def setup_structlog(
172176
structlog.processors.format_exc_info,
173177
structlog.processors.TimeStamper(fmt="iso"),
174178
sentry_processor,
179+
*(otel_processors or []),
175180
structlog.stdlib.ProcessorFormatter.wrap_for_formatter,
176181
],
177182
wrapper_class=structlog.stdlib.BoundLogger,

src/common/core/main.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,35 @@ def ensure_cli_env() -> typing.Generator[None, None, None]:
3737
"""
3838
ctx = contextlib.ExitStack()
3939

40+
# Set up OTel instrumentation (opt-in via OTEL_EXPORTER_OTLP_ENDPOINT).
41+
otel_processors = None
42+
otel_endpoint = env.str("OTEL_EXPORTER_OTLP_ENDPOINT", None)
43+
if otel_endpoint:
44+
from common.core.otel import (
45+
add_otel_trace_context,
46+
build_otel_log_provider,
47+
build_tracer_provider,
48+
make_structlog_otel_processor,
49+
setup_tracing,
50+
)
51+
52+
service_name = env.str("OTEL_SERVICE_NAME", "flagsmith-api")
53+
log_provider = build_otel_log_provider(
54+
endpoint=f"{otel_endpoint}/v1/logs",
55+
service_name=service_name,
56+
)
57+
otel_processors = [
58+
add_otel_trace_context,
59+
make_structlog_otel_processor(log_provider),
60+
]
61+
tracer_provider = build_tracer_provider(
62+
endpoint=f"{otel_endpoint}/v1/traces",
63+
service_name=service_name,
64+
)
65+
excluded_urls = env.str("OTEL_TRACING_EXCLUDED_URL_PATHS", None)
66+
ctx.enter_context(setup_tracing(tracer_provider, excluded_urls=excluded_urls))
67+
ctx.callback(log_provider.shutdown)
68+
4069
# Set up logging early, before Django settings are loaded.
4170
setup_logging(
4271
log_level=env.str("LOG_LEVEL", "INFO"),
@@ -48,6 +77,7 @@ def ensure_cli_env() -> typing.Generator[None, None, None]:
4877
env.list("ACCESS_LOG_EXTRA_ITEMS", []) or None,
4978
),
5079
],
80+
otel_processors=otel_processors,
5181
)
5282

5383
# Prometheus multiproc support

src/common/core/otel.py

Lines changed: 204 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,204 @@
1+
import contextlib
2+
import json
3+
from collections.abc import Generator
4+
from datetime import datetime, timezone
5+
from importlib.metadata import version
6+
from typing import cast
7+
8+
import inflection
9+
import structlog
10+
from opentelemetry import baggage, trace
11+
from opentelemetry import context as otel_context
12+
from opentelemetry._logs import SeverityNumber
13+
from opentelemetry.baggage.propagation import W3CBaggagePropagator
14+
from opentelemetry.exporter.otlp.proto.http._log_exporter import (
15+
OTLPLogExporter,
16+
)
17+
from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
18+
OTLPSpanExporter,
19+
)
20+
from opentelemetry.instrumentation.django import DjangoInstrumentor
21+
from opentelemetry.instrumentation.psycopg2 import Psycopg2Instrumentor
22+
from opentelemetry.instrumentation.redis import RedisInstrumentor
23+
from opentelemetry.propagate import set_global_textmap
24+
from opentelemetry.propagators.composite import CompositePropagator
25+
from opentelemetry.propagators.textmap import TextMapPropagator
26+
from opentelemetry.sdk._logs import LoggerProvider
27+
from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
28+
from opentelemetry.sdk.resources import Resource
29+
from opentelemetry.sdk.trace import TracerProvider
30+
from opentelemetry.sdk.trace.export import BatchSpanProcessor
31+
from opentelemetry.trace.propagation.tracecontext import (
32+
TraceContextTextMapPropagator,
33+
)
34+
from opentelemetry.util.types import AnyValue, Attributes
35+
from structlog.typing import EventDict, Processor
36+
37+
# Maps a structlog level name to its OTel severity number.
#
# ``warn``, ``exception`` and ``fatal`` are listed alongside the canonical
# names because the OTel processor in this module falls back to the structlog
# *method name* when the event dict carries no ``level`` key; without these
# aliases such events would be reported with the TRACE fallback severity.
_SEVERITY_MAP: dict[str, SeverityNumber] = {
    "debug": SeverityNumber.DEBUG,
    "info": SeverityNumber.INFO,
    "warning": SeverityNumber.WARN,
    "warn": SeverityNumber.WARN,
    "error": SeverityNumber.ERROR,
    "exception": SeverityNumber.ERROR,
    "critical": SeverityNumber.FATAL,
    "fatal": SeverityNumber.FATAL,
}
44+
45+
_RESERVED_KEYS = frozenset(
46+
[
47+
"event",
48+
"level",
49+
"timestamp",
50+
"logger",
51+
"trace_id",
52+
"span_id",
53+
]
54+
)
55+
56+
57+
def add_otel_trace_context(
    logger: structlog.types.WrappedLogger,
    method_name: str,
    event_dict: EventDict,
) -> EventDict:
    """Structlog processor that tags events with the active span's identifiers.

    When the current OTel span has a valid span context, ``trace_id`` and
    ``span_id`` are written to the event dict as zero-padded lowercase hex
    strings (32 and 16 digits respectively). Otherwise the event dict is
    returned untouched.
    """
    span_context = trace.get_current_span().get_span_context()
    if not span_context or not span_context.is_valid:
        return event_dict

    event_dict.update(
        trace_id=format(span_context.trace_id, "032x"),
        span_id=format(span_context.span_id, "016x"),
    )
    return event_dict
69+
70+
71+
def make_structlog_otel_processor(logger_provider: LoggerProvider) -> Processor:
    """Create a structlog processor that emits log records to OpenTelemetry.

    Sits in the processor chain *before* the final renderer so that
    only structlog-originated logs reach OTel. Passes the event_dict
    through unchanged so downstream processors (console/JSON renderers)
    still work normally.

    Pass the returned processor to :func:`~common.core.logging.setup_logging`
    via ``otel_processors``.

    Args:
        logger_provider: Provider used to obtain the OTel logger that the
            records are emitted through.

    Returns:
        A structlog processor that emits one OTel log record per event and,
        when the current span is recording, a span event with the same name
        and attributes.
    """
    otel_logger = logger_provider.get_logger(__name__, version("flagsmith-common"))

    def processor(
        logger: structlog.types.WrappedLogger,
        method_name: str,
        event_dict: EventDict,
    ) -> EventDict:
        attributes = map_event_dict_to_otel_attributes(event_dict)

        # Copy W3C baggage entries into log attributes so downstream
        # exporters can access them.
        ctx = otel_context.get_current()
        for key, value in baggage.get_all(ctx).items():
            attributes[key] = str(value)

        # structlog accepts arbitrary objects as the event; the event name
        # derivation below needs text, so stringify anything that is not
        # already a string (``None`` is kept and treated as "no event").
        event = event_dict.get("event", "")
        body = event if event is None or isinstance(event, str) else str(event)

        logger_name = event_dict.get("logger")
        event_name = inflection.underscore(body) if body else "unknown"
        if logger_name:
            event_name = f"{logger_name}.{event_name}"

        # Some observability platforms don't surface OTel's EventName.
        # Keep a custom attribute for better visibility.
        attributes["flagsmith.event"] = event_name

        # Fall back to the structlog method name when no earlier processor
        # has populated ``level``.
        log_level = event_dict.get("level", method_name)

        otel_logger.emit(
            timestamp=int(datetime.now(timezone.utc).timestamp() * 1e9),
            context=ctx,
            severity_text=log_level,
            severity_number=_SEVERITY_MAP.get(log_level, SeverityNumber.TRACE),
            body=body,
            event_name=event_name,
            attributes=attributes,
        )

        # Also attach as a span event if there's an active span.
        span = trace.get_current_span()
        if span.is_recording():
            # AnyValue is a superset of AttributeValue at runtime;
            # the cast keeps mypy happy.
            span.add_event(event_name, attributes=cast(Attributes, attributes))

        return event_dict

    return processor
129+
130+
131+
def map_event_dict_to_otel_attributes(event_dict: EventDict) -> dict[str, AnyValue]:
    """Build the OTel attribute mapping for a structlog event dict.

    Keys listed in ``_RESERVED_KEYS`` are dropped, every ``__`` in the
    remaining keys is rewritten to ``.``, and each value is passed through
    :func:`map_value_to_otel_value`.
    """
    attributes: dict[str, AnyValue] = {}
    for key, value in event_dict.items():
        if key in _RESERVED_KEYS:
            continue
        attributes[key.replace("__", ".")] = map_value_to_otel_value(value)
    return attributes
137+
138+
139+
def map_value_to_otel_value(value: object) -> str | int | float | bool:
140+
"""Coerce a value to an OTel-attribute-compatible type."""
141+
if isinstance(value, (bool, str, int, float)):
142+
return value
143+
return json.dumps(value, default=str)
144+
145+
146+
def build_otel_log_provider(*, endpoint: str, service_name: str) -> LoggerProvider:
    """Build a ``LoggerProvider`` that batches log records to an OTLP/HTTP exporter.

    Args:
        endpoint: URL handed to ``OTLPLogExporter``.
        service_name: Value of the ``service.name`` resource attribute.

    Returns:
        The configured provider.
    """
    log_provider = LoggerProvider(
        resource=Resource.create({"service.name": service_name}),
    )
    batch_processor = BatchLogRecordProcessor(OTLPLogExporter(endpoint=endpoint))
    log_provider.add_log_record_processor(batch_processor)
    return log_provider
153+
154+
155+
def build_tracer_provider(*, endpoint: str, service_name: str) -> TracerProvider:
    """Build a ``TracerProvider`` that batches spans to an OTLP/HTTP exporter.

    Args:
        endpoint: URL handed to ``OTLPSpanExporter``.
        service_name: Value of the ``service.name`` resource attribute.

    Returns:
        The configured provider.
    """
    provider = TracerProvider(
        resource=Resource.create({"service.name": service_name}),
    )
    batch_processor = BatchSpanProcessor(OTLPSpanExporter(endpoint=endpoint))
    provider.add_span_processor(batch_processor)
    return provider
162+
163+
164+
@contextlib.contextmanager
def setup_tracing(
    tracer_provider: TracerProvider,
    excluded_urls: str | None = None,
) -> Generator[None, None, None]:
    """Set up and tear down OTel distributed tracing with auto-instrumentation.

    Sets the global TracerProvider, configures W3C trace context +
    baggage propagation, and instruments Django, psycopg2 and Redis.

    On exit, uninstruments Redis, psycopg2 and Django (in that order) and
    shuts down the tracer provider. The same teardown runs for whatever was
    already set up if one of the instrumentors fails to activate, and a
    failure in one teardown step does not prevent the remaining steps.

    Must be called *before* Django's WSGI app is created.

    Args:
        tracer_provider: The TracerProvider to use.
        excluded_urls: Comma-separated URL paths to exclude from tracing
            (e.g. ``"health/liveness,health/readiness"``). If not provided,
            falls back to the ``OTEL_PYTHON_DJANGO_EXCLUDED_URLS`` env var.
    """
    trace.set_tracer_provider(tracer_provider)

    propagator: TextMapPropagator = CompositePropagator(
        [
            TraceContextTextMapPropagator(),
            W3CBaggagePropagator(),
        ]
    )
    set_global_textmap(propagator)

    with contextlib.ExitStack() as stack:
        # ExitStack runs callbacks in reverse registration order, so the
        # provider is shut down last, after every instrumentor is removed.
        stack.callback(tracer_provider.shutdown)

        # Each uninstrument callback is registered only after the matching
        # instrument() call succeeded, so a partial setup is unwound exactly.
        django_instrumentor = DjangoInstrumentor()
        django_instrumentor.instrument(excluded_urls=excluded_urls)
        stack.callback(django_instrumentor.uninstrument)

        psycopg2_instrumentor = Psycopg2Instrumentor()
        psycopg2_instrumentor.instrument(enable_commenter=True, skip_dep_check=True)
        stack.callback(psycopg2_instrumentor.uninstrument)

        redis_instrumentor = RedisInstrumentor()
        redis_instrumentor.instrument()
        stack.callback(redis_instrumentor.uninstrument)

        yield

src/common/gunicorn/middleware.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
11
from typing import Callable
22

33
from django.http import HttpRequest, HttpResponse
4+
from opentelemetry import trace
45

56
from common.gunicorn.utils import get_route_template, log_extra
67

78

89
class RouteLoggerMiddleware:
910
"""
1011
Make the resolved Django route available to the WSGI server
11-
(e.g. Gunicorn) for logging purposes.
12+
(e.g. Gunicorn) for logging and tracing purposes.
1213
"""
1314

1415
def __init__(
@@ -21,10 +22,15 @@ def __call__(self, request: HttpRequest) -> HttpResponse:
2122
response = self.get_response(request)
2223

2324
if resolver_match := request.resolver_match:
25+
route_template = get_route_template(resolver_match.route)
2426
log_extra(
2527
request=request,
2628
key="route",
27-
value=get_route_template(resolver_match.route),
29+
value=route_template,
2830
)
31+
span = trace.get_current_span()
32+
if span.is_recording():
33+
span.update_name(f"{request.method} {route_template}")
34+
span.set_attribute("http.route", route_template)
2935

3036
return response

0 commit comments

Comments
 (0)