From 8ea25d20dc4bcc43f11f2727dd2f8bbeb84683d1 Mon Sep 17 00:00:00 2001 From: Sam Calder-Mason Date: Thu, 4 Jun 2026 13:59:31 +1000 Subject: [PATCH] =?UTF-8?q?telemetry:=20ship=20edge=20logs=20via=20Vector?= =?UTF-8?q?=E2=86=92OTLP,=20drop=20Loki=20+=20otelcol?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the Loki log sink (and the dormant otelcol-contrib + clickhouse HTTP experiments) with Vector's OTLP logs pipeline to the prod analytics gateway, parsing log level into OTel severity at ingest. otelcol-contrib is removed at the edge via the role's cleanup. Ingress identity (auth user + ingress_user tag) is derived from ethereum_network_name instead of secret_loki.username, so attribution stays correct when the sops username isn't bumped between devnet iterations. Bump Vector to 0.56.0. Applies to the live inventories. --- .../devnet-3/group_vars/all/all.yaml | 224 +++++++++------- .../devnet-3/group_vars/all/images.yaml | 2 +- .../devnet-6/group_vars/all/all.yaml | 247 +++++++++--------- .../devnet-6/group_vars/all/images.yaml | 2 +- .../devnet-7/group_vars/all/all.yaml | 247 +++++++++--------- .../devnet-7/group_vars/all/images.yaml | 2 +- 6 files changed, 383 insertions(+), 341 deletions(-) diff --git a/ansible/inventories/devnet-3/group_vars/all/all.yaml b/ansible/inventories/devnet-3/group_vars/all/all.yaml index eca60cd..a69ef91 100644 --- a/ansible/inventories/devnet-3/group_vars/all/all.yaml +++ b/ansible/inventories/devnet-3/group_vars/all/all.yaml @@ -277,15 +277,20 @@ docker_nginx_proxy_wildcard_cert: "{{ network_server_subdomain }}" docker_nginx_proxy_wildcard_cert_url: "http://cert.{{ network_server_subdomain }}/{{ network_server_subdomain }}-latest.tar.enc" docker_nginx_proxy_wildcard_cert_psk: "{{ secret_cert_encryption_psk }}" -# role: ethpandaops.general.otelcol_contrib +# OTLP egress to the prod analytics gateway (Vector ships container logs here). 
+# Ingress identity (auth username + ingress_user tag) is the current devnet name +# rather than secret_loki.username: the sops username drifts stale between devnet +# iterations, while the gateway only validates the (constant) password — so using +# the network name keeps log attribution correct no matter what sops holds. otlp_endpoint: "https://otlp.analytics.production.platform.ethpandaops.io" otlp_deployment_env: production -otelcol_contrib_container_networks: "{{ docker_networks_shared }}" +# otelcol-contrib is no longer used at the edge; the role removes its container. +otelcol_contrib_cleanup: true # role: ethpandaops.general.vector -vector_container_networks: "{{ docker_networks_shared }}" vector_config: | + # Docker container logs (clean per-container metadata straight from the Docker API) [sources.in] type = "docker_logs" exclude_containers = [ @@ -298,97 +303,128 @@ vector_config: | "snooper-", ] - [sinks.loki] - type = "loki" + # Shape docker_logs events into an OTLP resourceLogs envelope with full metadata. + [transforms.otel_shape] + type = "remap" inputs = ["in"] - out_of_order_action = "accept" - labels.forwarder = "vector" - labels.instance = "{{ inventory_hostname }}" - labels.network = "{{ ethereum_network_name }}" - labels.testnet = "{{ ethereum_network_name }}" - labels.ingress_user = "{{ secret_loki.username }}" - labels.container_name = "{{ '{{ container_name }}' }}" - {%- if ethereum_node_el is defined +%} - labels.ethereum_el = "{{ ethereum_node_el }}" - {%- endif +%} + source = ''' + ts_ns = to_string(to_unix_timestamp(now(), unit: "nanoseconds")) + if is_timestamp(.timestamp) { + ts_ns = to_string(to_unix_timestamp!(.timestamp, unit: "nanoseconds")) + } + msg = string(.message) ?? "" + ctr = string(.container_name) ?? "unknown" + img = string(.image) ?? "" + strm = string(.stream) ?? 
"stdout" + + # Extract the source log level: JSON-structured lines (.level / .severity) + # first, then logfmt (level=xxx), then a level token near the start of a + # plain-text line. Client formats vary (lighthouse truncates to 4 chars: + # DEBG/ERRO/CRIT), so the text matcher allows a union of forms. SeverityText + # keeps the source's exact text; only SeverityNumber is normalised to the + # OTel scale (https://opentelemetry.io/docs/specs/otel/logs/data-model/#field-severitynumber). + # Unrecognised lines are left unset rather than guessed. + sevtext = "" + if starts_with(msg, "{") { + j = parse_json(msg) ?? {} + if is_string(j.level) { + sevtext = string!(j.level) + } else if is_string(j.severity) { + sevtext = string!(j.severity) + } + } + if sevtext == "" { + lm = parse_regex(msg, r'(?i)\b(?:level|lvl|severity)="?(?P<l>[a-z]+)') ?? {} + if is_string(lm.l) { + sevtext = string!(lm.l) + } else { + hm = parse_regex(truncate(msg, 48), r'(?i)\b(?P<l>TRACE|DEBUG|DEBG|DBUG|NOTICE|INFO|WARNING|WARN|ERROR|ERRO|CRITICAL|CRIT|FATAL|PANIC)\b') ?? 
{} + if is_string(hm.l) { + sevtext = string!(hm.l) + } + } + } + lvl = upcase(sevtext) + sevnum = 0 + if lvl == "TRACE" { + sevnum = 1 + } else if lvl == "DEBUG" || lvl == "DEBG" || lvl == "DBUG" { + sevnum = 5 + } else if lvl == "INFO" || lvl == "INFOR" || lvl == "NOTICE" { + sevnum = 9 + } else if lvl == "WARN" || lvl == "WARNING" { + sevnum = 13 + } else if lvl == "ERROR" || lvl == "ERRO" { + sevnum = 17 + } else if lvl == "CRIT" || lvl == "CRITICAL" { + sevnum = 18 + } else if lvl == "FATAL" || lvl == "PANIC" { + sevnum = 21 + } + + attrs = [ + {"key": "service.name", "value": {"stringValue": ctr}}, + {"key": "container.name", "value": {"stringValue": ctr}}, + {"key": "container.image.name", "value": {"stringValue": img}}, + {"key": "deployment.environment", "value": {"stringValue": "{{ otlp_deployment_env }}"}}, + {"key": "forwarder", "value": {"stringValue": "vector"}}, + {"key": "ingress_user", "value": {"stringValue": "{{ ethereum_network_name }}"}}, + {"key": "network", "value": {"stringValue": "{{ ethereum_network_name }}"}}, + {"key": "testnet", "value": {"stringValue": "{{ ethereum_network_name }}"}}, + {"key": "instance", "value": {"stringValue": "{{ inventory_hostname }}"}}, + {"key": "host.name", "value": {"stringValue": "{{ inventory_hostname }}"}} + ] {%- if ethereum_node_cl is defined +%} - labels.ethereum_cl = "{{ ethereum_node_cl }}" + attrs = push(attrs, {"key": "ethereum_cl", "value": {"stringValue": "{{ ethereum_node_cl }}"}}) {%- endif +%} - encoding.codec = "json" - endpoint = "{{ secret_loki.endpoint }}" - auth.strategy = "basic" - auth.user = "{{ secret_loki.username }}" - auth.password = "{{ secret_loki.password }}" -otelcol_contrib_config: | - extensions: - basicauth/client: - client_auth: - username: {{ secret_loki.username }} - password: {{ secret_loki.password }} - - receivers: - filelog: - include: [/var/lib/docker/containers/*/*-json.log] - include_file_path: true - start_at: end - operators: - - type: container - format: docker - 
add_metadata_from_filepath: true - - type: filter - expr: '(attributes["container.name"] != nil and attributes["container.name"] matches "^(otelcol|ethereum-metrics-exporter|nginx-proxy|node_exporter|prometheus|snooper-.*)$") or body matches "github\\.com/open-telemetry/opentelemetry-collector-contrib|otelcol-contrib"' - - type: json_parser - if: 'body matches "^\\s*\\{"' - on_error: send - severity: - parse_from: attributes.level - overwrite_text: true - mapping: - fatal4: [emergency, emerg] - fatal3: [alert] - fatal2: [critical, crit] - fatal: [panic] - - otlp: - protocols: - grpc: {endpoint: "[::]:4317"} - http: {endpoint: "[::]:4318"} - - processors: - resource: - attributes: - - {key: deployment.environment, value: "{{ otlp_deployment_env }}", action: upsert} - - {key: network, value: "{{ ethereum_network_name }}", action: upsert} - - {key: ingress_user, value: "{{ secret_loki.username }}", action: upsert} - - {key: host.name, value: "{{ inventory_hostname }}", action: upsert} - - transform/service_name: - log_statements: - - context: resource - statements: - - set(attributes["service.name"], attributes["container.name"]) where attributes["container.name"] != nil - - batch: - send_batch_size: 500 - timeout: 5s - - exporters: - otlphttp/staging: - endpoint: "{{ otlp_endpoint }}" - auth: - authenticator: basicauth/client - - otlp/tempo: - endpoint: "{{ tempo_grpc_url | regex_replace('^grpcs?://', '') }}" - - service: - extensions: [basicauth/client] - pipelines: - logs: - receivers: [filelog, otlp] - processors: [resource, transform/service_name, batch] - exporters: [otlphttp/staging] - traces: - receivers: [otlp] - processors: [resource, batch] - exporters: [otlphttp/staging, otlp/tempo] + {%- if ethereum_node_el is defined +%} + attrs = push(attrs, {"key": "ethereum_el", "value": {"stringValue": "{{ ethereum_node_el }}"}}) + {%- endif +%} + . 
= { + "resource_log": { + "resource": {"attributes": attrs}, + "scopeLogs": [{ + "scope": {"name": "{{ ethereum_network_name }}-vector"}, + "logRecords": [{ + "timeUnixNano": ts_ns, + "severityNumber": sevnum, + "severityText": sevtext, + "body": {"stringValue": msg}, + "attributes": [{"key": "stream", "value": {"stringValue": strm}}] + }] + }] + } + } + ''' + + # Batch many shaped log events into one OTLP envelope (reduce works on logs). + [transforms.batch_envelope] + type = "reduce" + inputs = ["otel_shape"] + expire_after_ms = 30000 + end_every_period_ms = 5000 + max_events = 500 + merge_strategies.resource_log = "array" + + [transforms.finalize_envelope] + type = "remap" + inputs = ["batch_envelope"] + source = ''' + . = {"resourceLogs": .resource_log} + ''' + + [sinks.otlp_logs] + type = "opentelemetry" + inputs = ["finalize_envelope"] + [sinks.otlp_logs.protocol] + type = "http" + uri = "{{ otlp_endpoint }}/v1/logs" + method = "post" + encoding.codec = "otlp" + auth.strategy = "basic" + auth.user = "{{ ethereum_network_name }}" + auth.password = "{{ secret_loki.password }}" + # One event here is already a full OTLP envelope (built by the reduce above). + # max_events MUST be 1 — OTLP/HTTP allows one envelope per request. 
+ batch.max_events = 1 + batch.timeout_secs = 5 diff --git a/ansible/inventories/devnet-3/group_vars/all/images.yaml b/ansible/inventories/devnet-3/group_vars/all/images.yaml index 0355ac0..ba8bd68 100644 --- a/ansible/inventories/devnet-3/group_vars/all/images.yaml +++ b/ansible/inventories/devnet-3/group_vars/all/images.yaml @@ -57,7 +57,7 @@ default_tooling_images: nginx_proxy_acme: nginxproxy/acme-companion nginx_proxy_cert_loader: ethpandaops/debian-docker:latest nginx_proxy_cert_linker: nginxproxy/docker-gen - vector: timberio/vector:0.46.1-alpine + vector: timberio/vector:0.56.0-alpine spamoor: ethpandaops/spamoor:pk910-bump-deployment-gas-limits blobber: ethpandaops/blobber:latest syncoor_web: docker.ethquokkaops.io/gh/ethpandaops/syncoor-web:master diff --git a/ansible/inventories/devnet-6/group_vars/all/all.yaml b/ansible/inventories/devnet-6/group_vars/all/all.yaml index 8faee65..4cd5ca7 100644 --- a/ansible/inventories/devnet-6/group_vars/all/all.yaml +++ b/ansible/inventories/devnet-6/group_vars/all/all.yaml @@ -278,17 +278,20 @@ docker_nginx_proxy_wildcard_cert: "{{ network_server_subdomain }}" docker_nginx_proxy_wildcard_cert_url: "http://cert.{{ network_server_subdomain }}/{{ network_server_subdomain }}-latest.tar.enc" docker_nginx_proxy_wildcard_cert_psk: "{{ secret_cert_encryption_psk }}" -# role: ethpandaops.general.otelcol_contrib +# OTLP egress to the prod analytics gateway (Vector ships container logs here). +# Ingress identity (auth username + ingress_user tag) is the current devnet name +# rather than secret_loki.username: the sops username drifts stale between devnet +# iterations, while the gateway only validates the (constant) password — so using +# the network name keeps log attribution correct no matter what sops holds. 
otlp_endpoint: "https://otlp.analytics.production.platform.ethpandaops.io" otlp_deployment_env: production -otelcol_contrib_container_networks: "{{ docker_networks_shared }}" +# otelcol-contrib is no longer used at the edge; the role removes its container. +otelcol_contrib_cleanup: true # role: ethpandaops.general.vector -clickhouse_logs_enabled: false -clickhouse_logs_endpoint: "https://logs-ingest.analytics.production.platform.ethpandaops.io" -vector_container_networks: "{{ docker_networks_shared }}" vector_config: | + # Docker container logs (clean per-container metadata straight from the Docker API) [sources.in] type = "docker_logs" exclude_containers = [ @@ -301,128 +304,128 @@ vector_config: | "snooper-", ] - [sinks.loki] - type = "loki" + # Shape docker_logs events into an OTLP resourceLogs envelope with full metadata. + [transforms.otel_shape] + type = "remap" inputs = ["in"] - out_of_order_action = "accept" - labels.forwarder = "vector" - labels.instance = "{{ inventory_hostname }}" - labels.network = "{{ ethereum_network_name }}" - labels.testnet = "{{ ethereum_network_name }}" - labels.ingress_user = "{{ secret_loki.username }}" - labels.container_name = "{{ '{{ container_name }}' }}" - {%- if ethereum_node_el is defined +%} - labels.ethereum_el = "{{ ethereum_node_el }}" - {%- endif +%} + source = ''' + ts_ns = to_string(to_unix_timestamp(now(), unit: "nanoseconds")) + if is_timestamp(.timestamp) { + ts_ns = to_string(to_unix_timestamp!(.timestamp, unit: "nanoseconds")) + } + msg = string(.message) ?? "" + ctr = string(.container_name) ?? "unknown" + img = string(.image) ?? "" + strm = string(.stream) ?? "stdout" + + # Extract the source log level: JSON-structured lines (.level / .severity) + # first, then logfmt (level=xxx), then a level token near the start of a + # plain-text line. Client formats vary (lighthouse truncates to 4 chars: + # DEBG/ERRO/CRIT), so the text matcher allows a union of forms. 
SeverityText + # keeps the source's exact text; only SeverityNumber is normalised to the + # OTel scale (https://opentelemetry.io/docs/specs/otel/logs/data-model/#field-severitynumber). + # Unrecognised lines are left unset rather than guessed. + sevtext = "" + if starts_with(msg, "{") { + j = parse_json(msg) ?? {} + if is_string(j.level) { + sevtext = string!(j.level) + } else if is_string(j.severity) { + sevtext = string!(j.severity) + } + } + if sevtext == "" { + lm = parse_regex(msg, r'(?i)\b(?:level|lvl|severity)="?(?P<l>[a-z]+)') ?? {} + if is_string(lm.l) { + sevtext = string!(lm.l) + } else { + hm = parse_regex(truncate(msg, 48), r'(?i)\b(?P<l>TRACE|DEBUG|DEBG|DBUG|NOTICE|INFO|WARNING|WARN|ERROR|ERRO|CRITICAL|CRIT|FATAL|PANIC)\b') ?? {} + if is_string(hm.l) { + sevtext = string!(hm.l) + } + } + } + lvl = upcase(sevtext) + sevnum = 0 + if lvl == "TRACE" { + sevnum = 1 + } else if lvl == "DEBUG" || lvl == "DEBG" || lvl == "DBUG" { + sevnum = 5 + } else if lvl == "INFO" || lvl == "INFOR" || lvl == "NOTICE" { + sevnum = 9 + } else if lvl == "WARN" || lvl == "WARNING" { + sevnum = 13 + } else if lvl == "ERROR" || lvl == "ERRO" { + sevnum = 17 + } else if lvl == "CRIT" || lvl == "CRITICAL" { + sevnum = 18 + } else if lvl == "FATAL" || lvl == "PANIC" { + sevnum = 21 + } + + attrs = [ + {"key": "service.name", "value": {"stringValue": ctr}}, + {"key": "container.name", "value": {"stringValue": ctr}}, + {"key": "container.image.name", "value": {"stringValue": img}}, + {"key": "deployment.environment", "value": {"stringValue": "{{ otlp_deployment_env }}"}}, + {"key": "forwarder", "value": {"stringValue": "vector"}}, + {"key": "ingress_user", "value": {"stringValue": "{{ ethereum_network_name }}"}}, + {"key": "network", "value": {"stringValue": "{{ ethereum_network_name }}"}}, + {"key": "testnet", "value": {"stringValue": "{{ ethereum_network_name }}"}}, + {"key": "instance", "value": {"stringValue": "{{ inventory_hostname }}"}}, + {"key": "host.name", "value": 
{"stringValue": "{{ inventory_hostname }}"}} + ] {%- if ethereum_node_cl is defined +%} - labels.ethereum_cl = "{{ ethereum_node_cl }}" + attrs = push(attrs, {"key": "ethereum_cl", "value": {"stringValue": "{{ ethereum_node_cl }}"}}) {%- endif +%} - encoding.codec = "json" - endpoint = "{{ secret_loki.endpoint }}" - auth.strategy = "basic" - auth.user = "{{ secret_loki.username }}" - auth.password = "{{ secret_loki.password }}" - {%- if clickhouse_logs_enabled | default(false) +%} + {%- if ethereum_node_el is defined +%} + attrs = push(attrs, {"key": "ethereum_el", "value": {"stringValue": "{{ ethereum_node_el }}"}}) + {%- endif +%} + . = { + "resource_log": { + "resource": {"attributes": attrs}, + "scopeLogs": [{ + "scope": {"name": "{{ ethereum_network_name }}-vector"}, + "logRecords": [{ + "timeUnixNano": ts_ns, + "severityNumber": sevnum, + "severityText": sevtext, + "body": {"stringValue": msg}, + "attributes": [{"key": "stream", "value": {"stringValue": strm}}] + }] + }] + } + } + ''' - [transforms.clickhouse_shape] + # Batch many shaped log events into one OTLP envelope (reduce works on logs). + [transforms.batch_envelope] + type = "reduce" + inputs = ["otel_shape"] + expire_after_ms = 30000 + end_every_period_ms = 5000 + max_events = 500 + merge_strategies.resource_log = "array" + + [transforms.finalize_envelope] type = "remap" - inputs = ["in"] + inputs = ["batch_envelope"] source = ''' - .IngressUser = "{{ secret_loki.username }}" - .Namespace = "" - .Pod = "" - .Container = string(.container_name) ?? "" - .Node = "{{ inventory_hostname }}" - .Stream = string(.stream) ?? "" - .Message = string(.message) ?? "" - .Timestamp = .timestamp - del(.container_name); del(.container_id); del(.container_created_at) - del(.image); del(.host); del(.label); del(.source_type) - del(.stream); del(.message); del(.timestamp) + . 
= {"resourceLogs": .resource_log} ''' - [sinks.clickhouse_logs] - type = "http" - inputs = ["clickhouse_shape"] - uri = "{{ clickhouse_logs_endpoint }}" - method = "post" - encoding.codec = "json" - auth.strategy = "basic" - auth.user = "{{ secret_loki.username }}" - auth.password = "{{ secret_loki.password }}" - batch.max_events = 5000 - batch.timeout_secs = 3 - {%- endif +%} -otelcol_contrib_config: | - extensions: - basicauth/client: - client_auth: - username: {{ secret_loki.username }} - password: {{ secret_loki.password }} - - receivers: - filelog: - include: [/var/lib/docker/containers/*/*-json.log] - include_file_path: true - start_at: end - operators: - - type: container - format: docker - add_metadata_from_filepath: true - - type: filter - expr: '(attributes["container.name"] != nil and attributes["container.name"] matches "^(otelcol|ethereum-metrics-exporter|nginx-proxy|node_exporter|prometheus|snooper-.*)$") or body matches "github\\.com/open-telemetry/opentelemetry-collector-contrib|otelcol-contrib"' - - type: json_parser - if: 'body matches "^\\s*\\{"' - on_error: send - severity: - parse_from: attributes.level - overwrite_text: true - mapping: - fatal4: [emergency, emerg] - fatal3: [alert] - fatal2: [critical, crit] - fatal: [panic] - - otlp: - protocols: - grpc: {endpoint: "[::]:4317"} - http: {endpoint: "[::]:4318"} - - processors: - resource: - attributes: - - {key: deployment.environment, value: "{{ otlp_deployment_env }}", action: upsert} - - {key: network, value: "{{ ethereum_network_name }}", action: upsert} - - {key: ingress_user, value: "{{ secret_loki.username }}", action: upsert} - - {key: host.name, value: "{{ inventory_hostname }}", action: upsert} - - transform/service_name: - log_statements: - - context: resource - statements: - - set(attributes["service.name"], attributes["container.name"]) where attributes["container.name"] != nil - - batch: - send_batch_size: 500 - timeout: 5s - - exporters: - otlphttp/staging: - endpoint: "{{ 
otlp_endpoint }}" - auth: - authenticator: basicauth/client - - otlp/tempo: - endpoint: "{{ tempo_grpc_url | regex_replace('^grpcs?://', '') }}" - - service: - extensions: [basicauth/client] - pipelines: - logs: - receivers: [filelog, otlp] - processors: [resource, transform/service_name, batch] - exporters: [otlphttp/staging] - traces: - receivers: [otlp] - processors: [resource, batch] - exporters: [otlphttp/staging, otlp/tempo] + [sinks.otlp_logs] + type = "opentelemetry" + inputs = ["finalize_envelope"] + [sinks.otlp_logs.protocol] + type = "http" + uri = "{{ otlp_endpoint }}/v1/logs" + method = "post" + encoding.codec = "otlp" + auth.strategy = "basic" + auth.user = "{{ ethereum_network_name }}" + auth.password = "{{ secret_loki.password }}" + # One event here is already a full OTLP envelope (built by the reduce above). + # max_events MUST be 1 — OTLP/HTTP allows one envelope per request. + batch.max_events = 1 + batch.timeout_secs = 5 diff --git a/ansible/inventories/devnet-6/group_vars/all/images.yaml b/ansible/inventories/devnet-6/group_vars/all/images.yaml index ec70735..4facf23 100644 --- a/ansible/inventories/devnet-6/group_vars/all/images.yaml +++ b/ansible/inventories/devnet-6/group_vars/all/images.yaml @@ -57,7 +57,7 @@ default_tooling_images: nginx_proxy_acme: nginxproxy/acme-companion nginx_proxy_cert_loader: ethpandaops/debian-docker:latest nginx_proxy_cert_linker: nginxproxy/docker-gen - vector: timberio/vector:0.46.1-alpine + vector: timberio/vector:0.56.0-alpine spamoor: ethpandaops/spamoor:master-latest blobber: ethpandaops/blobber:latest syncoor_web: docker.ethquokkaops.io/gh/ethpandaops/syncoor-web:master diff --git a/ansible/inventories/devnet-7/group_vars/all/all.yaml b/ansible/inventories/devnet-7/group_vars/all/all.yaml index 494a143..28b8269 100644 --- a/ansible/inventories/devnet-7/group_vars/all/all.yaml +++ b/ansible/inventories/devnet-7/group_vars/all/all.yaml @@ -278,17 +278,20 @@ docker_nginx_proxy_wildcard_cert: "{{ 
network_server_subdomain }}" docker_nginx_proxy_wildcard_cert_url: "http://cert.{{ network_server_subdomain }}/{{ network_server_subdomain }}-latest.tar.enc" docker_nginx_proxy_wildcard_cert_psk: "{{ secret_cert_encryption_psk }}" -# role: ethpandaops.general.otelcol_contrib +# OTLP egress to the prod analytics gateway (Vector ships container logs here). +# Ingress identity (auth username + ingress_user tag) is the current devnet name +# rather than secret_loki.username: the sops username drifts stale between devnet +# iterations, while the gateway only validates the (constant) password — so using +# the network name keeps log attribution correct no matter what sops holds. otlp_endpoint: "https://otlp.analytics.production.platform.ethpandaops.io" otlp_deployment_env: production -otelcol_contrib_container_networks: "{{ docker_networks_shared }}" +# otelcol-contrib is no longer used at the edge; the role removes its container. +otelcol_contrib_cleanup: true # role: ethpandaops.general.vector -clickhouse_logs_enabled: false -clickhouse_logs_endpoint: "https://logs-ingest.analytics.production.platform.ethpandaops.io" -vector_container_networks: "{{ docker_networks_shared }}" vector_config: | + # Docker container logs (clean per-container metadata straight from the Docker API) [sources.in] type = "docker_logs" exclude_containers = [ @@ -301,128 +304,128 @@ vector_config: | "snooper-", ] - [sinks.loki] - type = "loki" + # Shape docker_logs events into an OTLP resourceLogs envelope with full metadata. 
+ [transforms.otel_shape] + type = "remap" inputs = ["in"] - out_of_order_action = "accept" - labels.forwarder = "vector" - labels.instance = "{{ inventory_hostname }}" - labels.network = "{{ ethereum_network_name }}" - labels.testnet = "{{ ethereum_network_name }}" - labels.ingress_user = "{{ secret_loki.username }}" - labels.container_name = "{{ '{{ container_name }}' }}" - {%- if ethereum_node_el is defined +%} - labels.ethereum_el = "{{ ethereum_node_el }}" - {%- endif +%} + source = ''' + ts_ns = to_string(to_unix_timestamp(now(), unit: "nanoseconds")) + if is_timestamp(.timestamp) { + ts_ns = to_string(to_unix_timestamp!(.timestamp, unit: "nanoseconds")) + } + msg = string(.message) ?? "" + ctr = string(.container_name) ?? "unknown" + img = string(.image) ?? "" + strm = string(.stream) ?? "stdout" + + # Extract the source log level: JSON-structured lines (.level / .severity) + # first, then logfmt (level=xxx), then a level token near the start of a + # plain-text line. Client formats vary (lighthouse truncates to 4 chars: + # DEBG/ERRO/CRIT), so the text matcher allows a union of forms. SeverityText + # keeps the source's exact text; only SeverityNumber is normalised to the + # OTel scale (https://opentelemetry.io/docs/specs/otel/logs/data-model/#field-severitynumber). + # Unrecognised lines are left unset rather than guessed. + sevtext = "" + if starts_with(msg, "{") { + j = parse_json(msg) ?? {} + if is_string(j.level) { + sevtext = string!(j.level) + } else if is_string(j.severity) { + sevtext = string!(j.severity) + } + } + if sevtext == "" { + lm = parse_regex(msg, r'(?i)\b(?:level|lvl|severity)="?(?P<l>[a-z]+)') ?? {} + if is_string(lm.l) { + sevtext = string!(lm.l) + } else { + hm = parse_regex(truncate(msg, 48), r'(?i)\b(?P<l>TRACE|DEBUG|DEBG|DBUG|NOTICE|INFO|WARNING|WARN|ERROR|ERRO|CRITICAL|CRIT|FATAL|PANIC)\b') ?? 
{} + if is_string(hm.l) { + sevtext = string!(hm.l) + } + } + } + lvl = upcase(sevtext) + sevnum = 0 + if lvl == "TRACE" { + sevnum = 1 + } else if lvl == "DEBUG" || lvl == "DEBG" || lvl == "DBUG" { + sevnum = 5 + } else if lvl == "INFO" || lvl == "INFOR" || lvl == "NOTICE" { + sevnum = 9 + } else if lvl == "WARN" || lvl == "WARNING" { + sevnum = 13 + } else if lvl == "ERROR" || lvl == "ERRO" { + sevnum = 17 + } else if lvl == "CRIT" || lvl == "CRITICAL" { + sevnum = 18 + } else if lvl == "FATAL" || lvl == "PANIC" { + sevnum = 21 + } + + attrs = [ + {"key": "service.name", "value": {"stringValue": ctr}}, + {"key": "container.name", "value": {"stringValue": ctr}}, + {"key": "container.image.name", "value": {"stringValue": img}}, + {"key": "deployment.environment", "value": {"stringValue": "{{ otlp_deployment_env }}"}}, + {"key": "forwarder", "value": {"stringValue": "vector"}}, + {"key": "ingress_user", "value": {"stringValue": "{{ ethereum_network_name }}"}}, + {"key": "network", "value": {"stringValue": "{{ ethereum_network_name }}"}}, + {"key": "testnet", "value": {"stringValue": "{{ ethereum_network_name }}"}}, + {"key": "instance", "value": {"stringValue": "{{ inventory_hostname }}"}}, + {"key": "host.name", "value": {"stringValue": "{{ inventory_hostname }}"}} + ] {%- if ethereum_node_cl is defined +%} - labels.ethereum_cl = "{{ ethereum_node_cl }}" + attrs = push(attrs, {"key": "ethereum_cl", "value": {"stringValue": "{{ ethereum_node_cl }}"}}) {%- endif +%} - encoding.codec = "json" - endpoint = "{{ secret_loki.endpoint }}" - auth.strategy = "basic" - auth.user = "{{ secret_loki.username }}" - auth.password = "{{ secret_loki.password }}" - {%- if clickhouse_logs_enabled | default(false) +%} + {%- if ethereum_node_el is defined +%} + attrs = push(attrs, {"key": "ethereum_el", "value": {"stringValue": "{{ ethereum_node_el }}"}}) + {%- endif +%} + . 
= { + "resource_log": { + "resource": {"attributes": attrs}, + "scopeLogs": [{ + "scope": {"name": "{{ ethereum_network_name }}-vector"}, + "logRecords": [{ + "timeUnixNano": ts_ns, + "severityNumber": sevnum, + "severityText": sevtext, + "body": {"stringValue": msg}, + "attributes": [{"key": "stream", "value": {"stringValue": strm}}] + }] + }] + } + } + ''' - [transforms.clickhouse_shape] + # Batch many shaped log events into one OTLP envelope (reduce works on logs). + [transforms.batch_envelope] + type = "reduce" + inputs = ["otel_shape"] + expire_after_ms = 30000 + end_every_period_ms = 5000 + max_events = 500 + merge_strategies.resource_log = "array" + + [transforms.finalize_envelope] type = "remap" - inputs = ["in"] + inputs = ["batch_envelope"] source = ''' - .IngressUser = "{{ secret_loki.username }}" - .Namespace = "" - .Pod = "" - .Container = string(.container_name) ?? "" - .Node = "{{ inventory_hostname }}" - .Stream = string(.stream) ?? "" - .Message = string(.message) ?? "" - .Timestamp = .timestamp - del(.container_name); del(.container_id); del(.container_created_at) - del(.image); del(.host); del(.label); del(.source_type) - del(.stream); del(.message); del(.timestamp) + . 
= {"resourceLogs": .resource_log} ''' - [sinks.clickhouse_logs] - type = "http" - inputs = ["clickhouse_shape"] - uri = "{{ clickhouse_logs_endpoint }}" - method = "post" - encoding.codec = "json" - auth.strategy = "basic" - auth.user = "{{ secret_loki.username }}" - auth.password = "{{ secret_loki.password }}" - batch.max_events = 5000 - batch.timeout_secs = 3 - {%- endif +%} -otelcol_contrib_config: | - extensions: - basicauth/client: - client_auth: - username: {{ secret_loki.username }} - password: {{ secret_loki.password }} - - receivers: - filelog: - include: [/var/lib/docker/containers/*/*-json.log] - include_file_path: true - start_at: end - operators: - - type: container - format: docker - add_metadata_from_filepath: true - - type: filter - expr: 'attributes["container.name"] != nil and attributes["container.name"] matches "^(otelcol|ethereum-metrics-exporter|nginx-proxy|node_exporter|prometheus|snooper-.*)$"' - - type: json_parser - if: 'body matches "^\\s*\\{"' - on_error: send - severity: - parse_from: attributes.level - overwrite_text: true - mapping: - fatal4: [emergency, emerg] - fatal3: [alert] - fatal2: [critical, crit] - fatal: [panic] - - otlp: - protocols: - grpc: {endpoint: "[::]:4317"} - http: {endpoint: "[::]:4318"} - - processors: - resource: - attributes: - - {key: deployment.environment, value: "{{ otlp_deployment_env }}", action: upsert} - - {key: network, value: "{{ ethereum_network_name }}", action: upsert} - - {key: ingress_user, value: "{{ secret_loki.username }}", action: upsert} - - {key: host.name, value: "{{ inventory_hostname }}", action: upsert} - - transform/service_name: - log_statements: - - context: resource - statements: - - set(attributes["service.name"], attributes["container.name"]) where attributes["container.name"] != nil - - batch: - send_batch_size: 500 - timeout: 5s - - exporters: - otlphttp/staging: - endpoint: "{{ otlp_endpoint }}" - auth: - authenticator: basicauth/client - - otlp/tempo: - endpoint: "{{ 
tempo_grpc_url | regex_replace('^grpcs?://', '') }}" - - service: - extensions: [basicauth/client] - pipelines: - logs: - receivers: [filelog, otlp] - processors: [resource, transform/service_name, batch] - exporters: [otlphttp/staging] - traces: - receivers: [otlp] - processors: [resource, batch] - exporters: [otlphttp/staging, otlp/tempo] + [sinks.otlp_logs] + type = "opentelemetry" + inputs = ["finalize_envelope"] + [sinks.otlp_logs.protocol] + type = "http" + uri = "{{ otlp_endpoint }}/v1/logs" + method = "post" + encoding.codec = "otlp" + auth.strategy = "basic" + auth.user = "{{ ethereum_network_name }}" + auth.password = "{{ secret_loki.password }}" + # One event here is already a full OTLP envelope (built by the reduce above). + # max_events MUST be 1 — OTLP/HTTP allows one envelope per request. + batch.max_events = 1 + batch.timeout_secs = 5 diff --git a/ansible/inventories/devnet-7/group_vars/all/images.yaml b/ansible/inventories/devnet-7/group_vars/all/images.yaml index 961beec..e552b7a 100644 --- a/ansible/inventories/devnet-7/group_vars/all/images.yaml +++ b/ansible/inventories/devnet-7/group_vars/all/images.yaml @@ -59,7 +59,7 @@ default_tooling_images: nginx_proxy_acme: nginxproxy/acme-companion nginx_proxy_cert_loader: ethpandaops/debian-docker:latest nginx_proxy_cert_linker: nginxproxy/docker-gen - vector: timberio/vector:0.46.1-alpine + vector: timberio/vector:0.56.0-alpine spamoor: ethpandaops/spamoor:master-latest blobber: ethpandaops/blobber:latest syncoor_web: docker.ethquokkaops.io/gh/ethpandaops/syncoor-web:master