From 586728d365c6688241e03bffde95b5fb77c347d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Attila=20M=C3=A9sz=C3=A1ros?= Date: Wed, 4 Feb 2026 17:07:35 +0100 Subject: [PATCH 01/18] feat: observability with otel and default grafana dashboard MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Attila Mészáros --- observability/install-observability.sh | 240 +++++++++++++++++++++++++ 1 file changed, 240 insertions(+) create mode 100755 observability/install-observability.sh diff --git a/observability/install-observability.sh b/observability/install-observability.sh new file mode 100755 index 0000000000..2db117e0cd --- /dev/null +++ b/observability/install-observability.sh @@ -0,0 +1,240 @@ +#!/bin/bash + +set -e + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +echo -e "${GREEN}========================================${NC}" +echo -e "${GREEN}Installing Observability Stack${NC}" +echo -e "${GREEN}OpenTelemetry + Prometheus + Grafana${NC}" +echo -e "${GREEN}========================================${NC}" + +# Check if helm is installed +echo -e "\n${YELLOW}Checking helm installation...${NC}" +if ! 
command -v helm &> /dev/null; then + echo -e "${RED}Error: helm is not installed${NC}" + echo "Please install helm: https://helm.sh/docs/intro/install/" + exit 1 +fi +echo -e "${GREEN}✓ helm is installed${NC}" + +# Add Helm repositories +echo -e "\n${YELLOW}Adding Helm repositories...${NC}" +helm repo add jetstack https://charts.jetstack.io +helm repo add open-telemetry https://open-telemetry.github.io/opentelemetry-helm-charts +helm repo add prometheus-community https://prometheus-community.github.io/helm-charts +helm repo update +echo -e "${GREEN}✓ Helm repositories added${NC}" + +# Install cert-manager (required for OpenTelemetry Operator) +echo -e "\n${YELLOW}Installing cert-manager...${NC}" +if kubectl get namespace cert-manager > /dev/null 2>&1; then + echo -e "${YELLOW}cert-manager namespace already exists, skipping...${NC}" +else + kubectl create namespace cert-manager + helm install cert-manager jetstack/cert-manager \ + --namespace cert-manager \ + --set crds.enabled=true \ + --wait + echo -e "${GREEN}✓ cert-manager installed${NC}" +fi + +# Create observability namespace +echo -e "\n${YELLOW}Creating observability namespace...${NC}" +kubectl create namespace observability --dry-run=client -o yaml | kubectl apply -f - +echo -e "${GREEN}✓ observability namespace ready${NC}" + +# Install OpenTelemetry Operator +echo -e "\n${YELLOW}Installing OpenTelemetry Operator...${NC}" +if helm list -n observability | grep -q opentelemetry-operator; then + echo -e "${YELLOW}OpenTelemetry Operator already installed, upgrading...${NC}" + helm upgrade opentelemetry-operator open-telemetry/opentelemetry-operator \ + --namespace observability \ + --set "manager.collectorImage.repository=otel/opentelemetry-collector-contrib" \ + --wait +else + helm install opentelemetry-operator open-telemetry/opentelemetry-operator \ + --namespace observability \ + --set "manager.collectorImage.repository=otel/opentelemetry-collector-contrib" \ + --wait +fi +echo -e "${GREEN}✓ OpenTelemetry 
Operator installed${NC}" + +# Install kube-prometheus-stack (includes Prometheus + Grafana) +echo -e "\n${YELLOW}Installing Prometheus and Grafana stack...${NC}" +if helm list -n observability | grep -q kube-prometheus-stack; then + echo -e "${YELLOW}kube-prometheus-stack already installed, upgrading...${NC}" + helm upgrade kube-prometheus-stack prometheus-community/kube-prometheus-stack \ + --namespace observability \ + --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false \ + --set prometheus.prometheusSpec.podMonitorSelectorNilUsesHelmValues=false \ + --set grafana.adminPassword=admin \ + --wait +else + helm install kube-prometheus-stack prometheus-community/kube-prometheus-stack \ + --namespace observability \ + --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false \ + --set prometheus.prometheusSpec.podMonitorSelectorNilUsesHelmValues=false \ + --set grafana.adminPassword=admin \ + --wait +fi +echo -e "${GREEN}✓ Prometheus and Grafana installed${NC}" + +# Create OpenTelemetry Collector instance +echo -e "\n${YELLOW}Creating OpenTelemetry Collector...${NC}" +cat < Date: Wed, 4 Feb 2026 17:28:34 +0100 Subject: [PATCH 02/18] wip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Attila Mészáros --- observability/install-observability.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/observability/install-observability.sh b/observability/install-observability.sh index 2db117e0cd..e9c42f5968 100755 --- a/observability/install-observability.sh +++ b/observability/install-observability.sh @@ -50,6 +50,7 @@ echo -e "${GREEN}✓ observability namespace ready${NC}" # Install OpenTelemetry Operator echo -e "\n${YELLOW}Installing OpenTelemetry Operator...${NC}" + if helm list -n observability | grep -q opentelemetry-operator; then echo -e "${YELLOW}OpenTelemetry Operator already installed, upgrading...${NC}" helm upgrade opentelemetry-operator 
open-telemetry/opentelemetry-operator \ From afab176eb9cc79bb6b84faa116316f2207782ef8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Attila=20M=C3=A9sz=C3=A1ros?= Date: Wed, 4 Feb 2026 17:54:09 +0100 Subject: [PATCH 03/18] wip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Attila Mészáros --- observability/install-observability.sh | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/observability/install-observability.sh b/observability/install-observability.sh index e9c42f5968..314ee1e4aa 100755 --- a/observability/install-observability.sh +++ b/observability/install-observability.sh @@ -1,4 +1,19 @@ #!/bin/bash +# +# Copyright Java Operator SDK Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# set -e From b0af1e214c010956d9a185ce354cd8649986b5c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Attila=20M=C3=A9sz=C3=A1ros?= Date: Wed, 4 Feb 2026 18:05:06 +0100 Subject: [PATCH 04/18] wip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Attila Mészáros --- .github/workflows/pr.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 7a5964ba35..34bc6c7d0f 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -11,6 +11,7 @@ on: paths-ignore: - 'docs/**' - 'adr/**' + - 'observability/**' workflow_dispatch: jobs: check_format_and_unit_tests: From edd24fe38f1fdc9bd5789530db4813c820e41863 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Attila=20M=C3=A9sz=C3=A1ros?= Date: Sun, 8 Feb 2026 16:17:46 +0100 Subject: [PATCH 05/18] wip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Attila Mészáros --- grafana/README.md | 225 +++++++++++++++ grafana/install-observability.sh | 264 ++++++++++++++++++ sample-operators/webpage/pom.xml | 14 + .../operator/sample/WebPageOperator.java | 49 +++- .../src/main/resources/otlp-config.yaml | 6 + 5 files changed, 557 insertions(+), 1 deletion(-) create mode 100644 grafana/README.md create mode 100755 grafana/install-observability.sh create mode 100644 sample-operators/webpage/src/main/resources/otlp-config.yaml diff --git a/grafana/README.md b/grafana/README.md new file mode 100644 index 0000000000..35e1167190 --- /dev/null +++ b/grafana/README.md @@ -0,0 +1,225 @@ +# Observability Stack for Java Operator SDK + +This directory contains scripts and configuration for setting up a complete observability stack on minikube. 
+ +## Quick Start + +```bash +./install-observability.sh +``` + +This script installs: +- **OpenTelemetry Operator** - For collecting metrics and traces +- **Prometheus** - For metrics storage and querying +- **Grafana** - For visualization and dashboards +- **cert-manager** - Required for OpenTelemetry Operator webhooks + +## Prerequisites + +- kubectl configured +- Helm 3.x installed + +## Components Installed + +### OpenTelemetry Collector +- Receives metrics and traces via OTLP (gRPC and HTTP) +- Exports metrics to Prometheus format +- Configured with memory limiter and batch processing + +**Endpoints:** +- OTLP gRPC: `otel-collector-collector.observability.svc.cluster.local:4317` +- OTLP HTTP: `otel-collector-collector.observability.svc.cluster.local:4318` +- Prometheus metrics: `http://otel-collector-prometheus.observability.svc.cluster.local:8889/metrics` + +### Prometheus +- Scrapes metrics from OpenTelemetry Collector +- Supports ServiceMonitor and PodMonitor CRDs +- Configured to discover all metrics automatically + +**Access:** +```bash +kubectl port-forward -n observability svc/kube-prometheus-stack-prometheus 9090:9090 +``` +Open http://localhost:9090 + +### Grafana +- Pre-configured with Prometheus as data source +- Includes Kubernetes monitoring dashboards + +**Access:** +```bash +kubectl port-forward -n observability svc/kube-prometheus-stack-grafana 3000:80 +``` +Open http://localhost:3000 +- **Username:** admin +- **Password:** admin + +## Integrating with Your Operator + +### 1. Add OpenTelemetry Dependency + +Add to your `pom.xml`: + +```xml + + io.javaoperatorsdk + operator-framework-opentelemetry-support + ${josdk.version} + +``` + +### 2. 
Configure OpenTelemetry in Your Operator + +In your operator code: + +```java +import io.javaoperatorsdk.operator.monitoring.opentelemetry.OpenTelemetryMetrics; +import io.opentelemetry.api.OpenTelemetry; +import io.opentelemetry.sdk.autoconfigure.AutoConfiguredOpenTelemetrySdk; + +// Initialize OpenTelemetry +OpenTelemetry openTelemetry = AutoConfiguredOpenTelemetrySdk.initialize() + .getOpenTelemetrySdk(); + +// Create JOSDK metrics instance +Metrics metrics = OpenTelemetryMetrics.builder(openTelemetry) + .build(); + +// Configure operator with metrics +Operator operator = new Operator(client, o -> o.withMetrics(metrics)); +``` + +### 3. Set Environment Variables + +In your operator deployment YAML: + +```yaml +env: + - name: OTEL_SERVICE_NAME + value: "your-operator-name" + - name: OTEL_EXPORTER_OTLP_ENDPOINT + value: "http://otel-collector-collector.observability.svc.cluster.local:4318" + - name: OTEL_METRICS_EXPORTER + value: "otlp" + - name: OTEL_TRACES_EXPORTER + value: "otlp" + - name: OTEL_EXPORTER_OTLP_PROTOCOL + value: "http/protobuf" +``` + +## Available JOSDK Metrics + +The following metrics are exported by JOSDK: + +| Metric | Type | Description | +|--------|------|-------------| +| `operator_sdk_reconciliations_started_total` | Counter | Total number of reconciliations started | +| `operator_sdk_reconciliations_success_total` | Counter | Total number of successful reconciliations | +| `operator_sdk_reconciliations_failed_total` | Counter | Total number of failed reconciliations | +| `operator_sdk_reconciliations_queue_size` | Gauge | Current reconciliation queue size | +| `operator_sdk_events_received_total` | Counter | Total number of Kubernetes events received | +| `operator_sdk_controllers_execution_reconcile_seconds` | Timer | Time taken for reconciliations | +| `operator_sdk_controllers_execution_cleanup_seconds` | Timer | Time taken for cleanup operations | + +## Creating Grafana Dashboards + +### Example PromQL Queries + +**Reconciliation 
Rate:** +```promql +sum(rate(operator_sdk_reconciliations_started_total[5m])) by (controller) +``` + +**Success Rate:** +```promql +sum(rate(operator_sdk_reconciliations_success_total[5m])) / +sum(rate(operator_sdk_reconciliations_started_total[5m])) +``` + +**Error Rate:** +```promql +sum(rate(operator_sdk_reconciliations_failed_total[5m])) by (controller, exception) +``` + +**Queue Size:** +```promql +operator_sdk_reconciliations_queue_size +``` + +**Average Reconciliation Duration:** +```promql +rate(operator_sdk_controllers_execution_reconcile_seconds_sum[5m]) / +rate(operator_sdk_controllers_execution_reconcile_seconds_count[5m]) +``` + +### Sample Dashboard Configuration + +1. Open Grafana (http://localhost:3000) +2. Go to "Dashboards" → "New Dashboard" +3. Add panels with the PromQL queries above +4. Configure visualization types: + - Time series for rates and durations + - Gauge for queue size + - Stat for current values + +## Troubleshooting + +### Check Pod Status +```bash +kubectl get pods -n observability +``` + +### Check OpenTelemetry Collector Logs +```bash +kubectl logs -n observability -l app.kubernetes.io/name=otel-collector -f +``` + +### Check Prometheus Targets +```bash +kubectl port-forward -n observability svc/kube-prometheus-stack-prometheus 9090:9090 +``` +Then open http://localhost:9090/targets + +### Verify Metrics are Being Collected +```bash +# Check if OpenTelemetry is receiving metrics +kubectl port-forward -n observability svc/otel-collector-prometheus 8889:8889 +curl http://localhost:8889/metrics | grep operator_sdk +``` + +### Test OTLP Endpoint +```bash +# Port forward the OTLP HTTP endpoint +kubectl port-forward -n observability svc/otel-collector-collector 4318:4318 + +# Send a test metric (requires curl and valid OTLP JSON) +# This is just for testing connectivity +curl -X POST http://localhost:4318/v1/metrics \ + -H "Content-Type: application/json" \ + -d '{"resourceMetrics":[]}' +``` + +## Uninstalling + +To remove all 
components: + +```bash +# Delete OpenTelemetry resources +kubectl delete -n observability OpenTelemetryCollector otel-collector + +# Uninstall Helm releases +helm uninstall -n observability kube-prometheus-stack +helm uninstall -n observability opentelemetry-operator +helm uninstall -n cert-manager cert-manager + +# Delete namespaces +kubectl delete namespace observability cert-manager +``` + +## References + +- [JOSDK Observability Documentation](https://javaoperatorsdk.io/docs/documentation/observability/) +- [OpenTelemetry Java Documentation](https://opentelemetry.io/docs/instrumentation/java/) +- [Prometheus Operator](https://github.com/prometheus-operator/prometheus-operator) +- [Grafana Documentation](https://grafana.com/docs/) +- [OpenTelemetry Collector](https://opentelemetry.io/docs/collector/) diff --git a/grafana/install-observability.sh b/grafana/install-observability.sh new file mode 100755 index 0000000000..63bdcb706f --- /dev/null +++ b/grafana/install-observability.sh @@ -0,0 +1,264 @@ +#!/bin/bash +# +# Copyright Java Operator SDK Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +set -e + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +echo -e "${GREEN}========================================${NC}" +echo -e "${GREEN}Installing Observability Stack${NC}" +echo -e "${GREEN}OpenTelemetry + Prometheus + Grafana${NC}" +echo -e "${GREEN}========================================${NC}" + +# Check if minikube is running +echo -e "\n${YELLOW}Checking minikube status...${NC}" +if ! minikube status > /dev/null 2>&1; then + echo -e "${RED}Error: minikube is not running${NC}" + echo "Please start minikube with: minikube start" + exit 1 +fi +echo -e "${GREEN}✓ minikube is running${NC}" + +# Check if helm is installed +echo -e "\n${YELLOW}Checking helm installation...${NC}" +if ! command -v helm &> /dev/null; then + echo -e "${RED}Error: helm is not installed${NC}" + echo "Please install helm: https://helm.sh/docs/intro/install/" + exit 1 +fi +echo -e "${GREEN}✓ helm is installed${NC}" + +# Add Helm repositories +echo -e "\n${YELLOW}Adding Helm repositories...${NC}" +helm repo add jetstack https://charts.jetstack.io +helm repo add open-telemetry https://open-telemetry.github.io/opentelemetry-helm-charts +helm repo add prometheus-community https://prometheus-community.github.io/helm-charts +helm repo update +echo -e "${GREEN}✓ Helm repositories added${NC}" + +# Install cert-manager (required for OpenTelemetry Operator) +echo -e "\n${YELLOW}Installing cert-manager...${NC}" +if kubectl get namespace cert-manager > /dev/null 2>&1; then + echo -e "${YELLOW}cert-manager namespace already exists, skipping...${NC}" +else + kubectl create namespace cert-manager + helm install cert-manager jetstack/cert-manager \ + --namespace cert-manager \ + --set crds.enabled=true \ + --wait + echo -e "${GREEN}✓ cert-manager installed${NC}" +fi + +# Create observability namespace +echo -e "\n${YELLOW}Creating observability namespace...${NC}" +kubectl create namespace observability --dry-run=client -o yaml | kubectl 
apply -f - +echo -e "${GREEN}✓ observability namespace ready${NC}" + +# Install OpenTelemetry Operator +echo -e "\n${YELLOW}Installing OpenTelemetry Operator...${NC}" +if helm list -n observability | grep -q opentelemetry-operator; then + echo -e "${YELLOW}OpenTelemetry Operator already installed, upgrading...${NC}" + helm upgrade opentelemetry-operator open-telemetry/opentelemetry-operator \ + --namespace observability \ + --set "manager.collectorImage.repository=otel/opentelemetry-collector-contrib" \ + --wait +else + helm install opentelemetry-operator open-telemetry/opentelemetry-operator \ + --namespace observability \ + --set "manager.collectorImage.repository=otel/opentelemetry-collector-contrib" \ + --wait +fi +echo -e "${GREEN}✓ OpenTelemetry Operator installed${NC}" + +# Install kube-prometheus-stack (includes Prometheus + Grafana) +echo -e "\n${YELLOW}Installing Prometheus and Grafana stack...${NC}" +if helm list -n observability | grep -q kube-prometheus-stack; then + echo -e "${YELLOW}kube-prometheus-stack already installed, upgrading...${NC}" + helm upgrade kube-prometheus-stack prometheus-community/kube-prometheus-stack \ + --namespace observability \ + --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false \ + --set prometheus.prometheusSpec.podMonitorSelectorNilUsesHelmValues=false \ + --set grafana.adminPassword=admin \ + --wait +else + helm install kube-prometheus-stack prometheus-community/kube-prometheus-stack \ + --namespace observability \ + --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false \ + --set prometheus.prometheusSpec.podMonitorSelectorNilUsesHelmValues=false \ + --set grafana.adminPassword=admin \ + --wait +fi +echo -e "${GREEN}✓ Prometheus and Grafana installed${NC}" + +# Create OpenTelemetry Collector instance +echo -e "\n${YELLOW}Creating OpenTelemetry Collector...${NC}" +kubectl apply -f - <io.javaoperatorsdk operator-framework + + io.javaoperatorsdk + micrometer-support + + + 
io.micrometer + micrometer-registry-otlp + ${micrometer-core.version} + + + org.yaml + snakeyaml + 2.3 + org.apache.logging.log4j log4j-slf4j2-impl diff --git a/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java b/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java index 5366dc2e9a..78c05f8df7 100644 --- a/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java +++ b/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java @@ -16,14 +16,25 @@ package io.javaoperatorsdk.operator.sample; import java.io.IOException; +import java.io.InputStream; import java.net.InetSocketAddress; +import java.util.HashMap; +import java.util.Map; +import org.jspecify.annotations.NonNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.yaml.snakeyaml.Yaml; import io.javaoperatorsdk.operator.Operator; +import io.javaoperatorsdk.operator.api.monitoring.Metrics; +import io.javaoperatorsdk.operator.monitoring.micrometer.MicrometerMetrics; import io.javaoperatorsdk.operator.sample.probes.LivenessHandler; import io.javaoperatorsdk.operator.sample.probes.StartupHandler; +import io.micrometer.core.instrument.Clock; +import io.micrometer.core.instrument.MeterRegistry; +import io.micrometer.registry.otlp.OtlpConfig; +import io.micrometer.registry.otlp.OtlpMeterRegistry; import com.sun.net.httpserver.HttpServer; @@ -40,7 +51,10 @@ public class WebPageOperator { public static void main(String[] args) throws IOException { log.info("WebServer Operator starting!"); - Operator operator = new Operator(o -> o.withStopOnInformerErrorDuringStartup(false)); + // Load configuration from config.yaml + Metrics metrics = initOTLPMetrics(); + Operator operator = + new Operator(o -> o.withStopOnInformerErrorDuringStartup(false).withMetrics(metrics)); String reconcilerEnvVar = System.getenv(WEBPAGE_RECONCILER_ENV); if 
(WEBPAGE_CLASSIC_RECONCILER_ENV_VALUE.equals(reconcilerEnvVar)) { operator.register(new WebPageReconciler()); @@ -58,4 +72,37 @@ public static void main(String[] args) throws IOException { server.setExecutor(null); server.start(); } + + private static @NonNull Metrics initOTLPMetrics() { + Map configProperties = loadConfigFromYaml(); + OtlpConfig otlpConfig = configProperties::get; + + MeterRegistry registry = new OtlpMeterRegistry(otlpConfig, Clock.SYSTEM); + return MicrometerMetrics.withoutPerResourceMetrics(registry); + } + + @SuppressWarnings("unchecked") + private static Map loadConfigFromYaml() { + Map configMap = new HashMap<>(); + try (InputStream inputStream = WebPageOperator.class.getResourceAsStream("/otlp-config.yaml")) { + if (inputStream == null) { + log.warn("otlp-config.yaml not found in resources, using default OTLP configuration"); + return configMap; + } + + Yaml yaml = new Yaml(); + Map yamlData = yaml.load(inputStream); + + // Navigate to otlp section and map properties directly + Map otlp = (Map) yamlData.get("otlp"); + if (otlp != null) { + otlp.forEach((key, value) -> configMap.put("otlp." 
+ key, value.toString())); + } + + log.info("Loaded OTLP configuration from otlp-config.yaml: {}", configMap); + } catch (IOException e) { + log.error("Error loading otlp-config.yaml", e); + } + return configMap; + } } diff --git a/sample-operators/webpage/src/main/resources/otlp-config.yaml b/sample-operators/webpage/src/main/resources/otlp-config.yaml new file mode 100644 index 0000000000..30d6f283da --- /dev/null +++ b/sample-operators/webpage/src/main/resources/otlp-config.yaml @@ -0,0 +1,6 @@ +otlp: + # OTLP Collector endpoint - see observability/install-observability.sh for setup + url: "http://otel-collector-collector.observability.svc.cluster.local:4318/v1/metrics" + step: 15s + batchSize: 15000 + aggregationTemporality: "cumulative" From ece63e8f6aaca8c4f1ccf27f7bd95e33f39a27ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Attila=20M=C3=A9sz=C3=A1ros?= Date: Sun, 8 Feb 2026 16:24:42 +0100 Subject: [PATCH 06/18] wip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Attila Mészáros --- .../webpage/src/main/resources/otlp-config.yaml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/sample-operators/webpage/src/main/resources/otlp-config.yaml b/sample-operators/webpage/src/main/resources/otlp-config.yaml index 30d6f283da..ca93bfc965 100644 --- a/sample-operators/webpage/src/main/resources/otlp-config.yaml +++ b/sample-operators/webpage/src/main/resources/otlp-config.yaml @@ -1,3 +1,19 @@ +# +# Copyright Java Operator SDK Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + otlp: # OTLP Collector endpoint - see observability/install-observability.sh for setup url: "http://otel-collector-collector.observability.svc.cluster.local:4318/v1/metrics" From 72ca6e8ead29f00d590043b4472b0ed60837fd35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Attila=20M=C3=A9sz=C3=A1ros?= Date: Sun, 8 Feb 2026 16:26:41 +0100 Subject: [PATCH 07/18] wip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Attila Mészáros --- grafana/README.md | 225 -------------------------- grafana/install-observability.sh | 264 ------------------------------- 2 files changed, 489 deletions(-) delete mode 100644 grafana/README.md delete mode 100755 grafana/install-observability.sh diff --git a/grafana/README.md b/grafana/README.md deleted file mode 100644 index 35e1167190..0000000000 --- a/grafana/README.md +++ /dev/null @@ -1,225 +0,0 @@ -# Observability Stack for Java Operator SDK - -This directory contains scripts and configuration for setting up a complete observability stack on minikube. 
- -## Quick Start - -```bash -./install-observability.sh -``` - -This script installs: -- **OpenTelemetry Operator** - For collecting metrics and traces -- **Prometheus** - For metrics storage and querying -- **Grafana** - For visualization and dashboards -- **cert-manager** - Required for OpenTelemetry Operator webhooks - -## Prerequisites - -- kubectl configured -- Helm 3.x installed - -## Components Installed - -### OpenTelemetry Collector -- Receives metrics and traces via OTLP (gRPC and HTTP) -- Exports metrics to Prometheus format -- Configured with memory limiter and batch processing - -**Endpoints:** -- OTLP gRPC: `otel-collector-collector.observability.svc.cluster.local:4317` -- OTLP HTTP: `otel-collector-collector.observability.svc.cluster.local:4318` -- Prometheus metrics: `http://otel-collector-prometheus.observability.svc.cluster.local:8889/metrics` - -### Prometheus -- Scrapes metrics from OpenTelemetry Collector -- Supports ServiceMonitor and PodMonitor CRDs -- Configured to discover all metrics automatically - -**Access:** -```bash -kubectl port-forward -n observability svc/kube-prometheus-stack-prometheus 9090:9090 -``` -Open http://localhost:9090 - -### Grafana -- Pre-configured with Prometheus as data source -- Includes Kubernetes monitoring dashboards - -**Access:** -```bash -kubectl port-forward -n observability svc/kube-prometheus-stack-grafana 3000:80 -``` -Open http://localhost:3000 -- **Username:** admin -- **Password:** admin - -## Integrating with Your Operator - -### 1. Add OpenTelemetry Dependency - -Add to your `pom.xml`: - -```xml - - io.javaoperatorsdk - operator-framework-opentelemetry-support - ${josdk.version} - -``` - -### 2. 
Configure OpenTelemetry in Your Operator - -In your operator code: - -```java -import io.javaoperatorsdk.operator.monitoring.opentelemetry.OpenTelemetryMetrics; -import io.opentelemetry.api.OpenTelemetry; -import io.opentelemetry.sdk.autoconfigure.AutoConfiguredOpenTelemetrySdk; - -// Initialize OpenTelemetry -OpenTelemetry openTelemetry = AutoConfiguredOpenTelemetrySdk.initialize() - .getOpenTelemetrySdk(); - -// Create JOSDK metrics instance -Metrics metrics = OpenTelemetryMetrics.builder(openTelemetry) - .build(); - -// Configure operator with metrics -Operator operator = new Operator(client, o -> o.withMetrics(metrics)); -``` - -### 3. Set Environment Variables - -In your operator deployment YAML: - -```yaml -env: - - name: OTEL_SERVICE_NAME - value: "your-operator-name" - - name: OTEL_EXPORTER_OTLP_ENDPOINT - value: "http://otel-collector-collector.observability.svc.cluster.local:4318" - - name: OTEL_METRICS_EXPORTER - value: "otlp" - - name: OTEL_TRACES_EXPORTER - value: "otlp" - - name: OTEL_EXPORTER_OTLP_PROTOCOL - value: "http/protobuf" -``` - -## Available JOSDK Metrics - -The following metrics are exported by JOSDK: - -| Metric | Type | Description | -|--------|------|-------------| -| `operator_sdk_reconciliations_started_total` | Counter | Total number of reconciliations started | -| `operator_sdk_reconciliations_success_total` | Counter | Total number of successful reconciliations | -| `operator_sdk_reconciliations_failed_total` | Counter | Total number of failed reconciliations | -| `operator_sdk_reconciliations_queue_size` | Gauge | Current reconciliation queue size | -| `operator_sdk_events_received_total` | Counter | Total number of Kubernetes events received | -| `operator_sdk_controllers_execution_reconcile_seconds` | Timer | Time taken for reconciliations | -| `operator_sdk_controllers_execution_cleanup_seconds` | Timer | Time taken for cleanup operations | - -## Creating Grafana Dashboards - -### Example PromQL Queries - -**Reconciliation 
Rate:** -```promql -sum(rate(operator_sdk_reconciliations_started_total[5m])) by (controller) -``` - -**Success Rate:** -```promql -sum(rate(operator_sdk_reconciliations_success_total[5m])) / -sum(rate(operator_sdk_reconciliations_started_total[5m])) -``` - -**Error Rate:** -```promql -sum(rate(operator_sdk_reconciliations_failed_total[5m])) by (controller, exception) -``` - -**Queue Size:** -```promql -operator_sdk_reconciliations_queue_size -``` - -**Average Reconciliation Duration:** -```promql -rate(operator_sdk_controllers_execution_reconcile_seconds_sum[5m]) / -rate(operator_sdk_controllers_execution_reconcile_seconds_count[5m]) -``` - -### Sample Dashboard Configuration - -1. Open Grafana (http://localhost:3000) -2. Go to "Dashboards" → "New Dashboard" -3. Add panels with the PromQL queries above -4. Configure visualization types: - - Time series for rates and durations - - Gauge for queue size - - Stat for current values - -## Troubleshooting - -### Check Pod Status -```bash -kubectl get pods -n observability -``` - -### Check OpenTelemetry Collector Logs -```bash -kubectl logs -n observability -l app.kubernetes.io/name=otel-collector -f -``` - -### Check Prometheus Targets -```bash -kubectl port-forward -n observability svc/kube-prometheus-stack-prometheus 9090:9090 -``` -Then open http://localhost:9090/targets - -### Verify Metrics are Being Collected -```bash -# Check if OpenTelemetry is receiving metrics -kubectl port-forward -n observability svc/otel-collector-prometheus 8889:8889 -curl http://localhost:8889/metrics | grep operator_sdk -``` - -### Test OTLP Endpoint -```bash -# Port forward the OTLP HTTP endpoint -kubectl port-forward -n observability svc/otel-collector-collector 4318:4318 - -# Send a test metric (requires curl and valid OTLP JSON) -# This is just for testing connectivity -curl -X POST http://localhost:4318/v1/metrics \ - -H "Content-Type: application/json" \ - -d '{"resourceMetrics":[]}' -``` - -## Uninstalling - -To remove all 
components: - -```bash -# Delete OpenTelemetry resources -kubectl delete -n observability OpenTelemetryCollector otel-collector - -# Uninstall Helm releases -helm uninstall -n observability kube-prometheus-stack -helm uninstall -n observability opentelemetry-operator -helm uninstall -n cert-manager cert-manager - -# Delete namespaces -kubectl delete namespace observability cert-manager -``` - -## References - -- [JOSDK Observability Documentation](https://javaoperatorsdk.io/docs/documentation/observability/) -- [OpenTelemetry Java Documentation](https://opentelemetry.io/docs/instrumentation/java/) -- [Prometheus Operator](https://github.com/prometheus-operator/prometheus-operator) -- [Grafana Documentation](https://grafana.com/docs/) -- [OpenTelemetry Collector](https://opentelemetry.io/docs/collector/) diff --git a/grafana/install-observability.sh b/grafana/install-observability.sh deleted file mode 100755 index 63bdcb706f..0000000000 --- a/grafana/install-observability.sh +++ /dev/null @@ -1,264 +0,0 @@ -#!/bin/bash -# -# Copyright Java Operator SDK Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -set -e - -# Colors for output -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -NC='\033[0m' # No Color - -echo -e "${GREEN}========================================${NC}" -echo -e "${GREEN}Installing Observability Stack${NC}" -echo -e "${GREEN}OpenTelemetry + Prometheus + Grafana${NC}" -echo -e "${GREEN}========================================${NC}" - -# Check if minikube is running -echo -e "\n${YELLOW}Checking minikube status...${NC}" -if ! minikube status > /dev/null 2>&1; then - echo -e "${RED}Error: minikube is not running${NC}" - echo "Please start minikube with: minikube start" - exit 1 -fi -echo -e "${GREEN}✓ minikube is running${NC}" - -# Check if helm is installed -echo -e "\n${YELLOW}Checking helm installation...${NC}" -if ! command -v helm &> /dev/null; then - echo -e "${RED}Error: helm is not installed${NC}" - echo "Please install helm: https://helm.sh/docs/intro/install/" - exit 1 -fi -echo -e "${GREEN}✓ helm is installed${NC}" - -# Add Helm repositories -echo -e "\n${YELLOW}Adding Helm repositories...${NC}" -helm repo add jetstack https://charts.jetstack.io -helm repo add open-telemetry https://open-telemetry.github.io/opentelemetry-helm-charts -helm repo add prometheus-community https://prometheus-community.github.io/helm-charts -helm repo update -echo -e "${GREEN}✓ Helm repositories added${NC}" - -# Install cert-manager (required for OpenTelemetry Operator) -echo -e "\n${YELLOW}Installing cert-manager...${NC}" -if kubectl get namespace cert-manager > /dev/null 2>&1; then - echo -e "${YELLOW}cert-manager namespace already exists, skipping...${NC}" -else - kubectl create namespace cert-manager - helm install cert-manager jetstack/cert-manager \ - --namespace cert-manager \ - --set crds.enabled=true \ - --wait - echo -e "${GREEN}✓ cert-manager installed${NC}" -fi - -# Create observability namespace -echo -e "\n${YELLOW}Creating observability namespace...${NC}" -kubectl create namespace observability --dry-run=client -o yaml | kubectl 
apply -f - -echo -e "${GREEN}✓ observability namespace ready${NC}" - -# Install OpenTelemetry Operator -echo -e "\n${YELLOW}Installing OpenTelemetry Operator...${NC}" -if helm list -n observability | grep -q opentelemetry-operator; then - echo -e "${YELLOW}OpenTelemetry Operator already installed, upgrading...${NC}" - helm upgrade opentelemetry-operator open-telemetry/opentelemetry-operator \ - --namespace observability \ - --set "manager.collectorImage.repository=otel/opentelemetry-collector-contrib" \ - --wait -else - helm install opentelemetry-operator open-telemetry/opentelemetry-operator \ - --namespace observability \ - --set "manager.collectorImage.repository=otel/opentelemetry-collector-contrib" \ - --wait -fi -echo -e "${GREEN}✓ OpenTelemetry Operator installed${NC}" - -# Install kube-prometheus-stack (includes Prometheus + Grafana) -echo -e "\n${YELLOW}Installing Prometheus and Grafana stack...${NC}" -if helm list -n observability | grep -q kube-prometheus-stack; then - echo -e "${YELLOW}kube-prometheus-stack already installed, upgrading...${NC}" - helm upgrade kube-prometheus-stack prometheus-community/kube-prometheus-stack \ - --namespace observability \ - --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false \ - --set prometheus.prometheusSpec.podMonitorSelectorNilUsesHelmValues=false \ - --set grafana.adminPassword=admin \ - --wait -else - helm install kube-prometheus-stack prometheus-community/kube-prometheus-stack \ - --namespace observability \ - --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false \ - --set prometheus.prometheusSpec.podMonitorSelectorNilUsesHelmValues=false \ - --set grafana.adminPassword=admin \ - --wait -fi -echo -e "${GREEN}✓ Prometheus and Grafana installed${NC}" - -# Create OpenTelemetry Collector instance -echo -e "\n${YELLOW}Creating OpenTelemetry Collector...${NC}" -kubectl apply -f - < Date: Mon, 9 Feb 2026 09:16:51 +0100 Subject: [PATCH 08/18] wip MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Attila Mészáros --- observability/install-observability.sh | 41 ++++++++++++++++--- .../operator/sample/WebPageOperator.java | 21 +++++++++- 2 files changed, 56 insertions(+), 6 deletions(-) diff --git a/observability/install-observability.sh b/observability/install-observability.sh index 314ee1e4aa..2c81f2bf38 100755 --- a/observability/install-observability.sh +++ b/observability/install-observability.sh @@ -199,6 +199,36 @@ echo -e "\n${YELLOW}Waiting for all pods to be ready...${NC}" kubectl wait --for=condition=ready pod --all -n observability --timeout=300s echo -e "${GREEN}✓ All pods are ready${NC}" +# Import Grafana dashboards +echo -e "\n${YELLOW}Importing Grafana dashboards...${NC}" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +if [ -f "$SCRIPT_DIR/jvm-metrics-dashboard.json" ]; then + kubectl create configmap jvm-metrics-dashboard \ + --from-file="$SCRIPT_DIR/jvm-metrics-dashboard.json" \ + -n observability \ + --dry-run=client -o yaml | \ + kubectl label --dry-run=client --local -f - grafana_dashboard=1 -o yaml | \ + kubectl apply -f - + echo -e "${GREEN}✓ JVM Metrics dashboard imported${NC}" +else + echo -e "${YELLOW}⚠ JVM Metrics dashboard not found at $SCRIPT_DIR/jvm-metrics-dashboard.json${NC}" +fi + +if [ -f "$SCRIPT_DIR/josdk-operator-metrics-dashboard.json" ]; then + kubectl create configmap josdk-operator-metrics-dashboard \ + --from-file="$SCRIPT_DIR/josdk-operator-metrics-dashboard.json" \ + -n observability \ + --dry-run=client -o yaml | \ + kubectl label --dry-run=client --local -f - grafana_dashboard=1 -o yaml | \ + kubectl apply -f - + echo -e "${GREEN}✓ JOSDK Operator Metrics dashboard imported${NC}" +else + echo -e "${YELLOW}⚠ JOSDK Operator Metrics dashboard not found at $SCRIPT_DIR/josdk-operator-metrics-dashboard.json${NC}" +fi + +echo -e "${GREEN}✓ Dashboards will be available in Grafana shortly${NC}" + # Get pod statuses echo -e 
"\n${GREEN}========================================${NC}" echo -e "${GREEN}Installation Complete!${NC}" @@ -237,16 +267,17 @@ echo -e " ${GREEN}OTEL_TRACES_EXPORTER=otlp${NC}" echo -e "\n${GREEN}========================================${NC}" echo -e "${GREEN}Grafana Dashboards${NC}" echo -e "${GREEN}========================================${NC}" -echo -e "\nPre-installed dashboards in Grafana:" +echo -e "\nAutomatically imported dashboards:" +echo -e " - ${GREEN}JOSDK - JVM Metrics${NC} - Java Virtual Machine health and performance" +echo -e " - ${GREEN}JOSDK - Operator Metrics${NC} - Kubernetes operator performance and reconciliation" +echo -e "\nPre-installed Kubernetes dashboards:" echo -e " - Kubernetes / Compute Resources / Cluster" echo -e " - Kubernetes / Compute Resources / Namespace (Pods)" echo -e " - Node Exporter / Nodes" -echo -e "\nFor JOSDK metrics, create a custom dashboard with queries like:" -echo -e " ${GREEN}sum(rate(operator_sdk_reconciliations_started_total[5m]))${NC}" -echo -e " ${GREEN}sum(rate(operator_sdk_reconciliations_success_total[5m]))${NC}" -echo -e " ${GREEN}sum(rate(operator_sdk_reconciliations_failed_total[5m]))${NC}" +echo -e "\n${YELLOW}Note:${NC} Dashboards may take 30-60 seconds to appear in Grafana after installation." 
echo -e "\n${YELLOW}To uninstall:${NC}" +echo -e " kubectl delete configmap -n observability jvm-metrics-dashboard josdk-operator-metrics-dashboard" echo -e " kubectl delete -n observability OpenTelemetryCollector otel-collector" echo -e " helm uninstall -n observability kube-prometheus-stack" echo -e " helm uninstall -n observability opentelemetry-operator" diff --git a/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java b/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java index 78c05f8df7..a2c342dc5e 100644 --- a/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java +++ b/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java @@ -33,6 +33,12 @@ import io.javaoperatorsdk.operator.sample.probes.StartupHandler; import io.micrometer.core.instrument.Clock; import io.micrometer.core.instrument.MeterRegistry; +import io.micrometer.core.instrument.binder.jvm.ClassLoaderMetrics; +import io.micrometer.core.instrument.binder.jvm.JvmGcMetrics; +import io.micrometer.core.instrument.binder.jvm.JvmMemoryMetrics; +import io.micrometer.core.instrument.binder.jvm.JvmThreadMetrics; +import io.micrometer.core.instrument.binder.system.ProcessorMetrics; +import io.micrometer.core.instrument.binder.system.UptimeMetrics; import io.micrometer.registry.otlp.OtlpConfig; import io.micrometer.registry.otlp.OtlpMeterRegistry; @@ -78,7 +84,20 @@ public static void main(String[] args) throws IOException { OtlpConfig otlpConfig = configProperties::get; MeterRegistry registry = new OtlpMeterRegistry(otlpConfig, Clock.SYSTEM); - return MicrometerMetrics.withoutPerResourceMetrics(registry); + + // Register JVM and system metrics + log.info("Registering JVM and system metrics..."); + new JvmMemoryMetrics().bindTo(registry); + new JvmGcMetrics().bindTo(registry); + new JvmThreadMetrics().bindTo(registry); + new 
ClassLoaderMetrics().bindTo(registry); + new ProcessorMetrics().bindTo(registry); + new UptimeMetrics().bindTo(registry); + log.info("JVM and system metrics registered"); + + return MicrometerMetrics.newPerResourceCollectingMicrometerMetricsBuilder(registry) + .collectingMetricsPerResource() + .build(); } @SuppressWarnings("unchecked") From 899e34564857f872d7e257f65c128a110412b153 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Attila=20M=C3=A9sz=C3=A1ros?= Date: Mon, 9 Feb 2026 09:27:30 +0100 Subject: [PATCH 09/18] wip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Attila Mészáros --- observability/README.md | 246 ++++ .../josdk-operator-metrics-dashboard.json | 1109 +++++++++++++++++ observability/jvm-metrics-dashboard.json | 857 +++++++++++++ 3 files changed, 2212 insertions(+) create mode 100644 observability/README.md create mode 100644 observability/josdk-operator-metrics-dashboard.json create mode 100644 observability/jvm-metrics-dashboard.json diff --git a/observability/README.md b/observability/README.md new file mode 100644 index 0000000000..9706a466e9 --- /dev/null +++ b/observability/README.md @@ -0,0 +1,246 @@ +# Observability Stack for Java Operator SDK + +This directory contains the setup scripts and Grafana dashboards for monitoring Java Operator SDK applications. 
+ +## Installation + +Run the installation script to deploy the full observability stack (OpenTelemetry Collector, Prometheus, and Grafana): + +```bash +./install-observability.sh +``` + +This will install: +- **cert-manager** - Required for OpenTelemetry Operator +- **OpenTelemetry Operator** - Manages OpenTelemetry Collector instances +- **OpenTelemetry Collector** - Receives OTLP metrics and exports to Prometheus +- **Prometheus** - Metrics storage and querying +- **Grafana** - Metrics visualization + +## Accessing Services + +### Grafana +```bash +kubectl port-forward -n observability svc/kube-prometheus-stack-grafana 3000:80 +``` +Then open http://localhost:3000 +- Username: `admin` +- Password: `admin` + +### Prometheus +```bash +kubectl port-forward -n observability svc/kube-prometheus-stack-prometheus 9090:9090 +``` +Then open http://localhost:9090 + +## Grafana Dashboards + +Two pre-configured dashboards are **automatically imported** during installation: + +### 1. JVM Metrics Dashboard (`jvm-metrics-dashboard.json`) + +Monitors Java Virtual Machine health and performance: + +**Panels:** +- **JVM Memory Used** - Heap and non-heap memory consumption by memory pool +- **JVM Threads** - Live, daemon, and peak thread counts +- **GC Pause Time Rate** - Garbage collection pause duration +- **GC Pause Count Rate** - Frequency of garbage collection events +- **CPU Usage** - System CPU utilization percentage +- **Classes Loaded** - Number of classes currently loaded +- **Process Uptime** - Application uptime in seconds +- **CPU Count** - Available processor cores +- **GC Memory Allocation Rate** - Memory allocation and promotion rates +- **Heap Memory Max vs Committed** - Heap memory limits and commitments + +**Key Metrics:** +- `jvm.memory.used`, `jvm.memory.max`, `jvm.memory.committed` +- `jvm.gc.pause`, `jvm.gc.memory.allocated`, `jvm.gc.memory.promoted` +- `jvm.threads.live`, `jvm.threads.daemon`, `jvm.threads.peak` +- `jvm.classes.loaded`, 
`jvm.classes.unloaded` +- `system.cpu.usage`, `system.cpu.count` +- `process.uptime` + +### 2. Java Operator SDK Metrics Dashboard (`josdk-operator-metrics-dashboard.json`) + +Monitors Kubernetes operator performance and health: + +**Panels:** +- **Reconciliation Rate (Started)** - Rate of reconciliation loops triggered +- **Reconciliation Success vs Failure Rate** - Success/failure ratio over time +- **Currently Executing Reconciliations** - Active reconciliation threads +- **Reconciliation Queue Size** - Pending reconciliation work +- **Total Reconciliations** - Cumulative count of reconciliations +- **Error Rate** - Overall error rate across all reconciliations +- **Reconciliation Execution Time** - P50, P95, P99 latency percentiles +- **Event Reception Rate** - Kubernetes event processing rate +- **Failures by Exception Type** - Breakdown of errors by exception class +- **Controller Execution Success vs Failure** - Controller-level success metrics +- **Delete Event Rate** - Resource deletion event frequency +- **Reconciliation Retry Rate** - Retry attempts and patterns + +**Key Metrics:** +- `operator.sdk.reconciliations.started`, `.success`, `.failed` +- `operator.sdk.reconciliations.executions` - Current execution count +- `operator.sdk.reconciliations.queue.size` - Queue depth +- `operator.sdk.controllers.execution.reconcile` - Execution timing histograms +- `operator.sdk.events.received`, `.delete` - Event reception +- Retry metrics and failure breakdowns + +## Importing Dashboards into Grafana + +### Automatic Import (Default) + +The dashboards are **automatically imported** when you run `./install-observability.sh`. They will appear in Grafana within 30-60 seconds after installation. No manual steps required! + +To verify the dashboards were imported: +1. Access Grafana at http://localhost:3000 +2. Navigate to **Dashboards** → **Browse** +3. 
Look for "JOSDK - JVM Metrics" and "JOSDK - Operator Metrics" + +### Manual Import Methods + +If you need to re-import or update the dashboards manually: + +#### Method 1: Via Grafana UI + +1. Access Grafana at http://localhost:3000 +2. Log in with admin/admin +3. Navigate to **Dashboards** → **Import** +4. Click **Upload JSON file** +5. Select `jvm-metrics-dashboard.json` or `josdk-operator-metrics-dashboard.json` +6. Select **Prometheus** as the data source +7. Click **Import** + +#### Method 2: Via kubectl ConfigMap + +```bash +# Re-import JVM dashboard +kubectl create configmap jvm-metrics-dashboard \ + --from-file=jvm-metrics-dashboard.json \ + -n observability \ + -o yaml --dry-run=client | \ + kubectl label --dry-run=client --local -f - grafana_dashboard=1 -o yaml | \ + kubectl apply -f - + +# Re-import Operator dashboard +kubectl create configmap josdk-operator-metrics-dashboard \ + --from-file=josdk-operator-metrics-dashboard.json \ + -n observability \ + -o yaml --dry-run=client | \ + kubectl label --dry-run=client --local -f - grafana_dashboard=1 -o yaml | \ + kubectl apply -f - +``` + +The dashboards will be automatically discovered and loaded by Grafana within 30-60 seconds. + +## Configuring Your Operator + +To enable metrics export from your JOSDK operator, ensure your application: + +1. **Has the required dependency** (already included in webpage sample): + ```xml + <dependency> + <groupId>io.micrometer</groupId> + <artifactId>micrometer-registry-otlp</artifactId> + </dependency> + ``` + +2. **Configures OTLP export** via `otlp-config.yaml`: + ```yaml + otlp: + url: "http://otel-collector-collector.observability.svc.cluster.local:4318/v1/metrics" + step: 15s + batchSize: 15000 + aggregationTemporality: "cumulative" + ``` + +3. 
**Registers JVM and JOSDK metrics** (see `WebPageOperator.java` for reference implementation) + +## OTLP Endpoints + +The OpenTelemetry Collector provides the following endpoints: + +- **OTLP gRPC**: `otel-collector-collector.observability.svc.cluster.local:4317` +- **OTLP HTTP**: `otel-collector-collector.observability.svc.cluster.local:4318` +- **Prometheus Scrape**: `http://otel-collector-prometheus.observability.svc.cluster.local:8889/metrics` + +## Troubleshooting + +### Check OpenTelemetry Collector Logs +```bash +kubectl logs -n observability -l app.kubernetes.io/name=otel-collector -f +``` + +### Check Prometheus Targets +```bash +kubectl port-forward -n observability svc/kube-prometheus-stack-prometheus 9090:9090 +``` +Open http://localhost:9090/targets and verify the OTLP collector target is UP. + +### Verify Metrics in Prometheus +Open Prometheus UI and search for metrics: +- JVM metrics: `otel_jvm_*` +- Operator metrics: `otel_operator_sdk_*` + +### Check Grafana Data Source +1. Navigate to **Configuration** → **Data Sources** +2. Verify Prometheus data source is configured and working +3. Click **Test** to verify connectivity + +## Uninstalling + +To remove the observability stack: + +```bash +kubectl delete configmap -n observability jvm-metrics-dashboard josdk-operator-metrics-dashboard +kubectl delete -n observability OpenTelemetryCollector otel-collector +helm uninstall -n observability kube-prometheus-stack +helm uninstall -n observability opentelemetry-operator +helm uninstall -n cert-manager cert-manager +kubectl delete namespace observability cert-manager +``` + +## Customizing Dashboards + +The dashboard JSON files can be modified to: +- Add new panels for custom metrics +- Adjust time ranges and refresh intervals +- Change visualization types +- Add templating variables for filtering +- Modify alert thresholds + +After making changes, re-import the dashboard using one of the methods above. 
+ +## Example Queries + +### JVM Metrics +```promql +# Heap memory usage percentage +(otel_jvm_memory_used_bytes{area="heap"} / otel_jvm_memory_max_bytes{area="heap"}) * 100 + +# GC throughput (percentage of time NOT in GC) +100 - (rate(otel_jvm_gc_pause_seconds_sum[5m]) * 100) + +# Thread count trend +otel_jvm_threads_live_threads +``` + +### Operator Metrics +```promql +# Reconciliation success rate +rate(otel_operator_sdk_reconciliations_success_total[5m]) / rate(otel_operator_sdk_reconciliations_started_total[5m]) + +# Average reconciliation time +rate(otel_operator_sdk_controllers_execution_reconcile_seconds_sum[5m]) / rate(otel_operator_sdk_controllers_execution_reconcile_seconds_count[5m]) + +# Queue saturation +otel_operator_sdk_reconciliations_queue_size / on() group_left() max(otel_operator_sdk_reconciliations_queue_size) +``` + +## References + +- [Java Operator SDK Documentation](https://javaoperatorsdk.io) +- [Micrometer OTLP Documentation](https://micrometer.io/docs/registry/otlp) +- [OpenTelemetry Collector](https://opentelemetry.io/docs/collector/) +- [Grafana Dashboards](https://grafana.com/docs/grafana/latest/dashboards/) diff --git a/observability/josdk-operator-metrics-dashboard.json b/observability/josdk-operator-metrics-dashboard.json new file mode 100644 index 0000000000..006821a467 --- /dev/null +++ b/observability/josdk-operator-metrics-dashboard.json @@ -0,0 +1,1109 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Rate of reconciliations started per second", + "fieldConfig": { + "defaults": { + "color": { 
+ "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": ["last", "mean"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(otel_operator_sdk_reconciliations_started_total{job=\"webpage-operator\"}[5m])) by (kind, version)", + "legendFormat": "{{kind}} ({{version}})", + "range": true, + "refId": "A" + } + ], + "title": "Reconciliation Rate (Started)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Success vs Failure rate of reconciliations", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { 
+ "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Success" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Failure" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "calcs": ["last", "mean"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(otel_operator_sdk_reconciliations_success_total{job=\"webpage-operator\"}[5m]))", + "legendFormat": "Success", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(otel_operator_sdk_reconciliations_failed_total{job=\"webpage-operator\"}[5m]))", + "legendFormat": "Failure", + "range": true, + "refId": "B" + } + ], + "title": "Reconciliation Success vs Failure Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Current number of reconciliations being executed", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 5 + }, + { + "color": "red", + 
"value": 10 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 8 + }, + "id": 3, + "options": { + "orientation": "auto", + "reduceOptions": { + "values": false, + "calcs": ["lastNotNull"], + "fields": "" + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(otel_operator_sdk_reconciliations_executions{job=\"webpage-operator\"})", + "legendFormat": "Executing", + "range": true, + "refId": "A" + } + ], + "title": "Currently Executing Reconciliations", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Current reconciliation queue size", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 10 + }, + { + "color": "red", + "value": 50 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 8 + }, + "id": 4, + "options": { + "orientation": "auto", + "reduceOptions": { + "values": false, + "calcs": ["lastNotNull"], + "fields": "" + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(otel_operator_sdk_reconciliations_queue_size{job=\"webpage-operator\"})", + "legendFormat": "Queue Size", + "range": true, + "refId": "A" + } + ], + "title": "Reconciliation Queue Size", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Total reconciliations started", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + 
}, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 8 + }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "values": false, + "calcs": ["lastNotNull"], + "fields": "" + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(otel_operator_sdk_reconciliations_started_total{job=\"webpage-operator\"})", + "legendFormat": "Total", + "range": true, + "refId": "A" + } + ], + "title": "Total Reconciliations", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Overall rate of failed reconciliations per second (all exception types combined)", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 8 + }, + "id": 6, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "values": false, + "calcs": ["lastNotNull"], + "fields": "" + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(otel_operator_sdk_reconciliations_failed_total{job=\"webpage-operator\"}[5m]))", + "legendFormat": "Error Rate", + "range": true, + "refId": "A" + } + ], + "title": "Error Rate", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Controller 
execution time percentiles", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 7, + "options": { + "legend": { + "calcs": ["last", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.50, sum(rate(otel_operator_sdk_controllers_execution_reconcile_seconds_bucket{job=\"webpage-operator\"}[5m])) by (le, controller))", + "legendFormat": "p50 - {{controller}}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum(rate(otel_operator_sdk_controllers_execution_reconcile_seconds_bucket{job=\"webpage-operator\"}[5m])) by (le, controller))", + "legendFormat": "p95 - {{controller}}", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, 
sum(rate(otel_operator_sdk_controllers_execution_reconcile_seconds_bucket{job=\"webpage-operator\"}[5m])) by (le, controller))", + "legendFormat": "p99 - {{controller}}", + "range": true, + "refId": "C" + } + ], + "title": "Reconciliation Execution Time (Percentiles)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Rate of events received by the operator", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 8, + "options": { + "legend": { + "calcs": ["last", "mean"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(otel_operator_sdk_events_received_total{job=\"webpage-operator\"}[5m])) by (event, action)", + "legendFormat": "{{event}} - {{action}}", + "range": true, + "refId": "A" + } + ], + "title": "Event Reception Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Failures by exception type", + 
"fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "id": 9, + "options": { + "legend": { + "calcs": ["last", "sum"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(otel_operator_sdk_reconciliations_failed_total{job=\"webpage-operator\"}[5m])) by (exception)", + "legendFormat": "{{exception}}", + "range": true, + "refId": "A" + } + ], + "title": "Failures by Exception Type", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Controller execution success vs failure", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 
5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "id": 10, + "options": { + "legend": { + "calcs": ["last", "mean"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(otel_operator_sdk_controllers_execution_reconcile_success_total{job=\"webpage-operator\"}[5m])) by (type)", + "legendFormat": "Success - {{type}}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(otel_operator_sdk_controllers_execution_reconcile_failure_total{job=\"webpage-operator\"}[5m])) by (exception)", + "legendFormat": "Failure - {{exception}}", + "range": true, + "refId": "B" + } + ], + "title": "Controller Execution Success vs Failure", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Rate of delete events received", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": 
false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 32 + }, + "id": 11, + "options": { + "legend": { + "calcs": ["last", "sum"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(otel_operator_sdk_events_delete_total{job=\"webpage-operator\"}[5m])) by (kind, version)", + "legendFormat": "{{kind}} ({{version}})", + "range": true, + "refId": "A" + } + ], + "title": "Delete Event Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Reconciliation retry information", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 32 + }, + "id": 12, + "options": { + "legend": { + "calcs": ["last", "mean"], + "displayMode": "table", + 
"placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(otel_operator_sdk_reconciliations_started_total{job=\"webpage-operator\", operator_sdk_reconciliations_retries_last=\"true\"}[5m]))", + "legendFormat": "Last Retry Attempts", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(otel_operator_sdk_reconciliations_started_total{job=\"webpage-operator\", operator_sdk_reconciliations_retries_last=\"false\"}[5m]))", + "legendFormat": "Retries (Not Last)", + "range": true, + "refId": "B" + } + ], + "title": "Reconciliation Retry Rate", + "type": "timeseries" + } + ], + "refresh": "10s", + "schemaVersion": 38, + "style": "dark", + "tags": ["operator", "kubernetes", "josdk"], + "templating": { + "list": [] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "JOSDK - Operator Metrics", + "uid": "josdk-operator-metrics", + "version": 0, + "weekStart": "" +} diff --git a/observability/jvm-metrics-dashboard.json b/observability/jvm-metrics-dashboard.json new file mode 100644 index 0000000000..0a817aa09c --- /dev/null +++ b/observability/jvm-metrics-dashboard.json @@ -0,0 +1,857 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + 
"axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": ["last", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "otel_jvm_memory_used_bytes{job=\"webpage-operator\"}", + "legendFormat": "{{area}} - {{id}}", + "range": true, + "refId": "A" + } + ], + "title": "JVM Memory Used", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + 
"thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "calcs": ["last", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "otel_jvm_threads_live_threads{job=\"webpage-operator\"}", + "legendFormat": "Live Threads", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "otel_jvm_threads_daemon_threads{job=\"webpage-operator\"}", + "legendFormat": "Daemon Threads", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "otel_jvm_threads_peak_threads{job=\"webpage-operator\"}", + "legendFormat": "Peak Threads", + "range": true, + "refId": "C" + } + ], + "title": "JVM Threads", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + 
"thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 3, + "options": { + "legend": { + "calcs": ["last", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "rate(otel_jvm_gc_pause_seconds_sum{job=\"webpage-operator\"}[5m])", + "legendFormat": "{{action}} - {{cause}}", + "range": true, + "refId": "A" + } + ], + "title": "GC Pause Time Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 4, + "options": { + "legend": { + "calcs": ["last"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + 
"expr": "rate(otel_jvm_gc_pause_seconds_count{job=\"webpage-operator\"}[5m])", + "legendFormat": "{{action}} - {{cause}}", + "range": true, + "refId": "A" + } + ], + "title": "GC Pause Count Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 16 + }, + "id": 5, + "options": { + "orientation": "auto", + "reduceOptions": { + "values": false, + "calcs": ["lastNotNull"], + "fields": "" + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "otel_system_cpu_usage{job=\"webpage-operator\"}", + "legendFormat": "CPU Usage", + "range": true, + "refId": "A" + } + ], + "title": "CPU Usage", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 16 + }, + "id": 6, + "options": { + "orientation": "auto", + "reduceOptions": { + "values": false, + "calcs": ["lastNotNull"], + "fields": "" + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "otel_jvm_classes_loaded_classes{job=\"webpage-operator\"}", + "legendFormat": "Classes Loaded", + "range": true, + 
"refId": "A" + } + ], + "title": "Classes Loaded", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 16 + }, + "id": 7, + "options": { + "orientation": "auto", + "reduceOptions": { + "values": false, + "calcs": ["lastNotNull"], + "fields": "" + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "otel_process_uptime_seconds{job=\"webpage-operator\"}", + "legendFormat": "Uptime", + "range": true, + "refId": "A" + } + ], + "title": "Process Uptime", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 16 + }, + "id": 8, + "options": { + "orientation": "auto", + "reduceOptions": { + "values": false, + "calcs": ["lastNotNull"], + "fields": "" + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "otel_system_cpu_count{job=\"webpage-operator\"}", + "legendFormat": "CPU Count", + "range": true, + "refId": "A" + } + ], + "title": "CPU Count", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": 
{ + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "id": 9, + "options": { + "legend": { + "calcs": ["last"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "rate(otel_jvm_gc_memory_allocated_bytes_total{job=\"webpage-operator\"}[5m])", + "legendFormat": "Allocated", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "rate(otel_jvm_gc_memory_promoted_bytes_total{job=\"webpage-operator\"}[5m])", + "legendFormat": "Promoted", + "range": true, + "refId": "B" + } + ], + "title": "GC Memory Allocation Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { 
+ "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "id": 10, + "options": { + "legend": { + "calcs": ["last", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "otel_jvm_memory_max_bytes{job=\"webpage-operator\", area=\"heap\"}", + "legendFormat": "Max Heap", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "otel_jvm_memory_committed_bytes{job=\"webpage-operator\", area=\"heap\"}", + "legendFormat": "Committed Heap", + "range": true, + "refId": "B" + } + ], + "title": "Heap Memory Max vs Committed", + "type": "timeseries" + } + ], + "refresh": "10s", + "schemaVersion": 38, + "style": "dark", + "tags": ["jvm", "java", "josdk"], + "templating": { + "list": [] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "JOSDK - JVM Metrics", + "uid": "josdk-jvm-metrics", + "version": 0, + "weekStart": "" +} From ff05901f8a5089229a28b1ad70288ff69b044bb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Attila=20M=C3=A9sz=C3=A1ros?= Date: Mon, 9 Feb 2026 10:06:15 +0100 Subject: [PATCH 10/18] wip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Attila Mészáros --- 
observability/install-observability.sh | 45 +++++++++++++------ .../operator/sample/WebPageOperator.java | 4 +- .../webpage/src/main/resources/log4j2.xml | 2 +- .../src/main/resources/otlp-config.yaml | 3 +- 4 files changed, 36 insertions(+), 18 deletions(-) diff --git a/observability/install-observability.sh b/observability/install-observability.sh index 2c81f2bf38..017e9adf86 100755 --- a/observability/install-observability.sh +++ b/observability/install-observability.sh @@ -45,6 +45,17 @@ helm repo add prometheus-community https://prometheus-community.github.io/helm-c helm repo update echo -e "${GREEN}✓ Helm repositories added${NC}" +echo -e "\n${GREEN}========================================${NC}" +echo -e "${GREEN}Installing Components (Parallel)${NC}" +echo -e "${GREEN}========================================${NC}" +echo -e "The following will be installed:" +echo -e " • cert-manager" +echo -e " • OpenTelemetry Operator" +echo -e " • Prometheus & Grafana" +echo -e " • OpenTelemetry Collector" +echo -e " • Service Monitors" +echo -e "\n${YELLOW}All resources will be applied first, then we'll wait for them to become ready.${NC}\n" + # Install cert-manager (required for OpenTelemetry Operator) echo -e "\n${YELLOW}Installing cert-manager...${NC}" if kubectl get namespace cert-manager > /dev/null 2>&1; then @@ -53,9 +64,8 @@ else kubectl create namespace cert-manager helm install cert-manager jetstack/cert-manager \ --namespace cert-manager \ - --set crds.enabled=true \ - --wait - echo -e "${GREEN}✓ cert-manager installed${NC}" + --set crds.enabled=true + echo -e "${GREEN}✓ cert-manager installation started${NC}" fi # Create observability namespace @@ -70,15 +80,13 @@ if helm list -n observability | grep -q opentelemetry-operator; then echo -e "${YELLOW}OpenTelemetry Operator already installed, upgrading...${NC}" helm upgrade opentelemetry-operator open-telemetry/opentelemetry-operator \ --namespace observability \ - --set 
"manager.collectorImage.repository=otel/opentelemetry-collector-contrib" \ - --wait + --set "manager.collectorImage.repository=otel/opentelemetry-collector-contrib" else helm install opentelemetry-operator open-telemetry/opentelemetry-operator \ --namespace observability \ - --set "manager.collectorImage.repository=otel/opentelemetry-collector-contrib" \ - --wait + --set "manager.collectorImage.repository=otel/opentelemetry-collector-contrib" fi -echo -e "${GREEN}✓ OpenTelemetry Operator installed${NC}" +echo -e "${GREEN}✓ OpenTelemetry Operator installation started${NC}" # Install kube-prometheus-stack (includes Prometheus + Grafana) echo -e "\n${YELLOW}Installing Prometheus and Grafana stack...${NC}" @@ -88,17 +96,15 @@ if helm list -n observability | grep -q kube-prometheus-stack; then --namespace observability \ --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false \ --set prometheus.prometheusSpec.podMonitorSelectorNilUsesHelmValues=false \ - --set grafana.adminPassword=admin \ - --wait + --set grafana.adminPassword=admin else helm install kube-prometheus-stack prometheus-community/kube-prometheus-stack \ --namespace observability \ --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false \ --set prometheus.prometheusSpec.podMonitorSelectorNilUsesHelmValues=false \ - --set grafana.adminPassword=admin \ - --wait + --set grafana.adminPassword=admin fi -echo -e "${GREEN}✓ Prometheus and Grafana installed${NC}" +echo -e "${GREEN}✓ Prometheus and Grafana installation started${NC}" # Create OpenTelemetry Collector instance echo -e "\n${YELLOW}Creating OpenTelemetry Collector...${NC}" @@ -195,8 +201,19 @@ EOF echo -e "${GREEN}✓ ServiceMonitor created${NC}" # Wait for all pods to be ready -echo -e "\n${YELLOW}Waiting for all pods to be ready...${NC}" +echo -e "\n${GREEN}========================================${NC}" +echo -e "${GREEN}All resources have been applied!${NC}" +echo -e 
"${GREEN}========================================${NC}" +echo -e "\n${YELLOW}Waiting for all pods to become ready (this may take 2-3 minutes)...${NC}" + +# Wait for cert-manager pods (best effort: warn on stderr instead of aborting under set -e) +echo -e "${YELLOW}Checking cert-manager pods...${NC}" +kubectl wait --for=condition=ready pod --all -n cert-manager --timeout=300s || echo -e "${YELLOW}Warning: cert-manager pods did not report ready (see kubectl output above); continuing${NC}" >&2 + +# Wait for observability pods +echo -e "${YELLOW}Checking observability pods...${NC}" kubectl wait --for=condition=ready pod --all -n observability --timeout=300s + echo -e "${GREEN}✓ All pods are ready${NC}" # Import Grafana dashboards diff --git a/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java b/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java index a2c342dc5e..dd1155eab3 100644 --- a/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java +++ b/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java @@ -61,6 +61,7 @@ public static void main(String[] args) throws IOException { Metrics metrics = initOTLPMetrics(); Operator operator = new Operator(o -> o.withStopOnInformerErrorDuringStartup(false).withMetrics(metrics)); + String reconcilerEnvVar = System.getenv(WEBPAGE_RECONCILER_ENV); if (WEBPAGE_CLASSIC_RECONCILER_ENV_VALUE.equals(reconcilerEnvVar)) { operator.register(new WebPageReconciler()); @@ -81,7 +82,7 @@ public static void main(String[] args) throws IOException { private static @NonNull Metrics initOTLPMetrics() { Map configProperties = loadConfigFromYaml(); - OtlpConfig otlpConfig = configProperties::get; + OtlpConfig otlpConfig = key -> configProperties.get(key); MeterRegistry registry = new OtlpMeterRegistry(otlpConfig, Clock.SYSTEM); @@ -93,7 +94,6 @@ public static void main(String[] args) throws IOException { new ClassLoaderMetrics().bindTo(registry); new ProcessorMetrics().bindTo(registry);
new UptimeMetrics().bindTo(registry); - log.info("JVM and system metrics registered"); return MicrometerMetrics.newPerResourceCollectingMicrometerMetricsBuilder(registry) .collectingMetricsPerResource() diff --git a/sample-operators/webpage/src/main/resources/log4j2.xml b/sample-operators/webpage/src/main/resources/log4j2.xml index 0bf270c7e6..ebe273e40e 100644 --- a/sample-operators/webpage/src/main/resources/log4j2.xml +++ b/sample-operators/webpage/src/main/resources/log4j2.xml @@ -23,7 +23,7 @@ - + diff --git a/sample-operators/webpage/src/main/resources/otlp-config.yaml b/sample-operators/webpage/src/main/resources/otlp-config.yaml index ca93bfc965..17d773eb70 100644 --- a/sample-operators/webpage/src/main/resources/otlp-config.yaml +++ b/sample-operators/webpage/src/main/resources/otlp-config.yaml @@ -16,7 +16,8 @@ otlp: # OTLP Collector endpoint - see observability/install-observability.sh for setup - url: "http://otel-collector-collector.observability.svc.cluster.local:4318/v1/metrics" + url: "http://localhost:4318/v1/metrics" +# url: "http://otel-collector-collector.observability.svc.cluster.local:4318/v1/metrics" step: 15s batchSize: 15000 aggregationTemporality: "cumulative" From f7e2565189dc61e6499d7e23547d8ae9a25e2500 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Attila=20M=C3=A9sz=C3=A1ros?= Date: Mon, 9 Feb 2026 10:19:39 +0100 Subject: [PATCH 11/18] wip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Attila Mészáros --- sample-operators/webpage/README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sample-operators/webpage/README.md b/sample-operators/webpage/README.md index 7718d0f2f3..96329d18a9 100644 --- a/sample-operators/webpage/README.md +++ b/sample-operators/webpage/README.md @@ -76,3 +76,6 @@ of your choice. The JAR file is built using your local Maven and JDK and then co 1. Deploy the CRD: `kubectl apply -f target/classes/META-INF/fabric8/webpages.sample.javaoperatorsdk-v1.yml` 2. 
Deploy the operator: `kubectl apply -f k8s/operator.yaml` + +To install the observability components (Prometheus, OpenTelemetry, and Grafana), run: +[install-observability.sh](../../observability/install-observability.sh) From 98f200f2b158d1d3f1e9c7fded43ef40cce3c6af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Attila=20M=C3=A9sz=C3=A1ros?= Date: Mon, 9 Feb 2026 15:46:11 +0100 Subject: [PATCH 12/18] wip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Attila Mészáros --- observability/install-observability.sh | 3 ++- sample-operators/webpage/pom.xml | 7 +++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/observability/install-observability.sh b/observability/install-observability.sh index 017e9adf86..ea3a083eec 100755 --- a/observability/install-observability.sh +++ b/observability/install-observability.sh @@ -181,7 +181,7 @@ spec: targetPort: 8889 protocol: TCP selector: - app.kubernetes.io/name: otel-collector + app.kubernetes.io/name: otel-collector-collector --- apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor @@ -190,6 +190,7 @@ metadata: namespace: observability labels: app: otel-collector + release: kube-prometheus-stack spec: selector: matchLabels: diff --git a/sample-operators/webpage/pom.xml b/sample-operators/webpage/pom.xml index 97c885e403..10b0352605 100644 --- a/sample-operators/webpage/pom.xml +++ b/sample-operators/webpage/pom.xml @@ -39,6 +39,13 @@ pom import + + io.micrometer + micrometer-bom + ${micrometer-core.version} + pom + import + + From 77307e23297cbc817f3e25bc32f0618ee6b69943 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Attila=20M=C3=A9sz=C3=A1ros?= Date: Mon, 9 Feb 2026 15:47:03 +0100 Subject: [PATCH 13/18] wip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Attila Mészáros --- sample-operators/webpage/pom.xml | 14 +++++++------- .../operator/sample/WebPageOperator.java | 1 + 2 files changed, 8
insertions(+), 7 deletions(-) diff --git a/sample-operators/webpage/pom.xml b/sample-operators/webpage/pom.xml index 10b0352605..f8c79cf268 100644 --- a/sample-operators/webpage/pom.xml +++ b/sample-operators/webpage/pom.xml @@ -39,13 +39,13 @@ pom import - - io.micrometer - micrometer-bom - ${micrometer-core.version} - pom - import - + + io.micrometer + micrometer-bom + ${micrometer-core.version} + pom + import + diff --git a/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java b/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java index dd1155eab3..837963f00a 100644 --- a/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java +++ b/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java @@ -57,6 +57,7 @@ public class WebPageOperator { public static void main(String[] args) throws IOException { log.info("WebServer Operator starting!"); + // TODO remove otel prefix, add job and additional labels?! 
// Load configuration from config.yaml Metrics metrics = initOTLPMetrics(); Operator operator = From 1daab47fa8dc6f99620f80482d9d0abfaa3a767e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Attila=20M=C3=A9sz=C3=A1ros?= Date: Mon, 9 Feb 2026 16:18:30 +0100 Subject: [PATCH 14/18] wip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Attila Mészáros --- .../java/io/javaoperatorsdk/operator/sample/WebPageOperator.java | 1 + 1 file changed, 1 insertion(+) diff --git a/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java b/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java index 837963f00a..d92dfdd863 100644 --- a/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java +++ b/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java @@ -58,6 +58,7 @@ public static void main(String[] args) throws IOException { log.info("WebServer Operator starting!"); // TODO remove otel prefix, add job and additional labels?! 
+ // TODO add test for checking if there are metrics in prometheus // Load configuration from config.yaml Metrics metrics = initOTLPMetrics(); Operator operator = From cefad784386786f79891ff849f72c5a58b94980e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Attila=20M=C3=A9sz=C3=A1ros?= Date: Mon, 9 Feb 2026 16:59:54 +0100 Subject: [PATCH 15/18] wip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Attila Mészáros --- observability/install-observability.sh | 3 ++- .../operator/sample/WebPageOperator.java | 18 +++++++++++++++++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/observability/install-observability.sh b/observability/install-observability.sh index ea3a083eec..e724ac54d4 100755 --- a/observability/install-observability.sh +++ b/observability/install-observability.sh @@ -143,7 +143,7 @@ spec: exporters: prometheus: endpoint: "0.0.0.0:8889" - namespace: "otel" + namespace: "" send_timestamps: true metric_expiration: 5m debug: @@ -192,6 +192,7 @@ metadata: app: otel-collector release: kube-prometheus-stack spec: + jobLabel: app selector: matchLabels: app: otel-collector diff --git a/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java b/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java index d92dfdd863..e43a253511 100644 --- a/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java +++ b/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java @@ -22,6 +22,7 @@ import java.util.Map; import org.jspecify.annotations.NonNull; +import org.jspecify.annotations.Nullable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.yaml.snakeyaml.Yaml; @@ -84,7 +85,22 @@ public static void main(String[] args) throws IOException { private static @NonNull Metrics initOTLPMetrics() { Map configProperties = loadConfigFromYaml(); - 
OtlpConfig otlpConfig = key -> configProperties.get(key); + OtlpConfig otlpConfig = new OtlpConfig() { + @Override + public String prefix() { + return ""; + } + + @Override + public @Nullable String get(String key) { + return configProperties.get(key); + } + + @Override + public Map resourceAttributes() { + return Map.of("service.name","josdk","operator","webpage"); + } + }; MeterRegistry registry = new OtlpMeterRegistry(otlpConfig, Clock.SYSTEM); From b40766e992cc9b9a457c38f60b8cc5caf176ef1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Attila=20M=C3=A9sz=C3=A1ros?= Date: Mon, 9 Feb 2026 18:42:54 +0100 Subject: [PATCH 16/18] wip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Attila Mészáros --- observability/README.md | 22 +++++++---- observability/install-observability.sh | 2 + .../josdk-operator-metrics-dashboard.json | 34 ++++++++-------- observability/jvm-metrics-dashboard.json | 30 +++++++------- .../operator/sample/WebPageOperator.java | 39 +++++++++++-------- 5 files changed, 70 insertions(+), 57 deletions(-) diff --git a/observability/README.md b/observability/README.md index 9706a466e9..58caae27d0 100644 --- a/observability/README.md +++ b/observability/README.md @@ -61,6 +61,9 @@ Monitors Java Virtual Machine health and performance: - `system.cpu.usage`, `system.cpu.count` - `process.uptime` +**Filtering:** +All panels filter by `service_name="josdk"` to show metrics only from your operator. + ### 2. Java Operator SDK Metrics Dashboard (`josdk-operator-metrics-dashboard.json`) Monitors Kubernetes operator performance and health: @@ -87,6 +90,9 @@ Monitors Kubernetes operator performance and health: - `operator.sdk.events.received`, `.delete` - Event reception - Retry metrics and failure breakdowns +**Filtering:** +All panels filter by `service_name="josdk"` to show metrics only from your operator. 
+ ## Importing Dashboards into Grafana ### Automatic Import (Default) @@ -180,8 +186,8 @@ Open http://localhost:9090/targets and verify the OTLP collector target is UP. ### Verify Metrics in Prometheus Open Prometheus UI and search for metrics: -- JVM metrics: `otel_jvm_*` -- Operator metrics: `otel_operator_sdk_*` +- JVM metrics: `jvm_*` +- Operator metrics: `operator_sdk_*` ### Check Grafana Data Source 1. Navigate to **Configuration** → **Data Sources** @@ -217,25 +223,25 @@ After making changes, re-import the dashboard using one of the methods above. ### JVM Metrics ```promql # Heap memory usage percentage -(otel_jvm_memory_used_bytes{area="heap"} / otel_jvm_memory_max_bytes{area="heap"}) * 100 +(jvm_memory_used_bytes{area="heap"} / jvm_memory_max_bytes{area="heap"}) * 100 # GC throughput (percentage of time NOT in GC) -100 - (rate(otel_jvm_gc_pause_seconds_sum[5m]) * 100) +100 - (rate(jvm_gc_pause_seconds_sum[5m]) * 100) # Thread count trend -otel_jvm_threads_live_threads +jvm_threads_live_threads ``` ### Operator Metrics ```promql # Reconciliation success rate -rate(otel_operator_sdk_reconciliations_success_total[5m]) / rate(otel_operator_sdk_reconciliations_started_total[5m]) +rate(operator_sdk_reconciliations_success_total[5m]) / rate(operator_sdk_reconciliations_started_total[5m]) # Average reconciliation time -rate(otel_operator_sdk_controllers_execution_reconcile_seconds_sum[5m]) / rate(otel_operator_sdk_controllers_execution_reconcile_seconds_count[5m]) +rate(operator_sdk_controllers_execution_reconcile_seconds_sum[5m]) / rate(operator_sdk_controllers_execution_reconcile_seconds_count[5m]) # Queue saturation -otel_operator_sdk_reconciliations_queue_size / on() group_left() max(otel_operator_sdk_reconciliations_queue_size) +operator_sdk_reconciliations_queue_size / on() group_left() max(operator_sdk_reconciliations_queue_size) ``` ## References diff --git a/observability/install-observability.sh b/observability/install-observability.sh index 
e724ac54d4..dc7430520b 100755 --- a/observability/install-observability.sh +++ b/observability/install-observability.sh @@ -146,6 +146,8 @@ spec: namespace: "" send_timestamps: true metric_expiration: 5m + resource_to_telemetry_conversion: + enabled: true debug: verbosity: detailed sampling_initial: 5 diff --git a/observability/josdk-operator-metrics-dashboard.json b/observability/josdk-operator-metrics-dashboard.json index 006821a467..6b53d26611 100644 --- a/observability/josdk-operator-metrics-dashboard.json +++ b/observability/josdk-operator-metrics-dashboard.json @@ -103,7 +103,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(otel_operator_sdk_reconciliations_started_total{job=\"webpage-operator\"}[5m])) by (kind, version)", + "expr": "sum(rate(operator_sdk_reconciliations_started_total{service_name=\"josdk\"}[5m])) by (kind, version)", "legendFormat": "{{kind}} ({{version}})", "range": true, "refId": "A" @@ -224,7 +224,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(otel_operator_sdk_reconciliations_success_total{job=\"webpage-operator\"}[5m]))", + "expr": "sum(rate(operator_sdk_reconciliations_success_total{service_name=\"josdk\"}[5m]))", "legendFormat": "Success", "range": true, "refId": "A" @@ -235,7 +235,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(otel_operator_sdk_reconciliations_failed_total{job=\"webpage-operator\"}[5m]))", + "expr": "sum(rate(operator_sdk_reconciliations_failed_total{service_name=\"josdk\"}[5m]))", "legendFormat": "Failure", "range": true, "refId": "B" @@ -302,7 +302,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(otel_operator_sdk_reconciliations_executions{job=\"webpage-operator\"})", + "expr": "sum(operator_sdk_reconciliations_executions{service_name=\"josdk\"})", "legendFormat": "Executing", "range": true, "refId": "A" @@ -369,7 +369,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": 
"sum(otel_operator_sdk_reconciliations_queue_size{job=\"webpage-operator\"})", + "expr": "sum(operator_sdk_reconciliations_queue_size{service_name=\"josdk\"})", "legendFormat": "Queue Size", "range": true, "refId": "A" @@ -430,7 +430,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(otel_operator_sdk_reconciliations_started_total{job=\"webpage-operator\"})", + "expr": "sum(operator_sdk_reconciliations_started_total{service_name=\"josdk\"})", "legendFormat": "Total", "range": true, "refId": "A" @@ -495,7 +495,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(otel_operator_sdk_reconciliations_failed_total{job=\"webpage-operator\"}[5m]))", + "expr": "sum(rate(operator_sdk_reconciliations_failed_total{service_name=\"josdk\"}[5m]))", "legendFormat": "Error Rate", "range": true, "refId": "A" @@ -585,7 +585,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "histogram_quantile(0.50, sum(rate(otel_operator_sdk_controllers_execution_reconcile_seconds_bucket{job=\"webpage-operator\"}[5m])) by (le, controller))", + "expr": "histogram_quantile(0.50, sum(rate(operator_sdk_controllers_execution_reconcile_seconds_bucket{service_name=\"josdk\"}[5m])) by (le, controller))", "legendFormat": "p50 - {{controller}}", "range": true, "refId": "A" @@ -596,7 +596,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "histogram_quantile(0.95, sum(rate(otel_operator_sdk_controllers_execution_reconcile_seconds_bucket{job=\"webpage-operator\"}[5m])) by (le, controller))", + "expr": "histogram_quantile(0.95, sum(rate(operator_sdk_controllers_execution_reconcile_seconds_bucket{service_name=\"josdk\"}[5m])) by (le, controller))", "legendFormat": "p95 - {{controller}}", "range": true, "refId": "B" @@ -607,7 +607,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "histogram_quantile(0.99, sum(rate(otel_operator_sdk_controllers_execution_reconcile_seconds_bucket{job=\"webpage-operator\"}[5m])) by (le, controller))", + "expr": 
"histogram_quantile(0.99, sum(rate(operator_sdk_controllers_execution_reconcile_seconds_bucket{service_name=\"josdk\"}[5m])) by (le, controller))", "legendFormat": "p99 - {{controller}}", "range": true, "refId": "C" @@ -697,7 +697,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(otel_operator_sdk_events_received_total{job=\"webpage-operator\"}[5m])) by (event, action)", + "expr": "sum(rate(operator_sdk_events_received_total{service_name=\"josdk\"}[5m])) by (event, action)", "legendFormat": "{{event}} - {{action}}", "range": true, "refId": "A" @@ -787,7 +787,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(otel_operator_sdk_reconciliations_failed_total{job=\"webpage-operator\"}[5m])) by (exception)", + "expr": "sum(rate(operator_sdk_reconciliations_failed_total{service_name=\"josdk\"}[5m])) by (exception)", "legendFormat": "{{exception}}", "range": true, "refId": "A" @@ -877,7 +877,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(otel_operator_sdk_controllers_execution_reconcile_success_total{job=\"webpage-operator\"}[5m])) by (type)", + "expr": "sum(rate(operator_sdk_controllers_execution_reconcile_success_total{service_name=\"josdk\"}[5m])) by (type)", "legendFormat": "Success - {{type}}", "range": true, "refId": "A" @@ -888,7 +888,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(otel_operator_sdk_controllers_execution_reconcile_failure_total{job=\"webpage-operator\"}[5m])) by (exception)", + "expr": "sum(rate(operator_sdk_controllers_execution_reconcile_failure_total{service_name=\"josdk\"}[5m])) by (exception)", "legendFormat": "Failure - {{exception}}", "range": true, "refId": "B" @@ -978,7 +978,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(otel_operator_sdk_events_delete_total{job=\"webpage-operator\"}[5m])) by (kind, version)", + "expr": "sum(rate(operator_sdk_events_delete_total{service_name=\"josdk\"}[5m])) by (kind, version)", "legendFormat": 
"{{kind}} ({{version}})", "range": true, "refId": "A" @@ -1068,7 +1068,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(otel_operator_sdk_reconciliations_started_total{job=\"webpage-operator\", operator_sdk_reconciliations_retries_last=\"true\"}[5m]))", + "expr": "sum(rate(operator_sdk_reconciliations_started_total{service_name=\"josdk\", operator_sdk_reconciliations_retries_last=\"true\"}[5m]))", "legendFormat": "Last Retry Attempts", "range": true, "refId": "A" @@ -1079,7 +1079,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(otel_operator_sdk_reconciliations_started_total{job=\"webpage-operator\", operator_sdk_reconciliations_retries_last=\"false\"}[5m]))", + "expr": "sum(rate(operator_sdk_reconciliations_started_total{service_name=\"josdk\", operator_sdk_reconciliations_retries_last=\"false\"}[5m]))", "legendFormat": "Retries (Not Last)", "range": true, "refId": "B" diff --git a/observability/jvm-metrics-dashboard.json b/observability/jvm-metrics-dashboard.json index 0a817aa09c..528f29674e 100644 --- a/observability/jvm-metrics-dashboard.json +++ b/observability/jvm-metrics-dashboard.json @@ -106,7 +106,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "otel_jvm_memory_used_bytes{job=\"webpage-operator\"}", + "expr": "jvm_memory_used_bytes{service_name=\"josdk\"}", "legendFormat": "{{area}} - {{id}}", "range": true, "refId": "A" @@ -195,7 +195,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "otel_jvm_threads_live_threads{job=\"webpage-operator\"}", + "expr": "jvm_threads_live{service_name=\"josdk\"}", "legendFormat": "Live Threads", "range": true, "refId": "A" @@ -206,7 +206,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "otel_jvm_threads_daemon_threads{job=\"webpage-operator\"}", + "expr": "jvm_threads_daemon_threads{service_name=\"josdk\"}", "legendFormat": "Daemon Threads", "range": true, "refId": "B" @@ -217,7 +217,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": 
"otel_jvm_threads_peak_threads{job=\"webpage-operator\"}", + "expr": "jvm_threads_peak_threads{service_name=\"josdk\"}", "legendFormat": "Peak Threads", "range": true, "refId": "C" @@ -306,7 +306,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "rate(otel_jvm_gc_pause_seconds_sum{job=\"webpage-operator\"}[5m])", + "expr": "rate(jvm_gc_pause_milliseconds_sum{service_name=\"josdk\"}[5m])", "legendFormat": "{{action}} - {{cause}}", "range": true, "refId": "A" @@ -395,7 +395,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "rate(otel_jvm_gc_pause_seconds_count{job=\"webpage-operator\"}[5m])", + "expr": "rate(jvm_gc_pause_milliseconds_count{service_name=\"josdk\"}[5m])", "legendFormat": "{{action}} - {{cause}}", "range": true, "refId": "A" @@ -453,7 +453,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "otel_system_cpu_usage{job=\"webpage-operator\"}", + "expr": "system_cpu_usage{service_name=\"josdk\"}", "legendFormat": "CPU Usage", "range": true, "refId": "A" @@ -511,7 +511,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "otel_jvm_classes_loaded_classes{job=\"webpage-operator\"}", + "expr": "jvm_classes_loaded{service_name=\"josdk\"}", "legendFormat": "Classes Loaded", "range": true, "refId": "A" @@ -540,7 +540,7 @@ } ] }, - "unit": "s" + "unit": "ms" }, "overrides": [] }, @@ -569,7 +569,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "otel_process_uptime_seconds{job=\"webpage-operator\"}", + "expr": "process_uptime_milliseconds{service_name=\"josdk\"}", "legendFormat": "Uptime", "range": true, "refId": "A" @@ -627,7 +627,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "otel_system_cpu_count{job=\"webpage-operator\"}", + "expr": "system_cpu_count{service_name=\"josdk\"}", "legendFormat": "CPU Count", "range": true, "refId": "A" @@ -716,7 +716,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "rate(otel_jvm_gc_memory_allocated_bytes_total{job=\"webpage-operator\"}[5m])", + "expr": 
"rate(jvm_gc_memory_allocated_bytes_total{service_name=\"josdk\"}[5m])", "legendFormat": "Allocated", "range": true, "refId": "A" @@ -727,7 +727,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "rate(otel_jvm_gc_memory_promoted_bytes_total{job=\"webpage-operator\"}[5m])", + "expr": "rate(jvm_gc_memory_promoted_bytes_total{service_name=\"josdk\"}[5m])", "legendFormat": "Promoted", "range": true, "refId": "B" @@ -816,7 +816,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "otel_jvm_memory_max_bytes{job=\"webpage-operator\", area=\"heap\"}", + "expr": "jvm_memory_max_bytes{service_name=\"josdk\", area=\"heap\"}", "legendFormat": "Max Heap", "range": true, "refId": "A" @@ -827,7 +827,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "otel_jvm_memory_committed_bytes{job=\"webpage-operator\", area=\"heap\"}", + "expr": "jvm_memory_committed_bytes{service_name=\"josdk\", area=\"heap\"}", "legendFormat": "Committed Heap", "range": true, "refId": "B" diff --git a/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java b/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java index e43a253511..ad580736c1 100644 --- a/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java +++ b/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java @@ -58,7 +58,10 @@ public class WebPageOperator { public static void main(String[] args) throws IOException { log.info("WebServer Operator starting!"); - // TODO remove otel prefix, add job and additional labels?! 
+ // TODO change the per-controller gauge names below: + // operator_sdk_reconciliations_queue_size_webpagestandalonedependentsreconciler + // operator_sdk_reconciliations_executions_webpagestandalonedependentsreconciler + // => expose the controller name as a label instead of a metric-name suffix // TODO add test for checking if there are metrics in prometheus // Load configuration from config.yaml Metrics metrics = initOTLPMetrics(); @@ -85,22 +88,24 @@ public static void main(String[] args) throws IOException { private static @NonNull Metrics initOTLPMetrics() { Map configProperties = loadConfigFromYaml(); - OtlpConfig otlpConfig = new OtlpConfig() { - @Override - public String prefix() { - return ""; - } - - @Override - public @Nullable String get(String key) { - return configProperties.get(key); - } - - @Override - public Map resourceAttributes() { - return Map.of("service.name","josdk","operator","webpage"); - } - }; + var otlpConfig = + new OtlpConfig() { + @Override + public String prefix() { + return ""; + } + + @Override + public @Nullable String get(String key) { + return configProperties.get(key); + } + + // these should come from env variables + @Override + public Map resourceAttributes() { + return Map.of("service.name", "josdk", "operator", "webpage"); + } + }; MeterRegistry registry = new OtlpMeterRegistry(otlpConfig, Clock.SYSTEM); From 6efffd6cb172dc6102a249c0046708be9d42b282 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Attila=20M=C3=A9sz=C3=A1ros?= Date: Mon, 9 Feb 2026 21:48:28 +0100 Subject: [PATCH 17/18] improve: micrometer metrics improvements MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Attila Mészáros --- .../micrometer/MicrometerMetrics.java | 40 ++++++++++++------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetrics.java b/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetrics.java index
7beabb7a6e..cd0572db7b 100644 --- a/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetrics.java +++ b/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetrics.java @@ -22,6 +22,8 @@ import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; +import org.jspecify.annotations.NonNull; + import io.fabric8.kubernetes.api.model.HasMetadata; import io.javaoperatorsdk.operator.OperatorException; import io.javaoperatorsdk.operator.api.monitoring.Metrics; @@ -37,8 +39,6 @@ import io.micrometer.core.instrument.Tag; import io.micrometer.core.instrument.Timer; -import static io.javaoperatorsdk.operator.api.reconciler.Constants.CONTROLLER_NAME; - public class MicrometerMetrics implements Metrics { private static final String PREFIX = "operator.sdk."; @@ -48,8 +48,8 @@ public class MicrometerMetrics implements Metrics { private static final String RECONCILIATIONS_RETRIES_LAST = RECONCILIATIONS + "retries.last"; private static final String RECONCILIATIONS_RETRIES_NUMBER = RECONCILIATIONS + "retries.number"; private static final String RECONCILIATIONS_STARTED = RECONCILIATIONS + "started"; - private static final String RECONCILIATIONS_EXECUTIONS = PREFIX + RECONCILIATIONS + "executions."; - private static final String RECONCILIATIONS_QUEUE_SIZE = PREFIX + RECONCILIATIONS + "queue.size."; + private static final String RECONCILIATIONS_EXECUTIONS = PREFIX + RECONCILIATIONS + "executions"; + private static final String RECONCILIATIONS_QUEUE_SIZE = PREFIX + RECONCILIATIONS + "queue.size"; private static final String NAME = "name"; private static final String NAMESPACE = "namespace"; private static final String GROUP = "group"; @@ -59,6 +59,7 @@ public class MicrometerMetrics implements Metrics { private static final String METADATA_PREFIX = "resource."; private static final String CONTROLLERS_EXECUTION = "controllers.execution."; private static final String CONTROLLER = 
"controller"; + private static final String CONTROLLER_NAME = CONTROLLER + ".name"; private static final String SUCCESS_SUFFIX = ".success"; private static final String FAILURE_SUFFIX = ".failure"; private static final String TYPE = "type"; @@ -130,18 +131,27 @@ private MicrometerMetrics( public void controllerRegistered(Controller controller) { final var configuration = controller.getConfiguration(); final var name = configuration.getName(); - final var executingThreadsName = RECONCILIATIONS_EXECUTIONS + name; + final var executingThreadsRefName = reconciliationExecutionGaugeRefName(name); final var resourceClass = configuration.getResourceClass(); - final var tags = new ArrayList(3); + final var tags = new ArrayList(); + tags.add(Tag.of(CONTROLLER_NAME, name)); addGVKTags(GroupVersionKind.gvkFor(resourceClass), tags, false); AtomicInteger executingThreads = - registry.gauge(executingThreadsName, tags, new AtomicInteger(0)); - gauges.put(executingThreadsName, executingThreads); + registry.gauge(RECONCILIATIONS_EXECUTIONS, tags, new AtomicInteger(0)); + gauges.put(executingThreadsRefName, executingThreads); - final var controllerQueueName = RECONCILIATIONS_QUEUE_SIZE + name; + final var controllerQueueRefName = controllerQueueSizeGaugeRefName(name); AtomicInteger controllerQueueSize = - registry.gauge(controllerQueueName, tags, new AtomicInteger(0)); - gauges.put(controllerQueueName, controllerQueueSize); + registry.gauge(RECONCILIATIONS_QUEUE_SIZE, tags, new AtomicInteger(0)); + gauges.put(controllerQueueRefName, controllerQueueSize); + } + + private static @NonNull String reconciliationExecutionGaugeRefName(String controllerName) { + return RECONCILIATIONS_EXECUTIONS + "." + controllerName; + } + + private static @NonNull String controllerQueueSizeGaugeRefName(String controllerName) { + return RECONCILIATIONS_QUEUE_SIZE + "." 
+ controllerName; } @Override @@ -223,7 +233,7 @@ public void reconcileCustomResource( String.valueOf(retryInfo.map(RetryInfo::isLastAttempt).orElse(true)))); var controllerQueueSize = - gauges.get(RECONCILIATIONS_QUEUE_SIZE + metadata.get(CONTROLLER_NAME)); + gauges.get(controllerQueueSizeGaugeRefName(metadata.get(CONTROLLER_NAME).toString())); controllerQueueSize.incrementAndGet(); } @@ -235,18 +245,18 @@ public void finishedReconciliation(HasMetadata resource, Map met @Override public void reconciliationExecutionStarted(HasMetadata resource, Map metadata) { var reconcilerExecutions = - gauges.get(RECONCILIATIONS_EXECUTIONS + metadata.get(CONTROLLER_NAME)); + gauges.get(reconciliationExecutionGaugeRefName(metadata.get(CONTROLLER_NAME).toString())); reconcilerExecutions.incrementAndGet(); } @Override public void reconciliationExecutionFinished(HasMetadata resource, Map metadata) { var reconcilerExecutions = - gauges.get(RECONCILIATIONS_EXECUTIONS + metadata.get(CONTROLLER_NAME)); + gauges.get(reconciliationExecutionGaugeRefName(metadata.get(CONTROLLER_NAME).toString())); reconcilerExecutions.decrementAndGet(); var controllerQueueSize = - gauges.get(RECONCILIATIONS_QUEUE_SIZE + metadata.get(CONTROLLER_NAME)); + gauges.get(controllerQueueSizeGaugeRefName(metadata.get(CONTROLLER_NAME).toString())); controllerQueueSize.decrementAndGet(); } From 4f38ca9d4e5b9be38ae41f185737cabd1b2837f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Attila=20M=C3=A9sz=C3=A1ros?= Date: Tue, 10 Feb 2026 10:12:14 +0100 Subject: [PATCH 18/18] wip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Attila Mészáros --- .../micrometer/MicrometerMetrics.java | 58 +++++++++++++++---- .../api/monitoring/AggregatedMetrics.java | 5 +- .../operator/api/monitoring/Metrics.java | 3 +- .../processing/event/EventProcessor.java | 2 +- .../api/monitoring/AggregatedMetricsTest.java | 10 ++-- 5 files changed, 57 insertions(+), 21 deletions(-) diff --git 
a/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetrics.java b/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetrics.java index cd0572db7b..94391bec82 100644 --- a/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetrics.java +++ b/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetrics.java @@ -221,16 +221,18 @@ public void cleanupDoneFor(ResourceID resourceID, Map metadata) public void reconcileCustomResource( HasMetadata resource, RetryInfo retryInfoNullable, Map metadata) { Optional retryInfo = Optional.ofNullable(retryInfoNullable); - incrementCounter( - ResourceID.fromResource(resource), - RECONCILIATIONS_STARTED, - metadata, - Tag.of( - RECONCILIATIONS_RETRIES_NUMBER, - String.valueOf(retryInfo.map(RetryInfo::getAttemptCount).orElse(0))), - Tag.of( - RECONCILIATIONS_RETRIES_LAST, - String.valueOf(retryInfo.map(RetryInfo::isLastAttempt).orElse(true)))); + ResourceID resourceID = ResourceID.fromResource(resource); + + // Record the counter without retry tags + incrementCounter(resourceID, RECONCILIATIONS_STARTED, metadata); + + // Update retry number gauge + int retryNumber = retryInfo.map(RetryInfo::getAttemptCount).orElse(0); + updateGauge(resourceID, metadata, RECONCILIATIONS_RETRIES_NUMBER, retryNumber); + + // Update retry last attempt gauge (1 for true, 0 for false) + int isLastAttempt = retryInfo.map(RetryInfo::isLastAttempt).orElse(true) ? 
1 : 0; + updateGauge(resourceID, metadata, RECONCILIATIONS_RETRIES_LAST, isLastAttempt); var controllerQueueSize = gauges.get(controllerQueueSizeGaugeRefName(metadata.get(CONTROLLER_NAME).toString())); @@ -238,8 +240,14 @@ public void reconcileCustomResource( } @Override - public void finishedReconciliation(HasMetadata resource, Map metadata) { - incrementCounter(ResourceID.fromResource(resource), RECONCILIATIONS_SUCCESS, metadata); + public void successfullyFinishedReconciliation( + HasMetadata resource, Map metadata) { + ResourceID resourceID = ResourceID.fromResource(resource); + incrementCounter(resourceID, RECONCILIATIONS_SUCCESS, metadata); + + // Reset retry gauges on successful reconciliation + updateGauge(resourceID, metadata, RECONCILIATIONS_RETRIES_NUMBER, 0); + updateGauge(resourceID, metadata, RECONCILIATIONS_RETRIES_LAST, 0); } @Override @@ -335,6 +343,32 @@ private void incrementCounter( counter.increment(); } + private void updateGauge( + ResourceID id, Map metadata, String gaugeName, int value) { + final var tags = new ArrayList(6); + addMetadataTags(id, metadata, tags, false); + + final var gaugeRefName = buildGaugeRefName(id, gaugeName); + AtomicInteger gauge = + gauges.computeIfAbsent( + gaugeRefName, + key -> { + AtomicInteger newGauge = + registry.gauge(PREFIX + gaugeName, tags, new AtomicInteger(0)); + // Find the meter in the registry and record it for cleanup + var meter = registry.find(PREFIX + gaugeName).tags(tags).gauge(); + if (meter != null) { + cleaner.recordAssociation(id, meter); + } + return newGauge; + }); + gauge.set(value); + } + + private String buildGaugeRefName(ResourceID id, String gaugeName) { + return gaugeName + "." + id.getName() + "." 
+ id.getNamespace().orElse(CLUSTER); + } + protected Set recordedMeterIdsFor(ResourceID resourceID) { return cleaner.recordedMeterIdsFor(resourceID); } diff --git a/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/monitoring/AggregatedMetrics.java b/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/monitoring/AggregatedMetrics.java index f66bdc47c6..4e3540bf55 100644 --- a/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/monitoring/AggregatedMetrics.java +++ b/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/monitoring/AggregatedMetrics.java @@ -103,8 +103,9 @@ public void cleanupDoneFor(ResourceID resourceID, Map metadata) } @Override - public void finishedReconciliation(HasMetadata resource, Map metadata) { - metricsList.forEach(metrics -> metrics.finishedReconciliation(resource, metadata)); + public void successfullyFinishedReconciliation( + HasMetadata resource, Map metadata) { + metricsList.forEach(metrics -> metrics.successfullyFinishedReconciliation(resource, metadata)); } @Override diff --git a/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/monitoring/Metrics.java b/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/monitoring/Metrics.java index 10b2db6774..cda6fd167b 100644 --- a/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/monitoring/Metrics.java +++ b/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/monitoring/Metrics.java @@ -93,7 +93,8 @@ default void cleanupDoneFor(ResourceID resourceID, Map metadata) * @param resource the {@link ResourceID} associated with the resource being processed * @param metadata metadata associated with the resource being processed */ - default void finishedReconciliation(HasMetadata resource, Map metadata) {} + default void successfullyFinishedReconciliation( + HasMetadata resource, Map metadata) {} /** * Encapsulates the information about a controller 
execution i.e. a call to either {@link diff --git a/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/processing/event/EventProcessor.java b/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/processing/event/EventProcessor.java index b476c39614..4ff482f03e 100644 --- a/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/processing/event/EventProcessor.java +++ b/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/processing/event/EventProcessor.java @@ -292,7 +292,7 @@ synchronized void eventProcessingFinished( return; } cleanupOnSuccessfulExecution(executionScope); - metrics.finishedReconciliation(executionScope.getResource(), metricsMetadata); + metrics.successfullyFinishedReconciliation(executionScope.getResource(), metricsMetadata); if ((triggerOnAllEvents() && executionScope.isDeleteEvent()) || (!triggerOnAllEvents() && state.deleteEventPresent())) { cleanupForDeletedEvent(executionScope.getResourceID()); diff --git a/operator-framework-core/src/test/java/io/javaoperatorsdk/operator/api/monitoring/AggregatedMetricsTest.java b/operator-framework-core/src/test/java/io/javaoperatorsdk/operator/api/monitoring/AggregatedMetricsTest.java index 68142048b6..36a3ca0877 100644 --- a/operator-framework-core/src/test/java/io/javaoperatorsdk/operator/api/monitoring/AggregatedMetricsTest.java +++ b/operator-framework-core/src/test/java/io/javaoperatorsdk/operator/api/monitoring/AggregatedMetricsTest.java @@ -141,13 +141,13 @@ void cleanupDoneFor_shouldDelegateToAllMetricsInOrder() { } @Test - void finishedReconciliation_shouldDelegateToAllMetricsInOrder() { - aggregatedMetrics.finishedReconciliation(resource, metadata); + void successfullyFinishedReconciliation_shouldDelegateToAllMetricsInOrder() { + aggregatedMetrics.successfullyFinishedReconciliation(resource, metadata); final var inOrder = inOrder(metrics1, metrics2, metrics3); - inOrder.verify(metrics1).finishedReconciliation(resource, metadata); - 
inOrder.verify(metrics2).finishedReconciliation(resource, metadata); - inOrder.verify(metrics3).finishedReconciliation(resource, metadata); + inOrder.verify(metrics1).successfullyFinishedReconciliation(resource, metadata); + inOrder.verify(metrics2).successfullyFinishedReconciliation(resource, metadata); + inOrder.verify(metrics3).successfullyFinishedReconciliation(resource, metadata); verifyNoMoreInteractions(metrics1, metrics2, metrics3); }