diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 7a5964ba35..34bc6c7d0f 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -11,6 +11,7 @@ on: paths-ignore: - 'docs/**' - 'adr/**' + - 'observability/**' workflow_dispatch: jobs: check_format_and_unit_tests: diff --git a/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetrics.java b/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetrics.java index 7beabb7a6e..94391bec82 100644 --- a/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetrics.java +++ b/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetrics.java @@ -22,6 +22,8 @@ import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; +import org.jspecify.annotations.NonNull; + import io.fabric8.kubernetes.api.model.HasMetadata; import io.javaoperatorsdk.operator.OperatorException; import io.javaoperatorsdk.operator.api.monitoring.Metrics; @@ -37,8 +39,6 @@ import io.micrometer.core.instrument.Tag; import io.micrometer.core.instrument.Timer; -import static io.javaoperatorsdk.operator.api.reconciler.Constants.CONTROLLER_NAME; - public class MicrometerMetrics implements Metrics { private static final String PREFIX = "operator.sdk."; @@ -48,8 +48,8 @@ public class MicrometerMetrics implements Metrics { private static final String RECONCILIATIONS_RETRIES_LAST = RECONCILIATIONS + "retries.last"; private static final String RECONCILIATIONS_RETRIES_NUMBER = RECONCILIATIONS + "retries.number"; private static final String RECONCILIATIONS_STARTED = RECONCILIATIONS + "started"; - private static final String RECONCILIATIONS_EXECUTIONS = PREFIX + RECONCILIATIONS + "executions."; - private static final String RECONCILIATIONS_QUEUE_SIZE = PREFIX + RECONCILIATIONS + "queue.size."; + private static final String RECONCILIATIONS_EXECUTIONS 
= PREFIX + RECONCILIATIONS + "executions"; + private static final String RECONCILIATIONS_QUEUE_SIZE = PREFIX + RECONCILIATIONS + "queue.size"; private static final String NAME = "name"; private static final String NAMESPACE = "namespace"; private static final String GROUP = "group"; @@ -59,6 +59,7 @@ public class MicrometerMetrics implements Metrics { private static final String METADATA_PREFIX = "resource."; private static final String CONTROLLERS_EXECUTION = "controllers.execution."; private static final String CONTROLLER = "controller"; + private static final String CONTROLLER_NAME = CONTROLLER + ".name"; private static final String SUCCESS_SUFFIX = ".success"; private static final String FAILURE_SUFFIX = ".failure"; private static final String TYPE = "type"; @@ -130,18 +131,27 @@ private MicrometerMetrics( public void controllerRegistered(Controller controller) { final var configuration = controller.getConfiguration(); final var name = configuration.getName(); - final var executingThreadsName = RECONCILIATIONS_EXECUTIONS + name; + final var executingThreadsRefName = reconciliationExecutionGaugeRefName(name); final var resourceClass = configuration.getResourceClass(); - final var tags = new ArrayList(3); + final var tags = new ArrayList(); + tags.add(Tag.of(CONTROLLER_NAME, name)); addGVKTags(GroupVersionKind.gvkFor(resourceClass), tags, false); AtomicInteger executingThreads = - registry.gauge(executingThreadsName, tags, new AtomicInteger(0)); - gauges.put(executingThreadsName, executingThreads); + registry.gauge(RECONCILIATIONS_EXECUTIONS, tags, new AtomicInteger(0)); + gauges.put(executingThreadsRefName, executingThreads); - final var controllerQueueName = RECONCILIATIONS_QUEUE_SIZE + name; + final var controllerQueueRefName = controllerQueueSizeGaugeRefName(name); AtomicInteger controllerQueueSize = - registry.gauge(controllerQueueName, tags, new AtomicInteger(0)); - gauges.put(controllerQueueName, controllerQueueSize); + 
registry.gauge(RECONCILIATIONS_QUEUE_SIZE, tags, new AtomicInteger(0)); + gauges.put(controllerQueueRefName, controllerQueueSize); + } + + private static @NonNull String reconciliationExecutionGaugeRefName(String controllerName) { + return RECONCILIATIONS_EXECUTIONS + "." + controllerName; + } + + private static @NonNull String controllerQueueSizeGaugeRefName(String controllerName) { + return RECONCILIATIONS_QUEUE_SIZE + "." + controllerName; } @Override @@ -211,42 +221,50 @@ public void cleanupDoneFor(ResourceID resourceID, Map metadata) public void reconcileCustomResource( HasMetadata resource, RetryInfo retryInfoNullable, Map metadata) { Optional retryInfo = Optional.ofNullable(retryInfoNullable); - incrementCounter( - ResourceID.fromResource(resource), - RECONCILIATIONS_STARTED, - metadata, - Tag.of( - RECONCILIATIONS_RETRIES_NUMBER, - String.valueOf(retryInfo.map(RetryInfo::getAttemptCount).orElse(0))), - Tag.of( - RECONCILIATIONS_RETRIES_LAST, - String.valueOf(retryInfo.map(RetryInfo::isLastAttempt).orElse(true)))); + ResourceID resourceID = ResourceID.fromResource(resource); + + // Record the counter without retry tags + incrementCounter(resourceID, RECONCILIATIONS_STARTED, metadata); + + // Update retry number gauge + int retryNumber = retryInfo.map(RetryInfo::getAttemptCount).orElse(0); + updateGauge(resourceID, metadata, RECONCILIATIONS_RETRIES_NUMBER, retryNumber); + + // Update retry last attempt gauge (1 for true, 0 for false) + int isLastAttempt = retryInfo.map(RetryInfo::isLastAttempt).orElse(true) ? 
1 : 0; + updateGauge(resourceID, metadata, RECONCILIATIONS_RETRIES_LAST, isLastAttempt); var controllerQueueSize = - gauges.get(RECONCILIATIONS_QUEUE_SIZE + metadata.get(CONTROLLER_NAME)); + gauges.get(controllerQueueSizeGaugeRefName(metadata.get(CONTROLLER_NAME).toString())); controllerQueueSize.incrementAndGet(); } @Override - public void finishedReconciliation(HasMetadata resource, Map metadata) { - incrementCounter(ResourceID.fromResource(resource), RECONCILIATIONS_SUCCESS, metadata); + public void successfullyFinishedReconciliation( + HasMetadata resource, Map metadata) { + ResourceID resourceID = ResourceID.fromResource(resource); + incrementCounter(resourceID, RECONCILIATIONS_SUCCESS, metadata); + + // Reset retry gauges on successful reconciliation + updateGauge(resourceID, metadata, RECONCILIATIONS_RETRIES_NUMBER, 0); + updateGauge(resourceID, metadata, RECONCILIATIONS_RETRIES_LAST, 0); } @Override public void reconciliationExecutionStarted(HasMetadata resource, Map metadata) { var reconcilerExecutions = - gauges.get(RECONCILIATIONS_EXECUTIONS + metadata.get(CONTROLLER_NAME)); + gauges.get(reconciliationExecutionGaugeRefName(metadata.get(CONTROLLER_NAME).toString())); reconcilerExecutions.incrementAndGet(); } @Override public void reconciliationExecutionFinished(HasMetadata resource, Map metadata) { var reconcilerExecutions = - gauges.get(RECONCILIATIONS_EXECUTIONS + metadata.get(CONTROLLER_NAME)); + gauges.get(reconciliationExecutionGaugeRefName(metadata.get(CONTROLLER_NAME).toString())); reconcilerExecutions.decrementAndGet(); var controllerQueueSize = - gauges.get(RECONCILIATIONS_QUEUE_SIZE + metadata.get(CONTROLLER_NAME)); + gauges.get(controllerQueueSizeGaugeRefName(metadata.get(CONTROLLER_NAME).toString())); controllerQueueSize.decrementAndGet(); } @@ -325,6 +343,32 @@ private void incrementCounter( counter.increment(); } + private void updateGauge( + ResourceID id, Map metadata, String gaugeName, int value) { + final var tags = new ArrayList(6); + 
addMetadataTags(id, metadata, tags, false); + + final var gaugeRefName = buildGaugeRefName(id, gaugeName); + AtomicInteger gauge = + gauges.computeIfAbsent( + gaugeRefName, + key -> { + AtomicInteger newGauge = + registry.gauge(PREFIX + gaugeName, tags, new AtomicInteger(0)); + // Find the meter in the registry and record it for cleanup + var meter = registry.find(PREFIX + gaugeName).tags(tags).gauge(); + if (meter != null) { + cleaner.recordAssociation(id, meter); + } + return newGauge; + }); + gauge.set(value); + } + + private String buildGaugeRefName(ResourceID id, String gaugeName) { + return gaugeName + "." + id.getName() + "." + id.getNamespace().orElse(CLUSTER); + } + protected Set recordedMeterIdsFor(ResourceID resourceID) { return cleaner.recordedMeterIdsFor(resourceID); } diff --git a/observability/README.md b/observability/README.md new file mode 100644 index 0000000000..58caae27d0 --- /dev/null +++ b/observability/README.md @@ -0,0 +1,252 @@ +# Observability Stack for Java Operator SDK + +This directory contains the setup scripts and Grafana dashboards for monitoring Java Operator SDK applications. 
+ +## Installation + +Run the installation script to deploy the full observability stack (OpenTelemetry Collector, Prometheus, and Grafana): + +```bash +./install-observability.sh +``` + +This will install: +- **cert-manager** - Required for OpenTelemetry Operator +- **OpenTelemetry Operator** - Manages OpenTelemetry Collector instances +- **OpenTelemetry Collector** - Receives OTLP metrics and exports to Prometheus +- **Prometheus** - Metrics storage and querying +- **Grafana** - Metrics visualization + +## Accessing Services + +### Grafana +```bash +kubectl port-forward -n observability svc/kube-prometheus-stack-grafana 3000:80 +``` +Then open http://localhost:3000 +- Username: `admin` +- Password: `admin` + +### Prometheus +```bash +kubectl port-forward -n observability svc/kube-prometheus-stack-prometheus 9090:9090 +``` +Then open http://localhost:9090 + +## Grafana Dashboards + +Two pre-configured dashboards are **automatically imported** during installation: + +### 1. JVM Metrics Dashboard (`jvm-metrics-dashboard.json`) + +Monitors Java Virtual Machine health and performance: + +**Panels:** +- **JVM Memory Used** - Heap and non-heap memory consumption by memory pool +- **JVM Threads** - Live, daemon, and peak thread counts +- **GC Pause Time Rate** - Garbage collection pause duration +- **GC Pause Count Rate** - Frequency of garbage collection events +- **CPU Usage** - System CPU utilization percentage +- **Classes Loaded** - Number of classes currently loaded +- **Process Uptime** - Application uptime in seconds +- **CPU Count** - Available processor cores +- **GC Memory Allocation Rate** - Memory allocation and promotion rates +- **Heap Memory Max vs Committed** - Heap memory limits and commitments + +**Key Metrics:** +- `jvm.memory.used`, `jvm.memory.max`, `jvm.memory.committed` +- `jvm.gc.pause`, `jvm.gc.memory.allocated`, `jvm.gc.memory.promoted` +- `jvm.threads.live`, `jvm.threads.daemon`, `jvm.threads.peak` +- `jvm.classes.loaded`, 
`jvm.classes.unloaded` +- `system.cpu.usage`, `system.cpu.count` +- `process.uptime` + +**Filtering:** +All panels filter by `service_name="josdk"` to show metrics only from your operator. + +### 2. Java Operator SDK Metrics Dashboard (`josdk-operator-metrics-dashboard.json`) + +Monitors Kubernetes operator performance and health: + +**Panels:** +- **Reconciliation Rate (Started)** - Rate of reconciliation loops triggered +- **Reconciliation Success vs Failure Rate** - Success/failure ratio over time +- **Currently Executing Reconciliations** - Active reconciliation threads +- **Reconciliation Queue Size** - Pending reconciliation work +- **Total Reconciliations** - Cumulative count of reconciliations +- **Error Rate** - Overall error rate across all reconciliations +- **Reconciliation Execution Time** - P50, P95, P99 latency percentiles +- **Event Reception Rate** - Kubernetes event processing rate +- **Failures by Exception Type** - Breakdown of errors by exception class +- **Controller Execution Success vs Failure** - Controller-level success metrics +- **Delete Event Rate** - Resource deletion event frequency +- **Reconciliation Retry Rate** - Retry attempts and patterns + +**Key Metrics:** +- `operator.sdk.reconciliations.started`, `.success`, `.failed` +- `operator.sdk.reconciliations.executions` - Current execution count +- `operator.sdk.reconciliations.queue.size` - Queue depth +- `operator.sdk.controllers.execution.reconcile` - Execution timing histograms +- `operator.sdk.events.received`, `.delete` - Event reception +- Retry metrics and failure breakdowns + +**Filtering:** +All panels filter by `service_name="josdk"` to show metrics only from your operator. + +## Importing Dashboards into Grafana + +### Automatic Import (Default) + +The dashboards are **automatically imported** when you run `./install-observability.sh`. They will appear in Grafana within 30-60 seconds after installation. No manual steps required! 
+ +To verify the dashboards were imported: +1. Access Grafana at http://localhost:3000 +2. Navigate to **Dashboards** → **Browse** +3. Look for "JOSDK - JVM Metrics" and "JOSDK - Operator Metrics" + +### Manual Import Methods + +If you need to re-import or update the dashboards manually: + +#### Method 1: Via Grafana UI + +1. Access Grafana at http://localhost:3000 +2. Log in with admin/admin +3. Navigate to **Dashboards** → **Import** +4. Click **Upload JSON file** +5. Select `jvm-metrics-dashboard.json` or `josdk-operator-metrics-dashboard.json` +6. Select **Prometheus** as the data source +7. Click **Import** + +#### Method 2: Via kubectl ConfigMap + +```bash +# Re-import JVM dashboard +kubectl create configmap jvm-metrics-dashboard \ + --from-file=jvm-metrics-dashboard.json \ + -n observability \ + -o yaml --dry-run=client | \ + kubectl label --dry-run=client --local -f - grafana_dashboard=1 -o yaml | \ + kubectl apply -f - + +# Re-import Operator dashboard +kubectl create configmap josdk-operator-metrics-dashboard \ + --from-file=josdk-operator-metrics-dashboard.json \ + -n observability \ + -o yaml --dry-run=client | \ + kubectl label --dry-run=client --local -f - grafana_dashboard=1 -o yaml | \ + kubectl apply -f - +``` + +The dashboards will be automatically discovered and loaded by Grafana within 30-60 seconds. + +## Configuring Your Operator + +To enable metrics export from your JOSDK operator, ensure your application: + +1. **Has the required dependency** (already included in webpage sample): + ```xml + <dependency> + <groupId>io.micrometer</groupId> + <artifactId>micrometer-registry-otlp</artifactId> + </dependency> + ``` + +2. **Configures OTLP export** via `otlp-config.yaml`: + ```yaml + otlp: + url: "http://otel-collector-collector.observability.svc.cluster.local:4318/v1/metrics" + step: 15s + batchSize: 15000 + aggregationTemporality: "cumulative" + ``` + +3. 
**Registers JVM and JOSDK metrics** (see `WebPageOperator.java` for a reference implementation) + +## OTLP Endpoints + +The OpenTelemetry Collector provides the following endpoints: + +- **OTLP gRPC**: `otel-collector-collector.observability.svc.cluster.local:4317` +- **OTLP HTTP**: `otel-collector-collector.observability.svc.cluster.local:4318` +- **Prometheus Scrape**: `http://otel-collector-prometheus.observability.svc.cluster.local:8889/metrics` + +## Troubleshooting + +### Check OpenTelemetry Collector Logs +```bash +kubectl logs -n observability -l app.kubernetes.io/name=otel-collector -f +``` + +### Check Prometheus Targets +```bash +kubectl port-forward -n observability svc/kube-prometheus-stack-prometheus 9090:9090 +``` +Open http://localhost:9090/targets and verify the OTLP collector target is UP. + +### Verify Metrics in Prometheus +Open the Prometheus UI and search for metrics: +- JVM metrics: `jvm_*` +- Operator metrics: `operator_sdk_*` + +### Check Grafana Data Source +1. Navigate to **Configuration** → **Data Sources** +2. Verify the Prometheus data source is configured and working +3. Click **Test** to verify connectivity + +## Uninstalling + +To remove the observability stack: + +```bash +kubectl delete configmap -n observability jvm-metrics-dashboard josdk-operator-metrics-dashboard +kubectl delete -n observability OpenTelemetryCollector otel-collector +helm uninstall -n observability kube-prometheus-stack +helm uninstall -n observability opentelemetry-operator +helm uninstall -n cert-manager cert-manager +kubectl delete namespace observability cert-manager +``` + +## Customizing Dashboards + +The dashboard JSON files can be modified to: +- Add new panels for custom metrics +- Adjust time ranges and refresh intervals +- Change visualization types +- Add templating variables for filtering +- Modify alert thresholds + +After making changes, re-import the dashboard using one of the methods above. 
+ +## Example Queries + +### JVM Metrics +```promql +# Heap memory usage percentage +(jvm_memory_used_bytes{area="heap"} / jvm_memory_max_bytes{area="heap"}) * 100 + +# GC throughput (percentage of time NOT in GC) +100 - (rate(jvm_gc_pause_seconds_sum[5m]) * 100) + +# Thread count trend +jvm_threads_live_threads +``` + +### Operator Metrics +```promql +# Reconciliation success rate +rate(operator_sdk_reconciliations_success_total[5m]) / rate(operator_sdk_reconciliations_started_total[5m]) + +# Average reconciliation time +rate(operator_sdk_controllers_execution_reconcile_seconds_sum[5m]) / rate(operator_sdk_controllers_execution_reconcile_seconds_count[5m]) + +# Queue saturation +operator_sdk_reconciliations_queue_size / on() group_left() max(operator_sdk_reconciliations_queue_size) +``` + +## References + +- [Java Operator SDK Documentation](https://javaoperatorsdk.io) +- [Micrometer OTLP Documentation](https://micrometer.io/docs/registry/otlp) +- [OpenTelemetry Collector](https://opentelemetry.io/docs/collector/) +- [Grafana Dashboards](https://grafana.com/docs/grafana/latest/dashboards/) diff --git a/observability/install-observability.sh b/observability/install-observability.sh new file mode 100755 index 0000000000..dc7430520b --- /dev/null +++ b/observability/install-observability.sh @@ -0,0 +1,308 @@ +#!/bin/bash +# +# Copyright Java Operator SDK Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +set -e + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +echo -e "${GREEN}========================================${NC}" +echo -e "${GREEN}Installing Observability Stack${NC}" +echo -e "${GREEN}OpenTelemetry + Prometheus + Grafana${NC}" +echo -e "${GREEN}========================================${NC}" + +# Check if helm is installed +echo -e "\n${YELLOW}Checking helm installation...${NC}" +if ! command -v helm &> /dev/null; then + echo -e "${RED}Error: helm is not installed${NC}" + echo "Please install helm: https://helm.sh/docs/intro/install/" + exit 1 +fi +echo -e "${GREEN}✓ helm is installed${NC}" + +# Add Helm repositories +echo -e "\n${YELLOW}Adding Helm repositories...${NC}" +helm repo add jetstack https://charts.jetstack.io +helm repo add open-telemetry https://open-telemetry.github.io/opentelemetry-helm-charts +helm repo add prometheus-community https://prometheus-community.github.io/helm-charts +helm repo update +echo -e "${GREEN}✓ Helm repositories added${NC}" + +echo -e "\n${GREEN}========================================${NC}" +echo -e "${GREEN}Installing Components (Parallel)${NC}" +echo -e "${GREEN}========================================${NC}" +echo -e "The following will be installed:" +echo -e " • cert-manager" +echo -e " • OpenTelemetry Operator" +echo -e " • Prometheus & Grafana" +echo -e " • OpenTelemetry Collector" +echo -e " • Service Monitors" +echo -e "\n${YELLOW}All resources will be applied first, then we'll wait for them to become ready.${NC}\n" + +# Install cert-manager (required for OpenTelemetry Operator) +echo -e "\n${YELLOW}Installing cert-manager...${NC}" +if kubectl get namespace cert-manager > /dev/null 2>&1; then + echo -e "${YELLOW}cert-manager namespace already exists, skipping...${NC}" +else + kubectl create namespace cert-manager + helm install cert-manager jetstack/cert-manager \ + --namespace cert-manager \ + --set crds.enabled=true + echo -e "${GREEN}✓ 
cert-manager installation started${NC}" +fi + +# Create observability namespace +echo -e "\n${YELLOW}Creating observability namespace...${NC}" +kubectl create namespace observability --dry-run=client -o yaml | kubectl apply -f - +echo -e "${GREEN}✓ observability namespace ready${NC}" + +# Install OpenTelemetry Operator +echo -e "\n${YELLOW}Installing OpenTelemetry Operator...${NC}" + +if helm list -n observability | grep -q opentelemetry-operator; then + echo -e "${YELLOW}OpenTelemetry Operator already installed, upgrading...${NC}" + helm upgrade opentelemetry-operator open-telemetry/opentelemetry-operator \ + --namespace observability \ + --set "manager.collectorImage.repository=otel/opentelemetry-collector-contrib" +else + helm install opentelemetry-operator open-telemetry/opentelemetry-operator \ + --namespace observability \ + --set "manager.collectorImage.repository=otel/opentelemetry-collector-contrib" +fi +echo -e "${GREEN}✓ OpenTelemetry Operator installation started${NC}" + +# Install kube-prometheus-stack (includes Prometheus + Grafana) +echo -e "\n${YELLOW}Installing Prometheus and Grafana stack...${NC}" +if helm list -n observability | grep -q kube-prometheus-stack; then + echo -e "${YELLOW}kube-prometheus-stack already installed, upgrading...${NC}" + helm upgrade kube-prometheus-stack prometheus-community/kube-prometheus-stack \ + --namespace observability \ + --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false \ + --set prometheus.prometheusSpec.podMonitorSelectorNilUsesHelmValues=false \ + --set grafana.adminPassword=admin +else + helm install kube-prometheus-stack prometheus-community/kube-prometheus-stack \ + --namespace observability \ + --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false \ + --set prometheus.prometheusSpec.podMonitorSelectorNilUsesHelmValues=false \ + --set grafana.adminPassword=admin +fi +echo -e "${GREEN}✓ Prometheus and Grafana installation started${NC}" + +# Create 
OpenTelemetry Collector instance +echo -e "\n${YELLOW}Creating OpenTelemetry Collector...${NC}" +cat </dev/null || echo -e "${YELLOW}cert-manager already running or skipped${NC}" + +# Wait for observability pods +echo -e "${YELLOW}Checking observability pods...${NC}" +kubectl wait --for=condition=ready pod --all -n observability --timeout=300s + +echo -e "${GREEN}✓ All pods are ready${NC}" + +# Import Grafana dashboards +echo -e "\n${YELLOW}Importing Grafana dashboards...${NC}" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +if [ -f "$SCRIPT_DIR/jvm-metrics-dashboard.json" ]; then + kubectl create configmap jvm-metrics-dashboard \ + --from-file="$SCRIPT_DIR/jvm-metrics-dashboard.json" \ + -n observability \ + --dry-run=client -o yaml | \ + kubectl label --dry-run=client --local -f - grafana_dashboard=1 -o yaml | \ + kubectl apply -f - + echo -e "${GREEN}✓ JVM Metrics dashboard imported${NC}" +else + echo -e "${YELLOW}⚠ JVM Metrics dashboard not found at $SCRIPT_DIR/jvm-metrics-dashboard.json${NC}" +fi + +if [ -f "$SCRIPT_DIR/josdk-operator-metrics-dashboard.json" ]; then + kubectl create configmap josdk-operator-metrics-dashboard \ + --from-file="$SCRIPT_DIR/josdk-operator-metrics-dashboard.json" \ + -n observability \ + --dry-run=client -o yaml | \ + kubectl label --dry-run=client --local -f - grafana_dashboard=1 -o yaml | \ + kubectl apply -f - + echo -e "${GREEN}✓ JOSDK Operator Metrics dashboard imported${NC}" +else + echo -e "${YELLOW}⚠ JOSDK Operator Metrics dashboard not found at $SCRIPT_DIR/josdk-operator-metrics-dashboard.json${NC}" +fi + +echo -e "${GREEN}✓ Dashboards will be available in Grafana shortly${NC}" + +# Get pod statuses +echo -e "\n${GREEN}========================================${NC}" +echo -e "${GREEN}Installation Complete!${NC}" +echo -e "${GREEN}========================================${NC}" + +echo -e "\n${YELLOW}Pod Status:${NC}" +kubectl get pods -n observability + +echo -e 
"\n${GREEN}========================================${NC}" +echo -e "${GREEN}Access Information${NC}" +echo -e "${GREEN}========================================${NC}" + +echo -e "\n${YELLOW}Grafana:${NC}" +echo -e " Username: ${GREEN}admin${NC}" +echo -e " Password: ${GREEN}admin${NC}" +echo -e " Access with: ${GREEN}kubectl port-forward -n observability svc/kube-prometheus-stack-grafana 3000:80${NC}" +echo -e " Then open: ${GREEN}http://localhost:3000${NC}" + +echo -e "\n${YELLOW}Prometheus:${NC}" +echo -e " Access with: ${GREEN}kubectl port-forward -n observability svc/kube-prometheus-stack-prometheus 9090:9090${NC}" +echo -e " Then open: ${GREEN}http://localhost:9090${NC}" + +echo -e "\n${YELLOW}OpenTelemetry Collector:${NC}" +echo -e " OTLP gRPC endpoint: ${GREEN}otel-collector-collector.observability.svc.cluster.local:4317${NC}" +echo -e " OTLP HTTP endpoint: ${GREEN}otel-collector-collector.observability.svc.cluster.local:4318${NC}" +echo -e " Prometheus metrics: ${GREEN}http://otel-collector-prometheus.observability.svc.cluster.local:8889/metrics${NC}" + +echo -e "\n${YELLOW}Configure your Java Operator to use OpenTelemetry:${NC}" +echo -e " Add dependency: ${GREEN}io.javaoperatorsdk:operator-framework-opentelemetry-support${NC}" +echo -e " Set environment variables:" +echo -e " ${GREEN}OTEL_SERVICE_NAME=your-operator-name${NC}" +echo -e " ${GREEN}OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector-collector.observability.svc.cluster.local:4318${NC}" +echo -e " ${GREEN}OTEL_METRICS_EXPORTER=otlp${NC}" +echo -e " ${GREEN}OTEL_TRACES_EXPORTER=otlp${NC}" + +echo -e "\n${GREEN}========================================${NC}" +echo -e "${GREEN}Grafana Dashboards${NC}" +echo -e "${GREEN}========================================${NC}" +echo -e "\nAutomatically imported dashboards:" +echo -e " - ${GREEN}JOSDK - JVM Metrics${NC} - Java Virtual Machine health and performance" +echo -e " - ${GREEN}JOSDK - Operator Metrics${NC} - Kubernetes operator performance and 
reconciliation" +echo -e "\nPre-installed Kubernetes dashboards:" +echo -e " - Kubernetes / Compute Resources / Cluster" +echo -e " - Kubernetes / Compute Resources / Namespace (Pods)" +echo -e " - Node Exporter / Nodes" +echo -e "\n${YELLOW}Note:${NC} Dashboards may take 30-60 seconds to appear in Grafana after installation." + +echo -e "\n${YELLOW}To uninstall:${NC}" +echo -e " kubectl delete configmap -n observability jvm-metrics-dashboard josdk-operator-metrics-dashboard" +echo -e " kubectl delete -n observability OpenTelemetryCollector otel-collector" +echo -e " helm uninstall -n observability kube-prometheus-stack" +echo -e " helm uninstall -n observability opentelemetry-operator" +echo -e " helm uninstall -n cert-manager cert-manager" +echo -e " kubectl delete namespace observability cert-manager" + +echo -e "\n${GREEN}Done!${NC}" diff --git a/observability/josdk-operator-metrics-dashboard.json b/observability/josdk-operator-metrics-dashboard.json new file mode 100644 index 0000000000..6b53d26611 --- /dev/null +++ b/observability/josdk-operator-metrics-dashboard.json @@ -0,0 +1,1109 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Rate of reconciliations started per second", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": 
false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": ["last", "mean"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(operator_sdk_reconciliations_started_total{service_name=\"josdk\"}[5m])) by (kind, version)", + "legendFormat": "{{kind}} ({{version}})", + "range": true, + "refId": "A" + } + ], + "title": "Reconciliation Rate (Started)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Success vs Failure rate of reconciliations", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": 
"ops" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Success" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Failure" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "calcs": ["last", "mean"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(operator_sdk_reconciliations_success_total{service_name=\"josdk\"}[5m]))", + "legendFormat": "Success", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(operator_sdk_reconciliations_failed_total{service_name=\"josdk\"}[5m]))", + "legendFormat": "Failure", + "range": true, + "refId": "B" + } + ], + "title": "Reconciliation Success vs Failure Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Current number of reconciliations being executed", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 5 + }, + { + "color": "red", + "value": 10 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 8 + }, + "id": 3, + "options": { + "orientation": "auto", + "reduceOptions": { + "values": false, + "calcs": ["lastNotNull"], + "fields": "" + }, + "showThresholdLabels": false, + "showThresholdMarkers": 
true + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(operator_sdk_reconciliations_executions{service_name=\"josdk\"})", + "legendFormat": "Executing", + "range": true, + "refId": "A" + } + ], + "title": "Currently Executing Reconciliations", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Current reconciliation queue size", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 10 + }, + { + "color": "red", + "value": 50 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 8 + }, + "id": 4, + "options": { + "orientation": "auto", + "reduceOptions": { + "values": false, + "calcs": ["lastNotNull"], + "fields": "" + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(operator_sdk_reconciliations_queue_size{service_name=\"josdk\"})", + "legendFormat": "Queue Size", + "range": true, + "refId": "A" + } + ], + "title": "Reconciliation Queue Size", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Total reconciliations started", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 8 + }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + 
"orientation": "auto", + "reduceOptions": { + "values": false, + "calcs": ["lastNotNull"], + "fields": "" + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(operator_sdk_reconciliations_started_total{service_name=\"josdk\"})", + "legendFormat": "Total", + "range": true, + "refId": "A" + } + ], + "title": "Total Reconciliations", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Error rate by exception type", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 8 + }, + "id": 6, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "values": false, + "calcs": ["lastNotNull"], + "fields": "" + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(operator_sdk_reconciliations_failed_total{service_name=\"josdk\"}[5m]))", + "legendFormat": "Error Rate", + "range": true, + "refId": "A" + } + ], + "title": "Error Rate", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Controller execution time percentiles", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": 
false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 7, + "options": { + "legend": { + "calcs": ["last", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.50, sum(rate(operator_sdk_controllers_execution_reconcile_seconds_bucket{service_name=\"josdk\"}[5m])) by (le, controller))", + "legendFormat": "p50 - {{controller}}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum(rate(operator_sdk_controllers_execution_reconcile_seconds_bucket{service_name=\"josdk\"}[5m])) by (le, controller))", + "legendFormat": "p95 - {{controller}}", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum(rate(operator_sdk_controllers_execution_reconcile_seconds_bucket{service_name=\"josdk\"}[5m])) by (le, controller))", + "legendFormat": "p99 - {{controller}}", + "range": true, + "refId": "C" + } + ], + "title": "Reconciliation Execution Time (Percentiles)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Rate of events received by the operator", + 
"fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 8, + "options": { + "legend": { + "calcs": ["last", "mean"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(operator_sdk_events_received_total{service_name=\"josdk\"}[5m])) by (event, action)", + "legendFormat": "{{event}} - {{action}}", + "range": true, + "refId": "A" + } + ], + "title": "Event Reception Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Failures by exception type", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + 
"scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "id": 9, + "options": { + "legend": { + "calcs": ["last", "sum"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(operator_sdk_reconciliations_failed_total{service_name=\"josdk\"}[5m])) by (exception)", + "legendFormat": "{{exception}}", + "range": true, + "refId": "A" + } + ], + "title": "Failures by Exception Type", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Controller execution success vs failure", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "id": 
10, + "options": { + "legend": { + "calcs": ["last", "mean"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(operator_sdk_controllers_execution_reconcile_success_total{service_name=\"josdk\"}[5m])) by (type)", + "legendFormat": "Success - {{type}}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(operator_sdk_controllers_execution_reconcile_failure_total{service_name=\"josdk\"}[5m])) by (exception)", + "legendFormat": "Failure - {{exception}}", + "range": true, + "refId": "B" + } + ], + "title": "Controller Execution Success vs Failure", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Rate of delete events received", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 32 + }, + "id": 11, + "options": { + "legend": { + "calcs": ["last", "sum"], + "displayMode": "table", + "placement": 
"bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(operator_sdk_events_delete_total{service_name=\"josdk\"}[5m])) by (kind, version)", + "legendFormat": "{{kind}} ({{version}})", + "range": true, + "refId": "A" + } + ], + "title": "Delete Event Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Reconciliation retry information", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 32 + }, + "id": 12, + "options": { + "legend": { + "calcs": ["last", "mean"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(operator_sdk_reconciliations_started_total{service_name=\"josdk\", operator_sdk_reconciliations_retries_last=\"true\"}[5m]))", + "legendFormat": "Last Retry Attempts", + "range": true, + "refId": "A" + }, + { + 
"datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(operator_sdk_reconciliations_started_total{service_name=\"josdk\", operator_sdk_reconciliations_retries_last=\"false\"}[5m]))", + "legendFormat": "Retries (Not Last)", + "range": true, + "refId": "B" + } + ], + "title": "Reconciliation Retry Rate", + "type": "timeseries" + } + ], + "refresh": "10s", + "schemaVersion": 38, + "style": "dark", + "tags": ["operator", "kubernetes", "josdk"], + "templating": { + "list": [] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "JOSDK - Operator Metrics", + "uid": "josdk-operator-metrics", + "version": 0, + "weekStart": "" +} diff --git a/observability/jvm-metrics-dashboard.json b/observability/jvm-metrics-dashboard.json new file mode 100644 index 0000000000..528f29674e --- /dev/null +++ b/observability/jvm-metrics-dashboard.json @@ -0,0 +1,857 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + 
"mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": ["last", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "jvm_memory_used_bytes{service_name=\"josdk\"}", + "legendFormat": "{{area}} - {{id}}", + "range": true, + "refId": "A" + } + ], + "title": "JVM Memory Used", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "calcs": ["last", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + 
"targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "jvm_threads_live{service_name=\"josdk\"}", + "legendFormat": "Live Threads", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "jvm_threads_daemon_threads{service_name=\"josdk\"}", + "legendFormat": "Daemon Threads", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "jvm_threads_peak_threads{service_name=\"josdk\"}", + "legendFormat": "Peak Threads", + "range": true, + "refId": "C" + } + ], + "title": "JVM Threads", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 3, + "options": { + "legend": { + "calcs": ["last", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": 
"code", + "expr": "rate(jvm_gc_pause_milliseconds_sum{service_name=\"josdk\"}[5m])", + "legendFormat": "{{action}} - {{cause}}", + "range": true, + "refId": "A" + } + ], + "title": "GC Pause Time Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 4, + "options": { + "legend": { + "calcs": ["last"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "rate(jvm_gc_pause_milliseconds_count{service_name=\"josdk\"}[5m])", + "legendFormat": "{{action}} - {{cause}}", + "range": true, + "refId": "A" + } + ], + "title": "GC Pause Count Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "percentunit" 
+ }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 16 + }, + "id": 5, + "options": { + "orientation": "auto", + "reduceOptions": { + "values": false, + "calcs": ["lastNotNull"], + "fields": "" + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "system_cpu_usage{service_name=\"josdk\"}", + "legendFormat": "CPU Usage", + "range": true, + "refId": "A" + } + ], + "title": "CPU Usage", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 16 + }, + "id": 6, + "options": { + "orientation": "auto", + "reduceOptions": { + "values": false, + "calcs": ["lastNotNull"], + "fields": "" + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "jvm_classes_loaded{service_name=\"josdk\"}", + "legendFormat": "Classes Loaded", + "range": true, + "refId": "A" + } + ], + "title": "Classes Loaded", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 16 + }, + "id": 7, + "options": { + "orientation": "auto", + "reduceOptions": { + "values": false, + "calcs": 
["lastNotNull"], + "fields": "" + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "process_uptime_milliseconds{service_name=\"josdk\"}", + "legendFormat": "Uptime", + "range": true, + "refId": "A" + } + ], + "title": "Process Uptime", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 16 + }, + "id": 8, + "options": { + "orientation": "auto", + "reduceOptions": { + "values": false, + "calcs": ["lastNotNull"], + "fields": "" + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "system_cpu_count{service_name=\"josdk\"}", + "legendFormat": "CPU Count", + "range": true, + "refId": "A" + } + ], + "title": "CPU Count", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + 
"thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "id": 9, + "options": { + "legend": { + "calcs": ["last"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "rate(jvm_gc_memory_allocated_bytes_total{service_name=\"josdk\"}[5m])", + "legendFormat": "Allocated", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "rate(jvm_gc_memory_promoted_bytes_total{service_name=\"josdk\"}[5m])", + "legendFormat": "Promoted", + "range": true, + "refId": "B" + } + ], + "title": "GC Memory Allocation Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "id": 
10, + "options": { + "legend": { + "calcs": ["last", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "jvm_memory_max_bytes{service_name=\"josdk\", area=\"heap\"}", + "legendFormat": "Max Heap", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "jvm_memory_committed_bytes{service_name=\"josdk\", area=\"heap\"}", + "legendFormat": "Committed Heap", + "range": true, + "refId": "B" + } + ], + "title": "Heap Memory Max vs Committed", + "type": "timeseries" + } + ], + "refresh": "10s", + "schemaVersion": 38, + "style": "dark", + "tags": ["jvm", "java", "josdk"], + "templating": { + "list": [] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "JOSDK - JVM Metrics", + "uid": "josdk-jvm-metrics", + "version": 0, + "weekStart": "" +} diff --git a/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/monitoring/AggregatedMetrics.java b/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/monitoring/AggregatedMetrics.java index f66bdc47c6..4e3540bf55 100644 --- a/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/monitoring/AggregatedMetrics.java +++ b/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/monitoring/AggregatedMetrics.java @@ -103,8 +103,9 @@ public void cleanupDoneFor(ResourceID resourceID, Map metadata) } @Override - public void finishedReconciliation(HasMetadata resource, Map metadata) { - metricsList.forEach(metrics -> metrics.finishedReconciliation(resource, metadata)); + public void successfullyFinishedReconciliation( + HasMetadata resource, Map metadata) { + metricsList.forEach(metrics -> 
metrics.successfullyFinishedReconciliation(resource, metadata)); } @Override diff --git a/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/monitoring/Metrics.java b/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/monitoring/Metrics.java index 10b2db6774..cda6fd167b 100644 --- a/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/monitoring/Metrics.java +++ b/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/monitoring/Metrics.java @@ -93,7 +93,8 @@ default void cleanupDoneFor(ResourceID resourceID, Map metadata) * @param resource the {@link ResourceID} associated with the resource being processed * @param metadata metadata associated with the resource being processed */ - default void finishedReconciliation(HasMetadata resource, Map metadata) {} + default void successfullyFinishedReconciliation( + HasMetadata resource, Map metadata) {} /** * Encapsulates the information about a controller execution i.e. a call to either {@link diff --git a/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/processing/event/EventProcessor.java b/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/processing/event/EventProcessor.java index b476c39614..4ff482f03e 100644 --- a/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/processing/event/EventProcessor.java +++ b/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/processing/event/EventProcessor.java @@ -292,7 +292,7 @@ synchronized void eventProcessingFinished( return; } cleanupOnSuccessfulExecution(executionScope); - metrics.finishedReconciliation(executionScope.getResource(), metricsMetadata); + metrics.successfullyFinishedReconciliation(executionScope.getResource(), metricsMetadata); if ((triggerOnAllEvents() && executionScope.isDeleteEvent()) || (!triggerOnAllEvents() && state.deleteEventPresent())) { cleanupForDeletedEvent(executionScope.getResourceID()); diff --git 
a/operator-framework-core/src/test/java/io/javaoperatorsdk/operator/api/monitoring/AggregatedMetricsTest.java b/operator-framework-core/src/test/java/io/javaoperatorsdk/operator/api/monitoring/AggregatedMetricsTest.java index 68142048b6..36a3ca0877 100644 --- a/operator-framework-core/src/test/java/io/javaoperatorsdk/operator/api/monitoring/AggregatedMetricsTest.java +++ b/operator-framework-core/src/test/java/io/javaoperatorsdk/operator/api/monitoring/AggregatedMetricsTest.java @@ -141,13 +141,13 @@ void cleanupDoneFor_shouldDelegateToAllMetricsInOrder() { } @Test - void finishedReconciliation_shouldDelegateToAllMetricsInOrder() { - aggregatedMetrics.finishedReconciliation(resource, metadata); + void successfullyFinishedReconciliation_shouldDelegateToAllMetricsInOrder() { + aggregatedMetrics.successfullyFinishedReconciliation(resource, metadata); final var inOrder = inOrder(metrics1, metrics2, metrics3); - inOrder.verify(metrics1).finishedReconciliation(resource, metadata); - inOrder.verify(metrics2).finishedReconciliation(resource, metadata); - inOrder.verify(metrics3).finishedReconciliation(resource, metadata); + inOrder.verify(metrics1).successfullyFinishedReconciliation(resource, metadata); + inOrder.verify(metrics2).successfullyFinishedReconciliation(resource, metadata); + inOrder.verify(metrics3).successfullyFinishedReconciliation(resource, metadata); verifyNoMoreInteractions(metrics1, metrics2, metrics3); } diff --git a/sample-operators/webpage/README.md b/sample-operators/webpage/README.md index 7718d0f2f3..96329d18a9 100644 --- a/sample-operators/webpage/README.md +++ b/sample-operators/webpage/README.md @@ -76,3 +76,6 @@ of your choice. The JAR file is built using your local Maven and JDK and then co 1. Deploy the CRD: `kubectl apply -f target/classes/META-INF/fabric8/webpages.sample.javaoperatorsdk-v1.yml` 2. 
Deploy the operator: `kubectl apply -f k8s/operator.yaml` + +To install the observability components - such as Prometheus, OpenTelemetry, and Grafana - execute: +[install-observability.sh](../../observability/install-observability.sh) diff --git a/sample-operators/webpage/pom.xml b/sample-operators/webpage/pom.xml index 6ec60340ae..f8c79cf268 100644 --- a/sample-operators/webpage/pom.xml +++ b/sample-operators/webpage/pom.xml @@ -39,6 +39,13 @@ pom import + + io.micrometer + micrometer-bom + ${micrometer-core.version} + pom + import + @@ -47,6 +54,20 @@ io.javaoperatorsdk operator-framework + + io.javaoperatorsdk + micrometer-support + + + io.micrometer + micrometer-registry-otlp + ${micrometer-core.version} + + + org.yaml + snakeyaml + 2.3 + org.apache.logging.log4j log4j-slf4j2-impl diff --git a/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java b/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java index 5366dc2e9a..ad580736c1 100644 --- a/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java +++ b/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java @@ -16,14 +16,32 @@ package io.javaoperatorsdk.operator.sample; import java.io.IOException; +import java.io.InputStream; import java.net.InetSocketAddress; +import java.util.HashMap; +import java.util.Map; +import org.jspecify.annotations.NonNull; +import org.jspecify.annotations.Nullable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.yaml.snakeyaml.Yaml; import io.javaoperatorsdk.operator.Operator; +import io.javaoperatorsdk.operator.api.monitoring.Metrics; +import io.javaoperatorsdk.operator.monitoring.micrometer.MicrometerMetrics; import io.javaoperatorsdk.operator.sample.probes.LivenessHandler; import io.javaoperatorsdk.operator.sample.probes.StartupHandler; +import io.micrometer.core.instrument.Clock; +import 
io.micrometer.core.instrument.MeterRegistry; +import io.micrometer.core.instrument.binder.jvm.ClassLoaderMetrics; +import io.micrometer.core.instrument.binder.jvm.JvmGcMetrics; +import io.micrometer.core.instrument.binder.jvm.JvmMemoryMetrics; +import io.micrometer.core.instrument.binder.jvm.JvmThreadMetrics; +import io.micrometer.core.instrument.binder.system.ProcessorMetrics; +import io.micrometer.core.instrument.binder.system.UptimeMetrics; +import io.micrometer.registry.otlp.OtlpConfig; +import io.micrometer.registry.otlp.OtlpMeterRegistry; import com.sun.net.httpserver.HttpServer; @@ -40,7 +58,16 @@ public class WebPageOperator { public static void main(String[] args) throws IOException { log.info("WebServer Operator starting!"); - Operator operator = new Operator(o -> o.withStopOnInformerErrorDuringStartup(false)); + // TODO change: + // operator_sdk_reconciliations_queue_size_webpagestandalonedependentsreconciler + // operator_sdk_reconciliations_executions_webpagestandalonedependentsreconciler + // => controller name as label + // TODO add test for checking if there are metrics in Prometheus + // Load configuration from otlp-config.yaml + Metrics metrics = initOTLPMetrics(); + Operator operator = + new Operator(o -> o.withStopOnInformerErrorDuringStartup(false).withMetrics(metrics)); + String reconcilerEnvVar = System.getenv(WEBPAGE_RECONCILER_ENV); if (WEBPAGE_CLASSIC_RECONCILER_ENV_VALUE.equals(reconcilerEnvVar)) { operator.register(new WebPageReconciler()); @@ -58,4 +85,66 @@ public static void main(String[] args) throws IOException { server.setExecutor(null); server.start(); } + + private static @NonNull Metrics initOTLPMetrics() { + Map configProperties = loadConfigFromYaml(); + var otlpConfig = + new OtlpConfig() { + @Override + public String prefix() { + return ""; + } + + @Override + public @Nullable String get(String key) { + return configProperties.get(key); + } + + // these should come from env variables + @Override + public Map 
resourceAttributes() { + return Map.of("service.name", "josdk", "operator", "webpage"); + } + }; + + MeterRegistry registry = new OtlpMeterRegistry(otlpConfig, Clock.SYSTEM); + + // Register JVM and system metrics + log.info("Registering JVM and system metrics..."); + new JvmMemoryMetrics().bindTo(registry); + new JvmGcMetrics().bindTo(registry); + new JvmThreadMetrics().bindTo(registry); + new ClassLoaderMetrics().bindTo(registry); + new ProcessorMetrics().bindTo(registry); + new UptimeMetrics().bindTo(registry); + + return MicrometerMetrics.newPerResourceCollectingMicrometerMetricsBuilder(registry) + .collectingMetricsPerResource() + .build(); + } + + @SuppressWarnings("unchecked") + private static Map loadConfigFromYaml() { + Map configMap = new HashMap<>(); + try (InputStream inputStream = WebPageOperator.class.getResourceAsStream("/otlp-config.yaml")) { + if (inputStream == null) { + log.warn("otlp-config.yaml not found in resources, using default OTLP configuration"); + return configMap; + } + + Yaml yaml = new Yaml(); + Map yamlData = yaml.load(inputStream); + + // Navigate to otlp section and map properties directly + Map otlp = (Map) yamlData.get("otlp"); + if (otlp != null) { + otlp.forEach((key, value) -> configMap.put("otlp." 
+ key, value.toString())); + } + + log.info("Loaded OTLP configuration from otlp-config.yaml: {}", configMap); + } catch (IOException e) { + log.error("Error loading otlp-config.yaml", e); + } + return configMap; + } } diff --git a/sample-operators/webpage/src/main/resources/log4j2.xml b/sample-operators/webpage/src/main/resources/log4j2.xml index 0bf270c7e6..ebe273e40e 100644 --- a/sample-operators/webpage/src/main/resources/log4j2.xml +++ b/sample-operators/webpage/src/main/resources/log4j2.xml @@ -23,7 +23,7 @@ - + diff --git a/sample-operators/webpage/src/main/resources/otlp-config.yaml b/sample-operators/webpage/src/main/resources/otlp-config.yaml new file mode 100644 index 0000000000..17d773eb70 --- /dev/null +++ b/sample-operators/webpage/src/main/resources/otlp-config.yaml @@ -0,0 +1,23 @@ +# +# Copyright Java Operator SDK Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +otlp: + # OTLP Collector endpoint - see observability/install-observability.sh for setup + url: "http://localhost:4318/v1/metrics" +# url: "http://otel-collector-collector.observability.svc.cluster.local:4318/v1/metrics" + step: 15s + batchSize: 15000 + aggregationTemporality: "cumulative"