diff --git a/docs/metrics.md b/docs/metrics.md index 49405e1..35622eb 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -8,6 +8,8 @@ By default, version-checker exposes the following Prometheus metrics on `0.0.0.0 - `version_checker_last_checked`: Timestamp when the image was last checked. - `version_checker_image_lookup_duration`: Duration of the image version check. - `version_checker_image_failures_total`: Total of errors encountered during image version checks. + - Labels: `namespace`, `pod`, `container`, `image` + - This counter is incremented when version-checker cannot determine the upstream image version, including cases where a registry lookup fails or the image/tag is no longer available upstream. ## Kubernetes Version Metrics @@ -26,8 +28,46 @@ QUERY="version_checker_is_latest_version" curl -s --get --data-urlencode query=$QUERY ``` +### Check for failed image lookups +```sh +QUERY='increase(version_checker_image_failures_total[15m]) > 0' +curl -s --get --data-urlencode query="$QUERY" +``` + ### Check Kubernetes cluster version ```sh QUERY="version_checker_is_latest_kube_version" curl -s --get --data-urlencode query=$QUERY ``` + +## Alerting on missing or unavailable images + +If a pod references an image tag that has been removed upstream, version-checker will fail the lookup for that image and increment `version_checker_image_failures_total` for the affected `namespace`, `pod`, `container`, and `image`. 
+
+Example `PrometheusRule`:
+
+```yaml
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: version-checker-image-failures
+spec:
+  groups:
+    - name: version-checker.rules
+      rules:
+        - alert: VersionCheckerImageLookupFailures
+          expr: increase(version_checker_image_failures_total[15m]) > 0
+          for: 15m
+          labels:
+            severity: warning
+          annotations:
+            summary: version-checker cannot resolve an upstream image tag
+            description: >-
+              version-checker has failed to look up the upstream image for
+              {{ $labels.namespace }}/{{ $labels.pod }} container
+              {{ $labels.container }} (image {{ $labels.image }}) in the last
+              15 minutes. This can indicate that the tag has been removed or is
+              otherwise unavailable in the registry.
+```
+
+To make this alert effective, ensure version-checker is actually checking the containers you care about, either by enabling `--test-all-containers` / `versionChecker.testAllContainers=true` or by opting specific containers in with the `enable.version-checker.io/<container-name>: "true"` pod annotation.
diff --git a/pkg/controller/pod_sync_test.go b/pkg/controller/pod_sync_test.go index ff214d8..7e695bc 100644 --- a/pkg/controller/pod_sync_test.go +++ b/pkg/controller/pod_sync_test.go @@ -2,24 +2,29 @@ package controller import ( "context" + "fmt" "testing" "time" "github.com/sirupsen/logrus" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/controller-runtime/pkg/client/fake" + dto "github.com/prometheus/client_model/go" "github.com/prometheus/client_golang/prometheus" "github.com/jetstack/version-checker/pkg/api" "github.com/jetstack/version-checker/pkg/client" "github.com/jetstack/version-checker/pkg/controller/checker" + fakesearch "github.com/jetstack/version-checker/pkg/controller/internal/fake/search" "github.com/jetstack/version-checker/pkg/controller/options" "github.com/jetstack/version-checker/pkg/controller/search" "github.com/jetstack/version-checker/pkg/metrics" "github.com/jetstack/version-checker/pkg/version" + versionerrors "github.com/jetstack/version-checker/pkg/version/errors" ) // Test for the sync method. 
@@ -155,3 +160,120 @@ func TestController_SyncContainer_NoVersionFound(t *testing.T) {
 	err := controller.syncContainer(context.Background(), log, builder, pod, container, "container")
 	assert.NoError(t, err) // We expect no error because IsNoVersionFound is handled gracefully
 }
+
+// TestController_SyncContainer_NoVersionFoundReportsFailureMetric checks that a
+// "version not found" lookup error is not returned by syncContainer but is
+// still counted once in version_checker_image_failures_total for the affected
+// namespace, pod, container and image.
+func TestController_SyncContainer_NoVersionFoundReportsFailureMetric(t *testing.T) {
+	t.Parallel()
+
+	log := logrus.NewEntry(logrus.New())
+	// A registry of its own keeps this test's metrics separate from other tests.
+	reg := prometheus.NewRegistry()
+	pod := &corev1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      "test-pod",
+			Namespace: "default",
+		},
+		Spec: corev1.PodSpec{
+			Containers: []corev1.Container{
+				{Name: "main-container", Image: "docker.io/example/missing:v1.2.3"},
+			},
+		},
+		Status: corev1.PodStatus{
+			ContainerStatuses: []corev1.ContainerStatus{
+				{Name: "main-container", ImageID: "docker.io/example/missing@sha256:deadbeef"},
+			},
+		},
+	}
+	kubeClient := fake.NewClientBuilder().WithObjects(pod).Build()
+
+	// Locals are not named after the imported metrics and checker packages,
+	// so those packages are not shadowed for the rest of the test.
+	imageMetrics := metrics.New(log, reg, kubeClient)
+	// The fake search is set up with a nil result and a not-found error; the
+	// format string and its argument go to the constructor directly rather
+	// than being rendered with fmt.Sprintf first.
+	versionChecker := checker.New(
+		fakesearch.New().With(nil, versionerrors.NewVersionErrorNotFound(
+			"no tags found for given image URL: %q", "docker.io/example/missing",
+		)),
+	)
+
+	controller := &PodReconciler{
+		Log:            log,
+		VersionChecker: versionChecker,
+		Metrics:        imageMetrics,
+		defaultTestAll: true,
+	}
+
+	// NOTE(review): the opt-in key is assumed to follow the project's
+	// per-container "enable.version-checker.io/<container>" form — confirm
+	// against pkg/api. defaultTestAll above is also set.
+	builder := options.New(map[string]string{
+		"enable.version-checker.io/main-container": "true",
+	})
+
+	err := controller.syncContainer(
+		context.Background(),
+		log,
+		builder,
+		pod,
+		&pod.Spec.Containers[0],
+		"container",
+	)
+	require.NoError(t, err)
+
+	metricFamilies, err := reg.Gather()
+	require.NoError(t, err)
+
+	failure := findMetricWithLabels(t, metricFamilies, "version_checker_image_failures_total", map[string]string{
+		"namespace": "default",
+		"pod":       "test-pod",
+		"container": "main-container",
+		"image":     "docker.io/example/missing:v1.2.3",
+	})
+	require.NotNil(t, failure.Counter)
+	assert.Equal(t, float64(1), failure.Counter.GetValue())
+}
+
+// findMetricWithLabels returns the first metric in the family called name
+// whose labels include every pair in expectedLabels. If no metric matches,
+// the test is failed immediately.
+func findMetricWithLabels(t *testing.T, metricFamilies []*dto.MetricFamily, name string, expectedLabels map[string]string) *dto.Metric {
+	t.Helper()
+
+	for _, family := range metricFamilies {
+		if family.GetName() != name {
+			continue
+		}
+
+		for _, candidate := range family.GetMetric() {
+			pairs := candidate.GetLabel()
+			labels := make(map[string]string, len(pairs))
+			for _, pair := range pairs {
+				labels[pair.GetName()] = pair.GetValue()
+			}
+			if matchesExpectedLabels(labels, expectedLabels) {
+				return candidate
+			}
+		}
+	}
+
+	require.FailNow(t, fmt.Sprintf("metric %q with labels %+v not found", name, expectedLabels))
+	return nil // not reached: FailNow stops the test
+}
+
+// matchesExpectedLabels reports whether labels contains every key/value pair
+// in expectedLabels. Extra entries in labels are ignored, and a key missing
+// from labels compares as the empty string.
+func matchesExpectedLabels(labels, expectedLabels map[string]string) bool {
+	for key, want := range expectedLabels {
+		if got := labels[key]; got != want {
+			return false
+		}
+	}
+
+	return true
+}