Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions docs/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ By default, version-checker exposes the following Prometheus metrics on `0.0.0.0
- `version_checker_last_checked`: Timestamp when the image was last checked.
- `version_checker_image_lookup_duration`: Duration of the image version check.
- `version_checker_image_failures_total`: Total number of errors encountered during image version checks.
- Labels: `namespace`, `pod`, `container`, `image`
- This counter is incremented when version-checker cannot determine the upstream image version, including cases where a registry lookup fails or the image/tag is no longer available upstream.

## Kubernetes Version Metrics

Expand All @@ -26,8 +28,46 @@ QUERY="version_checker_is_latest_version"
curl -s --get --data-urlencode query="$QUERY" <PROMETHEUS_URL>
```

### Check for failed image lookups
```sh
QUERY='increase(version_checker_image_failures_total[15m]) > 0'
curl -s --get --data-urlencode query="$QUERY" <PROMETHEUS_URL>
```

### Check Kubernetes cluster version
```sh
QUERY="version_checker_is_latest_kube_version"
curl -s --get --data-urlencode query="$QUERY" <PROMETHEUS_URL>
```

## Alerting on missing or unavailable images

If a pod references an image tag that has been removed upstream, version-checker will fail the lookup for that image and increment `version_checker_image_failures_total` for the affected `namespace`, `pod`, `container`, and `image`.

Example `PrometheusRule`:

```yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: version-checker-image-failures
spec:
groups:
- name: version-checker.rules
rules:
- alert: VersionCheckerImageLookupFailures
expr: increase(version_checker_image_failures_total[15m]) > 0
for: 15m
labels:
severity: warning
annotations:
summary: version-checker cannot resolve an upstream image tag
description: >-
version-checker has failed to look up the upstream image for
{{ $labels.namespace }}/{{ $labels.pod }} container
{{ $labels.container }} (image {{ $labels.image }}) in the last
15 minutes. This can indicate that the tag has been removed or is
otherwise unavailable in the registry.
```

To make this alert effective, ensure version-checker is actually checking the containers you care about, either by enabling `--test-all-containers` / `versionChecker.testAllContainers=true` or by opting specific containers in with `version-checker.jetstack.io/enabled`.
99 changes: 99 additions & 0 deletions pkg/controller/pod_sync_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,24 +2,29 @@ package controller

import (
"context"
"fmt"
"testing"
"time"

"github.com/sirupsen/logrus"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"sigs.k8s.io/controller-runtime/pkg/client/fake"

dto "github.com/prometheus/client_model/go"
"github.com/prometheus/client_golang/prometheus"

"github.com/jetstack/version-checker/pkg/api"
"github.com/jetstack/version-checker/pkg/client"
"github.com/jetstack/version-checker/pkg/controller/checker"
fakesearch "github.com/jetstack/version-checker/pkg/controller/internal/fake/search"
"github.com/jetstack/version-checker/pkg/controller/options"
"github.com/jetstack/version-checker/pkg/controller/search"
"github.com/jetstack/version-checker/pkg/metrics"
"github.com/jetstack/version-checker/pkg/version"
versionerrors "github.com/jetstack/version-checker/pkg/version/errors"
)

// Test for the sync method.
Expand Down Expand Up @@ -155,3 +160,97 @@ func TestController_SyncContainer_NoVersionFound(t *testing.T) {
err := controller.syncContainer(context.Background(), log, builder, pod, container, "container")
assert.NoError(t, err) // We expect no error because IsNoVersionFound is handled gracefully
}

func TestController_SyncContainer_NoVersionFoundReportsFailureMetric(t *testing.T) {
t.Parallel()

log := logrus.NewEntry(logrus.New())
reg := prometheus.NewRegistry()
pod := &corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "test-pod",
Namespace: "default",
},
Spec: corev1.PodSpec{
Containers: []corev1.Container{
{Name: "main-container", Image: "docker.io/example/missing:v1.2.3"},
},
},
Status: corev1.PodStatus{
ContainerStatuses: []corev1.ContainerStatus{
{Name: "main-container", ImageID: "docker.io/example/missing@sha256:deadbeef"},
},
},
}
kubeClient := fake.NewClientBuilder().WithObjects(pod).Build()
metrics := metrics.New(log, reg, kubeClient)
checker := checker.New(
fakesearch.New().With(nil, versionerrors.NewVersionErrorNotFound("%s", fmt.Sprintf("no tags found for given image URL: %q", "docker.io/example/missing"))),
)

controller := &PodReconciler{
Log: log,
VersionChecker: checker,
Metrics: metrics,
defaultTestAll: true,
}

builder := options.New(map[string]string{
"version-checker.jetstack.io/enabled": "true",
})
Comment on lines +198 to +200

err := controller.syncContainer(
context.Background(),
log,
builder,
pod,
&pod.Spec.Containers[0],
"container",
)
require.NoError(t, err)

metricFamilies, err := reg.Gather()
require.NoError(t, err)

metric := findMetricWithLabels(t, metricFamilies, "version_checker_image_failures_total", map[string]string{
"namespace": "default",
"pod": "test-pod",
"container": "main-container",
"image": "docker.io/example/missing:v1.2.3",
})
require.NotNil(t, metric.Counter)
assert.Equal(t, float64(1), metric.Counter.GetValue())
}

// findMetricWithLabels returns the first metric in the family called name
// whose label set contains every pair in expectedLabels. Labels the metric
// carries beyond expectedLabels are ignored. When nothing matches, the test is
// failed immediately via require.FailNow.
func findMetricWithLabels(t *testing.T, metricFamilies []*dto.MetricFamily, name string, expectedLabels map[string]string) *dto.Metric {
	t.Helper()

	for _, family := range metricFamilies {
		if family.GetName() != name {
			continue
		}

		for _, candidate := range family.GetMetric() {
			// Flatten the label pairs into a map so they can be compared by key.
			pairs := candidate.GetLabel()
			actual := make(map[string]string, len(pairs))
			for _, pair := range pairs {
				actual[pair.GetName()] = pair.GetValue()
			}

			if !matchesExpectedLabels(actual, expectedLabels) {
				continue
			}
			return candidate
		}
	}

	require.FailNow(t, fmt.Sprintf("metric %q with labels %+v not found", name, expectedLabels))
	return nil
}

// matchesExpectedLabels reports whether labels agrees with every key/value
// pair in expectedLabels. Entries in labels that expectedLabels does not
// mention are ignored, and a key missing from labels compares as the empty
// string.
func matchesExpectedLabels(labels, expectedLabels map[string]string) bool {
	for name := range expectedLabels {
		if got, want := labels[name], expectedLabels[name]; got != want {
			return false
		}
	}

	return true
}
Loading