From 1d6ef5ff554160066f401dc7370b369ef09e5391 Mon Sep 17 00:00:00 2001 From: lukasmetzner Date: Thu, 9 Apr 2026 09:36:50 +0200 Subject: [PATCH 01/10] feat(instances): cache server lookups to reduce API calls --- docs/reference/README.md | 1 + docs/reference/instance_cache.md | 26 +++ hcloud/cloud.go | 36 ++-- hcloud/cloud_test.go | 5 + hcloud/instances.go | 12 +- hcloud/instances_test.go | 41 ++-- hcloud/instances_util.go | 27 --- internal/config/config.go | 31 ++++ internal/config/config_test.go | 46 ++--- internal/metrics/metrics.go | 7 +- internal/servercache/allservercache.go | 119 ++++++++++++ internal/servercache/allservercache_test.go | 195 ++++++++++++++++++++ internal/servercache/evalservercache.go | 63 +++++++ internal/servercache/perservercache.go | 123 ++++++++++++ internal/servercache/perservercache_test.go | 165 +++++++++++++++++ internal/servercache/servercache.go | 72 ++++++++ internal/servercache/servercache_test.go | 23 +++ 17 files changed, 909 insertions(+), 83 deletions(-) create mode 100644 docs/reference/instance_cache.md create mode 100644 internal/servercache/allservercache.go create mode 100644 internal/servercache/allservercache_test.go create mode 100644 internal/servercache/evalservercache.go create mode 100644 internal/servercache/perservercache.go create mode 100644 internal/servercache/perservercache_test.go create mode 100644 internal/servercache/servercache.go create mode 100644 internal/servercache/servercache_test.go diff --git a/docs/reference/README.md b/docs/reference/README.md index 82e995f24..12942c0be 100644 --- a/docs/reference/README.md +++ b/docs/reference/README.md @@ -6,3 +6,4 @@ In this folder, you should find technical references material of the hcloud-clou - [Version Policy](version-policy.md) - [Load Balancer Annotations](load_balancer_annotations.md) - [Load Balancer Environment Variables](load_balancer_envs.md) +- [Instance Cache](instance_cache.md) diff --git a/docs/reference/instance_cache.md b/docs/reference/instance_cache.md new file mode 100644 index 000000000..6ba6491a2 --- /dev/null +++ b/docs/reference/instance_cache.md @@ -0,0 +1,26 @@ +# Instance Cache + +> **Experimental:** Instance caching is experimental, breaking changes may occur within minor releases. We believe the implementation is safe in practice — that is why it ships enabled by default (`all-server`). Set `HCLOUD_INSTANCES_CACHE_MODE=off` to opt out. + +The instance cache reduces calls to the Hetzner Cloud API made by the `InstancesV2` controller, which looks up Servers by ID or name to reconcile Node state. The cache sits between the controller and the Hetzner Cloud API; behavior is controlled by the environment variables below. + +## Environment Variables + +| Name | Type | Default | Description | +| ----------------------------- | --------------------------------- | ------------ | ------------------------------------------------------------------------------------- | +| `HCLOUD_INSTANCES_CACHE_MODE` | `all-server \| per-server \| off` | `all-server` | Selects the caching strategy. See [Modes](#modes) below. | +| `HCLOUD_INSTANCES_CACHE_TTL` | `duration` | `10s` | Lifetime of cached entries. Accepts any Go `time.Duration` string (e.g. `30s`, `2m`). | + +## Modes + +### `all-server` + +Fetches every Server in the project with a single `GET /servers` call and serves all subsequent `ByID` / `ByName` lookups from the resulting snapshot until the TTL expires. The snapshot is refreshed on the next lookup after expiry. On a cache miss within the TTL (e.g. a freshly created Server), one rate-limited refresh per TTL window is allowed to pick up the new Server; further misses in the same window return without an API call. + +### `per-server` + +Caches each Server individually with its own expiration. A `ByID` / `ByName` lookup either returns a non-expired entry or issues a `GET /servers/{id}` (or `GET /servers?name=`) call and stores the result. Expired entries are evicted lazily when other entries are inserted. + +### `off` + +Disables caching entirely. Every lookup goes directly to the API. diff --git a/hcloud/cloud.go b/hcloud/cloud.go index ab1bd4cdf..63d2fbe73 100644 --- a/hcloud/cloud.go +++ b/hcloud/cloud.go @@ -37,6 +37,7 @@ import ( "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/hcops" "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/metrics" "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/robot" + "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/servercache" "github.com/hetznercloud/hcloud-go/v2/hcloud" "github.com/hetznercloud/hcloud-go/v2/hcloud/metadata" ) @@ -50,13 +51,14 @@ const ( var providerVersion = "unknown" type cloud struct { - client *hcloud.Client - robotClient hrobot.RobotClient - cfg config.HCCMConfiguration - recorder record.EventRecorder - networkID int64 - cidr string - nodeLister corelisters.NodeLister + client *hcloud.Client + robotClient hrobot.RobotClient + instanceCache servercache.ServerCache + cfg config.HCCMConfiguration + recorder record.EventRecorder + networkID int64 + cidr string + nodeLister corelisters.NodeLister } func NewCloud(cidr string, nodeLister corelisters.NodeLister) (cloudprovider.Interface, error) { @@ -144,13 +146,19 @@ func NewCloud(cidr string, nodeLister corelisters.NodeLister) (cloudprovider.Int klog.Infof("Hetzner Cloud k8s cloud controller %s started\n", providerVersion) + instanceCache, err := servercache.New(client, "instances_v2", cfg.Instance.Cache.Mode, cfg.Instance.Cache.TTL) + if err != nil { + return nil, fmt.Errorf("%s: %w", op, err) + } + return &cloud{ - client: client, - robotClient: robotClient, - cfg: cfg, - networkID: networkID, - cidr: cidr, - nodeLister: nodeLister, + client: client, + robotClient: robotClient, + instanceCache: instanceCache, + cfg: cfg, + networkID: networkID, + cidr: cidr, + nodeLister: nodeLister, }, nil } @@ -175,7 +183,7 @@ func (c *cloud) Instances() (cloudprovider.Instances, bool) { } func (c *cloud) InstancesV2() (cloudprovider.InstancesV2, bool) { - return newInstances(c.client, c.robotClient, c.recorder, c.networkID, c.cfg), true + return newInstances(c.client, c.robotClient, c.instanceCache, c.recorder, c.networkID, c.cfg), true } func (c *cloud) Zones() (cloudprovider.Zones, bool) { diff --git a/hcloud/cloud_test.go b/hcloud/cloud_test.go index 28e363c95..fa653642d 100644 --- a/hcloud/cloud_test.go +++ b/hcloud/cloud_test.go @@ -31,6 +31,7 @@ import ( "k8s.io/client-go/tools/record" "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/config" + "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/servercache" "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/testsupport" "github.com/hetznercloud/hcloud-go/v2/hcloud" "github.com/hetznercloud/hcloud-go/v2/hcloud/schema" @@ -41,6 +42,7 @@ type testEnv struct { Mux *http.ServeMux Client *hcloud.Client RobotClient hrobot.RobotClient + ServerCache servercache.ServerCache Recorder record.EventRecorder Cfg config.HCCMConfiguration } @@ -51,6 +53,7 @@ func (env *testEnv) Teardown() { env.Mux = nil env.Client = nil env.RobotClient = nil + env.ServerCache = nil env.Recorder = nil } @@ -66,6 +69,7 @@ func newTestEnv() testEnv { ) robotClient := hrobot.NewBasicAuthClient("", "") robotClient.SetBaseURL(server.URL + "/robot") + serverCache := servercache.NewPerServerCache(client, "instances_v2", 10*time.Second) recorder := record.NewBroadcaster().NewRecorder(scheme.Scheme, corev1.EventSource{Component: "hcloud-cloud-controller-manager"}) cfg := config.HCCMConfiguration{} @@ -76,6 +80,7 @@ func newTestEnv() testEnv { Mux: mux, Client: client, RobotClient: robotClient, + ServerCache: serverCache, Recorder: recorder, Cfg: cfg, } diff --git a/hcloud/instances.go b/hcloud/instances.go index a6f7242b5..e72e8f7f8 100644 --- a/hcloud/instances.go +++ b/hcloud/instances.go @@ -33,6 +33,7 @@ import ( "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/legacydatacenter" "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/metrics" "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/providerid" + "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/servercache" "github.com/hetznercloud/hcloud-go/v2/hcloud" ) @@ -44,6 +45,7 @@ const ( type instances struct { client *hcloud.Client robotClient hrobot.RobotClient + serverCache servercache.ServerCache recorder record.EventRecorder networkID int64 cfg config.HCCMConfiguration @@ -57,6 +59,7 @@ var ( func newInstances( client *hcloud.Client, robotClient hrobot.RobotClient, + serverCache servercache.ServerCache, recorder record.EventRecorder, networkID int64, cfg config.HCCMConfiguration, @@ -64,6 +67,7 @@ func newInstances( return &instances{ client, robotClient, + serverCache, recorder, networkID, cfg, @@ -80,13 +84,12 @@ func (i *instances) lookupServer( if node.Spec.ProviderID != "" { var serverID int64 serverID, isCloudServer, err := providerid.ToServerID(node.Spec.ProviderID) - if err != nil { return nil, fmt.Errorf("failed to convert provider id to server id: %w", err) } if isCloudServer { - server, err := getCloudServerByID(ctx, i.client, serverID) + server, err := i.serverCache.ByID(ctx, serverID) if err != nil { return nil, fmt.Errorf("failed to get hcloud server \"%d\": %w", serverID, err) } @@ -115,7 +118,7 @@ func (i *instances) lookupServer( // If the node has no provider ID we try to find the server by name from // both sources. In case we find two servers, we return an error. - cloudServer, err := getCloudServerByName(ctx, i.client, node.Name) + cloudServer, err := i.serverCache.ByName(ctx, node.Name) if err != nil { return nil, fmt.Errorf("failed to get hcloud server %q: %w", node.Name, err) } @@ -153,6 +156,7 @@ func (i *instances) lookupServer( func (i *instances) InstanceExists(ctx context.Context, node *corev1.Node) (bool, error) { const op = "hcloud/instancesv2.InstanceExists" metrics.OperationCalled.WithLabelValues(op).Inc() + klog.V(4).InfoS("InstanceExists called", "node", node.Name, "providerID", node.Spec.ProviderID) server, err := i.lookupServer(ctx, node) if err != nil { @@ -165,6 +169,7 @@ func (i *instances) InstanceExists(ctx context.Context, node *corev1.Node) (bool func (i *instances) InstanceShutdown(ctx context.Context, node *corev1.Node) (bool, error) { const op = "hcloud/instancesv2.InstanceShutdown" metrics.OperationCalled.WithLabelValues(op).Inc() + klog.V(4).InfoS("InstanceShutdown called", "node", node.Name, "providerID", node.Spec.ProviderID) server, err := i.lookupServer(ctx, node) if err != nil { @@ -188,6 +193,7 @@ func (i *instances) InstanceShutdown(ctx context.Context, node *corev1.Node) (bo func (i *instances) InstanceMetadata(ctx context.Context, node *corev1.Node) (*cloudprovider.InstanceMetadata, error) { const op = "hcloud/instancesv2.InstanceMetadata" metrics.OperationCalled.WithLabelValues(op).Inc() + klog.V(4).InfoS("InstanceMetadata called", "node", node.Name, "providerID", node.Spec.ProviderID) server, err := i.lookupServer(ctx, node) if err != nil { diff --git a/hcloud/instances_test.go b/hcloud/instances_test.go index 5f4aef91b..81871a3f2 100644 --- a/hcloud/instances_test.go +++ b/hcloud/instances_test.go @@ -91,7 +91,7 @@ func TestInstances_InstanceExists(t *testing.T) { }) }) - instances := newInstances(env.Client, env.RobotClient, env.Recorder, 0, env.Cfg) + instances := newInstances(env.Client, env.RobotClient, env.ServerCache, env.Recorder, 0, env.Cfg) tests := []struct { name string @@ -104,7 +104,8 @@ func TestInstances_InstanceExists(t *testing.T) { Spec: corev1.NodeSpec{ProviderID: "hcloud://1"}, }, expected: true, - }, { + }, + { name: "existing robot server by id", node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ @@ -123,25 +124,29 @@ func TestInstances_InstanceExists(t *testing.T) { Spec: corev1.NodeSpec{ProviderID: "hcloud://bm-321"}, }, expected: true, - }, { + }, + { name: "missing server by id", node: &corev1.Node{ Spec: corev1.NodeSpec{ProviderID: "hcloud://2"}, }, expected: false, - }, { + }, + { name: "missing robot server by id", node: &corev1.Node{ Spec: corev1.NodeSpec{ProviderID: "hrobot://322"}, }, expected: false, - }, { + }, + { name: "missing robot server by (legacy) id", node: &corev1.Node{ Spec: corev1.NodeSpec{ProviderID: "hcloud://bm-322"}, }, expected: false, - }, { + }, + { name: "existing server by name", node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ @@ -149,7 +154,8 @@ func TestInstances_InstanceExists(t *testing.T) { }, }, expected: true, - }, { + }, + { name: "existing robot server by name", node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ @@ -157,7 +163,8 @@ func TestInstances_InstanceExists(t *testing.T) { }, }, expected: true, - }, { + }, + { name: "missing server by name", node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ @@ -165,7 +172,8 @@ func TestInstances_InstanceExists(t *testing.T) { }, }, expected: false, - }, { + }, + { name: "missing robot server by name", node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ @@ -211,7 +219,7 @@ func TestInstances_InstanceShutdown(t *testing.T) { }) }) - instances := newInstances(env.Client, env.RobotClient, env.Recorder, 0, env.Cfg) + instances := newInstances(env.Client, env.RobotClient, env.ServerCache, env.Recorder, 0, env.Cfg) env.Mux.HandleFunc("/robot/server/3", func(w http.ResponseWriter, _ *http.Request) { json.NewEncoder(w).Encode(hrobotmodels.ServerResponse{ Server: hrobotmodels.Server{ @@ -274,13 +282,15 @@ func TestInstances_InstanceShutdown(t *testing.T) { Spec: corev1.NodeSpec{ProviderID: "hcloud://1"}, }, expected: false, - }, { + }, + { name: "[cloud] shutdown", node: &corev1.Node{ Spec: corev1.NodeSpec{ProviderID: "hcloud://2"}, }, expected: true, - }, { + }, + { name: "[robot] running", node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ @@ -289,7 +299,8 @@ func TestInstances_InstanceShutdown(t *testing.T) { Spec: corev1.NodeSpec{ProviderID: "hrobot://3"}, }, expected: false, - }, { + }, + { name: "[robot] shutdown", node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ @@ -346,7 +357,7 @@ func TestInstances_InstanceMetadata(t *testing.T) { }) }) - instances := newInstances(env.Client, env.RobotClient, env.Recorder, 0, env.Cfg) + instances := newInstances(env.Client, env.RobotClient, env.ServerCache, env.Recorder, 0, env.Cfg) metadata, err := instances.InstanceMetadata(context.TODO(), &corev1.Node{ Spec: corev1.NodeSpec{ProviderID: "hcloud://1"}, @@ -390,7 +401,7 @@ func TestInstances_InstanceMetadataRobotServer(t *testing.T) { }) }) - instances := newInstances(env.Client, env.RobotClient, env.Recorder, 0, env.Cfg) + instances := newInstances(env.Client, env.RobotClient, env.ServerCache, env.Recorder, 0, env.Cfg) metadata, err := instances.InstanceMetadata(context.TODO(), &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ diff --git a/hcloud/instances_util.go b/hcloud/instances_util.go index 370c68c9a..a86f0b447 100644 --- a/hcloud/instances_util.go +++ b/hcloud/instances_util.go @@ -17,7 +17,6 @@ limitations under the License. package hcloud import ( - "context" "fmt" "regexp" "strings" @@ -26,9 +25,6 @@ import ( hrobotmodels "github.com/syself/hrobot-go/models" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/runtime" - - "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/metrics" - "github.com/hetznercloud/hcloud-go/v2/hcloud" ) type MockEventRecorder struct{} @@ -51,29 +47,6 @@ func (er *MockEventRecorder) AnnotatedEventf( ) { } -func getCloudServerByName(ctx context.Context, c *hcloud.Client, name string) (*hcloud.Server, error) { - const op = "hcloud/getCloudServerByName" - metrics.OperationCalled.WithLabelValues(op).Inc() - - server, _, err := c.Server.GetByName(ctx, name) - if err != nil { - return nil, fmt.Errorf("%s: %w", op, err) - } - - return server, nil -} - -func getCloudServerByID(ctx context.Context, c *hcloud.Client, id int64) (*hcloud.Server, error) { - const op = "hcloud/getCloudServerByID" - metrics.OperationCalled.WithLabelValues(op).Inc() - - server, _, err := c.Server.GetByID(ctx, id) - if err != nil { - return nil, fmt.Errorf("%s: %w", op, err) - } - return server, nil -} - func getRobotServerByName(c hrobot.RobotClient, node *corev1.Node) (server *hrobotmodels.Server, err error) { const op = "hcloud/getRobotServerByName" diff --git a/internal/config/config.go b/internal/config/config.go index 32c08f409..628f293c0 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -11,6 +11,7 @@ import ( "k8s.io/klog/v2" + "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/servercache" "github.com/hetznercloud/hcloud-go/v2/hcloud" "github.com/hetznercloud/hcloud-go/v2/hcloud/exp/kit/envutil" ) @@ -29,6 +30,8 @@ const ( robotForwardInternalIPs = "ROBOT_FORWARD_INTERNAL_IPS" hcloudInstancesAddressFamily = "HCLOUD_INSTANCES_ADDRESS_FAMILY" + hcloudInstancesCacheMode = "HCLOUD_INSTANCES_CACHE_MODE" + hcloudInstancesCacheTTL = "HCLOUD_INSTANCES_CACHE_TTL" // Disable the "master/server is attached to the network" check against the metadata service. hcloudNetworkDisableAttachedCheck = "HCLOUD_NETWORK_DISABLE_ATTACHED_CHECK" @@ -67,8 +70,16 @@ const ( AddressFamilyIPv4 AddressFamily = "ipv4" ) +const InstanceCacheDefaultTTL time.Duration = 10 * time.Second + type InstanceConfiguration struct { AddressFamily AddressFamily + Cache InstanceConfigurationCache +} + +type InstanceConfigurationCache struct { + Mode servercache.Mode + TTL time.Duration } type LoadBalancerConfiguration struct { @@ -174,6 +185,26 @@ func Read() (HCCMConfiguration, error) { cfg.Instance.AddressFamily = AddressFamilyIPv4 } + // ---- Instance Cache ---- + + cfg.Instance.Cache = InstanceConfigurationCache{ + Mode: servercache.ModeAllServers, + TTL: InstanceCacheDefaultTTL, + } + + if mode, ok := os.LookupEnv(hcloudInstancesCacheMode); ok { + cfg.Instance.Cache.Mode = servercache.Mode(mode) + } + + if ttlStr, ok := os.LookupEnv(hcloudInstancesCacheTTL); ok { + ttl, err := time.ParseDuration(ttlStr) + if err != nil { + errs = append(errs, fmt.Errorf("invalid value for %q: %w", hcloudInstancesCacheTTL, err)) + } else { + cfg.Instance.Cache.TTL = ttl + } + } + cfg.LoadBalancer.Enabled, err = getEnvBool(hcloudLoadBalancersEnabled, true) if err != nil { errs = append(errs, err) diff --git a/internal/config/config_test.go b/internal/config/config_test.go index d6fb65ede..879cab2e5 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -7,6 +7,7 @@ import ( "github.com/stretchr/testify/assert" + "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/servercache" "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/testsupport" "github.com/hetznercloud/hcloud-go/v2/hcloud" ) @@ -25,7 +26,7 @@ func TestRead(t *testing.T) { want: HCCMConfiguration{ Robot: RobotConfiguration{CacheTimeout: 5 * time.Minute}, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, Network: NetworkConfiguration{ AttachedCheckEnabled: true, }, @@ -48,7 +49,7 @@ func TestRead(t *testing.T) { HCloudClient: HCloudClientConfiguration{Token: "jr5g7ZHpPptyhJzZyHw2Pqu4g9gTqDvEceYpngPf79jN_NOT_VALID_dzhepnahq"}, Robot: RobotConfiguration{CacheTimeout: 5 * time.Minute}, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, Network: NetworkConfiguration{ NameOrID: "foobar", AttachedCheckEnabled: true, @@ -85,7 +86,7 @@ func TestRead(t *testing.T) { ForwardInternalIPs: false, }, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, Network: NetworkConfiguration{ AttachedCheckEnabled: true, }, @@ -141,7 +142,7 @@ failed to read ROBOT_PASSWORD_FILE: open /tmp/hetzner-password: no such file or }, Robot: RobotConfiguration{CacheTimeout: 5 * time.Minute}, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, Network: NetworkConfiguration{ AttachedCheckEnabled: true, }, @@ -170,7 +171,7 @@ failed to read ROBOT_PASSWORD_FILE: open /tmp/hetzner-password: no such file or want: HCCMConfiguration{ Robot: RobotConfiguration{CacheTimeout: 5 * time.Minute}, Metrics: MetricsConfiguration{Enabled: false, Address: "127.0.0.1:9999"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, Network: NetworkConfiguration{ AttachedCheckEnabled: true, }, @@ -201,7 +202,7 @@ failed to read ROBOT_PASSWORD_FILE: open /tmp/hetzner-password: no such file or ForwardInternalIPs: true, }, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, Network: NetworkConfiguration{ AttachedCheckEnabled: true, }, @@ -233,7 +234,7 @@ failed to read ROBOT_PASSWORD_FILE: open /tmp/hetzner-password: no such file or ForwardInternalIPs: false, }, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, Network: NetworkConfiguration{ AttachedCheckEnabled: true, }, @@ -253,7 +254,7 @@ failed to read ROBOT_PASSWORD_FILE: open /tmp/hetzner-password: no such file or want: HCCMConfiguration{ Robot: RobotConfiguration{CacheTimeout: 5 * time.Minute}, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv6}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv6, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, Network: NetworkConfiguration{ AttachedCheckEnabled: true, }, @@ -274,7 +275,7 @@ failed to read ROBOT_PASSWORD_FILE: open /tmp/hetzner-password: no such file or want: HCCMConfiguration{ Robot: RobotConfiguration{CacheTimeout: 5 * time.Minute}, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, LoadBalancer: LoadBalancerConfiguration{ Enabled: true, PrivateIngressEnabled: true, @@ -297,7 +298,7 @@ failed to read ROBOT_PASSWORD_FILE: open /tmp/hetzner-password: no such file or want: HCCMConfiguration{ Robot: RobotConfiguration{CacheTimeout: 5 * time.Minute}, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, LoadBalancer: LoadBalancerConfiguration{ Enabled: true, PrivateIngressEnabled: true, @@ -324,7 +325,7 @@ failed to read ROBOT_PASSWORD_FILE: open /tmp/hetzner-password: no such file or want: HCCMConfiguration{ Robot: RobotConfiguration{CacheTimeout: 5 * time.Minute}, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, Network: NetworkConfiguration{ AttachedCheckEnabled: true, }, @@ -413,7 +414,7 @@ func TestHCCMConfiguration_Validate(t *testing.T) { name: "minimal", fields: fields{ HCloudClient: HCloudClientConfiguration{Token: "jr5g7ZHpPptyhJzZyHw2Pqu4g9gTqDvEceYpngPf79jN_NOT_VALID_dzhepnahq"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, }, wantErr: nil, }, @@ -423,7 +424,7 @@ func TestHCCMConfiguration_Validate(t *testing.T) { fields: fields{ HCloudClient: HCloudClientConfiguration{Token: "jr5g7ZHpPptyhJzZyHw2Pqu4g9gTqDvEceYpngPf79jN_NOT_VALID_dzhepnahq"}, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, Network: NetworkConfiguration{ NameOrID: "foobar", }, @@ -437,7 +438,7 @@ func TestHCCMConfiguration_Validate(t *testing.T) { name: "token missing", fields: fields{ HCloudClient: HCloudClientConfiguration{Token: ""}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, }, wantErr: errors.New("environment variable \"HCLOUD_TOKEN\" is required"), }, @@ -445,7 +446,7 @@ func TestHCCMConfiguration_Validate(t *testing.T) { name: "address family invalid", fields: fields{ HCloudClient: HCloudClientConfiguration{Token: "jr5g7ZHpPptyhJzZyHw2Pqu4g9gTqDvEceYpngPf79jN_NOT_VALID_dzhepnahq"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamily("foobar")}, + Instance: InstanceConfiguration{AddressFamily: AddressFamily("foobar"), Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, }, wantErr: errors.New("invalid value for \"HCLOUD_INSTANCES_ADDRESS_FAMILY\", expect one of: ipv4,ipv6,dualstack"), }, @@ -453,7 +454,7 @@ func TestHCCMConfiguration_Validate(t *testing.T) { name: "LB location and network zone set", fields: fields{ HCloudClient: HCloudClientConfiguration{Token: "jr5g7ZHpPptyhJzZyHw2Pqu4g9gTqDvEceYpngPf79jN_NOT_VALID_dzhepnahq"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, LoadBalancer: LoadBalancerConfiguration{ Location: "nbg1", NetworkZone: "eu-central", @@ -465,7 +466,7 @@ func TestHCCMConfiguration_Validate(t *testing.T) { name: "LB private subnet invalid cidr", fields: fields{ HCloudClient: HCloudClientConfiguration{Token: "jr5g7ZHpPptyhJzZyHw2Pqu4g9gTqDvEceYpngPf79jN_NOT_VALID_dzhepnahq"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, LoadBalancer: LoadBalancerConfiguration{ PrivateSubnetIPRange: "10.0.0.0/33", }, @@ -476,7 +477,7 @@ func TestHCCMConfiguration_Validate(t *testing.T) { name: "algorithm type invalid", fields: fields{ HCloudClient: HCloudClientConfiguration{Token: "jr5g7ZHpPptyhJzZyHw2Pqu4g9gTqDvEceYpngPf79jN_NOT_VALID_dzhepnahq"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, LoadBalancer: LoadBalancerConfiguration{ AlgorithmType: "invalid", }, @@ -487,7 +488,7 @@ func TestHCCMConfiguration_Validate(t *testing.T) { name: "robot enabled without credentials (valid)", fields: fields{ HCloudClient: HCloudClientConfiguration{Token: "jr5g7ZHpPptyhJzZyHw2Pqu4g9gTqDvEceYpngPf79jN_NOT_VALID_dzhepnahq"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, Robot: RobotConfiguration{ Enabled: true, @@ -499,7 +500,7 @@ func TestHCCMConfiguration_Validate(t *testing.T) { name: "robot enabled with partial credentials (only user)", fields: fields{ HCloudClient: HCloudClientConfiguration{Token: "jr5g7ZHpPptyhJzZyHw2Pqu4g9gTqDvEceYpngPf79jN_NOT_VALID_dzhepnahq"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, Robot: RobotConfiguration{ Enabled: true, @@ -513,7 +514,7 @@ func TestHCCMConfiguration_Validate(t *testing.T) { name: "robot enabled with partial credentials (only password)", fields: fields{ HCloudClient: HCloudClientConfiguration{Token: "jr5g7ZHpPptyhJzZyHw2Pqu4g9gTqDvEceYpngPf79jN_NOT_VALID_dzhepnahq"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, Robot: RobotConfiguration{ Enabled: true, @@ -526,9 +527,8 @@ func TestHCCMConfiguration_Validate(t *testing.T) { { name: "robot & routes activated", fields: fields{ - HCloudClient: HCloudClientConfiguration{Token: "jr5g7ZHpPptyhJzZyHw2Pqu4g9gTqDvEceYpngPf79jN_NOT_VALID_dzhepnahq"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, Route: RouteConfiguration{Enabled: true}, Robot: RobotConfiguration{ Enabled: true, diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go index 46663e62d..ce93b426b 100644 --- a/internal/metrics/metrics.go +++ b/internal/metrics/metrics.go @@ -37,10 +37,15 @@ var ( Name: "cloud_controller_manager_operations_total", Help: "The total number of operation was called", }, []string{"op"}) + + CacheRequests = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "cloud_controller_manager_cache_requests_total", + Help: "Total cache requests partitioned by cache name and result.", + }, []string{"subsytem", "mode", "result"}) ) func init() { - GetRegistry().MustRegister(OperationCalled) + GetRegistry().MustRegister(OperationCalled, CacheRequests) } func GetRegistry() prometheus.Registerer { diff --git a/internal/servercache/allservercache.go b/internal/servercache/allservercache.go new file mode 100644 index 000000000..f84c64cdf --- /dev/null +++ b/internal/servercache/allservercache.go @@ -0,0 +1,119 @@ +package servercache + +import ( + "context" + "sync" + "time" + + "golang.org/x/time/rate" + "k8s.io/klog/v2" + + "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/metrics" + "github.com/hetznercloud/hcloud-go/v2/hcloud" +) + +var _ ServerCache = (*AllServerCache)(nil) + +type AllServerCache struct { + subsystem string + mode Mode + ttl time.Duration + expiresAt time.Time + + client *hcloud.Client + + byID map[int64]*hcloud.Server + byName map[string]*hcloud.Server + + limiter *rate.Limiter + mu sync.Mutex +} + +func NewAllServerCache(client *hcloud.Client, subsystem string, ttl time.Duration) *AllServerCache { + return &AllServerCache{ + subsystem: subsystem, + mode: ModeAllServers, + ttl: ttl, + client: client, + expiresAt: time.Now(), + limiter: rate.NewLimiter(rate.Every(ttl), 1), + } +} + +// ByID implements [ServerCache]. +func (c *AllServerCache) ByID(ctx context.Context, id int64) (*hcloud.Server, error) { + return c.getFromCache(ctx, func() *hcloud.Server { return c.byID[id] }) +} + +// ByName implements [ServerCache]. +func (c *AllServerCache) ByName(ctx context.Context, name string) (*hcloud.Server, error) { + return c.getFromCache(ctx, func() *hcloud.Server { return c.byName[name] }) +} + +func (c *AllServerCache) refresh(ctx context.Context) error { + klog.V(4).InfoS("all-server cache: refreshing from api") + servers, err := c.client.Server.All(ctx) + if err != nil { + return err + } + + c.byID = make(map[int64]*hcloud.Server, len(servers)) + c.byName = make(map[string]*hcloud.Server, len(servers)) + + for _, server := range servers { + c.byID[server.ID] = server + c.byName[server.Name] = server + } + + c.expiresAt = time.Now().Add(c.ttl) + klog.V(4).InfoS("all-server cache: refresh complete", "count", len(servers), "expiresAt", c.expiresAt) + + return nil +} + +func (c *AllServerCache) getFromCache(ctx context.Context, lookup func() *hcloud.Server) (*hcloud.Server, error) { + c.mu.Lock() + defer c.mu.Unlock() + + cacheRefreshed := false + if time.Now().After(c.expiresAt) { + klog.V(4).InfoS("all-server cache: cache expired, refreshing", "expiresAt", c.expiresAt) + if err := c.refresh(ctx); err != nil { + return nil, err + } + cacheRefreshed = true + } + + if server := lookup(); server != nil { + metrics.CacheRequests.WithLabelValues(c.subsystem, string(c.mode), "hit").Inc() + klog.V(4).InfoS("all-server cache hit", "id", server.ID, "name", server.Name) + return server, nil + } + + metrics.CacheRequests.WithLabelValues(c.subsystem, string(c.mode), "miss").Inc() + + // Server not found on fresh cache so return early. + if cacheRefreshed { + klog.V(4).InfoS("all-server cache: server not found in fresh snapshot") + return nil, nil + } + + // Cache was not refreshed and rate limiter does not allow refreshing right now. + if !c.limiter.Allow() { + klog.V(4).InfoS("all-server cache: miss-driven refresh denied by rate limiter") + return nil, ErrRateLimited + } + + klog.V(4).InfoS("all-server cache miss: refreshing to catch newly-created server") + if err := c.refresh(ctx); err != nil { + return nil, err + } + + server := lookup() + if server == nil { + klog.V(4).InfoS("all-server cache: server not found after refresh") + } else { + klog.V(4).InfoS("all-server cache: server found after refresh", "id", server.ID, "name", server.Name) + } + return server, nil +} diff --git a/internal/servercache/allservercache_test.go b/internal/servercache/allservercache_test.go new file mode 100644 index 000000000..e1349721f --- /dev/null +++ b/internal/servercache/allservercache_test.go @@ -0,0 +1,195 @@ +package servercache + +import ( + "net/http" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "golang.org/x/time/rate" + + "github.com/hetznercloud/hcloud-go/v2/hcloud/exp/mockutil" + "github.com/hetznercloud/hcloud-go/v2/hcloud/schema" +) + +// allServersPath is the canonical response to `Server.All()` used by the +// AllServerCache tests. [hcloud.ServerClient.All] paginates with per_page=50. +const allServersPath = "/servers?page=1&per_page=50" + +func TestAllServerCache_ByID_HitAfterMiss(t *testing.T) { + // One refresh is expected; the second lookup within TTL must not trigger another. + client := newCacheTestClient(t, []mockutil.Request{ + { + Method: "GET", Path: allServersPath, Status: http.StatusOK, + JSON: schema.ServerListResponse{Servers: []schema.Server{ + {ID: 1, Name: "server-1"}, + {ID: 2, Name: "server-2"}, + }}, + }, + }) + cache := NewAllServerCache(client, "instances_v2", time.Minute) + + srv, err := cache.ByID(t.Context(), 1) + require.NoError(t, err) + require.NotNil(t, srv) + assert.Equal(t, "server-1", srv.Name) + + srv, err = cache.ByID(t.Context(), 2) + require.NoError(t, err) + require.NotNil(t, srv) + assert.Equal(t, "server-2", srv.Name) +} + +func TestAllServerCache_ByName_HitAfterMiss(t *testing.T) { + client := newCacheTestClient(t, []mockutil.Request{ + { + Method: "GET", Path: allServersPath, Status: http.StatusOK, + JSON: schema.ServerListResponse{Servers: []schema.Server{{ID: 1, Name: "server-1"}}}, + }, + }) + cache := NewAllServerCache(client, "instances_v2", time.Minute) + + srv, err := cache.ByName(t.Context(), "server-1") + require.NoError(t, err) + require.NotNil(t, srv) + assert.Equal(t, int64(1), srv.ID) + + srv, err = cache.ByName(t.Context(), "server-1") + require.NoError(t, err) + require.NotNil(t, srv) +} + +func TestAllServerCache_TTLExpiry(t *testing.T) { + client := newCacheTestClient(t, []mockutil.Request{ + { + Method: "GET", Path: allServersPath, Status: http.StatusOK, + JSON: schema.ServerListResponse{Servers: []schema.Server{{ID: 1, Name: "server-1"}}}, + }, + { + Method: "GET", Path: allServersPath, Status: http.StatusOK, + JSON: schema.ServerListResponse{Servers: []schema.Server{{ID: 1, Name: "server-1"}}}, + }, + }) + cache := NewAllServerCache(client, "instances_v2", 0) + + _, err := cache.ByID(t.Context(), 1) + require.NoError(t, err) + _, err = cache.ByID(t.Context(), 1) + require.NoError(t, err) +} + +func TestAllServerCache_MissTriggersRefresh(t *testing.T) { + // Initially only server 1 is returned. A lookup for id=2 triggers a refresh + // that now also returns server 2 (e.g. because it was just created). + client := newCacheTestClient(t, []mockutil.Request{ + { + Method: "GET", Path: allServersPath, Status: http.StatusOK, + JSON: schema.ServerListResponse{Servers: []schema.Server{{ID: 1, Name: "server-1"}}}, + }, + { + Method: "GET", Path: allServersPath, Status: http.StatusOK, + JSON: schema.ServerListResponse{Servers: []schema.Server{ + {ID: 1, Name: "server-1"}, + {ID: 2, Name: "server-2"}, + }}, + }, + }) + cache := NewAllServerCache(client, "instances_v2", time.Minute) + cache.limiter = rate.NewLimiter(rate.Inf, 1) // isolate the test from the limiter + + _, err := cache.ByID(t.Context(), 1) + require.NoError(t, err) + + srv, err := cache.ByID(t.Context(), 2) + require.NoError(t, err) + require.NotNil(t, srv) + assert.Equal(t, "server-2", srv.Name) +} + +func TestAllServerCache_ServerNotFoundAfterRefresh(t *testing.T) { + // A missing server triggers a refresh on every lookup, since we have no + // way to distinguish "just created" from "does not exist". + client := newCacheTestClient(t, []mockutil.Request{ + { + Method: "GET", Path: allServersPath, Status: http.StatusOK, + JSON: schema.ServerListResponse{Servers: []schema.Server{{ID: 1, Name: "server-1"}}}, + }, + { + Method: "GET", Path: allServersPath, Status: http.StatusOK, + JSON: schema.ServerListResponse{Servers: []schema.Server{{ID: 1, Name: "server-1"}}}, + }, + }) + cache := NewAllServerCache(client, "instances_v2", time.Minute) + cache.limiter = rate.NewLimiter(rate.Inf, 1) // isolate the test from the limiter + + srv, err := cache.ByID(t.Context(), 999) + require.NoError(t, err) + assert.Nil(t, srv) + + srv, err = cache.ByID(t.Context(), 999) + require.NoError(t, err) + assert.Nil(t, srv) +} + +func TestAllServerCache_RateLimitedRefreshReturnsErr(t *testing.T) { + client := newCacheTestClient(t, []mockutil.Request{ + { + Method: "GET", Path: allServersPath, Status: http.StatusOK, + JSON: schema.ServerListResponse{Servers: []schema.Server{{ID: 1, Name: "server-1"}}}, + }, + }) + cache := NewAllServerCache(client, "instances_v2", time.Minute) + cache.limiter = rate.NewLimiter(rate.Every(time.Hour), 1) + + srv, err := cache.ByID(t.Context(), 1) + require.NoError(t, err) + require.NotNil(t, srv) + + // Drain the limiter so the next miss-driven refresh is denied. + require.True(t, cache.limiter.Allow()) + + srv, err = cache.ByID(t.Context(), 999) + require.ErrorIs(t, err, ErrRateLimited) + assert.Nil(t, srv) +} + +func TestAllServerCache_ExpiredRefreshFailureDoesNotConsumeLimiter(t *testing.T) { + client := newCacheTestClient(t, []mockutil.Request{ + { + Method: "GET", Path: allServersPath, + Status: http.StatusInternalServerError, + JSON: schema.ErrorResponse{Error: schema.Error{Code: "boom", Message: "upstream exploded"}}, + }, + { + Method: "GET", Path: allServersPath, Status: http.StatusOK, + JSON: schema.ServerListResponse{Servers: []schema.Server{{ID: 1, Name: "server-1"}}}, + }, + }) + cache := NewAllServerCache(client, "instances_v2", time.Minute) + cache.limiter = rate.NewLimiter(rate.Every(time.Hour), 1) + + _, err := cache.ByID(t.Context(), 1) + require.Error(t, err) + assert.NotErrorIs(t, err, ErrRateLimited) + + srv, err := cache.ByID(t.Context(), 1) + require.NoError(t, err) + require.NotNil(t, srv) + assert.Equal(t, "server-1", srv.Name) +} + +func TestAllServerCache_APIError(t *testing.T) { + client := newCacheTestClient(t, []mockutil.Request{ + { + Method: "GET", Path: allServersPath, + Status: http.StatusInternalServerError, + JSON: schema.ErrorResponse{Error: schema.Error{Code: "boom", Message: "upstream exploded"}}, + }, + }) + cache := NewAllServerCache(client, "instances_v2", time.Minute) + + srv, err := cache.ByID(t.Context(), 1) + require.Error(t, err) + assert.Nil(t, srv) +} diff --git a/internal/servercache/evalservercache.go b/internal/servercache/evalservercache.go new file mode 100644 index 000000000..c1df8664c --- /dev/null +++ b/internal/servercache/evalservercache.go @@ -0,0 +1,63 @@ +package servercache + +import ( + "context" + "time" + + "k8s.io/klog/v2" + + "github.com/hetznercloud/hcloud-go/v2/hcloud" +) + +// EvalCache wraps a [PerServerCache], an [AllServerCache], and a [Passthrough] and +// runs each cache sequentially for every lookup. + +var _ ServerCache = (*EvalCache)(nil) + +type EvalCache struct { + caches []ServerCache +} + +func NewEvalCache(client *hcloud.Client, subsystem string, ttl time.Duration) *EvalCache { + caches := []ServerCache{ + NewAllServerCache(client, subsystem, ttl), + NewPerServerCache(client, subsystem, ttl), + NewPassthrough(client), + } + + return &EvalCache{ + caches: caches, + } +} + +func (c *EvalCache) ByID(ctx context.Context, id int64) (*hcloud.Server, error) { + return c.run(func(s ServerCache) (*hcloud.Server, error) { return s.ByID(ctx, id) }) +} + +func (c *EvalCache) ByName(ctx context.Context, name string) (*hcloud.Server, error) { + return c.run(func(s ServerCache) (*hcloud.Server, error) { return s.ByName(ctx, name) }) +} + +// run invokes every underlying cache so each one records its own metrics for +// the same request, then returns the first cache's result as the authoritative +// answer. Errors from secondary caches are logged but not propagated, so a +// transient failure in a comparison cache cannot affect the controller. +func (c *EvalCache) run(lookup func(ServerCache) (*hcloud.Server, error)) (*hcloud.Server, error) { + var ( + firstServer *hcloud.Server + firstErr error + ) + + for i, cache := range c.caches { + server, err := lookup(cache) + if i == 0 { + firstServer, firstErr = server, err + continue + } + if err != nil { + klog.V(4).InfoS("eval cache: secondary cache returned error", "index", i, "err", err) + } + } + + return firstServer, firstErr +} diff --git a/internal/servercache/perservercache.go b/internal/servercache/perservercache.go new file mode 100644 index 000000000..d54ad3e56 --- /dev/null +++ b/internal/servercache/perservercache.go @@ -0,0 +1,123 @@ +package servercache + +import ( + "context" + "sync" + "time" + + "k8s.io/klog/v2" + + "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/metrics" + "github.com/hetznercloud/hcloud-go/v2/hcloud" +) + +var _ ServerCache = (*PerServerCache)(nil) + +// PerServerCache caches each Server with a separate expiration time. +// The caches removes expired entries. +type PerServerCache struct { + subsystem string + mode Mode + ttl time.Duration + + client *hcloud.Client + + byID map[int64]*perServerCacheEntry + byName map[string]*perServerCacheEntry + + mu sync.Mutex +} + +type perServerCacheEntry struct { + server *hcloud.Server + expiresAt time.Time +} + +func NewPerServerCache(client *hcloud.Client, subsystem string, ttl time.Duration) *PerServerCache { + return &PerServerCache{ + subsystem: subsystem, + mode: ModePerServer, + ttl: ttl, + client: client, + byID: make(map[int64]*perServerCacheEntry), + byName: make(map[string]*perServerCacheEntry), + } +} + +func (c *PerServerCache) ByID(ctx context.Context, id int64) (*hcloud.Server, error) { + return c.getOrFetch( + func() *perServerCacheEntry { return c.byID[id] }, + func() (*hcloud.Server, *hcloud.Response, error) { return c.client.Server.GetByID(ctx, id) }, + ) +} + +func (c *PerServerCache) ByName(ctx context.Context, name string) (*hcloud.Server, error) { + return c.getOrFetch( + func() *perServerCacheEntry { return c.byName[name] }, + func() (*hcloud.Server, *hcloud.Response, error) { return c.client.Server.GetByName(ctx, name) }, + ) +} + +func (c *PerServerCache) getOrFetch( + lookup func() *perServerCacheEntry, + fetch func() (*hcloud.Server, *hcloud.Response, error), +) (*hcloud.Server, error) { + c.mu.Lock() + defer c.mu.Unlock() + + if entry := lookup(); entry != nil && time.Now().Before(entry.expiresAt) { + metrics.CacheRequests.WithLabelValues(c.subsystem, string(c.mode), "hit").Inc() + klog.V(4).InfoS("per-server cache hit", "id", entry.server.ID, "name", entry.server.Name) + return entry.server, nil + } + + klog.V(4).InfoS("per-server cache miss, fetching from api") + server, _, err := fetch() + if err != nil { + return nil, err + } + metrics.CacheRequests.WithLabelValues(c.subsystem, string(c.mode), "miss").Inc() + if server != nil { + klog.V(4).InfoS("per-server cache: fetched server from api", "id", server.ID, "name", server.Name) + c.addToCache(server) + } else { + klog.V(4).InfoS("per-server cache: server not found via api") + } + + return server, nil +} + +// addToCache inserts (or refreshes) a server in the cache, evicting the +// expired entries. +// The caller must hold the mutex. +func (c *PerServerCache) addToCache(server *hcloud.Server) { + if existing, ok := c.byID[server.ID]; ok { + // Server name changed and needs updating. + if existing.server.Name != server.Name { + delete(c.byName, existing.server.Name) + } + existing.server = server + existing.expiresAt = time.Now().Add(c.ttl) + c.byName[server.Name] = existing + return + } + + // Create new server entry. + entry := &perServerCacheEntry{ + server: server, + expiresAt: time.Now().Add(c.ttl), + } + c.byID[server.ID] = entry + c.byName[server.Name] = entry + + // Evict expired entries to avoid deleted Servers being around + // until hccm restarts. + for key := range c.byID { + oldEntry := c.byID[key] + if time.Now().After(oldEntry.expiresAt) { + delete(c.byID, key) + delete(c.byName, oldEntry.server.Name) + klog.V(4).InfoS("per-server cache: evicted LRU entry", "id", key) + } + } +} diff --git a/internal/servercache/perservercache_test.go b/internal/servercache/perservercache_test.go new file mode 100644 index 000000000..dfb42a920 --- /dev/null +++ b/internal/servercache/perservercache_test.go @@ -0,0 +1,165 @@ +package servercache + +import ( + "net/http" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/hetznercloud/hcloud-go/v2/hcloud" + "github.com/hetznercloud/hcloud-go/v2/hcloud/exp/mockutil" + "github.com/hetznercloud/hcloud-go/v2/hcloud/schema" +) + +func TestPerServerCache_ByID_HitAfterMiss(t *testing.T) { + // Exactly one GET /servers/1 is expected; the second ByID must be served from cache. + client := newCacheTestClient(t, []mockutil.Request{ + { + Method: "GET", Path: "/servers/1", + Status: http.StatusOK, + JSON: schema.ServerGetResponse{Server: schema.Server{ID: 1, Name: "server-1"}}, + }, + }) + cache := NewPerServerCache(client, "instances_v2", time.Minute) + + srv, err := cache.ByID(t.Context(), 1) + require.NoError(t, err) + require.NotNil(t, srv) + assert.Equal(t, int64(1), srv.ID) + + srv, err = cache.ByID(t.Context(), 1) + require.NoError(t, err) + require.NotNil(t, srv) +} + +func TestPerServerCache_ByName_HitAfterMiss(t *testing.T) { + client := newCacheTestClient(t, []mockutil.Request{ + { + Method: "GET", Path: "/servers?name=server-1", + Status: http.StatusOK, + JSON: schema.ServerListResponse{Servers: []schema.Server{{ID: 1, Name: "server-1"}}}, + }, + }) + cache := NewPerServerCache(client, "instances_v2", time.Minute) + + srv, err := cache.ByName(t.Context(), "server-1") + require.NoError(t, err) + require.NotNil(t, srv) + assert.Equal(t, "server-1", srv.Name) + + srv, err = cache.ByName(t.Context(), "server-1") + require.NoError(t, err) + require.NotNil(t, srv) +} + +func TestPerServerCache_ByID_CrossPopulatesByName(t *testing.T) { + // Only the initial GetByID call is expected; the subsequent ByName must hit + // the cache populated by ByID. + client := newCacheTestClient(t, []mockutil.Request{ + { + Method: "GET", Path: "/servers/1", + Status: http.StatusOK, + JSON: schema.ServerGetResponse{Server: schema.Server{ID: 1, Name: "server-1"}}, + }, + }) + cache := NewPerServerCache(client, "instances_v2", time.Minute) + + srv, err := cache.ByID(t.Context(), 1) + require.NoError(t, err) + require.NotNil(t, srv) + + srv, err = cache.ByName(t.Context(), "server-1") + require.NoError(t, err) + require.NotNil(t, srv) + assert.Equal(t, int64(1), srv.ID) +} + +func TestPerServerCache_TTLExpiry(t *testing.T) { + // Zero TTL → every lookup misses and triggers a fresh GET. + client := newCacheTestClient(t, []mockutil.Request{ + { + Method: "GET", Path: "/servers/1", Status: http.StatusOK, + JSON: schema.ServerGetResponse{Server: schema.Server{ID: 1, Name: "server-1", Status: string(hcloud.ServerStatusStarting)}}, + }, + { + Method: "GET", Path: "/servers/1", Status: http.StatusOK, + JSON: schema.ServerGetResponse{Server: schema.Server{ID: 1, Name: "server-1", Status: string(hcloud.ServerStatusRunning)}}, + }, + }) + cache := NewPerServerCache(client, "instances_v2", 0) + + srv, err := cache.ByID(t.Context(), 1) + require.NoError(t, err) + require.NotNil(t, srv) + assert.Equal(t, hcloud.ServerStatusStarting, srv.Status) + + srv, err = cache.ByID(t.Context(), 1) + require.NoError(t, err) + require.NotNil(t, srv) + assert.Equal(t, hcloud.ServerStatusRunning, srv.Status) +} + +func TestPerServerCache_ServerNotFound(t *testing.T) { + // A nil result is not cached → both lookups must hit the api. + client := newCacheTestClient(t, []mockutil.Request{ + {Method: "GET", Path: "/servers/42", Status: http.StatusNotFound, JSON: notFoundResponse}, + {Method: "GET", Path: "/servers/42", Status: http.StatusNotFound, JSON: notFoundResponse}, + }) + cache := NewPerServerCache(client, "instances_v2", time.Minute) + + srv, err := cache.ByID(t.Context(), 42) + require.NoError(t, err) + assert.Nil(t, srv) + + srv, err = cache.ByID(t.Context(), 42) + require.NoError(t, err) + assert.Nil(t, srv) +} + +func TestPerServerCache_APIError(t *testing.T) { + client := newCacheTestClient(t, []mockutil.Request{ + { + Method: "GET", Path: "/servers/1", + Status: http.StatusInternalServerError, + JSON: schema.ErrorResponse{Error: schema.Error{Code: "boom", Message: "upstream exploded"}}, + }, + }) + cache := NewPerServerCache(client, "instances_v2", time.Minute) + + srv, err := cache.ByID(t.Context(), 1) + require.Error(t, err) + assert.Nil(t, srv) +} + +func TestPerServerCache_LRUEviction(t *testing.T) { + // Cache size 2; adding a third entry evicts the least-recently-used. + // Touching server 1 before inserting 3 keeps 1 in cache and evicts 2. + client := newCacheTestClient(t, []mockutil.Request{ + {Method: "GET", Path: "/servers/1", Status: http.StatusOK, JSON: schema.ServerGetResponse{Server: schema.Server{ID: 1, Name: "server-1"}}}, + {Method: "GET", Path: "/servers/2", Status: http.StatusOK, JSON: schema.ServerGetResponse{Server: schema.Server{ID: 2, Name: "server-2"}}}, + {Method: "GET", Path: "/servers/3", Status: http.StatusOK, JSON: schema.ServerGetResponse{Server: schema.Server{ID: 3, Name: "server-3"}}}, + }) + cache := NewPerServerCache(client, "instances_v2", time.Hour) + + ctx := t.Context() + _, err := cache.ByID(ctx, 1) + require.NoError(t, err) + _, err = cache.ByID(ctx, 2) + require.NoError(t, err) + + // Expire 2 + cache.byID[2].expiresAt = time.Unix(0, 0) + + // Adding 3 evicts the expired entry (2). + _, err = cache.ByID(ctx, 3) + require.NoError(t, err) + + assert.Len(t, cache.byID, 2) + assert.Contains(t, cache.byID, int64(1)) + assert.Contains(t, cache.byID, int64(3)) + assert.NotContains(t, cache.byID, int64(2)) + assert.Len(t, cache.byName, 2) + assert.NotContains(t, cache.byName, "server-2") +} diff --git a/internal/servercache/servercache.go b/internal/servercache/servercache.go new file mode 100644 index 000000000..d671173ba --- /dev/null +++ b/internal/servercache/servercache.go @@ -0,0 +1,72 @@ +package servercache + +import ( + "context" + "errors" + "fmt" + "time" + + "k8s.io/klog/v2" + + "github.com/hetznercloud/hcloud-go/v2/hcloud" +) + +// ErrRateLimited is returned by a [ServerCache] when a lookup would have +// required a refresh but the cache's internal rate limiter denied it. +var ErrRateLimited = errors.New("refresh_rate_limited") + +type ServerCache interface { + ByID(context.Context, int64) (*hcloud.Server, error) + ByName(context.Context, string) (*hcloud.Server, error) +} + +type Mode string + +const ( + ModeAllServers Mode = "all-server" + ModePerServer Mode = "per-server" + ModeEval Mode = "eval" + ModeOff Mode = "off" +) + +func New(client *hcloud.Client, subsystem string, mode Mode, ttl time.Duration) (ServerCache, error) { + if mode != ModeOff { + klog.Warningf("instance caching is experimental, breaking changes may occur within minor releases; set HCLOUD_INSTANCES_CACHE_MODE=off to opt out (mode=%q)", mode) + } + switch mode { + case ModeAllServers: + return NewAllServerCache(client, subsystem, ttl), nil + case ModePerServer: + return NewPerServerCache(client, subsystem, ttl), nil + case ModeEval: + klog.Warningf("instance cache mode %q is for internal evaluation only and is not intended for production use", mode) + return NewEvalCache(client, subsystem, ttl), nil + case ModeOff: + return NewPassthrough(client), nil + } + return nil, fmt.Errorf("invalid cache mode %q", mode) +} + +// ----- Passthrough ----- + +// Passthrough is a [ServerCache] that always queries the API. + +var _ ServerCache = (*Passthrough)(nil) + +type Passthrough struct { + client *hcloud.Client +} + +func NewPassthrough(client *hcloud.Client) *Passthrough { + return &Passthrough{client: client} +} + +func (c *Passthrough) ByID(ctx context.Context, id int64) (*hcloud.Server, error) { + server, _, err := c.client.Server.GetByID(ctx, id) + return server, err +} + +func (c *Passthrough) ByName(ctx context.Context, name string) (*hcloud.Server, error) { + server, _, err := c.client.Server.GetByName(ctx, name) + return server, err +} diff --git a/internal/servercache/servercache_test.go b/internal/servercache/servercache_test.go new file mode 100644 index 000000000..d2dbeebbd --- /dev/null +++ b/internal/servercache/servercache_test.go @@ -0,0 +1,23 @@ +package servercache + +import ( + "testing" + + "github.com/hetznercloud/hcloud-go/v2/hcloud" + "github.com/hetznercloud/hcloud-go/v2/hcloud/exp/mockutil" + "github.com/hetznercloud/hcloud-go/v2/hcloud/schema" +) + +// newCacheTestClient spins up a [mockutil.Server] and returns a client pointed +// at it. +func newCacheTestClient(t *testing.T, requests []mockutil.Request) *hcloud.Client { + t.Helper() + server := mockutil.NewServer(t, requests) + return hcloud.NewClient( + hcloud.WithEndpoint(server.URL), + hcloud.WithPollOpts(hcloud.PollOpts{BackoffFunc: hcloud.ConstantBackoff(0)}), + hcloud.WithRetryOpts(hcloud.RetryOpts{BackoffFunc: hcloud.ConstantBackoff(0)}), + ) +} + +var notFoundResponse = schema.ErrorResponse{Error: schema.Error{Code: string(hcloud.ErrorCodeNotFound), Message: "not found"}} From d308874418401780e82c2d5d96c577a41c213c2e Mon Sep 17 00:00:00 2001 From: lukasmetzner Date: Tue, 26 May 2026 12:26:46 +0200 Subject: [PATCH 02/10] docs: server cache interface --- internal/servercache/servercache.go | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/internal/servercache/servercache.go b/internal/servercache/servercache.go index d671173ba..adc297b21 100644 --- a/internal/servercache/servercache.go +++ b/internal/servercache/servercache.go @@ -11,15 +11,23 @@ import ( "github.com/hetznercloud/hcloud-go/v2/hcloud" ) -// ErrRateLimited is returned by a [ServerCache] when a lookup would have -// required a refresh but the cache's internal rate limiter denied it. -var ErrRateLimited = errors.New("refresh_rate_limited") - +// ServerCache defines a caching layer for retrieving Hetzner Cloud servers. type ServerCache interface { + // ByID retrieves a server by its unique numeric ID. + // Returns the server if found, nil and no error if not found, + // or nil and an error if the lookup fails. ByID(context.Context, int64) (*hcloud.Server, error) + + // ByName retrieves a server by its name. + // Returns the server if found, nil and no error if not found, + // or nil and an error if the lookup fails. ByName(context.Context, string) (*hcloud.Server, error) } +// ErrRateLimited is returned by a [ServerCache] when a lookup would have +// required a refresh but the cache's internal rate limiter denied it. +var ErrRateLimited = errors.New("refresh_rate_limited") + type Mode string const ( From 8c694bc2dbc94136f53e9be888f5a4ca376c5ca2 Mon Sep 17 00:00:00 2001 From: lukasmetzner Date: Tue, 26 May 2026 12:37:25 +0200 Subject: [PATCH 03/10] test: improve coverage --- internal/servercache/servercache_test.go | 138 +++++++++++++++++++++++ 1 file changed, 138 insertions(+) diff --git a/internal/servercache/servercache_test.go b/internal/servercache/servercache_test.go index d2dbeebbd..d17117546 100644 --- a/internal/servercache/servercache_test.go +++ b/internal/servercache/servercache_test.go @@ -1,7 +1,12 @@ package servercache import ( + "net/http" "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" "github.com/hetznercloud/hcloud-go/v2/hcloud" "github.com/hetznercloud/hcloud-go/v2/hcloud/exp/mockutil" @@ -21,3 +26,136 @@ func newCacheTestClient(t *testing.T, requests []mockutil.Request) *hcloud.Clien } var notFoundResponse = schema.ErrorResponse{Error: schema.Error{Code: string(hcloud.ErrorCodeNotFound), Message: "not found"}} + +func TestNew(t *testing.T) { + client := hcloud.NewClient() + + tests := []struct { + name string + mode Mode + wantType any + wantErr bool + }{ + {name: "all-server", mode: ModeAllServers, wantType: (*AllServerCache)(nil)}, + {name: "per-server", mode: ModePerServer, wantType: (*PerServerCache)(nil)}, + {name: "eval", mode: ModeEval, wantType: (*EvalCache)(nil)}, + {name: "off", mode: ModeOff, wantType: (*Passthrough)(nil)}, + {name: "invalid", mode: Mode("bogus"), wantErr: true}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + cache, err := New(client, "instances_v2", tt.mode, time.Minute) + if tt.wantErr { + require.Error(t, err) + assert.Nil(t, cache) + return + } + require.NoError(t, err) + require.NotNil(t, cache) + assert.IsType(t, tt.wantType, cache) + }) + } +} + +func TestPassthrough_ByID(t *testing.T) { + client := newCacheTestClient(t, []mockutil.Request{ + { + Method: "GET", Path: "/servers/1", Status: http.StatusOK, + JSON: schema.ServerGetResponse{Server: schema.Server{ID: 1, Name: "server-1"}}, + }, + // Passthrough never caches, so a second lookup must hit the API again. + { + Method: "GET", Path: "/servers/1", Status: http.StatusOK, + JSON: schema.ServerGetResponse{Server: schema.Server{ID: 1, Name: "server-1"}}, + }, + }) + cache := NewPassthrough(client) + + srv, err := cache.ByID(t.Context(), 1) + require.NoError(t, err) + require.NotNil(t, srv) + assert.Equal(t, int64(1), srv.ID) + + srv, err = cache.ByID(t.Context(), 1) + require.NoError(t, err) + require.NotNil(t, srv) +} + +func TestPassthrough_ByID_NotFound(t *testing.T) { + client := newCacheTestClient(t, []mockutil.Request{ + {Method: "GET", Path: "/servers/42", Status: http.StatusNotFound, JSON: notFoundResponse}, + }) + cache := NewPassthrough(client) + + srv, err := cache.ByID(t.Context(), 42) + require.NoError(t, err) + assert.Nil(t, srv) +} + +func TestPassthrough_ByID_APIError(t *testing.T) { + client := newCacheTestClient(t, []mockutil.Request{ + { + Method: "GET", Path: "/servers/1", + Status: http.StatusInternalServerError, + JSON: schema.ErrorResponse{Error: schema.Error{Code: "boom", Message: "upstream exploded"}}, + }, + }) + cache := NewPassthrough(client) + + srv, err := cache.ByID(t.Context(), 1) + require.Error(t, err) + assert.Nil(t, srv) +} + +func TestPassthrough_ByName(t *testing.T) { + client := newCacheTestClient(t, []mockutil.Request{ + { + Method: "GET", Path: "/servers?name=server-1", Status: http.StatusOK, + JSON: schema.ServerListResponse{Servers: []schema.Server{{ID: 1, Name: "server-1"}}}, + }, + { + Method: "GET", Path: "/servers?name=server-1", Status: http.StatusOK, + JSON: schema.ServerListResponse{Servers: []schema.Server{{ID: 1, Name: "server-1"}}}, + }, + }) + cache := NewPassthrough(client) + + srv, err := cache.ByName(t.Context(), "server-1") + require.NoError(t, err) + require.NotNil(t, srv) + assert.Equal(t, "server-1", srv.Name) + + srv, err = cache.ByName(t.Context(), "server-1") + require.NoError(t, err) + require.NotNil(t, srv) +} + +func TestPassthrough_ByName_NotFound(t *testing.T) { + client := newCacheTestClient(t, []mockutil.Request{ + { + Method: "GET", Path: "/servers?name=missing", Status: http.StatusOK, + JSON: schema.ServerListResponse{Servers: []schema.Server{}}, + }, + }) + cache := NewPassthrough(client) + + srv, err := cache.ByName(t.Context(), "missing") + require.NoError(t, err) + assert.Nil(t, srv) +} + +func TestPassthrough_ByName_APIError(t *testing.T) { + client := newCacheTestClient(t, []mockutil.Request{ + { + Method: "GET", Path: "/servers?name=server-1", + Status: http.StatusInternalServerError, + JSON: schema.ErrorResponse{Error: schema.Error{Code: "boom", Message: "upstream exploded"}}, + }, + }) + cache := NewPassthrough(client) + + srv, err := cache.ByName(t.Context(), "server-1") + require.Error(t, err) + assert.Nil(t, srv) +} From 5245ef7fff0be4fddf66943f2ca12c1e0dffa180 Mon Sep 17 00:00:00 2001 From: lukasmetzner Date: Tue, 26 May 2026 12:39:05 +0200 Subject: [PATCH 04/10] fix: prometheus metric label typo --- internal/metrics/metrics.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go index ce93b426b..d17d79a76 100644 --- a/internal/metrics/metrics.go +++ b/internal/metrics/metrics.go @@ -41,7 +41,7 @@ var ( CacheRequests = prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "cloud_controller_manager_cache_requests_total", Help: "Total cache requests partitioned by cache name and result.", - }, []string{"subsytem", "mode", "result"}) + }, []string{"subsystem", "mode", "result"}) ) func init() { From 8f3fc0aaba6e81c7934882ab80b00d4847811d0e Mon Sep 17 00:00:00 2001 From: lukasmetzner Date: Tue, 26 May 2026 12:46:58 +0200 Subject: [PATCH 05/10] test: rename test after implementation changed --- internal/servercache/perservercache_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/servercache/perservercache_test.go b/internal/servercache/perservercache_test.go index dfb42a920..c2170dc8a 100644 --- a/internal/servercache/perservercache_test.go +++ b/internal/servercache/perservercache_test.go @@ -133,7 +133,7 @@ func TestPerServerCache_APIError(t *testing.T) { assert.Nil(t, srv) } -func TestPerServerCache_LRUEviction(t *testing.T) { +func TestPerServerCache_ExpiredEviction(t *testing.T) { // Cache size 2; adding a third entry evicts the least-recently-used. // Touching server 1 before inserting 3 keeps 1 in cache and evicts 2. client := newCacheTestClient(t, []mockutil.Request{ From 817e35fff922b3e38df0797ad0a0100037fb3927 Mon Sep 17 00:00:00 2001 From: lukasmetzner Date: Wed, 3 Jun 2026 12:27:09 +0200 Subject: [PATCH 06/10] feat: wip impl --- dev/Dockerfile | 2 +- hcloud/cloud.go | 7 +- hcloud/cloud_test.go | 4 +- hcloud/instances.go | 10 +- internal/servercache/allservercache.go | 119 ---- internal/servercache/allservercache_test.go | 195 ------ internal/servercache/evalservercache.go | 63 -- internal/servercache/perservercache.go | 123 ---- internal/servercache/perservercache_test.go | 165 ----- internal/servercache/servercache.go | 279 ++++++-- internal/servercache/servercache_test.go | 721 +++++++++++++++++--- 11 files changed, 854 insertions(+), 834 deletions(-) delete mode 100644 internal/servercache/allservercache.go delete mode 100644 internal/servercache/allservercache_test.go delete mode 100644 internal/servercache/evalservercache.go delete mode 100644 internal/servercache/perservercache.go delete mode 100644 internal/servercache/perservercache_test.go diff --git a/dev/Dockerfile b/dev/Dockerfile index 2a2c05500..15ad84a2f 100644 --- a/dev/Dockerfile +++ b/dev/Dockerfile @@ -1,4 +1,4 @@ -FROM golang:1.26 as builder +FROM golang:1.26 AS builder WORKDIR /build diff --git a/hcloud/cloud.go b/hcloud/cloud.go index 63d2fbe73..99abd5d00 100644 --- a/hcloud/cloud.go +++ b/hcloud/cloud.go @@ -53,7 +53,7 @@ var providerVersion = "unknown" type cloud struct { client *hcloud.Client robotClient hrobot.RobotClient - instanceCache servercache.ServerCache + instanceCache *servercache.Cache[hcloud.Server] cfg config.HCCMConfiguration recorder record.EventRecorder networkID int64 @@ -146,10 +146,7 @@ func NewCloud(cidr string, nodeLister corelisters.NodeLister) (cloudprovider.Int klog.Infof("Hetzner Cloud k8s cloud controller %s started\n", providerVersion) - instanceCache, err := servercache.New(client, "instances_v2", cfg.Instance.Cache.Mode, cfg.Instance.Cache.TTL) - if err != nil { - return nil, fmt.Errorf("%s: %w", op, err) - } + instanceCache := servercache.NewServerCache(client, "instances_v2", cfg.Instance.Cache.Mode, cfg.Instance.Cache.TTL) return &cloud{ client: client, diff --git a/hcloud/cloud_test.go b/hcloud/cloud_test.go index fa653642d..226a92728 100644 --- a/hcloud/cloud_test.go +++ b/hcloud/cloud_test.go @@ -42,7 +42,7 @@ type testEnv struct { Mux *http.ServeMux Client *hcloud.Client RobotClient hrobot.RobotClient - ServerCache servercache.ServerCache + ServerCache *servercache.Cache[hcloud.Server] Recorder record.EventRecorder Cfg config.HCCMConfiguration } @@ -69,7 +69,7 @@ func newTestEnv() testEnv { ) robotClient := hrobot.NewBasicAuthClient("", "") robotClient.SetBaseURL(server.URL + "/robot") - serverCache := servercache.NewPerServerCache(client, "instances_v2", 10*time.Second) + serverCache := servercache.NewServerCache(client, "instances_v2", servercache.ModePerServer, 10*time.Second) recorder := record.NewBroadcaster().NewRecorder(scheme.Scheme, corev1.EventSource{Component: "hcloud-cloud-controller-manager"}) cfg := config.HCCMConfiguration{} diff --git a/hcloud/instances.go b/hcloud/instances.go index e72e8f7f8..9b6bf7840 100644 --- a/hcloud/instances.go +++ b/hcloud/instances.go @@ -45,7 +45,7 @@ const ( type instances struct { client *hcloud.Client robotClient hrobot.RobotClient - serverCache servercache.ServerCache + serverCache *servercache.Cache[hcloud.Server] recorder record.EventRecorder networkID int64 cfg config.HCCMConfiguration @@ -59,7 +59,7 @@ var ( func newInstances( client *hcloud.Client, robotClient hrobot.RobotClient, - serverCache servercache.ServerCache, + serverCache *servercache.Cache[hcloud.Server], recorder record.EventRecorder, networkID int64, cfg config.HCCMConfiguration, @@ -179,7 +179,8 @@ func (i *instances) InstanceShutdown(ctx context.Context, node *corev1.Node) (bo if server == nil { return false, fmt.Errorf( "%s: failed to get instance metadata: no matching server found for node '%s': %w", - op, node.Name, errServerNotFound) + op, node.Name, errServerNotFound, + ) } isShutdown, err := server.IsShutdown() @@ -203,7 +204,8 @@ func (i *instances) InstanceMetadata(ctx context.Context, node *corev1.Node) (*c if server == nil { return nil, fmt.Errorf( "%s: failed to get instance metadata: no matching server found for node '%s': %w", - op, node.Name, errServerNotFound) + op, node.Name, errServerNotFound, + ) } metadata, err := server.Metadata(i.networkID, node, i.cfg) diff --git a/internal/servercache/allservercache.go b/internal/servercache/allservercache.go deleted file mode 100644 index f84c64cdf..000000000 --- a/internal/servercache/allservercache.go +++ /dev/null @@ -1,119 +0,0 @@ -package servercache - -import ( - "context" - "sync" - "time" - - "golang.org/x/time/rate" - "k8s.io/klog/v2" - - "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/metrics" - "github.com/hetznercloud/hcloud-go/v2/hcloud" -) - -var _ ServerCache = (*AllServerCache)(nil) - -type AllServerCache struct { - subsystem string - mode Mode - ttl time.Duration - expiresAt time.Time - - client *hcloud.Client - - byID map[int64]*hcloud.Server - byName map[string]*hcloud.Server - - limiter *rate.Limiter - mu sync.Mutex -} - -func NewAllServerCache(client *hcloud.Client, subsystem string, ttl time.Duration) *AllServerCache { - return &AllServerCache{ - subsystem: subsystem, - mode: ModeAllServers, - ttl: ttl, - client: client, - expiresAt: time.Now(), - limiter: rate.NewLimiter(rate.Every(ttl), 1), - } -} - -// ByID implements [ServerCache]. -func (c *AllServerCache) ByID(ctx context.Context, id int64) (*hcloud.Server, error) { - return c.getFromCache(ctx, func() *hcloud.Server { return c.byID[id] }) -} - -// ByName implements [ServerCache]. -func (c *AllServerCache) ByName(ctx context.Context, name string) (*hcloud.Server, error) { - return c.getFromCache(ctx, func() *hcloud.Server { return c.byName[name] }) -} - -func (c *AllServerCache) refresh(ctx context.Context) error { - klog.V(4).InfoS("all-server cache: refreshing from api") - servers, err := c.client.Server.All(ctx) - if err != nil { - return err - } - - c.byID = make(map[int64]*hcloud.Server, len(servers)) - c.byName = make(map[string]*hcloud.Server, len(servers)) - - for _, server := range servers { - c.byID[server.ID] = server - c.byName[server.Name] = server - } - - c.expiresAt = time.Now().Add(c.ttl) - klog.V(4).InfoS("all-server cache: refresh complete", "count", len(servers), "expiresAt", c.expiresAt) - - return nil -} - -func (c *AllServerCache) getFromCache(ctx context.Context, lookup func() *hcloud.Server) (*hcloud.Server, error) { - c.mu.Lock() - defer c.mu.Unlock() - - cacheRefreshed := false - if time.Now().After(c.expiresAt) { - klog.V(4).InfoS("all-server cache: cache expired, refreshing", "expiresAt", c.expiresAt) - if err := c.refresh(ctx); err != nil { - return nil, err - } - cacheRefreshed = true - } - - if server := lookup(); server != nil { - metrics.CacheRequests.WithLabelValues(c.subsystem, string(c.mode), "hit").Inc() - klog.V(4).InfoS("all-server cache hit", "id", server.ID, "name", server.Name) - return server, nil - } - - metrics.CacheRequests.WithLabelValues(c.subsystem, string(c.mode), "miss").Inc() - - // Server not found on fresh cache so return early. - if cacheRefreshed { - klog.V(4).InfoS("all-server cache: server not found in fresh snapshot") - return nil, nil - } - - // Cache was not refreshed and rate limiter does not allow refreshing right now. - if !c.limiter.Allow() { - klog.V(4).InfoS("all-server cache: miss-driven refresh denied by rate limiter") - return nil, ErrRateLimited - } - - klog.V(4).InfoS("all-server cache miss: refreshing to catch newly-created server") - if err := c.refresh(ctx); err != nil { - return nil, err - } - - server := lookup() - if server == nil { - klog.V(4).InfoS("all-server cache: server not found after refresh") - } else { - klog.V(4).InfoS("all-server cache: server found after refresh", "id", server.ID, "name", server.Name) - } - return server, nil -} diff --git a/internal/servercache/allservercache_test.go b/internal/servercache/allservercache_test.go deleted file mode 100644 index e1349721f..000000000 --- a/internal/servercache/allservercache_test.go +++ /dev/null @@ -1,195 +0,0 @@ -package servercache - -import ( - "net/http" - "testing" - "time" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - "golang.org/x/time/rate" - - "github.com/hetznercloud/hcloud-go/v2/hcloud/exp/mockutil" - "github.com/hetznercloud/hcloud-go/v2/hcloud/schema" -) - -// allServersPath is the canonical response to `Server.All()` used by the -// AllServerCache tests. [hcloud.ServerClient.All] paginates with per_page=50. -const allServersPath = "/servers?page=1&per_page=50" - -func TestAllServerCache_ByID_HitAfterMiss(t *testing.T) { - // One refresh is expected; the second lookup within TTL must not trigger another. - client := newCacheTestClient(t, []mockutil.Request{ - { - Method: "GET", Path: allServersPath, Status: http.StatusOK, - JSON: schema.ServerListResponse{Servers: []schema.Server{ - {ID: 1, Name: "server-1"}, - {ID: 2, Name: "server-2"}, - }}, - }, - }) - cache := NewAllServerCache(client, "instances_v2", time.Minute) - - srv, err := cache.ByID(t.Context(), 1) - require.NoError(t, err) - require.NotNil(t, srv) - assert.Equal(t, "server-1", srv.Name) - - srv, err = cache.ByID(t.Context(), 2) - require.NoError(t, err) - require.NotNil(t, srv) - assert.Equal(t, "server-2", srv.Name) -} - -func TestAllServerCache_ByName_HitAfterMiss(t *testing.T) { - client := newCacheTestClient(t, []mockutil.Request{ - { - Method: "GET", Path: allServersPath, Status: http.StatusOK, - JSON: schema.ServerListResponse{Servers: []schema.Server{{ID: 1, Name: "server-1"}}}, - }, - }) - cache := NewAllServerCache(client, "instances_v2", time.Minute) - - srv, err := cache.ByName(t.Context(), "server-1") - require.NoError(t, err) - require.NotNil(t, srv) - assert.Equal(t, int64(1), srv.ID) - - srv, err = cache.ByName(t.Context(), "server-1") - require.NoError(t, err) - require.NotNil(t, srv) -} - -func TestAllServerCache_TTLExpiry(t *testing.T) { - client := newCacheTestClient(t, []mockutil.Request{ - { - Method: "GET", Path: allServersPath, Status: http.StatusOK, - JSON: schema.ServerListResponse{Servers: []schema.Server{{ID: 1, Name: "server-1"}}}, - }, - { - Method: "GET", Path: allServersPath, Status: http.StatusOK, - JSON: schema.ServerListResponse{Servers: []schema.Server{{ID: 1, Name: "server-1"}}}, - }, - }) - cache := NewAllServerCache(client, "instances_v2", 0) - - _, err := cache.ByID(t.Context(), 1) - require.NoError(t, err) - _, err = cache.ByID(t.Context(), 1) - require.NoError(t, err) -} - -func TestAllServerCache_MissTriggersRefresh(t *testing.T) { - // Initially only server 1 is returned. A lookup for id=2 triggers a refresh - // that now also returns server 2 (e.g. because it was just created). - client := newCacheTestClient(t, []mockutil.Request{ - { - Method: "GET", Path: allServersPath, Status: http.StatusOK, - JSON: schema.ServerListResponse{Servers: []schema.Server{{ID: 1, Name: "server-1"}}}, - }, - { - Method: "GET", Path: allServersPath, Status: http.StatusOK, - JSON: schema.ServerListResponse{Servers: []schema.Server{ - {ID: 1, Name: "server-1"}, - {ID: 2, Name: "server-2"}, - }}, - }, - }) - cache := NewAllServerCache(client, "instances_v2", time.Minute) - cache.limiter = rate.NewLimiter(rate.Inf, 1) // isolate the test from the limiter - - _, err := cache.ByID(t.Context(), 1) - require.NoError(t, err) - - srv, err := cache.ByID(t.Context(), 2) - require.NoError(t, err) - require.NotNil(t, srv) - assert.Equal(t, "server-2", srv.Name) -} - -func TestAllServerCache_ServerNotFoundAfterRefresh(t *testing.T) { - // A missing server triggers a refresh on every lookup, since we have no - // way to distinguish "just created" from "does not exist". - client := newCacheTestClient(t, []mockutil.Request{ - { - Method: "GET", Path: allServersPath, Status: http.StatusOK, - JSON: schema.ServerListResponse{Servers: []schema.Server{{ID: 1, Name: "server-1"}}}, - }, - { - Method: "GET", Path: allServersPath, Status: http.StatusOK, - JSON: schema.ServerListResponse{Servers: []schema.Server{{ID: 1, Name: "server-1"}}}, - }, - }) - cache := NewAllServerCache(client, "instances_v2", time.Minute) - cache.limiter = rate.NewLimiter(rate.Inf, 1) // isolate the test from the limiter - - srv, err := cache.ByID(t.Context(), 999) - require.NoError(t, err) - assert.Nil(t, srv) - - srv, err = cache.ByID(t.Context(), 999) - require.NoError(t, err) - assert.Nil(t, srv) -} - -func TestAllServerCache_RateLimitedRefreshReturnsErr(t *testing.T) { - client := newCacheTestClient(t, []mockutil.Request{ - { - Method: "GET", Path: allServersPath, Status: http.StatusOK, - JSON: schema.ServerListResponse{Servers: []schema.Server{{ID: 1, Name: "server-1"}}}, - }, - }) - cache := NewAllServerCache(client, "instances_v2", time.Minute) - cache.limiter = rate.NewLimiter(rate.Every(time.Hour), 1) - - srv, err := cache.ByID(t.Context(), 1) - require.NoError(t, err) - require.NotNil(t, srv) - - // Drain the limiter so the next miss-driven refresh is denied. - require.True(t, cache.limiter.Allow()) - - srv, err = cache.ByID(t.Context(), 999) - require.ErrorIs(t, err, ErrRateLimited) - assert.Nil(t, srv) -} - -func TestAllServerCache_ExpiredRefreshFailureDoesNotConsumeLimiter(t *testing.T) { - client := newCacheTestClient(t, []mockutil.Request{ - { - Method: "GET", Path: allServersPath, - Status: http.StatusInternalServerError, - JSON: schema.ErrorResponse{Error: schema.Error{Code: "boom", Message: "upstream exploded"}}, - }, - { - Method: "GET", Path: allServersPath, Status: http.StatusOK, - JSON: schema.ServerListResponse{Servers: []schema.Server{{ID: 1, Name: "server-1"}}}, - }, - }) - cache := NewAllServerCache(client, "instances_v2", time.Minute) - cache.limiter = rate.NewLimiter(rate.Every(time.Hour), 1) - - _, err := cache.ByID(t.Context(), 1) - require.Error(t, err) - assert.NotErrorIs(t, err, ErrRateLimited) - - srv, err := cache.ByID(t.Context(), 1) - require.NoError(t, err) - require.NotNil(t, srv) - assert.Equal(t, "server-1", srv.Name) -} - -func TestAllServerCache_APIError(t *testing.T) { - client := newCacheTestClient(t, []mockutil.Request{ - { - Method: "GET", Path: allServersPath, - Status: http.StatusInternalServerError, - JSON: schema.ErrorResponse{Error: schema.Error{Code: "boom", Message: "upstream exploded"}}, - }, - }) - cache := NewAllServerCache(client, "instances_v2", time.Minute) - - srv, err := cache.ByID(t.Context(), 1) - require.Error(t, err) - assert.Nil(t, srv) -} diff --git a/internal/servercache/evalservercache.go b/internal/servercache/evalservercache.go deleted file mode 100644 index c1df8664c..000000000 --- a/internal/servercache/evalservercache.go +++ /dev/null @@ -1,63 +0,0 @@ -package servercache - -import ( - "context" - "time" - - "k8s.io/klog/v2" - - "github.com/hetznercloud/hcloud-go/v2/hcloud" -) - -// EvalCache wraps a [PerServerCache], an [AllServerCache], and a [Passthrough] and -// runs each cache sequentially for every lookup. - -var _ ServerCache = (*EvalCache)(nil) - -type EvalCache struct { - caches []ServerCache -} - -func NewEvalCache(client *hcloud.Client, subsystem string, ttl time.Duration) *EvalCache { - caches := []ServerCache{ - NewAllServerCache(client, subsystem, ttl), - NewPerServerCache(client, subsystem, ttl), - NewPassthrough(client), - } - - return &EvalCache{ - caches: caches, - } -} - -func (c *EvalCache) ByID(ctx context.Context, id int64) (*hcloud.Server, error) { - return c.run(func(s ServerCache) (*hcloud.Server, error) { return s.ByID(ctx, id) }) -} - -func (c *EvalCache) ByName(ctx context.Context, name string) (*hcloud.Server, error) { - return c.run(func(s ServerCache) (*hcloud.Server, error) { return s.ByName(ctx, name) }) -} - -// run invokes every underlying cache so each one records its own metrics for -// the same request, then returns the first cache's result as the authoritative -// answer. Errors from secondary caches are logged but not propagated, so a -// transient failure in a comparison cache cannot affect the controller. -func (c *EvalCache) run(lookup func(ServerCache) (*hcloud.Server, error)) (*hcloud.Server, error) { - var ( - firstServer *hcloud.Server - firstErr error - ) - - for i, cache := range c.caches { - server, err := lookup(cache) - if i == 0 { - firstServer, firstErr = server, err - continue - } - if err != nil { - klog.V(4).InfoS("eval cache: secondary cache returned error", "index", i, "err", err) - } - } - - return firstServer, firstErr -} diff --git a/internal/servercache/perservercache.go b/internal/servercache/perservercache.go deleted file mode 100644 index d54ad3e56..000000000 --- a/internal/servercache/perservercache.go +++ /dev/null @@ -1,123 +0,0 @@ -package servercache - -import ( - "context" - "sync" - "time" - - "k8s.io/klog/v2" - - "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/metrics" - "github.com/hetznercloud/hcloud-go/v2/hcloud" -) - -var _ ServerCache = (*PerServerCache)(nil) - -// PerServerCache caches each Server with a separate expiration time. -// The caches removes expired entries. -type PerServerCache struct { - subsystem string - mode Mode - ttl time.Duration - - client *hcloud.Client - - byID map[int64]*perServerCacheEntry - byName map[string]*perServerCacheEntry - - mu sync.Mutex -} - -type perServerCacheEntry struct { - server *hcloud.Server - expiresAt time.Time -} - -func NewPerServerCache(client *hcloud.Client, subsystem string, ttl time.Duration) *PerServerCache { - return &PerServerCache{ - subsystem: subsystem, - mode: ModePerServer, - ttl: ttl, - client: client, - byID: make(map[int64]*perServerCacheEntry), - byName: make(map[string]*perServerCacheEntry), - } -} - -func (c *PerServerCache) ByID(ctx context.Context, id int64) (*hcloud.Server, error) { - return c.getOrFetch( - func() *perServerCacheEntry { return c.byID[id] }, - func() (*hcloud.Server, *hcloud.Response, error) { return c.client.Server.GetByID(ctx, id) }, - ) -} - -func (c *PerServerCache) ByName(ctx context.Context, name string) (*hcloud.Server, error) { - return c.getOrFetch( - func() *perServerCacheEntry { return c.byName[name] }, - func() (*hcloud.Server, *hcloud.Response, error) { return c.client.Server.GetByName(ctx, name) }, - ) -} - -func (c *PerServerCache) getOrFetch( - lookup func() *perServerCacheEntry, - fetch func() (*hcloud.Server, *hcloud.Response, error), -) (*hcloud.Server, error) { - c.mu.Lock() - defer c.mu.Unlock() - - if entry := lookup(); entry != nil && time.Now().Before(entry.expiresAt) { - metrics.CacheRequests.WithLabelValues(c.subsystem, string(c.mode), "hit").Inc() - klog.V(4).InfoS("per-server cache hit", "id", entry.server.ID, "name", entry.server.Name) - return entry.server, nil - } - - klog.V(4).InfoS("per-server cache miss, fetching from api") - server, _, err := fetch() - if err != nil { - return nil, err - } - metrics.CacheRequests.WithLabelValues(c.subsystem, string(c.mode), "miss").Inc() - if server != nil { - klog.V(4).InfoS("per-server cache: fetched server from api", "id", server.ID, "name", server.Name) - c.addToCache(server) - } else { - klog.V(4).InfoS("per-server cache: server not found via api") - } - - return server, nil -} - -// addToCache inserts (or refreshes) a server in the cache, evicting the -// expired entries. -// The caller must hold the mutex. -func (c *PerServerCache) addToCache(server *hcloud.Server) { - if existing, ok := c.byID[server.ID]; ok { - // Server name changed and needs updating. - if existing.server.Name != server.Name { - delete(c.byName, existing.server.Name) - } - existing.server = server - existing.expiresAt = time.Now().Add(c.ttl) - c.byName[server.Name] = existing - return - } - - // Create new server entry. - entry := &perServerCacheEntry{ - server: server, - expiresAt: time.Now().Add(c.ttl), - } - c.byID[server.ID] = entry - c.byName[server.Name] = entry - - // Evict expired entries to avoid deleted Servers being around - // until hccm restarts. - for key := range c.byID { - oldEntry := c.byID[key] - if time.Now().After(oldEntry.expiresAt) { - delete(c.byID, key) - delete(c.byName, oldEntry.server.Name) - klog.V(4).InfoS("per-server cache: evicted LRU entry", "id", key) - } - } -} diff --git a/internal/servercache/perservercache_test.go b/internal/servercache/perservercache_test.go deleted file mode 100644 index c2170dc8a..000000000 --- a/internal/servercache/perservercache_test.go +++ /dev/null @@ -1,165 +0,0 @@ -package servercache - -import ( - "net/http" - "testing" - "time" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - - "github.com/hetznercloud/hcloud-go/v2/hcloud" - "github.com/hetznercloud/hcloud-go/v2/hcloud/exp/mockutil" - "github.com/hetznercloud/hcloud-go/v2/hcloud/schema" -) - -func TestPerServerCache_ByID_HitAfterMiss(t *testing.T) { - // Exactly one GET /servers/1 is expected; the second ByID must be served from cache. - client := newCacheTestClient(t, []mockutil.Request{ - { - Method: "GET", Path: "/servers/1", - Status: http.StatusOK, - JSON: schema.ServerGetResponse{Server: schema.Server{ID: 1, Name: "server-1"}}, - }, - }) - cache := NewPerServerCache(client, "instances_v2", time.Minute) - - srv, err := cache.ByID(t.Context(), 1) - require.NoError(t, err) - require.NotNil(t, srv) - assert.Equal(t, int64(1), srv.ID) - - srv, err = cache.ByID(t.Context(), 1) - require.NoError(t, err) - require.NotNil(t, srv) -} - -func TestPerServerCache_ByName_HitAfterMiss(t *testing.T) { - client := newCacheTestClient(t, []mockutil.Request{ - { - Method: "GET", Path: "/servers?name=server-1", - Status: http.StatusOK, - JSON: schema.ServerListResponse{Servers: []schema.Server{{ID: 1, Name: "server-1"}}}, - }, - }) - cache := NewPerServerCache(client, "instances_v2", time.Minute) - - srv, err := cache.ByName(t.Context(), "server-1") - require.NoError(t, err) - require.NotNil(t, srv) - assert.Equal(t, "server-1", srv.Name) - - srv, err = cache.ByName(t.Context(), "server-1") - require.NoError(t, err) - require.NotNil(t, srv) -} - -func TestPerServerCache_ByID_CrossPopulatesByName(t *testing.T) { - // Only the initial GetByID call is expected; the subsequent ByName must hit - // the cache populated by ByID. - client := newCacheTestClient(t, []mockutil.Request{ - { - Method: "GET", Path: "/servers/1", - Status: http.StatusOK, - JSON: schema.ServerGetResponse{Server: schema.Server{ID: 1, Name: "server-1"}}, - }, - }) - cache := NewPerServerCache(client, "instances_v2", time.Minute) - - srv, err := cache.ByID(t.Context(), 1) - require.NoError(t, err) - require.NotNil(t, srv) - - srv, err = cache.ByName(t.Context(), "server-1") - require.NoError(t, err) - require.NotNil(t, srv) - assert.Equal(t, int64(1), srv.ID) -} - -func TestPerServerCache_TTLExpiry(t *testing.T) { - // Zero TTL → every lookup misses and triggers a fresh GET. - client := newCacheTestClient(t, []mockutil.Request{ - { - Method: "GET", Path: "/servers/1", Status: http.StatusOK, - JSON: schema.ServerGetResponse{Server: schema.Server{ID: 1, Name: "server-1", Status: string(hcloud.ServerStatusStarting)}}, - }, - { - Method: "GET", Path: "/servers/1", Status: http.StatusOK, - JSON: schema.ServerGetResponse{Server: schema.Server{ID: 1, Name: "server-1", Status: string(hcloud.ServerStatusRunning)}}, - }, - }) - cache := NewPerServerCache(client, "instances_v2", 0) - - srv, err := cache.ByID(t.Context(), 1) - require.NoError(t, err) - require.NotNil(t, srv) - assert.Equal(t, hcloud.ServerStatusStarting, srv.Status) - - srv, err = cache.ByID(t.Context(), 1) - require.NoError(t, err) - require.NotNil(t, srv) - assert.Equal(t, hcloud.ServerStatusRunning, srv.Status) -} - -func TestPerServerCache_ServerNotFound(t *testing.T) { - // A nil result is not cached → both lookups must hit the api. - client := newCacheTestClient(t, []mockutil.Request{ - {Method: "GET", Path: "/servers/42", Status: http.StatusNotFound, JSON: notFoundResponse}, - {Method: "GET", Path: "/servers/42", Status: http.StatusNotFound, JSON: notFoundResponse}, - }) - cache := NewPerServerCache(client, "instances_v2", time.Minute) - - srv, err := cache.ByID(t.Context(), 42) - require.NoError(t, err) - assert.Nil(t, srv) - - srv, err = cache.ByID(t.Context(), 42) - require.NoError(t, err) - assert.Nil(t, srv) -} - -func TestPerServerCache_APIError(t *testing.T) { - client := newCacheTestClient(t, []mockutil.Request{ - { - Method: "GET", Path: "/servers/1", - Status: http.StatusInternalServerError, - JSON: schema.ErrorResponse{Error: schema.Error{Code: "boom", Message: "upstream exploded"}}, - }, - }) - cache := NewPerServerCache(client, "instances_v2", time.Minute) - - srv, err := cache.ByID(t.Context(), 1) - require.Error(t, err) - assert.Nil(t, srv) -} - -func TestPerServerCache_ExpiredEviction(t *testing.T) { - // Cache size 2; adding a third entry evicts the least-recently-used. - // Touching server 1 before inserting 3 keeps 1 in cache and evicts 2. - client := newCacheTestClient(t, []mockutil.Request{ - {Method: "GET", Path: "/servers/1", Status: http.StatusOK, JSON: schema.ServerGetResponse{Server: schema.Server{ID: 1, Name: "server-1"}}}, - {Method: "GET", Path: "/servers/2", Status: http.StatusOK, JSON: schema.ServerGetResponse{Server: schema.Server{ID: 2, Name: "server-2"}}}, - {Method: "GET", Path: "/servers/3", Status: http.StatusOK, JSON: schema.ServerGetResponse{Server: schema.Server{ID: 3, Name: "server-3"}}}, - }) - cache := NewPerServerCache(client, "instances_v2", time.Hour) - - ctx := t.Context() - _, err := cache.ByID(ctx, 1) - require.NoError(t, err) - _, err = cache.ByID(ctx, 2) - require.NoError(t, err) - - // Expire 2 - cache.byID[2].expiresAt = time.Unix(0, 0) - - // Adding 3 evicts the expired entry (2). - _, err = cache.ByID(ctx, 3) - require.NoError(t, err) - - assert.Len(t, cache.byID, 2) - assert.Contains(t, cache.byID, int64(1)) - assert.Contains(t, cache.byID, int64(3)) - assert.NotContains(t, cache.byID, int64(2)) - assert.Len(t, cache.byName, 2) - assert.NotContains(t, cache.byName, "server-2") -} diff --git a/internal/servercache/servercache.go b/internal/servercache/servercache.go index adc297b21..a23153463 100644 --- a/internal/servercache/servercache.go +++ b/internal/servercache/servercache.go @@ -2,79 +2,264 @@ package servercache import ( "context" - "errors" - "fmt" + "maps" + "sync" "time" "k8s.io/klog/v2" + "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/metrics" "github.com/hetznercloud/hcloud-go/v2/hcloud" ) -// ServerCache defines a caching layer for retrieving Hetzner Cloud servers. -type ServerCache interface { - // ByID retrieves a server by its unique numeric ID. - // Returns the server if found, nil and no error if not found, - // or nil and an error if the lookup fails. - ByID(context.Context, int64) (*hcloud.Server, error) - - // ByName retrieves a server by its name. - // Returns the server if found, nil and no error if not found, - // or nil and an error if the lookup fails. - ByName(context.Context, string) (*hcloud.Server, error) -} - -// ErrRateLimited is returned by a [ServerCache] when a lookup would have -// required a refresh but the cache's internal rate limiter denied it. -var ErrRateLimited = errors.New("refresh_rate_limited") - type Mode string const ( ModeAllServers Mode = "all-server" ModePerServer Mode = "per-server" - ModeEval Mode = "eval" ModeOff Mode = "off" ) -func New(client *hcloud.Client, subsystem string, mode Mode, ttl time.Duration) (ServerCache, error) { - if mode != ModeOff { - klog.Warningf("instance caching is experimental, breaking changes may occur within minor releases; set HCLOUD_INSTANCES_CACHE_MODE=off to opt out (mode=%q)", mode) +type RefreshOpts struct { + ttl time.Duration + mode Mode +} + +func newCacheRefreshOpts[T any](cache *Cache[T], opts ...RefreshOption) *RefreshOpts { + refreshOpts := &RefreshOpts{ + ttl: cache.defaultTTL, + mode: cache.defaultMode, } - switch mode { - case ModeAllServers: - return NewAllServerCache(client, subsystem, ttl), nil - case ModePerServer: - return NewPerServerCache(client, subsystem, ttl), nil - case ModeEval: - klog.Warningf("instance cache mode %q is for internal evaluation only and is not intended for production use", mode) - return NewEvalCache(client, subsystem, ttl), nil - case ModeOff: - return NewPassthrough(client), nil + for _, opt := range opts { + opt(refreshOpts) + } + return refreshOpts +} + +type RefreshOption func(cro *RefreshOpts) + +func WithTTL(ttl time.Duration) func(*RefreshOpts) { + return func(ro *RefreshOpts) { + ro.ttl = ttl + } +} + +func WithMode(mode Mode) func(*RefreshOpts) { + return func(ro *RefreshOpts) { + ro.mode = mode } - return nil, fmt.Errorf("invalid cache mode %q", mode) } -// ----- Passthrough ----- +type entry[T any] struct { + expiresAt time.Time + value *T +} + +type Cache[T any] struct { + fetchOneByID func(ctx context.Context, id int64) (*T, error) + fetchOneByName func(ctx context.Context, name string) (*T, error) + fetchAll func(ctx context.Context) ([]*T, error) + getID func(value *T) int64 + getName func(value *T) string + + defaultTTL time.Duration + defaultMode Mode + + subsystem string -// Passthrough is a [ServerCache] that always queries the API. + byID map[int64]*entry[T] + byName map[string]*entry[T] -var _ ServerCache = (*Passthrough)(nil) + mu sync.Mutex +} -type Passthrough struct { - client *hcloud.Client +func NewServerCache(client *hcloud.Client, subsystem string, defaultMode Mode, defaultTTL time.Duration) *Cache[hcloud.Server] { + return newCache[hcloud.Server]( + func(ctx context.Context, id int64) (*hcloud.Server, error) { + value, _, err := client.Server.GetByID(ctx, id) + return value, err + }, + func(ctx context.Context, name string) (*hcloud.Server, error) { + value, _, err := client.Server.GetByName(ctx, name) + return value, err + }, + func(ctx context.Context) ([]*hcloud.Server, error) { + values, err := client.Server.All(ctx) + return values, err + }, + func(value *hcloud.Server) int64 { return value.ID }, + func(value *hcloud.Server) string { return value.Name }, + subsystem, + defaultMode, + defaultTTL, + ) } -func NewPassthrough(client *hcloud.Client) *Passthrough { - return &Passthrough{client: client} +func newCache[T any]( + fetchOneByID func(ctx context.Context, id int64) (*T, error), + fetchOneByName func(ctx context.Context, name string) (*T, error), + fetchAll func(ctx context.Context) ([]*T, error), + getID func(value *T) int64, + getName func(value *T) string, + subsystem string, + defaultMode Mode, + defaultTTL time.Duration, +) *Cache[T] { + return &Cache[T]{ + fetchOneByID: fetchOneByID, + fetchOneByName: fetchOneByName, + fetchAll: fetchAll, + getID: getID, + getName: getName, + + subsystem: subsystem, + defaultMode: defaultMode, + defaultTTL: defaultTTL, + + byID: make(map[int64]*entry[T]), + byName: make(map[string]*entry[T]), + } } -func (c *Passthrough) ByID(ctx context.Context, id int64) (*hcloud.Server, error) { - server, _, err := c.client.Server.GetByID(ctx, id) - return server, err +func (c *Cache[T]) ByID(ctx context.Context, id int64, opts ...RefreshOption) (*T, error) { + return c.getFromCache( + ctx, + func() *entry[T] { + return c.byID[id] + }, + func() (*T, error) { + return c.fetchOneByID(ctx, id) + }, + opts..., + ) } -func (c *Passthrough) ByName(ctx context.Context, name string) (*hcloud.Server, error) { - server, _, err := c.client.Server.GetByName(ctx, name) - return server, err +func (c *Cache[T]) ByName(ctx context.Context, name string, opts ...RefreshOption) (*T, error) { + return c.getFromCache( + ctx, + func() *entry[T] { + return c.byName[name] + }, + func() (*T, error) { + return c.fetchOneByName(ctx, name) + }, + opts..., + ) +} + +func (c *Cache[T]) getFromCache( + ctx context.Context, + lookup func() *entry[T], + fetch func() (*T, error), + opts ...RefreshOption, +) (*T, error) { + refreshOpts := newCacheRefreshOpts(c, opts...) + + if refreshOpts.mode == ModeOff { + metrics.CacheRequests.WithLabelValues(c.subsystem, string(refreshOpts.mode), "miss").Inc() + klog.V(4).InfoS("cache mode is off") + return fetch() + } + + c.mu.Lock() + defer c.mu.Unlock() + + if e := lookup(); e != nil && time.Now().Before(e.expiresAt) { + metrics.CacheRequests.WithLabelValues(c.subsystem, string(refreshOpts.mode), "hit").Inc() + klog.V(4).InfoS("cache hit", "id", c.getID(e.value), "name", c.getName(e.value), "expiresAt", e.expiresAt.Format(time.RFC3339)) + return e.value, nil + } + + switch refreshOpts.mode { + case ModePerServer: + if err := c.refreshPerServer(fetch, refreshOpts.ttl); err != nil { + return nil, err + } + case ModeAllServers: + if err := c.refreshAllServer(ctx, refreshOpts.ttl); err != nil { + return nil, err + } + case ModeOff: + // Handled above -> early return + } + + metrics.CacheRequests.WithLabelValues(c.subsystem, string(refreshOpts.mode), "miss").Inc() + + if e := lookup(); e != nil { + klog.V(4).InfoS("entry found after refresh", "id", c.getID(e.value), "name", c.getName(e.value)) + return e.value, nil + } + + klog.V(4).InfoS("entry not found after refresh") + return nil, nil +} + +func (c *Cache[T]) refreshPerServer( + fetch func() (*T, error), + ttl time.Duration, +) error { + klog.V(4).InfoS("refreshing server from api") + value, err := fetch() + if err != nil { + return err + } + + if value == nil { + return nil + } + + e := &entry[T]{ + value: value, + expiresAt: time.Now().Add(ttl), + } + klog.V(4).InfoS("refreshed entry from api", "id", c.getID(e.value), "name", c.getName(e.value), "expiresAt", e.expiresAt.Format(time.RFC3339)) + + c.byID[c.getID(value)] = e + c.byName[c.getName(value)] = e + + // Evict expired entries so the cache does not grow indefinitely. This ensures deleted + // Nodes or renamed Servers are cleaned from the cache. + maps.DeleteFunc(c.byID, func(_ int64, ev *entry[T]) bool { + if time.Now().After(ev.expiresAt) { + klog.V(4).InfoS("evicting entry from cache by id", "id", c.getID(ev.value), "name", c.getName(ev.value), "expiresAt", ev.expiresAt.Format(time.RFC3339)) + return true + } + return false + }) + maps.DeleteFunc(c.byName, func(_ string, ev *entry[T]) bool { + if time.Now().After(ev.expiresAt) { + klog.V(4).InfoS("evicting entry from cache by name", "id", c.getID(ev.value), "name", c.getName(ev.value), "expiresAt", ev.expiresAt.Format(time.RFC3339)) + return true + } + return false + }) + + return nil +} + +func (c *Cache[T]) refreshAllServer(ctx context.Context, ttl time.Duration) error { + klog.V(4).InfoS("refreshing all entries from api") + + values, err := c.fetchAll(ctx) + if err != nil { + return err + } + + c.byID = make(map[int64]*entry[T], len(values)) + c.byName = make(map[string]*entry[T], len(values)) + + expiresAt := time.Now().Add(ttl) + + for _, value := range values { + e := &entry[T]{ + value: value, + expiresAt: expiresAt, + } + + c.byID[c.getID(value)] = e + c.byName[c.getName(value)] = e + } + + klog.V(4).InfoS("refreshed all entries from api", "count", len(values), "expiresAt", expiresAt.Format(time.RFC3339)) + return nil } diff --git a/internal/servercache/servercache_test.go b/internal/servercache/servercache_test.go index d17117546..a156e0a48 100644 --- a/internal/servercache/servercache_test.go +++ b/internal/servercache/servercache_test.go @@ -1,8 +1,11 @@ package servercache import ( - "net/http" + "context" + "fmt" + "sync/atomic" "testing" + "testing/synctest" "time" "github.com/stretchr/testify/assert" @@ -10,152 +13,650 @@ import ( "github.com/hetznercloud/hcloud-go/v2/hcloud" "github.com/hetznercloud/hcloud-go/v2/hcloud/exp/mockutil" - "github.com/hetznercloud/hcloud-go/v2/hcloud/schema" ) -// newCacheTestClient spins up a [mockutil.Server] and returns a client pointed -// at it. -func newCacheTestClient(t *testing.T, requests []mockutil.Request) *hcloud.Client { +func assertServer1(t *testing.T, server *hcloud.Server) { t.Helper() - server := mockutil.NewServer(t, requests) - return hcloud.NewClient( - hcloud.WithEndpoint(server.URL), - hcloud.WithPollOpts(hcloud.PollOpts{BackoffFunc: hcloud.ConstantBackoff(0)}), - hcloud.WithRetryOpts(hcloud.RetryOpts{BackoffFunc: hcloud.ConstantBackoff(0)}), - ) + require.NotNil(t, server) + assert.Equal(t, int64(1), server.ID) + assert.Equal(t, "test", server.Name) } -var notFoundResponse = schema.ErrorResponse{Error: schema.Error{Code: string(hcloud.ErrorCodeNotFound), Message: "not found"}} +func assertServer2(t *testing.T, server *hcloud.Server) { + t.Helper() + require.NotNil(t, server) + assert.Equal(t, int64(2), server.ID) + assert.Equal(t, "test2", server.Name) +} -func TestNew(t *testing.T) { - client := hcloud.NewClient() +func TestServerCacheModeAllServers(t *testing.T) { + apiCalls := atomic.Int64{} - tests := []struct { - name string - mode Mode - wantType any - wantErr bool - }{ - {name: "all-server", mode: ModeAllServers, wantType: (*AllServerCache)(nil)}, - {name: "per-server", mode: ModePerServer, wantType: (*PerServerCache)(nil)}, - {name: "eval", mode: ModeEval, wantType: (*EvalCache)(nil)}, - {name: "off", mode: ModeOff, wantType: (*Passthrough)(nil)}, - {name: "invalid", mode: Mode("bogus"), wantErr: true}, + sc := newCache[hcloud.Server]( + nil, + nil, + nil, + func(value *hcloud.Server) int64 { return value.ID }, + func(value *hcloud.Server) string { return value.Name }, + "test", + ModeAllServers, + 10*time.Second, + ) + + ctx := t.Context() + + // Cache miss by ID 1, fetch from API + sc.fetchAll = func(_ context.Context) ([]*hcloud.Server, error) { + apiCalls.Add(1) + return []*hcloud.Server{{ID: 1, Name: "test"}, {ID: 2, Name: "test2"}}, nil } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - cache, err := New(client, "instances_v2", tt.mode, time.Minute) - if tt.wantErr { - require.Error(t, err) - assert.Nil(t, cache) - return - } - require.NoError(t, err) - require.NotNil(t, cache) - assert.IsType(t, tt.wantType, cache) - }) + srv, err := sc.ByID(ctx, 1) + require.NoError(t, err) + assertServer1(t, srv) + + // Fetch all returns 2 servers + assert.Equal(t, int64(1), apiCalls.Load()) + assert.Len(t, sc.byID, 2) + assert.Len(t, sc.byName, 2) + + assert.True(t, sc.byID[srv.ID].expiresAt.After(time.Now())) + assert.Equal(t, srv, sc.byID[srv.ID].value) + assert.Equal(t, srv, sc.byName[srv.Name].value) + + // Cache hit by ID 1 + srv, err = sc.ByID(ctx, 1) + require.NoError(t, err) + assertServer1(t, srv) + + // Cache hit by ID 2 + srv, err = sc.ByID(ctx, 2) + require.NoError(t, err) + assertServer2(t, srv) + + // Cache hit by Name 1 + srv, err = sc.ByName(ctx, "test") + require.NoError(t, err) + assertServer1(t, srv) + + // Cache hit by Name 2 + srv, err = sc.ByName(ctx, "test2") + require.NoError(t, err) + assertServer2(t, srv) + + // Fetched two Servers with one API call + assert.Equal(t, int64(1), apiCalls.Load()) +} + +func TestServerCacheModeAllServersNotFound(t *testing.T) { + apiCalls := atomic.Int64{} + + sc := newCache[hcloud.Server]( + nil, + nil, + nil, + func(value *hcloud.Server) int64 { return value.ID }, + func(value *hcloud.Server) string { return value.Name }, + "test", + ModeAllServers, + 10*time.Second, + ) + + ctx := t.Context() + + // Cache miss by ID 1, fetch from API, not found + sc.fetchAll = func(_ context.Context) ([]*hcloud.Server, error) { + apiCalls.Add(1) + return []*hcloud.Server{{ID: 2, Name: "test2"}, {ID: 3, Name: "test3"}}, nil } + + srv, err := sc.ByID(ctx, 1) + require.NoError(t, err) + assert.Nil(t, srv) + + // Fetch all returns 2 servers + assert.Equal(t, int64(1), apiCalls.Load()) + assert.Len(t, sc.byID, 2) + assert.Len(t, sc.byName, 2) + + // Cache hit by ID 2 + srv, err = sc.ByID(ctx, 2) + require.NoError(t, err) + assertServer2(t, srv) + + // Cache hit by Name "test2" + srv, err = sc.ByName(ctx, "test2") + require.NoError(t, err) + assertServer2(t, srv) + + // Cache miss by name "test", fetch from API, not found + srv, err = sc.ByName(ctx, "test") + require.NoError(t, err) + assert.Nil(t, srv) + + // Fetched two Servers with one API call + assert.Equal(t, int64(2), apiCalls.Load()) } -func TestPassthrough_ByID(t *testing.T) { - client := newCacheTestClient(t, []mockutil.Request{ - { - Method: "GET", Path: "/servers/1", Status: http.StatusOK, - JSON: schema.ServerGetResponse{Server: schema.Server{ID: 1, Name: "server-1"}}, - }, - // Passthrough never caches, so a second lookup must hit the API again. - { - Method: "GET", Path: "/servers/1", Status: http.StatusOK, - JSON: schema.ServerGetResponse{Server: schema.Server{ID: 1, Name: "server-1"}}, - }, - }) - cache := NewPassthrough(client) +func TestServerCacheModePerServer(t *testing.T) { + apiCalls := atomic.Int64{} + + sc := newCache[hcloud.Server]( + nil, + nil, + nil, + func(value *hcloud.Server) int64 { return value.ID }, + func(value *hcloud.Server) string { return value.Name }, + "test", + ModePerServer, + 10*time.Second, + ) + + ctx := t.Context() + + // Cache miss by ID 1, fetch from API + sc.fetchOneByID = func(_ context.Context, id int64) (*hcloud.Server, error) { + assert.Equal(t, int64(1), id) + apiCalls.Add(1) + return &hcloud.Server{ID: 1, Name: "test"}, nil + } + + srv, err := sc.ByID(ctx, 1) + require.NoError(t, err) + assertServer1(t, srv) + + // Fetched one server + assert.Equal(t, int64(1), apiCalls.Load()) + assert.Len(t, sc.byID, 1) + assert.Len(t, sc.byName, 1) - srv, err := cache.ByID(t.Context(), 1) + assert.True(t, sc.byID[srv.ID].expiresAt.After(time.Now())) + assert.Equal(t, srv, sc.byID[srv.ID].value) + assert.Equal(t, srv, sc.byName[srv.Name].value) + + // Cache hit by ID 1 + srv, err = sc.ByID(ctx, 1) + require.NoError(t, err) + assertServer1(t, srv) + + // Cache miss by ID 2, fetch from API + sc.fetchOneByID = func(_ context.Context, id int64) (*hcloud.Server, error) { + assert.Equal(t, int64(2), id) + apiCalls.Add(1) + return &hcloud.Server{ID: 2, Name: "test2"}, nil + } + + srv, err = sc.ByID(ctx, 2) require.NoError(t, err) - require.NotNil(t, srv) - assert.Equal(t, int64(1), srv.ID) + assertServer2(t, srv) - srv, err = cache.ByID(t.Context(), 1) + // Cache hit by Name 1 + srv, err = sc.ByName(ctx, "test") require.NoError(t, err) - require.NotNil(t, srv) + assertServer1(t, srv) + + // Cache hit by Name 2 + srv, err = sc.ByName(ctx, "test2") + require.NoError(t, err) + assertServer2(t, srv) + + // Fetched two servers individually + assert.Equal(t, int64(2), apiCalls.Load()) } -func TestPassthrough_ByID_NotFound(t *testing.T) { - client := newCacheTestClient(t, []mockutil.Request{ - {Method: "GET", Path: "/servers/42", Status: http.StatusNotFound, JSON: notFoundResponse}, - }) - cache := NewPassthrough(client) +func TestServerCacheModePerServerNotFound(t *testing.T) { + apiCalls := atomic.Int64{} + + sc := newCache[hcloud.Server]( + nil, + nil, + nil, + func(value *hcloud.Server) int64 { return value.ID }, + func(value *hcloud.Server) string { return value.Name }, + "test", + ModePerServer, + 10*time.Second, + ) - srv, err := cache.ByID(t.Context(), 42) + ctx := t.Context() + + // Cache miss by ID 1, fetch from API, not found + sc.fetchOneByID = func(_ context.Context, id int64) (*hcloud.Server, error) { + assert.Equal(t, int64(1), id) + apiCalls.Add(1) + return nil, nil + } + + srv, err := sc.ByID(ctx, 1) require.NoError(t, err) assert.Nil(t, srv) -} -func TestPassthrough_ByID_APIError(t *testing.T) { - client := newCacheTestClient(t, []mockutil.Request{ - { - Method: "GET", Path: "/servers/1", - Status: http.StatusInternalServerError, - JSON: schema.ErrorResponse{Error: schema.Error{Code: "boom", Message: "upstream exploded"}}, - }, - }) - cache := NewPassthrough(client) + // Cached zero server + assert.Equal(t, int64(1), apiCalls.Load()) + assert.Empty(t, sc.byID) + assert.Empty(t, sc.byName) + + // Cache miss by ID 2, fetch from API + sc.fetchOneByID = func(_ context.Context, id int64) (*hcloud.Server, error) { + assert.Equal(t, int64(2), id) + apiCalls.Add(1) + return &hcloud.Server{ID: 2, Name: "test2"}, nil + } + + srv, err = sc.ByID(ctx, 2) + require.NoError(t, err) + assertServer2(t, srv) - srv, err := cache.ByID(t.Context(), 1) - require.Error(t, err) + // Cached one server + assert.Equal(t, int64(2), apiCalls.Load()) + assert.Len(t, sc.byID, 1) + assert.Len(t, sc.byName, 1) + + // Cache miss by ID 1, fetch from API, not found + sc.fetchOneByID = func(_ context.Context, id int64) (*hcloud.Server, error) { + assert.Equal(t, int64(1), id) + apiCalls.Add(1) + return nil, nil + } + + srv, err = sc.ByID(ctx, 1) + require.NoError(t, err) assert.Nil(t, srv) + + // Fetched zero server + assert.Equal(t, int64(3), apiCalls.Load()) + assert.Len(t, sc.byID, 1) + assert.Len(t, sc.byName, 1) } -func TestPassthrough_ByName(t *testing.T) { - client := newCacheTestClient(t, []mockutil.Request{ - { - Method: "GET", Path: "/servers?name=server-1", Status: http.StatusOK, - JSON: schema.ServerListResponse{Servers: []schema.Server{{ID: 1, Name: "server-1"}}}, - }, - { - Method: "GET", Path: "/servers?name=server-1", Status: http.StatusOK, - JSON: schema.ServerListResponse{Servers: []schema.Server{{ID: 1, Name: "server-1"}}}, - }, - }) - cache := NewPassthrough(client) +func TestServerCacheModeOff(t *testing.T) { + apiCalls := atomic.Int64{} - srv, err := cache.ByName(t.Context(), "server-1") + sc := newCache[hcloud.Server]( + nil, + nil, + nil, + func(value *hcloud.Server) int64 { return value.ID }, + func(value *hcloud.Server) string { return value.Name }, + "test", + ModeOff, + 10*time.Second, + ) + + ctx := t.Context() + + // Cache miss by ID 1, fetch from API + sc.fetchOneByID = func(_ context.Context, id int64) (*hcloud.Server, error) { + assert.Equal(t, int64(1), id) + apiCalls.Add(1) + return &hcloud.Server{ID: 1, Name: "test"}, nil + } + + srv, err := sc.ByID(ctx, 1) + require.NoError(t, err) + assertServer1(t, srv) + + // Fetched one server + assert.Equal(t, int64(1), apiCalls.Load()) + assert.Empty(t, sc.byID) + assert.Empty(t, sc.byName) + + // Cache miss by ID 1, fetch from API + srv, err = sc.ByID(ctx, 1) + require.NoError(t, err) + assertServer1(t, srv) + + assert.Equal(t, int64(2), apiCalls.Load()) + // Entries are not cached + assert.Empty(t, sc.byID) + assert.Empty(t, sc.byName) + + // Reset + sc.fetchOneByID = nil + apiCalls = atomic.Int64{} + + // Cache miss by Name "test", fetch from API + sc.fetchOneByName = func(_ context.Context, name string) (*hcloud.Server, error) { + assert.Equal(t, "test", name) + apiCalls.Add(1) + return &hcloud.Server{ID: 1, Name: "test"}, nil + } + + srv, err = sc.ByName(ctx, "test") require.NoError(t, err) - require.NotNil(t, srv) - assert.Equal(t, "server-1", srv.Name) + assertServer1(t, srv) - srv, err = cache.ByName(t.Context(), "server-1") + // Fetched one server + assert.Equal(t, int64(1), apiCalls.Load()) + assert.Empty(t, sc.byID) + assert.Empty(t, sc.byName) + + // Cache miss by Name "test", fetch from API + srv, err = sc.ByName(ctx, "test") require.NoError(t, err) - require.NotNil(t, srv) + assertServer1(t, srv) + + assert.Equal(t, int64(2), apiCalls.Load()) + // Entries are not cached + assert.Empty(t, sc.byID) + assert.Empty(t, sc.byName) } -func TestPassthrough_ByName_NotFound(t *testing.T) { - client := newCacheTestClient(t, []mockutil.Request{ - { - Method: "GET", Path: "/servers?name=missing", Status: http.StatusOK, - JSON: schema.ServerListResponse{Servers: []schema.Server{}}, - }, +func TestServerCacheModePerServer_EvictExpiredEntries(t *testing.T) { + synctest.Test(t, func(t *testing.T) { + apiCalls := atomic.Int64{} + + sc := newCache[hcloud.Server]( + nil, + nil, + nil, + func(value *hcloud.Server) int64 { return value.ID }, + func(value *hcloud.Server) string { return value.Name }, + "test", + ModePerServer, + 10*time.Second, + ) + + ctx := t.Context() + + // Populate cache + sc.fetchOneByID = func(_ context.Context, id int64) (*hcloud.Server, error) { + assert.Equal(t, int64(1), id) + apiCalls.Add(1) + return &hcloud.Server{ID: 1, Name: "test"}, nil + } + srv, err := sc.ByID(ctx, 1) + require.NoError(t, err) + + assert.Equal(t, time.Now().Add(sc.defaultTTL), sc.byID[srv.ID].expiresAt) + + // Wait for expiration + time.Sleep(sc.defaultTTL + 1) + + // Cache miss by ID 2, fetch from API + sc.fetchOneByID = func(_ context.Context, id int64) (*hcloud.Server, error) { + assert.Equal(t, int64(2), id) + apiCalls.Add(1) + return &hcloud.Server{ID: 2, Name: "test2"}, nil + } + + srv, err = sc.ByID(ctx, 2) + require.NoError(t, err) + assertServer2(t, srv) + + // Fetched two servers individually + assert.Equal(t, int64(2), apiCalls.Load()) + + // Server ID 1 has been evicted + assert.Len(t, sc.byID, 1) + assert.Len(t, sc.byName, 1) + assert.Nil(t, sc.byID[1]) + assert.Nil(t, sc.byName["test"]) }) - cache := NewPassthrough(client) +} - srv, err := cache.ByName(t.Context(), "missing") - require.NoError(t, err) - assert.Nil(t, srv) +func TestServerCacheModePerServer_WithTTLRefreshOpts(t *testing.T) { + synctest.Test(t, func(t *testing.T) { + apiCalls := atomic.Int64{} + + sc := newCache[hcloud.Server]( + nil, + nil, + nil, + func(value *hcloud.Server) int64 { return value.ID }, + func(value *hcloud.Server) string { return value.Name }, + "test", + ModePerServer, + 5*time.Second, + ) + + ctx := t.Context() + + // Populate cache with default TTL + sc.fetchOneByID = func(_ context.Context, id int64) (*hcloud.Server, error) { + assert.Equal(t, int64(1), id) + apiCalls.Add(1) + return &hcloud.Server{ID: 1, Name: "test"}, nil + } + srv, err := sc.ByID(ctx, 1) + require.NoError(t, err) + assertServer1(t, srv) + assert.Equal(t, time.Now().Add(sc.defaultTTL), sc.byID[srv.ID].expiresAt) + + // Cache miss by ID 2, fetch from API with different TTL + sc.fetchOneByID = func(_ context.Context, id int64) (*hcloud.Server, error) { + assert.Equal(t, int64(2), id) + apiCalls.Add(1) + return &hcloud.Server{ID: 2, Name: "test2"}, nil + } + + // Fetch Server ID 2, use larger TTL + srv, err = sc.ByID(ctx, 2, WithTTL(2*sc.defaultTTL)) + require.NoError(t, err) + assertServer2(t, srv) + // Server ID 2 should have different TTL + assert.Equal(t, time.Now().Add(2*sc.defaultTTL), sc.byID[srv.ID].expiresAt) + + // Wait for expiration of Server ID 1 + time.Sleep(sc.defaultTTL + 1) + + // Fetch Server ID 2 again, Server ID 1 is not evicted as no refresh happens + srv, err = sc.ByID(ctx, 2) + require.NoError(t, err) + assertServer2(t, srv) + + // Expect two API calls + assert.Equal(t, int64(2), apiCalls.Load()) + + // Server ID 1 is not evicted, because no refresh happened + assert.Len(t, sc.byID, 2) + assert.Len(t, sc.byName, 2) + assertServer1(t, sc.byID[1].value) + assertServer2(t, sc.byID[2].value) + + // Server ID 1 is expired with default TTL + assert.False(t, time.Now().Before(sc.byID[1].expiresAt)) + // Server ID 2 is still fresh -> higher TTL with `WithTTL` option + assert.True(t, time.Now().Before(sc.byID[2].expiresAt)) + }) } -func TestPassthrough_ByName_APIError(t *testing.T) { - client := newCacheTestClient(t, []mockutil.Request{ +func TestServerCacheModePerServer_WithModeRefreshOpts(t *testing.T) { + synctest.Test(t, func(t *testing.T) { + apiCalls := atomic.Int64{} + + sc := newCache[hcloud.Server]( + nil, + nil, + nil, + func(value *hcloud.Server) int64 { return value.ID }, + func(value *hcloud.Server) string { return value.Name }, + "test", + ModePerServer, + 5*time.Second, + ) + + ctx := t.Context() + + // Populate cache with default TTL + sc.fetchOneByID = func(_ context.Context, id int64) (*hcloud.Server, error) { + assert.Equal(t, int64(1), id) + apiCalls.Add(1) + return &hcloud.Server{ID: 1, Name: "test", Status: hcloud.ServerStatusRunning}, nil + } + srv, err := sc.ByID(ctx, 1) + require.NoError(t, err) + assertServer1(t, srv) + assert.Equal(t, time.Now().Add(sc.defaultTTL), sc.byID[srv.ID].expiresAt) + + // Cache miss by ID 2, fetch from API with different TTL + sc.fetchOneByID = func(_ context.Context, id int64) (*hcloud.Server, error) { + assert.Equal(t, int64(2), id) + apiCalls.Add(1) + return &hcloud.Server{ID: 2, Name: "test2", Status: hcloud.ServerStatusOff}, nil + } + // Fetch Server ID 2, use larger TTL + srv, err = sc.ByID(ctx, 2) + require.NoError(t, err) + assertServer2(t, srv) + assert.Equal(t, hcloud.ServerStatusOff, srv.Status) + assert.Equal(t, time.Now().Add(sc.defaultTTL), sc.byID[srv.ID].expiresAt) + + // Wait for expiration of Server ID 1 and 2 + time.Sleep(sc.defaultTTL + 1) + + // Ensure we only call fetchAll + sc.fetchOneByID = nil + sc.fetchAll = func(_ context.Context) ([]*hcloud.Server, error) { + apiCalls.Add(1) + return []*hcloud.Server{ + {ID: 1, Name: "test", Status: hcloud.ServerStatusRunning}, + {ID: 2, Name: "test2", Status: hcloud.ServerStatusRunning}, + }, nil + } + srv, err = sc.ByID(ctx, 1, WithMode(ModeAllServers)) + require.NoError(t, err) + assertServer1(t, srv) + + // Server ID 2 is still valid and got powered on with the last fetch all + srv, err = sc.ByID(ctx, 2) + require.NoError(t, err) + assertServer2(t, srv) + assert.Equal(t, hcloud.ServerStatusRunning, srv.Status) + + // Expect two API calls + assert.Equal(t, int64(3), apiCalls.Load()) + + // Server ID 1 is not evicted, because no refresh happened + assert.Len(t, sc.byID, 2) + assert.Len(t, sc.byName, 2) + assertServer1(t, sc.byID[1].value) + assertServer2(t, sc.byID[2].value) + + // Server ID 1 is expired with default TTL + assert.True(t, time.Now().Before(sc.byID[1].expiresAt)) + // Server ID 2 is still fresh -> higher TTL with `WithTTL` option + assert.True(t, time.Now().Before(sc.byID[2].expiresAt)) + }) +} + +func TestServerCacheAllModesError(t *testing.T) { + testCase := func(t *testing.T, mode Mode) { + apiCalls := atomic.Int64{} + + sc := newCache[hcloud.Server]( + func(_ context.Context, id int64) (*hcloud.Server, error) { + assert.Equal(t, int64(1), id) + apiCalls.Add(1) + return nil, fmt.Errorf("test error") + }, + func(_ context.Context, name string) (*hcloud.Server, error) { + assert.Equal(t, "test", name) + apiCalls.Add(1) + return nil, fmt.Errorf("test error") + }, + func(_ context.Context) ([]*hcloud.Server, error) { + apiCalls.Add(1) + return nil, fmt.Errorf("test error") + }, + func(value *hcloud.Server) int64 { return value.ID }, + func(value *hcloud.Server) string { return value.Name }, + "test", + mode, + 10*time.Second, + ) + + ctx := t.Context() + + // Cache miss by ID 1, fetch from API + srv, err := sc.ByID(ctx, 1) + require.ErrorContains(t, err, "test error") + assert.Nil(t, srv) + + // Error - nothing stored in cache + assert.Empty(t, sc.byID) + assert.Empty(t, sc.byName) + + // Second time still errors - two API calls + srv, err = sc.ByID(ctx, 1) + require.ErrorContains(t, err, "test error") + assert.Nil(t, srv) + assert.Equal(t, int64(2), apiCalls.Load()) + + // Reset for fetch by Name + apiCalls = atomic.Int64{} + + // Cache miss by name "test", fetch from API + srv, err = sc.ByName(ctx, "test") + require.ErrorContains(t, err, "test error") + assert.Nil(t, srv) + + // Error - nothing stored in cache + assert.Empty(t, sc.byID) + assert.Empty(t, sc.byName) + + // Second time still errors - two API calls + srv, err = sc.ByName(ctx, "test") + require.ErrorContains(t, err, "test error") + assert.Nil(t, srv) + assert.Equal(t, int64(2), apiCalls.Load()) + } + + for _, mode := range []Mode{ModeAllServers, ModePerServer, ModeOff} { + t.Run(string(mode), func(t *testing.T) { testCase(t, mode) }) + } +} + +func TestNewServerCache(t *testing.T) { + // Really want to hit 100% coverage :3 + testCases := []struct { + name string + mode Mode + requests []mockutil.Request + }{ { - Method: "GET", Path: "/servers?name=server-1", - Status: http.StatusInternalServerError, - JSON: schema.ErrorResponse{Error: schema.Error{Code: "boom", Message: "upstream exploded"}}, + mode: ModeAllServers, + requests: []mockutil.Request{ + {Method: "GET", Path: "/servers?page=1&per_page=50", Status: 200, JSONRaw: `{ "servers": [{ "id": 1, "name": "test" }]}`}, + }, }, - }) - cache := NewPassthrough(client) + { + mode: ModePerServer, + requests: []mockutil.Request{ + {Method: "GET", Path: "/servers/1", Status: 200, JSONRaw: `{ "server": { "id": 1, "name": "test" }}`}, + }, + }, + { + mode: ModeOff, + requests: []mockutil.Request{ + {Method: "GET", Path: "/servers/1", Status: 200, JSONRaw: `{ "server": { "id": 1, "name": "test" }}`}, + {Method: "GET", Path: "/servers?name=test", Status: 200, JSONRaw: `{ "servers": [{ "id": 1, "name": "test" }]}`}, + }, + }, + } - srv, err := cache.ByName(t.Context(), "server-1") - require.Error(t, err) - assert.Nil(t, srv) + for _, tt := range testCases { + t.Run(string(tt.mode), func(t *testing.T) { + server := mockutil.NewServer(t, tt.requests) + client := hcloud.NewClient(hcloud.WithEndpoint(server.Server.URL)) + + cache := NewServerCache(client, "test", tt.mode, 10*time.Second) + require.NotNil(t, cache) + require.NotNil(t, cache.fetchOneByID) + require.NotNil(t, cache.fetchOneByName) + require.NotNil(t, cache.fetchAll) + require.NotNil(t, cache.getID) + require.NotNil(t, cache.getName) + + ctx := t.Context() + + srv, err := cache.ByID(ctx, int64(1)) + require.NoError(t, err) + assert.NotNil(t, srv) + + srv, err = cache.ByName(ctx, "test") + require.NoError(t, err) + assert.NotNil(t, srv) + }) + } } From 47fd6a0ae2b7a09aea7186f468ba234473155a2f Mon Sep 17 00:00:00 2001 From: lukasmetzner Date: Wed, 3 Jun 2026 13:05:13 +0200 Subject: [PATCH 07/10] refactor: wip cleanup --- hcloud/cloud.go | 2 +- hcloud/cloud_test.go | 2 +- hcloud/instances.go | 3 + internal/config/config.go | 2 +- internal/config/config_test.go | 44 +++++------ internal/servercache/context.go | 21 ++++++ internal/servercache/servercache.go | 93 ++++++++++++++++-------- internal/servercache/servercache_test.go | 35 ++++----- 8 files changed, 126 insertions(+), 76 deletions(-) create mode 100644 internal/servercache/context.go diff --git a/hcloud/cloud.go b/hcloud/cloud.go index 99abd5d00..0082fbee1 100644 --- a/hcloud/cloud.go +++ b/hcloud/cloud.go @@ -146,7 +146,7 @@ func NewCloud(cidr string, nodeLister corelisters.NodeLister) (cloudprovider.Int klog.Infof("Hetzner Cloud k8s cloud controller %s started\n", providerVersion) - instanceCache := servercache.NewServerCache(client, "instances_v2", cfg.Instance.Cache.Mode, cfg.Instance.Cache.TTL) + instanceCache := servercache.NewServerCache(client, cfg.Instance.Cache.Mode, cfg.Instance.Cache.TTL) return &cloud{ client: client, diff --git a/hcloud/cloud_test.go b/hcloud/cloud_test.go index 226a92728..e246b2325 100644 --- a/hcloud/cloud_test.go +++ b/hcloud/cloud_test.go @@ -69,7 +69,7 @@ func newTestEnv() testEnv { ) robotClient := hrobot.NewBasicAuthClient("", "") robotClient.SetBaseURL(server.URL + "/robot") - serverCache := servercache.NewServerCache(client, "instances_v2", servercache.ModePerServer, 10*time.Second) + serverCache := servercache.NewServerCache(client, servercache.ModeOne, 10*time.Second) recorder := record.NewBroadcaster().NewRecorder(scheme.Scheme, corev1.EventSource{Component: "hcloud-cloud-controller-manager"}) cfg := config.HCCMConfiguration{} diff --git a/hcloud/instances.go b/hcloud/instances.go index 9b6bf7840..f08e32068 100644 --- a/hcloud/instances.go +++ b/hcloud/instances.go @@ -40,6 +40,7 @@ import ( const ( ProvidedBy = "instance.hetzner.cloud/provided-by" MisconfiguredInternalIP = "MisconfiguredInternalIP" + instancesV2Subsystem = "instances_v2" ) type instances struct { @@ -81,6 +82,8 @@ func (i *instances) lookupServer( ctx context.Context, node *corev1.Node, ) (genericServer, error) { + ctx = servercache.SetSubsystem(ctx, instancesV2Subsystem) + if node.Spec.ProviderID != "" { var serverID int64 serverID, isCloudServer, err := providerid.ToServerID(node.Spec.ProviderID) diff --git a/internal/config/config.go b/internal/config/config.go index 628f293c0..471470cd9 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -188,7 +188,7 @@ func Read() (HCCMConfiguration, error) { // ---- Instance Cache ---- cfg.Instance.Cache = InstanceConfigurationCache{ - Mode: servercache.ModeAllServers, + Mode: servercache.ModeAll, TTL: InstanceCacheDefaultTTL, } diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 879cab2e5..6237b2ae4 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -26,7 +26,7 @@ func TestRead(t *testing.T) { want: HCCMConfiguration{ Robot: RobotConfiguration{CacheTimeout: 5 * time.Minute}, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAll, TTL: 10 * time.Second}}, Network: NetworkConfiguration{ AttachedCheckEnabled: true, }, @@ -49,7 +49,7 @@ func TestRead(t *testing.T) { HCloudClient: HCloudClientConfiguration{Token: "jr5g7ZHpPptyhJzZyHw2Pqu4g9gTqDvEceYpngPf79jN_NOT_VALID_dzhepnahq"}, Robot: RobotConfiguration{CacheTimeout: 5 * time.Minute}, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAll, TTL: 10 * time.Second}}, Network: NetworkConfiguration{ NameOrID: "foobar", AttachedCheckEnabled: true, @@ -86,7 +86,7 @@ func TestRead(t *testing.T) { ForwardInternalIPs: false, }, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAll, TTL: 10 * time.Second}}, Network: NetworkConfiguration{ AttachedCheckEnabled: true, }, @@ -142,7 +142,7 @@ failed to read ROBOT_PASSWORD_FILE: open /tmp/hetzner-password: no such file or }, Robot: RobotConfiguration{CacheTimeout: 5 * time.Minute}, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAll, TTL: 10 * time.Second}}, Network: NetworkConfiguration{ AttachedCheckEnabled: true, }, @@ -171,7 +171,7 @@ failed to read ROBOT_PASSWORD_FILE: open /tmp/hetzner-password: no such file or want: HCCMConfiguration{ Robot: RobotConfiguration{CacheTimeout: 5 * time.Minute}, Metrics: MetricsConfiguration{Enabled: false, Address: "127.0.0.1:9999"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAll, TTL: 10 * time.Second}}, Network: NetworkConfiguration{ AttachedCheckEnabled: true, }, @@ -202,7 +202,7 @@ failed to read ROBOT_PASSWORD_FILE: open /tmp/hetzner-password: no such file or ForwardInternalIPs: true, }, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAll, TTL: 10 * time.Second}}, Network: NetworkConfiguration{ AttachedCheckEnabled: true, }, @@ -234,7 +234,7 @@ failed to read ROBOT_PASSWORD_FILE: open /tmp/hetzner-password: no such file or ForwardInternalIPs: false, }, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAll, TTL: 10 * time.Second}}, Network: NetworkConfiguration{ AttachedCheckEnabled: true, }, @@ -254,7 +254,7 @@ failed to read ROBOT_PASSWORD_FILE: open /tmp/hetzner-password: no such file or want: HCCMConfiguration{ Robot: RobotConfiguration{CacheTimeout: 5 * time.Minute}, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv6, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv6, Cache: InstanceConfigurationCache{Mode: servercache.ModeAll, TTL: 10 * time.Second}}, Network: NetworkConfiguration{ AttachedCheckEnabled: true, }, @@ -275,7 +275,7 @@ failed to read ROBOT_PASSWORD_FILE: open /tmp/hetzner-password: no such file or want: HCCMConfiguration{ Robot: RobotConfiguration{CacheTimeout: 5 * time.Minute}, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAll, TTL: 10 * time.Second}}, LoadBalancer: LoadBalancerConfiguration{ Enabled: true, PrivateIngressEnabled: true, @@ -298,7 +298,7 @@ failed to read ROBOT_PASSWORD_FILE: open /tmp/hetzner-password: no such file or want: HCCMConfiguration{ Robot: RobotConfiguration{CacheTimeout: 5 * time.Minute}, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAll, TTL: 10 * time.Second}}, LoadBalancer: LoadBalancerConfiguration{ Enabled: true, PrivateIngressEnabled: true, @@ -325,7 +325,7 @@ failed to read ROBOT_PASSWORD_FILE: open /tmp/hetzner-password: no such file or want: HCCMConfiguration{ Robot: RobotConfiguration{CacheTimeout: 5 * time.Minute}, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAll, TTL: 10 * time.Second}}, Network: NetworkConfiguration{ AttachedCheckEnabled: true, }, @@ -414,7 +414,7 @@ func TestHCCMConfiguration_Validate(t *testing.T) { name: "minimal", fields: fields{ HCloudClient: HCloudClientConfiguration{Token: "jr5g7ZHpPptyhJzZyHw2Pqu4g9gTqDvEceYpngPf79jN_NOT_VALID_dzhepnahq"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAll, TTL: 10 * time.Second}}, }, wantErr: nil, }, @@ -424,7 +424,7 @@ func TestHCCMConfiguration_Validate(t *testing.T) { fields: fields{ HCloudClient: HCloudClientConfiguration{Token: "jr5g7ZHpPptyhJzZyHw2Pqu4g9gTqDvEceYpngPf79jN_NOT_VALID_dzhepnahq"}, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAll, TTL: 10 * time.Second}}, Network: NetworkConfiguration{ NameOrID: "foobar", }, @@ -438,7 +438,7 @@ func TestHCCMConfiguration_Validate(t *testing.T) { name: "token missing", fields: fields{ HCloudClient: HCloudClientConfiguration{Token: ""}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAll, TTL: 10 * time.Second}}, }, wantErr: errors.New("environment variable \"HCLOUD_TOKEN\" is required"), }, @@ -446,7 +446,7 @@ func TestHCCMConfiguration_Validate(t *testing.T) { name: "address family invalid", fields: fields{ HCloudClient: HCloudClientConfiguration{Token: "jr5g7ZHpPptyhJzZyHw2Pqu4g9gTqDvEceYpngPf79jN_NOT_VALID_dzhepnahq"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamily("foobar"), Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, + Instance: InstanceConfiguration{AddressFamily: AddressFamily("foobar"), Cache: InstanceConfigurationCache{Mode: servercache.ModeAll, TTL: 10 * time.Second}}, }, wantErr: errors.New("invalid value for \"HCLOUD_INSTANCES_ADDRESS_FAMILY\", expect one of: ipv4,ipv6,dualstack"), }, @@ -454,7 +454,7 @@ func TestHCCMConfiguration_Validate(t *testing.T) { name: "LB location and network zone set", fields: fields{ HCloudClient: HCloudClientConfiguration{Token: "jr5g7ZHpPptyhJzZyHw2Pqu4g9gTqDvEceYpngPf79jN_NOT_VALID_dzhepnahq"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAll, TTL: 10 * time.Second}}, LoadBalancer: LoadBalancerConfiguration{ Location: "nbg1", NetworkZone: "eu-central", @@ -466,7 +466,7 @@ func TestHCCMConfiguration_Validate(t *testing.T) { name: "LB private subnet invalid cidr", fields: fields{ HCloudClient: HCloudClientConfiguration{Token: "jr5g7ZHpPptyhJzZyHw2Pqu4g9gTqDvEceYpngPf79jN_NOT_VALID_dzhepnahq"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAll, TTL: 10 * time.Second}}, LoadBalancer: LoadBalancerConfiguration{ PrivateSubnetIPRange: "10.0.0.0/33", }, @@ -477,7 +477,7 @@ func TestHCCMConfiguration_Validate(t *testing.T) { name: "algorithm type invalid", fields: fields{ HCloudClient: HCloudClientConfiguration{Token: "jr5g7ZHpPptyhJzZyHw2Pqu4g9gTqDvEceYpngPf79jN_NOT_VALID_dzhepnahq"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAll, TTL: 10 * time.Second}}, LoadBalancer: LoadBalancerConfiguration{ AlgorithmType: "invalid", }, @@ -488,7 +488,7 @@ func TestHCCMConfiguration_Validate(t *testing.T) { name: "robot enabled without credentials (valid)", fields: fields{ HCloudClient: HCloudClientConfiguration{Token: "jr5g7ZHpPptyhJzZyHw2Pqu4g9gTqDvEceYpngPf79jN_NOT_VALID_dzhepnahq"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAll, TTL: 10 * time.Second}}, Robot: RobotConfiguration{ Enabled: true, @@ -500,7 +500,7 @@ func TestHCCMConfiguration_Validate(t *testing.T) { name: "robot enabled with partial credentials (only user)", fields: fields{ HCloudClient: HCloudClientConfiguration{Token: "jr5g7ZHpPptyhJzZyHw2Pqu4g9gTqDvEceYpngPf79jN_NOT_VALID_dzhepnahq"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAll, TTL: 10 * time.Second}}, Robot: RobotConfiguration{ Enabled: true, @@ -514,7 +514,7 @@ func TestHCCMConfiguration_Validate(t *testing.T) { name: "robot enabled with partial credentials (only password)", fields: fields{ HCloudClient: HCloudClientConfiguration{Token: "jr5g7ZHpPptyhJzZyHw2Pqu4g9gTqDvEceYpngPf79jN_NOT_VALID_dzhepnahq"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAll, TTL: 10 * time.Second}}, Robot: RobotConfiguration{ Enabled: true, @@ -528,7 +528,7 @@ func TestHCCMConfiguration_Validate(t *testing.T) { name: "robot & routes activated", fields: fields{ HCloudClient: HCloudClientConfiguration{Token: "jr5g7ZHpPptyhJzZyHw2Pqu4g9gTqDvEceYpngPf79jN_NOT_VALID_dzhepnahq"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAll, TTL: 10 * time.Second}}, Route: RouteConfiguration{Enabled: true}, Robot: RobotConfiguration{ Enabled: true, diff --git a/internal/servercache/context.go b/internal/servercache/context.go new file mode 100644 index 000000000..68a0e93b4 --- /dev/null +++ b/internal/servercache/context.go @@ -0,0 +1,21 @@ +package servercache + +import ( + "context" +) + +type key struct{} + +var subsystemKey = key{} + +func SetSubsystem(ctx context.Context, subsystem string) context.Context { + return context.WithValue(ctx, subsystemKey, subsystem) +} + +func GetSubsystem(ctx context.Context) string { + result, ok := ctx.Value(subsystemKey).(string) + if !ok { + return "none" + } + return result +} diff --git a/internal/servercache/servercache.go b/internal/servercache/servercache.go index a23153463..10fbe73a5 100644 --- a/internal/servercache/servercache.go +++ b/internal/servercache/servercache.go @@ -15,9 +15,12 @@ import ( type Mode string const ( - ModeAllServers Mode = "all-server" - ModePerServer Mode = "per-server" - ModeOff Mode = "off" + // ModeAll fetches and caches all Servers. + ModeAll Mode = "all" + // ModeOne fetches and caches one Server. + ModeOne Mode = "one" + // ModeOff disables caching. + ModeOff Mode = "off" ) type RefreshOpts struct { @@ -36,7 +39,7 @@ func newCacheRefreshOpts[T any](cache *Cache[T], opts ...RefreshOption) *Refresh return refreshOpts } -type RefreshOption func(cro *RefreshOpts) +type RefreshOption func(ro *RefreshOpts) func WithTTL(ttl time.Duration) func(*RefreshOpts) { return func(ro *RefreshOpts) { @@ -65,15 +68,13 @@ type Cache[T any] struct { defaultTTL time.Duration defaultMode Mode - subsystem string - byID map[int64]*entry[T] byName map[string]*entry[T] mu sync.Mutex } -func NewServerCache(client *hcloud.Client, subsystem string, defaultMode Mode, defaultTTL time.Duration) *Cache[hcloud.Server] { +func NewServerCache(client *hcloud.Client, defaultMode Mode, defaultTTL time.Duration) *Cache[hcloud.Server] { return newCache[hcloud.Server]( func(ctx context.Context, id int64) (*hcloud.Server, error) { value, _, err := client.Server.GetByID(ctx, id) @@ -89,7 +90,6 @@ func NewServerCache(client *hcloud.Client, subsystem string, defaultMode Mode, d }, func(value *hcloud.Server) int64 { return value.ID }, func(value *hcloud.Server) string { return value.Name }, - subsystem, defaultMode, defaultTTL, ) @@ -101,7 +101,6 @@ func newCache[T any]( fetchAll func(ctx context.Context) ([]*T, error), getID func(value *T) int64, getName func(value *T) string, - subsystem string, defaultMode Mode, defaultTTL time.Duration, ) *Cache[T] { @@ -112,7 +111,6 @@ func newCache[T any]( getID: getID, getName: getName, - subsystem: subsystem, defaultMode: defaultMode, defaultTTL: defaultTTL, @@ -124,6 +122,7 @@ func newCache[T any]( func (c *Cache[T]) ByID(ctx context.Context, id int64, opts ...RefreshOption) (*T, error) { return c.getFromCache( ctx, + GetSubsystem(ctx), func() *entry[T] { return c.byID[id] }, @@ -137,6 +136,7 @@ func (c *Cache[T]) ByID(ctx context.Context, id int64, opts ...RefreshOption) (* func (c *Cache[T]) ByName(ctx context.Context, name string, opts ...RefreshOption) (*T, error) { return c.getFromCache( ctx, + GetSubsystem(ctx), func() *entry[T] { return c.byName[name] }, @@ -149,6 +149,7 @@ func (c *Cache[T]) ByName(ctx context.Context, name string, opts ...RefreshOptio func (c *Cache[T]) getFromCache( ctx context.Context, + subsystem string, lookup func() *entry[T], fetch func() (*T, error), opts ...RefreshOption, @@ -156,8 +157,8 @@ func (c *Cache[T]) getFromCache( refreshOpts := newCacheRefreshOpts(c, opts...) if refreshOpts.mode == ModeOff { - metrics.CacheRequests.WithLabelValues(c.subsystem, string(refreshOpts.mode), "miss").Inc() - klog.V(4).InfoS("cache mode is off") + metrics.CacheRequests.WithLabelValues(subsystem, string(refreshOpts.mode), "miss").Inc() + klog.V(4).InfoS("cache mode is off: fetching entry from api", "subsystem", subsystem) return fetch() } @@ -165,40 +166,51 @@ func (c *Cache[T]) getFromCache( defer c.mu.Unlock() if e := lookup(); e != nil && time.Now().Before(e.expiresAt) { - metrics.CacheRequests.WithLabelValues(c.subsystem, string(refreshOpts.mode), "hit").Inc() - klog.V(4).InfoS("cache hit", "id", c.getID(e.value), "name", c.getName(e.value), "expiresAt", e.expiresAt.Format(time.RFC3339)) + metrics.CacheRequests.WithLabelValues(subsystem, string(refreshOpts.mode), "hit").Inc() + klog.V(4).InfoS( + "cache hit", + "subsystem", subsystem, + "id", c.getID(e.value), + "name", c.getName(e.value), + "expiresAt", e.expiresAt.Format(time.RFC3339), + ) return e.value, nil } switch refreshOpts.mode { - case ModePerServer: - if err := c.refreshPerServer(fetch, refreshOpts.ttl); err != nil { + case ModeOne: + if err := c.refreshOne(subsystem, fetch, refreshOpts.ttl); err != nil { return nil, err } - case ModeAllServers: - if err := c.refreshAllServer(ctx, refreshOpts.ttl); err != nil { + case ModeAll: + if err := c.refreshAll(ctx, subsystem, refreshOpts.ttl); err != nil { return nil, err } case ModeOff: // Handled above -> early return } - metrics.CacheRequests.WithLabelValues(c.subsystem, string(refreshOpts.mode), "miss").Inc() + metrics.CacheRequests.WithLabelValues(subsystem, string(refreshOpts.mode), "miss").Inc() if e := lookup(); e != nil { - klog.V(4).InfoS("entry found after refresh", "id", c.getID(e.value), "name", c.getName(e.value)) + klog.V(4).InfoS( + "entry found after refresh", + "subsystem", subsystem, + "id", c.getID(e.value), "name", c.getName(e.value), + ) return e.value, nil } - klog.V(4).InfoS("entry not found after refresh") + klog.V(4).InfoS("entry not found after refresh", "subsystem", subsystem) return nil, nil } -func (c *Cache[T]) refreshPerServer( +func (c *Cache[T]) refreshOne( + subsystem string, fetch func() (*T, error), ttl time.Duration, ) error { - klog.V(4).InfoS("refreshing server from api") + klog.V(4).InfoS("refreshing entry from api", "subsystem", subsystem) value, err := fetch() if err != nil { return err @@ -212,7 +224,13 @@ func (c *Cache[T]) refreshPerServer( value: value, expiresAt: time.Now().Add(ttl), } - klog.V(4).InfoS("refreshed entry from api", "id", c.getID(e.value), "name", c.getName(e.value), "expiresAt", e.expiresAt.Format(time.RFC3339)) + klog.V(4).InfoS( + "refreshed entry from api", + "subsystem", subsystem, + "id", c.getID(e.value), + "name", c.getName(e.value), + "expiresAt", e.expiresAt.Format(time.RFC3339), + ) c.byID[c.getID(value)] = e c.byName[c.getName(value)] = e @@ -221,14 +239,26 @@ func (c *Cache[T]) refreshPerServer( // Nodes or renamed Servers are cleaned from the cache. maps.DeleteFunc(c.byID, func(_ int64, ev *entry[T]) bool { if time.Now().After(ev.expiresAt) { - klog.V(4).InfoS("evicting entry from cache by id", "id", c.getID(ev.value), "name", c.getName(ev.value), "expiresAt", ev.expiresAt.Format(time.RFC3339)) + klog.V(4).InfoS( + "evicting entry from cache by id", + "subsystem", subsystem, + "id", c.getID(ev.value), + "name", c.getName(ev.value), + "expiresAt", ev.expiresAt.Format(time.RFC3339), + ) return true } return false }) maps.DeleteFunc(c.byName, func(_ string, ev *entry[T]) bool { if time.Now().After(ev.expiresAt) { - klog.V(4).InfoS("evicting entry from cache by name", "id", c.getID(ev.value), "name", c.getName(ev.value), "expiresAt", ev.expiresAt.Format(time.RFC3339)) + klog.V(4).InfoS( + "evicting entry from cache by name", + "subsystem", subsystem, + "id", c.getID(ev.value), + "name", c.getName(ev.value), + "expiresAt", ev.expiresAt.Format(time.RFC3339), + ) return true } return false @@ -237,8 +267,8 @@ func (c *Cache[T]) refreshPerServer( return nil } -func (c *Cache[T]) refreshAllServer(ctx context.Context, ttl time.Duration) error { - klog.V(4).InfoS("refreshing all entries from api") +func (c *Cache[T]) refreshAll(ctx context.Context, subsystem string, ttl time.Duration) error { + klog.V(4).InfoS("refreshing all entries from api", "subsystem", subsystem) values, err := c.fetchAll(ctx) if err != nil { @@ -260,6 +290,11 @@ func (c *Cache[T]) refreshAllServer(ctx context.Context, ttl time.Duration) erro c.byName[c.getName(value)] = e } - klog.V(4).InfoS("refreshed all entries from api", "count", len(values), "expiresAt", expiresAt.Format(time.RFC3339)) + klog.V(4).InfoS( + "refreshed all entries from api", + "subsystem", subsystem, + "count", len(values), + "expiresAt", expiresAt.Format(time.RFC3339), + ) return nil } diff --git a/internal/servercache/servercache_test.go b/internal/servercache/servercache_test.go index a156e0a48..da068f7e3 100644 --- a/internal/servercache/servercache_test.go +++ b/internal/servercache/servercache_test.go @@ -38,8 +38,7 @@ func TestServerCacheModeAllServers(t *testing.T) { nil, func(value *hcloud.Server) int64 { return value.ID }, func(value *hcloud.Server) string { return value.Name }, - "test", - ModeAllServers, + ModeAll, 10*time.Second, ) @@ -97,8 +96,7 @@ func TestServerCacheModeAllServersNotFound(t *testing.T) { nil, func(value *hcloud.Server) int64 { return value.ID }, func(value *hcloud.Server) string { return value.Name }, - "test", - ModeAllServers, + ModeAll, 10*time.Second, ) @@ -147,8 +145,7 @@ func TestServerCacheModePerServer(t *testing.T) { nil, func(value *hcloud.Server) int64 { return value.ID }, func(value *hcloud.Server) string { return value.Name }, - "test", - ModePerServer, + ModeOne, 10*time.Second, ) @@ -204,7 +201,7 @@ func TestServerCacheModePerServer(t *testing.T) { assert.Equal(t, int64(2), apiCalls.Load()) } -func TestServerCacheModePerServerNotFound(t *testing.T) { +func TestServerCacheModeOneNotFound(t *testing.T) { apiCalls := atomic.Int64{} sc := newCache[hcloud.Server]( @@ -213,8 +210,7 @@ func TestServerCacheModePerServerNotFound(t *testing.T) { nil, func(value *hcloud.Server) int64 { return value.ID }, func(value *hcloud.Server) string { return value.Name }, - "test", - ModePerServer, + ModeOne, 10*time.Second, ) @@ -278,7 +274,6 @@ func TestServerCacheModeOff(t *testing.T) { nil, func(value *hcloud.Server) int64 { return value.ID }, func(value *hcloud.Server) string { return value.Name }, - "test", ModeOff, 10*time.Second, ) @@ -352,8 +347,7 @@ func TestServerCacheModePerServer_EvictExpiredEntries(t *testing.T) { nil, func(value *hcloud.Server) int64 { return value.ID }, func(value *hcloud.Server) string { return value.Name }, - "test", - ModePerServer, + ModeOne, 10*time.Second, ) @@ -405,8 +399,7 @@ func TestServerCacheModePerServer_WithTTLRefreshOpts(t *testing.T) { nil, func(value *hcloud.Server) int64 { return value.ID }, func(value *hcloud.Server) string { return value.Name }, - "test", - ModePerServer, + ModeOne, 5*time.Second, ) @@ -471,8 +464,7 @@ func TestServerCacheModePerServer_WithModeRefreshOpts(t *testing.T) { nil, func(value *hcloud.Server) int64 { return value.ID }, func(value *hcloud.Server) string { return value.Name }, - "test", - ModePerServer, + ModeOne, 5*time.Second, ) @@ -514,7 +506,7 @@ func TestServerCacheModePerServer_WithModeRefreshOpts(t *testing.T) { {ID: 2, Name: "test2", Status: hcloud.ServerStatusRunning}, }, nil } - srv, err = sc.ByID(ctx, 1, WithMode(ModeAllServers)) + srv, err = sc.ByID(ctx, 1, WithMode(ModeAll)) require.NoError(t, err) assertServer1(t, srv) @@ -561,7 +553,6 @@ func TestServerCacheAllModesError(t *testing.T) { }, func(value *hcloud.Server) int64 { return value.ID }, func(value *hcloud.Server) string { return value.Name }, - "test", mode, 10*time.Second, ) @@ -602,7 +593,7 @@ func TestServerCacheAllModesError(t *testing.T) { assert.Equal(t, int64(2), apiCalls.Load()) } - for _, mode := range []Mode{ModeAllServers, ModePerServer, ModeOff} { + for _, mode := range []Mode{ModeAll, ModeOne, ModeOff} { t.Run(string(mode), func(t *testing.T) { testCase(t, mode) }) } } @@ -615,13 +606,13 @@ func TestNewServerCache(t *testing.T) { requests []mockutil.Request }{ { - mode: ModeAllServers, + mode: ModeAll, requests: []mockutil.Request{ {Method: "GET", Path: "/servers?page=1&per_page=50", Status: 200, JSONRaw: `{ "servers": [{ "id": 1, "name": "test" }]}`}, }, }, { - mode: ModePerServer, + mode: ModeOne, requests: []mockutil.Request{ {Method: "GET", Path: "/servers/1", Status: 200, JSONRaw: `{ "server": { "id": 1, "name": "test" }}`}, }, @@ -640,7 +631,7 @@ func TestNewServerCache(t *testing.T) { server := mockutil.NewServer(t, tt.requests) client := hcloud.NewClient(hcloud.WithEndpoint(server.Server.URL)) - cache := NewServerCache(client, "test", tt.mode, 10*time.Second) + cache := NewServerCache(client, tt.mode, 10*time.Second) require.NotNil(t, cache) require.NotNil(t, cache.fetchOneByID) require.NotNil(t, cache.fetchOneByName) From 035c73f5ab24347c1733a7ed257bfcaa53ae524a Mon Sep 17 00:00:00 2001 From: lukasmetzner Date: Wed, 3 Jun 2026 13:30:39 +0200 Subject: [PATCH 08/10] refactor: gobal config option --- hcloud/cloud.go | 2 +- internal/config/config.go | 28 +++++++++-------- internal/config/config_test.go | 55 ++++++++++++++++++++-------------- 3 files changed, 49 insertions(+), 36 deletions(-) diff --git a/hcloud/cloud.go b/hcloud/cloud.go index 0082fbee1..67bbeb713 100644 --- a/hcloud/cloud.go +++ b/hcloud/cloud.go @@ -146,7 +146,7 @@ func NewCloud(cidr string, nodeLister corelisters.NodeLister) (cloudprovider.Int klog.Infof("Hetzner Cloud k8s cloud controller %s started\n", providerVersion) - instanceCache := servercache.NewServerCache(client, cfg.Instance.Cache.Mode, cfg.Instance.Cache.TTL) + instanceCache := servercache.NewServerCache(client, cfg.Cache.Mode, cfg.Cache.TTL) return &cloud{ client: client, diff --git a/internal/config/config.go b/internal/config/config.go index 471470cd9..91d476ad8 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -30,8 +30,8 @@ const ( robotForwardInternalIPs = "ROBOT_FORWARD_INTERNAL_IPS" hcloudInstancesAddressFamily = "HCLOUD_INSTANCES_ADDRESS_FAMILY" - hcloudInstancesCacheMode = "HCLOUD_INSTANCES_CACHE_MODE" - hcloudInstancesCacheTTL = "HCLOUD_INSTANCES_CACHE_TTL" + hcloudCacheMode = "HCLOUD_CACHE_MODE" + hcloudCacheTTL = "HCLOUD_CACHE_TTL" // Disable the "master/server is attached to the network" check against the metadata service. hcloudNetworkDisableAttachedCheck = "HCLOUD_NETWORK_DISABLE_ATTACHED_CHECK" @@ -70,14 +70,13 @@ const ( AddressFamilyIPv4 AddressFamily = "ipv4" ) -const InstanceCacheDefaultTTL time.Duration = 10 * time.Second +const CacheDefaultTTL time.Duration = 10 * time.Second type InstanceConfiguration struct { AddressFamily AddressFamily - Cache InstanceConfigurationCache } -type InstanceConfigurationCache struct { +type CacheConfiguration struct { Mode servercache.Mode TTL time.Duration } @@ -116,6 +115,7 @@ type HCCMConfiguration struct { LoadBalancer LoadBalancerConfiguration Network NetworkConfiguration Route RouteConfiguration + Cache CacheConfiguration } // Read evaluates all environment variables and returns a [HCCMConfiguration]. It only validates as far as @@ -185,23 +185,25 @@ func Read() (HCCMConfiguration, error) { cfg.Instance.AddressFamily = AddressFamilyIPv4 } - // ---- Instance Cache ---- + // ---- Server Cache ---- - cfg.Instance.Cache = InstanceConfigurationCache{ + cfg.Cache = CacheConfiguration{ Mode: servercache.ModeAll, - TTL: InstanceCacheDefaultTTL, + TTL: CacheDefaultTTL, } - if mode, ok := os.LookupEnv(hcloudInstancesCacheMode); ok { - cfg.Instance.Cache.Mode = servercache.Mode(mode) + if mode, ok := os.LookupEnv(hcloudCacheMode); ok { + klog.Warningf("Experimental: %s is experimental, breaking changes may occur within minor releases.", hcloudCacheMode) + cfg.Cache.Mode = servercache.Mode(mode) } - if ttlStr, ok := os.LookupEnv(hcloudInstancesCacheTTL); ok { + if ttlStr, ok := os.LookupEnv(hcloudCacheTTL); ok { + klog.Warningf("Experimental: %s is experimental, breaking changes may occur within minor releases.", hcloudCacheTTL) ttl, err := time.ParseDuration(ttlStr) if err != nil { - errs = append(errs, fmt.Errorf("invalid value for %q: %w", hcloudInstancesCacheTTL, err)) + errs = append(errs, fmt.Errorf("invalid value for %q: %w", hcloudCacheTTL, err)) } else { - cfg.Instance.Cache.TTL = ttl + cfg.Cache.TTL = ttl } } diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 6237b2ae4..ccf88a00b 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -26,7 +26,8 @@ func TestRead(t *testing.T) { want: HCCMConfiguration{ Robot: RobotConfiguration{CacheTimeout: 5 * time.Minute}, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAll, TTL: 10 * time.Second}}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Cache: CacheConfiguration{Mode: servercache.ModeAll, TTL: 10 * time.Second}, Network: NetworkConfiguration{ AttachedCheckEnabled: true, }, @@ -49,7 +50,8 @@ func TestRead(t *testing.T) { HCloudClient: HCloudClientConfiguration{Token: "jr5g7ZHpPptyhJzZyHw2Pqu4g9gTqDvEceYpngPf79jN_NOT_VALID_dzhepnahq"}, Robot: RobotConfiguration{CacheTimeout: 5 * time.Minute}, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAll, TTL: 10 * time.Second}}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Cache: CacheConfiguration{Mode: servercache.ModeAll, TTL: 10 * time.Second}, Network: NetworkConfiguration{ NameOrID: "foobar", AttachedCheckEnabled: true, @@ -86,7 +88,8 @@ func TestRead(t *testing.T) { ForwardInternalIPs: false, }, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAll, TTL: 10 * time.Second}}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Cache: CacheConfiguration{Mode: servercache.ModeAll, TTL: 10 * time.Second}, Network: NetworkConfiguration{ AttachedCheckEnabled: true, }, @@ -142,7 +145,8 @@ failed to read ROBOT_PASSWORD_FILE: open /tmp/hetzner-password: no such file or }, Robot: RobotConfiguration{CacheTimeout: 5 * time.Minute}, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAll, TTL: 10 * time.Second}}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Cache: CacheConfiguration{Mode: servercache.ModeAll, TTL: 10 * time.Second}, Network: NetworkConfiguration{ AttachedCheckEnabled: true, }, @@ -171,7 +175,8 @@ failed to read ROBOT_PASSWORD_FILE: open /tmp/hetzner-password: no such file or want: HCCMConfiguration{ Robot: RobotConfiguration{CacheTimeout: 5 * time.Minute}, Metrics: MetricsConfiguration{Enabled: false, Address: "127.0.0.1:9999"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAll, TTL: 10 * time.Second}}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Cache: CacheConfiguration{Mode: servercache.ModeAll, TTL: 10 * time.Second}, Network: NetworkConfiguration{ AttachedCheckEnabled: true, }, @@ -202,7 +207,8 @@ failed to read ROBOT_PASSWORD_FILE: open /tmp/hetzner-password: no such file or ForwardInternalIPs: true, }, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAll, TTL: 10 * time.Second}}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Cache: CacheConfiguration{Mode: servercache.ModeAll, TTL: 10 * time.Second}, Network: NetworkConfiguration{ AttachedCheckEnabled: true, }, @@ -234,7 +240,8 @@ failed to read ROBOT_PASSWORD_FILE: open /tmp/hetzner-password: no such file or ForwardInternalIPs: false, }, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAll, TTL: 10 * time.Second}}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Cache: CacheConfiguration{Mode: servercache.ModeAll, TTL: 10 * time.Second}, Network: NetworkConfiguration{ AttachedCheckEnabled: true, }, @@ -254,7 +261,8 @@ failed to read ROBOT_PASSWORD_FILE: open /tmp/hetzner-password: no such file or want: HCCMConfiguration{ Robot: RobotConfiguration{CacheTimeout: 5 * time.Minute}, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv6, Cache: InstanceConfigurationCache{Mode: servercache.ModeAll, TTL: 10 * time.Second}}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv6}, + Cache: CacheConfiguration{Mode: servercache.ModeAll, TTL: 10 * time.Second}, Network: NetworkConfiguration{ AttachedCheckEnabled: true, }, @@ -275,7 +283,8 @@ failed to read ROBOT_PASSWORD_FILE: open /tmp/hetzner-password: no such file or want: HCCMConfiguration{ Robot: RobotConfiguration{CacheTimeout: 5 * time.Minute}, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAll, TTL: 10 * time.Second}}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Cache: CacheConfiguration{Mode: servercache.ModeAll, TTL: 10 * time.Second}, LoadBalancer: LoadBalancerConfiguration{ Enabled: true, PrivateIngressEnabled: true, @@ -298,7 +307,8 @@ failed to read ROBOT_PASSWORD_FILE: open /tmp/hetzner-password: no such file or want: HCCMConfiguration{ Robot: RobotConfiguration{CacheTimeout: 5 * time.Minute}, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAll, TTL: 10 * time.Second}}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Cache: CacheConfiguration{Mode: servercache.ModeAll, TTL: 10 * time.Second}, LoadBalancer: LoadBalancerConfiguration{ Enabled: true, PrivateIngressEnabled: true, @@ -325,7 +335,8 @@ failed to read ROBOT_PASSWORD_FILE: open /tmp/hetzner-password: no such file or want: HCCMConfiguration{ Robot: RobotConfiguration{CacheTimeout: 5 * time.Minute}, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAll, TTL: 10 * time.Second}}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Cache: CacheConfiguration{Mode: servercache.ModeAll, TTL: 10 * time.Second}, Network: NetworkConfiguration{ AttachedCheckEnabled: true, }, @@ -414,7 +425,7 @@ func TestHCCMConfiguration_Validate(t *testing.T) { name: "minimal", fields: fields{ HCloudClient: HCloudClientConfiguration{Token: "jr5g7ZHpPptyhJzZyHw2Pqu4g9gTqDvEceYpngPf79jN_NOT_VALID_dzhepnahq"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAll, TTL: 10 * time.Second}}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, }, wantErr: nil, }, @@ -424,7 +435,7 @@ func TestHCCMConfiguration_Validate(t *testing.T) { fields: fields{ HCloudClient: HCloudClientConfiguration{Token: "jr5g7ZHpPptyhJzZyHw2Pqu4g9gTqDvEceYpngPf79jN_NOT_VALID_dzhepnahq"}, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAll, TTL: 10 * time.Second}}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, Network: NetworkConfiguration{ NameOrID: "foobar", }, @@ -438,7 +449,7 @@ func TestHCCMConfiguration_Validate(t *testing.T) { name: "token missing", fields: fields{ HCloudClient: HCloudClientConfiguration{Token: ""}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAll, TTL: 10 * time.Second}}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, }, wantErr: errors.New("environment variable \"HCLOUD_TOKEN\" is required"), }, @@ -446,7 +457,7 @@ func TestHCCMConfiguration_Validate(t *testing.T) { name: "address family invalid", fields: fields{ HCloudClient: HCloudClientConfiguration{Token: "jr5g7ZHpPptyhJzZyHw2Pqu4g9gTqDvEceYpngPf79jN_NOT_VALID_dzhepnahq"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamily("foobar"), Cache: InstanceConfigurationCache{Mode: servercache.ModeAll, TTL: 10 * time.Second}}, + Instance: InstanceConfiguration{AddressFamily: AddressFamily("foobar")}, }, wantErr: errors.New("invalid value for \"HCLOUD_INSTANCES_ADDRESS_FAMILY\", expect one of: ipv4,ipv6,dualstack"), }, @@ -454,7 +465,7 @@ func TestHCCMConfiguration_Validate(t *testing.T) { name: "LB location and network zone set", fields: fields{ HCloudClient: HCloudClientConfiguration{Token: "jr5g7ZHpPptyhJzZyHw2Pqu4g9gTqDvEceYpngPf79jN_NOT_VALID_dzhepnahq"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAll, TTL: 10 * time.Second}}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, LoadBalancer: LoadBalancerConfiguration{ Location: "nbg1", NetworkZone: "eu-central", @@ -466,7 +477,7 @@ func TestHCCMConfiguration_Validate(t *testing.T) { name: "LB private subnet invalid cidr", fields: fields{ HCloudClient: HCloudClientConfiguration{Token: "jr5g7ZHpPptyhJzZyHw2Pqu4g9gTqDvEceYpngPf79jN_NOT_VALID_dzhepnahq"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAll, TTL: 10 * time.Second}}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, LoadBalancer: LoadBalancerConfiguration{ PrivateSubnetIPRange: "10.0.0.0/33", }, @@ -477,7 +488,7 @@ func TestHCCMConfiguration_Validate(t *testing.T) { name: "algorithm type invalid", fields: fields{ HCloudClient: HCloudClientConfiguration{Token: "jr5g7ZHpPptyhJzZyHw2Pqu4g9gTqDvEceYpngPf79jN_NOT_VALID_dzhepnahq"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAll, TTL: 10 * time.Second}}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, LoadBalancer: LoadBalancerConfiguration{ AlgorithmType: "invalid", }, @@ -488,7 +499,7 @@ func TestHCCMConfiguration_Validate(t *testing.T) { name: "robot enabled without credentials (valid)", fields: fields{ HCloudClient: HCloudClientConfiguration{Token: "jr5g7ZHpPptyhJzZyHw2Pqu4g9gTqDvEceYpngPf79jN_NOT_VALID_dzhepnahq"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAll, TTL: 10 * time.Second}}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, Robot: RobotConfiguration{ Enabled: true, @@ -500,7 +511,7 @@ func TestHCCMConfiguration_Validate(t *testing.T) { name: "robot enabled with partial credentials (only user)", fields: fields{ HCloudClient: HCloudClientConfiguration{Token: "jr5g7ZHpPptyhJzZyHw2Pqu4g9gTqDvEceYpngPf79jN_NOT_VALID_dzhepnahq"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAll, TTL: 10 * time.Second}}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, Robot: RobotConfiguration{ Enabled: true, @@ -514,7 +525,7 @@ func TestHCCMConfiguration_Validate(t *testing.T) { name: "robot enabled with partial credentials (only password)", fields: fields{ HCloudClient: HCloudClientConfiguration{Token: "jr5g7ZHpPptyhJzZyHw2Pqu4g9gTqDvEceYpngPf79jN_NOT_VALID_dzhepnahq"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAll, TTL: 10 * time.Second}}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, Robot: RobotConfiguration{ Enabled: true, @@ -528,7 +539,7 @@ func TestHCCMConfiguration_Validate(t *testing.T) { name: "robot & routes activated", fields: fields{ HCloudClient: HCloudClientConfiguration{Token: "jr5g7ZHpPptyhJzZyHw2Pqu4g9gTqDvEceYpngPf79jN_NOT_VALID_dzhepnahq"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAll, TTL: 10 * time.Second}}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, Route: RouteConfiguration{Enabled: true}, Robot: RobotConfiguration{ Enabled: true, From e5ca590d52343acc7ba6419cd72289f124fc9e9c Mon Sep 17 00:00:00 2001 From: lukasmetzner Date: Wed, 3 Jun 2026 13:34:21 +0200 Subject: [PATCH 09/10] docs: update cache settings --- docs/reference/instance_cache.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/reference/instance_cache.md b/docs/reference/instance_cache.md index 6ba6491a2..65461c00a 100644 --- a/docs/reference/instance_cache.md +++ b/docs/reference/instance_cache.md @@ -1,23 +1,23 @@ # Instance Cache -> **Experimental:** Instance caching is experimental, breaking changes may occur within minor releases. We believe the implementation is safe in practice — that is why it ships enabled by default (`all-server`). Set `HCLOUD_INSTANCES_CACHE_MODE=off` to opt out. +> **Experimental:** Instance caching is experimental, breaking changes may occur within minor releases. We believe the implementation is safe in practice — that is why it ships enabled by default (`all-server`). Set `HCLOUD_CACHE_MODE=off` to opt out. The instance cache reduces calls to the Hetzner Cloud API made by the `InstancesV2` controller, which looks up Servers by ID or name to reconcile Node state. The cache sits between the controller and the Hetzner Cloud API; behavior is controlled by the environment variables below. ## Environment Variables -| Name | Type | Default | Description | -| ----------------------------- | --------------------------------- | ------------ | ------------------------------------------------------------------------------------- | -| `HCLOUD_INSTANCES_CACHE_MODE` | `all-server \| per-server \| off` | `all-server` | Selects the caching strategy. See [Modes](#modes) below. | -| `HCLOUD_INSTANCES_CACHE_TTL` | `duration` | `10s` | Lifetime of cached entries. Accepts any Go `time.Duration` string (e.g. `30s`, `2m`). | +| Name | Type | Default | Description | +| ------------------- | ------------------- | ------- | ------------------------------------------------------------------------------------- | +| `HCLOUD_CACHE_MODE` | `all \| one \| off` | `all` | Selects the caching strategy. See [Modes](#modes) below. | +| `HCLOUD_CACHE_TTL` | `duration` | `10s` | Lifetime of cached entries. Accepts any Go `time.Duration` string (e.g. `30s`, `2m`). | ## Modes -### `all-server` +### `all` Fetches every Server in the project with a single `GET /servers` call and serves all subsequent `ByID` / `ByName` lookups from the resulting snapshot until the TTL expires. The snapshot is refreshed on the next lookup after expiry. On a cache miss within the TTL (e.g. a freshly created Server), one rate-limited refresh per TTL window is allowed to pick up the new Server; further misses in the same window return without an API call. -### `per-server` +### `one` Caches each Server individually with its own expiration. A `ByID` / `ByName` lookup either returns a non-expired entry or issues a `GET /servers/{id}` (or `GET /servers?name=`) call and stores the result. Expired entries are evicted lazily when other entries are inserted. From a2c56c8f8e41833facc64651e4003a64a7432831 Mon Sep 17 00:00:00 2001 From: jo Date: Wed, 3 Jun 2026 16:23:49 +0200 Subject: [PATCH 10/10] test: simplify call counting and client mocking --- internal/servercache/servercache_test.go | 244 +++++++++++------------ 1 file changed, 111 insertions(+), 133 deletions(-) diff --git a/internal/servercache/servercache_test.go b/internal/servercache/servercache_test.go index da068f7e3..3c20596b9 100644 --- a/internal/servercache/servercache_test.go +++ b/internal/servercache/servercache_test.go @@ -3,7 +3,6 @@ package servercache import ( "context" "fmt" - "sync/atomic" "testing" "testing/synctest" "time" @@ -29,9 +28,55 @@ func assertServer2(t *testing.T, server *hcloud.Server) { assert.Equal(t, "test2", server.Name) } -func TestServerCacheModeAllServers(t *testing.T) { - apiCalls := atomic.Int64{} +type testClient struct { + t *testing.T + callCount int +} + +func newTestClient(t *testing.T) *testClient { + return &testClient{t: t, callCount: 0} +} + +func (c *testClient) CallCount() int { + return c.callCount +} + +func (c *testClient) FetchAllFunc(servers []*hcloud.Server, err error) func(context.Context) ([]*hcloud.Server, error) { + return func(context.Context) ([]*hcloud.Server, error) { + c.t.Helper() + + c.callCount++ + return servers, err + } +} + +func (c *testClient) FetchOneByIDFunc(server *hcloud.Server, err error) func(context.Context, int64) (*hcloud.Server, error) { + return func(_ context.Context, id int64) (*hcloud.Server, error) { + c.t.Helper() + + if server != nil { + require.Equal(c.t, server.ID, id, "fetch one by id expected id %d, got %d", server.ID, id) + } + + c.callCount++ + return server, err + } +} + +func (c *testClient) FetchOneByNameFunc(server *hcloud.Server, err error) func(context.Context, string) (*hcloud.Server, error) { + return func(_ context.Context, name string) (*hcloud.Server, error) { + c.t.Helper() + + if server != nil { + require.Equal(c.t, server.Name, name, "fetch one by name expected name %s, got %s", server.Name, name) + } + c.callCount++ + return server, err + } +} + +func TestServerCacheModeAllServers(t *testing.T) { sc := newCache[hcloud.Server]( nil, nil, @@ -43,19 +88,17 @@ func TestServerCacheModeAllServers(t *testing.T) { ) ctx := t.Context() + client := newTestClient(t) // Cache miss by ID 1, fetch from API - sc.fetchAll = func(_ context.Context) ([]*hcloud.Server, error) { - apiCalls.Add(1) - return []*hcloud.Server{{ID: 1, Name: "test"}, {ID: 2, Name: "test2"}}, nil - } + sc.fetchAll = client.FetchAllFunc([]*hcloud.Server{{ID: 1, Name: "test"}, {ID: 2, Name: "test2"}}, nil) srv, err := sc.ByID(ctx, 1) require.NoError(t, err) assertServer1(t, srv) // Fetch all returns 2 servers - assert.Equal(t, int64(1), apiCalls.Load()) + assert.Equal(t, 1, client.CallCount()) assert.Len(t, sc.byID, 2) assert.Len(t, sc.byName, 2) @@ -84,12 +127,10 @@ func TestServerCacheModeAllServers(t *testing.T) { assertServer2(t, srv) // Fetched two Servers with one API call - assert.Equal(t, int64(1), apiCalls.Load()) + assert.Equal(t, 1, client.CallCount()) } func TestServerCacheModeAllServersNotFound(t *testing.T) { - apiCalls := atomic.Int64{} - sc := newCache[hcloud.Server]( nil, nil, @@ -101,19 +142,17 @@ func TestServerCacheModeAllServersNotFound(t *testing.T) { ) ctx := t.Context() + client := newTestClient(t) // Cache miss by ID 1, fetch from API, not found - sc.fetchAll = func(_ context.Context) ([]*hcloud.Server, error) { - apiCalls.Add(1) - return []*hcloud.Server{{ID: 2, Name: "test2"}, {ID: 3, Name: "test3"}}, nil - } + sc.fetchAll = client.FetchAllFunc([]*hcloud.Server{{ID: 2, Name: "test2"}, {ID: 3, Name: "test3"}}, nil) srv, err := sc.ByID(ctx, 1) require.NoError(t, err) assert.Nil(t, srv) // Fetch all returns 2 servers - assert.Equal(t, int64(1), apiCalls.Load()) + assert.Equal(t, 1, client.CallCount()) assert.Len(t, sc.byID, 2) assert.Len(t, sc.byName, 2) @@ -133,12 +172,10 @@ func TestServerCacheModeAllServersNotFound(t *testing.T) { assert.Nil(t, srv) // Fetched two Servers with one API call - assert.Equal(t, int64(2), apiCalls.Load()) + assert.Equal(t, 2, client.CallCount()) } func TestServerCacheModePerServer(t *testing.T) { - apiCalls := atomic.Int64{} - sc := newCache[hcloud.Server]( nil, nil, @@ -150,20 +187,17 @@ func TestServerCacheModePerServer(t *testing.T) { ) ctx := t.Context() + client := newTestClient(t) // Cache miss by ID 1, fetch from API - sc.fetchOneByID = func(_ context.Context, id int64) (*hcloud.Server, error) { - assert.Equal(t, int64(1), id) - apiCalls.Add(1) - return &hcloud.Server{ID: 1, Name: "test"}, nil - } + sc.fetchOneByID = client.FetchOneByIDFunc(&hcloud.Server{ID: 1, Name: "test"}, nil) srv, err := sc.ByID(ctx, 1) require.NoError(t, err) assertServer1(t, srv) // Fetched one server - assert.Equal(t, int64(1), apiCalls.Load()) + assert.Equal(t, 1, client.CallCount()) assert.Len(t, sc.byID, 1) assert.Len(t, sc.byName, 1) @@ -177,11 +211,7 @@ func TestServerCacheModePerServer(t *testing.T) { assertServer1(t, srv) // Cache miss by ID 2, fetch from API - sc.fetchOneByID = func(_ context.Context, id int64) (*hcloud.Server, error) { - assert.Equal(t, int64(2), id) - apiCalls.Add(1) - return &hcloud.Server{ID: 2, Name: "test2"}, nil - } + sc.fetchOneByID = client.FetchOneByIDFunc(&hcloud.Server{ID: 2, Name: "test2"}, nil) srv, err = sc.ByID(ctx, 2) require.NoError(t, err) @@ -198,12 +228,10 @@ func TestServerCacheModePerServer(t *testing.T) { assertServer2(t, srv) // Fetched two servers individually - assert.Equal(t, int64(2), apiCalls.Load()) + assert.Equal(t, 2, client.CallCount()) } func TestServerCacheModeOneNotFound(t *testing.T) { - apiCalls := atomic.Int64{} - sc := newCache[hcloud.Server]( nil, nil, @@ -215,59 +243,46 @@ func TestServerCacheModeOneNotFound(t *testing.T) { ) ctx := t.Context() + client := newTestClient(t) // Cache miss by ID 1, fetch from API, not found - sc.fetchOneByID = func(_ context.Context, id int64) (*hcloud.Server, error) { - assert.Equal(t, int64(1), id) - apiCalls.Add(1) - return nil, nil - } + sc.fetchOneByID = client.FetchOneByIDFunc(nil, nil) srv, err := sc.ByID(ctx, 1) require.NoError(t, err) assert.Nil(t, srv) // Cached zero server - assert.Equal(t, int64(1), apiCalls.Load()) + assert.Equal(t, 1, client.CallCount()) assert.Empty(t, sc.byID) assert.Empty(t, sc.byName) // Cache miss by ID 2, fetch from API - sc.fetchOneByID = func(_ context.Context, id int64) (*hcloud.Server, error) { - assert.Equal(t, int64(2), id) - apiCalls.Add(1) - return &hcloud.Server{ID: 2, Name: "test2"}, nil - } + sc.fetchOneByID = client.FetchOneByIDFunc(&hcloud.Server{ID: 2, Name: "test2"}, nil) srv, err = sc.ByID(ctx, 2) require.NoError(t, err) assertServer2(t, srv) // Cached one server - assert.Equal(t, int64(2), apiCalls.Load()) + assert.Equal(t, 2, client.CallCount()) assert.Len(t, sc.byID, 1) assert.Len(t, sc.byName, 1) // Cache miss by ID 1, fetch from API, not found - sc.fetchOneByID = func(_ context.Context, id int64) (*hcloud.Server, error) { - assert.Equal(t, int64(1), id) - apiCalls.Add(1) - return nil, nil - } + sc.fetchOneByID = client.FetchOneByIDFunc(nil, nil) srv, err = sc.ByID(ctx, 1) require.NoError(t, err) assert.Nil(t, srv) // Fetched zero server - assert.Equal(t, int64(3), apiCalls.Load()) + assert.Equal(t, 3, client.CallCount()) assert.Len(t, sc.byID, 1) assert.Len(t, sc.byName, 1) } func TestServerCacheModeOff(t *testing.T) { - apiCalls := atomic.Int64{} - sc := newCache[hcloud.Server]( nil, nil, @@ -279,20 +294,17 @@ func TestServerCacheModeOff(t *testing.T) { ) ctx := t.Context() + client := newTestClient(t) // Cache miss by ID 1, fetch from API - sc.fetchOneByID = func(_ context.Context, id int64) (*hcloud.Server, error) { - assert.Equal(t, int64(1), id) - apiCalls.Add(1) - return &hcloud.Server{ID: 1, Name: "test"}, nil - } + sc.fetchOneByID = client.FetchOneByIDFunc(&hcloud.Server{ID: 1, Name: "test"}, nil) srv, err := sc.ByID(ctx, 1) require.NoError(t, err) assertServer1(t, srv) // Fetched one server - assert.Equal(t, int64(1), apiCalls.Load()) + assert.Equal(t, 1, client.CallCount()) assert.Empty(t, sc.byID) assert.Empty(t, sc.byName) @@ -301,28 +313,24 @@ func TestServerCacheModeOff(t *testing.T) { require.NoError(t, err) assertServer1(t, srv) - assert.Equal(t, int64(2), apiCalls.Load()) + assert.Equal(t, 2, client.CallCount()) // Entries are not cached assert.Empty(t, sc.byID) assert.Empty(t, sc.byName) // Reset sc.fetchOneByID = nil - apiCalls = atomic.Int64{} + client = newTestClient(t) // Cache miss by Name "test", fetch from API - sc.fetchOneByName = func(_ context.Context, name string) (*hcloud.Server, error) { - assert.Equal(t, "test", name) - apiCalls.Add(1) - return &hcloud.Server{ID: 1, Name: "test"}, nil - } + sc.fetchOneByName = client.FetchOneByNameFunc(&hcloud.Server{ID: 1, Name: "test"}, nil) srv, err = sc.ByName(ctx, "test") require.NoError(t, err) assertServer1(t, srv) // Fetched one server - assert.Equal(t, int64(1), apiCalls.Load()) + assert.Equal(t, 1, client.CallCount()) assert.Empty(t, sc.byID) assert.Empty(t, sc.byName) @@ -331,7 +339,7 @@ func TestServerCacheModeOff(t *testing.T) { require.NoError(t, err) assertServer1(t, srv) - assert.Equal(t, int64(2), apiCalls.Load()) + assert.Equal(t, 2, client.CallCount()) // Entries are not cached assert.Empty(t, sc.byID) assert.Empty(t, sc.byName) @@ -339,8 +347,6 @@ func TestServerCacheModeOff(t *testing.T) { func TestServerCacheModePerServer_EvictExpiredEntries(t *testing.T) { synctest.Test(t, func(t *testing.T) { - apiCalls := atomic.Int64{} - sc := newCache[hcloud.Server]( nil, nil, @@ -352,13 +358,11 @@ func TestServerCacheModePerServer_EvictExpiredEntries(t *testing.T) { ) ctx := t.Context() + client := newTestClient(t) // Populate cache - sc.fetchOneByID = func(_ context.Context, id int64) (*hcloud.Server, error) { - assert.Equal(t, int64(1), id) - apiCalls.Add(1) - return &hcloud.Server{ID: 1, Name: "test"}, nil - } + sc.fetchOneByID = client.FetchOneByIDFunc(&hcloud.Server{ID: 1, Name: "test"}, nil) + srv, err := sc.ByID(ctx, 1) require.NoError(t, err) @@ -368,18 +372,14 @@ func TestServerCacheModePerServer_EvictExpiredEntries(t *testing.T) { time.Sleep(sc.defaultTTL + 1) // Cache miss by ID 2, fetch from API - sc.fetchOneByID = func(_ context.Context, id int64) (*hcloud.Server, error) { - assert.Equal(t, int64(2), id) - apiCalls.Add(1) - return &hcloud.Server{ID: 2, Name: "test2"}, nil - } + sc.fetchOneByID = client.FetchOneByIDFunc(&hcloud.Server{ID: 2, Name: "test2"}, nil) srv, err = sc.ByID(ctx, 2) require.NoError(t, err) assertServer2(t, srv) // Fetched two servers individually - assert.Equal(t, int64(2), apiCalls.Load()) + assert.Equal(t, 2, client.CallCount()) // Server ID 1 has been evicted assert.Len(t, sc.byID, 1) @@ -391,8 +391,6 @@ func TestServerCacheModePerServer_EvictExpiredEntries(t *testing.T) { func TestServerCacheModePerServer_WithTTLRefreshOpts(t *testing.T) { synctest.Test(t, func(t *testing.T) { - apiCalls := atomic.Int64{} - sc := newCache[hcloud.Server]( nil, nil, @@ -404,24 +402,18 @@ func TestServerCacheModePerServer_WithTTLRefreshOpts(t *testing.T) { ) ctx := t.Context() + client := newTestClient(t) // Populate cache with default TTL - sc.fetchOneByID = func(_ context.Context, id int64) (*hcloud.Server, error) { - assert.Equal(t, int64(1), id) - apiCalls.Add(1) - return &hcloud.Server{ID: 1, Name: "test"}, nil - } + sc.fetchOneByID = client.FetchOneByIDFunc(&hcloud.Server{ID: 1, Name: "test"}, nil) + srv, err := sc.ByID(ctx, 1) require.NoError(t, err) assertServer1(t, srv) assert.Equal(t, time.Now().Add(sc.defaultTTL), sc.byID[srv.ID].expiresAt) // Cache miss by ID 2, fetch from API with different TTL - sc.fetchOneByID = func(_ context.Context, id int64) (*hcloud.Server, error) { - assert.Equal(t, int64(2), id) - apiCalls.Add(1) - return &hcloud.Server{ID: 2, Name: "test2"}, nil - } + sc.fetchOneByID = client.FetchOneByIDFunc(&hcloud.Server{ID: 2, Name: "test2"}, nil) // Fetch Server ID 2, use larger TTL srv, err = sc.ByID(ctx, 2, WithTTL(2*sc.defaultTTL)) @@ -439,7 +431,7 @@ func TestServerCacheModePerServer_WithTTLRefreshOpts(t *testing.T) { assertServer2(t, srv) // Expect two API calls - assert.Equal(t, int64(2), apiCalls.Load()) + assert.Equal(t, 2, client.CallCount()) // Server ID 1 is not evicted, because no refresh happened assert.Len(t, sc.byID, 2) @@ -456,8 +448,6 @@ func TestServerCacheModePerServer_WithTTLRefreshOpts(t *testing.T) { func TestServerCacheModePerServer_WithModeRefreshOpts(t *testing.T) { synctest.Test(t, func(t *testing.T) { - apiCalls := atomic.Int64{} - sc := newCache[hcloud.Server]( nil, nil, @@ -469,24 +459,19 @@ func TestServerCacheModePerServer_WithModeRefreshOpts(t *testing.T) { ) ctx := t.Context() + client := newTestClient(t) // Populate cache with default TTL - sc.fetchOneByID = func(_ context.Context, id int64) (*hcloud.Server, error) { - assert.Equal(t, int64(1), id) - apiCalls.Add(1) - return &hcloud.Server{ID: 1, Name: "test", Status: hcloud.ServerStatusRunning}, nil - } + sc.fetchOneByID = client.FetchOneByIDFunc(&hcloud.Server{ID: 1, Name: "test", Status: hcloud.ServerStatusRunning}, nil) + srv, err := sc.ByID(ctx, 1) require.NoError(t, err) assertServer1(t, srv) assert.Equal(t, time.Now().Add(sc.defaultTTL), sc.byID[srv.ID].expiresAt) // Cache miss by ID 2, fetch from API with different TTL - sc.fetchOneByID = func(_ context.Context, id int64) (*hcloud.Server, error) { - assert.Equal(t, int64(2), id) - apiCalls.Add(1) - return &hcloud.Server{ID: 2, Name: "test2", Status: hcloud.ServerStatusOff}, nil - } + sc.fetchOneByID = client.FetchOneByIDFunc(&hcloud.Server{ID: 2, Name: "test2", Status: hcloud.ServerStatusOff}, nil) + // Fetch Server ID 2, use larger TTL srv, err = sc.ByID(ctx, 2) require.NoError(t, err) @@ -499,13 +484,11 @@ func TestServerCacheModePerServer_WithModeRefreshOpts(t *testing.T) { // Ensure we only call fetchAll sc.fetchOneByID = nil - sc.fetchAll = func(_ context.Context) ([]*hcloud.Server, error) { - apiCalls.Add(1) - return []*hcloud.Server{ - {ID: 1, Name: "test", Status: hcloud.ServerStatusRunning}, - {ID: 2, Name: "test2", Status: hcloud.ServerStatusRunning}, - }, nil - } + sc.fetchAll = client.FetchAllFunc([]*hcloud.Server{ + {ID: 1, Name: "test", Status: hcloud.ServerStatusRunning}, + {ID: 2, Name: "test2", Status: hcloud.ServerStatusRunning}, + }, nil) + srv, err = sc.ByID(ctx, 1, WithMode(ModeAll)) require.NoError(t, err) assertServer1(t, srv) @@ -517,7 +500,7 @@ func TestServerCacheModePerServer_WithModeRefreshOpts(t *testing.T) { assert.Equal(t, hcloud.ServerStatusRunning, srv.Status) // Expect two API calls - assert.Equal(t, int64(3), apiCalls.Load()) + assert.Equal(t, 3, client.CallCount()) // Server ID 1 is not evicted, because no refresh happened assert.Len(t, sc.byID, 2) @@ -534,23 +517,10 @@ func TestServerCacheModePerServer_WithModeRefreshOpts(t *testing.T) { func TestServerCacheAllModesError(t *testing.T) { testCase := func(t *testing.T, mode Mode) { - apiCalls := atomic.Int64{} - sc := newCache[hcloud.Server]( - func(_ context.Context, id int64) (*hcloud.Server, error) { - assert.Equal(t, int64(1), id) - apiCalls.Add(1) - return nil, fmt.Errorf("test error") - }, - func(_ context.Context, name string) (*hcloud.Server, error) { - assert.Equal(t, "test", name) - apiCalls.Add(1) - return nil, fmt.Errorf("test error") - }, - func(_ context.Context) ([]*hcloud.Server, error) { - apiCalls.Add(1) - return nil, fmt.Errorf("test error") - }, + nil, + nil, + nil, func(value *hcloud.Server) int64 { return value.ID }, func(value *hcloud.Server) string { return value.Name }, mode, @@ -558,6 +528,11 @@ func TestServerCacheAllModesError(t *testing.T) { ) ctx := t.Context() + client := newTestClient(t) + + sc.fetchOneByID = client.FetchOneByIDFunc(nil, fmt.Errorf("test error")) + sc.fetchOneByName = client.FetchOneByNameFunc(nil, fmt.Errorf("test error")) + sc.fetchAll = client.FetchAllFunc(nil, fmt.Errorf("test error")) // Cache miss by ID 1, fetch from API srv, err := sc.ByID(ctx, 1) @@ -572,10 +547,13 @@ func TestServerCacheAllModesError(t *testing.T) { srv, err = sc.ByID(ctx, 1) require.ErrorContains(t, err, "test error") assert.Nil(t, srv) - assert.Equal(t, int64(2), apiCalls.Load()) + assert.Equal(t, 2, client.CallCount()) // Reset for fetch by Name - apiCalls = atomic.Int64{} + client = newTestClient(t) + sc.fetchOneByID = client.FetchOneByIDFunc(nil, fmt.Errorf("test error")) + sc.fetchOneByName = client.FetchOneByNameFunc(nil, fmt.Errorf("test error")) + sc.fetchAll = client.FetchAllFunc(nil, fmt.Errorf("test error")) // Cache miss by name "test", fetch from API srv, err = sc.ByName(ctx, "test") @@ -590,7 +568,7 @@ func TestServerCacheAllModesError(t *testing.T) { srv, err = sc.ByName(ctx, "test") require.ErrorContains(t, err, "test error") assert.Nil(t, srv) - assert.Equal(t, int64(2), apiCalls.Load()) + assert.Equal(t, 2, client.CallCount()) } for _, mode := range []Mode{ModeAll, ModeOne, ModeOff} {