diff --git a/dev/Dockerfile b/dev/Dockerfile index 2a2c05500..15ad84a2f 100644 --- a/dev/Dockerfile +++ b/dev/Dockerfile @@ -1,4 +1,4 @@ -FROM golang:1.26 as builder +FROM golang:1.26 AS builder WORKDIR /build diff --git a/docs/reference/README.md b/docs/reference/README.md index 82e995f24..12942c0be 100644 --- a/docs/reference/README.md +++ b/docs/reference/README.md @@ -6,3 +6,4 @@ In this folder, you should find technical references material of the hcloud-clou - [Version Policy](version-policy.md) - [Load Balancer Annotations](load_balancer_annotations.md) - [Load Balancer Environment Variables](load_balancer_envs.md) +- [Instance Cache](instance_cache.md) diff --git a/docs/reference/instance_cache.md b/docs/reference/instance_cache.md new file mode 100644 index 000000000..65461c00a --- /dev/null +++ b/docs/reference/instance_cache.md @@ -0,0 +1,26 @@ +# Instance Cache + +> **Experimental:** Instance caching is experimental, breaking changes may occur within minor releases. We believe the implementation is safe in practice — that is why it ships enabled by default (`all`). Set `HCLOUD_CACHE_MODE=off` to opt out. + +The instance cache reduces calls to the Hetzner Cloud API made by the `InstancesV2` controller, which looks up Servers by ID or name to reconcile Node state. The cache sits between the controller and the Hetzner Cloud API; behavior is controlled by the environment variables below. + +## Environment Variables + +| Name | Type | Default | Description | +| ------------------- | ------------------- | ------- | ------------------------------------------------------------------------------------- | +| `HCLOUD_CACHE_MODE` | `all \| one \| off` | `all` | Selects the caching strategy. See [Modes](#modes) below. | +| `HCLOUD_CACHE_TTL` | `duration` | `10s` | Lifetime of cached entries. Accepts any Go `time.Duration` string (e.g. `30s`, `2m`). 
| + +## Modes + +### `all` + +Fetches every Server in the project with a single `GET /servers` call and serves all subsequent `ByID` / `ByName` lookups from the resulting snapshot until the TTL expires. The snapshot is refreshed on the next lookup after expiry. On a cache miss within the TTL (e.g. a freshly created Server), the whole snapshot is refreshed immediately to pick up the new Server. Misses are not rate limited: every lookup for a Server that is not in the snapshot triggers another `GET /servers` call. + +### `one` + +Caches each Server individually with its own expiration. A `ByID` / `ByName` lookup either returns a non-expired entry or issues a `GET /servers/{id}` (or `GET /servers?name=`) call and stores the result. Expired entries are evicted lazily when other entries are inserted. + +### `off` + +Disables caching entirely. Every lookup goes directly to the API. diff --git a/hcloud/cloud.go b/hcloud/cloud.go index ab1bd4cdf..67bbeb713 100644 --- a/hcloud/cloud.go +++ b/hcloud/cloud.go @@ -37,6 +37,7 @@ import ( "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/hcops" "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/metrics" "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/robot" + "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/servercache" "github.com/hetznercloud/hcloud-go/v2/hcloud" "github.com/hetznercloud/hcloud-go/v2/hcloud/metadata" ) @@ -50,13 +51,14 @@ const ( var providerVersion = "unknown" type cloud struct { - client *hcloud.Client - robotClient hrobot.RobotClient - cfg config.HCCMConfiguration - recorder record.EventRecorder - networkID int64 - cidr string - nodeLister corelisters.NodeLister + client *hcloud.Client + robotClient hrobot.RobotClient + instanceCache *servercache.Cache[hcloud.Server] + cfg config.HCCMConfiguration + recorder record.EventRecorder + networkID int64 + cidr string + nodeLister corelisters.NodeLister } func NewCloud(cidr string, nodeLister corelisters.NodeLister) (cloudprovider.Interface, error) { 
@@ -144,13 +146,16 @@ func NewCloud(cidr string, nodeLister corelisters.NodeLister) (cloudprovider.Int klog.Infof("Hetzner Cloud k8s cloud controller %s started\n", providerVersion) + instanceCache := servercache.NewServerCache(client, cfg.Cache.Mode, cfg.Cache.TTL) + return &cloud{ - client: client, - robotClient: robotClient, - cfg: cfg, - networkID: networkID, - cidr: cidr, - nodeLister: nodeLister, + client: client, + robotClient: robotClient, + instanceCache: instanceCache, + cfg: cfg, + networkID: networkID, + cidr: cidr, + nodeLister: nodeLister, }, nil } @@ -175,7 +180,7 @@ func (c *cloud) Instances() (cloudprovider.Instances, bool) { } func (c *cloud) InstancesV2() (cloudprovider.InstancesV2, bool) { - return newInstances(c.client, c.robotClient, c.recorder, c.networkID, c.cfg), true + return newInstances(c.client, c.robotClient, c.instanceCache, c.recorder, c.networkID, c.cfg), true } func (c *cloud) Zones() (cloudprovider.Zones, bool) { diff --git a/hcloud/cloud_test.go b/hcloud/cloud_test.go index 28e363c95..e246b2325 100644 --- a/hcloud/cloud_test.go +++ b/hcloud/cloud_test.go @@ -31,6 +31,7 @@ import ( "k8s.io/client-go/tools/record" "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/config" + "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/servercache" "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/testsupport" "github.com/hetznercloud/hcloud-go/v2/hcloud" "github.com/hetznercloud/hcloud-go/v2/hcloud/schema" @@ -41,6 +42,7 @@ type testEnv struct { Mux *http.ServeMux Client *hcloud.Client RobotClient hrobot.RobotClient + ServerCache *servercache.Cache[hcloud.Server] Recorder record.EventRecorder Cfg config.HCCMConfiguration } @@ -51,6 +53,7 @@ func (env *testEnv) Teardown() { env.Mux = nil env.Client = nil env.RobotClient = nil + env.ServerCache = nil env.Recorder = nil } @@ -66,6 +69,7 @@ func newTestEnv() testEnv { ) robotClient := hrobot.NewBasicAuthClient("", "") robotClient.SetBaseURL(server.URL 
+ "/robot") + serverCache := servercache.NewServerCache(client, servercache.ModeOne, 10*time.Second) recorder := record.NewBroadcaster().NewRecorder(scheme.Scheme, corev1.EventSource{Component: "hcloud-cloud-controller-manager"}) cfg := config.HCCMConfiguration{} @@ -76,6 +80,7 @@ func newTestEnv() testEnv { Mux: mux, Client: client, RobotClient: robotClient, + ServerCache: serverCache, Recorder: recorder, Cfg: cfg, } diff --git a/hcloud/instances.go b/hcloud/instances.go index a6f7242b5..f08e32068 100644 --- a/hcloud/instances.go +++ b/hcloud/instances.go @@ -33,17 +33,20 @@ import ( "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/legacydatacenter" "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/metrics" "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/providerid" + "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/servercache" "github.com/hetznercloud/hcloud-go/v2/hcloud" ) const ( ProvidedBy = "instance.hetzner.cloud/provided-by" MisconfiguredInternalIP = "MisconfiguredInternalIP" + instancesV2Subsystem = "instances_v2" ) type instances struct { client *hcloud.Client robotClient hrobot.RobotClient + serverCache *servercache.Cache[hcloud.Server] recorder record.EventRecorder networkID int64 cfg config.HCCMConfiguration @@ -57,6 +60,7 @@ var ( func newInstances( client *hcloud.Client, robotClient hrobot.RobotClient, + serverCache *servercache.Cache[hcloud.Server], recorder record.EventRecorder, networkID int64, cfg config.HCCMConfiguration, @@ -64,6 +68,7 @@ func newInstances( return &instances{ client, robotClient, + serverCache, recorder, networkID, cfg, @@ -77,16 +82,17 @@ func (i *instances) lookupServer( ctx context.Context, node *corev1.Node, ) (genericServer, error) { + ctx = servercache.SetSubsystem(ctx, instancesV2Subsystem) + if node.Spec.ProviderID != "" { var serverID int64 serverID, isCloudServer, err := providerid.ToServerID(node.Spec.ProviderID) - if err != nil { return nil, 
fmt.Errorf("failed to convert provider id to server id: %w", err) } if isCloudServer { - server, err := getCloudServerByID(ctx, i.client, serverID) + server, err := i.serverCache.ByID(ctx, serverID) if err != nil { return nil, fmt.Errorf("failed to get hcloud server \"%d\": %w", serverID, err) } @@ -115,7 +121,7 @@ func (i *instances) lookupServer( // If the node has no provider ID we try to find the server by name from // both sources. In case we find two servers, we return an error. - cloudServer, err := getCloudServerByName(ctx, i.client, node.Name) + cloudServer, err := i.serverCache.ByName(ctx, node.Name) if err != nil { return nil, fmt.Errorf("failed to get hcloud server %q: %w", node.Name, err) } @@ -153,6 +159,7 @@ func (i *instances) lookupServer( func (i *instances) InstanceExists(ctx context.Context, node *corev1.Node) (bool, error) { const op = "hcloud/instancesv2.InstanceExists" metrics.OperationCalled.WithLabelValues(op).Inc() + klog.V(4).InfoS("InstanceExists called", "node", node.Name, "providerID", node.Spec.ProviderID) server, err := i.lookupServer(ctx, node) if err != nil { @@ -165,6 +172,7 @@ func (i *instances) InstanceExists(ctx context.Context, node *corev1.Node) (bool func (i *instances) InstanceShutdown(ctx context.Context, node *corev1.Node) (bool, error) { const op = "hcloud/instancesv2.InstanceShutdown" metrics.OperationCalled.WithLabelValues(op).Inc() + klog.V(4).InfoS("InstanceShutdown called", "node", node.Name, "providerID", node.Spec.ProviderID) server, err := i.lookupServer(ctx, node) if err != nil { @@ -174,7 +182,8 @@ func (i *instances) InstanceShutdown(ctx context.Context, node *corev1.Node) (bo if server == nil { return false, fmt.Errorf( "%s: failed to get instance metadata: no matching server found for node '%s': %w", - op, node.Name, errServerNotFound) + op, node.Name, errServerNotFound, + ) } isShutdown, err := server.IsShutdown() @@ -188,6 +197,7 @@ func (i *instances) InstanceShutdown(ctx context.Context, node 
*corev1.Node) (bo func (i *instances) InstanceMetadata(ctx context.Context, node *corev1.Node) (*cloudprovider.InstanceMetadata, error) { const op = "hcloud/instancesv2.InstanceMetadata" metrics.OperationCalled.WithLabelValues(op).Inc() + klog.V(4).InfoS("InstanceMetadata called", "node", node.Name, "providerID", node.Spec.ProviderID) server, err := i.lookupServer(ctx, node) if err != nil { @@ -197,7 +207,8 @@ func (i *instances) InstanceMetadata(ctx context.Context, node *corev1.Node) (*c if server == nil { return nil, fmt.Errorf( "%s: failed to get instance metadata: no matching server found for node '%s': %w", - op, node.Name, errServerNotFound) + op, node.Name, errServerNotFound, + ) } metadata, err := server.Metadata(i.networkID, node, i.cfg) diff --git a/hcloud/instances_test.go b/hcloud/instances_test.go index 5f4aef91b..81871a3f2 100644 --- a/hcloud/instances_test.go +++ b/hcloud/instances_test.go @@ -91,7 +91,7 @@ func TestInstances_InstanceExists(t *testing.T) { }) }) - instances := newInstances(env.Client, env.RobotClient, env.Recorder, 0, env.Cfg) + instances := newInstances(env.Client, env.RobotClient, env.ServerCache, env.Recorder, 0, env.Cfg) tests := []struct { name string @@ -104,7 +104,8 @@ func TestInstances_InstanceExists(t *testing.T) { Spec: corev1.NodeSpec{ProviderID: "hcloud://1"}, }, expected: true, - }, { + }, + { name: "existing robot server by id", node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ @@ -123,25 +124,29 @@ func TestInstances_InstanceExists(t *testing.T) { Spec: corev1.NodeSpec{ProviderID: "hcloud://bm-321"}, }, expected: true, - }, { + }, + { name: "missing server by id", node: &corev1.Node{ Spec: corev1.NodeSpec{ProviderID: "hcloud://2"}, }, expected: false, - }, { + }, + { name: "missing robot server by id", node: &corev1.Node{ Spec: corev1.NodeSpec{ProviderID: "hrobot://322"}, }, expected: false, - }, { + }, + { name: "missing robot server by (legacy) id", node: &corev1.Node{ Spec: corev1.NodeSpec{ProviderID: 
"hcloud://bm-322"}, }, expected: false, - }, { + }, + { name: "existing server by name", node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ @@ -149,7 +154,8 @@ func TestInstances_InstanceExists(t *testing.T) { }, }, expected: true, - }, { + }, + { name: "existing robot server by name", node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ @@ -157,7 +163,8 @@ func TestInstances_InstanceExists(t *testing.T) { }, }, expected: true, - }, { + }, + { name: "missing server by name", node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ @@ -165,7 +172,8 @@ func TestInstances_InstanceExists(t *testing.T) { }, }, expected: false, - }, { + }, + { name: "missing robot server by name", node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ @@ -211,7 +219,7 @@ func TestInstances_InstanceShutdown(t *testing.T) { }) }) - instances := newInstances(env.Client, env.RobotClient, env.Recorder, 0, env.Cfg) + instances := newInstances(env.Client, env.RobotClient, env.ServerCache, env.Recorder, 0, env.Cfg) env.Mux.HandleFunc("/robot/server/3", func(w http.ResponseWriter, _ *http.Request) { json.NewEncoder(w).Encode(hrobotmodels.ServerResponse{ Server: hrobotmodels.Server{ @@ -274,13 +282,15 @@ func TestInstances_InstanceShutdown(t *testing.T) { Spec: corev1.NodeSpec{ProviderID: "hcloud://1"}, }, expected: false, - }, { + }, + { name: "[cloud] shutdown", node: &corev1.Node{ Spec: corev1.NodeSpec{ProviderID: "hcloud://2"}, }, expected: true, - }, { + }, + { name: "[robot] running", node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ @@ -289,7 +299,8 @@ func TestInstances_InstanceShutdown(t *testing.T) { Spec: corev1.NodeSpec{ProviderID: "hrobot://3"}, }, expected: false, - }, { + }, + { name: "[robot] shutdown", node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ @@ -346,7 +357,7 @@ func TestInstances_InstanceMetadata(t *testing.T) { }) }) - instances := newInstances(env.Client, env.RobotClient, env.Recorder, 0, env.Cfg) + instances := newInstances(env.Client, env.RobotClient, env.ServerCache, 
env.Recorder, 0, env.Cfg) metadata, err := instances.InstanceMetadata(context.TODO(), &corev1.Node{ Spec: corev1.NodeSpec{ProviderID: "hcloud://1"}, @@ -390,7 +401,7 @@ func TestInstances_InstanceMetadataRobotServer(t *testing.T) { }) }) - instances := newInstances(env.Client, env.RobotClient, env.Recorder, 0, env.Cfg) + instances := newInstances(env.Client, env.RobotClient, env.ServerCache, env.Recorder, 0, env.Cfg) metadata, err := instances.InstanceMetadata(context.TODO(), &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ diff --git a/hcloud/instances_util.go b/hcloud/instances_util.go index 370c68c9a..a86f0b447 100644 --- a/hcloud/instances_util.go +++ b/hcloud/instances_util.go @@ -17,7 +17,6 @@ limitations under the License. package hcloud import ( - "context" "fmt" "regexp" "strings" @@ -26,9 +25,6 @@ import ( hrobotmodels "github.com/syself/hrobot-go/models" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/runtime" - - "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/metrics" - "github.com/hetznercloud/hcloud-go/v2/hcloud" ) type MockEventRecorder struct{} @@ -51,29 +47,6 @@ func (er *MockEventRecorder) AnnotatedEventf( ) { } -func getCloudServerByName(ctx context.Context, c *hcloud.Client, name string) (*hcloud.Server, error) { - const op = "hcloud/getCloudServerByName" - metrics.OperationCalled.WithLabelValues(op).Inc() - - server, _, err := c.Server.GetByName(ctx, name) - if err != nil { - return nil, fmt.Errorf("%s: %w", op, err) - } - - return server, nil -} - -func getCloudServerByID(ctx context.Context, c *hcloud.Client, id int64) (*hcloud.Server, error) { - const op = "hcloud/getCloudServerByID" - metrics.OperationCalled.WithLabelValues(op).Inc() - - server, _, err := c.Server.GetByID(ctx, id) - if err != nil { - return nil, fmt.Errorf("%s: %w", op, err) - } - return server, nil -} - func getRobotServerByName(c hrobot.RobotClient, node *corev1.Node) (server *hrobotmodels.Server, err error) { const op = "hcloud/getRobotServerByName" 
diff --git a/internal/config/config.go b/internal/config/config.go index 32c08f409..91d476ad8 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -11,6 +11,7 @@ import ( "k8s.io/klog/v2" + "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/servercache" "github.com/hetznercloud/hcloud-go/v2/hcloud" "github.com/hetznercloud/hcloud-go/v2/hcloud/exp/kit/envutil" ) @@ -29,6 +30,8 @@ const ( robotForwardInternalIPs = "ROBOT_FORWARD_INTERNAL_IPS" hcloudInstancesAddressFamily = "HCLOUD_INSTANCES_ADDRESS_FAMILY" + hcloudCacheMode = "HCLOUD_CACHE_MODE" + hcloudCacheTTL = "HCLOUD_CACHE_TTL" // Disable the "master/server is attached to the network" check against the metadata service. hcloudNetworkDisableAttachedCheck = "HCLOUD_NETWORK_DISABLE_ATTACHED_CHECK" @@ -67,10 +70,17 @@ const ( AddressFamilyIPv4 AddressFamily = "ipv4" ) +const CacheDefaultTTL time.Duration = 10 * time.Second + type InstanceConfiguration struct { AddressFamily AddressFamily } +type CacheConfiguration struct { + Mode servercache.Mode + TTL time.Duration +} + type LoadBalancerConfiguration struct { AlgorithmType hcloud.LoadBalancerAlgorithmType DisablePublicNetwork *bool @@ -105,6 +115,7 @@ type HCCMConfiguration struct { LoadBalancer LoadBalancerConfiguration Network NetworkConfiguration Route RouteConfiguration + Cache CacheConfiguration } // Read evaluates all environment variables and returns a [HCCMConfiguration]. 
It only validates as far as @@ -174,6 +185,28 @@ func Read() (HCCMConfiguration, error) { cfg.Instance.AddressFamily = AddressFamilyIPv4 } + // ---- Server Cache ---- + + cfg.Cache = CacheConfiguration{ + Mode: servercache.ModeAll, + TTL: CacheDefaultTTL, + } + + if mode, ok := os.LookupEnv(hcloudCacheMode); ok { + klog.Warningf("Experimental: %s is experimental, breaking changes may occur within minor releases.", hcloudCacheMode) + cfg.Cache.Mode = servercache.Mode(mode) + } + + if ttlStr, ok := os.LookupEnv(hcloudCacheTTL); ok { + klog.Warningf("Experimental: %s is experimental, breaking changes may occur within minor releases.", hcloudCacheTTL) + ttl, err := time.ParseDuration(ttlStr) + if err != nil { + errs = append(errs, fmt.Errorf("invalid value for %q: %w", hcloudCacheTTL, err)) + } else { + cfg.Cache.TTL = ttl + } + } + cfg.LoadBalancer.Enabled, err = getEnvBool(hcloudLoadBalancersEnabled, true) if err != nil { errs = append(errs, err) diff --git a/internal/config/config_test.go b/internal/config/config_test.go index d6fb65ede..ccf88a00b 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -7,6 +7,7 @@ import ( "github.com/stretchr/testify/assert" + "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/servercache" "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/testsupport" "github.com/hetznercloud/hcloud-go/v2/hcloud" ) @@ -26,6 +27,7 @@ func TestRead(t *testing.T) { Robot: RobotConfiguration{CacheTimeout: 5 * time.Minute}, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Cache: CacheConfiguration{Mode: servercache.ModeAll, TTL: 10 * time.Second}, Network: NetworkConfiguration{ AttachedCheckEnabled: true, }, @@ -49,6 +51,7 @@ func TestRead(t *testing.T) { Robot: RobotConfiguration{CacheTimeout: 5 * time.Minute}, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, Instance: 
InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Cache: CacheConfiguration{Mode: servercache.ModeAll, TTL: 10 * time.Second}, Network: NetworkConfiguration{ NameOrID: "foobar", AttachedCheckEnabled: true, @@ -86,6 +89,7 @@ func TestRead(t *testing.T) { }, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Cache: CacheConfiguration{Mode: servercache.ModeAll, TTL: 10 * time.Second}, Network: NetworkConfiguration{ AttachedCheckEnabled: true, }, @@ -142,6 +146,7 @@ failed to read ROBOT_PASSWORD_FILE: open /tmp/hetzner-password: no such file or Robot: RobotConfiguration{CacheTimeout: 5 * time.Minute}, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Cache: CacheConfiguration{Mode: servercache.ModeAll, TTL: 10 * time.Second}, Network: NetworkConfiguration{ AttachedCheckEnabled: true, }, @@ -171,6 +176,7 @@ failed to read ROBOT_PASSWORD_FILE: open /tmp/hetzner-password: no such file or Robot: RobotConfiguration{CacheTimeout: 5 * time.Minute}, Metrics: MetricsConfiguration{Enabled: false, Address: "127.0.0.1:9999"}, Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Cache: CacheConfiguration{Mode: servercache.ModeAll, TTL: 10 * time.Second}, Network: NetworkConfiguration{ AttachedCheckEnabled: true, }, @@ -202,6 +208,7 @@ failed to read ROBOT_PASSWORD_FILE: open /tmp/hetzner-password: no such file or }, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Cache: CacheConfiguration{Mode: servercache.ModeAll, TTL: 10 * time.Second}, Network: NetworkConfiguration{ AttachedCheckEnabled: true, }, @@ -234,6 +241,7 @@ failed to read ROBOT_PASSWORD_FILE: open /tmp/hetzner-password: no such file or }, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, Instance: InstanceConfiguration{AddressFamily: 
AddressFamilyIPv4}, + Cache: CacheConfiguration{Mode: servercache.ModeAll, TTL: 10 * time.Second}, Network: NetworkConfiguration{ AttachedCheckEnabled: true, }, @@ -254,6 +262,7 @@ failed to read ROBOT_PASSWORD_FILE: open /tmp/hetzner-password: no such file or Robot: RobotConfiguration{CacheTimeout: 5 * time.Minute}, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv6}, + Cache: CacheConfiguration{Mode: servercache.ModeAll, TTL: 10 * time.Second}, Network: NetworkConfiguration{ AttachedCheckEnabled: true, }, @@ -275,6 +284,7 @@ failed to read ROBOT_PASSWORD_FILE: open /tmp/hetzner-password: no such file or Robot: RobotConfiguration{CacheTimeout: 5 * time.Minute}, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Cache: CacheConfiguration{Mode: servercache.ModeAll, TTL: 10 * time.Second}, LoadBalancer: LoadBalancerConfiguration{ Enabled: true, PrivateIngressEnabled: true, @@ -298,6 +308,7 @@ failed to read ROBOT_PASSWORD_FILE: open /tmp/hetzner-password: no such file or Robot: RobotConfiguration{CacheTimeout: 5 * time.Minute}, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Cache: CacheConfiguration{Mode: servercache.ModeAll, TTL: 10 * time.Second}, LoadBalancer: LoadBalancerConfiguration{ Enabled: true, PrivateIngressEnabled: true, @@ -325,6 +336,7 @@ failed to read ROBOT_PASSWORD_FILE: open /tmp/hetzner-password: no such file or Robot: RobotConfiguration{CacheTimeout: 5 * time.Minute}, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Cache: CacheConfiguration{Mode: servercache.ModeAll, TTL: 10 * time.Second}, Network: NetworkConfiguration{ AttachedCheckEnabled: true, }, @@ -526,7 +538,6 @@ func TestHCCMConfiguration_Validate(t *testing.T) { 
{ name: "robot & routes activated", fields: fields{ - HCloudClient: HCloudClientConfiguration{Token: "jr5g7ZHpPptyhJzZyHw2Pqu4g9gTqDvEceYpngPf79jN_NOT_VALID_dzhepnahq"}, Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, Route: RouteConfiguration{Enabled: true}, diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go index 46663e62d..d17d79a76 100644 --- a/internal/metrics/metrics.go +++ b/internal/metrics/metrics.go @@ -37,10 +37,15 @@ var ( Name: "cloud_controller_manager_operations_total", Help: "The total number of operation was called", }, []string{"op"}) + + CacheRequests = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "cloud_controller_manager_cache_requests_total", + Help: "Total cache requests partitioned by cache name and result.", + }, []string{"subsystem", "mode", "result"}) ) func init() { - GetRegistry().MustRegister(OperationCalled) + GetRegistry().MustRegister(OperationCalled, CacheRequests) } func GetRegistry() prometheus.Registerer { diff --git a/internal/servercache/context.go b/internal/servercache/context.go new file mode 100644 index 000000000..68a0e93b4 --- /dev/null +++ b/internal/servercache/context.go @@ -0,0 +1,21 @@ +package servercache + +import ( + "context" +) + +type key struct{} + +var subsystemKey = key{} + +func SetSubsystem(ctx context.Context, subsystem string) context.Context { + return context.WithValue(ctx, subsystemKey, subsystem) +} + +func GetSubsystem(ctx context.Context) string { + result, ok := ctx.Value(subsystemKey).(string) + if !ok { + return "none" + } + return result +} diff --git a/internal/servercache/servercache.go b/internal/servercache/servercache.go new file mode 100644 index 000000000..10fbe73a5 --- /dev/null +++ b/internal/servercache/servercache.go @@ -0,0 +1,300 @@ +package servercache + +import ( + "context" + "maps" + "sync" + "time" + + "k8s.io/klog/v2" + + "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/metrics" + 
"github.com/hetznercloud/hcloud-go/v2/hcloud" +) + +type Mode string + +const ( + // ModeAll fetches and caches all Servers. + ModeAll Mode = "all" + // ModeOne fetches and caches one Server. + ModeOne Mode = "one" + // ModeOff disables caching. + ModeOff Mode = "off" +) + +type RefreshOpts struct { + ttl time.Duration + mode Mode +} + +func newCacheRefreshOpts[T any](cache *Cache[T], opts ...RefreshOption) *RefreshOpts { + refreshOpts := &RefreshOpts{ + ttl: cache.defaultTTL, + mode: cache.defaultMode, + } + for _, opt := range opts { + opt(refreshOpts) + } + return refreshOpts +} + +type RefreshOption func(ro *RefreshOpts) + +func WithTTL(ttl time.Duration) func(*RefreshOpts) { + return func(ro *RefreshOpts) { + ro.ttl = ttl + } +} + +func WithMode(mode Mode) func(*RefreshOpts) { + return func(ro *RefreshOpts) { + ro.mode = mode + } +} + +type entry[T any] struct { + expiresAt time.Time + value *T +} + +type Cache[T any] struct { + fetchOneByID func(ctx context.Context, id int64) (*T, error) + fetchOneByName func(ctx context.Context, name string) (*T, error) + fetchAll func(ctx context.Context) ([]*T, error) + getID func(value *T) int64 + getName func(value *T) string + + defaultTTL time.Duration + defaultMode Mode + + byID map[int64]*entry[T] + byName map[string]*entry[T] + + mu sync.Mutex +} + +func NewServerCache(client *hcloud.Client, defaultMode Mode, defaultTTL time.Duration) *Cache[hcloud.Server] { + return newCache[hcloud.Server]( + func(ctx context.Context, id int64) (*hcloud.Server, error) { + value, _, err := client.Server.GetByID(ctx, id) + return value, err + }, + func(ctx context.Context, name string) (*hcloud.Server, error) { + value, _, err := client.Server.GetByName(ctx, name) + return value, err + }, + func(ctx context.Context) ([]*hcloud.Server, error) { + values, err := client.Server.All(ctx) + return values, err + }, + func(value *hcloud.Server) int64 { return value.ID }, + func(value *hcloud.Server) string { return value.Name }, + 
defaultMode, + defaultTTL, + ) +} + +func newCache[T any]( + fetchOneByID func(ctx context.Context, id int64) (*T, error), + fetchOneByName func(ctx context.Context, name string) (*T, error), + fetchAll func(ctx context.Context) ([]*T, error), + getID func(value *T) int64, + getName func(value *T) string, + defaultMode Mode, + defaultTTL time.Duration, +) *Cache[T] { + return &Cache[T]{ + fetchOneByID: fetchOneByID, + fetchOneByName: fetchOneByName, + fetchAll: fetchAll, + getID: getID, + getName: getName, + + defaultMode: defaultMode, + defaultTTL: defaultTTL, + + byID: make(map[int64]*entry[T]), + byName: make(map[string]*entry[T]), + } +} + +func (c *Cache[T]) ByID(ctx context.Context, id int64, opts ...RefreshOption) (*T, error) { + return c.getFromCache( + ctx, + GetSubsystem(ctx), + func() *entry[T] { + return c.byID[id] + }, + func() (*T, error) { + return c.fetchOneByID(ctx, id) + }, + opts..., + ) +} + +func (c *Cache[T]) ByName(ctx context.Context, name string, opts ...RefreshOption) (*T, error) { + return c.getFromCache( + ctx, + GetSubsystem(ctx), + func() *entry[T] { + return c.byName[name] + }, + func() (*T, error) { + return c.fetchOneByName(ctx, name) + }, + opts..., + ) +} + +func (c *Cache[T]) getFromCache( + ctx context.Context, + subsystem string, + lookup func() *entry[T], + fetch func() (*T, error), + opts ...RefreshOption, +) (*T, error) { + refreshOpts := newCacheRefreshOpts(c, opts...) 
+ + if refreshOpts.mode != ModeAll && refreshOpts.mode != ModeOne { // bypass the cache for ModeOff and for any unrecognized mode instead of reporting "not found" + metrics.CacheRequests.WithLabelValues(subsystem, string(refreshOpts.mode), "miss").Inc() + klog.V(4).InfoS("cache is bypassed for this mode: fetching entry from api", "subsystem", subsystem, "mode", refreshOpts.mode) + return fetch() + } + + c.mu.Lock() + defer c.mu.Unlock() + + cached := lookup() + if cached != nil && time.Now().Before(cached.expiresAt) { + metrics.CacheRequests.WithLabelValues(subsystem, string(refreshOpts.mode), "hit").Inc() + klog.V(4).InfoS( + "cache hit", + "subsystem", subsystem, + "id", c.getID(cached.value), + "name", c.getName(cached.value), + "expiresAt", cached.expiresAt.Format(time.RFC3339), + ) + return cached.value, nil + } + + switch refreshOpts.mode { + case ModeOne: + if err := c.refreshOne(subsystem, fetch, refreshOpts.ttl); err != nil { + return nil, err + } + case ModeAll: + if err := c.refreshAll(ctx, subsystem, refreshOpts.ttl); err != nil { + return nil, err + } + case ModeOff: // Unreachable: ModeOff and unrecognized modes returned early above. + } + + metrics.CacheRequests.WithLabelValues(subsystem, string(refreshOpts.mode), "miss").Inc() + + if e := lookup(); e != nil && e != cached { // the same pointer as before the refresh means the expired entry was not replaced, i.e. the Server is gone + klog.V(4).InfoS( + "entry found after refresh", + "subsystem", subsystem, + "id", c.getID(e.value), "name", c.getName(e.value), + ) + return e.value, nil + } + + klog.V(4).InfoS("entry not found after refresh", "subsystem", subsystem) + return nil, nil +} + +func (c *Cache[T]) refreshOne( + subsystem string, + fetch func() (*T, error), + ttl time.Duration, +) error { + klog.V(4).InfoS("refreshing entry from api", "subsystem", subsystem) + value, err := fetch() + if err != nil { + return err + } + + if value == nil { + return nil + } + + e := &entry[T]{ + value: value, + expiresAt: time.Now().Add(ttl), + } + klog.V(4).InfoS( + "refreshed entry from api", + "subsystem", subsystem, + "id", c.getID(e.value), + "name", c.getName(e.value), + "expiresAt", e.expiresAt.Format(time.RFC3339), + ) + + c.byID[c.getID(value)] = e + c.byName[c.getName(value)] = e + + // Evict expired entries so the cache does not grow indefinitely. 
This ensures deleted + // Nodes or renamed Servers are cleaned from the cache. + maps.DeleteFunc(c.byID, func(_ int64, ev *entry[T]) bool { + if time.Now().After(ev.expiresAt) { + klog.V(4).InfoS( + "evicting entry from cache by id", + "subsystem", subsystem, + "id", c.getID(ev.value), + "name", c.getName(ev.value), + "expiresAt", ev.expiresAt.Format(time.RFC3339), + ) + return true + } + return false + }) + maps.DeleteFunc(c.byName, func(_ string, ev *entry[T]) bool { + if time.Now().After(ev.expiresAt) { + klog.V(4).InfoS( + "evicting entry from cache by name", + "subsystem", subsystem, + "id", c.getID(ev.value), + "name", c.getName(ev.value), + "expiresAt", ev.expiresAt.Format(time.RFC3339), + ) + return true + } + return false + }) + + return nil +} + +func (c *Cache[T]) refreshAll(ctx context.Context, subsystem string, ttl time.Duration) error { + klog.V(4).InfoS("refreshing all entries from api", "subsystem", subsystem) + + values, err := c.fetchAll(ctx) + if err != nil { + return err + } + + c.byID = make(map[int64]*entry[T], len(values)) + c.byName = make(map[string]*entry[T], len(values)) + + expiresAt := time.Now().Add(ttl) + + for _, value := range values { + e := &entry[T]{ + value: value, + expiresAt: expiresAt, + } + + c.byID[c.getID(value)] = e + c.byName[c.getName(value)] = e + } + + klog.V(4).InfoS( + "refreshed all entries from api", + "subsystem", subsystem, + "count", len(values), + "expiresAt", expiresAt.Format(time.RFC3339), + ) + return nil +} diff --git a/internal/servercache/servercache_test.go b/internal/servercache/servercache_test.go new file mode 100644 index 000000000..3c20596b9 --- /dev/null +++ b/internal/servercache/servercache_test.go @@ -0,0 +1,631 @@ +package servercache + +import ( + "context" + "fmt" + "testing" + "testing/synctest" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/hetznercloud/hcloud-go/v2/hcloud" + 
"github.com/hetznercloud/hcloud-go/v2/hcloud/exp/mockutil" +) + +func assertServer1(t *testing.T, server *hcloud.Server) { + t.Helper() + require.NotNil(t, server) + assert.Equal(t, int64(1), server.ID) + assert.Equal(t, "test", server.Name) +} + +func assertServer2(t *testing.T, server *hcloud.Server) { + t.Helper() + require.NotNil(t, server) + assert.Equal(t, int64(2), server.ID) + assert.Equal(t, "test2", server.Name) +} + +type testClient struct { + t *testing.T + callCount int +} + +func newTestClient(t *testing.T) *testClient { + return &testClient{t: t, callCount: 0} +} + +func (c *testClient) CallCount() int { + return c.callCount +} + +func (c *testClient) FetchAllFunc(servers []*hcloud.Server, err error) func(context.Context) ([]*hcloud.Server, error) { + return func(context.Context) ([]*hcloud.Server, error) { + c.t.Helper() + + c.callCount++ + return servers, err + } +} + +func (c *testClient) FetchOneByIDFunc(server *hcloud.Server, err error) func(context.Context, int64) (*hcloud.Server, error) { + return func(_ context.Context, id int64) (*hcloud.Server, error) { + c.t.Helper() + + if server != nil { + require.Equal(c.t, server.ID, id, "fetch one by id expected id %d, got %d", server.ID, id) + } + + c.callCount++ + return server, err + } +} + +func (c *testClient) FetchOneByNameFunc(server *hcloud.Server, err error) func(context.Context, string) (*hcloud.Server, error) { + return func(_ context.Context, name string) (*hcloud.Server, error) { + c.t.Helper() + + if server != nil { + require.Equal(c.t, server.Name, name, "fetch one by name expected name %s, got %s", server.Name, name) + } + + c.callCount++ + return server, err + } +} + +func TestServerCacheModeAllServers(t *testing.T) { + sc := newCache[hcloud.Server]( + nil, + nil, + nil, + func(value *hcloud.Server) int64 { return value.ID }, + func(value *hcloud.Server) string { return value.Name }, + ModeAll, + 10*time.Second, + ) + + ctx := t.Context() + client := newTestClient(t) + + // Cache miss 
by ID 1, fetch from API + sc.fetchAll = client.FetchAllFunc([]*hcloud.Server{{ID: 1, Name: "test"}, {ID: 2, Name: "test2"}}, nil) + + srv, err := sc.ByID(ctx, 1) + require.NoError(t, err) + assertServer1(t, srv) + + // Fetch all returns 2 servers + assert.Equal(t, 1, client.CallCount()) + assert.Len(t, sc.byID, 2) + assert.Len(t, sc.byName, 2) + + assert.True(t, sc.byID[srv.ID].expiresAt.After(time.Now())) + assert.Equal(t, srv, sc.byID[srv.ID].value) + assert.Equal(t, srv, sc.byName[srv.Name].value) + + // Cache hit by ID 1 + srv, err = sc.ByID(ctx, 1) + require.NoError(t, err) + assertServer1(t, srv) + + // Cache hit by ID 2 + srv, err = sc.ByID(ctx, 2) + require.NoError(t, err) + assertServer2(t, srv) + + // Cache hit by Name 1 + srv, err = sc.ByName(ctx, "test") + require.NoError(t, err) + assertServer1(t, srv) + + // Cache hit by Name 2 + srv, err = sc.ByName(ctx, "test2") + require.NoError(t, err) + assertServer2(t, srv) + + // Fetched two Servers with one API call + assert.Equal(t, 1, client.CallCount()) +} + +func TestServerCacheModeAllServersNotFound(t *testing.T) { + sc := newCache[hcloud.Server]( + nil, + nil, + nil, + func(value *hcloud.Server) int64 { return value.ID }, + func(value *hcloud.Server) string { return value.Name }, + ModeAll, + 10*time.Second, + ) + + ctx := t.Context() + client := newTestClient(t) + + // Cache miss by ID 1, fetch from API, not found + sc.fetchAll = client.FetchAllFunc([]*hcloud.Server{{ID: 2, Name: "test2"}, {ID: 3, Name: "test3"}}, nil) + + srv, err := sc.ByID(ctx, 1) + require.NoError(t, err) + assert.Nil(t, srv) + + // Fetch all returns 2 servers + assert.Equal(t, 1, client.CallCount()) + assert.Len(t, sc.byID, 2) + assert.Len(t, sc.byName, 2) + + // Cache hit by ID 2 + srv, err = sc.ByID(ctx, 2) + require.NoError(t, err) + assertServer2(t, srv) + + // Cache hit by Name "test2" + srv, err = sc.ByName(ctx, "test2") + require.NoError(t, err) + assertServer2(t, srv) + + // Cache miss by name "test", fetch from API, not 
found + srv, err = sc.ByName(ctx, "test") + require.NoError(t, err) + assert.Nil(t, srv) + + // Both cache misses triggered a fetch all: two API calls + assert.Equal(t, 2, client.CallCount()) +} + +func TestServerCacheModePerServer(t *testing.T) { + sc := newCache[hcloud.Server]( + nil, + nil, + nil, + func(value *hcloud.Server) int64 { return value.ID }, + func(value *hcloud.Server) string { return value.Name }, + ModeOne, + 10*time.Second, + ) + + ctx := t.Context() + client := newTestClient(t) + + // Cache miss by ID 1, fetch from API + sc.fetchOneByID = client.FetchOneByIDFunc(&hcloud.Server{ID: 1, Name: "test"}, nil) + + srv, err := sc.ByID(ctx, 1) + require.NoError(t, err) + assertServer1(t, srv) + + // Fetched one server + assert.Equal(t, 1, client.CallCount()) + assert.Len(t, sc.byID, 1) + assert.Len(t, sc.byName, 1) + + assert.True(t, sc.byID[srv.ID].expiresAt.After(time.Now())) + assert.Equal(t, srv, sc.byID[srv.ID].value) + assert.Equal(t, srv, sc.byName[srv.Name].value) + + // Cache hit by ID 1 + srv, err = sc.ByID(ctx, 1) + require.NoError(t, err) + assertServer1(t, srv) + + // Cache miss by ID 2, fetch from API + sc.fetchOneByID = client.FetchOneByIDFunc(&hcloud.Server{ID: 2, Name: "test2"}, nil) + + srv, err = sc.ByID(ctx, 2) + require.NoError(t, err) + assertServer2(t, srv) + + // Cache hit by Name 1 + srv, err = sc.ByName(ctx, "test") + require.NoError(t, err) + assertServer1(t, srv) + + // Cache hit by Name 2 + srv, err = sc.ByName(ctx, "test2") + require.NoError(t, err) + assertServer2(t, srv) + + // Fetched two servers individually + assert.Equal(t, 2, client.CallCount()) +} + +func TestServerCacheModeOneNotFound(t *testing.T) { + sc := newCache[hcloud.Server]( + nil, + nil, + nil, + func(value *hcloud.Server) int64 { return value.ID }, + func(value *hcloud.Server) string { return value.Name }, + ModeOne, + 10*time.Second, + ) + + ctx := t.Context() + client := newTestClient(t) + + // Cache miss by ID 1, fetch from API, not found + sc.fetchOneByID = 
client.FetchOneByIDFunc(nil, nil) + + srv, err := sc.ByID(ctx, 1) + require.NoError(t, err) + assert.Nil(t, srv) + + // Cached zero server + assert.Equal(t, 1, client.CallCount()) + assert.Empty(t, sc.byID) + assert.Empty(t, sc.byName) + + // Cache miss by ID 2, fetch from API + sc.fetchOneByID = client.FetchOneByIDFunc(&hcloud.Server{ID: 2, Name: "test2"}, nil) + + srv, err = sc.ByID(ctx, 2) + require.NoError(t, err) + assertServer2(t, srv) + + // Cached one server + assert.Equal(t, 2, client.CallCount()) + assert.Len(t, sc.byID, 1) + assert.Len(t, sc.byName, 1) + + // Cache miss by ID 1, fetch from API, not found + sc.fetchOneByID = client.FetchOneByIDFunc(nil, nil) + + srv, err = sc.ByID(ctx, 1) + require.NoError(t, err) + assert.Nil(t, srv) + + // Fetched zero server + assert.Equal(t, 3, client.CallCount()) + assert.Len(t, sc.byID, 1) + assert.Len(t, sc.byName, 1) +} + +func TestServerCacheModeOff(t *testing.T) { + sc := newCache[hcloud.Server]( + nil, + nil, + nil, + func(value *hcloud.Server) int64 { return value.ID }, + func(value *hcloud.Server) string { return value.Name }, + ModeOff, + 10*time.Second, + ) + + ctx := t.Context() + client := newTestClient(t) + + // Cache miss by ID 1, fetch from API + sc.fetchOneByID = client.FetchOneByIDFunc(&hcloud.Server{ID: 1, Name: "test"}, nil) + + srv, err := sc.ByID(ctx, 1) + require.NoError(t, err) + assertServer1(t, srv) + + // Fetched one server + assert.Equal(t, 1, client.CallCount()) + assert.Empty(t, sc.byID) + assert.Empty(t, sc.byName) + + // Cache miss by ID 1, fetch from API + srv, err = sc.ByID(ctx, 1) + require.NoError(t, err) + assertServer1(t, srv) + + assert.Equal(t, 2, client.CallCount()) + // Entries are not cached + assert.Empty(t, sc.byID) + assert.Empty(t, sc.byName) + + // Reset + sc.fetchOneByID = nil + client = newTestClient(t) + + // Cache miss by Name "test", fetch from API + sc.fetchOneByName = client.FetchOneByNameFunc(&hcloud.Server{ID: 1, Name: "test"}, nil) + + srv, err = sc.ByName(ctx, 
"test") + require.NoError(t, err) + assertServer1(t, srv) + + // Fetched one server + assert.Equal(t, 1, client.CallCount()) + assert.Empty(t, sc.byID) + assert.Empty(t, sc.byName) + + // Cache miss by Name "test", fetch from API + srv, err = sc.ByName(ctx, "test") + require.NoError(t, err) + assertServer1(t, srv) + + assert.Equal(t, 2, client.CallCount()) + // Entries are not cached + assert.Empty(t, sc.byID) + assert.Empty(t, sc.byName) +} + +func TestServerCacheModePerServer_EvictExpiredEntries(t *testing.T) { + synctest.Test(t, func(t *testing.T) { + sc := newCache[hcloud.Server]( + nil, + nil, + nil, + func(value *hcloud.Server) int64 { return value.ID }, + func(value *hcloud.Server) string { return value.Name }, + ModeOne, + 10*time.Second, + ) + + ctx := t.Context() + client := newTestClient(t) + + // Populate cache + sc.fetchOneByID = client.FetchOneByIDFunc(&hcloud.Server{ID: 1, Name: "test"}, nil) + + srv, err := sc.ByID(ctx, 1) + require.NoError(t, err) + + assert.Equal(t, time.Now().Add(sc.defaultTTL), sc.byID[srv.ID].expiresAt) + + // Wait for expiration + time.Sleep(sc.defaultTTL + 1) + + // Cache miss by ID 2, fetch from API + sc.fetchOneByID = client.FetchOneByIDFunc(&hcloud.Server{ID: 2, Name: "test2"}, nil) + + srv, err = sc.ByID(ctx, 2) + require.NoError(t, err) + assertServer2(t, srv) + + // Fetched two servers individually + assert.Equal(t, 2, client.CallCount()) + + // Server ID 1 has been evicted + assert.Len(t, sc.byID, 1) + assert.Len(t, sc.byName, 1) + assert.Nil(t, sc.byID[1]) + assert.Nil(t, sc.byName["test"]) + }) +} + +func TestServerCacheModePerServer_WithTTLRefreshOpts(t *testing.T) { + synctest.Test(t, func(t *testing.T) { + sc := newCache[hcloud.Server]( + nil, + nil, + nil, + func(value *hcloud.Server) int64 { return value.ID }, + func(value *hcloud.Server) string { return value.Name }, + ModeOne, + 5*time.Second, + ) + + ctx := t.Context() + client := newTestClient(t) + + // Populate cache with default TTL + sc.fetchOneByID = 
client.FetchOneByIDFunc(&hcloud.Server{ID: 1, Name: "test"}, nil) + + srv, err := sc.ByID(ctx, 1) + require.NoError(t, err) + assertServer1(t, srv) + assert.Equal(t, time.Now().Add(sc.defaultTTL), sc.byID[srv.ID].expiresAt) + + // Cache miss by ID 2, fetch from API with different TTL + sc.fetchOneByID = client.FetchOneByIDFunc(&hcloud.Server{ID: 2, Name: "test2"}, nil) + + // Fetch Server ID 2, use larger TTL + srv, err = sc.ByID(ctx, 2, WithTTL(2*sc.defaultTTL)) + require.NoError(t, err) + assertServer2(t, srv) + // Server ID 2 should have different TTL + assert.Equal(t, time.Now().Add(2*sc.defaultTTL), sc.byID[srv.ID].expiresAt) + + // Wait for expiration of Server ID 1 + time.Sleep(sc.defaultTTL + 1) + + // Fetch Server ID 2 again, Server ID 1 is not evicted as no refresh happens + srv, err = sc.ByID(ctx, 2) + require.NoError(t, err) + assertServer2(t, srv) + + // Expect two API calls + assert.Equal(t, 2, client.CallCount()) + + // Server ID 1 is not evicted, because no refresh happened + assert.Len(t, sc.byID, 2) + assert.Len(t, sc.byName, 2) + assertServer1(t, sc.byID[1].value) + assertServer2(t, sc.byID[2].value) + + // Server ID 1 is expired with default TTL + assert.False(t, time.Now().Before(sc.byID[1].expiresAt)) + // Server ID 2 is still fresh -> higher TTL with `WithTTL` option + assert.True(t, time.Now().Before(sc.byID[2].expiresAt)) + }) +} + +func TestServerCacheModePerServer_WithModeRefreshOpts(t *testing.T) { + synctest.Test(t, func(t *testing.T) { + sc := newCache[hcloud.Server]( + nil, + nil, + nil, + func(value *hcloud.Server) int64 { return value.ID }, + func(value *hcloud.Server) string { return value.Name }, + ModeOne, + 5*time.Second, + ) + + ctx := t.Context() + client := newTestClient(t) + + // Populate cache with default TTL + sc.fetchOneByID = client.FetchOneByIDFunc(&hcloud.Server{ID: 1, Name: "test", Status: hcloud.ServerStatusRunning}, nil) + + srv, err := sc.ByID(ctx, 1) + require.NoError(t, err) + assertServer1(t, srv) + 
assert.Equal(t, time.Now().Add(sc.defaultTTL), sc.byID[srv.ID].expiresAt) + + // Cache miss by ID 2, fetch from API with the default TTL + sc.fetchOneByID = client.FetchOneByIDFunc(&hcloud.Server{ID: 2, Name: "test2", Status: hcloud.ServerStatusOff}, nil) + + // Fetch Server ID 2, which is powered off at this point + srv, err = sc.ByID(ctx, 2) + require.NoError(t, err) + assertServer2(t, srv) + assert.Equal(t, hcloud.ServerStatusOff, srv.Status) + assert.Equal(t, time.Now().Add(sc.defaultTTL), sc.byID[srv.ID].expiresAt) + + // Wait for expiration of Server ID 1 and 2 + time.Sleep(sc.defaultTTL + 1) + + // Ensure we only call fetchAll + sc.fetchOneByID = nil + sc.fetchAll = client.FetchAllFunc([]*hcloud.Server{ + {ID: 1, Name: "test", Status: hcloud.ServerStatusRunning}, + {ID: 2, Name: "test2", Status: hcloud.ServerStatusRunning}, + }, nil) + + srv, err = sc.ByID(ctx, 1, WithMode(ModeAll)) + require.NoError(t, err) + assertServer1(t, srv) + + // Server ID 2 was refreshed by the fetch all (now running) and is served from the cache + srv, err = sc.ByID(ctx, 2) + require.NoError(t, err) + assertServer2(t, srv) + assert.Equal(t, hcloud.ServerStatusRunning, srv.Status) + + // Expect three API calls: two single fetches and one fetch all + assert.Equal(t, 3, client.CallCount()) + + // Both Servers are present after the fetch all + assert.Len(t, sc.byID, 2) + assert.Len(t, sc.byName, 2) + assertServer1(t, sc.byID[1].value) + assertServer2(t, sc.byID[2].value) + + // Server ID 1 is fresh again after the fetch all + assert.True(t, time.Now().Before(sc.byID[1].expiresAt)) + // Server ID 2 is fresh again after the fetch all + assert.True(t, time.Now().Before(sc.byID[2].expiresAt)) + }) +} + +func TestServerCacheAllModesError(t *testing.T) { + testCase := func(t *testing.T, mode Mode) { + sc := newCache[hcloud.Server]( + nil, + nil, + nil, + func(value *hcloud.Server) int64 { return value.ID }, + func(value *hcloud.Server) string { return value.Name }, + mode, + 10*time.Second, + ) + + ctx := t.Context() + client := newTestClient(t) + 
+ sc.fetchOneByID = client.FetchOneByIDFunc(nil, fmt.Errorf("test error")) + sc.fetchOneByName = client.FetchOneByNameFunc(nil, fmt.Errorf("test error")) + sc.fetchAll = client.FetchAllFunc(nil, fmt.Errorf("test error")) + + // Cache miss by ID 1, fetch from API + srv, err := sc.ByID(ctx, 1) + require.ErrorContains(t, err, "test error") + assert.Nil(t, srv) + + // Error - nothing stored in cache + assert.Empty(t, sc.byID) + assert.Empty(t, sc.byName) + + // Second time still errors - two API calls + srv, err = sc.ByID(ctx, 1) + require.ErrorContains(t, err, "test error") + assert.Nil(t, srv) + assert.Equal(t, 2, client.CallCount()) + + // Reset for fetch by Name + client = newTestClient(t) + sc.fetchOneByID = client.FetchOneByIDFunc(nil, fmt.Errorf("test error")) + sc.fetchOneByName = client.FetchOneByNameFunc(nil, fmt.Errorf("test error")) + sc.fetchAll = client.FetchAllFunc(nil, fmt.Errorf("test error")) + + // Cache miss by name "test", fetch from API + srv, err = sc.ByName(ctx, "test") + require.ErrorContains(t, err, "test error") + assert.Nil(t, srv) + + // Error - nothing stored in cache + assert.Empty(t, sc.byID) + assert.Empty(t, sc.byName) + + // Second time still errors - two API calls + srv, err = sc.ByName(ctx, "test") + require.ErrorContains(t, err, "test error") + assert.Nil(t, srv) + assert.Equal(t, 2, client.CallCount()) + } + + for _, mode := range []Mode{ModeAll, ModeOne, ModeOff} { + t.Run(string(mode), func(t *testing.T) { testCase(t, mode) }) + } +} + +func TestNewServerCache(t *testing.T) { + // Really want to hit 100% coverage :3 + testCases := []struct { + name string + mode Mode + requests []mockutil.Request + }{ + { + mode: ModeAll, + requests: []mockutil.Request{ + {Method: "GET", Path: "/servers?page=1&per_page=50", Status: 200, JSONRaw: `{ "servers": [{ "id": 1, "name": "test" }]}`}, + }, + }, + { + mode: ModeOne, + requests: []mockutil.Request{ + {Method: "GET", Path: "/servers/1", Status: 200, JSONRaw: `{ "server": { "id": 1, "name": 
"test" }}`}, + }, + }, + { + mode: ModeOff, + requests: []mockutil.Request{ + {Method: "GET", Path: "/servers/1", Status: 200, JSONRaw: `{ "server": { "id": 1, "name": "test" }}`}, + {Method: "GET", Path: "/servers?name=test", Status: 200, JSONRaw: `{ "servers": [{ "id": 1, "name": "test" }]}`}, + }, + }, + } + + for _, tt := range testCases { + t.Run(string(tt.mode), func(t *testing.T) { + server := mockutil.NewServer(t, tt.requests) + client := hcloud.NewClient(hcloud.WithEndpoint(server.Server.URL)) + + cache := NewServerCache(client, tt.mode, 10*time.Second) + require.NotNil(t, cache) + require.NotNil(t, cache.fetchOneByID) + require.NotNil(t, cache.fetchOneByName) + require.NotNil(t, cache.fetchAll) + require.NotNil(t, cache.getID) + require.NotNil(t, cache.getName) + + ctx := t.Context() + + srv, err := cache.ByID(ctx, int64(1)) + require.NoError(t, err) + assert.NotNil(t, srv) + + srv, err = cache.ByName(ctx, "test") + require.NoError(t, err) + assert.NotNil(t, srv) + }) + } +}