diff --git a/internal/agentcrd/agent.go b/internal/agentcrd/agent.go index 02ca080e..c9507926 100644 --- a/internal/agentcrd/agent.go +++ b/internal/agentcrd/agent.go @@ -32,7 +32,7 @@ func Namespace(name string) string { } // HostHomePath is where the agent's .hermes data lives on the host. The -// cluster mounts this into the Hermes pod via hostPath; writing +// cluster mounts this into the Hermes pod via the data PVC; writing // SOUL.md/skills here puts them inside the pod automatically. func HostHomePath(cfg *config.Config, name string) string { desc := agentruntime.Describe(agentruntime.Hermes) diff --git a/internal/hermes/hermes.go b/internal/hermes/hermes.go index 59e8c66a..5774330b 100644 --- a/internal/hermes/hermes.go +++ b/internal/hermes/hermes.go @@ -245,10 +245,7 @@ func Sync(cfg *config.Config, id string, u *ui.UI) error { return fmt.Errorf("helmfile sync failed: %w", err) } - // Host-side chown the PVC backing dirs to the in-pod UID/GID, bypassing - // the user-namespacing that defeats the in-pod `init-hermes-perms` - // chown from #446 (see ensureHermesPVCOwnership doc comment for details). - ensureHermesPVCOwnership(cfg, id, u) + fixHermesDataPVCK3dFallback(cfg, id, u) // Publish wallet-metadata ConfigMap for the frontend (namespace now exists). applyWalletMetadataConfigMap(cfg, id, deploymentDir) @@ -775,20 +772,8 @@ func generateValues(namespace, hostname, dashboardHostname, agentBaseURL, token, runAsUser: %d runAsGroup: %d fsGroup: %d + fsGroupChangePolicy: OnRootMismatch initContainers: - - name: init-hermes-perms - image: %s - imagePullPolicy: IfNotPresent - securityContext: - runAsUser: 0 - runAsGroup: 0 - command: - - sh - - -c - - chown -R %d:%d /data - volumeMounts: - - name: data - mountPath: /data - name: init-hermes-data image: %s imagePullPolicy: IfNotPresent @@ -862,7 +847,7 @@ func generateValues(namespace, hostname, dashboardHostname, agentBaseURL, token, value: %s - name: OBOL_SKILLS_DIR value: /data/.hermes/%s - `, desc.DataPVCName, namespace, desc.ServiceName, desc.ServiceName, namespace, desc.ServiceName, desc.ServiceName, desc.ServiceName, desc.ServiceName, containerUID, containerGID, containerGID, quoteYAML(image()), containerUID, containerGID, quoteYAML(image()), desc.ServiceName, quoteYAML(image()), quoteYAML(hermesBinary), desc.DefaultPort, desc.DefaultPort, quoteYAML(primary), quoteYAML(namespace), obolSkillsDirName) + `, desc.DataPVCName, namespace, desc.ServiceName, desc.ServiceName, namespace, desc.ServiceName, desc.ServiceName, desc.ServiceName, desc.ServiceName, containerUID, containerGID, containerGID, quoteYAML(image()), desc.ServiceName, quoteYAML(image()), quoteYAML(hermesBinary), desc.DefaultPort, desc.DefaultPort, quoteYAML(primary), quoteYAML(namespace), obolSkillsDirName) if agentBaseURL != "" { fmt.Fprintf(&b, " - name: AGENT_BASE_URL\n value: %s\n", quoteYAML(agentBaseURL)) @@ -1022,7 +1007,6 @@ func syncRuntimeFiles(cfg *config.Config, id string, configData []byte, u *ui.UI if err := removeLegacyHeartbeat(targetDir); err != nil { return err } - fixRuntimeVolumeOwnership(cfg, targetDir, u) return nil } @@ -1314,94 +1298,38 @@ func fixRuntimeVolumeOwnership(cfg *config.Config, hostPath string, u *ui.UI) { } } -// hermesPVCPaths returns the host-side PVC backing directories owned by the -// Hermes pod and chowned to containerUID:containerGID. -// -// Intentionally limited to PVCs that the Hermes container itself mounts — -// `remote-signer-keystores` is excluded even though it sits in the same -// namespace because the remote-signer pod runs as runAsUser=65532 with -// fsGroup=1000 (obol/remote-signer chart) and forcing its volume to -// 10000:10000 (Hermes' UID) makes the remote-signer crash-loop on -// `failed to load keystores: Permission denied (os error 13)` against -// the read-only /data/keystores mount. The local-path-provisioner default -// of 1000:1000 already matches that pod's fsGroup contract, so leaving -// that volume untouched is the safe behavior. -func hermesPVCPaths(cfg *config.Config, id string) []string { - namespace := agentruntime.Namespace(agentruntime.Hermes, id) - return []string{ - filepath.Join(cfg.DataDir, namespace, agentruntime.Describe(agentruntime.Hermes).DataPVCName), +// fsGroup should own Hermes' data volume. This fallback only repairs legacy +// k3d/userns clusters when the init container is already visibly stuck. +func fixHermesDataPVCK3dFallback(cfg *config.Config, id string, u *ui.UI) { + backendName := "k3d" + if data, err := os.ReadFile(filepath.Join(cfg.ConfigDir, ".stack-backend")); err == nil { + backendName = strings.TrimSpace(string(data)) + } + if backendName != "k3d" { + return } -} -// ensureHermesPVCOwnership host-side chowns the Hermes PVC backing directories -// to containerUID:containerGID so the agent's init containers can write under -// /data on the first start. -// -// Why this is needed (issue #475): -// - The embedded k3d config (internal/embed/k3d-config.yaml) sets -// KubeletInUserNamespace=true. Pod "root" maps to a host subuid that -// lacks chown authority over the host bind-mount path provisioned by -// local-path-provisioner. The in-pod `init-hermes-perms` chown added in -// #446 (commit c066baa) silently no-ops in this configuration. -// - local-path-provisioner's helper-pod sets the dir to 1000:1000 (see -// internal/embed/infrastructure/base/templates/local-path.yaml). Hermes -// runs as 10000:10000, so the next init container fails on -// `mkdir /data/.hermes/home: Permission denied`. -// -// The fix is to chown from outside the user namespace: `docker exec` into the -// k3d server container runs at the host Docker daemon's authority, which is -// real root and is not subject to the kubelet's user-namespacing. -// -// Best-effort. Waits up to 60s for each PVC to be Bound (local-path uses -// WaitForFirstConsumer, so the host dir doesn't exist until the consuming -// pod is scheduled). On non-k3d backends fixRuntimeVolumeOwnership falls -// back to a plain os.Chown. -// -// If a Hermes pod is currently stuck in Init:CrashLoopBackOff because of the -// pre-fix permissions, deletes it so kubelet re-creates with the corrected -// perms immediately rather than after exponential backoff (up to ~5 min). -// Skips the delete when no pod is stuck so repeated `Sync` calls -// (e.g. `obol model sync` after `obol model prefer`) do not gratuitously -// restart a healthy agent. -func ensureHermesPVCOwnership(cfg *config.Config, id string, u *ui.UI) { namespace := agentruntime.Namespace(agentruntime.Hermes, id) - kubeconfigPath := filepath.Join(cfg.ConfigDir, "kubeconfig.yaml") - kubectlBin := filepath.Join(cfg.BinDir, "kubectl") - - // Wait only for the PVCs hermesPVCPaths chowns. remote-signer-keystores - // is intentionally NOT in this loop — see the doc comment on - // hermesPVCPaths for why. - for _, pvc := range []string{ - agentruntime.Describe(agentruntime.Hermes).DataPVCName, - } { - waitCmd := exec.Command(kubectlBin, - "wait", "--for=jsonpath={.status.phase}=Bound", - "--timeout=60s", "pvc/"+pvc, "-n", namespace) - waitCmd.Env = append(os.Environ(), "KUBECONFIG="+kubeconfigPath) - _ = waitCmd.Run() // best-effort; continue even on timeout + if !hermesInitContainerStuck(cfg, namespace) { + return } - for _, p := range hermesPVCPaths(cfg, id) { - fixRuntimeVolumeOwnership(cfg, p, u) - } + hostPath := filepath.Join(cfg.DataDir, namespace, agentruntime.Describe(agentruntime.Hermes).DataPVCName) + fixRuntimeVolumeOwnership(cfg, hostPath, u) - if hermesInitStuck(cfg, namespace) { - deleteCmd := exec.Command(kubectlBin, - "-n", namespace, "delete", "pod", - "-l", "app.kubernetes.io/name=hermes", - "--ignore-not-found", "--wait=false") - deleteCmd.Env = append(os.Environ(), "KUBECONFIG="+kubeconfigPath) - if err := deleteCmd.Run(); err == nil && u != nil { - u.Info("Restarted Hermes pod to apply fresh volume ownership") - } + kubeconfigPath := filepath.Join(cfg.ConfigDir, "kubeconfig.yaml") + kubectlBin := filepath.Join(cfg.BinDir, "kubectl") + deleteCmd := exec.Command(kubectlBin, + "-n", namespace, "delete", "pod", + "-l", "app.kubernetes.io/name=hermes", + "--ignore-not-found", "--wait=false") + deleteCmd.Env = append(os.Environ(), "KUBECONFIG="+kubeconfigPath) + if err := deleteCmd.Run(); err == nil && u != nil { + u.Info("Restarted Hermes pod after best-effort k3d PVC ownership repair") } } -// hermesInitStuck reports whether at least one Hermes pod has an init -// container in CrashLoopBackOff or an Error waiting state — the signature of -// the perm-denied symptom this fix targets. Returns false on any kubectl -// failure so that a transient API hiccup does not trigger spurious restarts. -func hermesInitStuck(cfg *config.Config, namespace string) bool { +func hermesInitContainerStuck(cfg *config.Config, namespace string) bool { kubeconfigPath := filepath.Join(cfg.ConfigDir, "kubeconfig.yaml") kubectlBin := filepath.Join(cfg.BinDir, "kubectl") cmd := exec.Command(kubectlBin, diff --git a/internal/hermes/hermes_test.go b/internal/hermes/hermes_test.go index e18499a0..2fa09d1a 100644 --- a/internal/hermes/hermes_test.go +++ b/internal/hermes/hermes_test.go @@ -142,9 +142,8 @@ func TestGenerateValues_UsesHermesNativeNames(t *testing.T) { "/data/.hermes/obol-skills", "containerPort: 8642", "containerPort: 9119", - "init-hermes-perms", + "fsGroupChangePolicy: OnRootMismatch", "init-hermes-data", - "chown -R 10000:10000 /data", `Hermes binary missing from image: /opt/hermes/.venv/bin/hermes`, `Hermes image is missing required extras: web,messaging,mcp,pty,cli,acp,google`, `import fastapi, uvicorn, telegram, mcp, ptyprocess, simple_term_menu, googleapiclient`, @@ -166,9 +165,11 @@ func TestGenerateValues_UsesHermesNativeNames(t *testing.T) { "git clone", "uv pip install", "/data/.hermes/hermes-agent", + "init-hermes-perms", + "chown -R 10000:10000 /data", } { if strings.Contains(values, banned) { - t.Fatalf("generateValues() should not rebuild Hermes inside the PVC, found %q:\n%s", banned, values) + t.Fatalf("generateValues() contains banned fragment %q:\n%s", banned, values) } } @@ -351,48 +352,3 @@ func mkdirInstance(t *testing.T, cfg *config.Config, id string) { t.Fatalf("create Hermes instance %q: %v", id, err) } } - -// TestHermesPVCPaths pins the host-side directories that -// ensureHermesPVCOwnership chowns. Renaming the Hermes data PVC or -// relocating the namespace prefix in agentruntime without updating this -// list would silently regress the #475 fix on Linux k3d, because the chown -// would land on a non-existent path while the real PVC backing dir kept -// its local-path-provisioner default ownership of 1000:1000. -func TestHermesPVCPaths(t *testing.T) { - cfg := testConfig(t) - const id = "obol-agent" - namespace := agentruntime.Namespace(agentruntime.Hermes, id) - - got := hermesPVCPaths(cfg, id) - want := []string{ - filepath.Join(cfg.DataDir, namespace, "hermes-data"), - } - if !reflect.DeepEqual(got, want) { - t.Fatalf("hermesPVCPaths(%q) = %#v; want %#v", id, got, want) - } -} - -// TestHermesPVCPaths_ExcludesRemoteSignerKeystores pins the negative half of -// the contract: the helper MUST NOT include the remote-signer-keystores PVC -// path. The first revision of this fix included it, which chowned the -// volume to 10000:10000 (Hermes' UID) and broke the remote-signer pod — -// remote-signer runs as runAsUser=65532 with fsGroup=1000, expecting the -// local-path-provisioner default ownership of 1000:1000. The result was -// a remote-signer CrashLoopBackOff with -// -// failed to load keystores: Permission denied (os error 13) -// -// against /data/keystores. This guard makes that regression impossible to -// re-introduce by accident. -func TestHermesPVCPaths_ExcludesRemoteSignerKeystores(t *testing.T) { - cfg := testConfig(t) - const id = "obol-agent" - namespace := agentruntime.Namespace(agentruntime.Hermes, id) - - keystoreVolume := filepath.Join(cfg.DataDir, namespace, "remote-signer-keystores") - for _, p := range hermesPVCPaths(cfg, id) { - if p == keystoreVolume { - t.Fatalf("hermesPVCPaths included %q — the remote-signer pod (runAsUser=65532, fsGroup=1000) crash-loops when this volume is chowned to Hermes' containerUID:containerGID", keystoreVolume) - } - } -} diff --git a/internal/serviceoffercontroller/agent_render.go b/internal/serviceoffercontroller/agent_render.go index 0619ed77..e44874cc 100644 --- a/internal/serviceoffercontroller/agent_render.go +++ b/internal/serviceoffercontroller/agent_render.go @@ -316,9 +316,10 @@ func agentPodSpec(agent *monetizeapi.Agent) map[string]any { "serviceAccountName": hermesServiceName, "automountServiceAccountToken": true, "securityContext": map[string]any{ - "runAsUser": int64(hermesContainerUID), - "runAsGroup": int64(hermesContainerGID), - "fsGroup": int64(hermesContainerGID), + "runAsUser": int64(hermesContainerUID), + "runAsGroup": int64(hermesContainerGID), + "fsGroup": int64(hermesContainerGID), + "fsGroupChangePolicy": "OnRootMismatch", }, "initContainers": []any{ buildAgentProfileInitContainer(), diff --git a/internal/serviceoffercontroller/agent_render_test.go b/internal/serviceoffercontroller/agent_render_test.go index 29735b3d..e04e1e32 100644 --- a/internal/serviceoffercontroller/agent_render_test.go +++ b/internal/serviceoffercontroller/agent_render_test.go @@ -137,6 +137,43 @@ func TestAgentManifests_DeploymentEnvCarriesContext(t *testing.T) { } } +func TestAgentManifests_DeploymentUsesFSGroup(t *testing.T) { + agent := &monetizeapi.Agent{} + agent.Name = "quant" + agent.Namespace = "agent-quant" + agent.Spec = monetizeapi.AgentSpec{Model: "qwen3.5:9b"} + + out, err := agentManifests(agent, "litellm", "api") + if err != nil { + t.Fatalf("agentManifests: %v", err) + } + var dep map[string]any + for _, m := range out { + if m.GetKind() == "Deployment" { + dep = m.UnstructuredContent() + break + } + } + if dep == nil { + t.Fatal("Deployment manifest missing") + } + + podSpec := dep["spec"].(map[string]any)["template"].(map[string]any)["spec"].(map[string]any) + securityContext := podSpec["securityContext"].(map[string]any) + if securityContext["runAsUser"] != int64(hermesContainerUID) { + t.Fatalf("runAsUser = %v, want %d", securityContext["runAsUser"], hermesContainerUID) + } + if securityContext["runAsGroup"] != int64(hermesContainerGID) { + t.Fatalf("runAsGroup = %v, want %d", securityContext["runAsGroup"], hermesContainerGID) + } + if securityContext["fsGroup"] != int64(hermesContainerGID) { + t.Fatalf("fsGroup = %v, want %d", securityContext["fsGroup"], hermesContainerGID) + } + if securityContext["fsGroupChangePolicy"] != "OnRootMismatch" { + t.Fatalf("fsGroupChangePolicy = %v, want OnRootMismatch", securityContext["fsGroupChangePolicy"]) + } +} + func TestAgentManifests_ProfileSeedInitContainer(t *testing.T) { agent := &monetizeapi.Agent{} agent.Name = "quant" diff --git a/internal/serviceoffercontroller/render.go b/internal/serviceoffercontroller/render.go index a733213b..ff9da256 100644 --- a/internal/serviceoffercontroller/render.go +++ b/internal/serviceoffercontroller/render.go @@ -26,6 +26,39 @@ const ( servicesJSONRouteName = "obol-services-json-route" ) +// restrictedPodSecurityContext returns a Pod-level securityContext that +// satisfies the Restricted Pod Security Standard (PSS). PR #521 enforces +// Restricted PSS on the x402 namespace, so the controller-rendered httpd +// workloads (obol-skill-md and agentidentity-*-registration) must ship a +// compliant securityContext or they fail admission and never start. +// +// UID/GID 1000 is the canonical non-root user available in the busybox +// image used by both Deployments. fsGroup keeps the projected ConfigMap +// volumes readable by the httpd process. +func restrictedPodSecurityContext() map[string]any { + return map[string]any{ + "runAsNonRoot": true, + "runAsUser": int64(1000), + "runAsGroup": int64(1000), + "fsGroup": int64(1000), + "seccompProfile": map[string]any{ + "type": "RuntimeDefault", + }, + } +} + +// restrictedContainerSecurityContext returns a container-level +// securityContext compliant with the Restricted PSS profile: privilege +// escalation disabled and all Linux capabilities dropped. +func restrictedContainerSecurityContext() map[string]any { + return map[string]any{ + "allowPrivilegeEscalation": false, + "capabilities": map[string]any{ + "drop": []any{"ALL"}, + }, + } +} + func buildRegistrationRequest(offer *monetizeapi.ServiceOffer, desiredState string) *unstructured.Unstructured { return &unstructured.Unstructured{ Object: map[string]any{ @@ -92,11 +125,13 @@ func buildAgentIdentityRegistrationDeployment(identity *monetizeapi.AgentIdentit }, }, "spec": map[string]any{ + "securityContext": restrictedPodSecurityContext(), "containers": []any{ map[string]any{ - "name": "httpd", - "image": "busybox:1.36", - "command": []any{"httpd", "-f", "-p", "8080", "-h", "/www"}, + "name": "httpd", + "image": "busybox:1.36", + "command": []any{"httpd", "-f", "-p", "8080", "-h", "/www"}, + "securityContext": restrictedContainerSecurityContext(), "ports": []any{ map[string]any{"containerPort": int64(8080), "protocol": "TCP"}, }, @@ -259,11 +294,13 @@ func buildSkillCatalogDeployment(contentHash string) *unstructured.Unstructured }, }, "spec": map[string]any{ + "securityContext": restrictedPodSecurityContext(), "containers": []any{ map[string]any{ - "name": "httpd", - "image": "busybox:1.36", - "command": []any{"httpd", "-f", "-p", "8080", "-h", "/www"}, + "name": "httpd", + "image": "busybox:1.36", + "command": []any{"httpd", "-f", "-p", "8080", "-h", "/www"}, + "securityContext": restrictedContainerSecurityContext(), "ports": []any{ map[string]any{"containerPort": int64(8080), "protocol": "TCP"}, }, diff --git a/internal/serviceoffercontroller/render_builders_test.go b/internal/serviceoffercontroller/render_builders_test.go index 22efa4b5..573d72af 100644 --- a/internal/serviceoffercontroller/render_builders_test.go +++ b/internal/serviceoffercontroller/render_builders_test.go @@ -5,8 +5,101 @@ import ( "testing" "github.com/ObolNetwork/obol-stack/internal/monetizeapi" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) +// assertRestrictedPSS checks that a controller-rendered Deployment satisfies +// the Restricted Pod Security Standard. PR #521 enforces Restricted PSS on +// the x402 namespace, so any httpd workload missing these fields gets +// rejected at admission and never starts (Bug #3 from the 14-PR integration +// test campaign). +func assertRestrictedPSS(t *testing.T, deploymentName string, spec map[string]any) { + t.Helper() + template, _ := spec["template"].(map[string]any) + podSpec, _ := template["spec"].(map[string]any) + + psc, ok := podSpec["securityContext"].(map[string]any) + if !ok { + t.Fatalf("%s: pod spec missing securityContext", deploymentName) + } + if v, _ := psc["runAsNonRoot"].(bool); !v { + t.Errorf("%s: pod securityContext.runAsNonRoot = %v, want true", deploymentName, psc["runAsNonRoot"]) + } + if v, _ := psc["runAsUser"].(int64); v == 0 { + t.Errorf("%s: pod securityContext.runAsUser must be set to a non-zero UID", deploymentName) + } + if v, _ := psc["runAsGroup"].(int64); v == 0 { + t.Errorf("%s: pod securityContext.runAsGroup must be set to a non-zero GID", deploymentName) + } + sp, ok := psc["seccompProfile"].(map[string]any) + if !ok { + t.Errorf("%s: pod securityContext missing seccompProfile", deploymentName) + } else if t2, _ := sp["type"].(string); t2 != "RuntimeDefault" && t2 != "Localhost" { + t.Errorf("%s: pod seccompProfile.type = %q, want RuntimeDefault or Localhost", deploymentName, t2) + } + + containers, _ := podSpec["containers"].([]any) + if len(containers) == 0 { + t.Fatalf("%s: no containers in pod spec", deploymentName) + } + for _, c := range containers { + cm, _ := c.(map[string]any) + name, _ := cm["name"].(string) + csc, ok := cm["securityContext"].(map[string]any) + if !ok { + t.Errorf("%s/%s: container missing securityContext", deploymentName, name) + continue + } + if v, _ := csc["allowPrivilegeEscalation"].(bool); v { + t.Errorf("%s/%s: container allowPrivilegeEscalation = true, want false", deploymentName, name) + } + if _, present := csc["allowPrivilegeEscalation"]; !present { + t.Errorf("%s/%s: container missing allowPrivilegeEscalation (must be false)", deploymentName, name) + } + caps, ok := csc["capabilities"].(map[string]any) + if !ok { + t.Errorf("%s/%s: container securityContext missing capabilities", deploymentName, name) + continue + } + drop, _ := caps["drop"].([]any) + var droppedAll bool + for _, d := range drop { + if s, _ := d.(string); s == "ALL" { + droppedAll = true + } + } + if !droppedAll { + t.Errorf("%s/%s: container capabilities.drop must include \"ALL\", got %v", deploymentName, name, drop) + } + } +} + +// TestBuildSkillCatalogDeployment_RestrictedPSS verifies the skill-md +// httpd Deployment ships a Restricted-PSS-compliant securityContext. +// Regression test for the cross-PR interaction with #521 surfaced by +// the 14-PR integration test (Bug #3). +func TestBuildSkillCatalogDeployment_RestrictedPSS(t *testing.T) { + d := buildSkillCatalogDeployment("hash-x") + spec, _ := d.Object["spec"].(map[string]any) + assertRestrictedPSS(t, skillCatalogConfigMapName, spec) +} + +// TestBuildAgentIdentityRegistrationDeployment_RestrictedPSS verifies the +// agentidentity well-known/agent-registration.json publisher httpd +// Deployment ships a Restricted-PSS-compliant securityContext. +func TestBuildAgentIdentityRegistrationDeployment_RestrictedPSS(t *testing.T) { + identity := &monetizeapi.AgentIdentity{ + ObjectMeta: metav1.ObjectMeta{ + Name: monetizeapi.AgentIdentityDefaultName, + Namespace: "x402", + UID: "test-uid", + }, + } + d := buildAgentIdentityRegistrationDeployment(identity, "hash-y") + spec, _ := d.Object["spec"].(map[string]any) + assertRestrictedPSS(t, agentIdentityRegistrationName(identity), spec) +} + // TestBuildSkillCatalogConfigMap: exposes skill.md + services.json + httpd conf. func TestBuildSkillCatalogConfigMap(t *testing.T) { cm := buildSkillCatalogConfigMap("# Catalog", `[{"name":"a"}]`)