From 06896bba5754f3d35f286118b183d1803ecce594 Mon Sep 17 00:00:00 2001 From: Todd Short Date: Wed, 18 Mar 2026 10:06:14 -0400 Subject: [PATCH] OCPBUGS-78787: fix(operator-controller): clean up orphaned temp dirs in catalog cache MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit filesystemCache.writeFS creates a temp dir (.{catalog}-{random}) and renames it into place atomically. If the process is interrupted before the rename, the temp dir persists. Each restart adds another, eventually filling the disk. Additionally, writeFS had no defer os.RemoveAll(tmpDir), so any error during WalkMetasReader or the rename step also left the temp dir behind — no process kill required. Two fixes: - Add defer os.RemoveAll(tmpDir) so errors during normal operation clean up. - Add removeOrphanedTempDirs, called at the start of writeFS (under the write mutex), to clean up dirs orphaned by a previous process run. This bounds worst-case accumulation to one orphaned dir per catalog regardless of restart rate. Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Todd Short --- .../catalogmetadata/cache/cache.go | 29 +++++++++++++++++++ .../catalogmetadata/cache/cache_test.go | 24 +++++++++++++++ 2 files changed, 53 insertions(+) diff --git a/internal/operator-controller/catalogmetadata/cache/cache.go b/internal/operator-controller/catalogmetadata/cache/cache.go index 8bcfff10fc..37f57d540c 100644 --- a/internal/operator-controller/catalogmetadata/cache/cache.go +++ b/internal/operator-controller/catalogmetadata/cache/cache.go @@ -6,6 +6,7 @@ import ( "io/fs" "os" "path/filepath" + "strings" "sync" "github.com/operator-framework/operator-registry/alpha/declcfg" @@ -75,10 +76,15 @@ func (fsc *filesystemCache) Put(catalogName, resolvedRef string, source io.Reade func (fsc *filesystemCache) writeFS(catalogName string, source io.Reader) (fs.FS, error) { cacheDir := fsc.cacheDir(catalogName) + if err := fsc.removeOrphanedTempDirs(catalogName); err != nil { + return nil, err + } + tmpDir, err := os.MkdirTemp(fsc.cachePath, fmt.Sprintf(".%s-", catalogName)) if err != nil { return nil, fmt.Errorf("error creating temporary directory to unpack catalog metadata: %v", err) } + defer os.RemoveAll(tmpDir) if err := declcfg.WalkMetasReader(source, func(meta *declcfg.Meta, err error) error { if err != nil { @@ -164,3 +170,26 @@ func (fsc *filesystemCache) Remove(catalogName string) error { func (fsc *filesystemCache) cacheDir(catalogName string) string { return filepath.Join(fsc.cachePath, catalogName) } + +// removeOrphanedTempDirs removes temporary staging directories left behind by a +// previous writeFS call for the given catalog that was interrupted before the +// rename (e.g. pod eviction or crash). Temp dirs use the prefix ".{catalogName}-" +// as created by os.MkdirTemp. This method must be called while the write lock is held. +func (fsc *filesystemCache) removeOrphanedTempDirs(catalogName string) error { + entries, err := os.ReadDir(fsc.cachePath) + if os.IsNotExist(err) { + return nil + } + if err != nil { + return fmt.Errorf("error reading cache directory: %w", err) + } + prefix := fmt.Sprintf(".%s-", catalogName) + for _, entry := range entries { + if strings.HasPrefix(entry.Name(), prefix) { + if err := os.RemoveAll(filepath.Join(fsc.cachePath, entry.Name())); err != nil { + return fmt.Errorf("error removing orphaned temp directory %q: %w", entry.Name(), err) + } + } + } + return nil +} diff --git a/internal/operator-controller/catalogmetadata/cache/cache_test.go b/internal/operator-controller/catalogmetadata/cache/cache_test.go index ccc796082f..7f1a5713b3 100644 --- a/internal/operator-controller/catalogmetadata/cache/cache_test.go +++ b/internal/operator-controller/catalogmetadata/cache/cache_test.go @@ -169,6 +169,30 @@ func TestFilesystemCacheRemove(t *testing.T) { assert.NoDirExists(t, catalogCachePath) } +func TestFilesystemCachePutCleansOrphanedTempDirs(t *testing.T) { + const catalogName = "test-catalog" + cacheDir := t.TempDir() + c := cache.NewFilesystemCache(cacheDir) + + // Simulate temp dirs left behind by a previous interrupted Put for this catalog. + orphan1 := filepath.Join(cacheDir, ".test-catalog-1234567890") + orphan2 := filepath.Join(cacheDir, ".test-catalog-9876543210") + require.NoError(t, os.MkdirAll(orphan1, 0700)) + require.NoError(t, os.MkdirAll(orphan2, 0700)) + + // A temp dir for a different catalog should NOT be removed. + otherOrphan := filepath.Join(cacheDir, ".other-catalog-1111111111") + require.NoError(t, os.MkdirAll(otherOrphan, 0700)) + + _, err := c.Put(catalogName, "fake/catalog@sha256:fakesha", defaultContent(), nil) + require.NoError(t, err) + + assert.NoDirExists(t, orphan1, "orphaned temp dir for catalog should have been removed") + assert.NoDirExists(t, orphan2, "orphaned temp dir for catalog should have been removed") + assert.DirExists(t, otherOrphan, "temp dir for a different catalog should not be removed") + assert.DirExists(t, filepath.Join(cacheDir, catalogName), "real cache dir should exist") +} + func equalFilesystems(expected, actual fs.FS) error { normalizeJSON := func(data []byte) []byte { var v interface{}