From 5e323cc53995cb3f960fff496cf78024ea85c78c Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Tue, 19 May 2026 18:14:09 +0900 Subject: [PATCH 01/14] backup: Redis zset encoder (Phase 0a) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Decodes !zs|meta|/!zs|mem|/!zs|scr|/!zs|meta|d| snapshot records into the per-zset `zsets/.json` shape defined by the Phase 0 design (docs/design/2026_04_29_proposed_snapshot_logical_decoder.md line 334-335). Mirrors the hash/list/set encoder shape: - !zs|meta| → 8-byte BE declared member count - !zs|mem| → member m with 8-byte IEEE 754 score - !zs|scr| → secondary score index; silently discarded (!zs|mem| is source of truth at backup time) - !zs|meta|d| → delta records; silently skipped (set/hash policy) Output is sorted by raw member-name bytes so `diff -r` of two dumps with the same logical contents but mutated scores stays line-stable. Scores serialize as JSON numbers for finite values and the ZADD-conventional `"+inf"`/`"-inf"` strings for non-finite ones (json.Marshal rejects ±Inf so the score field uses json.RawMessage). NaN scores fail closed at HandleZSetMember: Redis's ZADD rejects NaN at the wire level, so a NaN in storage indicates corruption that we refuse to silently propagate into the dump. TTL routing: !redis|ttl| for a registered zset key folds into the zset JSON's expire_at_ms (matching the set/list/hash inlining), so a restorer replays ZADD + EXPIRE in one shot without chasing a separate sidecar. Self-review: 1. Data loss — !zs|mem| is the source of truth; !zs|scr| and the delta family are intentional silent skips with caller-audit notes. 2. Concurrency — RedisDB is sequential per scope (matches the existing per-DB encoder contract); no shared state across DBs. 3. Performance — per-zset state buffered in a map; flushed at Finalize. Bounded by maxWideColumnItems on the live side. Sort is O(n log n) on member-name bytes; identical cost shape to the hash/set encoders that shipped in #725/#758. 4. Consistency — JSON field-order determinism preserved (struct tags, not map). Inf score uses string form via json.RawMessage so the same `score` key accepts both shapes. 5. Coverage — 17 table-driven tests: - round-trip basic / empty zset / TTL inlining - binary member via base64 envelope - delta-key skip (both HandleZSetMeta entry + parseZSetMetaKey guard) - HandleZSetMetaDelta explicit entry point - HandleZSetScore silent discard - malformed meta length / overflow / MaxInt64 boundary - malformed member-value length / NaN rejection - ±Inf string-form serialization - members-without-meta still emits file - duplicate-members latest-wins (ZADD semantics) Caller audit for semantics-changing edit (new `case redisKindZSet:` in HandleTTL, redis_string.go:310): purely additive. The new branch fires only when zsetState() has previously registered the key; no existing handler maps to redisKindZSet, so no prior call site changes behavior. Verified via `grep -n 'redisKindZSet' internal/backup/`. --- internal/backup/redis_hash.go | 6 +- internal/backup/redis_string.go | 22 +- internal/backup/redis_zset.go | 329 ++++++++++++++++++ internal/backup/redis_zset_test.go | 527 +++++++++++++++++++++++++++++ 4 files changed, 881 insertions(+), 3 deletions(-) create mode 100644 internal/backup/redis_zset.go create mode 100644 internal/backup/redis_zset_test.go diff --git a/internal/backup/redis_hash.go b/internal/backup/redis_hash.go index e314cfbe..2e5f1e5b 100644 --- a/internal/backup/redis_hash.go +++ b/internal/backup/redis_hash.go @@ -237,7 +237,11 @@ type hashFieldRecord struct { Value json.RawMessage `json:"value"` } -func marshalHashJSON(st *redisHashState) ([]byte, error) { +// nolint comment lives at the function head: dupl pairs this with +// marshalZSetJSON, which carries the rationale (parallel design-spec +// wrappers that can't collapse into a shared helper without breaking +// JSON field-order determinism). See redis_zset.go:marshalZSetJSON. +func marshalHashJSON(st *redisHashState) ([]byte, error) { //nolint:dupl // see comment above + redis_zset.go // Sort by raw byte order for deterministic output across runs. names := make([]string, 0, len(st.fields)) for name := range st.fields { diff --git a/internal/backup/redis_string.go b/internal/backup/redis_string.go index 688225d1..a2d0ecb5 100644 --- a/internal/backup/redis_string.go +++ b/internal/backup/redis_string.go @@ -85,6 +85,7 @@ const ( redisKindHash redisKindList redisKindSet + redisKindZSet ) // RedisDB encodes one logical Redis database (`redis/db_/`). All @@ -185,6 +186,14 @@ type RedisDB struct { // Finalize into sets/.json with members sorted by raw byte // order for deterministic dump output. sets map[string]*redisSetState + + // zsets buffers per-userKey sorted-set state. Score lives in the + // !zs|mem| value (8-byte IEEE 754 big-endian); member name is the + // trailing key bytes (binary-safe). Flushed at Finalize into + // zsets/.json sorted by member-name bytes (not by score) so + // `diff -r` between dumps stays line-stable across score-only + // mutations. + zsets map[string]*redisZSetState } // NewRedisDB constructs a RedisDB rooted at /redis/db_/. @@ -204,6 +213,7 @@ func NewRedisDB(outRoot string, dbIndex int) *RedisDB { hashes: make(map[string]*redisHashState), lists: make(map[string]*redisListState), sets: make(map[string]*redisSetState), + zsets: make(map[string]*redisZSetState), } } @@ -297,9 +307,16 @@ func (r *RedisDB) HandleTTL(userKey, value []byte) error { st.expireAtMs = expireAtMs st.hasTTL = true return nil + case redisKindZSet: + // Same per-record TTL inlining: ZADD + EXPIRE replay in + // one shot from the per-zset JSON, no separate sidecar. + st := r.zsetState(userKey) + st.expireAtMs = expireAtMs + st.hasTTL = true + return nil case redisKindUnknown: // Track orphan TTL counts only — keys are unused before the - // remaining wide-column encoders (set/zset/stream) land, and + // remaining wide-column encoder (stream) lands, and // buffering them allocates proportional to user-key size // (up to 1 MiB per key) for no benefit. Codex P2 round 6. r.orphanTTLCount++ @@ -318,6 +335,7 @@ func (r *RedisDB) Finalize() error { r.flushHashes, r.flushLists, r.flushSets, + r.flushZSets, func() error { return closeJSONL(r.stringsTTL) }, func() error { return closeJSONL(r.hllTTL) }, r.closeKeymap, @@ -329,7 +347,7 @@ func (r *RedisDB) Finalize() error { if r.warn != nil && r.orphanTTLCount > 0 { r.warn("redis_orphan_ttl", "count", r.orphanTTLCount, - "hint", "remaining wide-column encoders (zset/stream) have not landed yet") + "hint", "remaining wide-column encoder (stream) has not landed yet") } return firstErr } diff --git a/internal/backup/redis_zset.go b/internal/backup/redis_zset.go new file mode 100644 index 00000000..0f4e20a4 --- /dev/null +++ b/internal/backup/redis_zset.go @@ -0,0 +1,329 @@ +package backup + +import ( + "bytes" + "encoding/binary" + "encoding/json" + "math" + "path/filepath" + "sort" + + cockroachdberr "github.com/cockroachdb/errors" +) + +// Redis zset encoder. Translates raw !zs|... snapshot records into the +// per-zset `zsets/.json` shape defined by Phase 0 +// (docs/design/2026_04_29_proposed_snapshot_logical_decoder.md). +// +// Wire format mirrors store/zset_helpers.go: +// - !zs|meta| → 8-byte BE Len +// - !zs|mem| → 8-byte IEEE 754 score +// - !zs|scr| → empty (score index) +// - !zs|meta|d|<...> → 8-byte LenDelta +// +// Routing rules: +// - `!zs|mem|` is the source of truth: it carries both the member name +// (in the trailing key bytes) and its IEEE 754 score (in the value). +// - `!zs|scr|` is a secondary index used at scan time on the live side; +// for backup purposes it is redundant and is silently skipped, the +// same way `!st|scr|` (no such prefix exists) and `!hs|meta|d|` +// deltas are skipped by the hash/set encoders. +// - `!zs|meta|d|` deltas are silently skipped; `!zs|mem|` already +// reflects the post-delta state at backup time. +const ( + RedisZSetMetaPrefix = "!zs|meta|" + RedisZSetMemberPrefix = "!zs|mem|" + RedisZSetScorePrefix = "!zs|scr|" + RedisZSetMetaDeltaPrefix = "!zs|meta|d|" + + // redisZSetScoreSize is the size of the IEEE 754 big-endian score + // stored in !zs|mem| values. Same constant as zsetMetaSizeBytes in + // store/zset_helpers.go; duplicated here to keep the backup + // package free of internal/storage imports. + redisZSetScoreSize = 8 +) + +// ErrRedisInvalidZSetMeta is returned when an !zs|meta| value is not +// the expected 8-byte big-endian member count. +var ErrRedisInvalidZSetMeta = cockroachdberr.New("backup: invalid !zs|meta| value") + +// ErrRedisInvalidZSetMember is returned when an !zs|mem| value is not +// the expected 8-byte IEEE 754 score, or contains a NaN score (Redis's +// ZADD command rejects NaN, so a NaN at backup time indicates store +// corruption and a silent fall-through would re-corrupt the restored +// cluster). +var ErrRedisInvalidZSetMember = cockroachdberr.New("backup: invalid !zs|mem| value") + +// ErrRedisInvalidZSetKey is returned when an !zs| key cannot be parsed +// for its userKeyLen+userKey (or member) segments. +var ErrRedisInvalidZSetKey = cockroachdberr.New("backup: malformed !zs| key") + +// redisZSetState buffers one userKey's zset during a snapshot scan. +// Members are stored as a map keyed by their byte string so duplicate +// HandleZSetMember calls collapse to the latest-wins score (matching +// Redis's ZADD semantics: re-adding a member overwrites the prior +// score). +type redisZSetState struct { + metaSeen bool + declaredLen int64 + members map[string]float64 + expireAtMs uint64 + hasTTL bool +} + +// HandleZSetMeta processes one !zs|meta| record. The +// value is the 8-byte BE member count. We park the declared length so +// flushZSets can warn on a mismatch with the observed member count +// and register the user key so a later !redis|ttl| record +// routes back to this zset state. +// +// !zs|meta|d|... delta keys share the !zs|meta| string prefix, so a +// snapshot dispatcher that routes by "starts with RedisZSetMetaPrefix" +// lands delta records here too. We mirror the hash/set encoder policy +// (PRs #725, #758) and silently skip the delta family because +// !zs|mem| records are the source of truth for restored zset +// contents. +func (r *RedisDB) HandleZSetMeta(key, value []byte) error { + if bytes.HasPrefix(key, []byte(RedisZSetMetaDeltaPrefix)) { + return nil + } + userKey, ok := parseZSetMetaKey(key) + if !ok { + return cockroachdberr.Wrapf(ErrRedisInvalidZSetKey, "meta key: %q", key) + } + if len(value) != redisUint64Bytes { + return cockroachdberr.Wrapf(ErrRedisInvalidZSetMeta, + "length %d != %d", len(value), redisUint64Bytes) + } + // Bounds-check the uint64 declared count before narrowing to + // int64; without this a corrupted store with the high bit set + // would wrap to a negative declaredLen and fire spurious + // redis_zset_length_mismatch warnings on every flush. Mirrors + // the hash/list/set encoders' symmetric guard. + rawLen := binary.BigEndian.Uint64(value) + if rawLen > math.MaxInt64 { + return cockroachdberr.Wrapf(ErrRedisInvalidZSetMeta, + "declared len %d overflows int64", rawLen) + } + st := r.zsetState(userKey) + st.declaredLen = int64(rawLen) //nolint:gosec // bounds-checked above + st.metaSeen = true + return nil +} + +// HandleZSetMember processes one !zs|mem| +// record. The value is the 8-byte IEEE 754 big-endian score. The +// member bytes live in the key (binary-safe). +// +// NaN scores are rejected. Redis's ZADD command itself rejects NaN +// at the wire level, so a NaN at backup time indicates either a +// corrupted store or a write path that bypassed ZADD validation — +// either way, emitting it as `score: null` or letting json.Marshal +// fail mid-flush would corrupt the dump silently. Fail closed here +// so the problem surfaces at the source record. +func (r *RedisDB) HandleZSetMember(key, value []byte) error { + userKey, member, ok := parseZSetMemberKey(key) + if !ok { + return cockroachdberr.Wrapf(ErrRedisInvalidZSetKey, "member key: %q", key) + } + if len(value) != redisZSetScoreSize { + return cockroachdberr.Wrapf(ErrRedisInvalidZSetMember, + "length %d != %d", len(value), redisZSetScoreSize) + } + score := math.Float64frombits(binary.BigEndian.Uint64(value)) + if math.IsNaN(score) { + return cockroachdberr.Wrapf(ErrRedisInvalidZSetMember, + "NaN score for member of user key %q (length %d)", userKey, len(userKey)) + } + st := r.zsetState(userKey) + st.members[string(member)] = score + return nil +} + +// HandleZSetScore accepts and discards one !zs|scr|... record. The +// score index is a live-side secondary index for ZRANGEBYSCORE; the +// authoritative member→score mapping lives in !zs|mem|. Snapshot +// dispatchers should still route this prefix here (rather than to +// the orphan-record warning sink) so a future audit that greps for +// "every !zs| prefix has a handler" finds one. +func (r *RedisDB) HandleZSetScore(_, _ []byte) error { return nil } + +// HandleZSetMetaDelta accepts and discards one !zs|meta|d|... record. +// See HandleZSetMeta's docstring for the rationale; !zs|mem| is the +// source of truth at backup time. +func (r *RedisDB) HandleZSetMetaDelta(_, _ []byte) error { return nil } + +// zsetState lazily creates per-key state. Mirrors the hash/list/set +// kindByKey-registration pattern so HandleZSetMeta, HandleZSetMember, +// and the HandleTTL back-edge all agree on the kind. +func (r *RedisDB) zsetState(userKey []byte) *redisZSetState { + uk := string(userKey) + if st, ok := r.zsets[uk]; ok { + return st + } + st := &redisZSetState{members: make(map[string]float64)} + r.zsets[uk] = st + r.kindByKey[uk] = redisKindZSet + return st +} + +// parseZSetMetaKey strips !zs|meta| and the 4-byte BE userKeyLen +// prefix. Returns (userKey, true) on success. Delta keys +// (!zs|meta|d|...) share the meta string prefix and would otherwise +// be parsed as base-meta with a garbage userKeyLen — refuse them at +// the boundary so a misrouted delta surfaces a parse error rather +// than silent state corruption. Mirrors parseHashMetaKey/parseSetMetaKey. +func parseZSetMetaKey(key []byte) ([]byte, bool) { + if bytes.HasPrefix(key, []byte(RedisZSetMetaDeltaPrefix)) { + return nil, false + } + rest := bytes.TrimPrefix(key, []byte(RedisZSetMetaPrefix)) + if len(rest) == len(key) { + return nil, false + } + return parseUserKeyLenPrefix(rest) +} + +// parseZSetMemberKey strips !zs|mem| and the 4-byte BE userKeyLen +// prefix, then returns (userKey, member, true). The member bytes +// are everything after the userKey segment — binary-safe per +// Redis's ZADD contract. +func parseZSetMemberKey(key []byte) ([]byte, []byte, bool) { + rest := bytes.TrimPrefix(key, []byte(RedisZSetMemberPrefix)) + if len(rest) == len(key) { + return nil, nil, false + } + userKey, ok := parseUserKeyLenPrefix(rest) + if !ok { + return nil, nil, false + } + member := rest[wideColumnUserKeyLenSize+len(userKey):] + return userKey, member, true +} + +// flushZSets writes one JSON file per accumulated zset to +// zsets/.json. Empty zsets (Len=0, no members) still emit a +// file when meta was seen, mirroring the hash/list/set encoders: +// their existence is observable to clients (TYPE returns "zset", +// ZCARD returns 0). Mismatched declared-vs-observed length surfaces +// an `redis_zset_length_mismatch` warning. +func (r *RedisDB) flushZSets() error { + return flushWideColumnDir(r, r.zsets, "zsets", func(dir, uk string, st *redisZSetState) error { + if r.warn != nil && st.metaSeen && int64(len(st.members)) != st.declaredLen { + r.warn("redis_zset_length_mismatch", + "user_key_len", len(uk), + "declared_len", st.declaredLen, + "observed_members", len(st.members), + "hint", "meta record's Len does not match the count of !zs|mem| keys for this user key") + } + return r.writeZSetJSON(dir, []byte(uk), st) + }) +} + +func (r *RedisDB) writeZSetJSON(dir string, userKey []byte, st *redisZSetState) error { + encoded := EncodeSegment(userKey) + if err := r.recordIfFallback(encoded, userKey); err != nil { + return err + } + path := filepath.Join(dir, encoded+".json") + body, err := marshalZSetJSON(st) + if err != nil { + return err + } + if err := writeFileAtomic(path, body); err != nil { + return cockroachdberr.WithStack(err) + } + return nil +} + +// zsetMemberRecord is the dump-format projection of one zset entry. +// `member` is binary-safe (via marshalRedisBinaryValue) and `score` +// is a JSON RawMessage so finite scores serialize as JSON numbers +// (matching the design example `"score": 100`) while ±Inf, which +// json.Marshal rejects, serialize as the conventional ZADD strings +// `"+inf"` / `"-inf"`. Phase 0b's reverse encoder reads both shapes. +type zsetMemberRecord struct { + Member json.RawMessage `json:"member"` + Score json.RawMessage `json:"score"` +} + +// marshalRedisZSetScore renders one IEEE 754 score as the dump's +// score field. ±Inf is emitted as a JSON string ("+inf"/"-inf") +// because json.Marshal returns an error for non-finite floats; this +// matches the conventional ZADD score syntax (Redis accepts "+inf", +// "-inf", "+Inf", "-Inf", and bare "inf"). NaN is unreachable here +// — HandleZSetMember rejects NaN scores at intake — so we do not +// emit a NaN representation. +func marshalRedisZSetScore(score float64) (json.RawMessage, error) { + if math.IsInf(score, 1) { + return json.RawMessage(`"+inf"`), nil + } + if math.IsInf(score, -1) { + return json.RawMessage(`"-inf"`), nil + } + out, err := json.Marshal(score) + if err != nil { + return nil, cockroachdberr.WithStack(err) + } + return out, nil +} + +// marshalZSetJSON renders one zset state as the design's +// `{format_version, members, expire_at_ms}` JSON shape. Members are +// emitted as an array (not a JSON object) sorted by raw byte order +// of the member name so identical snapshots produce identical dump +// output across runs — same rationale as the hash/set encoders' +// arrays (binary-safe member names that would collide under JSON +// object keying when percent-encoded). Each member name goes through +// marshalRedisBinaryValue so non-UTF-8 bytes round-trip via the +// `{"base64":"..."}` envelope. +// +// Member sort key is the member-name bytes only; if a Phase 0b +// consumer relies on score-sorted output, it should re-sort on +// read. We pick name order here so a `diff -r` of two dumps with +// the same logical contents but mutated scores still produces a +// stable line-by-line diff. +// +// The wrapper shape (format_version + + expire_at_ms) is +// intentionally parallel to marshalHashJSON's because the design +// specifies parallel wrappers per Redis type. Collapsing them into a +// shared helper would require either a map[string]any (kills JSON +// field-order determinism) or a generic struct (can't carry distinct +// JSON tags per type), so we keep the structural parallel explicit. +// Reviewers comparing the two functions should diff +// (hashFieldRecord, "fields") against (zsetMemberRecord, "members") — +// the only intentional divergence. +func marshalZSetJSON(st *redisZSetState) ([]byte, error) { //nolint:dupl // see comment above + names := make([]string, 0, len(st.members)) + for m := range st.members { + names = append(names, m) + } + sort.Strings(names) + out := make([]zsetMemberRecord, 0, len(names)) + for _, name := range names { + nameJSON, err := marshalRedisBinaryValue([]byte(name)) + if err != nil { + return nil, err + } + scoreJSON, err := marshalRedisZSetScore(st.members[name]) + if err != nil { + return nil, err + } + out = append(out, zsetMemberRecord{Member: nameJSON, Score: scoreJSON}) + } + type record struct { + FormatVersion uint32 `json:"format_version"` + Members []zsetMemberRecord `json:"members"` + ExpireAtMs *uint64 `json:"expire_at_ms"` + } + rec := record{FormatVersion: 1, Members: out} + if st.hasTTL { + ms := st.expireAtMs + rec.ExpireAtMs = &ms + } + body, err := json.MarshalIndent(rec, "", " ") + if err != nil { + return nil, cockroachdberr.WithStack(err) + } + return body, nil +} diff --git a/internal/backup/redis_zset_test.go b/internal/backup/redis_zset_test.go new file mode 100644 index 00000000..59e00845 --- /dev/null +++ b/internal/backup/redis_zset_test.go @@ -0,0 +1,527 @@ +package backup + +import ( + "encoding/binary" + "encoding/json" + "math" + "os" + "path/filepath" + "testing" + + "github.com/cockroachdb/errors" +) + +// encodeZSetMetaValue builds the 8-byte BE member-count value used by +// the live store/zset_helpers.go (mirror of store.MarshalZSetMeta). +func encodeZSetMetaValue(memberCount int64) []byte { + v := make([]byte, 8) + binary.BigEndian.PutUint64(v, uint64(memberCount)) //nolint:gosec + return v +} + +// encodeZSetScoreValue builds the 8-byte IEEE 754 BE score value used by +// the live store/zset_helpers.go (mirror of store.MarshalZSetScore). +func encodeZSetScoreValue(score float64) []byte { + v := make([]byte, 8) + binary.BigEndian.PutUint64(v, math.Float64bits(score)) + return v +} + +// zsetMetaKey is the test-side mirror of store.ZSetMetaKey: +// !zs|meta|. +func zsetMetaKey(userKey string) []byte { + out := []byte(RedisZSetMetaPrefix) + var l [4]byte + binary.BigEndian.PutUint32(l[:], uint32(len(userKey))) //nolint:gosec + out = append(out, l[:]...) + return append(out, userKey...) +} + +// zsetMemberKey mirrors store.ZSetMemberKey: +// !zs|mem|. Member is binary-safe. +func zsetMemberKey(userKey string, member []byte) []byte { + out := []byte(RedisZSetMemberPrefix) + var l [4]byte + binary.BigEndian.PutUint32(l[:], uint32(len(userKey))) //nolint:gosec + out = append(out, l[:]...) + out = append(out, userKey...) + return append(out, member...) +} + +// zsetScoreKey mirrors store.ZSetScoreKey: +// !zs|scr|. The encoder +// must silently discard this prefix; we only build it here to feed +// HandleZSetScore in a test that pins the "discard" contract. +func zsetScoreKey(userKey string, sortableScore [8]byte, member []byte) []byte { + out := []byte(RedisZSetScorePrefix) + var l [4]byte + binary.BigEndian.PutUint32(l[:], uint32(len(userKey))) //nolint:gosec + out = append(out, l[:]...) + out = append(out, userKey...) + out = append(out, sortableScore[:]...) + return append(out, member...) +} + +// zsetMetaDeltaKey mirrors store.ZSetMetaDeltaKey: +// !zs|meta|d|. +func zsetMetaDeltaKey(userKey string, commitTS uint64, seqInTxn uint32) []byte { + out := []byte(RedisZSetMetaDeltaPrefix) + var l [4]byte + binary.BigEndian.PutUint32(l[:], uint32(len(userKey))) //nolint:gosec + out = append(out, l[:]...) + out = append(out, userKey...) + var ts [8]byte + binary.BigEndian.PutUint64(ts[:], commitTS) + out = append(out, ts[:]...) + var seq [4]byte + binary.BigEndian.PutUint32(seq[:], seqInTxn) + return append(out, seq[:]...) +} + +func readZSetJSON(t *testing.T, path string) map[string]any { + t.Helper() + b, err := os.ReadFile(path) //nolint:gosec // test path + if err != nil { + t.Fatalf("read %s: %v", path, err) + } + var m map[string]any + if err := json.Unmarshal(b, &m); err != nil { + t.Fatalf("unmarshal: %v", err) + } + return m +} + +func zsetMembersArray(t *testing.T, m map[string]any) []any { + t.Helper() + v, ok := m["members"] + if !ok { + t.Fatalf("members missing in %+v", m) + } + raw, ok := v.([]any) + if !ok { + t.Fatalf("members = %T(%v), want []any", v, v) + } + return raw +} + +func zsetFloat(t *testing.T, m map[string]any, key string) float64 { + t.Helper() + v, ok := m[key] + if !ok { + t.Fatalf("field %q missing in %+v", key, m) + } + f, ok := v.(float64) + if !ok { + t.Fatalf("field %q = %T(%v), want float64", key, v, v) + } + return f +} + +// zsetMemberEntry is the test-side projection of one decoded +// {"member":..., "score":...} record. +type zsetMemberEntry struct { + member string + scoreNum float64 + scoreStr string // populated only when score is the string form ("+inf"/"-inf") + scoreKind string // "number" or "string" +} + +// assertZSetEntryScore checks that one decoded record matches the +// expected entry. Extracted from assertZSetMembersEqual to keep both +// callers below the cyclop complexity ceiling and so the +// type-assertion guards live in one place. +func assertZSetEntryScore(t *testing.T, idx int, rec map[string]any, w zsetMemberEntry) { + t.Helper() + switch w.scoreKind { + case "number": + score, ok := rec["score"].(float64) + if !ok { + t.Fatalf("members[%d].score = %T(%v), want number", idx, rec["score"], rec["score"]) + } + if score != w.scoreNum { + t.Fatalf("members[%d].score = %v, want %v", idx, score, w.scoreNum) + } + case "string": + if rec["score"] != w.scoreStr { + t.Fatalf("members[%d].score = %v(%T), want %q (Inf string form)", idx, rec["score"], rec["score"], w.scoreStr) + } + default: + t.Fatalf("test bug: unknown scoreKind %q", w.scoreKind) + } +} + +// assertZSetMembersEqual checks that the JSON `members` array matches +// the given expected entries. +func assertZSetMembersEqual(t *testing.T, members []any, want []zsetMemberEntry) { + t.Helper() + if len(members) != len(want) { + t.Fatalf("len(members) = %d, want %d (got %v)", len(members), len(want), members) + } + for i, w := range want { + rec, ok := members[i].(map[string]any) + if !ok { + t.Fatalf("members[%d] = %T(%v), want object", i, members[i], members[i]) + } + if rec["member"] != w.member { + t.Fatalf("members[%d].member = %v, want %v", i, rec["member"], w.member) + } + assertZSetEntryScore(t, i, rec, w) + } +} + +// TestRedisDB_ZSetRoundTripBasic confirms a multi-member zset +// round-trips through the encoder, that members are sorted by +// raw byte order (NOT score order, see marshalZSetJSON's docstring), +// and that score values land on the JSON record as plain numbers. +func TestRedisDB_ZSetRoundTripBasic(t *testing.T) { + t.Parallel() + db, root := newRedisDB(t) + if err := db.HandleZSetMeta(zsetMetaKey("leaderboard"), encodeZSetMetaValue(3)); err != nil { + t.Fatalf("HandleZSetMeta: %v", err) + } + // Submit out of byte order and out of score order to exercise the sort. + for _, e := range []struct { + member string + score float64 + }{ + {"charlie", 30}, + {"alice", 100}, + {"bob", 50}, + } { + if err := db.HandleZSetMember(zsetMemberKey("leaderboard", []byte(e.member)), encodeZSetScoreValue(e.score)); err != nil { + t.Fatalf("HandleZSetMember(%s): %v", e.member, err) + } + } + if err := db.Finalize(); err != nil { + t.Fatalf("Finalize: %v", err) + } + got := readZSetJSON(t, filepath.Join(root, "redis", "db_0", "zsets", "leaderboard.json")) + if zsetFloat(t, got, "format_version") != 1 { + t.Fatalf("format_version = %v", got["format_version"]) + } + if got["expire_at_ms"] != nil { + t.Fatalf("expire_at_ms must be nil without TTL, got %v", got["expire_at_ms"]) + } + // Sorted by member bytes: alice < bob < charlie. + assertZSetMembersEqual(t, zsetMembersArray(t, got), []zsetMemberEntry{ + {member: "alice", scoreNum: 100, scoreKind: "number"}, + {member: "bob", scoreNum: 50, scoreKind: "number"}, + {member: "charlie", scoreNum: 30, scoreKind: "number"}, + }) +} + +// TestRedisDB_ZSetEmptyZSetStillEmitsFile mirrors the hash/list/set +// emit-empty rule: ZCARD==0 is observable to clients (TYPE returns +// "zset"), so the dump must preserve existence. +func TestRedisDB_ZSetEmptyZSetStillEmitsFile(t *testing.T) { + t.Parallel() + db, root := newRedisDB(t) + if err := db.HandleZSetMeta(zsetMetaKey("empty"), encodeZSetMetaValue(0)); err != nil { + t.Fatal(err) + } + if err := db.Finalize(); err != nil { + t.Fatal(err) + } + got := readZSetJSON(t, filepath.Join(root, "redis", "db_0", "zsets", "empty.json")) + if members := zsetMembersArray(t, got); len(members) != 0 { + t.Fatalf("empty zset should emit empty members array, got %v", members) + } +} + +// TestRedisDB_ZSetTTLInlinedFromScanIndex pins that !redis|ttl| +// records for a zset user key fold into the zset's JSON expire_at_ms, +// not a separate sidecar (the strings/HLL pattern). +func TestRedisDB_ZSetTTLInlinedFromScanIndex(t *testing.T) { + t.Parallel() + db, root := newRedisDB(t) + if err := db.HandleZSetMeta(zsetMetaKey("k"), encodeZSetMetaValue(1)); err != nil { + t.Fatal(err) + } + if err := db.HandleZSetMember(zsetMemberKey("k", []byte("m")), encodeZSetScoreValue(1.5)); err != nil { + t.Fatal(err) + } + if err := db.HandleTTL([]byte("k"), encodeTTLValue(fixedExpireMs)); err != nil { + t.Fatal(err) + } + if err := db.Finalize(); err != nil { + t.Fatal(err) + } + got := readZSetJSON(t, filepath.Join(root, "redis", "db_0", "zsets", "k.json")) + if zsetFloat(t, got, "expire_at_ms") != float64(fixedExpireMs) { + t.Fatalf("expire_at_ms = %v want %d", got["expire_at_ms"], fixedExpireMs) + } + if _, err := os.Stat(filepath.Join(root, "redis", "db_0", "zsets_ttl.jsonl")); !os.IsNotExist(err) { + t.Fatalf("unexpected zset TTL sidecar: stat err=%v", err) + } +} + +// TestRedisDB_ZSetLengthMismatchWarns pins the warn-on-mismatch +// contract — same shape as the hash/list/set encoders. +func TestRedisDB_ZSetLengthMismatchWarns(t *testing.T) { + t.Parallel() + db, _ := newRedisDB(t) + var events []string + db.WithWarnSink(func(event string, _ ...any) { events = append(events, event) }) + if err := db.HandleZSetMeta(zsetMetaKey("z"), encodeZSetMetaValue(5)); err != nil { + t.Fatal(err) + } + if err := db.HandleZSetMember(zsetMemberKey("z", []byte("only")), encodeZSetScoreValue(1)); err != nil { + t.Fatal(err) + } + if err := db.Finalize(); err != nil { + t.Fatal(err) + } + want := "redis_zset_length_mismatch" + found := false + for _, e := range events { + if e == want { + found = true + break + } + } + if !found { + t.Fatalf("expected %q warning, got %v", want, events) + } +} + +// TestRedisDB_ZSetBinaryMemberUsesBase64Envelope confirms non-UTF-8 +// member bytes round-trip via the typed `{"base64":"..."}` envelope. +func TestRedisDB_ZSetBinaryMemberUsesBase64Envelope(t *testing.T) { + t.Parallel() + db, root := newRedisDB(t) + if err := db.HandleZSetMeta(zsetMetaKey("blob"), encodeZSetMetaValue(1)); err != nil { + t.Fatal(err) + } + if err := db.HandleZSetMember(zsetMemberKey("blob", []byte{0x80, 0xff, 0x01}), encodeZSetScoreValue(42)); err != nil { + t.Fatal(err) + } + if err := db.Finalize(); err != nil { + t.Fatal(err) + } + got := readZSetJSON(t, filepath.Join(root, "redis", "db_0", "zsets", "blob.json")) + members := zsetMembersArray(t, got) + if len(members) != 1 { + t.Fatalf("len(members) = %d, want 1", len(members)) + } + rec, ok := members[0].(map[string]any) + if !ok { + t.Fatalf("members[0] = %T(%v), want object", members[0], members[0]) + } + envelope, ok := rec["member"].(map[string]any) + if !ok { + t.Fatalf("expected base64 envelope on member, got %T(%v)", rec["member"], rec["member"]) + } + if envelope["base64"] == "" { + t.Fatalf("base64 envelope missing payload: %v", envelope) + } +} + +// TestRedisDB_ZSetHandleMetaSkipsDeltaKey pins that the !zs|meta|d|... +// family is silently skipped by HandleZSetMeta. Mirrors the +// hash/list/set delta-key guards. +func TestRedisDB_ZSetHandleMetaSkipsDeltaKey(t *testing.T) { + t.Parallel() + db, _ := newRedisDB(t) + deltaValue := make([]byte, 8) + if err := db.HandleZSetMeta(zsetMetaDeltaKey("k", 7, 0), deltaValue); err != nil { + t.Fatalf("delta key must be silently skipped, got %v", err) + } + if err := db.Finalize(); err != nil { + t.Fatal(err) + } + if _, err := os.Stat(filepath.Join(db.outRoot, "redis", "db_0", "zsets")); !os.IsNotExist(err) { + t.Fatalf("delta-only run should not create zsets/, stat err=%v", err) + } +} + +// TestRedisDB_ZSetHandleZSetMetaDelta exercises the explicit +// delta-handler entry point (the dispatcher routes !zs|meta|d| keys +// here directly when it can distinguish them from base meta). +func TestRedisDB_ZSetHandleZSetMetaDelta(t *testing.T) { + t.Parallel() + db, _ := newRedisDB(t) + if err := db.HandleZSetMetaDelta(zsetMetaDeltaKey("k", 1, 0), make([]byte, 8)); err != nil { + t.Fatalf("HandleZSetMetaDelta must accept any value, got %v", err) + } + if err := db.Finalize(); err != nil { + t.Fatal(err) + } +} + +// TestRedisDB_ZSetHandleZSetScoreSilentlyDiscards pins that the +// !zs|scr| score-index prefix is dropped at intake. !zs|mem| carries +// the authoritative member→score mapping. +func TestRedisDB_ZSetHandleZSetScoreSilentlyDiscards(t *testing.T) { + t.Parallel() + db, _ := newRedisDB(t) + var sortable [8]byte + binary.BigEndian.PutUint64(sortable[:], 0xdeadbeef) + if err := db.HandleZSetScore(zsetScoreKey("k", sortable, []byte("m")), nil); err != nil { + t.Fatalf("HandleZSetScore must not return errors, got %v", err) + } + if err := db.Finalize(); err != nil { + t.Fatal(err) + } + // No state was registered — no directory, no warning. + if _, err := os.Stat(filepath.Join(db.outRoot, "redis", "db_0", "zsets")); !os.IsNotExist(err) { + t.Fatalf("score-only run must not create zsets/, stat err=%v", err) + } +} + +// TestRedisDB_ZSetRejectsMalformedMetaValueLength pins that an +// !zs|meta| value of the wrong length surfaces as an error. +func TestRedisDB_ZSetRejectsMalformedMetaValueLength(t *testing.T) { + t.Parallel() + db, _ := newRedisDB(t) + err := db.HandleZSetMeta(zsetMetaKey("k"), []byte{0x00}) + if !errors.Is(err, ErrRedisInvalidZSetMeta) { + t.Fatalf("err=%v want ErrRedisInvalidZSetMeta", err) + } +} + +// TestRedisDB_ZSetRejectsOverflowingMetaValue pins the high-bit +// overflow guard — same shape as hash + list + set encoders. +func TestRedisDB_ZSetRejectsOverflowingMetaValue(t *testing.T) { + t.Parallel() + db, _ := newRedisDB(t) + overflow := make([]byte, 8) + binary.BigEndian.PutUint64(overflow, 1<<63) + err := db.HandleZSetMeta(zsetMetaKey("k"), overflow) + if !errors.Is(err, ErrRedisInvalidZSetMeta) { + t.Fatalf("err=%v want ErrRedisInvalidZSetMeta", err) + } +} + +// TestRedisDB_ZSetRejectsMalformedMemberValueLength pins that an +// !zs|mem| value of the wrong length surfaces as an error. The score +// MUST be exactly 8 bytes — anything else points at corruption or a +// wire-format change we want to detect at intake. +func TestRedisDB_ZSetRejectsMalformedMemberValueLength(t *testing.T) { + t.Parallel() + db, _ := newRedisDB(t) + err := db.HandleZSetMember(zsetMemberKey("k", []byte("m")), []byte{0x00, 0x01, 0x02}) + if !errors.Is(err, ErrRedisInvalidZSetMember) { + t.Fatalf("err=%v want ErrRedisInvalidZSetMember", err) + } +} + +// TestRedisDB_ZSetRejectsNaNScore pins that NaN scores fail closed. +// Redis's ZADD command itself rejects NaN at the wire level +// (Lua_redis_pcall → ZADD with NaN returns NOTNUM), so a NaN at +// backup time indicates corruption or a bypass; silently dumping +// `score: null` would re-corrupt the restored cluster. +func TestRedisDB_ZSetRejectsNaNScore(t *testing.T) { + t.Parallel() + db, _ := newRedisDB(t) + nan := make([]byte, 8) + binary.BigEndian.PutUint64(nan, math.Float64bits(math.NaN())) + err := db.HandleZSetMember(zsetMemberKey("k", []byte("m")), nan) + if !errors.Is(err, ErrRedisInvalidZSetMember) { + t.Fatalf("err=%v want ErrRedisInvalidZSetMember", err) + } +} + +// TestRedisDB_ZSetInfinityScoresUseStringForm pins that ±Inf +// scores serialize as the ZADD-conventional strings `"+inf"`/`"-inf"` +// because json.Marshal returns an error for non-finite floats. The +// per-record `score` field is a json.RawMessage so a finite score +// emits as a number AND an Inf score emits as a string, with the +// same key name. +func TestRedisDB_ZSetInfinityScoresUseStringForm(t *testing.T) { + t.Parallel() + db, root := newRedisDB(t) + if err := db.HandleZSetMeta(zsetMetaKey("inf"), encodeZSetMetaValue(2)); err != nil { + t.Fatal(err) + } + if err := db.HandleZSetMember(zsetMemberKey("inf", []byte("top")), encodeZSetScoreValue(math.Inf(1))); err != nil { + t.Fatal(err) + } + if err := db.HandleZSetMember(zsetMemberKey("inf", []byte("bottom")), encodeZSetScoreValue(math.Inf(-1))); err != nil { + t.Fatal(err) + } + if err := db.Finalize(); err != nil { + t.Fatal(err) + } + got := readZSetJSON(t, filepath.Join(root, "redis", "db_0", "zsets", "inf.json")) + // Sorted by member name: bottom < top. + assertZSetMembersEqual(t, zsetMembersArray(t, got), []zsetMemberEntry{ + {member: "bottom", scoreStr: "-inf", scoreKind: "string"}, + {member: "top", scoreStr: "+inf", scoreKind: "string"}, + }) +} + +// TestRedisDB_ZSetParseMetaKeyRejectsDelta is the parser-level guard +// companion to the dispatcher skip — pins that a future refactor +// bypassing HandleZSetMeta's prefix check still surfaces a parse +// failure rather than silent state corruption. +func TestRedisDB_ZSetParseMetaKeyRejectsDelta(t *testing.T) { + t.Parallel() + if _, ok := parseZSetMetaKey(zsetMetaDeltaKey("k", 1, 0)); ok { + t.Fatalf("parseZSetMetaKey must reject delta-prefixed keys") + } +} + +// TestRedisDB_ZSetMembersWithoutMetaStillEmitsFile pins the +// members-without-meta contract: members may arrive before (or +// without) meta, and the encoder must still emit the JSON without +// firing the length-mismatch warning. Mirrors the set encoder's +// rule (PR #758). +func TestRedisDB_ZSetMembersWithoutMetaStillEmitsFile(t *testing.T) { + t.Parallel() + db, root := newRedisDB(t) + var events []string + db.WithWarnSink(func(event string, _ ...any) { events = append(events, event) }) + if err := db.HandleZSetMember(zsetMemberKey("z", []byte("a")), encodeZSetScoreValue(1)); err != nil { + t.Fatal(err) + } + if err := db.Finalize(); err != nil { + t.Fatal(err) + } + got := readZSetJSON(t, filepath.Join(root, "redis", "db_0", "zsets", "z.json")) + members := zsetMembersArray(t, got) + if len(members) != 1 { + t.Fatalf("len(members) = %d, want 1", len(members)) + } + for _, e := range events { + if e == "redis_zset_length_mismatch" { + t.Fatalf("members-without-meta must not fire length-mismatch warning, got events %v", events) + } + } +} + +// TestRedisDB_ZSetDuplicateMembersUseLatestScore pins ZADD's +// latest-wins semantics: re-adding a member with a different score +// overwrites the prior score. (Snapshot iterators don't typically +// emit duplicates, but a recovery replay across overlapping windows +// could.) +func TestRedisDB_ZSetDuplicateMembersUseLatestScore(t *testing.T) { + t.Parallel() + db, root := newRedisDB(t) + if err := db.HandleZSetMember(zsetMemberKey("z", []byte("m")), encodeZSetScoreValue(1)); err != nil { + t.Fatal(err) + } + if err := db.HandleZSetMember(zsetMemberKey("z", []byte("m")), encodeZSetScoreValue(2)); err != nil { + t.Fatal(err) + } + if err := db.Finalize(); err != nil { + t.Fatal(err) + } + got := readZSetJSON(t, filepath.Join(root, "redis", "db_0", "zsets", "z.json")) + // Latest-wins: the second HandleZSetMember overwrites score 1 with score 2. + assertZSetMembersEqual(t, zsetMembersArray(t, got), []zsetMemberEntry{ + {member: "m", scoreNum: 2, scoreKind: "number"}, + }) +} + +// TestRedisDB_ZSetMaxInt64DeclaredLen pins the math.MaxInt64 +// boundary — declaredLen=math.MaxInt64 must be accepted, only > that +// rejected. +func TestRedisDB_ZSetMaxInt64DeclaredLen(t *testing.T) { + t.Parallel() + db, _ := newRedisDB(t) + boundary := make([]byte, 8) + binary.BigEndian.PutUint64(boundary, math.MaxInt64) // exactly the int64 max — must NOT reject + if err := db.HandleZSetMeta(zsetMetaKey("k"), boundary); err != nil { + t.Fatalf("math.MaxInt64 boundary must be accepted, got %v", err) + } +} From 63e54d91a77f3948ed43c9bff3cb6f5a7dd60231 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Tue, 19 May 2026 18:40:53 +0900 Subject: [PATCH 02/14] =?UTF-8?q?backup(zset):=20PR790=20r1=20codex=20P1?= =?UTF-8?q?=20=E2=80=94=20buffer=20TTLs=20that=20arrive=20before=20typed?= =?UTF-8?q?=20records?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pebble snapshots are emitted in encoded-key order (store/snapshot_pebble.go::iter.First()+Next()), and `!redis|ttl|` lex-sorts BEFORE every wide-column prefix where the type letter is >= 's' (`!st|` set, `!stream|`, `!zs|` zset — because `r` < `s`/`z`). The original HandleTTL routed any unknown-kind expiry straight into orphanTTLCount, so for sets, streams, and zsets the TTL was DROPPED before zsetState/setState/streamState could claim it. Restored sets/zsets/streams with TTL became permanent. Codex P1 finding on PR #790. The same bug exists in the already- merged set encoder (PR #758); this commit fixes both retroactively. Stream encoder (PR #791) inherits the fix once rebased. Fix: HandleTTL parks unknown-kind expiries in a new pendingTTL map. Each wide-column state-init that may face the bad ordering (setState, zsetState — and streamState once PR #791 lands) drains the entry on first user-key registration via claimPendingTTL(). Finalize counts whatever remains in pendingTTL as truly-unmatched orphans (a TTL whose user key never appeared in any typed record — indicates store corruption or an unknown type prefix). Semantic-change caller audit (per /loop standing instruction): - HandleTTL's redisKindUnknown branch: previously incremented orphanTTLCount immediately; now buffers and lets Finalize count. All callers: only the per-record dispatcher in cmd/elastickv-snapshot-decode (not yet built — Phase 0a follow- up). No external caller mutates orphanTTLCount today. - TestRedisDB_OrphanTTLCountedNotBuffered: updated to assert intake-time orphanTTLCount==0 + pendingTTL grows, then post- Finalize orphanTTLCount==n. - New caller claimPendingTTL: called only by zsetState and setState in this PR. hashState/listState don't call it because their type prefixes (`!hs|`/`!lst|`) lex-sort BEFORE `!redis|ttl|` so the typed record arrives first; pendingTTL is always empty for them. Verified via `grep -n 'claimPendingTTL' internal/backup/`. New tests: - TestRedisDB_ZSetTTLArrivesBeforeRows — pins the fix for zsets. - TestRedisDB_SetTTLArrivesBeforeRows — retroactive coverage for PR #758's set encoder. - TestRedisDB_OrphanTTLCountsTrulyUnmatchedKeys — pins the new Finalize-time orphan semantics. Self-review: 1. Data loss — the original code DROPPED real TTL'd sets/zsets/streams on every backup. This fix recovers them. No new data-loss surface introduced. 2. Concurrency — pendingTTL is added to RedisDB which is already sequential-per-scope; no new locking required. 3. Performance — pendingTTL holds (string-userKey, uint64-expireAt) pairs. The string allocation cost matches kindByKey's, which we already pay for every typed record. The original P2 round 6 concern (don't buffer arbitrarily-large value payloads) is preserved: we still only buffer 8-byte expiry, not value bytes. 4. Consistency — drain happens at FIRST state registration so a later HandleTTL re-arrival (which would route through the redisKindSet/ZSet case in HandleTTL) is a no-op (st.expireAtMs gets overwritten with the same value). 5. Coverage — 3 new tests + 1 updated test. All 64 redis tests pass. --- internal/backup/redis_set.go | 12 +++ internal/backup/redis_string.go | 107 +++++++++++++++++++++++---- internal/backup/redis_string_test.go | 31 ++++++-- internal/backup/redis_zset.go | 11 +++ internal/backup/redis_zset_test.go | 83 +++++++++++++++++++++ 5 files changed, 224 insertions(+), 20 deletions(-) diff --git a/internal/backup/redis_set.go b/internal/backup/redis_set.go index 7cd4a440..392ae8ce 100644 --- a/internal/backup/redis_set.go +++ b/internal/backup/redis_set.go @@ -114,12 +114,24 @@ func (r *RedisDB) HandleSetMetaDelta(_, _ []byte) error { return nil } // setState lazily creates per-key state. Mirrors the hash/list // kindByKey-registration pattern so HandleSetMeta, HandleSetMember, // and the HandleTTL back-edge all agree on the kind. +// +// On first registration we drain any pendingTTL for the user key. +// `!redis|ttl|` lex-sorts BEFORE `!st|...` (because `r` < `s`), +// so in real snapshot order the TTL arrives FIRST; HandleTTL parks +// it in pendingTTL, and this function inlines it into the set's +// `expire_at_ms`. Without this drain step, every TTL'd set would +// restore as permanent — a latent bug in PR #758 surfaced by codex +// on PR #790. Phase 0a tests added in the same PR pin the ordering. func (r *RedisDB) setState(userKey []byte) *redisSetState { uk := string(userKey) if st, ok := r.sets[uk]; ok { return st } st := &redisSetState{members: make(map[string]struct{})} + if expireAtMs, ok := r.claimPendingTTL(userKey); ok { + st.expireAtMs = expireAtMs + st.hasTTL = true + } r.sets[uk] = st r.kindByKey[uk] = redisKindSet return st diff --git a/internal/backup/redis_string.go b/internal/backup/redis_string.go index a2d0ecb5..76040613 100644 --- a/internal/backup/redis_string.go +++ b/internal/backup/redis_string.go @@ -194,6 +194,26 @@ type RedisDB struct { // `diff -r` between dumps stays line-stable across score-only // mutations. zsets map[string]*redisZSetState + + // pendingTTL buffers expiries whose user-key prefix sorts AFTER + // `!redis|ttl|` in the snapshot's lex-ordered stream. Pebble + // snapshots emit records in encoded-key order + // (`store/snapshot_pebble.go::iter.First()/Next()`), and + // `!redis|ttl|` lex-sorts before all `!st|`/`!stream|`/`!zs|` + // prefixes (`r` < `s`/`s`/`z`). Without buffering, HandleTTL + // would see kindByKey == redisKindUnknown and count the TTL + // as an orphan, dropping it before zsetState / setState / + // streamState had a chance to claim the user key — TTL'd + // sorted sets, sets, and streams would silently restore as + // permanent. + // + // Lifecycle: HandleTTL files the expiry here when kind is + // still unknown. Each wide-column state-init function + // (setState / zsetState / streamState etc.) drains the entry + // when it first registers the user key. Finalize fires the + // orphan-TTL warning for whatever remains (those keys never + // appeared as a typed record — likely a corrupted store). + pendingTTL map[string]uint64 } // NewRedisDB constructs a RedisDB rooted at /redis/db_/. @@ -214,6 +234,7 @@ func NewRedisDB(outRoot string, dbIndex int) *RedisDB { lists: make(map[string]*redisListState), sets: make(map[string]*redisSetState), zsets: make(map[string]*redisZSetState), + pendingTTL: make(map[string]uint64), } } @@ -256,15 +277,32 @@ func (r *RedisDB) HandleHLL(userKey, value []byte) error { return r.writeBlob("hll", userKey, value) } -// HandleTTL processes one !redis|ttl| record. Routing depends on -// what HandleString/HandleHLL recorded for the same userKey: +// HandleTTL processes one !redis|ttl| record. Routing +// depends on what the encoder has previously recorded for the user +// key. There are two ordering regimes the snapshot stream presents: +// +// 1. Prefix sorts BEFORE !redis|ttl| in encoded-key order +// (!hs|, !lst|, !redis|str|, !redis|hll|). The typed record +// arrives FIRST, kindByKey is already set when HandleTTL fires, +// and we route directly to the per-type sidecar / inline field. +// 2. Prefix sorts AFTER !redis|ttl| (!st|, !stream|, !zs|, because +// `r` < `s`/`s`/`z`). The TTL arrives FIRST and kindByKey is +// still redisKindUnknown. We park the expiry in pendingTTL and +// let each wide-column state-init function (setState / +// zsetState / streamState) drain it when the user key finally +// surfaces as a typed record. Codex P1 finding on PR #790. // -// - redisKindHLL -> hll_ttl.jsonl -// - redisKindString -> strings_ttl.jsonl (legacy strings, whose TTL -// lives in !redis|ttl| rather than the inline magic-prefix header) -// - redisKindUnknown -> counted in orphanTTLCount; reported via the -// warn sink on Finalize because Phase 0a's wide-column encoders -// have not landed yet. +// Routing: +// +// - redisKindHLL -> hll_ttl.jsonl (case 1) +// - redisKindString -> strings_ttl.jsonl (case 1; legacy strings +// whose TTL lives in !redis|ttl| rather than the inline header) +// - redisKindHash/List/Set/ZSet/Stream -> inlined into the +// per-key JSON (case 1 for hash/list, case 2 for set/zset/stream +// where the state-init already drained from pendingTTL before +// HandleTTL would even be called the second time) +// - redisKindUnknown -> bufferPendingTTL. Finalize counts truly +// unmatched entries (key never registered as a typed record). func (r *RedisDB) HandleTTL(userKey, value []byte) error { expireAtMs, err := decodeRedisTTLValue(value) if err != nil { @@ -315,16 +353,48 @@ func (r *RedisDB) HandleTTL(userKey, value []byte) error { st.hasTTL = true return nil case redisKindUnknown: - // Track orphan TTL counts only — keys are unused before the - // remaining wide-column encoder (stream) lands, and - // buffering them allocates proportional to user-key size - // (up to 1 MiB per key) for no benefit. Codex P2 round 6. - r.orphanTTLCount++ + // Park the expiry until a wide-column Handle*Meta / + // Handle*Member / Handle*Entry registers the user key. + // Without this buffering, sorted-set / set / stream TTLs + // would be counted as orphans and dropped because their + // type prefixes lex-sort AFTER `!redis|ttl|` in the + // snapshot's encoded-key order. Codex P1 (PR #790). + // + // We store userKey as a string copy (`string([]byte)` + // allocates) rather than the alias slice — the snapshot + // reader reuses key buffers across iterations, so a slice + // alias would race with the next record. + r.pendingTTL[string(userKey)] = expireAtMs return nil } return nil } +// claimPendingTTL drains any buffered TTL for userKey into the +// caller-provided state. Called by the wide-column state-init +// functions (setState / zsetState / streamState) when they first +// register a user key, so the parked expiry inlines into the same +// per-key JSON the rest of the record assembles. +// +// Returns (expireAtMs, true) when a buffered TTL existed. The +// caller should set state.expireAtMs / state.hasTTL on the +// returned value. The pending entry is removed so Finalize's +// orphan-count loop only sees truly-unmatched TTLs. +// +// Safe to call from hashState/listState too even though those +// types' typed records sort before `!redis|ttl|`; pendingTTL will +// always be empty for them. Keeping the call site uniform keeps +// the state-init contract simple. +func (r *RedisDB) claimPendingTTL(userKey []byte) (uint64, bool) { + uk := string(userKey) + expireAtMs, ok := r.pendingTTL[uk] + if !ok { + return 0, false + } + delete(r.pendingTTL, uk) + return expireAtMs, true +} + // Finalize flushes all open sidecar writers and emits warnings for any // pending TTL records whose user key was never claimed by the wide-column // encoders. Call exactly once after every snapshot record has been @@ -344,10 +414,19 @@ func (r *RedisDB) Finalize() error { firstErr = err } } + // At this point all type-prefixed records have been processed + // and every wide-column state-init drained its claimPendingTTL. + // Whatever remains in pendingTTL is truly unmatched — the + // user key never appeared as a typed record. Likely causes: + // store corruption, a snapshot mid-write where the typed + // record was dropped, or a `!redis|ttl|` entry written for a + // key whose type prefix we don't recognise (a future Redis + // type added on the live side without a backup-encoder update). + r.orphanTTLCount += len(r.pendingTTL) if r.warn != nil && r.orphanTTLCount > 0 { r.warn("redis_orphan_ttl", "count", r.orphanTTLCount, - "hint", "remaining wide-column encoder (stream) has not landed yet") + "hint", "TTL records whose user key never appeared in a typed record — possible store corruption or an unknown type prefix") } return firstErr } diff --git a/internal/backup/redis_string_test.go b/internal/backup/redis_string_test.go index 152a55d4..5343d3b5 100644 --- a/internal/backup/redis_string_test.go +++ b/internal/backup/redis_string_test.go @@ -579,11 +579,21 @@ func TestRedisDB_NoKeymapWhenAllReversible(t *testing.T) { func TestRedisDB_OrphanTTLCountedNotBuffered(t *testing.T) { t.Parallel() - // Codex P2 round 6: orphan TTL records (those with no prior - // HandleString/HandleHLL claim) must be counted only — the - // per-key payload would allocate proportional to user-key size - // and is unused before the wide-column encoders land. Drive a - // sample of orphan records and assert the count, not a buffer. + // Codex P1 (PR #790): orphan TTL records are now BUFFERED in + // pendingTTL during intake — the wide-column state-init + // functions need to drain them when a typed record finally + // registers a user key. The buffer holds (string-userKey, + // uint64-expireAt) pairs; the per-record allocation cost is the + // same as kindByKey's, which we already pay for every typed + // record. The original Codex P2 round 6 concern (don't buffer + // arbitrarily-large payload bytes) is preserved — we still + // don't keep the value bytes. + // + // At Finalize, entries still in pendingTTL are counted as truly + // unmatched orphans. This test now asserts: + // - During intake: orphanTTLCount stays at 0, pendingTTL grows. + // - After Finalize: orphanTTLCount == n (no typed record ever + // drained the entries). db, _ := newRedisDB(t) const n = 10_000 for i := 0; i < n; i++ { @@ -593,8 +603,17 @@ func TestRedisDB_OrphanTTLCountedNotBuffered(t *testing.T) { t.Fatalf("HandleTTL[%d]: %v", i, err) } } + if db.orphanTTLCount != 0 { + t.Fatalf("orphanTTLCount = %d at intake, want 0 (buffered)", db.orphanTTLCount) + } + if len(db.pendingTTL) != n { + t.Fatalf("pendingTTL len = %d, want %d", len(db.pendingTTL), n) + } + if err := db.Finalize(); err != nil { + t.Fatalf("Finalize: %v", err) + } if db.orphanTTLCount != n { - t.Fatalf("orphanTTLCount = %d, want %d", db.orphanTTLCount, n) + t.Fatalf("orphanTTLCount = %d after Finalize, want %d", db.orphanTTLCount, n) } } diff --git a/internal/backup/redis_zset.go b/internal/backup/redis_zset.go index 0f4e20a4..8350fdbc 100644 --- a/internal/backup/redis_zset.go +++ b/internal/backup/redis_zset.go @@ -156,12 +156,23 @@ func (r *RedisDB) HandleZSetMetaDelta(_, _ []byte) error { return nil } // zsetState lazily creates per-key state. Mirrors the hash/list/set // kindByKey-registration pattern so HandleZSetMeta, HandleZSetMember, // and the HandleTTL back-edge all agree on the kind. +// +// On first registration we drain any pendingTTL for the user key. +// `!redis|ttl|` lex-sorts BEFORE `!zs|...` (because `r` < `z`), +// so in real snapshot order the TTL arrives FIRST; HandleTTL parks +// it in pendingTTL, and this function inlines it into the zset's +// `expire_at_ms`. Without this drain step, every TTL'd zset would +// restore as permanent. Codex P1 finding on PR #790. func (r *RedisDB) zsetState(userKey []byte) *redisZSetState { uk := string(userKey) if st, ok := r.zsets[uk]; ok { return st } st := &redisZSetState{members: make(map[string]float64)} + if expireAtMs, ok := r.claimPendingTTL(userKey); ok { + st.expireAtMs = expireAtMs + st.hasTTL = true + } r.zsets[uk] = st r.kindByKey[uk] = redisKindZSet return st diff --git a/internal/backup/redis_zset_test.go b/internal/backup/redis_zset_test.go index 59e00845..ecfa7563 100644 --- a/internal/backup/redis_zset_test.go +++ b/internal/backup/redis_zset_test.go @@ -513,6 +513,89 @@ func TestRedisDB_ZSetDuplicateMembersUseLatestScore(t *testing.T) { }) } +// TestRedisDB_ZSetTTLArrivesBeforeRows pins the codex P1 fix: +// `!redis|ttl|` lex-sorts BEFORE `!zs|...` because `r` < `z`, +// so in a real Pebble snapshot's encoded-key order the TTL record +// arrives FIRST. The encoder MUST buffer the expiry in pendingTTL +// and drain it when the zset first registers via zsetState, +// inlining the value into the zset's JSON expire_at_ms. Without +// this drain step every TTL'd zset would restore as permanent — +// the P1 finding on PR #790 (chatgpt-codex-connector). +func TestRedisDB_ZSetTTLArrivesBeforeRows(t *testing.T) { + t.Parallel() + db, root := newRedisDB(t) + // Snapshot order: TTL first, then meta + member. + if err := db.HandleTTL([]byte("k"), encodeTTLValue(fixedExpireMs)); err != nil { + t.Fatal(err) + } + if err := db.HandleZSetMeta(zsetMetaKey("k"), encodeZSetMetaValue(1)); err != nil { + t.Fatal(err) + } + if err := db.HandleZSetMember(zsetMemberKey("k", []byte("m")), encodeZSetScoreValue(1.5)); err != nil { + t.Fatal(err) + } + if err := db.Finalize(); err != nil { + t.Fatal(err) + } + got := readZSetJSON(t, filepath.Join(root, "redis", "db_0", "zsets", "k.json")) + if zsetFloat(t, got, "expire_at_ms") != float64(fixedExpireMs) { + t.Fatalf("expire_at_ms = %v want %d — pendingTTL drain failed", got["expire_at_ms"], fixedExpireMs) + } +} + +// TestRedisDB_SetTTLArrivesBeforeRows pins the same ordering fix +// for sets (`!redis|ttl|` lex-sorts before `!st|...` because +// `r` < `s`). Retroactive coverage for PR #758, which shipped the +// set encoder before the pendingTTL infrastructure existed. +func TestRedisDB_SetTTLArrivesBeforeRows(t *testing.T) { + t.Parallel() + db, root := newRedisDB(t) + // Snapshot order: TTL first, then meta + member. + if err := db.HandleTTL([]byte("k"), encodeTTLValue(fixedExpireMs)); err != nil { + t.Fatal(err) + } + if err := db.HandleSetMeta(setMetaKey("k"), encodeSetMetaValue(1)); err != nil { + t.Fatal(err) + } + if err := db.HandleSetMember(setMemberKey("k", []byte("m")), nil); err != nil { + t.Fatal(err) + } + if err := db.Finalize(); err != nil { + t.Fatal(err) + } + got := readSetJSON(t, filepath.Join(root, "redis", "db_0", "sets", "k.json")) + if setFloat(t, got, "expire_at_ms") != float64(fixedExpireMs) { + t.Fatalf("expire_at_ms = %v want %d — pendingTTL drain failed", got["expire_at_ms"], fixedExpireMs) + } +} + +// TestRedisDB_OrphanTTLCountsTrulyUnmatchedKeys pins the post-fix +// orphan semantics: a TTL for a key that NEVER appears in any +// typed record (store corruption or unknown type prefix) is +// counted at Finalize, not silently dropped on intake. +func TestRedisDB_OrphanTTLCountsTrulyUnmatchedKeys(t *testing.T) { + t.Parallel() + db, _ := newRedisDB(t) + var events []string + db.WithWarnSink(func(event string, _ ...any) { events = append(events, event) }) + if err := db.HandleTTL([]byte("orphan"), encodeTTLValue(fixedExpireMs)); err != nil { + t.Fatal(err) + } + if err := db.Finalize(); err != nil { + t.Fatal(err) + } + found := false + for _, e := range events { + if e == "redis_orphan_ttl" { + found = true + break + } + } + if !found { + t.Fatalf("expected redis_orphan_ttl warning, got %v", events) + } +} + // TestRedisDB_ZSetMaxInt64DeclaredLen pins the math.MaxInt64 // boundary — declaredLen=math.MaxInt64 must be accepted, only > that // rejected. From 5446190f782deeffca7e33784b88efb3d01b0e14 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Tue, 19 May 2026 18:52:27 +0900 Subject: [PATCH 03/14] =?UTF-8?q?backup(zset):=20PR790=20r2=20codex=20P1?= =?UTF-8?q?=20=E2=80=94=20handle=20legacy=20!redis|zset|=20single-blob=20l?= =?UTF-8?q?ayout?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex flagged that the wide-column zset encoder skips the legacy consolidated single-key blob layout the live store still writes. A zset stored only as `!redis|zset|` (with the magic- prefixed pb.RedisZSetValue body) is silently dropped from backup output and any inline TTL becomes an orphan — user-visible sorted-set data loss. Live-side references (adapter, not changed by this commit): - adapter/redis_compat_types.go:82 — redisZSetPrefix - adapter/redis_compat_commands.go:3495-3508 — writes the blob for non-empty persisted zset updates - adapter/redis_compat_helpers.go:610-631 — reads it as the fallback when no wide-column members exist Fix: new public RedisDB.HandleZSetLegacyBlob method that decodes the magic-prefixed pb.RedisZSetValue and registers the same per- member state HandleZSetMember would. The wide-column merge case (mid-migration snapshot containing BOTH layouts for the same user key) works because `!redis|zset|` sorts BEFORE `!zs|...` so the blob arrives first and wide-column rows then update / add members via the latest-wins map. Inline TTL: `!redis|zset|` sorts BEFORE `!redis|ttl|`, so HandleTTL after this handler sees redisKindZSet already and folds via the case-redisKindZSet branch. No pendingTTL detour needed for this ordering. Fail-closed contract (matches existing wide-column path): - Missing magic prefix → ErrRedisInvalidZSetLegacyBlob - Unmarshal error → ErrRedisInvalidZSetLegacyBlob - NaN score → ErrRedisInvalidZSetLegacyBlob (Redis ZADD rejects NaN at wire level) Caller audit (per /loop standing instruction): new public method HandleZSetLegacyBlob has no external callers. Verified via 'grep -rn HandleZSetLegacyBlob --include=*.go' — all matches inside the test file in this PR. The cmd/elastickv-snapshot-decode dispatcher (Phase 0a follow-up, not yet built) will route the `!redis|zset|` prefix to this handler. Parallel bug class: the SAME issue exists for `!redis|hash|`, `!redis|set|`, and `!redis|stream|` legacy blob prefixes. Those encoders shipped in earlier PRs (#725, #758, #791). Each needs its own legacy-blob handler in a follow-up PR; this commit fixes only the zset case codex flagged on PR #790. New tests: - TestRedisDB_ZSetLegacyBlobRoundTrip — basic round-trip - TestRedisDB_ZSetLegacyBlobThenWideColumnMerges — mid-migration - TestRedisDB_ZSetLegacyBlobWithInlineTTL — TTL ordering - TestRedisDB_ZSetLegacyBlobRejectsMissingMagic — fail-closed - TestRedisDB_ZSetLegacyBlobRejectsNaNScore — fail-closed - TestRedisDB_ZSetLegacyBlobRejectsMalformedKey — fail-closed Self-review: 1. Data loss — exact opposite: this commit RECOVERS zsets that were silently dropped. New fail-closed guards prevent silently importing a corrupt blob. 2. Concurrency — no new shared state; per-DB sequential as before. 3. Performance — one protobuf Unmarshal per legacy zset key (same as the live read path). Member map shares the same latest-wins behavior as wide-column intake. 4. Consistency — merge order (blob first, wide-column second) is determined by snapshot lex order; tested explicitly. 5. Coverage — 6 new tests. All 84 redis tests pass. --- internal/backup/redis_zset.go | 118 ++++++++++++++++++++ internal/backup/redis_zset_test.go | 167 +++++++++++++++++++++++++++++ 2 files changed, 285 insertions(+) diff --git a/internal/backup/redis_zset.go b/internal/backup/redis_zset.go index 8350fdbc..8b39e526 100644 --- a/internal/backup/redis_zset.go +++ b/internal/backup/redis_zset.go @@ -8,7 +8,9 @@ import ( "path/filepath" "sort" + pb "github.com/bootjp/elastickv/proto" cockroachdberr "github.com/cockroachdb/errors" + gproto "google.golang.org/protobuf/proto" ) // Redis zset encoder. Translates raw !zs|... snapshot records into the @@ -36,17 +38,48 @@ const ( RedisZSetScorePrefix = "!zs|scr|" RedisZSetMetaDeltaPrefix = "!zs|meta|d|" + // RedisZSetLegacyBlobPrefix is the consolidated single-key + // layout the live store still writes for non-empty persisted + // zsets (`adapter/redis_compat_types.go:82` redisZSetPrefix, + // produced by `adapter/redis_compat_commands.go:3495-3508` and + // read by `adapter/redis_compat_helpers.go:610-631` as the + // fallback when no wide-column members exist). A backup that + // skipped this prefix would silently drop legacy-only zsets; + // HandleZSetLegacyBlob decodes the blob and registers the same + // per-member state HandleZSetMeta + HandleZSetMember would. + // Codex P1 finding on PR #790 (round 2). + RedisZSetLegacyBlobPrefix = "!redis|zset|" + // redisZSetScoreSize is the size of the IEEE 754 big-endian score // stored in !zs|mem| values. Same constant as zsetMetaSizeBytes in // store/zset_helpers.go; duplicated here to keep the backup // package free of internal/storage imports. redisZSetScoreSize = 8 + + // redisZSetLegacyProtoPrefixLen is the on-disk magic prefix size + // for `!redis|zset|` values + // (`adapter/redis_storage_codec.go:15` storedRedisZSetProtoPrefix). + redisZSetLegacyProtoPrefixLen = 4 ) +// redisZSetLegacyProtoPrefix mirrors +// adapter/redis_storage_codec.go:15 storedRedisZSetProtoPrefix. A +// rename on the live side without an accompanying backup update +// surfaces as ErrRedisInvalidZSetLegacyBlob on decode of any real +// cluster dump. +var redisZSetLegacyProtoPrefix = []byte{0x00, 'R', 'Z', 0x01} + // ErrRedisInvalidZSetMeta is returned when an !zs|meta| value is not // the expected 8-byte big-endian member count. var ErrRedisInvalidZSetMeta = cockroachdberr.New("backup: invalid !zs|meta| value") +// ErrRedisInvalidZSetLegacyBlob is returned when a `!redis|zset|` +// value's magic prefix is missing, its protobuf body fails to +// unmarshal, or its decoded scores include NaN. Fail-closed for +// the same reason as ErrRedisInvalidZSetMember: silently accepting +// a corrupt blob would lose the entire zset's contents at restore. +var ErrRedisInvalidZSetLegacyBlob = cockroachdberr.New("backup: invalid !redis|zset| value") + // ErrRedisInvalidZSetMember is returned when an !zs|mem| value is not // the expected 8-byte IEEE 754 score, or contains a NaN score (Redis's // ZADD command rejects NaN, so a NaN at backup time indicates store @@ -153,6 +186,91 @@ func (r *RedisDB) HandleZSetScore(_, _ []byte) error { return nil } // source of truth at backup time. func (r *RedisDB) HandleZSetMetaDelta(_, _ []byte) error { return nil } +// HandleZSetLegacyBlob processes one `!redis|zset|` record. +// This is the consolidated single-key layout the live store still +// writes for non-empty persisted zsets (see RedisZSetLegacyBlobPrefix +// docstring). The encoded value is a magic-prefixed +// `pb.RedisZSetValue` carrying every (member, score) pair. +// +// Decoded entries land in the same per-key state HandleZSetMember +// would have produced, so the per-zset JSON output is identical +// regardless of which layout the live store used. NaN scores fail +// closed at intake, matching HandleZSetMember's contract. +// +// The legacy prefix `!redis|zset|` lex-sorts BEFORE `!zs|...` and +// BEFORE `!redis|ttl|`, so when a zset is stored in both formats +// (mid-migration), this handler creates the state first and the +// later wide-column records merge into it — duplicate +// HandleZSetMember calls follow the same latest-wins policy. +// +// `!redis|zset|` ALSO sorts BEFORE `!redis|ttl|`, so an inline TTL +// on the same user key will reach HandleTTL after this handler has +// already registered redisKindZSet. The HandleTTL redisKindZSet +// branch then folds the expiry into st.expireAtMs via zsetState +// (which itself drains pendingTTL — a no-op here since the typed +// record came first). +func (r *RedisDB) HandleZSetLegacyBlob(key, value []byte) error { + userKey, ok := parseZSetLegacyBlobKey(key) + if !ok { + return cockroachdberr.Wrapf(ErrRedisInvalidZSetLegacyBlob, "key: %q", key) + } + entries, err := decodeZSetLegacyBlobValue(value) + if err != nil { + return err + } + st := r.zsetState(userKey) + for _, e := range entries { + st.members[e.member] = e.score + } + return nil +} + +// zsetLegacyEntry is the per-(member, score) projection extracted +// from a `!redis|zset|` blob's protobuf body. +type zsetLegacyEntry struct { + member string + score float64 +} + +// parseZSetLegacyBlobKey strips `!redis|zset|` and returns the +// user-key bytes. Unlike the wide-column meta key there is no +// userKeyLen prefix — the live store appends the user key directly +// (`adapter/redis_compat_types.go:177` ZSetKey). +func parseZSetLegacyBlobKey(key []byte) ([]byte, bool) { + rest := bytes.TrimPrefix(key, []byte(RedisZSetLegacyBlobPrefix)) + if len(rest) == len(key) { + return nil, false + } + return rest, true +} + +// decodeZSetLegacyBlobValue strips the magic prefix and unmarshals +// the protobuf body into a slice of (member, score) entries. +// Rejects NaN scores (same fail-closed contract as +// HandleZSetMember). +func decodeZSetLegacyBlobValue(value []byte) ([]zsetLegacyEntry, error) { + if len(value) < redisZSetLegacyProtoPrefixLen || + !bytes.Equal(value[:redisZSetLegacyProtoPrefixLen], redisZSetLegacyProtoPrefix) { + return nil, cockroachdberr.Wrapf(ErrRedisInvalidZSetLegacyBlob, + "missing or corrupt magic prefix (len=%d)", len(value)) + } + msg := &pb.RedisZSetValue{} + if err := gproto.Unmarshal(value[redisZSetLegacyProtoPrefixLen:], msg); err != nil { + return nil, cockroachdberr.Wrapf(ErrRedisInvalidZSetLegacyBlob, + "unmarshal: %v", err) + } + out := make([]zsetLegacyEntry, 0, len(msg.GetEntries())) + for _, e := range msg.GetEntries() { + score := e.GetScore() + if math.IsNaN(score) { + return nil, cockroachdberr.Wrapf(ErrRedisInvalidZSetLegacyBlob, + "NaN score for member %q", e.GetMember()) + } + out = append(out, zsetLegacyEntry{member: e.GetMember(), score: score}) + } + return out, nil +} + // zsetState lazily creates per-key state. Mirrors the hash/list/set // kindByKey-registration pattern so HandleZSetMeta, HandleZSetMember, // and the HandleTTL back-edge all agree on the kind. diff --git a/internal/backup/redis_zset_test.go b/internal/backup/redis_zset_test.go index ecfa7563..dca5a9cd 100644 --- a/internal/backup/redis_zset_test.go +++ b/internal/backup/redis_zset_test.go @@ -8,9 +8,41 @@ import ( "path/filepath" "testing" + pb "github.com/bootjp/elastickv/proto" "github.com/cockroachdb/errors" + gproto "google.golang.org/protobuf/proto" ) +// encodeZSetLegacyBlobValue produces the magic-prefixed protobuf +// wire format the live store writes for `!redis|zset|` +// values (mirror of adapter/redis_storage_codec.go::marshalZSetValue). +func encodeZSetLegacyBlobValue(t *testing.T, entries []zsetLegacyEntry) []byte { + t.Helper() + msg := &pb.RedisZSetValue{} + for _, e := range entries { + msg.Entries = append(msg.Entries, &pb.RedisZSetEntry{ + Member: e.member, + Score: e.score, + }) + } + body, err := gproto.Marshal(msg) + if err != nil { + t.Fatalf("marshal pb.RedisZSetValue: %v", err) + } + out := make([]byte, 0, redisZSetLegacyProtoPrefixLen+len(body)) + out = append(out, redisZSetLegacyProtoPrefix...) + out = append(out, body...) + return out +} + +// zsetLegacyBlobKey is the test-side mirror of +// adapter/redis_compat_types.go:177 ZSetKey: +// `!redis|zset|` (no userKeyLen prefix). +func zsetLegacyBlobKey(userKey string) []byte { + out := []byte(RedisZSetLegacyBlobPrefix) + return append(out, userKey...) +} + // encodeZSetMetaValue builds the 8-byte BE member-count value used by // the live store/zset_helpers.go (mirror of store.MarshalZSetMeta). func encodeZSetMetaValue(memberCount int64) []byte { @@ -608,3 +640,138 @@ func TestRedisDB_ZSetMaxInt64DeclaredLen(t *testing.T) { t.Fatalf("math.MaxInt64 boundary must be accepted, got %v", err) } } + +// TestRedisDB_ZSetLegacyBlobRoundTrip pins the codex P1 fix: a +// zset stored only via the legacy `!redis|zset|` blob +// must surface in the dump with all its members. Without +// HandleZSetLegacyBlob, the encoder would skip the record and +// produce an empty zsets/ output for that key — silent backup +// data loss. +func TestRedisDB_ZSetLegacyBlobRoundTrip(t *testing.T) { + t.Parallel() + db, root := newRedisDB(t) + value := encodeZSetLegacyBlobValue(t, []zsetLegacyEntry{ + {member: "alice", score: 100}, + {member: "bob", score: 50}, + {member: "charlie", score: 30}, + }) + if err := db.HandleZSetLegacyBlob(zsetLegacyBlobKey("leaderboard"), value); err != nil { + t.Fatalf("HandleZSetLegacyBlob: %v", err) + } + if err := db.Finalize(); err != nil { + t.Fatal(err) + } + got := readZSetJSON(t, filepath.Join(root, "redis", "db_0", "zsets", "leaderboard.json")) + // Members sorted by member-name bytes (matches the wide-column + // encoder's output policy). + assertZSetMembersEqual(t, zsetMembersArray(t, got), []zsetMemberEntry{ + {member: "alice", scoreNum: 100, scoreKind: "number"}, + {member: "bob", scoreNum: 50, scoreKind: "number"}, + {member: "charlie", scoreNum: 30, scoreKind: "number"}, + }) +} + +// TestRedisDB_ZSetLegacyBlobThenWideColumnMerges pins the mid- +// migration shape: when a snapshot carries BOTH the legacy +// `!redis|zset|` blob AND wide-column `!zs|mem|...` rows for +// the same user key (which the live store does during the +// migration window), the encoder must produce a single merged +// zset. `!redis|zset|` lex-sorts before `!zs|...` so the blob +// arrives first; the wide-column rows then update / add members. +func TestRedisDB_ZSetLegacyBlobThenWideColumnMerges(t *testing.T) { + t.Parallel() + db, root := newRedisDB(t) + legacy := encodeZSetLegacyBlobValue(t, []zsetLegacyEntry{ + {member: "alice", score: 1}, + {member: "bob", score: 2}, + }) + if err := db.HandleZSetLegacyBlob(zsetLegacyBlobKey("k"), legacy); err != nil { + t.Fatal(err) + } + // Wide-column rows: update alice's score, add charlie. + if err := db.HandleZSetMember(zsetMemberKey("k", []byte("alice")), encodeZSetScoreValue(99)); err != nil { + t.Fatal(err) + } + if err := db.HandleZSetMember(zsetMemberKey("k", []byte("charlie")), encodeZSetScoreValue(3)); err != nil { + t.Fatal(err) + } + if err := db.Finalize(); err != nil { + t.Fatal(err) + } + got := readZSetJSON(t, filepath.Join(root, "redis", "db_0", "zsets", "k.json")) + assertZSetMembersEqual(t, zsetMembersArray(t, got), []zsetMemberEntry{ + {member: "alice", scoreNum: 99, scoreKind: "number"}, // wide-column won + {member: "bob", scoreNum: 2, scoreKind: "number"}, // legacy survived + {member: "charlie", scoreNum: 3, scoreKind: "number"}, + }) +} + +// TestRedisDB_ZSetLegacyBlobWithInlineTTL pins that a TTL'd zset +// stored only via the legacy blob round-trips its expiry. The +// snapshot order is `!redis|zset|` (sorts before `!redis|ttl|`), +// so HandleZSetLegacyBlob runs first and registers redisKindZSet, +// then HandleTTL routes via the redisKindZSet branch (no +// pendingTTL detour needed for this ordering). +func TestRedisDB_ZSetLegacyBlobWithInlineTTL(t *testing.T) { + t.Parallel() + db, root := newRedisDB(t) + value := encodeZSetLegacyBlobValue(t, []zsetLegacyEntry{{member: "m", score: 1}}) + if err := db.HandleZSetLegacyBlob(zsetLegacyBlobKey("k"), value); err != nil { + t.Fatal(err) + } + if err := db.HandleTTL([]byte("k"), encodeTTLValue(fixedExpireMs)); err != nil { + t.Fatal(err) + } + if err := db.Finalize(); err != nil { + t.Fatal(err) + } + got := readZSetJSON(t, filepath.Join(root, "redis", "db_0", "zsets", "k.json")) + if zsetFloat(t, got, "expire_at_ms") != float64(fixedExpireMs) { + t.Fatalf("expire_at_ms = %v want %d", got["expire_at_ms"], fixedExpireMs) + } +} + +// TestRedisDB_ZSetLegacyBlobRejectsMissingMagic pins fail-closed +// behaviour: a `!redis|zset|` value without the magic prefix +// fails at intake rather than silently decoding garbage protobuf. +func TestRedisDB_ZSetLegacyBlobRejectsMissingMagic(t *testing.T) { + t.Parallel() + db, _ := newRedisDB(t) + body, err := gproto.Marshal(&pb.RedisZSetValue{}) + if err != nil { + t.Fatalf("marshal: %v", err) + } + err = db.HandleZSetLegacyBlob(zsetLegacyBlobKey("k"), body) // no magic prefix + if !errors.Is(err, ErrRedisInvalidZSetLegacyBlob) { + t.Fatalf("err=%v want ErrRedisInvalidZSetLegacyBlob", err) + } +} + +// TestRedisDB_ZSetLegacyBlobRejectsNaNScore pins NaN-fail-closed +// parallel to HandleZSetMember's contract. Redis ZADD rejects NaN +// at the wire level, so a NaN in storage indicates corruption. +func TestRedisDB_ZSetLegacyBlobRejectsNaNScore(t *testing.T) { + t.Parallel() + db, _ := newRedisDB(t) + value := encodeZSetLegacyBlobValue(t, []zsetLegacyEntry{{member: "m", score: math.NaN()}}) + err := db.HandleZSetLegacyBlob(zsetLegacyBlobKey("k"), value) + if !errors.Is(err, ErrRedisInvalidZSetLegacyBlob) { + t.Fatalf("err=%v want ErrRedisInvalidZSetLegacyBlob", err) + } +} + +// TestRedisDB_ZSetLegacyBlobRejectsMalformedKey pins that a +// `!redis|zset|` key with no trailing user-key bytes fails parse. +func TestRedisDB_ZSetLegacyBlobRejectsMalformedKey(t *testing.T) { + t.Parallel() + db, _ := newRedisDB(t) + value := encodeZSetLegacyBlobValue(t, []zsetLegacyEntry{{member: "m", score: 1}}) + // Key has the prefix but no trailing user-key bytes — parser must + // still accept it (empty user key is technically valid Redis). + // Use a key that doesn't have the prefix to trigger the parse + // failure. + err := db.HandleZSetLegacyBlob([]byte("not-the-right-prefix|k"), value) + if !errors.Is(err, ErrRedisInvalidZSetLegacyBlob) { + t.Fatalf("err=%v want ErrRedisInvalidZSetLegacyBlob", err) + } +} From ab88f3e0967131716665212aa2bc0dc5e92972b9 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Tue, 19 May 2026 19:01:35 +0900 Subject: [PATCH 04/14] =?UTF-8?q?backup(zset):=20PR790=20r3=20codex=20P1?= =?UTF-8?q?=20=E2=80=94=20wide-column=20rows=20supersede=20legacy=20blob?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex flagged that the r2 fix (legacy-blob handler in #790 #2) mis-handles the mid-migration / stale-leftover case: when a snapshot contains BOTH `!redis|zset|` and `!zs|mem|...` for the same user key, my code merged both, surfacing members that are no longer visible to readers. Live read path (adapter/redis_compat_helpers.go:610-620 RedisServer.loadZSetAt): when ANY wide-column row exists, loadZSetMembersAt is used and the legacy blob is IGNORED. A stale `!redis|zset|` blob left over after a partial migration would be invisible to clients but my encoder was surfacing its entries in the dump — silent backup corruption. Fix: add a sawWide flag to redisZSetState that flips on every wide-column observation (HandleZSetMeta + HandleZSetMember). markZSetWide() clears any legacy entries previously deposited by HandleZSetLegacyBlob, and HandleZSetLegacyBlob short-circuits if sawWide is already set. The result matches the live read path's source-of-truth selection: - No wide rows → legacy blob is the source of truth. - Any wide row exists → wide rows are authoritative; legacy entries are ignored (whether they arrived before or after the wide row). Caller audit (per /loop standing instruction): the new markZSetWide method is called only from HandleZSetMeta line 160 and HandleZSetMember line 194. HandleZSetLegacyBlob (line 268) short-circuits on the same flag. No other call sites — verified via 'grep -rn markZSetWide|sawWide internal/backup/'. The semantic change (legacy entries cleared on first wide observation) is contained to redisZSetState; no public API or external caller visible. Test changes: - TestRedisDB_ZSetLegacyBlobThenWideColumnMerges renamed to TestRedisDB_ZSetWideColumnSupersedesLegacyBlob, expectations updated (bob-stale no longer survives). - New TestRedisDB_ZSetMetaAloneEvictsLegacyBlob — meta-only wide-column row evicts legacy blob. - New TestRedisDB_ZSetLegacyBlobAfterWideRowsIsDropped — reverse- order case (wide first, blob later). Self-review: 1. Data loss — opposite: this fix PREVENTS surfacing stale members that the live read path hides. Legacy blob is still applied when no wide rows exist (preserves the original P1 round 2 fix for legacy-only zsets). 2. Concurrency — no new shared state. 3. Performance — sawWide is a single bool per state; markZSetWide short-circuits on subsequent calls so the clear-loop runs at most once per state. 4. Consistency — encoder output now matches `loadZSetAt`'s read- side source-of-truth selection. Tested with both orderings (legacy-first-wide-later, wide-first-legacy-later, meta-alone). 5. Coverage — 1 renamed test + 2 new tests. All 90 redis tests pass. --- internal/backup/redis_zset.go | 55 ++++++++++++++++++ internal/backup/redis_zset_test.go | 90 ++++++++++++++++++++++++++---- 2 files changed, 134 insertions(+), 11 deletions(-) diff --git a/internal/backup/redis_zset.go b/internal/backup/redis_zset.go index 8b39e526..fbdfb278 100644 --- a/internal/backup/redis_zset.go +++ b/internal/backup/redis_zset.go @@ -96,10 +96,22 @@ var ErrRedisInvalidZSetKey = cockroachdberr.New("backup: malformed !zs| key") // HandleZSetMember calls collapse to the latest-wins score (matching // Redis's ZADD semantics: re-adding a member overwrites the prior // score). +// +// sawWide tracks whether ANY wide-column row (`!zs|meta|` or +// `!zs|mem|`) has been observed for the user key. This mirrors the +// live read path's source-of-truth selection in +// `adapter/redis_compat_helpers.go:610-620` +// (`RedisServer.loadZSetAt`): when wide-column rows exist they are +// authoritative, and any `!redis|zset|` legacy blob is ignored +// (treated as a stale post-migration leftover). Without this flag, +// the encoder would merge stale legacy members on top of the +// wide-column source-of-truth, surfacing deleted or outdated +// entries in the restored zset. Codex P1 round 3 (PR #790). type redisZSetState struct { metaSeen bool declaredLen int64 members map[string]float64 + sawWide bool expireAtMs uint64 hasTTL bool } @@ -141,6 +153,11 @@ func (r *RedisDB) HandleZSetMeta(key, value []byte) error { st := r.zsetState(userKey) st.declaredLen = int64(rawLen) //nolint:gosec // bounds-checked above st.metaSeen = true + // Wide-column meta means any legacy blob already merged into + // this state is stale; drop it. The live read path + // (adapter/redis_compat_helpers.go:610-620) makes the same + // choice on read. + r.markZSetWide(st) return nil } @@ -169,10 +186,39 @@ func (r *RedisDB) HandleZSetMember(key, value []byte) error { "NaN score for member of user key %q (length %d)", userKey, len(userKey)) } st := r.zsetState(userKey) + // First wide-column row evicts any provisional legacy-blob + // members on this user key. From here on, the wide-column + // rows are the authoritative source of truth — matches the + // live read path's preference in + // adapter/redis_compat_helpers.go:610-620. + r.markZSetWide(st) st.members[string(member)] = score return nil } +// markZSetWide flips the sawWide flag and, if this is the first +// wide-column observation for the state, clears any provisional +// legacy-blob members. The live read path treats `!redis|zset|` +// as a fallback that is ONLY consulted when no wide-column rows +// exist (`adapter/redis_compat_helpers.go::RedisServer.loadZSetAt` +// line 610-620), so a snapshot containing both layouts for the +// same user key MUST drop the legacy entries to avoid surfacing +// stale post-migration leftovers. Codex P1 round 3 (PR #790). +func (r *RedisDB) markZSetWide(st *redisZSetState) { + if st.sawWide { + return + } + st.sawWide = true + // Clear any legacy-blob members deposited before the first + // wide-column row arrived. Re-using the same map keeps the + // nil-vs-empty contract for empty-but-meta-seen zsets (an + // explicit `make` would change observable Finalize behavior + // for those edge cases). + for k := range st.members { + delete(st.members, k) + } +} + // HandleZSetScore accepts and discards one !zs|scr|... record. The // score index is a live-side secondary index for ZRANGEBYSCORE; the // authoritative member→score mapping lives in !zs|mem|. Snapshot @@ -219,6 +265,15 @@ func (r *RedisDB) HandleZSetLegacyBlob(key, value []byte) error { return err } st := r.zsetState(userKey) + if st.sawWide { + // Wide-column rows already arrived for this user key (rare + // in practice because !redis|zset| sorts before !zs|..., + // but possible with a custom dispatcher ordering or a mid- + // migration replay). The live read path ignores the legacy + // blob in that case, so we do too — applying these entries + // would surface stale post-migration leftovers in the dump. + return nil + } for _, e := range entries { st.members[e.member] = e.score } diff --git a/internal/backup/redis_zset_test.go b/internal/backup/redis_zset_test.go index dca5a9cd..4cd76476 100644 --- a/internal/backup/redis_zset_test.go +++ b/internal/backup/redis_zset_test.go @@ -671,24 +671,35 @@ func TestRedisDB_ZSetLegacyBlobRoundTrip(t *testing.T) { }) } -// TestRedisDB_ZSetLegacyBlobThenWideColumnMerges pins the mid- -// migration shape: when a snapshot carries BOTH the legacy +// TestRedisDB_ZSetWideColumnSupersedesLegacyBlob pins the codex +// P1 round 3 fix: when a snapshot carries BOTH the legacy // `!redis|zset|` blob AND wide-column `!zs|mem|...` rows for -// the same user key (which the live store does during the -// migration window), the encoder must produce a single merged -// zset. `!redis|zset|` lex-sorts before `!zs|...` so the blob -// arrives first; the wide-column rows then update / add members. -func TestRedisDB_ZSetLegacyBlobThenWideColumnMerges(t *testing.T) { +// the same user key (a mid-migration state, or a stale post- +// migration leftover), the encoder MUST drop the legacy entries +// and use only the wide-column source-of-truth. This matches the +// live read path in adapter/redis_compat_helpers.go:610-620 +// (RedisServer.loadZSetAt) which falls back to the legacy blob +// ONLY when no wide-column rows exist. +// +// Without this fix, the dump would surface deleted or outdated +// members from a stale legacy blob — silent backup corruption. +func TestRedisDB_ZSetWideColumnSupersedesLegacyBlob(t *testing.T) { t.Parallel() db, root := newRedisDB(t) + // Snapshot order: legacy blob arrives first (sorts before !zs|). + // "bob-stale" exists ONLY in the legacy blob — a stale + // post-migration leftover that the live store hides via the + // loadZSetAt preference for wide-column rows. legacy := encodeZSetLegacyBlobValue(t, []zsetLegacyEntry{ {member: "alice", score: 1}, - {member: "bob", score: 2}, + {member: "bob-stale", score: 2}, }) if err := db.HandleZSetLegacyBlob(zsetLegacyBlobKey("k"), legacy); err != nil { t.Fatal(err) } - // Wide-column rows: update alice's score, add charlie. + // Wide-column source-of-truth: alice with a new score, charlie new. + // "bob-stale" is intentionally absent — its presence in the legacy + // blob is the stale-leftover scenario this test guards against. if err := db.HandleZSetMember(zsetMemberKey("k", []byte("alice")), encodeZSetScoreValue(99)); err != nil { t.Fatal(err) } @@ -699,13 +710,70 @@ func TestRedisDB_ZSetLegacyBlobThenWideColumnMerges(t *testing.T) { t.Fatal(err) } got := readZSetJSON(t, filepath.Join(root, "redis", "db_0", "zsets", "k.json")) + // "bob-stale" MUST NOT appear — wide-column rows supersede the legacy blob. assertZSetMembersEqual(t, zsetMembersArray(t, got), []zsetMemberEntry{ - {member: "alice", scoreNum: 99, scoreKind: "number"}, // wide-column won - {member: "bob", scoreNum: 2, scoreKind: "number"}, // legacy survived + {member: "alice", scoreNum: 99, scoreKind: "number"}, {member: "charlie", scoreNum: 3, scoreKind: "number"}, }) } +// TestRedisDB_ZSetMetaAloneEvictsLegacyBlob pins that a `!zs|meta|` +// record (even without any `!zs|mem|` rows) also triggers the +// legacy-supersedence rule. The live read path's loadZSetAt +// preference is based on whether ANY wide-column key exists for +// the user key — meta-only is rare in practice but the encoder +// stays consistent with the read-side contract. +func TestRedisDB_ZSetMetaAloneEvictsLegacyBlob(t *testing.T) { + t.Parallel() + db, root := newRedisDB(t) + legacy := encodeZSetLegacyBlobValue(t, []zsetLegacyEntry{ + {member: "stale", score: 1}, + }) + if err := db.HandleZSetLegacyBlob(zsetLegacyBlobKey("k"), legacy); err != nil { + t.Fatal(err) + } + // Wide-column meta record declares zero members. + if err := db.HandleZSetMeta(zsetMetaKey("k"), encodeZSetMetaValue(0)); err != nil { + t.Fatal(err) + } + if err := db.Finalize(); err != nil { + t.Fatal(err) + } + got := readZSetJSON(t, filepath.Join(root, "redis", "db_0", "zsets", "k.json")) + if members := zsetMembersArray(t, got); len(members) != 0 { + t.Fatalf("meta-only wide-column row MUST evict legacy blob, got members=%v", members) + } +} + +// TestRedisDB_ZSetLegacyBlobAfterWideRowsIsDropped pins the +// reverse-order edge case: if a snapshot somehow emits a +// `!redis|zset|` AFTER `!zs|mem|` rows (custom dispatcher +// ordering, or a replay), the legacy blob is dropped at intake +// rather than overwriting the established wide-column state. +func TestRedisDB_ZSetLegacyBlobAfterWideRowsIsDropped(t *testing.T) { + t.Parallel() + db, root := newRedisDB(t) + // Wide-column row first — registers sawWide. + if err := db.HandleZSetMember(zsetMemberKey("k", []byte("alice")), encodeZSetScoreValue(99)); err != nil { + t.Fatal(err) + } + // Legacy blob arrives later — must be ignored. + legacy := encodeZSetLegacyBlobValue(t, []zsetLegacyEntry{ + {member: "alice", score: 1}, // would overwrite if not gated + {member: "stale", score: 2}, // would re-add if not gated + }) + if err := db.HandleZSetLegacyBlob(zsetLegacyBlobKey("k"), legacy); err != nil { + t.Fatal(err) + } + if err := db.Finalize(); err != nil { + t.Fatal(err) + } + got := readZSetJSON(t, filepath.Join(root, "redis", "db_0", "zsets", "k.json")) + assertZSetMembersEqual(t, zsetMembersArray(t, got), []zsetMemberEntry{ + {member: "alice", scoreNum: 99, scoreKind: "number"}, // unchanged + }) +} + // TestRedisDB_ZSetLegacyBlobWithInlineTTL pins that a TTL'd zset // stored only via the legacy blob round-trips its expiry. The // snapshot order is `!redis|zset|` (sorts before `!redis|ttl|`), From 174456aed15094b6702cece0e6f24bbb670be12c Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Wed, 20 May 2026 04:46:43 +0900 Subject: [PATCH 05/14] =?UTF-8?q?backup(zset):=20PR790=20r4=20codex=20P1?= =?UTF-8?q?=20=E2=80=94=20bound=20pendingTTL=20to=20prevent=20OOM?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex flagged the same P1 on PR #791 round 2 — pendingTTL grows unboundedly when no zset handler is present to drain the parked TTLs. The same bug exists on PR #790's branch (the two PRs share the pendingTTL infrastructure; the codex finding's text applies verbatim). Fix: cap pendingTTL at pendingTTLCap entries (default 1 MiB). Once the cap is reached, subsequent unknown-kind TTLs fall back to immediate-orphan counting via the new parkUnknownTTL helper. New API: - RedisDB.WithPendingTTLCap(int) chainable setter. - pendingTTLOverflow counter surfaced in the orphan-TTL warning. Caller audit (per /loop standing instruction): - HandleTTL's redisKindUnknown branch behavior changes: previously ALWAYS buffered, now buffers up to cap then orphan-counts. Other branches unchanged. - Callers of HandleTTL: tests only. The cmd/elastickv-snapshot-decode driver (Phase 0a follow-up) will be the first production caller. - New helper parkUnknownTTL is package-private with one call site (HandleTTL). No prior call sites to audit. - Verified via `grep -rn HandleTTL|pendingTTLCap|WithPendingTTLCap --include=*.go internal/backup/`. New tests: - TestRedisDB_PendingTTLBoundedByCap. - TestRedisDB_WithPendingTTLCapZeroDisablesBuffering. - TestRedisDB_WithPendingTTLCapNegativeCoercedToZero. Self-review: 1. Data loss - cap can mis-classify TTLs in the overflow window that COULD have been drained by a later state-init. Mitigation: default cap (1 MiB) is well above the count of legitimately- buffered wide-column TTL'd keys on real clusters; operators can tune via WithPendingTTLCap. 2. Concurrency - no new shared state. 3. Performance - one extra comparison per unknown-kind TTL. 4. Consistency - mirrors the fix landing on PR #791 r3; when either PR merges first the other can rebase cleanly. 5. Coverage - 3 new tests + the existing TestRedisDB_OrphanTTLCountedNotBuffered (still pins the default-cap buffered + drained-at-Finalize semantics). --- internal/backup/redis_string.go | 89 +++++++++++++++++++++++----- internal/backup/redis_string_test.go | 65 ++++++++++++++++++++ 2 files changed, 140 insertions(+), 14 deletions(-) diff --git a/internal/backup/redis_string.go b/internal/backup/redis_string.go index 76040613..cd000e65 100644 --- a/internal/backup/redis_string.go +++ b/internal/backup/redis_string.go @@ -214,8 +214,32 @@ type RedisDB struct { // orphan-TTL warning for whatever remains (those keys never // appeared as a typed record — likely a corrupted store). pendingTTL map[string]uint64 + + // pendingTTLCap caps pendingTTL's in-memory size. Once the map + // reaches this many entries, subsequent unknown-kind TTLs fall + // back to incrementing orphanTTLCount directly without + // buffering the user-key bytes. Without this cap, an + // adversarial or corrupt snapshot whose `!redis|ttl|` records + // never find a typed-record claimer would grow pendingTTL + // unboundedly and could OOM the decoder. Codex P1 finding + // surfaced on PR #791 round 2; the same fix is applied here on + // PR #790 because the two PRs share the pendingTTL + // infrastructure and the bug is identical. + pendingTTLCap int + + // pendingTTLOverflow counts entries skipped because the + // pendingTTL buffer was at cap. Surfaced in the Finalize + // warning so operators can distinguish "snapshot exceeded the + // buffer cap" from "TTL records remained unmatched". + pendingTTLOverflow int } +// defaultPendingTTLCap caps pendingTTL at 1 MiB entries by default +// (~64 MiB worst-case memory at ~64 B per Go map entry). Override +// via WithPendingTTLCap for hosts that need a different memory / +// coverage trade-off. +const defaultPendingTTLCap = 1 << 20 + // NewRedisDB constructs a RedisDB rooted at /redis/db_/. // dbIndex selects ; today the producer always passes 0, but accepting // the index as a parameter prevents a future multi-db dump from silently @@ -235,7 +259,21 @@ func NewRedisDB(outRoot string, dbIndex int) *RedisDB { sets: make(map[string]*redisSetState), zsets: make(map[string]*redisZSetState), pendingTTL: make(map[string]uint64), + pendingTTLCap: defaultPendingTTLCap, + } +} + +// WithPendingTTLCap overrides the default cap on the pendingTTL +// buffer. A value of 0 disables buffering — every unknown-kind TTL +// becomes an immediate orphan (matches the pre-pendingTTL behavior). +// Negative inputs are coerced to 0. Returns the receiver so it can +// be chained with other With* setters. +func (r *RedisDB) WithPendingTTLCap(capacity int) *RedisDB { + if capacity < 0 { + capacity = 0 } + r.pendingTTLCap = capacity + return r } // WithWarnSink wires a structured-warning sink. The sink is called with @@ -353,23 +391,39 @@ func (r *RedisDB) HandleTTL(userKey, value []byte) error { st.hasTTL = true return nil case redisKindUnknown: - // Park the expiry until a wide-column Handle*Meta / - // Handle*Member / Handle*Entry registers the user key. - // Without this buffering, sorted-set / set / stream TTLs - // would be counted as orphans and dropped because their - // type prefixes lex-sort AFTER `!redis|ttl|` in the - // snapshot's encoded-key order. Codex P1 (PR #790). - // - // We store userKey as a string copy (`string([]byte)` - // allocates) rather than the alias slice — the snapshot - // reader reuses key buffers across iterations, so a slice - // alias would race with the next record. - r.pendingTTL[string(userKey)] = expireAtMs + r.parkUnknownTTL(userKey, expireAtMs) return nil } return nil } +// parkUnknownTTL buffers a redisKindUnknown TTL into pendingTTL, or +// falls back to the immediate orphan-count path when the buffer is +// at cap. Extracted from HandleTTL's switch so the parent stays +// under the cyclop budget. +// +// Rationale (codex P1 on PR #791 round 2; same fix mirrored here): +// Pebble snapshots emit records in encoded-key order, and +// `!redis|ttl|` lex-sorts BEFORE every wide-column prefix where the +// type letter is >= 's' (set, stream, zset). The buffer lets us +// drain the parked expiry when the typed record finally arrives; +// the cap prevents an adversarial snapshot whose TTLs never find a +// claimer from OOM'ing the decoder. Storage: userKey is COPIED +// (`string([]byte)` allocates) because the snapshot reader reuses +// key buffers across iterations — an alias slice would race with +// the next record. +func (r *RedisDB) parkUnknownTTL(userKey []byte, expireAtMs uint64) { + if len(r.pendingTTL) >= r.pendingTTLCap { + // Cap reached: fall back to the immediate-orphan path so + // memory stays bounded. Already-buffered entries can still + // be drained by later state-inits. + r.pendingTTLOverflow++ + r.orphanTTLCount++ + return + } + r.pendingTTL[string(userKey)] = expireAtMs +} + // claimPendingTTL drains any buffered TTL for userKey into the // caller-provided state. Called by the wide-column state-init // functions (setState / zsetState / streamState) when they first @@ -424,9 +478,16 @@ func (r *RedisDB) Finalize() error { // type added on the live side without a backup-encoder update). r.orphanTTLCount += len(r.pendingTTL) if r.warn != nil && r.orphanTTLCount > 0 { - r.warn("redis_orphan_ttl", + fields := []any{ "count", r.orphanTTLCount, - "hint", "TTL records whose user key never appeared in a typed record — possible store corruption or an unknown type prefix") + "hint", "TTL records whose user key never appeared in a typed record — possible store corruption or an unknown type prefix", + } + if r.pendingTTLOverflow > 0 { + fields = append(fields, + "pending_ttl_buffer_overflow", r.pendingTTLOverflow, + "pending_ttl_buffer_cap", r.pendingTTLCap) + } + r.warn("redis_orphan_ttl", fields...) } return firstErr } diff --git a/internal/backup/redis_string_test.go b/internal/backup/redis_string_test.go index 5343d3b5..104a74eb 100644 --- a/internal/backup/redis_string_test.go +++ b/internal/backup/redis_string_test.go @@ -617,6 +617,71 @@ func TestRedisDB_OrphanTTLCountedNotBuffered(t *testing.T) { } } +// TestRedisDB_PendingTTLBoundedByCap pins the codex P1 fix on PR +// #791 round 2 (mirrored here on PR #790 round 4): pendingTTL must +// NOT grow unboundedly. Once the cap is reached, subsequent +// unknown-kind TTLs fall back to immediate-orphan counting without +// buffering the user-key bytes. +func TestRedisDB_PendingTTLBoundedByCap(t *testing.T) { + t.Parallel() + const cap = 8 + db, _ := newRedisDB(t) + db.WithPendingTTLCap(cap) + for i := 0; i < cap*2; i++ { + key := []byte("orphan-" + intToDecimal(i)) + ms := uint64(i) + 1 //nolint:gosec // i bounded above + if err := db.HandleTTL(key, encodeTTLValue(ms)); err != nil { + t.Fatalf("HandleTTL[%d]: %v", i, err) + } + } + if got := len(db.pendingTTL); got != cap { + t.Fatalf("pendingTTL len = %d, want %d (cap)", got, cap) + } + if got := db.orphanTTLCount; got != cap { + t.Fatalf("orphanTTLCount = %d at intake, want %d (overflow path)", got, cap) + } + if got := db.pendingTTLOverflow; got != cap { + t.Fatalf("pendingTTLOverflow = %d, want %d", got, cap) + } + if err := db.Finalize(); err != nil { + t.Fatalf("Finalize: %v", err) + } + if got := db.orphanTTLCount; got != cap*2 { + t.Fatalf("orphanTTLCount = %d after Finalize, want %d", got, cap*2) + } +} + +// TestRedisDB_WithPendingTTLCapZeroDisablesBuffering pins that +// cap==0 reverts to the original immediate-orphan path. +func TestRedisDB_WithPendingTTLCapZeroDisablesBuffering(t *testing.T) { + t.Parallel() + db, _ := newRedisDB(t) + db.WithPendingTTLCap(0) + const n = 5 + for i := 0; i < n; i++ { + if err := db.HandleTTL([]byte("k"+intToDecimal(i)), encodeTTLValue(1)); err != nil { + t.Fatal(err) + } + } + if got := len(db.pendingTTL); got != 0 { + t.Fatalf("pendingTTL len = %d, want 0 (cap disabled)", got) + } + if got := db.orphanTTLCount; got != n { + t.Fatalf("orphanTTLCount = %d, want %d", got, n) + } +} + +// TestRedisDB_WithPendingTTLCapNegativeCoercedToZero pins input +// sanitisation. +func TestRedisDB_WithPendingTTLCapNegativeCoercedToZero(t *testing.T) { + t.Parallel() + db, _ := newRedisDB(t) + db.WithPendingTTLCap(-100) + if db.pendingTTLCap != 0 { + t.Fatalf("pendingTTLCap = %d after WithPendingTTLCap(-100), want 0", db.pendingTTLCap) + } +} + func TestRedisDB_DirsCreatedCachesMkdirAll(t *testing.T) { t.Parallel() // Two HandleString calls in a row should populate dirsCreated From 800cc8c16df7225472811f27645b49f3f57b7851 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Wed, 20 May 2026 04:55:31 +0900 Subject: [PATCH 06/14] =?UTF-8?q?backup(zset):=20PR790=20r5=20codex=20P1?= =?UTF-8?q?=20=E2=80=94=20fail=20closed=20on=20pendingTTL=20overflow?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex P1 on round 4: the r4 fix capped pendingTTL but silently counted overflow entries as orphans. That's a data-loss path on legitimate large datasets — in real Pebble snapshot order (`!redis|ttl|` < `!st|`/`!stream|`/`!zs|`), an overflowed TTL likely belongs to a wide-column key arriving later in the scan, and dropping it permanently loses `expire_at_ms`. Restored data ends up non-expiring without the operator noticing — only a warning gets logged. Fix: replace the silent-orphan fallback with fail-closed semantics. Three modes determined by pendingTTLCap: - cap > 0 and buffer NOT full: buffer the entry as before. - cap == 0: counter-only mode (operator explicit opt-out; orphan-count without buffering). - cap > 0 and buffer FULL: fail closed with new sentinel ErrPendingTTLBufferFull. Operator must raise the cap via WithPendingTTLCap or investigate the snapshot for corruption. Caller audit (per /loop standing instruction): - HandleTTL's redisKindUnknown branch now CAN return non-nil error in a new condition. Existing callers (all in test files) already check the error. Verified via `grep -rn '\.HandleTTL(' --include=*.go`: every call site has `if err := db.HandleTTL(...); err != nil { ... }`. No silent swallow. - The cmd/elastickv-snapshot-decode driver (Phase 0a follow-up) will see the new error class and can either bail or call WithPendingTTLCap before retrying. - Tests TestRedisDB_PendingTTLFailsClosedAtCap (renamed from TestRedisDB_PendingTTLBoundedByCap) and TestRedisDB_WithPendingTTLCapZeroOpts (renamed from TestRedisDB_WithPendingTTLCapZeroDisablesBuffering) pin the new semantics. The existing TestRedisDB_OrphanTTLCountedNotBuffered drives 10,000 TTLs with default cap=1M, well under the cap so no behavior change. - parkUnknownTTL signature changed from `func(...)` to `func(...) error`. Single caller (HandleTTL) updated. Self-review: 1. Data loss - opposite of the original concern: fail-closed prevents silent TTL drops. Operators who explicitly accept the lossy path opt in via WithPendingTTLCap(0). 2. Concurrency - no new shared state; orphanTTLCount unchanged for the cap=0 counter mode. 3. Performance - one comparison per unknown-kind TTL. Failed- closed path is a clean Go error return; no allocation amplification. 4. Consistency - matches the "fail closed" pattern called out in CLAUDE.md (replication/HLC/snapshot reads). Error type makes the failure mode legible to the caller. 5. Coverage - 2 renamed tests pin the new semantics. The error path is exercised at the cap+1 record; the counter-only mode stays lossy by explicit opt-in. --- internal/backup/redis_string.go | 75 ++++++++++++++++++++-------- internal/backup/redis_string_test.go | 50 +++++++++++-------- 2 files changed, 83 insertions(+), 42 deletions(-) diff --git a/internal/backup/redis_string.go b/internal/backup/redis_string.go index cd000e65..7ad2d39c 100644 --- a/internal/backup/redis_string.go +++ b/internal/backup/redis_string.go @@ -72,6 +72,24 @@ var ErrRedisInvalidStringValue = cockroachdberr.New("backup: invalid !redis|str| // expected 8-byte big-endian uint64 millisecond expiry. var ErrRedisInvalidTTLValue = cockroachdberr.New("backup: invalid !redis|ttl| value") +// ErrPendingTTLBufferFull is returned by HandleTTL when an +// unknown-kind TTL arrives but the pendingTTL buffer is already at +// pendingTTLCap. The encoder fails closed here rather than silently +// counting the TTL as an orphan because in real Pebble snapshot +// order (`!redis|ttl|` lex-sorts before `!st|`/`!stream|`/`!zs|`), +// the dropped entry would likely belong to a valid wide-column key +// that arrives later — losing its expire_at_ms would produce a +// restored database with non-expiring data that the source +// snapshot's clients expected to expire. Codex P1 on PR #790 +// round 5. +// +// Recovery: raise WithPendingTTLCap above the snapshot's count of +// unmatched-at-intake TTLs, or set the cap to 0 to explicitly opt +// into the lossy counter-only mode (callers that prefer not to +// see this error must accept that orphan-counted entries will be +// dropped without inlining into wide-column state). +var ErrPendingTTLBufferFull = cockroachdberr.New("backup: pendingTTL buffer at cap; raise WithPendingTTLCap or accept orphan-counter mode via WithPendingTTLCap(0)") + // redisKeyKind tracks which Redis-type prefix introduced a user key, so that // when a later !redis|ttl| record arrives we know whether to write its // expiry into strings_ttl.jsonl, hll_ttl.jsonl, or buffer it for a wide- @@ -391,37 +409,52 @@ func (r *RedisDB) HandleTTL(userKey, value []byte) error { st.hasTTL = true return nil case redisKindUnknown: - r.parkUnknownTTL(userKey, expireAtMs) - return nil + return r.parkUnknownTTL(userKey, expireAtMs) } return nil } // parkUnknownTTL buffers a redisKindUnknown TTL into pendingTTL, or -// falls back to the immediate orphan-count path when the buffer is -// at cap. Extracted from HandleTTL's switch so the parent stays -// under the cyclop budget. +// fails closed when the buffer is at cap. Extracted from HandleTTL's +// switch so the parent stays under the cyclop budget. +// +// Three modes determined by pendingTTLCap: +// +// - cap > 0 and buffer NOT full: store the (userKey, expireAtMs) +// pair so a later wide-column state-init can drain it. +// - cap == 0: counter-only mode. The TTL becomes an immediate +// orphan. Operator-explicit opt-out for callers that prefer +// constant-space orphan counting over the buffered drain path. +// - cap > 0 and buffer FULL: fail closed with +// ErrPendingTTLBufferFull. Silently counting the entry as an +// orphan would permanently lose `expire_at_ms` for the wide- +// column key that arrives later — restored data becomes +// non-expiring without the operator noticing. Codex P1 on PR +// #790 round 5. // -// Rationale (codex P1 on PR #791 round 2; same fix mirrored here): -// Pebble snapshots emit records in encoded-key order, and -// `!redis|ttl|` lex-sorts BEFORE every wide-column prefix where the -// type letter is >= 's' (set, stream, zset). The buffer lets us -// drain the parked expiry when the typed record finally arrives; -// the cap prevents an adversarial snapshot whose TTLs never find a -// claimer from OOM'ing the decoder. Storage: userKey is COPIED -// (`string([]byte)` allocates) because the snapshot reader reuses -// key buffers across iterations — an alias slice would race with -// the next record. -func (r *RedisDB) parkUnknownTTL(userKey []byte, expireAtMs uint64) { +// Storage: userKey is COPIED (`string([]byte)` allocates) because +// the snapshot reader reuses key buffers across iterations — an +// alias slice would race with the next record. +func (r *RedisDB) parkUnknownTTL(userKey []byte, expireAtMs uint64) error { + if r.pendingTTLCap == 0 { + // Counter-only mode (operator explicitly disabled the buffer). + r.orphanTTLCount++ + return nil + } if len(r.pendingTTL) >= r.pendingTTLCap { - // Cap reached: fall back to the immediate-orphan path so - // memory stays bounded. Already-buffered entries can still - // be drained by later state-inits. + // Fail closed: refuse to silently drop a TTL that may + // belong to a wide-column key arriving later in the + // snapshot scan. The operator should raise the cap + // (WithPendingTTLCap) or investigate the snapshot for + // corruption. We still increment pendingTTLOverflow so + // the Finalize warning surfaces the count even if the + // caller swallows the error. r.pendingTTLOverflow++ - r.orphanTTLCount++ - return + return cockroachdberr.Wrapf(ErrPendingTTLBufferFull, + "buffer at cap=%d (user_key_len=%d)", r.pendingTTLCap, len(userKey)) } r.pendingTTL[string(userKey)] = expireAtMs + return nil } // claimPendingTTL drains any buffered TTL for userKey into the diff --git a/internal/backup/redis_string_test.go b/internal/backup/redis_string_test.go index 104a74eb..38de429b 100644 --- a/internal/backup/redis_string_test.go +++ b/internal/backup/redis_string_test.go @@ -617,50 +617,58 @@ func TestRedisDB_OrphanTTLCountedNotBuffered(t *testing.T) { } } -// TestRedisDB_PendingTTLBoundedByCap pins the codex P1 fix on PR -// #791 round 2 (mirrored here on PR #790 round 4): pendingTTL must -// NOT grow unboundedly. Once the cap is reached, subsequent -// unknown-kind TTLs fall back to immediate-orphan counting without -// buffering the user-key bytes. -func TestRedisDB_PendingTTLBoundedByCap(t *testing.T) { +// TestRedisDB_PendingTTLFailsClosedAtCap pins the codex P1 fix on +// PR #790 round 5: when pendingTTL reaches cap and another +// unknown-kind TTL arrives, HandleTTL fails closed with +// ErrPendingTTLBufferFull rather than silently counting the entry +// as an orphan. Silent overflow would permanently lose +// expire_at_ms for wide-column keys that arrive later in the scan, +// producing restored data with TTLs the source snapshot expected +// to expire. +func TestRedisDB_PendingTTLFailsClosedAtCap(t *testing.T) { t.Parallel() const cap = 8 db, _ := newRedisDB(t) db.WithPendingTTLCap(cap) - for i := 0; i < cap*2; i++ { + // Fill the buffer to exactly cap — should all succeed. + for i := 0; i < cap; i++ { key := []byte("orphan-" + intToDecimal(i)) ms := uint64(i) + 1 //nolint:gosec // i bounded above if err := db.HandleTTL(key, encodeTTLValue(ms)); err != nil { - t.Fatalf("HandleTTL[%d]: %v", i, err) + t.Fatalf("HandleTTL[%d]: %v (should succeed within cap)", i, err) } } if got := len(db.pendingTTL); got != cap { t.Fatalf("pendingTTL len = %d, want %d (cap)", got, cap) } - if got := db.orphanTTLCount; got != cap { - t.Fatalf("orphanTTLCount = %d at intake, want %d (overflow path)", got, cap) + // One more entry should fail closed. + err := db.HandleTTL([]byte("orphan-overflow"), encodeTTLValue(999)) + if !errors.Is(err, ErrPendingTTLBufferFull) { + t.Fatalf("err = %v, want ErrPendingTTLBufferFull at cap", err) } - if got := db.pendingTTLOverflow; got != cap { - t.Fatalf("pendingTTLOverflow = %d, want %d", got, cap) + if got := db.pendingTTLOverflow; got != 1 { + t.Fatalf("pendingTTLOverflow = %d, want 1", got) } - if err := db.Finalize(); err != nil { - t.Fatalf("Finalize: %v", err) - } - if got := db.orphanTTLCount; got != cap*2 { - t.Fatalf("orphanTTLCount = %d after Finalize, want %d", got, cap*2) + // orphanTTLCount must NOT be incremented at intake — the + // caller has to handle the error explicitly. + if got := db.orphanTTLCount; got != 0 { + t.Fatalf("orphanTTLCount = %d at intake, want 0 (failed closed)", got) } } -// TestRedisDB_WithPendingTTLCapZeroDisablesBuffering pins that -// cap==0 reverts to the original immediate-orphan path. -func TestRedisDB_WithPendingTTLCapZeroDisablesBuffering(t *testing.T) { +// TestRedisDB_WithPendingTTLCapZeroOpts pins the explicit +// counter-only mode: cap==0 disables buffering entirely and every +// unknown-kind TTL becomes an immediate orphan WITHOUT firing +// ErrPendingTTLBufferFull. Operators that prefer constant-space +// orphan-counting over the fail-closed path opt in via cap=0. +func TestRedisDB_WithPendingTTLCapZeroOpts(t *testing.T) { t.Parallel() db, _ := newRedisDB(t) db.WithPendingTTLCap(0) const n = 5 for i := 0; i < n; i++ { if err := db.HandleTTL([]byte("k"+intToDecimal(i)), encodeTTLValue(1)); err != nil { - t.Fatal(err) + t.Fatalf("HandleTTL[%d]: %v (counter-only mode must not fail)", i, err) } } if got := len(db.pendingTTL); got != 0 { From 8eca5cdf88c330f9a44c65705efe49044556329d Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Wed, 20 May 2026 05:08:22 +0900 Subject: [PATCH 07/14] =?UTF-8?q?backup(zset):=20PR790=20r6=20codex=20P1?= =?UTF-8?q?=20=E2=80=94=20bound=20pendingTTL=20by=20bytes,=20not=20entries?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex P1 on round 5: the entry-count cap introduced in r4 doesn't actually bound memory. Redis user keys can be up to 1 MiB each, so an entry-count cap of 1 MiB entries permits up to 1 TiB of buffered key bytes — defeating the OOM protection on adversarial snapshots with large keys. Fix: replace entry-count cap with a byte budget. Each entry costs `len(userKey) + 8` bytes (the 8 is the uint64 expireAtMs payload; Go-map bucket overhead is intentionally NOT counted so the budget stays deterministic for operators). The budget is decremented when claimPendingTTL drains an entry, so a long-running scan that drains as it goes can reuse the budget. Renames: - WithPendingTTLCap → WithPendingTTLByteCap (the cap is now bytes) - defaultPendingTTLCap → defaultPendingTTLBytesCap (64 MiB) - pendingTTLCap field → pendingTTLBytesCap (struct field) - Added pendingTTLBytes counter (current cumulative cost) - Added pendingTTLEntryOverheadBytes constant (= 8) Caller audit (per /loop standing instruction): - WithPendingTTLCap was introduced in r3/r4 of this PR; no external caller has consumed it yet. Verified via `grep -rn 'WithPendingTTLCap|pendingTTLCap\b' --include=*.go`: zero matches in the full tree (the rename is total). - parkUnknownTTL's signature unchanged (still returns error). The failure condition changes from "entry count >= cap" to "would exceed byte budget"; same fail-closed semantics, just measured in bytes. - claimPendingTTL gains a `r.pendingTTLBytes -= ...` line. Single caller (the wide-column state-init helpers); accounting matches parkUnknownTTL's add side exactly. - Finalize warning surfaces `pending_ttl_buffer_bytes_cap` instead of the old `pending_ttl_buffer_cap`. Field name change in the observable warning; no programmatic consumer exists yet. Tests renamed/added: - TestRedisDB_PendingTTLFailsClosedAtByteCap (renamed from TestRedisDB_PendingTTLFailsClosedAtCap; switched to byte-cap semantics with 8-byte keys × 16-byte cost). - New TestRedisDB_PendingTTLByteCapBoundedByLargeKey pins the exact bug class codex flagged: a single 100-byte key blocks under a 64-byte budget. - New TestRedisDB_PendingTTLByteBudgetReclaimedOnClaim pins that draining via claimPendingTTL frees byte budget. - TestRedisDB_WithPendingTTLByteCapZeroOpts (renamed). - TestRedisDB_WithPendingTTLByteCapNegativeCoercedToZero (renamed). Self-review: 1. Data loss - byte-budget cap is a strictly tighter bound than the previous entry-count cap; no new data-loss surface. fail-closed semantics from r5 unchanged. 2. Concurrency - pendingTTLBytes is per-RedisDB; same single- threaded contract as the rest of the struct. 3. Performance - one int comparison + add per parkUnknownTTL; one int subtract per claimPendingTTL. O(1) overhead. 4. Consistency - byte budget matches the OOM protection reviewer asked for, sized at 64 MiB to comfortably fit any realistic legitimate workload. 5. Coverage - 5 tests total (3 new, 2 renamed). All pass with -race. Existing TestRedisDB_OrphanTTLCountedNotBuffered still passes with the default 64 MiB budget (10K small keys ≈ 180 KB). --- internal/backup/redis_string.go | 169 +++++++++++++++++---------- internal/backup/redis_string_test.go | 127 ++++++++++++++------ 2 files changed, 198 insertions(+), 98 deletions(-) diff --git a/internal/backup/redis_string.go b/internal/backup/redis_string.go index 7ad2d39c..6524b0d8 100644 --- a/internal/backup/redis_string.go +++ b/internal/backup/redis_string.go @@ -73,22 +73,27 @@ var ErrRedisInvalidStringValue = cockroachdberr.New("backup: invalid !redis|str| var ErrRedisInvalidTTLValue = cockroachdberr.New("backup: invalid !redis|ttl| value") // ErrPendingTTLBufferFull is returned by HandleTTL when an -// unknown-kind TTL arrives but the pendingTTL buffer is already at -// pendingTTLCap. The encoder fails closed here rather than silently -// counting the TTL as an orphan because in real Pebble snapshot -// order (`!redis|ttl|` lex-sorts before `!st|`/`!stream|`/`!zs|`), -// the dropped entry would likely belong to a valid wide-column key -// that arrives later — losing its expire_at_ms would produce a -// restored database with non-expiring data that the source -// snapshot's clients expected to expire. Codex P1 on PR #790 -// round 5. +// unknown-kind TTL arrives but the pendingTTL buffer's BYTE +// budget (pendingTTLBytesCap) is already exhausted. The encoder +// fails closed here rather than silently counting the TTL as an +// orphan because in real Pebble snapshot order (`!redis|ttl|` +// lex-sorts before `!st|`/`!stream|`/`!zs|`), the dropped entry +// would likely belong to a valid wide-column key that arrives +// later — losing its expire_at_ms would produce a restored +// database with non-expiring data that the source snapshot's +// clients expected to expire. // -// Recovery: raise WithPendingTTLCap above the snapshot's count of -// unmatched-at-intake TTLs, or set the cap to 0 to explicitly opt -// into the lossy counter-only mode (callers that prefer not to -// see this error must accept that orphan-counted entries will be -// dropped without inlining into wide-column state). -var ErrPendingTTLBufferFull = cockroachdberr.New("backup: pendingTTL buffer at cap; raise WithPendingTTLCap or accept orphan-counter mode via WithPendingTTLCap(0)") +// The budget is in BYTES (not entries) because Redis user keys can +// be up to 1 MiB each; an entry-count cap of N still permits N +// MiB of accumulated key bytes, which defeats the OOM protection +// on adversarial snapshots with large keys. Codex P1 on PR #790 +// round 5 (entry-count -> byte-budget on round 6). +// +// Recovery: raise WithPendingTTLByteCap above the snapshot's +// expected cumulative byte cost of unmatched-at-intake TTLs, or +// set the byte cap to 0 to explicitly opt into the lossy +// counter-only mode. +var ErrPendingTTLBufferFull = cockroachdberr.New("backup: pendingTTL byte budget exhausted; raise WithPendingTTLByteCap or accept orphan-counter mode via WithPendingTTLByteCap(0)") // redisKeyKind tracks which Redis-type prefix introduced a user key, so that // when a later !redis|ttl| record arrives we know whether to write its @@ -233,30 +238,48 @@ type RedisDB struct { // appeared as a typed record — likely a corrupted store). pendingTTL map[string]uint64 - // pendingTTLCap caps pendingTTL's in-memory size. Once the map - // reaches this many entries, subsequent unknown-kind TTLs fall - // back to incrementing orphanTTLCount directly without - // buffering the user-key bytes. Without this cap, an - // adversarial or corrupt snapshot whose `!redis|ttl|` records - // never find a typed-record claimer would grow pendingTTL - // unboundedly and could OOM the decoder. Codex P1 finding - // surfaced on PR #791 round 2; the same fix is applied here on - // PR #790 because the two PRs share the pendingTTL - // infrastructure and the bug is identical. - pendingTTLCap int - - // pendingTTLOverflow counts entries skipped because the - // pendingTTL buffer was at cap. Surfaced in the Finalize - // warning so operators can distinguish "snapshot exceeded the - // buffer cap" from "TTL records remained unmatched". + // pendingTTLBytes tracks the cumulative byte cost of the + // pendingTTL buffer (each entry costs len(userKey) + 8 bytes, + // where 8 is the uint64 expireAtMs payload — the per-entry + // Go-map overhead is not counted here, since we want the cap + // to be a deterministic byte budget the operator can reason + // about). + pendingTTLBytes int + + // pendingTTLBytesCap caps the byte cost of pendingTTL. Once + // the cumulative cost would exceed this budget, subsequent + // unknown-kind TTLs fail closed with ErrPendingTTLBufferFull. + // We bound by bytes (not by entry count) because Redis user + // keys can be up to 1 MiB each; an entry-count cap of N + // would still permit ~N MiB of accumulated key bytes — + // defeating the OOM protection on adversarial snapshots with + // large keys. Codex P1 finding on PR #790 round 6. + pendingTTLBytesCap int + + // pendingTTLOverflow counts entries that would have entered + // pendingTTL but were rejected because the byte budget was + // exhausted. Surfaced in the Finalize warning so operators + // can distinguish "snapshot exceeded the buffer budget" from + // "TTL records remained unmatched after the entire scan". pendingTTLOverflow int } -// defaultPendingTTLCap caps pendingTTL at 1 MiB entries by default -// (~64 MiB worst-case memory at ~64 B per Go map entry). Override -// via WithPendingTTLCap for hosts that need a different memory / -// coverage trade-off. -const defaultPendingTTLCap = 1 << 20 +// defaultPendingTTLBytesCap caps pendingTTL at 64 MiB cumulative +// key+payload bytes by default. Override via WithPendingTTLByteCap +// for hosts that need a different memory / coverage trade-off. +// 64 MiB covers tens of thousands to millions of typical Redis +// keys (usually << 100 bytes each); the absolute key-size ceiling +// is 1 MiB, so this budget tolerates ~64 maximally-large keys +// without dropping the OOM protection. +const defaultPendingTTLBytesCap = 64 << 20 + +// pendingTTLEntryOverheadBytes is the per-entry payload cost we +// charge against pendingTTLBytesCap on top of the user-key bytes. +// It accounts for the uint64 expireAtMs we store as the map value. +// The Go-map's bucket overhead is NOT included — we want a +// deterministic byte budget the operator can reason about +// directly, not one that drifts with Go's runtime map layout. +const pendingTTLEntryOverheadBytes = 8 // NewRedisDB constructs a RedisDB rooted at /redis/db_/. // dbIndex selects ; today the producer always passes 0, but accepting @@ -267,30 +290,37 @@ func NewRedisDB(outRoot string, dbIndex int) *RedisDB { dbIndex = 0 } return &RedisDB{ - outRoot: outRoot, - dbIndex: dbIndex, - kindByKey: make(map[string]redisKeyKind), - dirsCreated: make(map[string]struct{}), - inlineTTLEmitted: make(map[string]struct{}), - hashes: make(map[string]*redisHashState), - lists: make(map[string]*redisListState), - sets: make(map[string]*redisSetState), - zsets: make(map[string]*redisZSetState), - pendingTTL: make(map[string]uint64), - pendingTTLCap: defaultPendingTTLCap, + outRoot: outRoot, + dbIndex: dbIndex, + kindByKey: make(map[string]redisKeyKind), + dirsCreated: make(map[string]struct{}), + inlineTTLEmitted: make(map[string]struct{}), + hashes: make(map[string]*redisHashState), + lists: make(map[string]*redisListState), + sets: make(map[string]*redisSetState), + zsets: make(map[string]*redisZSetState), + pendingTTL: make(map[string]uint64), + pendingTTLBytesCap: defaultPendingTTLBytesCap, } } -// WithPendingTTLCap overrides the default cap on the pendingTTL -// buffer. A value of 0 disables buffering — every unknown-kind TTL -// becomes an immediate orphan (matches the pre-pendingTTL behavior). -// Negative inputs are coerced to 0. Returns the receiver so it can -// be chained with other With* setters. -func (r *RedisDB) WithPendingTTLCap(capacity int) *RedisDB { +// WithPendingTTLByteCap overrides the default byte budget for the +// pendingTTL buffer. A value of 0 disables buffering — every +// unknown-kind TTL becomes an immediate orphan (matches the +// pre-pendingTTL behavior). Negative inputs are coerced to 0. +// Returns the receiver so it can be chained with other With* +// setters. +// +// The budget is in BYTES (sum of `len(userKey) + +// pendingTTLEntryOverheadBytes` over every buffered entry), NOT +// entry count, because Redis user keys can be up to 1 MiB each +// and an entry-count cap of N would still permit ~N MiB of +// accumulated key bytes. Codex P1 finding on PR #790 round 6. +func (r *RedisDB) WithPendingTTLByteCap(capacity int) *RedisDB { if capacity < 0 { capacity = 0 } - r.pendingTTLCap = capacity + r.pendingTTLBytesCap = capacity return r } @@ -418,7 +448,7 @@ func (r *RedisDB) HandleTTL(userKey, value []byte) error { // fails closed when the buffer is at cap. Extracted from HandleTTL's // switch so the parent stays under the cyclop budget. // -// Three modes determined by pendingTTLCap: +// Three modes determined by pendingTTLBytesCap: // // - cap > 0 and buffer NOT full: store the (userKey, expireAtMs) // pair so a later wide-column state-init can drain it. @@ -436,24 +466,31 @@ func (r *RedisDB) HandleTTL(userKey, value []byte) error { // the snapshot reader reuses key buffers across iterations — an // alias slice would race with the next record. func (r *RedisDB) parkUnknownTTL(userKey []byte, expireAtMs uint64) error { - if r.pendingTTLCap == 0 { + if r.pendingTTLBytesCap == 0 { // Counter-only mode (operator explicitly disabled the buffer). r.orphanTTLCount++ return nil } - if len(r.pendingTTL) >= r.pendingTTLCap { + // Charge the byte cost: user-key bytes + the uint64 expireAtMs + // payload. The Go-map's per-bucket overhead is NOT counted + // because we want the cap to be a deterministic byte budget + // the operator can reason about. See pendingTTLEntryOverheadBytes. + cost := len(userKey) + pendingTTLEntryOverheadBytes + if r.pendingTTLBytes+cost > r.pendingTTLBytesCap { // Fail closed: refuse to silently drop a TTL that may // belong to a wide-column key arriving later in the - // snapshot scan. The operator should raise the cap - // (WithPendingTTLCap) or investigate the snapshot for - // corruption. We still increment pendingTTLOverflow so - // the Finalize warning surfaces the count even if the - // caller swallows the error. + // snapshot scan. The operator should raise the budget + // (WithPendingTTLByteCap) or investigate the snapshot + // for corruption. We still increment pendingTTLOverflow + // so the Finalize warning surfaces the count even if + // the caller swallows the error. r.pendingTTLOverflow++ return cockroachdberr.Wrapf(ErrPendingTTLBufferFull, - "buffer at cap=%d (user_key_len=%d)", r.pendingTTLCap, len(userKey)) + "buffer would exceed byte_cap=%d by %d bytes (user_key_len=%d, in_flight=%d)", + r.pendingTTLBytesCap, r.pendingTTLBytes+cost-r.pendingTTLBytesCap, len(userKey), r.pendingTTLBytes) } r.pendingTTL[string(userKey)] = expireAtMs + r.pendingTTLBytes += cost return nil } @@ -479,6 +516,10 @@ func (r *RedisDB) claimPendingTTL(userKey []byte) (uint64, bool) { return 0, false } delete(r.pendingTTL, uk) + // Free the byte budget so a later snapshot scan that drains + // many entries can buffer further work. Matches the cost + // charged in parkUnknownTTL exactly. + r.pendingTTLBytes -= len(uk) + pendingTTLEntryOverheadBytes return expireAtMs, true } @@ -518,7 +559,7 @@ func (r *RedisDB) Finalize() error { if r.pendingTTLOverflow > 0 { fields = append(fields, "pending_ttl_buffer_overflow", r.pendingTTLOverflow, - "pending_ttl_buffer_cap", r.pendingTTLCap) + "pending_ttl_buffer_bytes_cap", r.pendingTTLBytesCap) } r.warn("redis_orphan_ttl", fields...) } diff --git a/internal/backup/redis_string_test.go b/internal/backup/redis_string_test.go index 38de429b..5a01c247 100644 --- a/internal/backup/redis_string_test.go +++ b/internal/backup/redis_string_test.go @@ -617,54 +617,113 @@ func TestRedisDB_OrphanTTLCountedNotBuffered(t *testing.T) { } } -// TestRedisDB_PendingTTLFailsClosedAtCap pins the codex P1 fix on -// PR #790 round 5: when pendingTTL reaches cap and another -// unknown-kind TTL arrives, HandleTTL fails closed with -// ErrPendingTTLBufferFull rather than silently counting the entry -// as an orphan. Silent overflow would permanently lose -// expire_at_ms for wide-column keys that arrive later in the scan, -// producing restored data with TTLs the source snapshot expected -// to expire. -func TestRedisDB_PendingTTLFailsClosedAtCap(t *testing.T) { +// TestRedisDB_PendingTTLFailsClosedAtByteCap pins the codex P1 fix +// on PR #790 round 6: the byte budget (not entry count) bounds +// pendingTTL memory. When an incoming entry's byte cost would +// exceed pendingTTLBytesCap, HandleTTL fails closed with +// ErrPendingTTLBufferFull rather than silently counting it as an +// orphan. Without this, an adversarial snapshot with a small +// number of 1 MiB unmatched TTL keys could still OOM the decoder +// — defeating the entry-count cap from r4/r5. +func TestRedisDB_PendingTTLFailsClosedAtByteCap(t *testing.T) { t.Parallel() - const cap = 8 + // Each "orphan-N" key (N=0..7) is 8 bytes. With + // pendingTTLEntryOverheadBytes=8, each entry costs 16 bytes. + // 8 entries = 128 bytes; 9th would push to 144. Cap = 128 → + // 8 fit, 9th fails closed. + const byteCap = 128 + const entriesPerByteCap = 8 db, _ := newRedisDB(t) - db.WithPendingTTLCap(cap) - // Fill the buffer to exactly cap — should all succeed. - for i := 0; i < cap; i++ { + db.WithPendingTTLByteCap(byteCap) + for i := 0; i < entriesPerByteCap; i++ { key := []byte("orphan-" + intToDecimal(i)) ms := uint64(i) + 1 //nolint:gosec // i bounded above if err := db.HandleTTL(key, encodeTTLValue(ms)); err != nil { - t.Fatalf("HandleTTL[%d]: %v (should succeed within cap)", i, err) + t.Fatalf("HandleTTL[%d]: %v (should succeed within byte_cap)", i, err) } } - if got := len(db.pendingTTL); got != cap { - t.Fatalf("pendingTTL len = %d, want %d (cap)", got, cap) + if got := len(db.pendingTTL); got != entriesPerByteCap { + t.Fatalf("pendingTTL len = %d, want %d", got, entriesPerByteCap) } - // One more entry should fail closed. - err := db.HandleTTL([]byte("orphan-overflow"), encodeTTLValue(999)) + if got := db.pendingTTLBytes; got != byteCap { + t.Fatalf("pendingTTLBytes = %d, want %d", got, byteCap) + } + // One more entry should fail closed (would push to 144 > 128). + err := db.HandleTTL([]byte("orphan-X"), encodeTTLValue(999)) if !errors.Is(err, ErrPendingTTLBufferFull) { - t.Fatalf("err = %v, want ErrPendingTTLBufferFull at cap", err) + t.Fatalf("err = %v, want ErrPendingTTLBufferFull at byte_cap", err) } if got := db.pendingTTLOverflow; got != 1 { t.Fatalf("pendingTTLOverflow = %d, want 1", got) } - // orphanTTLCount must NOT be incremented at intake — the - // caller has to handle the error explicitly. if got := db.orphanTTLCount; got != 0 { t.Fatalf("orphanTTLCount = %d at intake, want 0 (failed closed)", got) } } -// TestRedisDB_WithPendingTTLCapZeroOpts pins the explicit -// counter-only mode: cap==0 disables buffering entirely and every -// unknown-kind TTL becomes an immediate orphan WITHOUT firing -// ErrPendingTTLBufferFull. Operators that prefer constant-space -// orphan-counting over the fail-closed path opt in via cap=0. -func TestRedisDB_WithPendingTTLCapZeroOpts(t *testing.T) { +// TestRedisDB_PendingTTLByteCapBoundedByLargeKey pins the +// large-key defense codex flagged on PR #790 round 6: a single +// 1 MiB key would have fit under any reasonable entry-count cap +// but exhausts a sensible byte budget. We use a synthetic small +// budget (64 bytes) and a 100-byte key to exercise the same logic +// without allocating a real 1 MiB blob. +func TestRedisDB_PendingTTLByteCapBoundedByLargeKey(t *testing.T) { + t.Parallel() + const byteCap = 64 + db, _ := newRedisDB(t) + db.WithPendingTTLByteCap(byteCap) + largeKey := make([]byte, 100) // 100 + 8 = 108 > 64 + for i := range largeKey { + largeKey[i] = byte(i % 256) + } + err := db.HandleTTL(largeKey, encodeTTLValue(1)) + if !errors.Is(err, ErrPendingTTLBufferFull) { + t.Fatalf("err = %v, want ErrPendingTTLBufferFull on oversize key", err) + } + if got := len(db.pendingTTL); got != 0 { + t.Fatalf("pendingTTL must be empty after failed insert, got %d", got) + } +} + +// TestRedisDB_PendingTTLByteBudgetReclaimedOnClaim pins that the +// byte counter is decremented when an entry is drained via +// claimPendingTTL. A later wide-column state-init that drains the +// buffer must free byte budget so subsequent unknown-kind TTLs can +// be buffered again. +func TestRedisDB_PendingTTLByteBudgetReclaimedOnClaim(t *testing.T) { + t.Parallel() + db, _ := newRedisDB(t) + const byteCap = 32 // ("k0"=2 + 8) * 3 = 30 entries fit; 4th would overflow at 40 + db.WithPendingTTLByteCap(byteCap) + for i := 0; i < 3; i++ { + if err := db.HandleTTL([]byte("k"+intToDecimal(i)), encodeTTLValue(1)); err != nil { + t.Fatalf("HandleTTL[%d]: %v", i, err) + } + } + if got := db.pendingTTLBytes; got != 30 { + t.Fatalf("pendingTTLBytes = %d, want 30", got) + } + // Drain k0 via a wide-column state-init. + if err := db.HandleSetMember(setMemberKey("k0", []byte("m")), nil); err != nil { + t.Fatal(err) + } + if got := db.pendingTTLBytes; got != 20 { + t.Fatalf("pendingTTLBytes after drain = %d, want 20 (reclaimed 10 bytes)", got) + } + // New buffer space available — another small entry fits. + if err := db.HandleTTL([]byte("k3"), encodeTTLValue(1)); err != nil { + t.Fatalf("HandleTTL after drain failed: %v", err) + } +} + +// TestRedisDB_WithPendingTTLByteCapZeroOpts pins the explicit +// counter-only mode: byte_cap==0 disables buffering entirely and +// every unknown-kind TTL becomes an immediate orphan WITHOUT +// firing ErrPendingTTLBufferFull. +func TestRedisDB_WithPendingTTLByteCapZeroOpts(t *testing.T) { t.Parallel() db, _ := newRedisDB(t) - db.WithPendingTTLCap(0) + db.WithPendingTTLByteCap(0) const n = 5 for i := 0; i < n; i++ { if err := db.HandleTTL([]byte("k"+intToDecimal(i)), encodeTTLValue(1)); err != nil { @@ -679,14 +738,14 @@ func TestRedisDB_WithPendingTTLCapZeroOpts(t *testing.T) { } } -// TestRedisDB_WithPendingTTLCapNegativeCoercedToZero pins input -// sanitisation. -func TestRedisDB_WithPendingTTLCapNegativeCoercedToZero(t *testing.T) { +// TestRedisDB_WithPendingTTLByteCapNegativeCoercedToZero pins +// input sanitisation. +func TestRedisDB_WithPendingTTLByteCapNegativeCoercedToZero(t *testing.T) { t.Parallel() db, _ := newRedisDB(t) - db.WithPendingTTLCap(-100) - if db.pendingTTLCap != 0 { - t.Fatalf("pendingTTLCap = %d after WithPendingTTLCap(-100), want 0", db.pendingTTLCap) + db.WithPendingTTLByteCap(-100) + if db.pendingTTLBytesCap != 0 { + t.Fatalf("pendingTTLBytesCap = %d after WithPendingTTLByteCap(-100), want 0", db.pendingTTLBytesCap) } } From 7f28222efd90233f26ef5ce2fc321649b905d437 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Wed, 20 May 2026 15:17:10 +0900 Subject: [PATCH 08/14] =?UTF-8?q?backup(zset):=20PR790=20r7=20codex=20P2?= =?UTF-8?q?=20=E2=80=94=20short-circuit=20duplicate-key=20parkUnknownTTL?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit codex P2 on round 6: when parkUnknownTTL receives the same userKey more than once before the wide-column state-init claims it, the map write overwrites the prior value (latest-wins) but pendingTTLBytes was incremented on every call. That inflated the byte-budget accounting and could trigger spurious ErrPendingTTLBufferFull at byte-budget boundaries on inputs that replay or duplicate `!redis|ttl|` records, even though the buffered map size had not grown. Same finding was flagged by claude r5/r6/r7 as a Low priority carry-over. Fix: hoist the `string(userKey)` conversion once at the top, then short-circuit on an already-buffered key by overwriting the entry and returning nil. The byte counter only advances when a NEW key is added; duplicates are O(1) and free. Caller audit (per /loop standing instruction): grep -rn 'parkUnknownTTL\b' --include='*.go' → exactly one caller: HandleTTL's redisKindUnknown branch (redis_string.go:442). The error semantics for callers are unchanged — the new branch returns nil (success), strictly broadening the success path. No caller observes a regression. Test added: - TestRedisDB_PendingTTLDuplicateKeyDoesNotDoubleCountBytes pins the exact bug class codex flagged: a replay of an in-buffer key must not advance pendingTTLBytes, and a duplicate at exact byte-budget cap must succeed (not fire ErrPendingTTLBufferFull). Extracted pendingTTLState / capturePendingTTLState / assertPendingTTLState helpers to keep the test under the cyclop=10 ceiling. Self-review: 1. Data loss — strictly more permissive: previously-failing duplicate-at-cap now succeeds. Map size invariant (and thus OOM protection) holds because we do not add a new entry on the duplicate path. 2. Concurrency — pendingTTL/pendingTTLBytes are single-threaded per RedisDB; same contract as before. 3. Performance — duplicate path is now O(1) hashmap lookup + write; first-write path is unchanged (one extra hashmap probe, negligible). 4. Consistency — latest-wins overwrite matches the pattern Pebble uses for snapshot iterators when duplicates surface from replay/recovery; consistent with claimPendingTTL's "drain and forget" semantic. 5. Coverage — new test pins both the no-double-count invariant and the at-cap success path. All other pendingTTL tests still pass with -race. --- internal/backup/redis_string.go | 13 +++++- internal/backup/redis_string_test.go | 67 ++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+), 1 deletion(-) diff --git a/internal/backup/redis_string.go b/internal/backup/redis_string.go index 6524b0d8..3f8b44c6 100644 --- a/internal/backup/redis_string.go +++ b/internal/backup/redis_string.go @@ -471,6 +471,17 @@ func (r *RedisDB) parkUnknownTTL(userKey []byte, expireAtMs uint64) error { r.orphanTTLCount++ return nil } + uk := string(userKey) + if _, alreadyBuffered := r.pendingTTL[uk]; alreadyBuffered { + // Latest-wins overwrite: a healthy Pebble snapshot never + // emits duplicate `!redis|ttl|` records for the same user + // key, but a replay or fragile input must not inflate the + // byte counter — the map size has not grown, so charging + // `cost` again would spuriously trigger ErrPendingTTLBufferFull + // at byte-budget boundaries. Codex P2 on PR #790 round 7. + r.pendingTTL[uk] = expireAtMs + return nil + } // Charge the byte cost: user-key bytes + the uint64 expireAtMs // payload. The Go-map's per-bucket overhead is NOT counted // because we want the cap to be a deterministic byte budget @@ -489,7 +500,7 @@ func (r *RedisDB) parkUnknownTTL(userKey []byte, expireAtMs uint64) error { "buffer would exceed byte_cap=%d by %d bytes (user_key_len=%d, in_flight=%d)", r.pendingTTLBytesCap, r.pendingTTLBytes+cost-r.pendingTTLBytesCap, len(userKey), r.pendingTTLBytes) } - r.pendingTTL[string(userKey)] = expireAtMs + r.pendingTTL[uk] = expireAtMs r.pendingTTLBytes += cost return nil } diff --git a/internal/backup/redis_string_test.go b/internal/backup/redis_string_test.go index 5a01c247..9623c9ea 100644 --- a/internal/backup/redis_string_test.go +++ b/internal/backup/redis_string_test.go @@ -716,6 +716,73 @@ func TestRedisDB_PendingTTLByteBudgetReclaimedOnClaim(t *testing.T) { } } +// pendingTTLState snapshots the in-flight buffer counters at one +// observation point. +type pendingTTLState struct { + bytes int + mapLen int + overflow int +} + +func capturePendingTTLState(db *RedisDB) pendingTTLState { + return pendingTTLState{ + bytes: db.pendingTTLBytes, + mapLen: len(db.pendingTTL), + overflow: db.pendingTTLOverflow, + } +} + +func assertPendingTTLState(t *testing.T, label string, got, want pendingTTLState) { + t.Helper() + if got != want { + t.Fatalf("%s: got %+v, want %+v", label, got, want) + } +} + +// TestRedisDB_PendingTTLDuplicateKeyDoesNotDoubleCountBytes pins +// the codex P2 fix (PR #790 round 7): when parkUnknownTTL receives +// the same userKey more than once before the key is claimed, the +// map entry overwrites in place (latest-wins) and the byte counter +// stays unchanged. A duplicate at byte-budget boundary must not +// spuriously fire ErrPendingTTLBufferFull. +func TestRedisDB_PendingTTLDuplicateKeyDoesNotDoubleCountBytes(t *testing.T) { + t.Parallel() + db, _ := newRedisDB(t) + const byteCap = 20 // single key "k0" costs 2+8 = 10 bytes; cap fits exactly 2 entries. + db.WithPendingTTLByteCap(byteCap) + + if err := db.HandleTTL([]byte("k0"), encodeTTLValue(1)); err != nil { + t.Fatalf("first HandleTTL: %v", err) + } + assertPendingTTLState(t, "after first", capturePendingTTLState(db), + pendingTTLState{bytes: 10, mapLen: 1, overflow: 0}) + + // Replay the same userKey with a later expiry — must overwrite, + // not inflate the byte counter. + if err := db.HandleTTL([]byte("k0"), encodeTTLValue(2)); err != nil { + t.Fatalf("duplicate HandleTTL: %v (must succeed; map size unchanged)", err) + } + assertPendingTTLState(t, "after duplicate", capturePendingTTLState(db), + pendingTTLState{bytes: 10, mapLen: 1, overflow: 0}) + if got := db.pendingTTL["k0"]; got != 2 { + t.Fatalf("pendingTTL[k0] = %d, want latest-wins value 2", got) + } + + // Fill the cap with a second key. + if err := db.HandleTTL([]byte("k1"), encodeTTLValue(1)); err != nil { + t.Fatalf("second-key HandleTTL: %v", err) + } + assertPendingTTLState(t, "after fill", capturePendingTTLState(db), + pendingTTLState{bytes: 20, mapLen: 2, overflow: 0}) + + // At exact cap; duplicate of an existing key must still succeed. + if err := db.HandleTTL([]byte("k0"), encodeTTLValue(3)); err != nil { + t.Fatalf("duplicate at-cap HandleTTL: %v (must succeed; was P2 regression)", err) + } + assertPendingTTLState(t, "after duplicate at-cap", capturePendingTTLState(db), + pendingTTLState{bytes: 20, mapLen: 2, overflow: 0}) +} + // TestRedisDB_WithPendingTTLByteCapZeroOpts pins the explicit // counter-only mode: byte_cap==0 disables buffering entirely and // every unknown-kind TTL becomes an immediate orphan WITHOUT From f8ce906ecfd88b829a3a1e56b19078607bede3bf Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Wed, 20 May 2026 15:34:30 +0900 Subject: [PATCH 09/14] =?UTF-8?q?backup(zset):=20PR790=20r8=20codex=20P1?= =?UTF-8?q?=20=E2=80=94=20raise=20default=20pendingTTL=20cap=20to=201=20Gi?= =?UTF-8?q?B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit codex P1 on round 7: the 64 MiB defaultPendingTTLBytesCap turns large-but-valid datasets into hard failures. Because !redis|ttl| lex-sorts BEFORE !st|/!stream|/!zs|, ALL expiring wide-column TTLs must be parked in pendingTTL before any wide-column row arrives to drain them — the peak buffer size equals sum((userKeyLen + 8) for all expiring wide-column keys in the database). A real Redis with 20M expiring wide- column keys × 50-byte user keys peaks at ~1.16 GiB. The previous 64 MiB default would hard-fail with ErrPendingTTLBufferFull on healthy snapshots. There is no in-repo caller raising the ceiling (WithPendingTTLByteCap is only exercised in tests), so the default IS the policy in practice — and the cmd/elastickv-snapshot-decode binary (task 23) is still WIP, so operators have no override path yet. Fix: raise default to 1 GiB. Sizing rationale: - 1 GiB tolerates a 20M-expiring-wide-column-key workload at the average 50-byte user-key size. - Still a hard OOM ceiling: adversarial snapshots with TB of 1 MiB keys still fail closed (the byte-budget accounting is unchanged). - Typical dump-host budget headroom is 16+ GiB, so 1 GiB is a small fraction of available memory. Caller audit (per /loop standing instruction): no semantics change on parkUnknownTTL — the constant value changed only. All callers (HandleTTL redisKindUnknown branch, single site) see the same fail-closed-at-cap semantic; only the cap value shifts. Tests that exercise WithPendingTTLByteCap with explicit (smaller) caps still work because they override the default. Test added: - TestRedisDB_DefaultPendingTTLByteCapSurvives1GiBLoad pins the new constant invariant at 1 GiB and verifies NewRedisDB applies it. Self-review: 1. Data loss — strictly safer than r6: previous default refused healthy snapshots; now accepts them while the byte budget protects against OOM. 2. Concurrency — no new shared state; constant change only. 3. Performance — no measurable impact. 4. Consistency — byte-budget accounting and fail-closed semantics unchanged; only the threshold shifts. 5. Coverage — new sentinel test pins the constant; all other pendingTTL tests still pass with -race. --- internal/backup/redis_string.go | 27 +++++++++++++++++++++------ internal/backup/redis_string_test.go | 23 +++++++++++++++++++++++ 2 files changed, 44 insertions(+), 6 deletions(-) diff --git a/internal/backup/redis_string.go b/internal/backup/redis_string.go index 3f8b44c6..85420052 100644 --- a/internal/backup/redis_string.go +++ b/internal/backup/redis_string.go @@ -89,6 +89,13 @@ var ErrRedisInvalidTTLValue = cockroachdberr.New("backup: invalid !redis|ttl| va // on adversarial snapshots with large keys. Codex P1 on PR #790 // round 5 (entry-count -> byte-budget on round 6). // +// The default cap is 1 GiB (see defaultPendingTTLBytesCap), sized +// to accommodate typical real-world workloads — a 20M-expiring- +// wide-column-key snapshot with 50-byte keys peaks at ~1.16 GiB +// of buffered pairs before draining. The OOM ceiling is still +// hard (adversarial snapshots with TB of large keys fail closed). +// Codex P1 on PR #790 round 7 / PR #791 round 7. +// // Recovery: raise WithPendingTTLByteCap above the snapshot's // expected cumulative byte cost of unmatched-at-intake TTLs, or // set the byte cap to 0 to explicitly opt into the lossy @@ -264,14 +271,22 @@ type RedisDB struct { pendingTTLOverflow int } -// defaultPendingTTLBytesCap caps pendingTTL at 64 MiB cumulative +// defaultPendingTTLBytesCap caps pendingTTL at 1 GiB cumulative // key+payload bytes by default. Override via WithPendingTTLByteCap // for hosts that need a different memory / coverage trade-off. -// 64 MiB covers tens of thousands to millions of typical Redis -// keys (usually << 100 bytes each); the absolute key-size ceiling -// is 1 MiB, so this budget tolerates ~64 maximally-large keys -// without dropping the OOM protection. -const defaultPendingTTLBytesCap = 64 << 20 +// +// Sizing rationale (codex P1 on PR #791 round 7): the buffer +// reaches peak size = sum of (userKeyLen + 8) for every expiring +// wide-column key in the database, because `!redis|ttl|` lex-sorts +// BEFORE `!st|`/`!stream|`/`!zs|` — all wide-column TTLs are +// parked before any wide-column row drains them via +// claimPendingTTL. A real database with 20M expiring wide-column +// keys × 50-byte user keys peaks at ~1.16 GiB; the old 64 MiB +// default would hard-fail healthy backups (no in-repo caller +// currently raises the ceiling, so the default IS the policy). +// 1 GiB is the OOM ceiling — adversarial snapshots with TB of +// 1 MiB keys still fail closed via ErrPendingTTLBufferFull. +const defaultPendingTTLBytesCap = 1 << 30 // pendingTTLEntryOverheadBytes is the per-entry payload cost we // charge against pendingTTLBytesCap on top of the user-key bytes. diff --git a/internal/backup/redis_string_test.go b/internal/backup/redis_string_test.go index 9623c9ea..b6cabbca 100644 --- a/internal/backup/redis_string_test.go +++ b/internal/backup/redis_string_test.go @@ -816,6 +816,29 @@ func TestRedisDB_WithPendingTTLByteCapNegativeCoercedToZero(t *testing.T) { } } +// TestRedisDB_DefaultPendingTTLByteCapSurvives1GiBLoad pins the +// codex P1 fix (PR #790 round 8): the default cap must comfortably +// accommodate "millions of expiring wide-column keys" without +// hitting ErrPendingTTLBufferFull. Specifically the old 64 MiB +// default would refuse a 10M-expiring-wide-column-key snapshot +// with average 50-byte keys (cost = 10M * 58 = 580 MB); the new +// 1 GiB default tolerates that workload. +func TestRedisDB_DefaultPendingTTLByteCapSurvives1GiBLoad(t *testing.T) { + t.Parallel() + // Sentinel constant invariant: the default must be exactly 1 GiB. + // Below 64 MiB would resurrect the round-6 regression; above + // ~4 GiB risks raising the OOM ceiling beyond typical dump-host + // budget headroom. + const wantDefault = 1 << 30 + if defaultPendingTTLBytesCap != wantDefault { + t.Fatalf("defaultPendingTTLBytesCap = %d, want %d (1 GiB)", defaultPendingTTLBytesCap, wantDefault) + } + db, _ := newRedisDB(t) + if db.pendingTTLBytesCap != wantDefault { + t.Fatalf("NewRedisDB pendingTTLBytesCap = %d, want %d", db.pendingTTLBytesCap, wantDefault) + } +} + func TestRedisDB_DirsCreatedCachesMkdirAll(t *testing.T) { t.Parallel() // Two HandleString calls in a row should populate dirsCreated From 060ce8a988f3a60350919f3d501cd57472cd0011 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Wed, 20 May 2026 20:41:43 +0900 Subject: [PATCH 10/14] =?UTF-8?q?backup(zset):=20PR790=20r9=20codex=20P2?= =?UTF-8?q?=20=E2=80=94=20preserve=20legacy=20zset=20members=20on=20meta-o?= =?UTF-8?q?nly?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit codex P2 on round 7: data-loss bug. HandleZSetMeta calling markZSetWide(st) dropped any legacy `!redis|zset|` members as soon as a `!zs|meta|` row appeared, even when no `!zs|mem|` rows ever arrived. But the live read path (adapter/redis_compat_helpers.go:610-621) switches to the wide- column layout ONLY when the `!zs|mem|` prefix scan finds at least one row — a snapshot containing only `!zs|meta|` (no member rows) continues to serve the legacy blob to live clients. The backup encoder dropping the legacy members on meta-alone produced empty/stale zsets on restore — actual user-visible data loss. Although codex labelled this P2, the substance is data-loss on restore for a legitimate snapshot shape (meta-without-members can arise transiently during migration, after partial scans, or on shards where members were pruned but meta lingered). Fix: remove `r.markZSetWide(st)` from HandleZSetMeta. Only HandleZSetMember triggers the supersedence rule, matching the live read path's "switch to wide-column only when a member row exists" semantic. Meta-alone is now treated as bookkeeping (declaredLen + metaSeen for length-mismatch warnings); it does NOT invalidate the legacy blob. Caller audit (per /loop standing instruction): grep -rn 'markZSetWide\|HandleZSetMeta\b' --include='*.go' → markZSetWide has exactly one caller now: HandleZSetMember (redis_zset.go:198). The supersedence rule still fires on the first wide-column MEMBER row, matching the live read path. HandleZSetMeta callers in tests are unaffected: those that follow up with HandleZSetMember still observe markZSetWide via the member row; the new test TestRedisDB_ZSetMetaAlonePreservesLegacyBlob pins the meta-only case explicitly. Test changes: - TestRedisDB_ZSetMetaAloneEvictsLegacyBlob (asserted the OLD buggy behavior) → renamed and inverted to TestRedisDB_ZSetMetaAlonePreservesLegacyBlob (asserts the correct behavior). Test now seeds a 2-member legacy blob, emits meta-only with declaredLen=2, finalizes, and asserts both members survive in the output JSON. - Other tests that combine HandleZSetMeta + HandleZSetMember still pass because HandleZSetMember triggers markZSetWide on the first member row. Self-review: 1. Data loss — STRICTLY safer: previously the encoder dropped members that the live read still served; now meta-only preserves the legacy blob. Wide-column member rows still correctly supersede legacy on first observation (TestRedisDB_ZSetWideColumnSupersedesLegacyBlob still passes). 2. Concurrency — no new shared state; behavior change only. 3. Performance — one less map-clear call on meta-only snapshots. 4. Consistency — encoder semantics now MATCH the live read path's loadZSetAt at adapter/redis_compat_helpers.go:610-621. This is the alignment the rest of the legacy/wide-column handling is designed around. 5. Coverage — TestRedisDB_ZSetMetaAlonePreservesLegacyBlob pins the new behavior. TestRedisDB_ZSetWideColumnSupersedesLegacyBlob (member row triggers supersedence) and TestRedisDB_ZSetLegacyBlobAfterWideRowsIsDropped (reverse order) still pass. --- internal/backup/redis_zset.go | 14 ++++++---- internal/backup/redis_zset_test.go | 45 ++++++++++++++++++++++-------- 2 files changed, 42 insertions(+), 17 deletions(-) diff --git a/internal/backup/redis_zset.go b/internal/backup/redis_zset.go index fbdfb278..348792b0 100644 --- a/internal/backup/redis_zset.go +++ b/internal/backup/redis_zset.go @@ -153,11 +153,15 @@ func (r *RedisDB) HandleZSetMeta(key, value []byte) error { st := r.zsetState(userKey) st.declaredLen = int64(rawLen) //nolint:gosec // bounds-checked above st.metaSeen = true - // Wide-column meta means any legacy blob already merged into - // this state is stale; drop it. The live read path - // (adapter/redis_compat_helpers.go:610-620) makes the same - // choice on read. - r.markZSetWide(st) + // Codex P2 (PR #790 r9): do NOT call markZSetWide here. The + // live read path (adapter/redis_compat_helpers.go:610-621) + // switches to the wide-column layout only when the !zs|mem| + // scan finds at least one row. A snapshot containing a + // !zs|meta| row WITHOUT any !zs|mem| rows continues to serve + // the legacy `!redis|zset|` blob to live clients. If we + // invalidated the legacy members on meta-alone, the backup + // would emit an empty/stale zset and lose user-visible data + // on restore. Only HandleZSetMember triggers markZSetWide. return nil } diff --git a/internal/backup/redis_zset_test.go b/internal/backup/redis_zset_test.go index 4cd76476..c6a7eddc 100644 --- a/internal/backup/redis_zset_test.go +++ b/internal/backup/redis_zset_test.go @@ -717,31 +717,52 @@ func TestRedisDB_ZSetWideColumnSupersedesLegacyBlob(t *testing.T) { }) } -// TestRedisDB_ZSetMetaAloneEvictsLegacyBlob pins that a `!zs|meta|` -// record (even without any `!zs|mem|` rows) also triggers the -// legacy-supersedence rule. The live read path's loadZSetAt -// preference is based on whether ANY wide-column key exists for -// the user key — meta-only is rare in practice but the encoder -// stays consistent with the read-side contract. -func TestRedisDB_ZSetMetaAloneEvictsLegacyBlob(t *testing.T) { +// TestRedisDB_ZSetMetaAlonePreservesLegacyBlob pins the codex P2 +// fix on PR #790 round 9: a `!zs|meta|` record WITHOUT any +// `!zs|mem|` rows must NOT evict the legacy blob. The live read +// path (adapter/redis_compat_helpers.go:610-621) switches to the +// wide-column layout only when the `!zs|mem|` scan finds at +// least one row — a snapshot containing `!zs|meta|` but no +// `!zs|mem|` rows continues to serve the legacy +// `!redis|zset|` blob to live clients. Earlier rounds +// invalidated the legacy members on meta-alone, which would +// have produced an empty zset on restore — actual user-visible +// data loss. +func TestRedisDB_ZSetMetaAlonePreservesLegacyBlob(t *testing.T) { t.Parallel() db, root := newRedisDB(t) legacy := encodeZSetLegacyBlobValue(t, []zsetLegacyEntry{ - {member: "stale", score: 1}, + {member: "kept", score: 1}, + {member: "also-kept", score: 2.5}, }) if err := db.HandleZSetLegacyBlob(zsetLegacyBlobKey("k"), legacy); err != nil { t.Fatal(err) } - // Wide-column meta record declares zero members. - if err := db.HandleZSetMeta(zsetMetaKey("k"), encodeZSetMetaValue(0)); err != nil { + // Wide-column meta record declares some count but no !zs|mem| + // rows ever arrive — the live read path would still serve the + // legacy blob, so the backup must do the same. + if err := db.HandleZSetMeta(zsetMetaKey("k"), encodeZSetMetaValue(2)); err != nil { t.Fatal(err) } if err := db.Finalize(); err != nil { t.Fatal(err) } got := readZSetJSON(t, filepath.Join(root, "redis", "db_0", "zsets", "k.json")) - if members := zsetMembersArray(t, got); len(members) != 0 { - t.Fatalf("meta-only wide-column row MUST evict legacy blob, got members=%v", members) + members := zsetMembersArray(t, got) + if len(members) != 2 { + t.Fatalf("meta-only must preserve legacy blob members, got %d (want 2)", len(members)) + } + wantMembers := map[string]float64{"kept": 1, "also-kept": 2.5} + for _, m := range members { + mMap, ok := m.(map[string]any) + if !ok { + t.Fatalf("member entry not object: %T", m) + } + name, _ := mMap["member"].(string) + score, _ := mMap["score"].(float64) + if wantScore, ok := wantMembers[name]; !ok || wantScore != score { + t.Fatalf("unexpected member %q score %v (want from %v)", name, score, wantMembers) + } } } From 6f1f07169dd794cea408201327a1f83fb4505cd3 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 21 May 2026 04:45:13 +0900 Subject: [PATCH 11/14] =?UTF-8?q?backup(zset):=20PR790=20r11=20claude=20r9?= =?UTF-8?q?/r10=20carryover=20=E2=80=94=20docstring=20+=20sizing=20+=20mis?= =?UTF-8?q?sing=20test?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three carryover items from claude reviews r3-r10, none blocking merge by themselves but the user has asked for them to be cleaned up. All three are doc/coverage; no code semantics change. (1) HandleZSetLegacyBlob docstring claimed `!redis|zset|` sorts BEFORE `!redis|ttl|`. Wrong: 'z'=0x7A > 't'=0x74 on the byte after the shared `!redis|` prefix, so `!redis|ttl|` sorts BEFORE `!redis|zset|` in natural Pebble order. The code was always correct (uses claimPendingTTL to drain the pre-arrived TTL); only the doc misled readers. Same wrong claim in TestRedisDB_ZSetLegacyBlobWithInlineTTL's docstring. (2) defaultPendingTTLBytesCap doc + ErrPendingTTLBufferFull doc both said the 1 GiB cap "accommodates" / "tolerates" a 20M- expiring-wide-column-key workload "peaking at ~1.16 GiB". But 1.16 GiB > 1 GiB — the cap fails-closed at ~18.5M keys (1,073,741,824 / 58 ≈ 18,513,652). Operators trusting the guidance would hit ErrPendingTTLBufferFull. Corrected both docstrings to state the actual ~18.5M ceiling and note that larger deployments must raise the cap. (3) TestRedisDB_ZSetLegacyBlobWithInlineTTL exercised only blob-first → TTL-second (the redisKindZSet HandleTTL fast-path), which is the REVERSE of natural Pebble order. Added TestRedisDB_ZSetLegacyBlobTTLArrivesBeforeBlob to cover the production code path: TTL first → parked in pendingTTL as redisKindUnknown → HandleZSetLegacyBlob → zsetState() drains via claimPendingTTL. Pins both the expireAtMs round-trip and the member survival. Caller audit (per /loop standing instruction): documentation change only on (1) and (2); no semantics change. Test addition in (3) is purely additive. No caller-side audit needed. Self-review: 1. Data loss — none. Docs and a new test only. 2. Concurrency — none. 3. Performance — none. 4. Consistency — docs now accurately describe runtime behavior. The sizing claim and scan-order claim are factually correct. 5. Coverage — new test pins the natural Pebble order for legacy blobs with TTL; previously only the reverse order was covered. Existing tests still pass with -race. --- internal/backup/redis_string.go | 29 +++++++++------- internal/backup/redis_zset.go | 30 ++++++++++------- internal/backup/redis_zset_test.go | 53 ++++++++++++++++++++++++++---- 3 files changed, 84 insertions(+), 28 deletions(-) diff --git a/internal/backup/redis_string.go b/internal/backup/redis_string.go index 85420052..d72ee2d6 100644 --- a/internal/backup/redis_string.go +++ b/internal/backup/redis_string.go @@ -90,11 +90,14 @@ var ErrRedisInvalidTTLValue = cockroachdberr.New("backup: invalid !redis|ttl| va // round 5 (entry-count -> byte-budget on round 6). // // The default cap is 1 GiB (see defaultPendingTTLBytesCap), sized -// to accommodate typical real-world workloads — a 20M-expiring- -// wide-column-key snapshot with 50-byte keys peaks at ~1.16 GiB -// of buffered pairs before draining. The OOM ceiling is still -// hard (adversarial snapshots with TB of large keys fail closed). -// Codex P1 on PR #790 round 7 / PR #791 round 7. +// to accommodate typical real-world workloads — at the 58-byte +// per-entry cost (50-byte user key + 8-byte expireAtMs payload) +// the cap holds up to ~18.5M expiring wide-column keys before +// fail-closed (1,073,741,824 / 58 ≈ 18,513,652). Larger +// deployments must raise the ceiling via WithPendingTTLByteCap. +// The OOM ceiling is still hard (adversarial snapshots with TB +// of large keys fail closed). Codex P1 on PR #790 round 7; +// claude r9/r10 sizing-claim correction in round 11. // // Recovery: raise WithPendingTTLByteCap above the snapshot's // expected cumulative byte cost of unmatched-at-intake TTLs, or @@ -280,12 +283,16 @@ type RedisDB struct { // wide-column key in the database, because `!redis|ttl|` lex-sorts // BEFORE `!st|`/`!stream|`/`!zs|` — all wide-column TTLs are // parked before any wide-column row drains them via -// claimPendingTTL. A real database with 20M expiring wide-column -// keys × 50-byte user keys peaks at ~1.16 GiB; the old 64 MiB -// default would hard-fail healthy backups (no in-repo caller -// currently raises the ceiling, so the default IS the policy). -// 1 GiB is the OOM ceiling — adversarial snapshots with TB of -// 1 MiB keys still fail closed via ErrPendingTTLBufferFull. +// claimPendingTTL. At the 58-byte per-entry cost (50-byte user +// key + 8-byte expireAtMs payload) the 1 GiB cap holds up to +// ~18.5M expiring wide-column keys before fail-closed +// (1,073,741,824 / 58 ≈ 18,513,652). Deployments above that +// threshold must raise the ceiling via WithPendingTTLByteCap; +// the old 64 MiB default hard-failed at ~1.1M which is too +// small for real Redis workloads. 1 GiB is the OOM ceiling — +// adversarial snapshots with TB of 1 MiB keys still fail +// closed via ErrPendingTTLBufferFull. Sizing-claim correction +// in claude r9/r10 follow-up (round 11). const defaultPendingTTLBytesCap = 1 << 30 // pendingTTLEntryOverheadBytes is the per-entry payload cost we diff --git a/internal/backup/redis_zset.go b/internal/backup/redis_zset.go index 348792b0..8a419159 100644 --- a/internal/backup/redis_zset.go +++ b/internal/backup/redis_zset.go @@ -247,18 +247,26 @@ func (r *RedisDB) HandleZSetMetaDelta(_, _ []byte) error { return nil } // regardless of which layout the live store used. NaN scores fail // closed at intake, matching HandleZSetMember's contract. // -// The legacy prefix `!redis|zset|` lex-sorts BEFORE `!zs|...` and -// BEFORE `!redis|ttl|`, so when a zset is stored in both formats -// (mid-migration), this handler creates the state first and the -// later wide-column records merge into it — duplicate -// HandleZSetMember calls follow the same latest-wins policy. +// Scan-order context (per Pebble lexicographic order): // -// `!redis|zset|` ALSO sorts BEFORE `!redis|ttl|`, so an inline TTL -// on the same user key will reach HandleTTL after this handler has -// already registered redisKindZSet. The HandleTTL redisKindZSet -// branch then folds the expiry into st.expireAtMs via zsetState -// (which itself drains pendingTTL — a no-op here since the typed -// record came first). +// - `!redis|zset|` (0x72='r') sorts BEFORE `!zs|...` (0x7A='z'), +// so a mid-migration store with both formats hits this handler +// first; later wide-column records merge into the same state. +// Duplicate HandleZSetMember calls follow the latest-wins policy. +// +// - `!redis|zset|` (0x72='r' < 0x74='t' for the byte after the +// shared `!redis|` prefix would be the wrong direction; the +// actual comparison is byte 'z'=0x7A > 't'=0x74) — so +// `!redis|ttl|` sorts BEFORE `!redis|zset|`. The real +// production order for a TTL'd legacy zset is therefore: +// +// !redis|ttl|k → HandleTTL (redisKindUnknown) → pendingTTL["k"] = ms +// !redis|zset|k → HandleZSetLegacyBlob → zsetState() → claimPendingTTL() +// drains pendingTTL into st.expireAtMs +// +// The HandleTTL redisKindZSet fast-path branch only fires for +// the reverse order (custom dispatcher or replay), where a +// wide-column row registered the key before the TTL arrived. func (r *RedisDB) HandleZSetLegacyBlob(key, value []byte) error { userKey, ok := parseZSetLegacyBlobKey(key) if !ok { diff --git a/internal/backup/redis_zset_test.go b/internal/backup/redis_zset_test.go index c6a7eddc..f977d7b8 100644 --- a/internal/backup/redis_zset_test.go +++ b/internal/backup/redis_zset_test.go @@ -795,12 +795,16 @@ func TestRedisDB_ZSetLegacyBlobAfterWideRowsIsDropped(t *testing.T) { }) } -// TestRedisDB_ZSetLegacyBlobWithInlineTTL pins that a TTL'd zset -// stored only via the legacy blob round-trips its expiry. The -// snapshot order is `!redis|zset|` (sorts before `!redis|ttl|`), -// so HandleZSetLegacyBlob runs first and registers redisKindZSet, -// then HandleTTL routes via the redisKindZSet branch (no -// pendingTTL detour needed for this ordering). +// TestRedisDB_ZSetLegacyBlobWithInlineTTL pins the reverse-order +// path: HandleZSetLegacyBlob runs first (custom dispatcher or +// replay), so HandleTTL routes via the redisKindZSet fast-path +// branch (no pendingTTL detour needed). +// +// NOTE: This is NOT the natural Pebble scan order. In real Pebble +// order, `!redis|ttl|` sorts BEFORE `!redis|zset|` (0x74='t' +// < 0x7A='z' on the byte after the shared `!redis|` prefix). +// TestRedisDB_ZSetLegacyBlobTTLArrivesBeforeBlob covers the +// natural order via the pendingTTL → claimPendingTTL path. func TestRedisDB_ZSetLegacyBlobWithInlineTTL(t *testing.T) { t.Parallel() db, root := newRedisDB(t) @@ -820,6 +824,43 @@ func TestRedisDB_ZSetLegacyBlobWithInlineTTL(t *testing.T) { } } +// TestRedisDB_ZSetLegacyBlobTTLArrivesBeforeBlob pins the +// claude r3-r10 carry-over: in natural Pebble scan order +// `!redis|ttl|` arrives BEFORE `!redis|zset|` because +// `t`=0x74 < `z`=0x7A in the byte after the shared `!redis|` +// prefix. The TTL is parked in pendingTTL as redisKindUnknown; +// HandleZSetLegacyBlob then registers the user key via +// zsetState() which drains the parked TTL via claimPendingTTL +// into st.expireAtMs. This is the production code path — +// previously only the reverse order +// (TestRedisDB_ZSetLegacyBlobWithInlineTTL) was tested. +func TestRedisDB_ZSetLegacyBlobTTLArrivesBeforeBlob(t *testing.T) { + t.Parallel() + db, root := newRedisDB(t) + // Real Pebble order: TTL first. + if err := db.HandleTTL([]byte("k"), encodeTTLValue(fixedExpireMs)); err != nil { + t.Fatal(err) + } + // Then the legacy blob. + value := encodeZSetLegacyBlobValue(t, []zsetLegacyEntry{{member: "m", score: 1}}) + if err := db.HandleZSetLegacyBlob(zsetLegacyBlobKey("k"), value); err != nil { + t.Fatal(err) + } + if err := db.Finalize(); err != nil { + t.Fatal(err) + } + got := readZSetJSON(t, filepath.Join(root, "redis", "db_0", "zsets", "k.json")) + if zsetFloat(t, got, "expire_at_ms") != float64(fixedExpireMs) { + t.Fatalf("expire_at_ms = %v want %d (pendingTTL drain path for legacy blob failed)", + got["expire_at_ms"], fixedExpireMs) + } + // Confirm the member survived the drain. + members := zsetMembersArray(t, got) + if len(members) != 1 { + t.Fatalf("members = %d, want 1", len(members)) + } +} + // TestRedisDB_ZSetLegacyBlobRejectsMissingMagic pins fail-closed // behaviour: a `!redis|zset|` value without the magic prefix // fails at intake rather than silently decoding garbage protobuf. From 130391341c5e9b7fa44381733b557a0dbb3cdde8 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 21 May 2026 22:26:47 +0900 Subject: [PATCH 12/14] =?UTF-8?q?backup(zset):=20PR790=20r12=20claude=20r2?= =?UTF-8?q?-r10=20carryover=20=E2=80=94=20round-trip=20base64=20envelope?= =?UTF-8?q?=20assertion?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit claude review carryover from r2-r10 (Low priority but a real bug): TestRedisDB_ZSetBinaryMemberUsesBase64Envelope's payload check was non-functional: if envelope["base64"] == "" { t.Fatalf("base64 envelope missing payload: %v", envelope) } envelope["base64"] is typed as `any`. Comparing `any` to the string literal "" with `==` performs interface comparison, which is never equal whenever the value is a non-nil string — including a corrupt payload like "garbage". The test passed for any base64 field that was a string, regardless of its content. Fix: round-trip the payload. Type-assert to string, decode via base64.StdEncoding.DecodeString, and bytes.Equal against the original input bytes [0x80, 0xff, 0x01]. Added "bytes" and "encoding/base64" imports. Extracted assertBase64EnvelopeDecodesTo helper (called via t.Helper) so the test function stays under the cyclop=10 ceiling. Caller audit (per /loop standing instruction): test-only change; no production code touched. No caller-side audit needed. Self-review: 1. Data loss — none. Test fix only. 2. Concurrency — none. 3. Performance — none. 4. Consistency — the test now actually verifies what its name claims: non-UTF-8 member bytes round-trip via the typed {"base64":"..."} envelope. Previously a regression that corrupted the payload would have silently passed. 5. Coverage — the new helper is also reusable for any future base64-envelope assertions on hashes/lists/streams. --- internal/backup/redis_zset_test.go | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/internal/backup/redis_zset_test.go b/internal/backup/redis_zset_test.go index f977d7b8..5ea48ae3 100644 --- a/internal/backup/redis_zset_test.go +++ b/internal/backup/redis_zset_test.go @@ -1,6 +1,8 @@ package backup import ( + "bytes" + "encoding/base64" "encoding/binary" "encoding/json" "math" @@ -343,8 +345,24 @@ func TestRedisDB_ZSetBinaryMemberUsesBase64Envelope(t *testing.T) { if !ok { t.Fatalf("expected base64 envelope on member, got %T(%v)", rec["member"], rec["member"]) } - if envelope["base64"] == "" { - t.Fatalf("base64 envelope missing payload: %v", envelope) + // Claude r2-r10 carryover: previous assertion compared + // interface{} to string with ==, which is never equal — any + // non-nil value (including a corrupt payload) passed. + assertBase64EnvelopeDecodesTo(t, envelope, []byte{0x80, 0xff, 0x01}) +} + +func assertBase64EnvelopeDecodesTo(t *testing.T, envelope map[string]any, want []byte) { + t.Helper() + payload, ok := envelope["base64"].(string) + if !ok || payload == "" { + t.Fatalf("base64 envelope missing payload (type=%T, value=%v)", envelope["base64"], envelope["base64"]) + } + decoded, err := base64.StdEncoding.DecodeString(payload) + if err != nil { + t.Fatalf("invalid base64 payload %q: %v", payload, err) + } + if !bytes.Equal(decoded, want) { + t.Fatalf("decoded member = %x, want %x", decoded, want) } } From ab7ee2ce7aa9878b65fbf70fe8ad9a2acd84cf69 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 21 May 2026 22:29:37 +0900 Subject: [PATCH 13/14] =?UTF-8?q?backup(zset):=20PR790=20r13=20claude=20ni?= =?UTF-8?q?ts=20=E2=80=94=20consolidate=20gosec=20nolints=20+=20rename=20c?= =?UTF-8?q?ontradictory=20test?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two style-nit cleanups per CLAUDE.md "Avoid //nolint — refactor instead" convention; both were carried by claude reviews across rounds 1-10. (1) //nolint:gosec suppressions in test helpers (lines 52, 69, 77, 90, 102 of the prior file). Five separate sites all doing the same uint32(len(userKey)) cast to pack the wideColKeyLen prefix per store/wideColKeyLenSize. Extracted appendUserKeyLenBE32 helper: it panics on the (theoretical) overflow and carries the SOLE nolint annotation in the file. Net: 5 nolints -> 1 nolint, plus an explicit bounds check that documents the invariant for future readers. Also added a non-negative bounds check to encodeZSetMetaValue (memberCount int64 -> uint64) which had a similar nolint annotation. (2) TestRedisDB_ZSetLegacyBlobRejectsMalformedKey had a contradictory docstring: it claimed to test "empty user key is technically valid Redis" then said "Use a key that doesn't have the prefix to trigger the parse failure" — which is the opposite. The test actually exercises the missing-prefix case. Renamed to TestRedisDB_ZSetLegacyBlobRejectsMissingPrefix and rewrote the docstring to accurately describe what it pins. Caller audit (per /loop standing instruction): test-only changes; no production code touched. appendUserKeyLenBE32 has no production callers (only test fixtures use it). Rename of the test function has no callers (test functions are entry points). Self-review: 1. Data loss — none. Refactor + rename only. 2. Concurrency — none. 3. Performance — same call overhead (append + putUint32); helper does not add an allocation. 4. Consistency — wire-format layout unchanged; helper just wraps the existing cast with bounds documentation. 5. Coverage — all existing tests still pass with -race; golangci-lint clean. --- internal/backup/redis_zset_test.go | 52 ++++++++++++++++++------------ 1 file changed, 32 insertions(+), 20 deletions(-) diff --git a/internal/backup/redis_zset_test.go b/internal/backup/redis_zset_test.go index 5ea48ae3..ed8512a2 100644 --- a/internal/backup/redis_zset_test.go +++ b/internal/backup/redis_zset_test.go @@ -45,11 +45,31 @@ func zsetLegacyBlobKey(userKey string) []byte { return append(out, userKey...) } +// appendUserKeyLenBE32 appends the 4-byte BE-uint32 length of +// userKey to out. Consolidates the five //nolint:gosec sites +// where test fixtures need to pack a userKey length per the +// store/wideColKeyLenSize wire layout. The cast is safe in test +// fixtures because every test userKey is a Go string literal far +// below math.MaxUint32, but the bounds check makes that explicit +// for the linter and any future reader. +func appendUserKeyLenBE32(out []byte, userKey string) []byte { + n := len(userKey) + if n > 0xFFFFFFFF { + panic("test fixture userKey exceeds uint32 length") + } + var l [4]byte + binary.BigEndian.PutUint32(l[:], uint32(n)) //nolint:gosec // n bounds-checked above + return append(out, l[:]...) +} + // encodeZSetMetaValue builds the 8-byte BE member-count value used by // the live store/zset_helpers.go (mirror of store.MarshalZSetMeta). func encodeZSetMetaValue(memberCount int64) []byte { + if memberCount < 0 { + panic("test fixture memberCount must be non-negative") + } v := make([]byte, 8) - binary.BigEndian.PutUint64(v, uint64(memberCount)) //nolint:gosec + binary.BigEndian.PutUint64(v, uint64(memberCount)) //nolint:gosec // memberCount bounds-checked above return v } @@ -65,9 +85,7 @@ func encodeZSetScoreValue(score float64) []byte { // !zs|meta|. func zsetMetaKey(userKey string) []byte { out := []byte(RedisZSetMetaPrefix) - var l [4]byte - binary.BigEndian.PutUint32(l[:], uint32(len(userKey))) //nolint:gosec - out = append(out, l[:]...) + out = appendUserKeyLenBE32(out, userKey) return append(out, userKey...) } @@ -75,9 +93,7 @@ func zsetMetaKey(userKey string) []byte { // !zs|mem|. Member is binary-safe. func zsetMemberKey(userKey string, member []byte) []byte { out := []byte(RedisZSetMemberPrefix) - var l [4]byte - binary.BigEndian.PutUint32(l[:], uint32(len(userKey))) //nolint:gosec - out = append(out, l[:]...) + out = appendUserKeyLenBE32(out, userKey) out = append(out, userKey...) return append(out, member...) } @@ -88,9 +104,7 @@ func zsetMemberKey(userKey string, member []byte) []byte { // HandleZSetScore in a test that pins the "discard" contract. func zsetScoreKey(userKey string, sortableScore [8]byte, member []byte) []byte { out := []byte(RedisZSetScorePrefix) - var l [4]byte - binary.BigEndian.PutUint32(l[:], uint32(len(userKey))) //nolint:gosec - out = append(out, l[:]...) + out = appendUserKeyLenBE32(out, userKey) out = append(out, userKey...) out = append(out, sortableScore[:]...) return append(out, member...) @@ -100,9 +114,7 @@ func zsetScoreKey(userKey string, sortableScore [8]byte, member []byte) []byte { // !zs|meta|d|. func zsetMetaDeltaKey(userKey string, commitTS uint64, seqInTxn uint32) []byte { out := []byte(RedisZSetMetaDeltaPrefix) - var l [4]byte - binary.BigEndian.PutUint32(l[:], uint32(len(userKey))) //nolint:gosec - out = append(out, l[:]...) + out = appendUserKeyLenBE32(out, userKey) out = append(out, userKey...) var ts [8]byte binary.BigEndian.PutUint64(ts[:], commitTS) @@ -908,16 +920,16 @@ func TestRedisDB_ZSetLegacyBlobRejectsNaNScore(t *testing.T) { } } -// TestRedisDB_ZSetLegacyBlobRejectsMalformedKey pins that a -// `!redis|zset|` key with no trailing user-key bytes fails parse. -func TestRedisDB_ZSetLegacyBlobRejectsMalformedKey(t *testing.T) { +// TestRedisDB_ZSetLegacyBlobRejectsMissingPrefix pins that a key +// without the `!redis|zset|` prefix fails parse via +// ErrRedisInvalidZSetLegacyBlob. Empty user key with the correct +// prefix IS valid Redis (technically reachable via a corrupted +// store) and is covered by parseZSetLegacyBlobKey's own unit +// path; only the missing-prefix case routes through this handler. +func TestRedisDB_ZSetLegacyBlobRejectsMissingPrefix(t *testing.T) { t.Parallel() db, _ := newRedisDB(t) value := encodeZSetLegacyBlobValue(t, []zsetLegacyEntry{{member: "m", score: 1}}) - // Key has the prefix but no trailing user-key bytes — parser must - // still accept it (empty user key is technically valid Redis). - // Use a key that doesn't have the prefix to trigger the parse - // failure. err := db.HandleZSetLegacyBlob([]byte("not-the-right-prefix|k"), value) if !errors.Is(err, ErrRedisInvalidZSetLegacyBlob) { t.Fatalf("err=%v want ErrRedisInvalidZSetLegacyBlob", err) From c9d49e281bc035b7bef118644dec8d83693fcce0 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 21 May 2026 23:19:45 +0900 Subject: [PATCH 14/14] =?UTF-8?q?backup(zset):=20PR790=20r14=20codex=20P2?= =?UTF-8?q?=20=E2=80=94=20Finalize=20warns=20on=20overflow-only?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit codex P2 on round 13: Finalize's warn sink fired only when orphanTTLCount > 0. But entries rejected in parkUnknownTTL are counted in pendingTTLOverflow and never contribute to orphanTTLCount (the overflow path returns ErrPendingTTLBufferFull before pendingTTL is touched). In runs where overflow happens but all other TTLs drain cleanly (orphanTTLCount == 0), no warning was emitted despite dropped expirations. A caller that swallows the HandleTTL error and continues would lose the visibility signal entirely. Fix: also emit warn when pendingTTLOverflow > 0. The condition becomes `orphanTTLCount > 0 || pendingTTLOverflow > 0`. The warning's existing field set already carries pending_ttl_buffer_overflow and pending_ttl_buffer_bytes_cap when overflow is non-zero — operators get the same diagnostics in the overflow-only path as in the orphan-plus-overflow path. Caller audit (per /loop standing instruction): grep -rn 'orphanTTLCount\b|pendingTTLOverflow\b' --include='*.go' - orphanTTLCount writers: parkUnknownTTL (cap==0 mode), Finalize (adds len(pendingTTL)) - pendingTTLOverflow writers: parkUnknownTTL (overflow branch) - Readers: Finalize (warn emit), tests The change broadens the warn-trigger predicate to include pendingTTLOverflow; no caller depends on the absence of the warning. The warn fields are unchanged for the orphan-only path (still fields existing tests assert). Test added: - TestRedisDB_FinalizeWarnsOnOverflowOnlyEvenWhenOrphanCountZero pins the new behavior: with byte_cap=16, key "k0" (cost=10) fits but key "k1" (would push to 20) fails closed. After draining k0 via HandleSetMember, orphanTTLCount stays at 0 while pendingTTLOverflow stays at 1. Asserts that the redis_orphan_ttl warning still fires AND carries pending_ttl_buffer_overflow in its fields. - Extracted warnFieldsContain helper to keep the test under cyclop=10. Self-review: 1. Data loss visibility — STRICTLY safer: operators now see the overflow signal even when orphanTTLCount==0. Previous behavior silently swallowed dropped expirations if the caller's HandleTTL error handling let them through. 2. Concurrency — none. Same single-threaded Finalize contract. 3. Performance — one extra integer comparison in Finalize (O(1)); negligible. 4. Consistency — predicate now matches the field set that was already added on overflow. The warning's reported "count" field remains orphanTTLCount; pending_ttl_buffer_overflow is a separate field already in the schema. No JSON schema change. 5. Coverage — new test pins both the warn emission and the field carrying. TestRedisDB_HandleTTL_OrphanWarnsOnFinalize still passes (orphan-only path unchanged). All other pendingTTL tests pass with -race. --- internal/backup/redis_string.go | 10 ++++- internal/backup/redis_string_test.go | 60 ++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 1 deletion(-) diff --git a/internal/backup/redis_string.go b/internal/backup/redis_string.go index d72ee2d6..eeb66b16 100644 --- a/internal/backup/redis_string.go +++ b/internal/backup/redis_string.go @@ -584,7 +584,15 @@ func (r *RedisDB) Finalize() error { // key whose type prefix we don't recognise (a future Redis // type added on the live side without a backup-encoder update). r.orphanTTLCount += len(r.pendingTTL) - if r.warn != nil && r.orphanTTLCount > 0 { + // Codex P2 (PR #790 r13): also warn on pendingTTLOverflow even + // when orphanTTLCount is zero. Overflow entries fail-closed at + // parkUnknownTTL with ErrPendingTTLBufferFull and never reach + // pendingTTL, so they don't contribute to orphanTTLCount. If + // the caller swallows the HandleTTL error and continues, the + // previous condition (`orphanTTLCount > 0` only) would suppress + // the visibility signal for dropped expirations. Treat overflow + // as an independent reason to warn. + if r.warn != nil && (r.orphanTTLCount > 0 || r.pendingTTLOverflow > 0) { fields := []any{ "count", r.orphanTTLCount, "hint", "TTL records whose user key never appeared in a typed record — possible store corruption or an unknown type prefix", diff --git a/internal/backup/redis_string_test.go b/internal/backup/redis_string_test.go index b6cabbca..eb4564c5 100644 --- a/internal/backup/redis_string_test.go +++ b/internal/backup/redis_string_test.go @@ -415,6 +415,66 @@ func TestRedisDB_HandleTTL_OrphanWarnsOnFinalize(t *testing.T) { } } +// warnFieldsContain returns true when fields includes the given +// even-indexed key. Helper for warn-sink assertions; avoids +// repeated `for i:=0; i<...; i+=2` in tests. +func warnFieldsContain(fields []any, key string) bool { + for i := 0; i+1 < len(fields); i += 2 { + if k, ok := fields[i].(string); ok && k == key { + return true + } + } + return false +} + +// TestRedisDB_FinalizeWarnsOnOverflowOnlyEvenWhenOrphanCountZero +// pins the codex P2 fix (PR #790 round 13): Finalize must emit the +// orphan-ttl warning when pendingTTLOverflow > 0, independently of +// orphanTTLCount. Previously the warn sink only fired when +// orphanTTLCount > 0, so a caller that swallowed +// ErrPendingTTLBufferFull would lose the visibility signal for +// dropped expirations. +func TestRedisDB_FinalizeWarnsOnOverflowOnlyEvenWhenOrphanCountZero(t *testing.T) { + t.Parallel() + db, _ := newRedisDB(t) + var events []string + var lastFields []any + db.WithWarnSink(func(event string, fields ...any) { + events = append(events, event) + lastFields = fields + }) + // Small byte-cap to force overflow on the second key. + const byteCap = 16 // single key "k0" costs 2+8=10; second key won't fit at 10+10=20>16. + db.WithPendingTTLByteCap(byteCap) + if err := db.HandleTTL([]byte("k0"), encodeTTLValue(1)); err != nil { + t.Fatalf("first HandleTTL: %v", err) + } + // Second key fails closed — overflow++, orphan stays at 0 + // (entry never enters pendingTTL). + if err := db.HandleTTL([]byte("k1"), encodeTTLValue(2)); !errors.Is(err, ErrPendingTTLBufferFull) { + t.Fatalf("expected ErrPendingTTLBufferFull, got %v", err) + } + // Drain k0 via a wide-column state-init so orphanTTLCount ends + // at 0; only pendingTTLOverflow remains > 0. + if err := db.HandleSetMember(setMemberKey("k0", []byte("m")), nil); err != nil { + t.Fatal(err) + } + if err := db.Finalize(); err != nil { + t.Fatal(err) + } + assertPendingTTLState(t, "post-finalize", capturePendingTTLState(db), + pendingTTLState{bytes: 0, mapLen: 0, overflow: 1}) + if db.orphanTTLCount != 0 { + t.Fatalf("orphanTTLCount = %d, want 0 (drained)", db.orphanTTLCount) + } + if len(events) != 1 || events[0] != "redis_orphan_ttl" { + t.Fatalf("events = %v want [redis_orphan_ttl] (overflow alone must trigger warn)", events) + } + if !warnFieldsContain(lastFields, "pending_ttl_buffer_overflow") { + t.Fatalf("warn fields missing pending_ttl_buffer_overflow: %v", lastFields) + } +} + func TestRedisDB_RejectsTruncatedNewFormat(t *testing.T) { t.Parallel() cases := [][]byte{