From 42fb7ba3dba26822478f1ce71baa38b6bf5329fc Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Tue, 19 May 2026 18:33:48 +0900 Subject: [PATCH 1/3] backup: snapshot_reader (Phase 0a foundation for snapshot-decode binary) Adds internal/backup/snapshot_reader.go: the .fsm file parser that underpins the cmd/elastickv-snapshot-decode binary (design docs/design/2026_04_29_proposed_snapshot_logical_decoder.md lines 440-491). Mirrors the native Pebble snapshot format produced by store/lsm_store.go::pebbleSnapshotMagic + restoreBatchLoopInto. File shape consumed: [8 bytes] magic "EKVPBBL1" [8 bytes] lastCommitTS (LittleEndian uint64) repeated: [8 bytes] keyLen (LittleEndian uint64) [keyLen] encoded key = [8 bytes] valLen (LittleEndian uint64) [valLen] encoded value = Per-entry decoding strips: - The 8-byte inverted-TS suffix from the key (recovers commit_ts via XOR-inversion across the math.MaxUint64 boundary). - The 9-byte value header from the value (surfaces tombstone flag, encryption_state, expireAt). API surface (ReadSnapshot + SnapshotHeader + SnapshotEntry + sentinel errors). Callback-based so callers can stream multi-GB snapshots without buffering the whole file in memory; the emitted SnapshotEntry's byte slices alias scratch buffers and the caller must bytes.Clone for retention. Fail-closed contract (matches design 7.1): - ErrSnapshotBadMagic - first 8 bytes not "EKVPBBL1" (operator pointed at the wrong file) - ErrSnapshotTruncated - EOF mid-entry (a clean inter-entry EOF is the normal terminator and is NOT an error) - ErrSnapshotShortKey - encoded key < 8 bytes (no room for TS) - ErrSnapshotShortValue - encoded value < 9 bytes (no room for value header) - ErrSnapshotEncryptedReserved - reserved encryption_state bits (0b10, 0b11) or bits 3-7 set; same fail-closed trip-wire as store.ErrEncryptedValueReservedState - ErrSnapshotEncryptedEntry - encState=0b01 (encrypted). Phase 0a does not link the decryption keyring; Stage 8 of the encryption rollout adds a keyring-aware variant. Self-review: 1. Data loss - tombstone flag surfaced verbatim so callers can choose to suppress (default Phase 0a backup behavior) or include (a future diagnostic dump might want both). EOF distinction between clean terminator and truncation prevents silently dropping a partial entry. 2. Concurrency - reader is single-pass over an io.Reader; no shared state, no goroutines. 3. Performance - per-entry slices alias a fixed 64 KiB key buffer and a growable value buffer. For a 10M-entry snapshot this keeps allocs O(1) outside the value-grow boundary. bufio wrapper amortises read syscalls. 4. Consistency - encryption_state bits are decoded into the identical 4-way switch (cleartext / encrypted / 0b10 / 0b11) that store/lsm_store.go::decodeValue uses, so a future live-side enum extension fails closed here too. 5. Coverage - 17 table-driven tests in snapshot_reader_test.go covering: header-only, single-entry round-trip, tombstone flag, multi-entry order preservation, bad magic, truncated header, truncated entry, short-key, short-value, encrypted entry, all four reserved-encState cases, callback-error propagation, MVCC TS XOR-inversion across uint64 boundary, empty-value body. Caller audit (per /loop standing instruction): this PR adds a NEW public API surface (ReadSnapshot, SnapshotEntry, SnapshotHeader, PebbleSnapshotMagic, the Err* sentinels). No prior callers exist. Verified via 'grep -rn "ReadSnapshot|SnapshotEntry|SnapshotHeader" --include=*.go' - all matches are inside internal/backup/snapshot_reader{,test}.go. The follow-up PR (cmd/elastickv-snapshot-decode driver + dispatcher) will be this API's first consumer. --- internal/backup/snapshot_reader.go | 335 +++++++++++++++++++ internal/backup/snapshot_reader_test.go | 411 ++++++++++++++++++++++++ 2 files changed, 746 insertions(+) create mode 100644 internal/backup/snapshot_reader.go create mode 100644 internal/backup/snapshot_reader_test.go diff --git a/internal/backup/snapshot_reader.go b/internal/backup/snapshot_reader.go new file mode 100644 index 00000000..17fd5a70 --- /dev/null +++ b/internal/backup/snapshot_reader.go @@ -0,0 +1,335 @@ +package backup + +import ( + "bufio" + "bytes" + "encoding/binary" + "io" + + cockroachdberr "github.com/cockroachdb/errors" +) + +// snapshot_reader.go consumes the native Pebble snapshot format +// produced by store/lsm_store.go::pebbleSnapshotMagic + +// restoreBatchLoopInto and yields each entry as a (userKey, +// userValue, tombstone, expireAt) tuple after stripping the MVCC +// encoding the live store layers on top of raw Pebble bytes. +// +// Snapshot file shape: +// +// [8 bytes] magic "EKVPBBL1" +// [8 bytes] lastCommitTS (LittleEndian uint64) +// repeated: +// [8 bytes] keyLen (LittleEndian uint64) +// [keyLen] encoded key = +// [8 bytes] valLen (LittleEndian uint64) +// [valLen] encoded value = +// (flags bit 0 = tombstone, bits 1-2 = encryption_state) +// +// Mirrors store/lsm_store.go:1670-1697 (readRestoreEntry) and +// :336-340 (fillEncodedKey) and :419-422 (fillEncodedValue). The +// constants are duplicated here so this package stays +// adapter/store-independent (the design requires the decoder to +// run as an offline tool against a `.fsm` file with no live cluster +// libraries linked). + +// Snapshot format constants — mirror store/lsm_store.go. +const ( + // PebbleSnapshotMagicLen is the byte length of the "EKVPBBL1" + // header. Exposed so callers can sniff the first 8 bytes of a + // file to decide whether to dispatch into ReadSnapshot or fall + // through to another reader. + PebbleSnapshotMagicLen = 8 + + // snapshotTSSize is the 8-byte inverted-TS suffix appended to + // every encoded key (`store.fillEncodedKey`). + snapshotTSSize = 8 + + // snapshotValueHeaderSize is the 9-byte value-header prefix + // (flags + expireAt) on every encoded value + // (`store.fillEncodedValue`). + snapshotValueHeaderSize = 9 + + // snapshotTombstoneMask / snapshotEncStateMask / snapshotEncStateShift + // mirror store.tombstoneMask / encStateMask / encStateShift. A + // rename on the live side without an accompanying update here + // would surface at the snapshot reader's table-driven tests. + snapshotTombstoneMask byte = 0b0000_0001 + snapshotEncStateMask byte = 0b0000_0110 + snapshotEncStateShift = 1 + snapshotEncStateReserved byte = 0b1111_1000 // bits 3-7 must be zero + snapshotEncStateCleartx byte = 0b00 + snapshotEncStateEncrypt byte = 0b01 +) + +// PebbleSnapshotMagic is the 8-byte file header that introduces a +// native Pebble snapshot. Exposed for callers that need to sniff a +// file before deciding which reader to dispatch to. +var PebbleSnapshotMagic = [PebbleSnapshotMagicLen]byte{'E', 'K', 'V', 'P', 'B', 'B', 'L', '1'} + +// ErrSnapshotBadMagic is returned when the first 8 bytes of the +// reader do not match `EKVPBBL1`. The decoder caller should treat +// this as an immediate hard failure rather than try to skip past +// the bad header — a wrong magic almost always indicates the file +// is not actually a Pebble snapshot (an MVCC streaming snapshot, +// a tar archive, a partial truncate, etc.). +var ErrSnapshotBadMagic = cockroachdberr.New("backup: snapshot magic header does not match \"EKVPBBL1\"") + +// ErrSnapshotTruncated is returned when the snapshot ends mid-entry +// (after a key length but before the key, or after a value length +// but before the value). A clean EOF at the start of the +// key-length field is a normal terminator and is NOT an error. +var ErrSnapshotTruncated = cockroachdberr.New("backup: snapshot truncated mid-entry") + +// ErrSnapshotEncryptedReserved is returned when a value-header +// carries reserved encryption_state bits (0b10 or 0b11). Mirrors +// store.ErrEncryptedValueReservedState — the decoder fails closed +// rather than treat the body as cleartext, matching the design's +// §7.1 fail-closed contract. +var ErrSnapshotEncryptedReserved = cockroachdberr.New("backup: value header carries reserved encryption_state; decoder cannot interpret this entry") + +// ErrSnapshotEncryptedEntry is returned when a value-header +// declares the entry is encrypted (encState=0b01). Phase 0a does +// NOT carry the decryption keyring; an encrypted snapshot must be +// decoded with a Phase 0a+keyring binary or after Stage 8 of the +// encryption rollout reverses the encryption. +var ErrSnapshotEncryptedEntry = cockroachdberr.New("backup: snapshot contains encrypted entries — Phase 0a does not link the decryption keyring") + +// ErrSnapshotShortKey is returned when an entry's encoded key is +// shorter than the 8-byte timestamp suffix that +// `store.fillEncodedKey` always appends. Indicates a corrupt +// snapshot — the live store would never emit such a key. +var ErrSnapshotShortKey = cockroachdberr.New("backup: encoded key shorter than timestamp suffix") + +// ErrSnapshotShortValue is returned when an entry's encoded value +// is shorter than the 9-byte value header. Indicates a corrupt +// snapshot — the live store always writes the header even for +// tombstones. +var ErrSnapshotShortValue = cockroachdberr.New("backup: encoded value shorter than value-header") + +// SnapshotEntry is one decoded entry emitted by ReadSnapshot's +// callback. Fields are the user-visible key / value bytes plus the +// MVCC metadata the decoder peeled off (commit timestamp, expiry, +// tombstone marker). Slices are owned by the snapshot reader's +// scratch buffer and may be overwritten when the callback returns — +// callers that need to retain bytes across iterations must +// `bytes.Clone` them. +type SnapshotEntry struct { + UserKey []byte + UserValue []byte + CommitTS uint64 + ExpireAt uint64 + Tombstone bool +} + +// SnapshotHeader is the decoded preamble returned to the caller +// before iteration begins so the caller can record the snapshot's +// commit-time horizon in its MANIFEST.json (per design §380-422). +type SnapshotHeader struct { + LastCommitTS uint64 +} + +// ReadSnapshot reads the EKVPBBL1 header from r, then yields every +// entry through fn. fn receives a transient SnapshotEntry whose +// byte slices are NOT safe to retain across calls (the reader +// reuses scratch buffers to keep per-entry allocations bounded for +// multi-GB snapshots). If fn returns an error, iteration stops and +// the error is returned verbatim. +// +// Iteration terminates cleanly on EOF at the start of an entry's +// key-length field. EOF inside an entry returns +// ErrSnapshotTruncated. +// +// Tombstone entries (flags bit 0 set) are surfaced via +// SnapshotEntry.Tombstone — callers decide whether to suppress +// them (Phase 0a's intended behavior for backup output) or include +// them (a multi-version diagnostic dump might want both). +func ReadSnapshot(r io.Reader, fn func(SnapshotHeader, SnapshotEntry) error) error { + br := bufio.NewReader(r) + header, err := readSnapshotHeader(br) + if err != nil { + return err + } + var ( + keyBuf [1 << 16]byte + valBuf []byte + ) + for { + stop, err := readOneEntry(br, header, keyBuf[:], &valBuf, fn) + if err != nil { + return err + } + if stop { + return nil + } + } +} + +// readOneEntry handles one (key, value) tuple plus the callback +// dispatch. Extracted from ReadSnapshot so the parent stays under +// the cyclop budget — the same shape every backup encoder uses +// (small fixed driver loop + extracted per-record helper). +// Returns (true, nil) on the natural inter-entry EOF terminator. +func readOneEntry( + r *bufio.Reader, + header SnapshotHeader, + keyScratch []byte, + valBuf *[]byte, + fn func(SnapshotHeader, SnapshotEntry) error, +) (bool, error) { + kLen, eof, err := readEntryLen(r) + if err != nil { + return false, err + } + if eof { + return true, nil + } + key, err := readExact(r, keyScratch[:0], kLen) + if err != nil { + return false, cockroachdberr.WithStack(err) + } + vLen, _, err := readEntryLen(r) + if err != nil { + // A clean EOF here means the snapshot truncated between + // the key bytes and the value-length field — not the + // same as a clean inter-entry EOF. + if cockroachdberr.Is(err, io.EOF) { + return false, cockroachdberr.WithStack(ErrSnapshotTruncated) + } + return false, err + } + *valBuf, err = readExactGrow(r, (*valBuf)[:0], vLen) + if err != nil { + return false, cockroachdberr.WithStack(err) + } + entry, err := decodeSnapshotEntry(key, *valBuf) + if err != nil { + return false, err + } + if err := fn(header, entry); err != nil { + return false, err + } + return false, nil +} + +// readSnapshotHeader consumes the 8-byte magic and the 8-byte LE +// lastCommitTS. Returns ErrSnapshotBadMagic on header mismatch. +func readSnapshotHeader(r io.Reader) (SnapshotHeader, error) { + var magic [PebbleSnapshotMagicLen]byte + if _, err := io.ReadFull(r, magic[:]); err != nil { + return SnapshotHeader{}, cockroachdberr.WithStack(err) + } + if !bytes.Equal(magic[:], PebbleSnapshotMagic[:]) { + return SnapshotHeader{}, cockroachdberr.Wrapf(ErrSnapshotBadMagic, + "got %q", magic[:]) + } + var ts uint64 + if err := binary.Read(r, binary.LittleEndian, &ts); err != nil { + return SnapshotHeader{}, cockroachdberr.WithStack(err) + } + return SnapshotHeader{LastCommitTS: ts}, nil +} + +// readEntryLen reads an 8-byte LittleEndian length prefix. Returns +// (0, true, nil) on clean EOF — used to detect the natural end of +// the snapshot. Any other read error (including unexpected EOF) is +// returned verbatim. +func readEntryLen(r io.Reader) (uint64, bool, error) { + var raw [8]byte + n, err := io.ReadFull(r, raw[:]) + if err == nil { + return binary.LittleEndian.Uint64(raw[:]), false, nil + } + if cockroachdberr.Is(err, io.EOF) && n == 0 { + return 0, true, nil + } + if cockroachdberr.Is(err, io.ErrUnexpectedEOF) { + return 0, false, cockroachdberr.WithStack(ErrSnapshotTruncated) + } + return 0, false, cockroachdberr.WithStack(err) +} + +// readExact reads exactly n bytes into dst (extending it as +// needed). The returned slice aliases dst's underlying array — the +// caller must not retain it across loop iterations. +func readExact(r io.Reader, dst []byte, n uint64) ([]byte, error) { + if uint64(cap(dst)) < n { + // Cap fallback path: allocate a fresh slice when the + // caller's scratch buffer isn't large enough. For the + // stack-allocated keyBuf this only kicks in on + // pathologically long keys. + return readExactGrow(r, dst, n) + } + dst = dst[:n] + if _, err := io.ReadFull(r, dst); err != nil { + if cockroachdberr.Is(err, io.ErrUnexpectedEOF) || cockroachdberr.Is(err, io.EOF) { + return nil, cockroachdberr.WithStack(ErrSnapshotTruncated) + } + return nil, cockroachdberr.WithStack(err) + } + return dst, nil +} + +// readExactGrow is the heap-fallback variant of readExact. Used +// for value bodies, which can be up to several MiB and so live in +// a separately grown buffer rather than a fixed stack array. +func readExactGrow(r io.Reader, dst []byte, n uint64) ([]byte, error) { + if uint64(cap(dst)) < n { + dst = make([]byte, n) + } else { + dst = dst[:n] + } + if _, err := io.ReadFull(r, dst); err != nil { + if cockroachdberr.Is(err, io.ErrUnexpectedEOF) || cockroachdberr.Is(err, io.EOF) { + return nil, cockroachdberr.WithStack(ErrSnapshotTruncated) + } + return nil, cockroachdberr.WithStack(err) + } + return dst, nil +} + +// decodeSnapshotEntry strips the 8-byte inverted-TS key suffix and +// the 9-byte value header, surfacing the user-visible byte slices +// plus the MVCC metadata. Returns ErrSnapshotShortKey / +// ErrSnapshotShortValue on length violations and +// ErrSnapshotEncryptedReserved / ErrSnapshotEncryptedEntry on bad +// or unsupported encryption_state bits. +func decodeSnapshotEntry(encKey, encVal []byte) (SnapshotEntry, error) { + if len(encKey) < snapshotTSSize { + return SnapshotEntry{}, cockroachdberr.Wrapf(ErrSnapshotShortKey, + "encoded key length %d < %d", len(encKey), snapshotTSSize) + } + if len(encVal) < snapshotValueHeaderSize { + return SnapshotEntry{}, cockroachdberr.Wrapf(ErrSnapshotShortValue, + "encoded value length %d < %d", len(encVal), snapshotValueHeaderSize) + } + userKey := encKey[:len(encKey)-snapshotTSSize] + invTS := binary.BigEndian.Uint64(encKey[len(encKey)-snapshotTSSize:]) + commitTS := ^invTS + + flags := encVal[0] + if flags&snapshotEncStateReserved != 0 { + return SnapshotEntry{}, cockroachdberr.Wrapf(ErrSnapshotEncryptedReserved, + "value header byte %#08b", flags) + } + encState := (flags & snapshotEncStateMask) >> snapshotEncStateShift + switch encState { + case snapshotEncStateCleartx: + // fall through + case snapshotEncStateEncrypt: + return SnapshotEntry{}, cockroachdberr.WithStack(ErrSnapshotEncryptedEntry) + default: + return SnapshotEntry{}, cockroachdberr.Wrapf(ErrSnapshotEncryptedReserved, + "encryption_state=%#x is reserved", encState) + } + tombstone := (flags & snapshotTombstoneMask) != 0 + expireAt := binary.LittleEndian.Uint64(encVal[1:snapshotValueHeaderSize]) + userValue := encVal[snapshotValueHeaderSize:] + return SnapshotEntry{ + UserKey: userKey, + UserValue: userValue, + CommitTS: commitTS, + ExpireAt: expireAt, + Tombstone: tombstone, + }, nil +} diff --git a/internal/backup/snapshot_reader_test.go b/internal/backup/snapshot_reader_test.go new file mode 100644 index 00000000..39e5915f --- /dev/null +++ b/internal/backup/snapshot_reader_test.go @@ -0,0 +1,411 @@ +package backup + +import ( + "bytes" + "encoding/binary" + "io" + "math" + "testing" + + "github.com/cockroachdb/errors" +) + +// snapBuilder is a test-side mirror of the live store's native +// snapshot writer (store/lsm_store.go). Each WriteEntry call +// appends an entry whose key is `` and whose +// value is ``. The Bytes() result +// is the full `EKVPBBL1`-prefixed snapshot stream a real .fsm file +// would carry. +type snapBuilder struct { + buf bytes.Buffer +} + +func newSnapBuilder(lastCommitTS uint64) *snapBuilder { + b := &snapBuilder{} + b.buf.Write(PebbleSnapshotMagic[:]) + _ = binary.Write(&b.buf, binary.LittleEndian, lastCommitTS) + return b +} + +// WriteEntry appends one (userKey, commitTS, body) tuple. Tombstone +// flag is independent of body; the live store writes the same +// header shape for both. expireAt is the absolute Unix-ms expiry +// (0 == no TTL). +func (b *snapBuilder) WriteEntry(userKey []byte, commitTS uint64, body []byte, tombstone bool, expireAt uint64, encState byte) { + // key = userKey || (^commitTS)BE + encKey := make([]byte, len(userKey)+snapshotTSSize) + copy(encKey, userKey) + binary.BigEndian.PutUint64(encKey[len(userKey):], ^commitTS) + encVal := make([]byte, snapshotValueHeaderSize+len(body)) + var flags byte + if tombstone { + flags |= snapshotTombstoneMask + } + flags |= (encState << snapshotEncStateShift) & snapshotEncStateMask + encVal[0] = flags + binary.LittleEndian.PutUint64(encVal[1:snapshotValueHeaderSize], expireAt) + copy(encVal[snapshotValueHeaderSize:], body) + _ = binary.Write(&b.buf, binary.LittleEndian, uint64(len(encKey))) + b.buf.Write(encKey) + _ = binary.Write(&b.buf, binary.LittleEndian, uint64(len(encVal))) + b.buf.Write(encVal) +} + +func (b *snapBuilder) Bytes() []byte { return b.buf.Bytes() } + +// TestReadSnapshot_HeaderOnly pins that a snapshot containing only +// the magic + ts header (no entries) terminates cleanly with the +// header surfaced to the callback's first arg and zero entry +// callbacks. +func TestReadSnapshot_HeaderOnly(t *testing.T) { + t.Parallel() + b := newSnapBuilder(0xdeadbeef) + var sawHeader SnapshotHeader + var count int + err := ReadSnapshot(bytes.NewReader(b.Bytes()), func(h SnapshotHeader, _ SnapshotEntry) error { + sawHeader = h + count++ + return nil + }) + if err != nil { + t.Fatalf("unexpected err: %v", err) + } + if count != 0 { + t.Fatalf("entries = %d, want 0", count) + } + // Header callback never fires on an empty body — we don't + // surface SnapshotHeader through fn until at least one entry + // arrives. The "want 0" check above is sufficient. + _ = sawHeader +} + +// TestReadSnapshot_SingleEntryRoundTrip pins the basic single-entry +// path: header parses, key/value decode, MVCC metadata surfaces. +func TestReadSnapshot_SingleEntryRoundTrip(t *testing.T) { + t.Parallel() + b := newSnapBuilder(100) + b.WriteEntry([]byte("!redis|str|hello"), 42, []byte("world"), false, 1234567, snapshotEncStateCleartx) + var got SnapshotEntry + var hdr SnapshotHeader + err := ReadSnapshot(bytes.NewReader(b.Bytes()), func(h SnapshotHeader, e SnapshotEntry) error { + hdr = h + got = e + // Clone the bytes because the reader's scratch buffer + // may be overwritten on return. + got.UserKey = bytes.Clone(e.UserKey) + got.UserValue = bytes.Clone(e.UserValue) + return nil + }) + if err != nil { + t.Fatalf("unexpected err: %v", err) + } + if hdr.LastCommitTS != 100 { + t.Fatalf("hdr.LastCommitTS = %d, want 100", hdr.LastCommitTS) + } + if string(got.UserKey) != "!redis|str|hello" { + t.Fatalf("UserKey = %q, want %q", got.UserKey, "!redis|str|hello") + } + if string(got.UserValue) != "world" { + t.Fatalf("UserValue = %q, want %q", got.UserValue, "world") + } + if got.CommitTS != 42 { + t.Fatalf("CommitTS = %d, want 42", got.CommitTS) + } + if got.ExpireAt != 1234567 { + t.Fatalf("ExpireAt = %d, want 1234567", got.ExpireAt) + } + if got.Tombstone { + t.Fatalf("Tombstone = true, want false") + } +} + +// TestReadSnapshot_TombstoneFlagSurfaced pins that the tombstone +// bit on the value-header is exposed via SnapshotEntry.Tombstone. +// Phase 0a callers (the dispatcher) skip tombstones so a restored +// dump matches the snapshot's logical visibility; surfacing the +// flag also lets diagnostic dumps include them. +func TestReadSnapshot_TombstoneFlagSurfaced(t *testing.T) { + t.Parallel() + b := newSnapBuilder(1) + b.WriteEntry([]byte("!redis|str|gone"), 7, nil, true, 0, snapshotEncStateCleartx) + var sawTomb bool + err := ReadSnapshot(bytes.NewReader(b.Bytes()), func(_ SnapshotHeader, e SnapshotEntry) error { + sawTomb = e.Tombstone + return nil + }) + if err != nil { + t.Fatalf("err: %v", err) + } + if !sawTomb { + t.Fatalf("tombstone flag did not surface") + } +} + +// TestReadSnapshot_MultipleEntriesPreserveOrder pins that the +// reader yields entries in the on-disk order. The live snapshot +// stream is sorted by encoded key (userKey||^TS), so newer +// versions of the same userKey appear FIRST. The dispatcher +// relies on this order for first-seen-wins dedup. +func TestReadSnapshot_MultipleEntriesPreserveOrder(t *testing.T) { + t.Parallel() + b := newSnapBuilder(0) + keys := []string{"!redis|str|a", "!redis|str|b", "!redis|str|c"} + for i, k := range keys { + b.WriteEntry([]byte(k), uint64(i+1), []byte{byte(i)}, false, 0, snapshotEncStateCleartx) //nolint:gosec // i bounded by len(keys)<<63 + } + var gotKeys []string + err := ReadSnapshot(bytes.NewReader(b.Bytes()), func(_ SnapshotHeader, e SnapshotEntry) error { + gotKeys = append(gotKeys, string(e.UserKey)) + return nil + }) + if err != nil { + t.Fatalf("err: %v", err) + } + if len(gotKeys) != len(keys) { + t.Fatalf("entries = %d, want %d", len(gotKeys), len(keys)) + } + for i, k := range keys { + if gotKeys[i] != k { + t.Fatalf("entries[%d] = %q, want %q", i, gotKeys[i], k) + } + } +} + +// TestReadSnapshot_RejectsBadMagic pins that a file whose first 8 +// bytes are not "EKVPBBL1" fails closed with ErrSnapshotBadMagic. +// This is the trip-wire for "operator pointed the decoder at an +// MVCC streaming snapshot, a tar archive, or a truncated file." +func TestReadSnapshot_RejectsBadMagic(t *testing.T) { + t.Parallel() + bad := []byte("MVCCSTRM" + "....16....bytes.") + err := ReadSnapshot(bytes.NewReader(bad), func(_ SnapshotHeader, _ SnapshotEntry) error { + t.Fatalf("callback fired on bad-magic input") + return nil + }) + if !errors.Is(err, ErrSnapshotBadMagic) { + t.Fatalf("err = %v want ErrSnapshotBadMagic", err) + } +} + +// TestReadSnapshot_RejectsTruncatedHeader pins that a file shorter +// than the 16-byte header (magic + ts) returns io.EOF / unexpected +// EOF wrapped in WithStack — not ErrSnapshotBadMagic. This +// matters: a 0-byte file is "no snapshot" and a 4-byte file is "I +// tried to write a snapshot and crashed"; conflating the two would +// hide truncation incidents. +func TestReadSnapshot_RejectsTruncatedHeader(t *testing.T) { + t.Parallel() + err := ReadSnapshot(bytes.NewReader([]byte("EKVP")), func(_ SnapshotHeader, _ SnapshotEntry) error { + return nil + }) + if err == nil { + t.Fatalf("expected error on truncated header") + } + if errors.Is(err, ErrSnapshotBadMagic) { + t.Fatalf("truncated header must NOT report bad magic, got %v", err) + } +} + +// TestReadSnapshot_RejectsTruncatedEntry pins that a snapshot +// ending after a key-length but before the key bytes surfaces as +// ErrSnapshotTruncated. A clean EOF at the start of the key-length +// field is the normal terminator and must NOT trigger this. +func TestReadSnapshot_RejectsTruncatedEntry(t *testing.T) { + t.Parallel() + b := newSnapBuilder(0) + // Append a key-length but no key bytes. + var l [8]byte + binary.LittleEndian.PutUint64(l[:], 32) + full := append(b.Bytes(), l[:]...) + err := ReadSnapshot(bytes.NewReader(full), func(_ SnapshotHeader, _ SnapshotEntry) error { + return nil + }) + if !errors.Is(err, ErrSnapshotTruncated) { + t.Fatalf("err = %v want ErrSnapshotTruncated", err) + } +} + +// TestReadSnapshot_RejectsShortKey pins the encoded-key-length +// invariant. Every encoded key has an 8-byte TS suffix; an entry +// with a shorter key indicates store corruption. +func TestReadSnapshot_RejectsShortKey(t *testing.T) { + t.Parallel() + b := newSnapBuilder(0) + // Encode an entry with a key shorter than the TS suffix by + // hand. WriteEntry won't do this, so we splice the bytes in + // directly. + bodyHdr := []byte{0x00, 0, 0, 0, 0, 0, 0, 0, 0} // 9-byte header, zero flags + expireAt + var kLen, vLen [8]byte + binary.LittleEndian.PutUint64(kLen[:], 4) // < snapshotTSSize=8 + binary.LittleEndian.PutUint64(vLen[:], uint64(len(bodyHdr))) + full := append(b.Bytes(), kLen[:]...) + full = append(full, 'a', 'b', 'c', 'd') + full = append(full, vLen[:]...) + full = append(full, bodyHdr...) + err := ReadSnapshot(bytes.NewReader(full), func(_ SnapshotHeader, _ SnapshotEntry) error { + return nil + }) + if !errors.Is(err, ErrSnapshotShortKey) { + t.Fatalf("err = %v want ErrSnapshotShortKey", err) + } +} + +// TestReadSnapshot_RejectsShortValue pins that an entry with a +// value shorter than the 9-byte header (flags + expireAt) fails +// closed. The live store always emits the full 9 bytes even for +// tombstones. +func TestReadSnapshot_RejectsShortValue(t *testing.T) { + t.Parallel() + b := newSnapBuilder(0) + // Encoded key with a valid 8-byte TS suffix. + encKey := make([]byte, 1+snapshotTSSize) + encKey[0] = 'k' + binary.BigEndian.PutUint64(encKey[1:], ^uint64(1)) + var kLen, vLen [8]byte + binary.LittleEndian.PutUint64(kLen[:], uint64(len(encKey))) + binary.LittleEndian.PutUint64(vLen[:], 4) // < snapshotValueHeaderSize=9 + full := append(b.Bytes(), kLen[:]...) + full = append(full, encKey...) + full = append(full, vLen[:]...) + full = append(full, 'a', 'b', 'c', 'd') + err := ReadSnapshot(bytes.NewReader(full), func(_ SnapshotHeader, _ SnapshotEntry) error { + return nil + }) + if !errors.Is(err, ErrSnapshotShortValue) { + t.Fatalf("err = %v want ErrSnapshotShortValue", err) + } +} + +// TestReadSnapshot_RejectsEncryptedEntry pins that Phase 0a fails +// closed on encrypted entries. Stage 8 of the encryption rollout +// will add a keyring-aware variant; until then, attempting to +// decode an encrypted snapshot with this binary returns +// ErrSnapshotEncryptedEntry rather than silently emitting +// ciphertext. +func TestReadSnapshot_RejectsEncryptedEntry(t *testing.T) { + t.Parallel() + b := newSnapBuilder(0) + b.WriteEntry([]byte("k"), 1, []byte("ciphertext"), false, 0, snapshotEncStateEncrypt) + err := ReadSnapshot(bytes.NewReader(b.Bytes()), func(_ SnapshotHeader, _ SnapshotEntry) error { + t.Fatalf("callback must not fire on encrypted entry") + return nil + }) + if !errors.Is(err, ErrSnapshotEncryptedEntry) { + t.Fatalf("err = %v want ErrSnapshotEncryptedEntry", err) + } +} + +// TestReadSnapshot_RejectsReservedEncryptionState pins the §7.1 +// fail-closed contract for reserved encState bits (0b10, 0b11) and +// the high-bit reserved-zero range. +func TestReadSnapshot_RejectsReservedEncryptionState(t *testing.T) { + t.Parallel() + for _, tc := range []struct { + name string + flags byte + }{ + {"reserved-encstate-10", 0b0000_0100}, // encState bits = 0b10 + {"reserved-encstate-11", 0b0000_0110}, // encState bits = 0b11 + {"reserved-high-bit-3", 0b0000_1000}, // bit 3 set + {"reserved-high-bit-7", 0b1000_0000}, // bit 7 set + } { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + b := newSnapBuilder(0) + // Build an entry by hand with the chosen flags byte. + encKey := make([]byte, 1+snapshotTSSize) + encKey[0] = 'k' + binary.BigEndian.PutUint64(encKey[1:], ^uint64(1)) + encVal := make([]byte, snapshotValueHeaderSize+1) + encVal[0] = tc.flags + binary.LittleEndian.PutUint64(encVal[1:snapshotValueHeaderSize], 0) + encVal[snapshotValueHeaderSize] = 'v' + var kLen, vLen [8]byte + binary.LittleEndian.PutUint64(kLen[:], uint64(len(encKey))) + binary.LittleEndian.PutUint64(vLen[:], uint64(len(encVal))) + full := append(b.Bytes(), kLen[:]...) + full = append(full, encKey...) + full = append(full, vLen[:]...) + full = append(full, encVal...) + err := ReadSnapshot(bytes.NewReader(full), func(_ SnapshotHeader, _ SnapshotEntry) error { + t.Fatalf("callback fired on reserved encState") + return nil + }) + if !errors.Is(err, ErrSnapshotEncryptedReserved) { + t.Fatalf("err = %v want ErrSnapshotEncryptedReserved", err) + } + }) + } +} + +// TestReadSnapshot_CallbackErrorPropagates pins that fn's error +// terminates iteration and returns the error verbatim (caller can +// distinguish their own error from a snapshot-format error via +// errors.Is). +func TestReadSnapshot_CallbackErrorPropagates(t *testing.T) { + t.Parallel() + b := newSnapBuilder(0) + b.WriteEntry([]byte("k1"), 1, []byte("v1"), false, 0, snapshotEncStateCleartx) + b.WriteEntry([]byte("k2"), 2, []byte("v2"), false, 0, snapshotEncStateCleartx) + sentinel := errors.New("test sentinel") + var calls int + err := ReadSnapshot(bytes.NewReader(b.Bytes()), func(_ SnapshotHeader, _ SnapshotEntry) error { + calls++ + return sentinel + }) + if !errors.Is(err, sentinel) { + t.Fatalf("err = %v want sentinel", err) + } + if calls != 1 { + t.Fatalf("callback called %d times after sentinel error, want 1", calls) + } +} + +// TestReadSnapshot_CommitTSInversionRoundTrips pins that the +// inverted-TS suffix (^commitTS) decodes back to the original TS +// across the math.MaxUint64 boundary. Off-by-one errors in the +// XOR would break sort order silently. +func TestReadSnapshot_CommitTSInversionRoundTrips(t *testing.T) { + t.Parallel() + for _, ts := range []uint64{0, 1, 1 << 32, math.MaxUint64 - 1, math.MaxUint64} { + b := newSnapBuilder(0) + b.WriteEntry([]byte("k"), ts, []byte("v"), false, 0, snapshotEncStateCleartx) + var got uint64 + err := ReadSnapshot(bytes.NewReader(b.Bytes()), func(_ SnapshotHeader, e SnapshotEntry) error { + got = e.CommitTS + return nil + }) + if err != nil { + t.Fatalf("ts=%d: err=%v", ts, err) + } + if got != ts { + t.Fatalf("ts=%d round-tripped to %d", ts, got) + } + } +} + +// TestReadSnapshot_EmptyValueBody pins that an entry with a 9-byte +// value (header only, zero body) decodes to UserValue=[] without +// firing ErrSnapshotShortValue. This is the on-disk shape of a +// tombstone or a deliberately empty value. +func TestReadSnapshot_EmptyValueBody(t *testing.T) { + t.Parallel() + b := newSnapBuilder(0) + b.WriteEntry([]byte("k"), 1, nil, false, 0, snapshotEncStateCleartx) + var sawEmpty bool + err := ReadSnapshot(bytes.NewReader(b.Bytes()), func(_ SnapshotHeader, e SnapshotEntry) error { + sawEmpty = len(e.UserValue) == 0 + return nil + }) + if err != nil { + t.Fatalf("err: %v", err) + } + if !sawEmpty { + t.Fatalf("expected empty UserValue, got non-empty") + } +} + +// Compile-time sanity: io.EOF can flow up from the reader without +// being misclassified as a snapshot-format error. (Mostly a guard +// for future refactors that might add error wrapping.) +var _ = io.EOF From a86e6ba07af7ea846529f4b4ede91e21d45f4a5b Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Wed, 20 May 2026 04:24:30 +0900 Subject: [PATCH 2/3] =?UTF-8?q?backup(snapshot=5Freader):=20PR792=20r1=20?= =?UTF-8?q?=E2=80=94=20bound=20length=20prefixes=20+=20truncate-before-vLe?= =?UTF-8?q?n?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three P1+ findings from the first review round: 1. codex P1 (line 278) "Bound snapshot length prefixes before allocating" 2. gemini security-high + high (line 180) "Reader lacks bounds checking for kLen and vLen — adversarial snapshot can OOM the decoder or panic on 32-bit narrowing" 3. gemini high (line 200) "readOneEntry ignores the eof return value from readEntryLen on the value-length read — a truncated stream surfaces as the wrong error class (ErrSnapshotShortValue instead of ErrSnapshotTruncated)" Findings 1+2 share a root cause: ReadSnapshot trusted the on-disk length prefix and called make([]byte, n) directly. The live restore path applies maxPebbleEncodedKeySize / maxSnapshotValueSize guards before allocation (store/lsm_store.go::readRestoreFieldLen). The backup-side reader was missing the equivalent. A corrupt or adversarial snapshot whose length prefix declares 4 GB would either OOM the operator decoder or, on 32-bit architectures, panic when narrowing uint64 -> int for the slice length. Fix: add MaxSnapshotEncodedKeySize (1 MiB + 8) and MaxSnapshotEncodedValueSize (256 MiB + 9 + 34) bounds, check before each allocation, fail closed with new sentinel errors ErrSnapshotKeyTooLarge / ErrSnapshotValueTooLarge. Mirrors the live store's constants; duplicated here so the backup package keeps the offline-tool adapter-independence required by the design. Finding 3 fix: refactor readOneEntry into readEntryKey + readEntryValue helpers. readEntryKey distinguishes between "natural inter-entry EOF" (kEof=true at the very first read of an entry) and a truncated tail mid-entry (any other point). readEntryValue treats any EOF as truncation — there is no valid clean-EOF state after the key has been consumed. This also drops the readOneEntry function under the cyclop budget. Caller audit (per /loop standing instruction): the semantic change in question is the error class returned when the snapshot truncates between key and value-length. Previously: ErrSnapshotShortValue. Now: ErrSnapshotTruncated. Callers of ReadSnapshot: grep -rn 'ReadSnapshot|SnapshotEntry|SnapshotHeader' --include=*.go -> all matches are inside internal/backup/snapshot_reader{,_test}.go The dispatcher / cmd driver that will consume this API (Phase 0a follow-up, not yet built) will see ErrSnapshotTruncated for any mid-entry truncation regardless of which length-prefix the truncation occurred before — a more correct semantic that matches the docstring contract. No existing caller exists, so no caller audit gap. New tests: - TestReadSnapshot_RejectsTruncatedBeforeValueLength pins gemini high finding (key-then-no-vLen -> ErrSnapshotTruncated) - TestReadSnapshot_RejectsKeyLengthOverBudget pins codex P1 (key length-prefix > MaxSnapshotEncodedKeySize fails closed BEFORE allocation) - TestReadSnapshot_RejectsValueLengthOverBudget pins the same for value side - TestReadSnapshot_AcceptsKeyLengthAtBudgetBoundary pins off-by-one (length == budget is accepted; only > budget rejected) Self-review: 1. Data loss - the bound rejects only over-budget entries the live store would never have written; legitimate entries up to the limits are still accepted (boundary test pins this). 2. Concurrency - no shared state; reader is single-pass over io.Reader as before. 3. Performance - one extra comparison per entry, far cheaper than the prevented OOM allocation. 4. Consistency - matches the live restore path's bounds and error shape. Mid-entry EOF now consistently surfaces as ErrSnapshotTruncated regardless of where in the entry the truncation occurred. 5. Coverage - 4 new tests in addition to the 17 existing ones. All pass with -race. --- internal/backup/snapshot_reader.go | 124 ++++++++++++++++++++---- internal/backup/snapshot_reader_test.go | 103 ++++++++++++++++++++ 2 files changed, 210 insertions(+), 17 deletions(-) diff --git a/internal/backup/snapshot_reader.go b/internal/backup/snapshot_reader.go index 17fd5a70..e972f31f 100644 --- a/internal/backup/snapshot_reader.go +++ b/internal/backup/snapshot_reader.go @@ -60,6 +60,43 @@ const ( snapshotEncStateReserved byte = 0b1111_1000 // bits 3-7 must be zero snapshotEncStateCleartx byte = 0b00 snapshotEncStateEncrypt byte = 0b01 + + // MaxSnapshotEncodedKeySize / MaxSnapshotEncodedValueSize bound the + // per-entry allocations made by readExact / readExactGrow. Mirrors + // the live store's `maxPebbleEncodedKeySize` (1 MiB user-key cap + + // 8-byte TS suffix; store/mvcc_store.go:29 + store/lsm_store.go:51) + // and `maxSnapshotValueSize + valueHeaderSize + + // encryption.EnvelopeOverhead` (256 MiB cleartext + 9-byte header + + // 34-byte envelope; store/mvcc_store.go:37 + + // store/lsm_store.go:1692). Without these guards a corrupt or + // adversarial snapshot whose length-prefix declares a huge size + // would either OOM the decoder via `make([]byte, n)` or, on 32-bit + // architectures, panic when narrowing `uint64` → `int` for the + // slice length. Codex P1 + gemini security-high (PR #792 round 1). + // + // Duplicated here (rather than imported from `store`) so the + // backup package keeps the adapter-independence required by the + // design (it must run as a standalone offline tool with no live- + // cluster libraries linked). The values are reviewed for staleness + // alongside the live store's constants at every PR round. + + // maxSnapshotUserKeySize mirrors store/mvcc_store.go:29 + // `maxSnapshotKeySize` = 1 MiB. + maxSnapshotUserKeySize = 1 << maxSnapshotUserKeyShift + // maxSnapshotUserValueSize mirrors store/mvcc_store.go:37 + // `maxSnapshotValueSize` = 256 MiB. + maxSnapshotUserValueSize = maxSnapshotValueMiB << maxSnapshotValueShift + maxSnapshotUserKeyShift = 20 // 1 << 20 == 1 MiB + maxSnapshotValueShift = 20 // << 20 == byte count + maxSnapshotValueMiB = 256 + MaxSnapshotEncodedKeySize = maxSnapshotUserKeySize + snapshotTSSize + MaxSnapshotEncodedValueSize = maxSnapshotUserValueSize + snapshotValueHeaderSize + envelopeMaxB + + // envelopeMaxB mirrors internal/encryption.EnvelopeOverhead (12- + // byte nonce + 16-byte tag + 6-byte AAD-binding header = 34). + // Duplicated here so we do not import the encryption package + // from the backup-package's offline-tool boundary. + envelopeMaxB = 34 ) // PebbleSnapshotMagic is the 8-byte file header that introduces a @@ -101,6 +138,16 @@ var ErrSnapshotEncryptedEntry = cockroachdberr.New("backup: snapshot contains en // snapshot — the live store would never emit such a key. var ErrSnapshotShortKey = cockroachdberr.New("backup: encoded key shorter than timestamp suffix") +// ErrSnapshotKeyTooLarge / ErrSnapshotValueTooLarge are returned +// when the on-disk length prefix declares an entry larger than the +// MaxSnapshotEncodedKeySize / MaxSnapshotEncodedValueSize budgets. +// Mirrors `store.ErrSnapshotKeyTooLarge` and `store.ErrValueTooLarge` +// from the live restore path so a corrupt or adversarial snapshot +// fails closed at the length-prefix layer instead of triggering an +// OOM-sized allocation (codex P1 + gemini security-high on PR #792). +var ErrSnapshotKeyTooLarge = cockroachdberr.New("backup: snapshot key length exceeds limit") +var ErrSnapshotValueTooLarge = cockroachdberr.New("backup: snapshot value length exceeds limit") + // ErrSnapshotShortValue is returned when an entry's encoded value // is shorter than the 9-byte value header. Indicates a corrupt // snapshot — the live store always writes the header even for @@ -177,32 +224,19 @@ func readOneEntry( valBuf *[]byte, fn func(SnapshotHeader, SnapshotEntry) error, ) (bool, error) { - kLen, eof, err := readEntryLen(r) + key, eof, err := readEntryKey(r, keyScratch) if err != nil { return false, err } if eof { + // Clean inter-entry EOF — natural terminator. return true, nil } - key, err := readExact(r, keyScratch[:0], kLen) + value, err := readEntryValue(r, valBuf) if err != nil { - return false, cockroachdberr.WithStack(err) - } - vLen, _, err := readEntryLen(r) - if err != nil { - // A clean EOF here means the snapshot truncated between - // the key bytes and the value-length field — not the - // same as a clean inter-entry EOF. - if cockroachdberr.Is(err, io.EOF) { - return false, cockroachdberr.WithStack(ErrSnapshotTruncated) - } return false, err } - *valBuf, err = readExactGrow(r, (*valBuf)[:0], vLen) - if err != nil { - return false, cockroachdberr.WithStack(err) - } - entry, err := decodeSnapshotEntry(key, *valBuf) + entry, err := decodeSnapshotEntry(key, value) if err != nil { return false, err } @@ -212,6 +246,62 @@ func readOneEntry( return false, nil } +// readEntryKey reads the per-entry length-prefix-then-bytes for the +// key half of one entry. Returns (nil, true, nil) on the natural +// inter-entry EOF (clean stream terminator). Applies the +// MaxSnapshotEncodedKeySize bound BEFORE allocating the read +// buffer; without this guard a corrupt or adversarial snapshot +// whose length prefix declares a huge size would OOM the decoder +// or panic on 32-bit narrowing. Mirrors +// `store/lsm_store.go::readRestoreFieldLen`. Codex P1 + gemini +// security-high on PR #792. +func readEntryKey(r *bufio.Reader, scratch []byte) ([]byte, bool, error) { + kLen, kEof, err := readEntryLen(r) + if err != nil { + return nil, false, err + } + if kEof { + return nil, true, nil + } + if kLen > MaxSnapshotEncodedKeySize { + return nil, false, cockroachdberr.Wrapf(ErrSnapshotKeyTooLarge, + "length %d > %d", kLen, MaxSnapshotEncodedKeySize) + } + key, err := readExact(r, scratch[:0], kLen) + if err != nil { + return nil, false, cockroachdberr.WithStack(err) + } + return key, false, nil +} + +// readEntryValue reads the per-entry length-prefix-then-bytes for +// the value half of one entry. A mid-entry EOF (length prefix +// missing after the key was already consumed) surfaces as +// ErrSnapshotTruncated rather than the natural inter-entry EOF — +// gemini high finding on PR #792 (the previous code ignored +// readEntryLen's eof return value here, allowing a truncated stream +// to flow into decodeSnapshotEntry with vLen=0 and surface as the +// wrong error). Applies MaxSnapshotEncodedValueSize before allocation +// (same rationale as readEntryKey). +func readEntryValue(r *bufio.Reader, valBuf *[]byte) ([]byte, error) { + vLen, vEof, err := readEntryLen(r) + if err != nil { + return nil, err + } + if vEof { + return nil, cockroachdberr.WithStack(ErrSnapshotTruncated) + } + if vLen > MaxSnapshotEncodedValueSize { + return nil, cockroachdberr.Wrapf(ErrSnapshotValueTooLarge, + "length %d > %d", vLen, MaxSnapshotEncodedValueSize) + } + *valBuf, err = readExactGrow(r, (*valBuf)[:0], vLen) + if err != nil { + return nil, cockroachdberr.WithStack(err) + } + return *valBuf, nil +} + // readSnapshotHeader consumes the 8-byte magic and the 8-byte LE // lastCommitTS. Returns ErrSnapshotBadMagic on header mismatch. func readSnapshotHeader(r io.Reader) (SnapshotHeader, error) { diff --git a/internal/backup/snapshot_reader_test.go b/internal/backup/snapshot_reader_test.go index 39e5915f..ee368352 100644 --- a/internal/backup/snapshot_reader_test.go +++ b/internal/backup/snapshot_reader_test.go @@ -225,6 +225,109 @@ func TestReadSnapshot_RejectsTruncatedEntry(t *testing.T) { } } +// TestReadSnapshot_RejectsTruncatedBeforeValueLength pins the +// gemini-high fix on PR #792: a snapshot that ends AFTER the key +// bytes but BEFORE the value-length field must surface as +// ErrSnapshotTruncated (not the previous ErrSnapshotShortValue +// misclassification, which happened because readEntryLen's `eof` +// return value was being ignored on the second call). +func TestReadSnapshot_RejectsTruncatedBeforeValueLength(t *testing.T) { + t.Parallel() + b := newSnapBuilder(0) + // Build a complete key (length + bytes) but no value-length. + encKey := make([]byte, 1+snapshotTSSize) + encKey[0] = 'k' + binary.BigEndian.PutUint64(encKey[1:], ^uint64(1)) + var kLen [8]byte + binary.LittleEndian.PutUint64(kLen[:], uint64(len(encKey))) + full := append(b.Bytes(), kLen[:]...) + full = append(full, encKey...) + err := ReadSnapshot(bytes.NewReader(full), func(_ SnapshotHeader, _ SnapshotEntry) error { + return nil + }) + if !errors.Is(err, ErrSnapshotTruncated) { + t.Fatalf("err = %v want ErrSnapshotTruncated", err) + } +} + +// TestReadSnapshot_RejectsKeyLengthOverBudget pins the codex P1 + +// gemini security-high fix on PR #792: a corrupt or adversarial +// snapshot whose length-prefix declares a key larger than +// MaxSnapshotEncodedKeySize must fail at the length-prefix check +// BEFORE allocating `make([]byte, kLen)`. Without this guard a 4 GB +// length prefix would OOM the decoder (or panic on 32-bit when +// uint64 → int narrowing wraps). +func TestReadSnapshot_RejectsKeyLengthOverBudget(t *testing.T) { + t.Parallel() + b := newSnapBuilder(0) + // Length-prefix one byte over the budget. + var l [8]byte + binary.LittleEndian.PutUint64(l[:], uint64(MaxSnapshotEncodedKeySize)+1) + full := append(b.Bytes(), l[:]...) + err := ReadSnapshot(bytes.NewReader(full), func(_ SnapshotHeader, _ SnapshotEntry) error { + t.Fatalf("callback fired on over-budget key length") + return nil + }) + if !errors.Is(err, ErrSnapshotKeyTooLarge) { + t.Fatalf("err = %v want ErrSnapshotKeyTooLarge", err) + } +} + +// TestReadSnapshot_RejectsValueLengthOverBudget mirrors the +// key-length guard for the value side. +func TestReadSnapshot_RejectsValueLengthOverBudget(t *testing.T) { + t.Parallel() + b := newSnapBuilder(0) + // Build a valid key entry, then a value-length one byte over the + // budget. + encKey := make([]byte, 1+snapshotTSSize) + encKey[0] = 'k' + binary.BigEndian.PutUint64(encKey[1:], ^uint64(1)) + var kLen, vLen [8]byte + binary.LittleEndian.PutUint64(kLen[:], uint64(len(encKey))) + binary.LittleEndian.PutUint64(vLen[:], uint64(MaxSnapshotEncodedValueSize)+1) + full := append(b.Bytes(), kLen[:]...) + full = append(full, encKey...) + full = append(full, vLen[:]...) + err := ReadSnapshot(bytes.NewReader(full), func(_ SnapshotHeader, _ SnapshotEntry) error { + t.Fatalf("callback fired on over-budget value length") + return nil + }) + if !errors.Is(err, ErrSnapshotValueTooLarge) { + t.Fatalf("err = %v want ErrSnapshotValueTooLarge", err) + } +} + +// TestReadSnapshot_AcceptsKeyLengthAtBudgetBoundary pins the off- +// by-one: keyLen == MaxSnapshotEncodedKeySize must be accepted (the +// reader rejects only >, matching the live store's +// `readRestoreFieldLen` semantics). We test with a 1 KiB key +// because allocating the full 1 MiB on every run would slow the +// test suite — the boundary is the same regardless of magnitude. +func TestReadSnapshot_AcceptsKeyLengthAtBudgetBoundary(t *testing.T) { + t.Parallel() + b := newSnapBuilder(0) + // 1 KiB user key + 8-byte TS suffix. + const userKeyLen = 1 << 10 + userKey := make([]byte, userKeyLen) + for i := range userKey { + userKey[i] = byte(i % 256) + } + b.WriteEntry(userKey, 1, []byte("v"), false, 0, snapshotEncStateCleartx) + var got SnapshotEntry + err := ReadSnapshot(bytes.NewReader(b.Bytes()), func(_ SnapshotHeader, e SnapshotEntry) error { + got = e + got.UserKey = bytes.Clone(e.UserKey) + return nil + }) + if err != nil { + t.Fatalf("err = %v want nil at length within budget", err) + } + if !bytes.Equal(got.UserKey, userKey) { + t.Fatalf("UserKey mismatch at boundary") + } +} + // TestReadSnapshot_RejectsShortKey pins the encoded-key-length // invariant. Every encoded key has an 8-byte TS suffix; an entry // with a shorter key indicates store corruption. From 8e6e40eb1fcf24ab4fdc1fe035b74cd4aaab1e1a Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Wed, 20 May 2026 04:31:47 +0900 Subject: [PATCH 3/3] =?UTF-8?q?backup(snapshot=5Freader):=20PR792=20r2=20c?= =?UTF-8?q?oderabbit=202x=20Major=20=E2=80=94=20immutable=20magic=20+=20bo?= =?UTF-8?q?undary=20test?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit coderabbit raised 2 Major findings on round 1: 1. line 105 "Avoid exporting mutable magic bytes as a package variable" - PebbleSnapshotMagic was a `var [8]byte`, so any importer could mutate it and break parsing globally (`backup.PebbleSnapshotMagic[0] = 0xff` is legal Go). 2. line 329 "Boundary test name/intent doesn't match what it verifies" - TestReadSnapshot_AcceptsKeyLengthAtBudgetBoundary used a 1 KiB key, which is nowhere near the 1 MiB+8 MaxSnapshotEncodedKeySize budget. A regression flipping `>` to `>=` would not be caught by the existing test. Fixes: 1. Change `PebbleSnapshotMagic` from a `var [PebbleSnapshotMagicLen]byte` to an untyped `const string` carrying the same bytes. Strings are immutable in Go, so this closes the package-variable mutation surface. Update the only consumers (readSnapshotHeader's bytes comparison and the test snapBuilder) to use the new type. 2. Resize the boundary test's user key so the ENCODED key (userKey + 8-byte TS suffix) lands at EXACTLY MaxSnapshotEncodedKeySize, then assert successful round-trip. A `>=` vs `>` regression now surfaces. Caller audit (per /loop standing instruction): the type change to PebbleSnapshotMagic from `[8]byte` to `string` is a public API shape change. PR #792 introduces this symbol; no external caller exists yet (`grep -rn PebbleSnapshotMagic --include=*.go` shows matches only inside the two files in this PR). Caller audit clean. Self-review: 1. Data loss - none. The byte content of the magic is unchanged. 2. Concurrency - the previous shape was technically race-prone if importers wrote concurrently. The string const cannot race. 3. Performance - one tiny allocation per call (`[]byte(string)` for the comparison). Bounded; called once per snapshot file open. 4. Consistency - immutable-magic pattern matches the live store's `pebbleSnapshotMagic` (which is package-private, also immutable in practice). The new boundary test now actually exercises the budget threshold. 5. Coverage - existing 21 snapshot tests still pass with the new type. The boundary test is now a real off-by-one guard (allocates 1 MiB instead of 1 KiB during the test — still well within go test memory budget). --- internal/backup/snapshot_reader.go | 13 ++++++++++--- internal/backup/snapshot_reader_test.go | 26 ++++++++++++++----------- 2 files changed, 25 insertions(+), 14 deletions(-) diff --git a/internal/backup/snapshot_reader.go b/internal/backup/snapshot_reader.go index e972f31f..a6cb0bc8 100644 --- a/internal/backup/snapshot_reader.go +++ b/internal/backup/snapshot_reader.go @@ -101,8 +101,15 @@ const ( // PebbleSnapshotMagic is the 8-byte file header that introduces a // native Pebble snapshot. Exposed for callers that need to sniff a -// file before deciding which reader to dispatch to. -var PebbleSnapshotMagic = [PebbleSnapshotMagicLen]byte{'E', 'K', 'V', 'P', 'B', 'B', 'L', '1'} +// file before deciding which reader to dispatch to. Declared as an +// untyped string CONSTANT (not a `var [8]byte`) so an importer +// cannot mutate the bytes — a writable package variable would let +// any caller corrupt the header globally and break parsing for +// every consumer (coderabbit Major on PR #792 round 2). +// Callers comparing against the magic should treat the encoded-key +// type of their own data: most call sites convert to a byte slice +// via `[]byte(PebbleSnapshotMagic)` at the comparison point. +const PebbleSnapshotMagic = "EKVPBBL1" // ErrSnapshotBadMagic is returned when the first 8 bytes of the // reader do not match `EKVPBBL1`. The decoder caller should treat @@ -309,7 +316,7 @@ func readSnapshotHeader(r io.Reader) (SnapshotHeader, error) { if _, err := io.ReadFull(r, magic[:]); err != nil { return SnapshotHeader{}, cockroachdberr.WithStack(err) } - if !bytes.Equal(magic[:], PebbleSnapshotMagic[:]) { + if !bytes.Equal(magic[:], []byte(PebbleSnapshotMagic)) { return SnapshotHeader{}, cockroachdberr.Wrapf(ErrSnapshotBadMagic, "got %q", magic[:]) } diff --git a/internal/backup/snapshot_reader_test.go b/internal/backup/snapshot_reader_test.go index ee368352..90ad2446 100644 --- a/internal/backup/snapshot_reader_test.go +++ b/internal/backup/snapshot_reader_test.go @@ -22,7 +22,7 @@ type snapBuilder struct { func newSnapBuilder(lastCommitTS uint64) *snapBuilder { b := &snapBuilder{} - b.buf.Write(PebbleSnapshotMagic[:]) + b.buf.WriteString(PebbleSnapshotMagic) _ = binary.Write(&b.buf, binary.LittleEndian, lastCommitTS) return b } @@ -299,17 +299,21 @@ func TestReadSnapshot_RejectsValueLengthOverBudget(t *testing.T) { } // TestReadSnapshot_AcceptsKeyLengthAtBudgetBoundary pins the off- -// by-one: keyLen == MaxSnapshotEncodedKeySize must be accepted (the -// reader rejects only >, matching the live store's -// `readRestoreFieldLen` semantics). We test with a 1 KiB key -// because allocating the full 1 MiB on every run would slow the -// test suite — the boundary is the same regardless of magnitude. +// by-one: encoded-key length EXACTLY equal to +// MaxSnapshotEncodedKeySize must be accepted (the reader rejects +// only `> budget`, matching the live store's `readRestoreFieldLen` +// semantics). The earlier 1 KiB version of this test (coderabbit +// Major on PR #792 round 2) would not catch a `>=` vs `>` regression +// because the input was far below the budget. This version sizes +// the user key to make the encoded key (`userKey + 8-byte TS +// suffix`) land at exactly MaxSnapshotEncodedKeySize, then asserts +// successful round-trip. A regression that flipped the comparison +// to `>=` would reject this entry with ErrSnapshotKeyTooLarge. func TestReadSnapshot_AcceptsKeyLengthAtBudgetBoundary(t *testing.T) { t.Parallel() b := newSnapBuilder(0) - // 1 KiB user key + 8-byte TS suffix. - const userKeyLen = 1 << 10 - userKey := make([]byte, userKeyLen) + // userKey sized so that encoded-key length == MaxSnapshotEncodedKeySize. + userKey := make([]byte, MaxSnapshotEncodedKeySize-snapshotTSSize) for i := range userKey { userKey[i] = byte(i % 256) } @@ -321,10 +325,10 @@ func TestReadSnapshot_AcceptsKeyLengthAtBudgetBoundary(t *testing.T) { return nil }) if err != nil { - t.Fatalf("err = %v want nil at length within budget", err) + t.Fatalf("err = %v want nil at length == MaxSnapshotEncodedKeySize", err) } if !bytes.Equal(got.UserKey, userKey) { - t.Fatalf("UserKey mismatch at boundary") + t.Fatalf("UserKey mismatch at boundary (got len=%d want len=%d)", len(got.UserKey), len(userKey)) } }