diff --git a/docs/design/scaling-findings.md b/docs/design/scaling-findings.md index 263cd414f..595c88d3a 100644 --- a/docs/design/scaling-findings.md +++ b/docs/design/scaling-findings.md @@ -141,3 +141,61 @@ LD_LIBRARY_PATH=build_unix/.libs ./scale_bench rrand 200000 3 1 2 4 8 12 16 24 # snap per-thread MVCC snapshot txn (isolates lock-manager cost) ``` + +## Prototype 1: cache-line isolation of the write-hot BH fields (#7) + +The buffer header (`struct __bh`) packs the write-hot fields — the pin +reference count (`ref`, atomic RMW on every `__memp_fget`/`__memp_fput`) and the +LRU `priority` (rewritten on every `__memp_fput`) — into the **same cache line** +as the read-mostly identity/traversal fields (`pgno`, `mf_offset`, `flags`, +`hq`) that every concurrent hash-chain walk of a hot buffer reads +(`mp_fget.c`: `if (bhp->pgno != *pgnoaddr || bhp->mf_offset != mf_offset)`). +The hypothesis: each pin invalidates the line all readers need just to traverse +and match the page, so isolating the write-hot fields onto their own cache line +should cut coherence traffic. + +Implemented behind `MPOOL_HOTFIELDS_ISOLATED` (`src/dbinc/mp.h`) so it is a +one-line A/B. Built both layouts, ran a **controlled interleaved A/B** (packed +vs isolated `libdb`, same bench binary, 7 samples per point, medians) on the +12-core box: + +| workload | threads | packed (median) | isolated (median) | delta | +|----------|--------:|----------------:|------------------:|------:| +| rrand | 8 | 486,745 | 489,564 | +0.6% | +| rrand | 12 | 390,927 | 390,416 | -0.1% | +| snap | 8 | 518,422 | 514,260 | -0.8% | +| snap | 12 | 408,213 | 409,415 | +0.3% | + +**Result: no effect (all deltas within ±0.8%, i.e. within noise).** Field false-sharing is *not* the +cap. The read-path cost is **true sharing**: every reader performs an atomic +read-modify-write on the *same* shared words — `bhp->ref` and the shared-latch +share-counts (`mtx_hash`, `mtx_buf`) — for the hot (root/internal) page.
+Relocating those words to private cache lines cannot help, because the +contention is on the words themselves, not on neighbours that happen to share +their line. + +The prototype is therefore left **off by default** (it only adds per-buffer +memory). Kept guarded so it can be re-A/B'd on the 24-core Linux box, where the +ceiling was originally characterized as futex-dominated and the cache hierarchy +differs from the laptop. Caveat (to confirm): `MPOOL_CACHELINE` is 64, but Apple Silicon reports a 128-byte cache line (`sysctl hw.cachelinesize`); if that applies to the 12-core measurement machine, the 64-byte pads do not guarantee isolation there, so re-run with 128-byte pads before treating this result as conclusive. + +### Refined direction for #2/#7 + +To lift the ceiling the per-read **shared-counter RMW must be removed**, not +relocated: + +1. **Optimistic / versioned buffer access** for resident, clean pages: read + under a version (seqlock) check instead of incrementing a shared latch + share-count, validating afterward and retrying on the latched path if a + writer raced. Requires safe memory reclamation (epoch/RCU) for the buffer + headers, which BDB does not have today — a buffer freed back to the region + during a lock-free chain walk can be reused under a reader. **Adding deferred + reclamation is the prerequisite work.** +2. **Scalable pin count**: replace the single `bhp->ref` atomic with a + sharded/per-core counter (pin bumps a private shard; eviction sums shards), + removing the ref cache-line ping-pong. Costs memory per buffer and + complicates the `ref == 0` eviction check. + +Both are larger than a layout tweak; this prototype's value is the measurement +that **rules out the cheap fix** and justifies the reclamation/scalable-counter +work. diff --git a/src/dbinc/mp.h b/src/dbinc/mp.h index e00055feb..766ce90e7 100644 --- a/src/dbinc/mp.h +++ b/src/dbinc/mp.h @@ -536,11 +536,31 @@ struct __mpoolfile { /* SHARED */ * BH -- * Buffer header.
*/ +/* + * Multi-core read-scaling prototype (#2/#7): isolate the write-hot buffer + * header fields (the pin reference count and the LRU priority, both written on + * every __memp_fget/__memp_fput) onto their own cache line, away from the + * read-mostly identity/traversal fields (pgno/mf_offset/flags/hq) that every + * concurrent hash-chain walk of a hot (e.g. btree root) buffer reads. Without + * this, each pin's atomic_inc(&ref) / LRU bump invalidates the line all readers + * need just to traverse and match the page, serializing readers through cache + * coherence. Uncomment the #define below to A/B vs. the original packed layout. + * + * MEASURED (12-core Apple Silicon, controlled A/B, medians): no effect + * (+/-0.8%). The read-path cost is TRUE sharing of the atomic counters + * (bhp->ref and the shared-latch share-counts), not false sharing, so padding + * does not help. Left OFF by default; flip the #define to re-A/B on the + * 24-core Linux box where the futex-dominated ceiling was characterized. + */ +/* #define MPOOL_HOTFIELDS_ISOLATED 1 */ +#ifdef MPOOL_HOTFIELDS_ISOLATED +#define MPOOL_CACHELINE 64 +#endif + struct __bh { /* SHARED */ db_mutex_t mtx_buf; /* Shared/Exclusive mutex */ - db_atomic_t ref; /* Reference count. */ -#define BH_REFCOUNT(bhp) atomic_read(&(bhp)->ref) +#define BH_REFCOUNT(bhp) atomic_read(&(bhp)->ref) #define BH_CALLPGIN 0x001 /* Convert the page before use. */ #define BH_DIRTY 0x002 /* Page is modified. */ #define BH_DIRTY_CREATE 0x004 /* Page is modified. */ @@ -550,6 +570,28 @@ struct __bh { /* SHARED */ #define BH_FROZEN 0x040 /* Frozen buffer: allocate & re-read. */ #define BH_TRASH 0x080 /* Page is garbage. */ #define BH_THAWED 0x100 /* Page was thawed. */ + +#ifdef MPOOL_HOTFIELDS_ISOLATED + /* Read-mostly identity / traversal fields (read by every chain walk). */ + u_int16_t flags; + SH_TAILQ_ENTRY hq; /* MPOOL hash bucket queue. */ + db_pgno_t pgno; /* Underlying MPOOLFILE page number.
*/ + roff_t mf_offset; /* Associated MPOOLFILE offset. */ + u_int32_t bucket; /* Hash bucket containing header. */ + int region; /* Region containing header. */ + roff_t td_off; /* MVCC: creating TXN_DETAIL offset. */ + SH_CHAIN_ENTRY vc; /* MVCC: version chain. */ +#ifdef DIAG_MVCC + u_int16_t align_off; /* Alignment offset for diagnostics.*/ +#endif + + /* Write-hot fields, isolated on their own cache line. */ + u_int8_t __hot_pad1[MPOOL_CACHELINE]; + db_atomic_t ref; /* Reference count. */ + u_int32_t priority; /* Priority. */ + u_int8_t __hot_pad2[MPOOL_CACHELINE]; +#else + db_atomic_t ref; /* Reference count. */ u_int16_t flags; u_int32_t priority; /* Priority. */ @@ -564,6 +606,7 @@ struct __bh { /* SHARED */ SH_CHAIN_ENTRY vc; /* MVCC: version chain. */ #ifdef DIAG_MVCC u_int16_t align_off; /* Alignment offset for diagnostics.*/ +#endif #endif /*