From 332468a797c319392081f6efa9702f8d42c69ee5 Mon Sep 17 00:00:00 2001 From: Greg Burd Date: Wed, 17 Jun 2026 16:09:38 -0400 Subject: [PATCH 01/12] perf(mpool): Stage 1 step 1 - BH_WIRED (pin internal pages resident) Foundation for the optimistic descent: B-tree internal/root pages are wired so the frame is never reclaimed, letting a later lock-free descent read them without a use-after-free hazard (BDB has no epoch reclamation). - struct __bh gains a dedicated 'wired' byte (not a flags bit: it is set with a plain monotonic store while the caller holds only a shared buffer latch, so it must not share the non-atomic RMW of the flags word that __memp_pgwrite uses to clear BH_DIRTY). Reset to 0 at every buffer-header (re)init site. - __memp_alloc skips wired buffers when choosing a victim. - __memp_wire() sets it; guarded against memory-mapped pages (whose page pointer is not a buffer frame -- caught a SIGBUS in test001 with mmap'd files). - __bam_search wires P_IBTREE/P_IRECNO pages on descent (bounded: internal levels only, never leaves). No measurable perf change yet (internals were already hot-resident); this only guarantees residency for step 2 (LSN-validated optimistic descent). Validated: NOSYNC forced-eviction integrity (50k/1MB); TCL test001 btree+hash, test003. --- src/btree/bt_search.c | 7 +++++++ src/dbinc/mp.h | 11 +++++++++++ src/dbinc_auto/mp_ext.h | 1 + src/mp/mp_alloc.c | 3 ++- src/mp/mp_fget.c | 3 +++ src/mp/mp_fput.c | 35 +++++++++++++++++++++++++++++++++++ 6 files changed, 59 insertions(+), 1 deletion(-) diff --git a/src/btree/bt_search.c b/src/btree/bt_search.c index ca9997c12..06e9ed4bd 100644 --- a/src/btree/bt_search.c +++ b/src/btree/bt_search.c @@ -348,6 +348,13 @@ retry: if ((ret = __bam_get_root(dbc, start_pgno, slevel, flags, &stack)) != 0) if (TYPE(h) == P_LBTREE) adjust = P_INDX; else { + /* + * Stage 1: wire internal index pages so they stay + * resident for the optimistic descent. 
Bounded: only + * the few internal levels, never the (numerous) leaves. + */ + if (TYPE(h) == P_IBTREE || TYPE(h) == P_IRECNO) + (void)__memp_wire(mpf, h); /* * It is possible to catch an internal page as a change * is being backed out. Its leaf pages will be locked diff --git a/src/dbinc/mp.h b/src/dbinc/mp.h index e00055feb..7ab98b92a 100644 --- a/src/dbinc/mp.h +++ b/src/dbinc/mp.h @@ -552,6 +552,17 @@ struct __bh { /* SHARED */ #define BH_THAWED 0x100 /* Page was thawed. */ u_int16_t flags; + /* + * Stage 1: "wired" buffers are exempt from eviction (set for B-tree + * internal/root pages so the optimistic descent can read them without + * risk of the frame being reclaimed under it). A dedicated byte, not a + * flags bit: it is set with a plain monotonic store while the caller + * holds only a shared buffer latch, so it must not live in the flags + * word, which __memp_pgwrite updates with a non-atomic RMW (clearing + * BH_DIRTY) under a shared latch. Reset to 0 at every header (re)init. + */ + u_int8_t wired; + u_int32_t priority; /* Priority. */ SH_TAILQ_ENTRY hq; /* MPOOL hash bucket queue. 
*/ diff --git a/src/dbinc_auto/mp_ext.h b/src/dbinc_auto/mp_ext.h index d142b5846..d16c4ecb7 100644 --- a/src/dbinc_auto/mp_ext.h +++ b/src/dbinc_auto/mp_ext.h @@ -42,6 +42,7 @@ int __memp_mf_discard __P((DB_MPOOL *, MPOOLFILE *, int)); int __memp_inmemlist __P((ENV *, char ***, int *)); int __memp_fput_pp __P((DB_MPOOLFILE *, void *, DB_CACHE_PRIORITY, u_int32_t)); int __memp_fput __P((DB_MPOOLFILE *, DB_THREAD_INFO *, void *, DB_CACHE_PRIORITY)); +int __memp_wire __P((DB_MPOOLFILE *, void *)); int __memp_unpin_buffers __P((ENV *, DB_THREAD_INFO *)); int __memp_dirty __P((DB_MPOOLFILE *, void *, DB_THREAD_INFO *, DB_TXN *, DB_CACHE_PRIORITY, u_int32_t)); int __memp_shared __P((DB_MPOOLFILE *, void *)); diff --git a/src/mp/mp_alloc.c b/src/mp/mp_alloc.c index f7b937afa..45818fdaf 100644 --- a/src/mp/mp_alloc.c +++ b/src/mp/mp_alloc.c @@ -318,7 +318,8 @@ retry_search: bhp = NULL; #endif if (SH_CHAIN_SINGLETON(current_bhp, vc)) { - if (BH_REFCOUNT(current_bhp) != 0) + if (BH_REFCOUNT(current_bhp) != 0 || + current_bhp->wired) continue; buffers++; if (bucket_priority > current_bhp->priority) { diff --git a/src/mp/mp_fget.c b/src/mp/mp_fget.c index 16de69515..190b1f989 100644 --- a/src/mp/mp_fget.c +++ b/src/mp/mp_fget.c @@ -649,6 +649,7 @@ reuse: if ((makecopy || F_ISSET(bhp, BH_FROZEN)) && /* Initialize enough so we can call __memp_bhfree. */ alloc_bhp->flags = 0; + alloc_bhp->wired = 0; atomic_init(&alloc_bhp->ref, 1); #ifdef DIAGNOSTIC if ((uintptr_t)alloc_bhp->buf & (sizeof(size_t) - 1)) { @@ -797,6 +798,7 @@ reuse: if ((makecopy || F_ISSET(bhp, BH_FROZEN)) && bhp->td_off = INVALID_ROFF; SH_CHAIN_INIT(bhp, vc); bhp->flags = 0; + bhp->wired = 0; /* * Reference the buffer and lock exclusive. We either @@ -1001,6 +1003,7 @@ reuse: if ((makecopy || F_ISSET(bhp, BH_FROZEN)) && alloc_bhp->flags = BH_EXCLUSIVE | ((flags == DB_MPOOL_FREE) ? 
BH_FREED : F_ISSET(bhp, BH_DIRTY | BH_DIRTY_CREATE)); + alloc_bhp->wired = 0; DB_ASSERT(env, flags != DB_MPOOL_FREE || !F_ISSET(bhp, BH_DIRTY)); F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE); diff --git a/src/mp/mp_fput.c b/src/mp/mp_fput.c index 8cb4e837b..b74de0859 100644 --- a/src/mp/mp_fput.c +++ b/src/mp/mp_fput.c @@ -372,3 +372,38 @@ __memp_unpin_buffers(env, ip) } return (0); } + +/* + * __memp_wire -- + * Mark a resident buffer as non-evictable ("wired"). Used for B-tree + * internal/root pages so the Stage 1 optimistic descent can read them + * without the frame being reclaimed under it. The set is a plain + * monotonic store to a dedicated byte (not the flags word), safe to do + * while the caller holds only a shared buffer latch; the byte is reset to + * 0 wherever a buffer header is (re)initialized. + * + * PUBLIC: int __memp_wire __P((DB_MPOOLFILE *, void *)); + */ +int +__memp_wire(dbmfp, pgaddr) + DB_MPOOLFILE *dbmfp; + void *pgaddr; +{ + BH *bhp; + + /* + * A memory-mapped (read-only) file hands back a pointer into the mmap + * region, not a buffer frame, so the BH back-computation below would be + * a wild pointer. Such pages are never in the buffer pool and never + * evicted, so there is nothing to wire. + */ + if (dbmfp->addr != NULL && pgaddr >= dbmfp->addr && + (u_int8_t *)pgaddr <= + (u_int8_t *)dbmfp->addr + dbmfp->len) + return (0); + + bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf)); + if (bhp->wired == 0) + bhp->wired = 1; + return (0); +} From 00c1785abf896ff6e7b98f90b3576ea5be43d202 Mon Sep 17 00:00:00 2001 From: Greg Burd Date: Wed, 17 Jun 2026 16:26:42 -0400 Subject: [PATCH 02/12] perf(mpool): harden BH_WIRED - __memp_unwire, cap, counter, stat - __memp_unwire(): clears the wired mark so a freed frame is evictable again; called from __db_free (the single page-free chokepoint for all access methods) and from __memp_bhfree for the file/env-close discard path. The wired byte gates the counter so it is decremented exactly once. 
- Per-region wired-page counter (MPOOL.wired_pages, atomic) with a cap of MPOOL_WIRED_MAX_PCT (25%) of the region's buffers: over the cap __memp_wire is a no-op and the descent uses a normal pin, so wiring can never starve the cache. - db_stat -m reports 'Wired buffers (non-evictable)'. - mmap guard on both wire and unwire (page ptr is not a buffer frame). Validated: NOSYNC integrity 50k/1MB; TCL test001 btree+hash, test003, test011 (cursor splits/merges -> page frees exercise __memp_unwire). --- src/db/db_meta.c | 6 +++++ src/dbinc/mp.h | 17 +++++++++++++ src/dbinc_auto/mp_ext.h | 1 + src/mp/mp_bh.c | 10 ++++++++ src/mp/mp_fput.c | 55 ++++++++++++++++++++++++++++++++++++++++- src/mp/mp_region.c | 1 + src/mp/mp_stat.c | 2 ++ 7 files changed, 91 insertions(+), 1 deletion(-) diff --git a/src/db/db_meta.c b/src/db/db_meta.c index f89c10665..f72310aca 100644 --- a/src/db/db_meta.c +++ b/src/db/db_meta.c @@ -312,6 +312,12 @@ __db_free(dbc, h, flags) meta = NULL; prev = NULL; LOCK_INIT(metalock); + + /* + * The page is being freed back to the file; if it was wired + * (a B-tree internal page) clear that so the frame can be reused. + */ + (void)__memp_unwire(mpf, h); #ifdef HAVE_FTRUNCATE lp = NULL; nelem = 0; diff --git a/src/dbinc/mp.h b/src/dbinc/mp.h index 7ab98b92a..60d4b2907 100644 --- a/src/dbinc/mp.h +++ b/src/dbinc/mp.h @@ -196,6 +196,15 @@ struct __mpool { /* SHARED */ */ u_int32_t pages; /* Number of pages in the cache. */ + /* + * Count of buffers currently wired (non-evictable; B-tree internal + * pages, Stage 1). Atomic so __memp_wire/__memp_unwire need no region + * lock. Capped at MPOOL_WIRED_MAX_PCT of `pages` so wiring can never + * starve the cache; over the cap, wiring is simply skipped (the descent + * falls back to a normal pin). + */ + db_atomic_t wired_pages; + /* * The stat fields are not thread protected, and cannot be trusted. */ @@ -388,6 +397,14 @@ struct __db_mpool_fstat_int { /* SHARED */ #define MPOOL_PRI_DIRTY 10 /* Dirty gets a 10% boost. 
*/ #define MPOOL_PRI_VERY_HIGH 1 /* Add number of buffers in pool. */ +/* + * Wiring cap (Stage 1): at most this percent of a cache region's buffers may + * be wired (held non-evictable for B-tree internal pages), so wiring can never + * starve the cache. Over the cap, __memp_wire is a no-op and the descent uses + * a normal pin. + */ +#define MPOOL_WIRED_MAX_PCT 25 + /* * MPOOLFILE -- * Shared DB_MPOOLFILE information. diff --git a/src/dbinc_auto/mp_ext.h b/src/dbinc_auto/mp_ext.h index d16c4ecb7..874243151 100644 --- a/src/dbinc_auto/mp_ext.h +++ b/src/dbinc_auto/mp_ext.h @@ -43,6 +43,7 @@ int __memp_inmemlist __P((ENV *, char ***, int *)); int __memp_fput_pp __P((DB_MPOOLFILE *, void *, DB_CACHE_PRIORITY, u_int32_t)); int __memp_fput __P((DB_MPOOLFILE *, DB_THREAD_INFO *, void *, DB_CACHE_PRIORITY)); int __memp_wire __P((DB_MPOOLFILE *, void *)); +int __memp_unwire __P((DB_MPOOLFILE *, void *)); int __memp_unpin_buffers __P((ENV *, DB_THREAD_INFO *)); int __memp_dirty __P((DB_MPOOLFILE *, void *, DB_THREAD_INFO *, DB_TXN *, DB_CACHE_PRIORITY, u_int32_t)); int __memp_shared __P((DB_MPOOLFILE *, void *)); diff --git a/src/mp/mp_bh.c b/src/mp/mp_bh.c index 93746fec4..de057742a 100644 --- a/src/mp/mp_bh.c +++ b/src/mp/mp_bh.c @@ -591,6 +591,16 @@ __memp_bhfree(dbmp, infop, mfp, hp, bhp, flags) pagesize = mfp->pagesize; #endif + /* + * If this buffer was wired and is being freed without going through + * __db_free (e.g. file/env close discard), drop it from the region's + * wired count. The wired byte gates this so it is decremented once. 
+ */ + if (bhp->wired != 0) { + bhp->wired = 0; + (void)atomic_dec(env, &((MPOOL *)infop->primary)->wired_pages); + } + DB_ASSERT(env, LF_ISSET(BH_FREE_UNLOCKED) || (hp != NULL && MUTEX_IS_OWNED(env, hp->mtx_hash))); DB_ASSERT(env, BH_REFCOUNT(bhp) == 1 && diff --git a/src/mp/mp_fput.c b/src/mp/mp_fput.c index b74de0859..a60c57541 100644 --- a/src/mp/mp_fput.c +++ b/src/mp/mp_fput.c @@ -390,6 +390,9 @@ __memp_wire(dbmfp, pgaddr) void *pgaddr; { BH *bhp; + DB_MPOOL *dbmp; + ENV *env; + MPOOL *c_mp; /* * A memory-mapped (read-only) file hands back a pointer into the mmap @@ -402,8 +405,58 @@ __memp_wire(dbmfp, pgaddr) (u_int8_t *)dbmfp->addr + dbmfp->len) return (0); + bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf)); + if (bhp->wired != 0) + return (0); + + /* + * Cap wiring at MPOOL_WIRED_MAX_PCT of the region's buffers so wiring + * can never starve the cache. Over the cap this is a no-op and the + * descent uses a normal pin. The count is approximate under races, + * which is fine for a cap. + */ + env = dbmfp->env; + dbmp = env->mp_handle; + c_mp = dbmp->reginfo[bhp->region].primary; + if (atomic_read(&c_mp->wired_pages) >= + c_mp->pages / 100 * MPOOL_WIRED_MAX_PCT) + return (0); + + bhp->wired = 1; + (void)atomic_inc(env, &c_mp->wired_pages); + return (0); +} + +/* + * __memp_unwire -- + * Clear the wired mark on a buffer (e.g. when its page is freed) so the + * frame becomes evictable again. Safe on mmap'd pages (no-op). 
+ * + * PUBLIC: int __memp_unwire __P((DB_MPOOLFILE *, void *)); + */ +int +__memp_unwire(dbmfp, pgaddr) + DB_MPOOLFILE *dbmfp; + void *pgaddr; +{ + BH *bhp; + DB_MPOOL *dbmp; + ENV *env; + MPOOL *c_mp; + + if (dbmfp->addr != NULL && pgaddr >= dbmfp->addr && + (u_int8_t *)pgaddr <= + (u_int8_t *)dbmfp->addr + dbmfp->len) + return (0); + bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf)); if (bhp->wired == 0) - bhp->wired = 1; + return (0); + + bhp->wired = 0; + env = dbmfp->env; + dbmp = env->mp_handle; + c_mp = dbmp->reginfo[bhp->region].primary; + (void)atomic_dec(env, &c_mp->wired_pages); return (0); } diff --git a/src/mp/mp_region.c b/src/mp/mp_region.c index 495203054..d5d6efce8 100644 --- a/src/mp/mp_region.c +++ b/src/mp/mp_region.c @@ -312,6 +312,7 @@ __memp_init(env, dbmp, reginfo_off, htab_buckets, max_nreg) } mp->htab_buckets = htab_buckets; mp->htab_mutexes = dbenv->mp_mtxcount; + atomic_init(&mp->wired_pages, 0); mp->pagesize = dbenv->mp_pagesize == 0 ? MPOOL_DEFAULT_PAGESIZE : dbenv->mp_pagesize; diff --git a/src/mp/mp_stat.c b/src/mp/mp_stat.c index 273c5c938..c904734d5 100644 --- a/src/mp/mp_stat.c +++ b/src/mp/mp_stat.c @@ -709,6 +709,8 @@ __memp_print_hash(env, dbmp, reginfo, fmap, flags) c_mp = reginfo->primary; DB_MSGBUF_INIT(&mb); STAT_ULONG("Hash table last-checked", c_mp->last_checked); + STAT_ULONG("Wired buffers (non-evictable)", + atomic_read(&c_mp->wired_pages)); STAT_ULONG("Hash table LRU priority", c_mp->lru_priority); STAT_ULONG("Hash table LRU generation", c_mp->lru_generation); STAT_ULONG("Put counter", c_mp->put_counter); From 680ee88f89393e13d7582f4f09debc9e0aedb94b Mon Sep 17 00:00:00 2001 From: Greg Burd Date: Thu, 18 Jun 2026 08:30:51 -0400 Subject: [PATCH 03/12] perf(btree): wire only the one common root, not all internals Per review: internal/subtree-root pages should stay in the normal evictable pool; only the single main tree root (BAM_ROOT_PGNO) -- fetched by every operation -- is wired, so it stays resident without churning 
eviction and the root snapshot can refresh cheaply. Move the __memp_wire call from the all-internals site in __bam_search to __bam_get_root, gated on h->pgno == BAM_ROOT_PGNO(dbc). Unwiring is already handled on page free (__db_free) and file close (__memp_bhfree). Validated: NOSYNC integrity 50k/1MB; TCL test001 btree+hash, test011. --- src/btree/bt_search.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/btree/bt_search.c b/src/btree/bt_search.c index 06e9ed4bd..bcc8260c8 100644 --- a/src/btree/bt_search.c +++ b/src/btree/bt_search.c @@ -128,6 +128,17 @@ retry: if (lock_mode == DB_LOCK_WRITE) DB_ASSERT(dbp->env, TYPE(h) == P_IBTREE || TYPE(h) == P_IRECNO || TYPE(h) == P_LBTREE || TYPE(h) == P_LRECNO || TYPE(h) == P_LDUP); + /* + * Wire the one common tree root so it stays resident: it is fetched by + * every operation, so keeping it non-evictable removes the read-in/ + * eviction churn on the hottest page and lets the root snapshot refresh + * cheaply. Only the main tree root (BAM_ROOT_PGNO) is wired -- subtree + * (off-page duplicate) roots and all internal pages stay evictable. + * Unwired when the page is freed (__db_free) or the file closes. + */ + if (h->pgno == BAM_ROOT_PGNO(dbc)) + (void)__memp_wire(mpf, h); + /* * Decide if we need to dirty and/or lock this page. * We must not hold the latch while we get the lock. @@ -348,13 +359,6 @@ retry: if ((ret = __bam_get_root(dbc, start_pgno, slevel, flags, &stack)) != 0) if (TYPE(h) == P_LBTREE) adjust = P_INDX; else { - /* - * Stage 1: wire internal index pages so they stay - * resident for the optimistic descent. Bounded: only - * the few internal levels, never the (numerous) leaves. - */ - if (TYPE(h) == P_IBTREE || TYPE(h) == P_IRECNO) - (void)__memp_wire(mpf, h); /* * It is possible to catch an internal page as a change * is being backed out. 
Its leaf pages will be locked From 7866c7ef93a12a6b7bafc841ccf34448a6d4230f Mon Sep 17 00:00:00 2001 From: Greg Burd Date: Thu, 18 Jun 2026 09:19:33 -0400 Subject: [PATCH 04/12] perf(btree): lock-free root snapshot for read descents (option B) Read lookups of the main tree no longer fetch (pin/latch) the contended live root. Each handle keeps a private immutable copy of the root taken at a known root LSN; a plain read of the live root LSN (via the wired root buffer) confirms the copy is current, the copy yields the descent's first child, and __bam_search starts from that child -- never touching the live root. Correctness: after the child is fetched, the live root LSN is re-checked (seqlock); if it changed (a split added a level, or a merge freed the child) the child is released and the descent restarts from the real root. Gated to plain read finds of a logged, durable, non-multiversion btree (where the page LSN reliably advances on root modification) -- everything else uses the normal descent. Old copies are retired to a free list and released at handle close (root changes are rare; no reader/free race, no epoch reclamation). Child selection reuses __bam_cmp and the exact __bam_search binary-search rule so the chosen child is identical to a normal descent. Validated: NOSYNC integrity 50k/1MB (logged, fast path active, all verified); TCL test001 btree+hash, test003, test011 (dups), test026. The first cut mis-gated non-logged envs (LSN never advances -> stale copy -> wrong results); fixed with the LOGGING_ON + durable gate. Concurrent stress + scaling on meh next. 
--- src/btree/bt_method.c | 14 ++++ src/btree/bt_search.c | 184 ++++++++++++++++++++++++++++++++++++++++++ src/dbinc/btree.h | 32 ++++++++ 3 files changed, 230 insertions(+) diff --git a/src/btree/bt_method.c b/src/btree/bt_method.c index bac2cc82b..72788650e 100644 --- a/src/btree/bt_method.c +++ b/src/btree/bt_method.c @@ -113,6 +113,20 @@ __bam_db_close(dbp) if (t->re_source != NULL) __os_free(dbp->env, t->re_source); + /* Free root snapshots: the current copy and all retired copies. */ + { + BAM_RSNAP *s, *snext; + + for (s = t->bt_rsnap; s != NULL; s = snext) { + snext = s->next; + __os_free(dbp->env, s); + } + for (s = t->bt_rsnap_free; s != NULL; s = snext) { + snext = s->next; + __os_free(dbp->env, s); + } + } + __os_free(dbp->env, t); dbp->bt_internal = NULL; diff --git a/src/btree/bt_search.c b/src/btree/bt_search.c index bcc8260c8..c2178afd0 100644 --- a/src/btree/bt_search.c +++ b/src/btree/bt_search.c @@ -49,6 +49,137 @@ #include "dbinc/lock.h" #include "dbinc/mp.h" +static int __bam_rsnap_refresh __P((DBC *)); +static int __bam_rsnap_child __P((DBC *, const DBT *, db_pgno_t *, DB_LSN *)); + +/* + * __bam_rsnap_refresh -- + * Refresh this handle's private copy of the B-tree root. Fetches the + * (wired) root once under its shared latch, takes a consistent copy and + * its LSN, and publishes it. The previously-current copy is retired to + * a free list (freed at handle close) so it is never freed underneath + * a concurrent reader still holding it -- root changes are rare, so few + * copies accumulate. Publication is serialized by the handle mutex. 
+ */ +static int +__bam_rsnap_refresh(dbc) + DBC *dbc; +{ + BTREE *t; + BAM_RSNAP *snap; + DB *dbp; + DB_MPOOLFILE *mpf; + DB_LSN lsn; + ENV *env; + PAGE *h; + db_pgno_t root_pgno; + u_int32_t psize; + int ret, t_ret; + + dbp = dbc->dbp; + env = dbp->env; + mpf = dbp->mpf; + t = dbp->bt_internal; + root_pgno = t->bt_root; + if (root_pgno == PGNO_INVALID) + return (0); + + if ((ret = __memp_fget(mpf, &root_pgno, + dbc->thread_info, dbc->txn, 0, &h)) != 0) + return (ret); + /* Wire the root so the cached buffer address stays valid/resident. */ + (void)__memp_wire(mpf, h); + lsn = LSN(h); + psize = dbp->pgsize; + snap = NULL; + if (TYPE(h) == P_IBTREE && psize != 0 && + (ret = __os_malloc(env, sizeof(BAM_RSNAP) + psize, &snap)) == 0) { + snap->next = NULL; + snap->lsn = lsn; + snap->size = psize; + memcpy(BAM_RSNAP_PAGE(snap), h, psize); + } + + MUTEX_LOCK(env, dbp->mutex); + t->bt_rootpage = h; /* cached wired buffer (stays resident) */ + /* Retire the previously-current copy to the free list. */ + if (t->bt_rsnap != NULL) { + ((BAM_RSNAP *)t->bt_rsnap)->next = t->bt_rsnap_free; + t->bt_rsnap_free = t->bt_rsnap; + } + t->bt_rsnap = snap; /* NULL if the root is a leaf */ + t->bt_rsnap_lsn = lsn; + MUTEX_UNLOCK(env, dbp->mutex); + + if ((t_ret = __memp_fput(mpf, + dbc->thread_info, h, dbc->priority)) != 0 && ret == 0) + ret = t_ret; + return (ret); +} + +/* + * __bam_rsnap_child -- + * If this handle holds a current snapshot of the root (its LSN still + * matches the live root), search the snapshot copy for the child that + * the descent for "key" would take, returning that child page number and + * the snapshot LSN. Returns DB_NOTFOUND if there is no current snapshot + * (the caller falls back to the normal descent and refreshes). 
+ */ +static int +__bam_rsnap_child(dbc, key, childp, snap_lsnp) + DBC *dbc; + const DBT *key; + db_pgno_t *childp; + DB_LSN *snap_lsnp; +{ + BTREE *t; + BAM_RSNAP *snap; + DB *dbp; + DB_LSN live; + PAGE *cp; + db_indx_t base, indx, lim; + int (*func) __P((DB *, const DBT *, const DBT *)); + int cmp, ret; + + dbp = dbc->dbp; + t = dbp->bt_internal; + snap = t->bt_rsnap; + if (snap == NULL || t->bt_rootpage == NULL) + return (DB_NOTFOUND); + + /* Racy read of the live root LSN; a torn read just forces a refresh. */ + live = LSN((PAGE *)t->bt_rootpage); + if (live.file != snap->lsn.file || live.offset != snap->lsn.offset) + return (DB_NOTFOUND); + + cp = BAM_RSNAP_PAGE(snap); + if (TYPE(cp) != P_IBTREE) + return (DB_NOTFOUND); + + /* + * Mirror the internal-page child selection in __bam_search exactly + * (same binary search, same __bam_cmp, same base->index rule), so the + * child chosen is identical to a normal descent. + */ + func = t->bt_compare; + indx = 0; + cmp = 1; + DB_BINARY_SEARCH_FOR(base, lim, NUM_ENT(cp), O_INDX) { + DB_BINARY_SEARCH_INCR(indx, base, lim, O_INDX); + if ((ret = __bam_cmp(dbc, key, cp, indx, func, &cmp)) != 0) + return (DB_NOTFOUND); + if (cmp == 0) + break; + if (cmp > 0) + DB_BINARY_SEARCH_SHIFT_BASE(indx, base, lim, O_INDX); + } + if (cmp != 0) + indx = base > 0 ? 
base - O_INDX : base; + *childp = GET_BINTERNAL(dbp, cp, indx)->pgno; + *snap_lsnp = snap->lsn; + return (0); +} + /* * __bam_get_root -- * Fetch the root of a tree and see if we want to keep @@ -286,6 +417,9 @@ __bam_search(dbc, root_pgno, key, flags, slevel, recnop, exactp) int (*func) __P((DB *, const DBT *, const DBT *)); u_int32_t get_mode, wait; u_int8_t level, saved_level; + int from_snap; + db_pgno_t snap_child; + DB_LSN snap_lsn; if (F_ISSET(dbc, DBC_OPD)) LOCK_CHECK_OFF(dbc->thread_info); @@ -316,6 +450,31 @@ __bam_search(dbc, root_pgno, key, flags, slevel, recnop, exactp) */ start_pgno = saved_pg = root_pgno; + + /* + * Root-snapshot fast path (option B): for a plain read lookup of the + * main tree (not write/stack/parent/next/del/min/max, not OPD, not + * recno/recnum, not multiversion), take the first child from this + * handle's private root copy and begin the descent there, never + * fetching (pinning/latching) the contended live root. The copy's + * validity is confirmed against the live root LSN here, and re-checked + * after the child is fetched (below) to close the window where a + * concurrent root change could make the child stale. 
+ */ + from_snap = 0; + if (root_pgno == PGNO_INVALID && key != NULL && slevel == LEAFLEVEL && + LF_ISSET(SR_READ) && !LF_ISSET(SR_WRITE | SR_PARENT | SR_STACK | + SR_NEXT | SR_DEL | SR_START | SR_BOTH | SR_MIN | SR_MAX | + SR_STK_ONLY) && !F_ISSET(dbc, DBC_OPD) && + dbc->dbtype == DB_BTREE && !F_ISSET(cp, C_RECNUM) && + atomic_read(&mpf->mfp->multiversion) == 0 && + LOGGING_ON(env) && !F_ISSET(dbp, DB_AM_NOT_DURABLE)) { + if (__bam_rsnap_child(dbc, key, &snap_child, &snap_lsn) == 0) { + start_pgno = snap_child; + from_snap = 1; + } else + (void)__bam_rsnap_refresh(dbc); + } saved_level = MAXBTREELEVEL; retry: if ((ret = __bam_get_root(dbc, start_pgno, slevel, flags, &stack)) != 0) goto err; @@ -350,6 +509,31 @@ retry: if ((ret = __bam_get_root(dbc, start_pgno, slevel, flags, &stack)) != 0) BT_STK_CLR(cp); + /* + * Root-snapshot re-check: we began the descent at a child taken from + * the root copy. If the live root LSN no longer matches the snapshot, + * the root changed (e.g. a split added a level, or a merge freed the + * child) while we were fetching the child, so the child may be stale or + * reused. Release it and restart the descent from the real root. + */ + if (from_snap) { + DB_LSN now; + + now = LSN((PAGE *)t->bt_rootpage); + if (now.file != snap_lsn.file || now.offset != snap_lsn.offset) { + if ((ret = __memp_fput(mpf, + dbc->thread_info, h, dbc->priority)) != 0) + goto err; + h = NULL; + (void)__LPUT(dbc, lock); + LOCK_INIT(lock); + from_snap = 0; + start_pgno = PGNO_INVALID; + (void)__bam_rsnap_refresh(dbc); + goto retry; + } + } + /* Choose a comparison function. */ func = F_ISSET(dbc, DBC_OPD) ? (dbp->dup_compare == NULL ? __bam_defcmp : dbp->dup_compare) : diff --git a/src/dbinc/btree.h b/src/dbinc/btree.h index 1f4c5d5a9..7d5132c3d 100644 --- a/src/dbinc/btree.h +++ b/src/dbinc/btree.h @@ -49,6 +49,21 @@ extern "C" { /* Forward structure declarations. 
*/ struct __btree; typedef struct __btree BTREE; + +/* + * BAM_RSNAP -- + * An immutable private copy of the B-tree root page taken at a known + * root LSN, used by the lock-free root-snapshot descent. The copied + * page image follows the header (size bytes). Superseded copies are + * chained via "next" and freed when the handle closes. + */ +typedef struct __bam_rsnap { + struct __bam_rsnap *next; /* Chain of superseded copies. */ + DB_LSN lsn; /* Root LSN this copy was taken at. */ + u_int32_t size; /* Page size (bytes of copy). */ + /* The copied root page image follows immediately. */ +} BAM_RSNAP; +#define BAM_RSNAP_PAGE(s) ((PAGE *)((u_int8_t *)(s) + sizeof(BAM_RSNAP))) struct __cursor; typedef struct __cursor BTREE_CURSOR; struct __epg; typedef struct __epg EPG; @@ -502,6 +517,23 @@ struct __btree { /* Btree access method. */ db_pgno_t bt_lpgno; /* Last insert location. */ DB_LSN bt_llsn; /* Last insert LSN. */ + /* + * Root snapshot (option B): a private, immutable copy of the B-tree + * root used to find the descent's first child without fetching + * (pinning/latching) the contended live root. bt_rootpage caches the + * wired live-root buffer so a reader can read the current root LSN with + * a plain load; bt_rsnap is the current copy (NULL when the root is a + * leaf); bt_rsnap_lsn is the LSN at the last refresh; bt_rsnap_free + * chains superseded copies, freed when the handle closes (root changes + * are rare, so few accumulate -- this avoids a reader/free race without + * epoch reclamation). Process-local; cross-process correctness comes + * from validating against the shared live-root LSN. + */ + void *bt_rootpage; /* Cached wired live-root buffer. */ + void *bt_rsnap; /* Current root copy (BAM_RSNAP *). */ + DB_LSN bt_rsnap_lsn; /* Root LSN at last snapshot refresh. */ + void *bt_rsnap_free; /* Superseded copies, freed at close. */ + /* * !!! 
* The re_modified field is NOT protected by any mutex, and for this From e3960be7c11c4396eebb7c5ef51446ac79cc20be Mon Sep 17 00:00:00 2001 From: Greg Burd Date: Thu, 18 Jun 2026 17:41:40 -0400 Subject: [PATCH 05/12] docs: record measured root-snapshot scaling A/B and conclusion rrand 200k/3s on meh (24t, tmpfs): snapshot beats master at every thread count (+22-29% at 4-8t) but both peak ~8t and negatively scale to 24t. The snapshot raises the read-scaling ceiling without removing it; at 24t the bottleneck has moved to the lock-manager locker region (lockers% 51-67%, lockpart% ~0.1). Real measured win worth landing; multicore scaling past ~8 cores now bounded by the lock manager (ROADMAP #4). --- docs/design/scaling-findings.md | 47 +++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/docs/design/scaling-findings.md b/docs/design/scaling-findings.md index 263cd414f..42337dcd3 100644 --- a/docs/design/scaling-findings.md +++ b/docs/design/scaling-findings.md @@ -141,3 +141,50 @@ LD_LIBRARY_PATH=build_unix/.libs ./scale_bench rrand 200000 3 1 2 4 8 12 16 24 # snap per-thread MVCC snapshot txn (isolates lock-manager cost) ``` +## Measured result: lock-free root snapshot (Stage 1b) + +A/B of `master` vs the root-snapshot build (`perf/swip-stage1-descent`), +`scale_bench rrand`, 200 000 keys, 3 s, 3-sample medians, on `meh` +(Xeon E5-2697 v2, 12c/24t, single socket). Run on tmpfs (`/dev/shm`): +the read working set is cache-resident, so this measures CPU/lock +scaling, not disk — and it avoids meh's nvme-over-fabrics `/scratch`, +which stalls in `submit_bio_wait` on the 536 MB cache-region write (the +cause of every earlier "stuck" run). 
+ +| threads | master ops/s | snapshot ops/s | Δ | +|--------:|-------------:|---------------:|:------:| +| 1 | 199 176 | 210 942 | +5.9% | +| 4 | 410 883 | 529 169 | +28.8% | +| 8 | 459 698 | 560 587 | +21.9% | +| 12 | 448 220 | 511 045 | +14.0% | +| 16 | 428 569 | 491 702 | +14.7% | +| 24 | 419 952 | 444 471 | +5.8% | + +Correctness: TCL test001 (btree+hash), test003, test011 (dups), test026, +a 50 000-op logged-env integrity check, and a concurrent stress (24 +readers verifying a hot set while 6 writers churned the tree's structure, +0 reader mismatches) all pass. + +### Conclusion — is multicore scalability addressed? + +**Partially. A real, measured win, but not a solution.** + +- The snapshot is faster at *every* thread count, biggest in the + contended midrange (+22–29% at 4–8 threads). This confirms the + per-operation pin/latch of the contended root page was a genuine + bottleneck, and removing it from read descents helps materially. +- **But both** master and snapshot still **peak at ~8 threads and then + negatively scale** to 24 (snapshot 560 k @ 8 → 444 k @ 24). The + snapshot **raises the ceiling (~+22%) without removing it.** +- At 24 threads the contention signal has moved: `lockpart%≈0.1` (random + reads spread across lock partitions fine) but **`lockers%`=51–67%** — + the lock manager's **locker region** is now the dominant serialization + point (ROADMAP #4), not the buffer pool. + +So Stage 1b should land on its merits (correct, measured, monotonic +improvement), but the multicore read ceiling past ~8 cores is now bounded +by the lock-manager locker region, which is the next target. Stage 0/0.5 +(landed) addressed a different axis (scan resistance, ~10.7×). Stage 2 +(AIO) is I/O-throughput infrastructure (prefetch/async writeback) and does +not affect this cache-resident read-scaling curve. 
+ From 342143fa6dc2607c1450b5ec55cec150a717d999 Mon Sep 17 00:00:00 2001 From: Greg Burd Date: Thu, 18 Jun 2026 18:07:24 -0400 Subject: [PATCH 06/12] docs: profile the >8-core slide; identify cursor-mutex as next bottleneck perf on meh (24t, snapshot): 40.5% of time is futex wait under __db_pthread_mutex_lock, split between __db_cursor_int (cursor alloc) and __dbc_close (cursor free) -- the per-get transient cursor linked/unlinked on the ONE shared DB handle's active-cursor queue (dbp->mutex). Per-thread handles (sepdb) run +49% at 24t and scale 3.0x from 1t to 8t (shared handle: 2.3x), proving it. Next bottleneck underneath: __memp_fget/fput hash-bucket latch + refcount atomics per descent page (root snapshot removed only the root fetch). Benchmark critique: aggregate metric is sound; it induces lock-manager traffic via DB_INIT_LOCK|TXN reads (should also measure READ_UNCOMMITTED); targ_t.ops false-shares (latent, not in profile); meh is 12c/24t so the 12->24 tail is HT + all-core turbo, and peak-at-8 is software contention. Conclusion: the next scaling fix is the per-get cursor-allocation mutex -- NOT Stage 1c (blocked, only partial) nor Stage 2 #5 (orthogonal: rrand is 100% cache hits, zero I/O). --- docs/design/scaling-findings.md | 83 +++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/docs/design/scaling-findings.md b/docs/design/scaling-findings.md index 42337dcd3..5a5774dd9 100644 --- a/docs/design/scaling-findings.md +++ b/docs/design/scaling-findings.md @@ -188,3 +188,86 @@ by the lock-manager locker region, which is the next target. Stage 0/0.5 (AIO) is I/O-throughput infrastructure (prefetch/async writeback) and does not affect this cache-resident read-scaling curve. +## Profiling the >8-core slide (post-snapshot) and a benchmark critique + +`perf` on meh (24t), snapshot lib, `rrand` 200k in tmpfs. The question: +*why* does aggregate throughput peak at ~8 threads and decline, and is the +benchmark itself fit to drive past 8 cores? 
+ +### Where the time goes (t=24, shared handle) + +Flat profile: **40.5%** of all samples are in `lll_lock_wait` (the kernel +futex) under `__db_pthread_mutex_lock`, split almost evenly: + +``` +18.25% __db_cursor_int (cursor allocate) <- __db_get +18.10% __dbc_close (cursor free) <- __db_get +``` + +`DB->get` allocates a transient cursor and frees it **per operation**; +both link/unlink it into the *one shared handle's* active-cursor queue +under `dbp->mutex`. With every thread sharing a single `db` handle, that +one mutex serializes every `get`. This is the dominant bottleneck once +the root snapshot removes the buffer-pool root pin. + +### Proof: per-thread handles (`sepdb`) remove it + +| threads | rrand (shared handle) | sepdb (per-thread handles) | +|--------:|----------------------:|---------------------------:| +| 1 | 223 851 | 249 302 | +| 8 | 511 698 | **749 896** (3.0×) | +| 16 | 471 626 | 666 188 | +| 24 | 454 073 | **675 996 (+49%)** | + +Per-thread handles reach 3.0× the single-thread rate at 8 threads (versus +2.3× for the shared handle — better, though still well short of linear) +and run **+49% faster at 24 threads**. So the negative scaling is largely +the shared-handle cursor mutex — a real BDB serialization, but one the +benchmark triggers by the (common) choice of sharing one handle across +all threads. + +Caveat: `sepdb` also gives every thread its own database *file*, so this +comparison removes shared root/internal-page contention together with the +shared handle; it is the profile above, not this table alone, that +attributes the gap to the cursor mutex. + +### The next bottleneck underneath (sepdb, t=24) + +With the cursor mutex gone, the profile is dominated by +`__memp_fget`/`__memp_fput` — the **mpool hash-bucket shared latch +(`__db_pthread_mutex_lock`) + `__atomic_inc/dec` on the page refcount**, +paid on every page of the descent (root via `__bam_get_root`, +internal/leaf via `__bam_search`, unpin via `__dbc_cleanup`). The root +snapshot removed only the *root* fetch (1 of ~3 per descent); the +internal and leaf pins remain. + +### Is the benchmark fit to drive >8 cores? + +- **Metric is sound** (aggregate `total/dur`); the decline is real. +- **It induces lock-manager traffic**: the env is `DB_INIT_LOCK|DB_INIT_TXN` + and every `get(txn=NULL)` takes a page read lock + a transient locker. 
+ A read-only probe should *also* measure `DB_READ_UNCOMMITTED` / a + non-locking config to separate buffer-pool from lock-manager scaling. +- **Latent false-sharing**: `targ_t ta[256]` is 16 B (4 per cache line), + so adjacent threads' `ops++` share a line — but it does **not** appear + in the profile (the BDB mutexes dwarf it), so it is a code smell, not a + measured contributor. Pad it anyway. +- **Hardware**: meh is **12 physical cores / 24 HT**. Aggregate peaking + at 8 (< 12) means software contention bites *before* the core limit; + the 12→24 tail is additionally bounded by hyperthreading and the + all-core turbo drop (3.5 GHz single → ~3.0 GHz all-core). Even a + perfect read path would scale ~linearly only to ~12, then flatten. + +### Bottleneck ranking (measured), and what it means for the roadmap + +1. **Shared-handle cursor alloc/free mutex** (`__db_cursor_int` + + `__dbc_close` → `dbp->mutex`) — dominant (40% @ 24t). **Not blocked**; + fix = a per-handle cursor cache / lock-free active-cursor list / + per-thread cursor reuse. Biggest tractable win (+49% measured). +2. **mpool pin/unpin atomics + hash-bucket latch** (`__memp_fget`/`fput`) + — the true read ceiling once #1 is gone; needs optimistic/seqlock + buffer access **+ epoch reclamation** (the same prerequisite that + blocks Stage 1c). +3. **Lock-manager read locks** (transient locker + lock object per get; + `lockers%`). +4. **Hardware** (12 cores, HT, all-core turbo, memory bandwidth). + +**Implication:** Stage 1c (lock-free *deeper-internal* descent) only +partially addresses #2 and is blocked on reclamation; Stage 2 #5 (async +I/O) is **orthogonal** — `rrand` is 100% cache hits with zero I/O, so it +cannot move this curve. The correct next scaling fix is **#1, the +per-`get` cursor allocation mutex**, not 1c or #5. 
+ From 0ba598ee74a5d13e8080a72dfede9cfb9c9a5086 Mon Sep 17 00:00:00 2001 From: Greg Burd Date: Thu, 18 Jun 2026 18:20:36 -0400 Subject: [PATCH 07/12] test(bench): add scale_iso -- per-thread-handle isolation-scaling probe Each thread opens its own handle on the SAME bench.db (removing the shared-handle cursor-queue mutex app-side) and reads under a selectable isolation level (none/read-committed/snapshot/uncommitted) to measure how far BDB scales with full transactional isolation -- not requiring uncommitted reads. Per-thread state is cache-line padded. --- lab/bench/scale_iso.c | 207 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 207 insertions(+) create mode 100644 lab/bench/scale_iso.c diff --git a/lab/bench/scale_iso.c b/lab/bench/scale_iso.c new file mode 100644 index 000000000..3edc2ba78 --- /dev/null +++ b/lab/bench/scale_iso.c @@ -0,0 +1,207 @@ +/*- + * libdb isolation-scaling probe. + * + * The original scale_bench shares ONE DB handle across all threads, so every + * DB->get allocates+frees a transient cursor on that handle's active-cursor + * queue under dbp->mutex -- a single mutex that serializes every read and + * caps scaling at ~8 cores (measured). This probe removes that app-level + * bottleneck the supported way -- each thread opens its OWN handle on the + * SAME database file -- and measures how far BDB then scales under several + * isolation levels, WITHOUT resorting to uncommitted reads: + * + * none DB->get(txn=NULL) per-op auto page read lock + * rc per-op txn + DB_READ_COMMITTED + * snap long-lived per-thread DB_TXN_SNAPSHOT (MVCC, no read locks) + * uncom DB_READ_UNCOMMITTED (baseline; isolation sacrificed) + * + * All threads read the same shared dataset (full isolation, not separate + * DBs), so this isolates handle/cursor scaling from the data. + * + * cc -O2 -pthread scale_iso.c -I -L/.libs -ldb-5.3 -o scale_iso + * ./scale_iso [t2 ...] 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include "db.h" + +enum { ISO_NONE = 0, ISO_RC = 1, ISO_SNAP = 2, ISO_UNCOM = 3 }; + +static DB_ENV *env; +static int g_iso; +static uint32_t g_nkeys; +static volatile int stop, go; + +/* Cache-line-padded per-thread state: no false sharing of the op counter, and + * each thread carries its own handle + (for snap) its long-lived read txn. */ +typedef struct { + uint64_t ops; + unsigned seed; + int tid; + DB *db; /* this thread's own handle on the shared file */ + DB_TXN *rtxn; /* long-lived read txn (snap), else NULL */ + char pad[64]; +} targ_t __attribute__((aligned(64))); + +static double +now_sec(void) +{ + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (ts.tv_sec + ts.tv_nsec / 1e9); +} + +static void * +worker(void *a) +{ + targ_t *t = a; + DBT key, data; + uint32_t kb, vbuf[32]; + DB_TXN *txn; + int ret; + + while (!go) { } + while (!stop) { + uint32_t k = (uint32_t)(rand_r(&t->seed) % g_nkeys); + memset(&key, 0, sizeof(key)); key.data = &kb; key.size = sizeof(kb); kb = k; + memset(&data, 0, sizeof(data)); + data.data = vbuf; data.ulen = sizeof(vbuf); data.flags = DB_DBT_USERMEM; + + txn = NULL; + ret = 0; + switch (g_iso) { + case ISO_SNAP: /* long-lived snapshot txn, no read locks */ + ret = t->db->get(t->db, t->rtxn, &key, &data, 0); + break; + case ISO_RC: /* per-op txn, read-committed (locks held to commit) */ + if ((ret = env->txn_begin(env, NULL, &txn, DB_READ_COMMITTED)) != 0) + break; + ret = t->db->get(t->db, txn, &key, &data, 0); + if (ret == DB_BUFFER_SMALL) ret = 0; + if (ret == 0 || ret == DB_NOTFOUND) + (void)txn->commit(txn, DB_TXN_NOSYNC); + else + (void)txn->abort(txn); + break; + case ISO_UNCOM: /* uncommitted: no read locks (isolation sacrificed) */ + ret = t->db->get(t->db, NULL, &key, &data, DB_READ_UNCOMMITTED); + break; + default: /* none: auto per-op page read lock */ + ret = t->db->get(t->db, NULL, &key, &data, 0); + break; + } + if (ret 
== DB_BUFFER_SMALL) ret = 0; + if (ret != 0 && ret != DB_NOTFOUND) { + env->err(env, ret, "get k=%u", k); + return (NULL); + } + t->ops++; + } + return (NULL); +} + +static int +open_handle(const char *file, uint32_t flags, DB **dbp) +{ + int ret; + if ((ret = db_create(dbp, env, 0)) != 0) return (ret); + return ((*dbp)->open(*dbp, NULL, file, NULL, DB_BTREE, flags, 0)); +} + +static void +run(int nthreads, double secs) +{ + pthread_t th[256]; + static targ_t ta[256]; + double t0, dur; + uint64_t total = 0; + uint32_t rdflag; + int i; + + stop = go = 0; + rdflag = g_iso == ISO_UNCOM ? DB_READ_UNCOMMITTED : + g_iso == ISO_SNAP ? DB_MULTIVERSION : 0; + for (i = 0; i < nthreads; i++) { + ta[i].ops = 0; ta[i].seed = (unsigned)(i * 2654435761u + 1); ta[i].tid = i; + ta[i].rtxn = NULL; + /* Per-thread handle on the SHARED file. */ + if (open_handle("bench.db", DB_AUTO_COMMIT | DB_THREAD | rdflag, &ta[i].db) != 0) { + fprintf(stderr, "open_handle %d failed\n", i); exit(1); + } + if (g_iso == ISO_SNAP && + env->txn_begin(env, NULL, &ta[i].rtxn, DB_TXN_SNAPSHOT) != 0) { + fprintf(stderr, "txn_begin snap %d failed\n", i); exit(1); + } + pthread_create(&th[i], NULL, worker, &ta[i]); + } + go = 1; + t0 = now_sec(); + struct timespec sl = { (time_t)secs, (long)((secs - (long)secs) * 1e9) }; + nanosleep(&sl, NULL); + stop = 1; + for (i = 0; i < nthreads; i++) { + pthread_join(th[i], NULL); + total += ta[i].ops; + if (ta[i].rtxn != NULL) (void)ta[i].rtxn->commit(ta[i].rtxn, 0); + (void)ta[i].db->close(ta[i].db, 0); + } + dur = now_sec() - t0; + printf("%-6s %3d %12.0f ops/sec\n", + g_iso == ISO_SNAP ? "snap" : g_iso == ISO_RC ? "rc" : + g_iso == ISO_UNCOM ? "uncom" : "none", nthreads, total / dur); + fflush(stdout); +} + +int +main(int argc, char **argv) +{ + DB *db; + DBT key, data; + uint32_t kb, i; + char vbuf[100]; + int ret, ai; + + if (argc < 5) { + fprintf(stderr, "usage: %s \n", argv[0]); + return (1); + } + g_iso = strcmp(argv[1], "snap") == 0 ? 
ISO_SNAP : + strcmp(argv[1], "rc") == 0 ? ISO_RC : + strcmp(argv[1], "uncom") == 0 ? ISO_UNCOM : ISO_NONE; + g_nkeys = (uint32_t)atoi(argv[2]); + double secs = atof(argv[3]); + + system("rm -rf ./ISODB && mkdir ./ISODB"); + if ((ret = db_env_create(&env, 0)) != 0) { fprintf(stderr, "env_create %d\n", ret); return 1; } + env->set_errfile(env, stderr); + env->set_cachesize(env, 0, 512 * 1024 * 1024, 1); + /* MVCC needs to be enabled on the env for snapshot reads. */ + if ((ret = env->open(env, "./ISODB", DB_CREATE | DB_INIT_MPOOL | + DB_INIT_LOCK | DB_INIT_TXN | DB_INIT_LOG | DB_THREAD | DB_MULTIVERSION, 0)) != 0) { + env->err(env, ret, "env open"); return 1; + } + + /* Load the shared dataset once with a plain handle. */ + if ((ret = open_handle("bench.db", DB_CREATE | DB_AUTO_COMMIT | DB_THREAD, &db)) != 0) { + env->err(env, ret, "load open"); return 1; + } + memset(vbuf, 'v', sizeof(vbuf)); + for (i = 0; i < g_nkeys; i++) { + memset(&key, 0, sizeof(key)); key.data = &kb; key.size = sizeof(kb); kb = i; + memset(&data, 0, sizeof(data)); data.data = vbuf; data.size = sizeof(vbuf); + if ((ret = db->put(db, NULL, &key, &data, 0)) != 0) { env->err(env, ret, "load"); return 1; } + } + (void)db->close(db, 0); + printf("# iso=%s nkeys=%u secs=%.1f (per-thread handle on shared bench.db)\n", + argv[1], g_nkeys, secs); + + for (ai = 4; ai < argc; ai++) + run(atoi(argv[ai]), secs); + + env->close(env, 0); + return (0); +} From f6dce0dd01d9da8fd5607724c38a8376adb4e4a8 Mon Sep 17 00:00:00 2001 From: Greg Burd Date: Thu, 18 Jun 2026 18:21:05 -0400 Subject: [PATCH 08/12] test(bench): fix targ_t cache-line alignment in scale_iso --- lab/bench/scale_iso.c | 1 - 1 file changed, 1 deletion(-) diff --git a/lab/bench/scale_iso.c b/lab/bench/scale_iso.c index 3edc2ba78..4605596c2 100644 --- a/lab/bench/scale_iso.c +++ b/lab/bench/scale_iso.c @@ -44,7 +44,6 @@ typedef struct { int tid; DB *db; /* this thread's own handle on the shared file */ DB_TXN *rtxn; /* long-lived read txn (snap), 
else NULL */ - char pad[64]; } targ_t __attribute__((aligned(64))); static double From 04d164396ec69a9037c5afe1b145315948114525 Mon Sep 17 00:00:00 2001 From: Greg Burd Date: Thu, 18 Jun 2026 18:21:32 -0400 Subject: [PATCH 09/12] test(bench): pad targ_t to one cache line in scale_iso --- lab/bench/scale_iso.c | 1 + 1 file changed, 1 insertion(+) diff --git a/lab/bench/scale_iso.c b/lab/bench/scale_iso.c index 4605596c2..5b59fb339 100644 --- a/lab/bench/scale_iso.c +++ b/lab/bench/scale_iso.c @@ -44,6 +44,7 @@ typedef struct { int tid; DB *db; /* this thread's own handle on the shared file */ DB_TXN *rtxn; /* long-lived read txn (snap), else NULL */ + char pad[32]; /* pad to one 64-byte cache line */ } targ_t __attribute__((aligned(64))); static double From 5f942c2eba8c1d82c864c490bd5349e00afc4d72 Mon Sep 17 00:00:00 2001 From: Greg Burd Date: Thu, 18 Jun 2026 18:25:41 -0400 Subject: [PATCH 10/12] docs: isolation-scaling results + BDB cursor-fix design scale_iso (per-thread handle on shared bench.db): full-isolation reads ('none', per-op page read locks) scale identically to uncommitted (668k vs 656k @ 24t, 3x to 8t) -- isolation is NOT the scaling barrier, the shared handle was. Per-op explicit txns ('rc') collapse past 8t (txn/ locker/log machinery = bottleneck #3); long-lived MVCC ('snap') avoids it. Documents the cursor-allocation fix design (sharded queues recommended, needs full run_std; ~+47% prize vs shared-handle path). --- docs/design/scaling-findings.md | 59 +++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/docs/design/scaling-findings.md b/docs/design/scaling-findings.md index 5a5774dd9..a553ed1d0 100644 --- a/docs/design/scaling-findings.md +++ b/docs/design/scaling-findings.md @@ -271,3 +271,62 @@ I/O) is **orthogonal** — `rrand` is 100% cache hits with zero I/O, so it cannot move this curve. The correct next scaling fix is **#1, the per-`get` cursor allocation mutex**, not 1c or #5. 
+## Scaling with transactional isolation (redesigned probe: `scale_iso`) + +`scale_iso` gives each thread its **own handle on the same `bench.db`** +(removing the shared-handle cursor mutex the supported, app-level way) and +reads under a chosen isolation level. meh, 24t, snapshot lib, 200k keys: + +| isolation | 1t | 8t | 16t | 24t | +|--------------------------------------|-----:|------:|------:|------:| +| `none` (per-op auto page read lock) | 239k | 720k | 675k | **668k** | +| `rc` (per-op txn, read-committed) | 129k | 363k | 139k | 119k | +| `snap` (long-lived MVCC txn) | 348k | 460k | 407k | 420k | +| `uncom` (read-uncommitted) | 271k | 722k | 673k | **656k** | + +Findings: + +- **Isolation is not the scaling barrier.** `none` (which *does* take and + release a page read lock per get) scales **identically to `uncom`** + (668k vs 656k at 24t, both ~3x to 8t). You do not need to drop to + uncommitted reads to scale — the shared *handle*, not the read lock, was + the wall. With per-thread handles, full-isolation reads hold throughput + to 24 threads instead of collapsing. +- **Per-operation explicit transactions don't scale**: `rc` + (txn_begin/commit around every get) collapses past 8t (363k->119k). The + per-op transaction machinery — locker allocation, the transaction + region, the commit log record — is a separate serialization point + (bottleneck #3). A *long-lived* per-thread txn (`snap`) avoids it. +- **Snapshot/MVCC** scales but carries version-lookup overhead (peaks + ~460k); useful when read locks must be avoided for isolation reasons. + +So BDB *can* scale reads to all cores under real isolation **if the +application uses a handle per thread**. The remaining job is to make the +*shared-handle* path scale too, in BDB. 
+ +## Designing the BDB-side cursor-allocation fix (bottleneck #1) + +`__db_cursor_int` (alloc) and `__dbc_close` (free) take `dbp->mutex` up to +three times per `get` to move a transient cursor between the handle's +single `free_queue` and `active_queue`. Options: + +1. **Shard the cursor queues + their mutex** (N partitions chosen by + thread, like BDB already shards lock partitions / mpool regions). + Cleanest contention reduction (~Nx), preserves all semantics (cursors + stay tracked for handle-close cleanup). Cost: N extra mutex-region + handles per DB handle (budget concern with many open handles), and the + ~8 queue-walk sites (refresh/close/secondary/stat/partition/join) must + iterate all partitions. Needs full `run_std` before default-on. +2. **Registered per-thread cursor cache** (stash transient cursors on + `DB_THREAD_INFO`): zero shared-mutex traffic on the hot path, but the + cache must be discoverable from `__db_close` to free leftover cursors, + which reintroduces central registration. +3. **Lock-free `free_queue` (Treiber stack)**: removes the free-list + acquisitions but leaves the `active_queue` mutex — only ~1 of 3 locks. + +Recommended: option 1 (sharded queues) as a separate, fully-`run_std`- +qualified PR — it is an invasive cursor-subsystem change and must not be +rushed. The measured prize is the gap between the shared-handle path +(454k @ 24t) and the per-thread-handle path (668k @ 24t): **~+47%**, and +restoring positive scaling across 8-24 threads for the common +shared-handle usage pattern. From 20466e463c687fc6b0f49b54532fab27ac4a903f Mon Sep 17 00:00:00 2001 From: Greg Burd Date: Thu, 18 Jun 2026 18:34:58 -0400 Subject: [PATCH 11/12] docs: remove dev/research scaling notes from docs tree Scaling measurements, profiling, and design exploration are development notes, not user-facing documentation; they do not belong in ./docs (which should track the code). Moved to the agent notes area, which is never committed. 
(The same file remains on master from an earlier PR and should be removed there in a follow-up.) --- docs/design/scaling-findings.md | 332 -------------------------------- 1 file changed, 332 deletions(-) delete mode 100644 docs/design/scaling-findings.md diff --git a/docs/design/scaling-findings.md b/docs/design/scaling-findings.md deleted file mode 100644 index a553ed1d0..000000000 --- a/docs/design/scaling-findings.md +++ /dev/null @@ -1,332 +0,0 @@ -# Empirical multi-core scaling findings - -Measured on **meh** (Linux 6.12, Xeon E5-2697 v2, 12c/24t, single socket) with -`lab/bench/scale_bench.c` against a stock Autoconf build of `master`. All data -fit in a 512 MB cache (no device I/O on the read paths). - -## Throughput vs threads - -Read-random, in-cache point gets on a 200k-key B-tree: - -| threads | ops/sec | scaling vs 1t | -|--------:|--------:|--------------:| -| 1 | 200,010 | 1.0x | -| 2 | 234,004 | 1.2x | -| 4 | 399,883 | 2.0x | -| 8 | **464,661** | 2.3x (peak) | -| 12 | 448,775 | — | -| 16 | 416,366 | — | -| 24 | 408,518 | **negative** | - -Read throughput **peaks at ~8 threads and then declines** — on a 24-thread box -we get ~2.3x, not ~12-24x. - -## Where the time goes (perf, rrand @ 24t, self-time) - -``` -66.70% [kernel] (futex) <- threads blocked in the kernel -35.43% __db_pthread_mutex_lock -31.58% __db_pthread_mutex_unlock -30.55% __lll_lock_wait / 28% __lll_lock_wake -26.33% __memp_fget 22.55% __memp_fput -21.90% __atomic_inc 20.91% __atomic_dec -48.56% __bam_search (B-tree descent; calls __memp_fget per level) -``` - -BDB's own wait counters (`lockpart%`, `mpoolhash%`, region waits) are **near -zero** — the contention does not show up there because it is the **per-page -buffer mutex** (a pthread mutex → futex) and the **page reference count** atomic, -not a lock-region/partition mutex. - -## Root cause - -Every B-tree search descends through the **root and internal pages**. 
Each -`__memp_fget` (a) takes that buffer header's mutex and (b) atomically increments -its pin/reference count; `__memp_fput` reverses it. Because the **root page is -fetched by every operation on every thread**, its mutex and refcount cache line -become a single global serialization point: - -- under the pthread-mutex build, that mutex goes to the kernel → the **66.7% - futex** storm (and negative scaling past the point where futex contention - dominates); -- the refcount atomic inc/dec (~43% combined self-time) ping-pongs one cache - line across all cores. - -## Workload contrasts (confirming the cause) - -- **rhot** (all threads read ONE key): adds lock-manager **partition-mutex** - contention — `lockpart%` rises 13% (4t) → 37.5% (24t) — because every read - takes a *page read lock* on the single hot page. Reads don't conflict - (`conflict%=0`) but acquiring the read lock latches the partition. Page-level - read locking is pure overhead for read-mostly workloads. -- **wrand** (random writes, auto-commit): ~**733 ops/sec single-threaded** — - fsync-per-commit bound — and the **lock region mutex** is heavily contended - (`lockreg_w` in the thousands). Writes need group commit. - -## What this means for the ROADMAP (data-driven re-prioritization) - -1. **#2 latch-free / contention-free buffer-header access is the #1 read-scaling - fix.** The dominant cost is the per-page mutex + refcount on hot (root / - internal) pages. Directions: don't take a kernel-bound mutex to pin a - resident page (optimistic/version-validated reads, LeanStore-style), shard or - bias-lock the pin count, and avoid pinning hot internal pages on the read - path. -2. **#7 cache-line / false-sharing** is the close-second cost (the refcount - atomic). Splitting/aligning the pin counter pairs directly with #2. -3. **#3 group-commit WAL** is the clear write-path win (writes are fsync-bound), - plus the lock-region mutex needs attention under writes. -4. 
**#4 lock manager**: page-level read locks are needless overhead for - read-mostly access (snapshot/SI reads already skip them — see SSI) and are - the hot-key bottleneck. -5. **Lower priority than the ROADMAP assumed on these boxes:** the mpool **hash** - mutex and lock **partitions** are *not* contended here (~0 wait), and both - hosts are single-socket so NUMA placement (part of #1) can't be validated. - The sharded-buffer-pool hash work matters less than the per-page-pin work - until we test on a multi-socket NUMA box. - -**Next target: #2 (+#7).** Prototype a contention-free pin for resident pages -and re-run this sweep to confirm the 8-thread ceiling lifts. - -## Isolation experiments (which shared structure is the cap?) - -Two workloads were added to separate the candidate causes. Numbers below are an -in-cache random-read sweep on a 12-core Apple Silicon laptop (noisier and fewer -cores than `meh`, but the *pattern* matches the 24-core `perf` profile and is -enough to rank the causes). ops/sec: - -| threads | `rrand` (locked, shared db) | `sepdb` (own db/thread) | `snap` (MVCC, no page locks) | -|--------:|----------------------------:|------------------------:|-----------------------------:| -| 1 | ~560k | ~627k | 463k | -| 2 | ~600k | ~693k | 572k | -| 4 | ~480k | ~638k | 631k | -| 8 | ~340k | ~502k | 459k | -| 12 | ~340k | ~417k | 389k | - -- **`sepdb`** gives every thread its own database file, so there is no shared - root/internal page. It is **30–50% faster than `rrand`** at 4–12 threads, - which confirms the **shared hot page is a real bottleneck**. It still declines, - so it is not the *only* one — the threads still share one env (mpool region, - locker table); the `lockers%` signal is the per-operation locker allocation. -- **`snap`** reads in a per-thread `DB_TXN_SNAPSHOT` transaction: MVCC reads take - **no page read locks** and reuse a single locker, removing the entire - lock-manager per-op cost. 
It scales to **4 threads (1.36×)** where `rrand` is - already flat — but it **plateaus at 8–12 threads at the same level as - `rrand`**. The only per-op work `snap` still does that `rrand` also does is - `__memp_fget`/`__memp_fput` (pin/unpin every page on the root→leaf path). - -**Conclusion / ranking (measured, not assumed):** - -1. **#2 buffer-header page pin is the dominant cap.** Even with *all* locking - removed (`snap`), throughput still ceilings at 8–12 threads, because every - read pins the shared root/internal pages through `__memp_fget` (per-page - mutex + atomic refcount). This is the change that can lift the ceiling. -2. **#4 lock manager is a secondary cost** in the 2–8 thread range: page read - locks + per-op locker allocation (`lockers%` 25–62%). Snapshot/lock-free - reads relieve it but do not remove the #2 ceiling. -3. **#3 write path** remains fsync-bound and independent (group commit). - -This is why the prototype order is **#2 first** (contention-free pin for -resident pages), then #4 (cache/reuse lockers, or default read-mostly access to -the lock-free path), then #3 (group commit). - -## Reproduce - -```sh -# on a build host: -cc -O2 -pthread lab/bench/scale_bench.c -Ibuild_unix -Lbuild_unix/.libs -ldb-5.3 -o scale_bench -LD_LIBRARY_PATH=build_unix/.libs ./scale_bench rrand 200000 3 1 2 4 8 12 16 24 -# workloads: rrand | rhot | wrand | sepdb | snap -# rrand shared db, locked reads (baseline) -# sepdb one db file per thread (isolates shared-page contention) -# snap per-thread MVCC snapshot txn (isolates lock-manager cost) -``` - -## Measured result: lock-free root snapshot (Stage 1b) - -A/B of `master` vs the root-snapshot build (`perf/swip-stage1-descent`), -`scale_bench rrand`, 200 000 keys, 3 s, 3-sample medians, on `meh` -(Xeon E5-2697 v2, 12c/24t, single socket). 
Run on tmpfs (`/dev/shm`): -the read working set is cache-resident, so this measures CPU/lock -scaling, not disk — and it avoids meh's nvme-over-fabrics `/scratch`, -which stalls in `submit_bio_wait` on the 536 MB cache-region write (the -cause of every earlier "stuck" run). - -| threads | master ops/s | snapshot ops/s | Δ | -|--------:|-------------:|---------------:|:------:| -| 1 | 199 176 | 210 942 | +5.9% | -| 4 | 410 883 | 529 169 | +28.8% | -| 8 | 459 698 | 560 587 | +21.9% | -| 12 | 448 220 | 511 045 | +14.0% | -| 16 | 428 569 | 491 702 | +14.7% | -| 24 | 419 952 | 444 471 | +5.8% | - -Correctness: TCL test001 (btree+hash), test003, test011 (dups), test026, -a 50 000-op logged-env integrity check, and a concurrent stress (24 -readers verifying a hot set while 6 writers churned the tree's structure, -0 reader mismatches) all pass. - -### Conclusion — is multicore scalability addressed? - -**Partially. A real, measured win, but not a solution.** - -- The snapshot is faster at *every* thread count, biggest in the - contended midrange (+22–29% at 4–8 threads). This confirms the - per-operation pin/latch of the contended root page was a genuine - bottleneck, and removing it from read descents helps materially. -- **But both** master and snapshot still **peak at ~8 threads and then - negatively scale** to 24 (snapshot 560 k @ 8 → 444 k @ 24). The - snapshot **raises the ceiling (~+22%) without removing it.** -- At 24 threads the contention signal has moved: `lockpart%≈0.1` (random - reads spread across lock partitions fine) but **`lockers%`=51–67%** — - the lock manager's **locker region** is now the dominant serialization - point (ROADMAP #4), not the buffer pool. - -So Stage 1b should land on its merits (correct, measured, monotonic -improvement), but the multicore read ceiling past ~8 cores is now bounded -by the lock-manager locker region, which is the next target. Stage 0/0.5 -(landed) addressed a different axis (scan resistance, ~10.7×). 
Stage 2 -(AIO) is I/O-throughput infrastructure (prefetch/async writeback) and does -not affect this cache-resident read-scaling curve. - -## Profiling the >8-core slide (post-snapshot) and a benchmark critique - -`perf` on meh (24t), snapshot lib, `rrand` 200k in tmpfs. The question: -*why* does aggregate throughput peak at ~8 threads and decline, and is the -benchmark itself fit to drive past 8 cores? - -### Where the time goes (t=24, shared handle) - -Flat profile: **40.5%** of all samples are in `lll_lock_wait` (the kernel -futex) under `__db_pthread_mutex_lock`, split almost evenly: - -``` -18.25% __db_cursor_int (cursor allocate) <- __db_get -18.10% __dbc_close (cursor free) <- __db_get -``` - -`DB->get` allocates a transient cursor and frees it **per operation**; -both link/unlink it into the *one shared handle's* active-cursor queue -under `dbp->mutex`. With every thread sharing a single `db` handle, that -one mutex serializes every `get`. This is the dominant bottleneck once -the root snapshot removes the buffer-pool root pin. - -### Proof: per-thread handles (`sepdb`) remove it - -| threads | rrand (shared handle) | sepdb (per-thread handles) | -|--------:|----------------------:|---------------------------:| -| 1 | 223 851 | 249 302 | -| 8 | 511 698 | **749 896** (3.0×) | -| 16 | 471 626 | 666 188 | -| 24 | 454 073 | **675 996 (+49%)** | - -Per-thread handles scale near-linearly to 8 and run **+49% faster at 24 -threads**. So the negative scaling is largely the shared-handle cursor -mutex — a real BDB serialization, but one the benchmark triggers by the -(common) choice of sharing one handle across all threads. 
- -### The next bottleneck underneath (sepdb, t=24) - -With the cursor mutex gone, the profile is dominated by -`__memp_fget`/`__memp_fput` — the **mpool hash-bucket shared latch -(`__db_pthread_mutex_lock`) + `__atomic_inc/dec` on the page refcount**, -paid on every page of the descent (root via `__bam_get_root`, -internal/leaf via `__bam_search`, unpin via `__dbc_cleanup`). The root -snapshot removed only the *root* fetch (1 of ~3 per descent); the -internal and leaf pins remain. - -### Is the benchmark fit to drive >8 cores? - -- **Metric is sound** (aggregate `total/dur`); the decline is real. -- **It induces lock-manager traffic**: the env is `DB_INIT_LOCK|DB_INIT_TXN` - and every `get(txn=NULL)` takes a page read lock + a transient locker. - A read-only probe should *also* measure `DB_READ_UNCOMMITTED` / a - non-locking config to separate buffer-pool from lock-manager scaling. -- **Latent false-sharing**: `targ_t ta[256]` is 16 B (4 per cache line), - so adjacent threads' `ops++` share a line — but it does **not** appear - in the profile (the BDB mutexes dwarf it), so it is a code smell, not a - measured contributor. Pad it anyway. -- **Hardware**: meh is **12 physical cores / 24 HT**. Aggregate peaking - at 8 (< 12) means software contention bites *before* the core limit; - the 12→24 tail is additionally bounded by hyperthreading and the - all-core turbo drop (3.5 GHz single → ~3.0 GHz all-core). Even a - perfect read path would scale ~linearly only to ~12, then flatten. - -### Bottleneck ranking (measured), and what it means for the roadmap - -1. **Shared-handle cursor alloc/free mutex** (`__db_cursor_int` + - `__dbc_close` → `dbp->mutex`) — dominant (40% @ 24t). **Not blocked**; - fix = a per-handle cursor cache / lock-free active-cursor list / - per-thread cursor reuse. Biggest tractable win (+49% measured). -2. 
**mpool pin/unpin atomics + hash-bucket latch** (`__memp_fget`/`fput`) - — the true read ceiling once #1 is gone; needs optimistic/seqlock - buffer access **+ epoch reclamation** (the same prerequisite that - blocks Stage 1c). -3. **Lock-manager read locks** (transient locker + lock object per get; - `lockers%`). -4. **Hardware** (12 cores, HT, all-core turbo, memory bandwidth). - -**Implication:** Stage 1c (lock-free *deeper-internal* descent) only -partially addresses #2 and is blocked on reclamation; Stage 2 #5 (async -I/O) is **orthogonal** — `rrand` is 100% cache hits with zero I/O, so it -cannot move this curve. The correct next scaling fix is **#1, the -per-`get` cursor allocation mutex**, not 1c or #5. - -## Scaling with transactional isolation (redesigned probe: `scale_iso`) - -`scale_iso` gives each thread its **own handle on the same `bench.db`** -(removing the shared-handle cursor mutex the supported, app-level way) and -reads under a chosen isolation level. meh, 24t, snapshot lib, 200k keys: - -| isolation | 1t | 8t | 16t | 24t | -|--------------------------------------|-----:|------:|------:|------:| -| `none` (per-op auto page read lock) | 239k | 720k | 675k | **668k** | -| `rc` (per-op txn, read-committed) | 129k | 363k | 139k | 119k | -| `snap` (long-lived MVCC txn) | 348k | 460k | 407k | 420k | -| `uncom` (read-uncommitted) | 271k | 722k | 673k | **656k** | - -Findings: - -- **Isolation is not the scaling barrier.** `none` (which *does* take and - release a page read lock per get) scales **identically to `uncom`** - (668k vs 656k at 24t, both ~3x to 8t). You do not need to drop to - uncommitted reads to scale — the shared *handle*, not the read lock, was - the wall. With per-thread handles, full-isolation reads hold throughput - to 24 threads instead of collapsing. -- **Per-operation explicit transactions don't scale**: `rc` - (txn_begin/commit around every get) collapses past 8t (363k->119k). 
The - per-op transaction machinery — locker allocation, the transaction - region, the commit log record — is a separate serialization point - (bottleneck #3). A *long-lived* per-thread txn (`snap`) avoids it. -- **Snapshot/MVCC** scales but carries version-lookup overhead (peaks - ~460k); useful when read locks must be avoided for isolation reasons. - -So BDB *can* scale reads to all cores under real isolation **if the -application uses a handle per thread**. The remaining job is to make the -*shared-handle* path scale too, in BDB. - -## Designing the BDB-side cursor-allocation fix (bottleneck #1) - -`__db_cursor_int` (alloc) and `__dbc_close` (free) take `dbp->mutex` up to -three times per `get` to move a transient cursor between the handle's -single `free_queue` and `active_queue`. Options: - -1. **Shard the cursor queues + their mutex** (N partitions chosen by - thread, like BDB already shards lock partitions / mpool regions). - Cleanest contention reduction (~Nx), preserves all semantics (cursors - stay tracked for handle-close cleanup). Cost: N extra mutex-region - handles per DB handle (budget concern with many open handles), and the - ~8 queue-walk sites (refresh/close/secondary/stat/partition/join) must - iterate all partitions. Needs full `run_std` before default-on. -2. **Registered per-thread cursor cache** (stash transient cursors on - `DB_THREAD_INFO`): zero shared-mutex traffic on the hot path, but the - cache must be discoverable from `__db_close` to free leftover cursors, - which reintroduces central registration. -3. **Lock-free `free_queue` (Treiber stack)**: removes the free-list - acquisitions but leaves the `active_queue` mutex — only ~1 of 3 locks. - -Recommended: option 1 (sharded queues) as a separate, fully-`run_std`- -qualified PR — it is an invasive cursor-subsystem change and must not be -rushed. 
The measured prize is the gap between the shared-handle path -(454k @ 24t) and the per-thread-handle path (668k @ 24t): **~+47%**, and -restoring positive scaling across 8-24 threads for the common -shared-handle usage pattern. From bb2e78eb094e7b46b2482e8cbade78c4c65e7ff4 Mon Sep 17 00:00:00 2001 From: Greg Burd Date: Fri, 19 Jun 2026 12:03:59 -0400 Subject: [PATCH 12/12] perf(btree): fix root-snapshot use-after-free when wiring is skipped The Stage 1b root-snapshot fast path cached the live-root buffer address (bt_rootpage) and read its LSN lock-free during descent. __memp_wire, however, silently did nothing when the per-region wired cap was reached (or on mmap'd pages) yet returned 0 in every case, so the caller could not tell whether the frame was actually wired. __bam_rsnap_refresh cached the frame unconditionally; an un-wired frame is evictable and its address can dangle, so __bam_rsnap_child could read LSN() from a freed/reused buffer. - __memp_wire: add a wiredp out-param reporting whether the frame is wired on return (newly wired or already wired); document that callers caching the address for lock-free reads must check it. - __bam_rsnap_refresh: only build the snapshot and cache bt_rootpage when wiring took; otherwise leave them NULL so the descent falls back to the normal pinned path. - bt_compact: when compaction moves the root to a new page, disarm the cached snapshot (bt_rootpage = NULL) under the handle mutex so a stale frame pointer is never followed. - Fix the cap arithmetic (pages * PCT / 100) so it does not truncate to zero for caches smaller than 100 buffers. Validated: clean build (debug + release); TCL lock001, txn001, test001, ssi001, ssi002, test003, test011, test026, test111 (compaction, incl. -revsplitoff) all pass. 
--- src/btree/bt_compact.c | 11 +++++++++++ src/btree/bt_search.c | 23 ++++++++++++++++------- src/dbinc_auto/mp_ext.h | 2 +- src/mp/mp_fput.c | 34 ++++++++++++++++++++++++++-------- 4 files changed, 54 insertions(+), 16 deletions(-) diff --git a/src/btree/bt_compact.c b/src/btree/bt_compact.c index ac91e6d0c..1b01d032a 100644 --- a/src/btree/bt_compact.c +++ b/src/btree/bt_compact.c @@ -2635,6 +2635,17 @@ again: if (F_ISSET(dbp, DB_AM_SUBDB) && LSN_NOT_LOGGED(LSN(meta)); bt->bt_root = meta->root = PGNO(root); bt->revision = dbp->mpf->mfp->revision; + /* + * The tree root moved to a new page. Any cached + * root-snapshot for this handle now points at the old + * (soon-to-be-freed) root frame, so disarm it under the + * handle mutex; the next read descent rebuilds it. + * Clearing bt_rootpage is sufficient: __bam_rsnap_child + * bails when it is NULL, before touching bt_rsnap. + */ + MUTEX_LOCK(dbp->env, dbp->mutex); + bt->bt_rootpage = NULL; + MUTEX_UNLOCK(dbp->env, dbp->mutex); if ((ret = __memp_fput(dbp->mpf, ip, root, dbp->priority)) != 0) goto err; diff --git a/src/btree/bt_search.c b/src/btree/bt_search.c index c2178afd0..499d5bf2d 100644 --- a/src/btree/bt_search.c +++ b/src/btree/bt_search.c @@ -74,7 +74,7 @@ __bam_rsnap_refresh(dbc) PAGE *h; db_pgno_t root_pgno; u_int32_t psize; - int ret, t_ret; + int ret, t_ret, wired; dbp = dbc->dbp; env = dbp->env; @@ -87,12 +87,20 @@ __bam_rsnap_refresh(dbc) if ((ret = __memp_fget(mpf, &root_pgno, dbc->thread_info, dbc->txn, 0, &h)) != 0) return (ret); - /* Wire the root so the cached buffer address stays valid/resident. */ - (void)__memp_wire(mpf, h); + /* + * Wire the root so the cached buffer address stays resident: only then + * may we keep a pointer to the frame for later lock-free LSN reads. 
If + * wiring did not take (mmap'd page, or the per-region wired cap was + * reached), we must not cache the frame -- it is evictable and the + * pointer could dangle -- so we disarm the fast path for this handle + * (bt_rootpage/bt_rsnap NULL) and fall back to the normal descent. + */ + wired = 0; + (void)__memp_wire(mpf, h, &wired); lsn = LSN(h); psize = dbp->pgsize; snap = NULL; - if (TYPE(h) == P_IBTREE && psize != 0 && + if (wired && TYPE(h) == P_IBTREE && psize != 0 && (ret = __os_malloc(env, sizeof(BAM_RSNAP) + psize, &snap)) == 0) { snap->next = NULL; snap->lsn = lsn; @@ -101,13 +109,14 @@ __bam_rsnap_refresh(dbc) } MUTEX_LOCK(env, dbp->mutex); - t->bt_rootpage = h; /* cached wired buffer (stays resident) */ + /* Cache the wired live-root buffer; NULL if it could not be wired. */ + t->bt_rootpage = wired ? h : NULL; /* Retire the previously-current copy to the free list. */ if (t->bt_rsnap != NULL) { ((BAM_RSNAP *)t->bt_rsnap)->next = t->bt_rsnap_free; t->bt_rsnap_free = t->bt_rsnap; } - t->bt_rsnap = snap; /* NULL if the root is a leaf */ + t->bt_rsnap = snap; /* NULL if not wired or the root is a leaf */ t->bt_rsnap_lsn = lsn; MUTEX_UNLOCK(env, dbp->mutex); @@ -268,7 +277,7 @@ retry: if (lock_mode == DB_LOCK_WRITE) * Unwired when the page is freed (__db_free) or the file closes. */ if (h->pgno == BAM_ROOT_PGNO(dbc)) - (void)__memp_wire(mpf, h); + (void)__memp_wire(mpf, h, NULL); /* * Decide if we need to dirty and/or lock this page. 
diff --git a/src/dbinc_auto/mp_ext.h b/src/dbinc_auto/mp_ext.h index 874243151..8df869c35 100644 --- a/src/dbinc_auto/mp_ext.h +++ b/src/dbinc_auto/mp_ext.h @@ -42,7 +42,7 @@ int __memp_mf_discard __P((DB_MPOOL *, MPOOLFILE *, int)); int __memp_inmemlist __P((ENV *, char ***, int *)); int __memp_fput_pp __P((DB_MPOOLFILE *, void *, DB_CACHE_PRIORITY, u_int32_t)); int __memp_fput __P((DB_MPOOLFILE *, DB_THREAD_INFO *, void *, DB_CACHE_PRIORITY)); -int __memp_wire __P((DB_MPOOLFILE *, void *)); +int __memp_wire __P((DB_MPOOLFILE *, void *, int *)); int __memp_unwire __P((DB_MPOOLFILE *, void *)); int __memp_unpin_buffers __P((ENV *, DB_THREAD_INFO *)); int __memp_dirty __P((DB_MPOOLFILE *, void *, DB_THREAD_INFO *, DB_TXN *, DB_CACHE_PRIORITY, u_int32_t)); diff --git a/src/mp/mp_fput.c b/src/mp/mp_fput.c index 92dbfd17f..966ec7494 100644 --- a/src/mp/mp_fput.c +++ b/src/mp/mp_fput.c @@ -295,23 +295,34 @@ __memp_unpin_buffers(env, ip) * while the caller holds only a shared buffer latch; the byte is reset to * 0 wherever a buffer header is (re)initialized. * - * PUBLIC: int __memp_wire __P((DB_MPOOLFILE *, void *)); + * If wiredp is non-NULL it is set to 1 iff the frame is wired (and so + * guaranteed resident) on return -- either newly wired here or already + * wired -- and 0 otherwise (an mmap'd page, or the per-region cap was + * reached). Callers that cache the frame address for later lock-free + * reads MUST only do so when *wiredp is 1; a non-wired frame is evictable + * and its address may dangle. + * + * PUBLIC: int __memp_wire __P((DB_MPOOLFILE *, void *, int *)); */ int -__memp_wire(dbmfp, pgaddr) +__memp_wire(dbmfp, pgaddr, wiredp) DB_MPOOLFILE *dbmfp; void *pgaddr; + int *wiredp; { BH *bhp; DB_MPOOL *dbmp; ENV *env; MPOOL *c_mp; + if (wiredp != NULL) + *wiredp = 0; + /* * A memory-mapped (read-only) file hands back a pointer into the mmap * region, not a buffer frame, so the BH back-computation below would be * a wild pointer. 
Such pages are never in the buffer pool and never - * evicted, so there is nothing to wire. + * evicted, so there is nothing to wire (and nothing that can dangle). */ if (dbmfp->addr != NULL && pgaddr >= dbmfp->addr && (u_int8_t *)pgaddr <= @@ -319,24 +330,31 @@ __memp_wire(dbmfp, pgaddr) return (0); bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf)); - if (bhp->wired != 0) + if (bhp->wired != 0) { + if (wiredp != NULL) + *wiredp = 1; return (0); + } /* * Cap wiring at MPOOL_WIRED_MAX_PCT of the region's buffers so wiring - * can never starve the cache. Over the cap this is a no-op and the - * descent uses a normal pin. The count is approximate under races, - * which is fine for a cap. + * can never starve the cache. Over the cap this is a no-op (the frame + * stays evictable and the descent uses a normal pin). The count is + * approximate under races, which is fine for a cap. Compute the limit + * as (pages * PCT) / 100 so it does not round down to zero for caches + * smaller than 100 buffers. */ env = dbmfp->env; dbmp = env->mp_handle; c_mp = dbmp->reginfo[bhp->region].primary; if (atomic_read(&c_mp->wired_pages) >= - c_mp->pages / 100 * MPOOL_WIRED_MAX_PCT) + c_mp->pages * MPOOL_WIRED_MAX_PCT / 100) return (0); bhp->wired = 1; (void)atomic_inc(env, &c_mp->wired_pages); + if (wiredp != NULL) + *wiredp = 1; return (0); }