From 38e1f22450f122d04bebb00b79dc34d0af8d1400 Mon Sep 17 00:00:00 2001 From: Greg Burd Date: Wed, 17 Jun 2026 10:11:52 -0400 Subject: [PATCH 1/4] perf(mpool): Stage 0 - CLOCK/second-chance replacement (write-free hot reads) Replace the timestamp LRU with a CLOCK / second-chance policy so the read path no longer writes shared memory on hot-page access. Before: every __memp_fput wrote bhp->priority = c_mp->lru_priority and advanced the shared c_mp->lru_priority counter; on wraparound __memp_reset_lru swept the whole cache. Two shared writes per put + an O(cache) sweep. After: bhp->priority is a small saturating warmth counter [0, MPOOL_CLOCK_MAX]. A put refills warmth read-first (store only when colder than the target chosen from the access priority hint), so an already-warm buffer's put performs no store. The __memp_alloc hand decrements warmth as it sweeps and frees a buffer at warmth 0 (second chance); when aggressive it frees the coldest it can find. This is also scan-resistant: bulk-scanned pages refill to low warmth and age out before the hot working set. Deleted the global lru_priority increment and __memp_reset_lru; mapped the two MPOOL_LRU_REDZONE deprioritize stores and the revive/thaw priority sets onto the warmth scale. Eviction safety handshake (refcount==0 + exclusive mtx_buf, dirty write, MVCC freeze) is unchanged; only victim recency selection changed. Validated: clean build; forced-eviction integrity test (50k records through a 1MB cache -> thousands of CLOCK evictions, all read back byte-correct, + 200k random gets, no wedge/ENOMEM). GATED before merge on the full TCL regression and on meh measurement (eviction quality, scan resistance, no read regression). 
--- src/dbinc/mp.h | 23 ++++++++ src/mp/mp_alloc.c | 38 ++++++++++---- src/mp/mp_fget.c | 4 +- src/mp/mp_fput.c | 131 +++++++++------------------------------------- src/mp/mp_mvcc.c | 2 +- 5 files changed, 78 insertions(+), 120 deletions(-) diff --git a/src/dbinc/mp.h b/src/dbinc/mp.h index e00055feb..717427810 100644 --- a/src/dbinc/mp.h +++ b/src/dbinc/mp.h @@ -388,6 +388,29 @@ struct __db_mpool_fstat_int { /* SHARED */ #define MPOOL_PRI_DIRTY 10 /* Dirty gets a 10% boost. */ #define MPOOL_PRI_VERY_HIGH 1 /* Add number of buffers in pool. */ +/* + * CLOCK / second-chance buffer "warmth" (scalability Stage 0). + * + * bhp->priority is reused as a small saturating warmth counter in the range + * [0, MPOOL_CLOCK_MAX]. A buffer access refills the warmth toward a level + * chosen from the access priority hint -- done read-first, so an already-warm + * (hot) buffer is not written at all, keeping the read path free of shared + * stores. The eviction hand (__memp_alloc) decrements warmth as it sweeps and + * evicts a buffer once its warmth reaches 0 (second chance). + * + * This replaces the previous timestamp LRU, which wrote bhp->priority and + * advanced a shared c_mp->lru_priority counter on every __memp_fput, and swept + * the whole cache in __memp_reset_lru on wraparound. It is also scan-resistant: + * bulk-scanned pages refill to a low warmth and age out before the hot set. + */ +#define MPOOL_CLOCK_MAX 4 /* Sticky-hot ceiling. */ +#define MPOOL_CLOCK_VERY_LOW 0 +#define MPOOL_CLOCK_LOW 1 +#define MPOOL_CLOCK_DEFAULT 2 +#define MPOOL_CLOCK_HIGH 3 +#define MPOOL_CLOCK_VERY_HIGH MPOOL_CLOCK_MAX +#define MPOOL_CLOCK_DIRTY_BOOST 1 /* Dirty pages get +1 warmth. */ + /* * MPOOLFILE -- * Shared DB_MPOOLFILE information. 
diff --git a/src/mp/mp_alloc.c b/src/mp/mp_alloc.c index f7b937afa..0743badce 100644 --- a/src/mp/mp_alloc.c +++ b/src/mp/mp_alloc.c @@ -45,7 +45,7 @@ __memp_alloc(dbmp, infop, mfp, len, offsetp, retp) MPOOL *c_mp; MPOOLFILE *bh_mfp; size_t freed_space; - u_int32_t buckets, bucket_priority, buffers, cache_reduction; + u_int32_t buckets, bucket_priority, buffers; u_int32_t dirty_eviction, high_priority, priority, versions; u_int32_t priority_saved, put_counter, lru_generation, total_buckets; int aggressive, alloc_freeze, b_lock, giveup; @@ -149,12 +149,12 @@ found: if (offsetp != NULL) search: /* - * Anything newer than 1/10th of the buffer pool is ignored during the - * first MPOOL_SEARCH_ALLOC_LIMIT buckets worth of allocation. + * CLOCK replacement: consider buffers of any warmth. The hash-bucket + * scan below ages (decrements) each unreferenced singleton it passes and + * frees one whose warmth has reached 0 (second chance); when aggressive, + * it frees the coldest buffer it can find regardless of warmth. */ - cache_reduction = c_mp->pages / 10; - high_priority = aggressive ? MPOOL_LRU_MAX : - c_mp->lru_priority - cache_reduction; + high_priority = MPOOL_CLOCK_MAX + 1; lru_generation = c_mp->lru_generation; ret = 0; @@ -228,7 +228,7 @@ found: if (offsetp != NULL) * this to MPOOL_LRU_MAX, we'll still select a victim * even if all buffers have the highest normal priority. */ - high_priority = MPOOL_LRU_MAX; + high_priority = MPOOL_CLOCK_MAX + 1; PERFMON4(env, mpool, alloc_wrap, len, infop->id, aggressive, c_mp->put_counter); switch (aggressive) { @@ -270,7 +270,7 @@ found: if (offsetp != NULL) if (aggressive == 0 && buckets >= MPOOL_ALLOC_SEARCH_LIMIT) { aggressive = 1; /* Once aggressive, we consider all buffers. */ - high_priority = MPOOL_LRU_MAX; + high_priority = MPOOL_CLOCK_MAX + 1; } /* Unlock the region and lock the hash bucket. 
*/ @@ -318,11 +318,27 @@ retry_search: bhp = NULL; #endif if (SH_CHAIN_SINGLETON(current_bhp, vc)) { + u_int32_t warmth; + if (BH_REFCOUNT(current_bhp) != 0) continue; buffers++; - if (bucket_priority > current_bhp->priority) { - bucket_priority = current_bhp->priority; + warmth = current_bhp->priority; + /* + * Second chance: a warm buffer is aged (warmth + * decremented) and skipped; only a buffer whose + * warmth has reached 0 is a victim -- unless we + * are aggressive, in which case the coldest + * buffer we can find will do. The warmth store + * races benignly with concurrent puts/scans, the + * same tolerance the old LRU priority had. + */ + if (warmth != 0 && !aggressive) { + current_bhp->priority = warmth - 1; + continue; + } + if (bucket_priority > warmth) { + bucket_priority = warmth; if (bhp != NULL) atomic_dec(env, &bhp->ref); bhp = current_bhp; @@ -525,7 +541,7 @@ retry_search: bhp = NULL; __memp_fns(dbmp, bh_mfp), bhp->pgno, ret); } - bhp->priority = MPOOL_LRU_REDZONE; + bhp->priority = MPOOL_CLOCK_MAX; goto next_hb; } diff --git a/src/mp/mp_fget.c b/src/mp/mp_fget.c index 16de69515..3045f2fb7 100644 --- a/src/mp/mp_fget.c +++ b/src/mp/mp_fget.c @@ -789,7 +789,7 @@ reuse: if ((makecopy || F_ISSET(bhp, BH_FROZEN)) && * * Append the buffer to the tail of the bucket list. 
*/ - bhp->priority = MPOOL_LRU_REDZONE; + bhp->priority = MPOOL_CLOCK_MAX; bhp->pgno = *pgnoaddr; bhp->mf_offset = mf_offset; bhp->bucket = bucket; @@ -1013,7 +1013,7 @@ reuse: if ((makecopy || F_ISSET(bhp, BH_FROZEN)) && h_locked = 0; DB_ASSERT(env, b_incr && BH_REFCOUNT(bhp) > 0); if (atomic_dec(env, &bhp->ref) == 0) { - bhp->priority = c_mp->lru_priority; + bhp->priority = MPOOL_CLOCK_DEFAULT; MVCC_MPROTECT(bhp->buf, mfp->pagesize, 0); } F_CLR(bhp, BH_EXCLUSIVE); diff --git a/src/mp/mp_fput.c b/src/mp/mp_fput.c index 8cb4e837b..3f0129b1a 100644 --- a/src/mp/mp_fput.c +++ b/src/mp/mp_fput.c @@ -12,7 +12,6 @@ #include "dbinc/log.h" #include "dbinc/mp.h" -static int __memp_reset_lru __P((ENV *, REGINFO *)); /* * __memp_fput_pp -- @@ -75,7 +74,8 @@ __memp_fput(dbmfp, ip, pgaddr, priority) REGINFO *infop, *reginfo; roff_t b_ref; int region; - int adjust, pfactor, ret, t_ret; + int ret; + u_int32_t warmth; char buf[DB_THREADID_STRLEN]; env = dbmfp->env; @@ -202,55 +202,43 @@ __memp_fput(dbmfp, ip, pgaddr, priority) MUTEX_UNLOCK(env, hp->mtx_hash); #endif - /* Update priority values. */ + /* + * CLOCK warmth refill. Choose a target warmth from the access priority + * hint and refill read-first: only store when the buffer is colder than + * the target, so a hot (already-warm) buffer's put performs no shared + * store. The eviction hand ages warmth and frees at 0. We don't lock + * the warmth; a benign race only mis-warms a buffer briefly. + */ if (priority == DB_PRIORITY_VERY_LOW || mfp->priority == MPOOL_PRI_VERY_LOW) - bhp->priority = 0; + warmth = MPOOL_CLOCK_VERY_LOW; else { - /* - * We don't lock the LRU priority or the pages field, if - * we get garbage (which won't happen on a 32-bit machine), it - * only means a buffer has the wrong priority. 
- */ - bhp->priority = c_mp->lru_priority; - switch (priority) { - default: - case DB_PRIORITY_UNCHANGED: - pfactor = mfp->priority; - break; case DB_PRIORITY_VERY_LOW: - pfactor = MPOOL_PRI_VERY_LOW; + warmth = MPOOL_CLOCK_VERY_LOW; break; case DB_PRIORITY_LOW: - pfactor = MPOOL_PRI_LOW; - break; - case DB_PRIORITY_DEFAULT: - pfactor = MPOOL_PRI_DEFAULT; + warmth = MPOOL_CLOCK_LOW; break; case DB_PRIORITY_HIGH: - pfactor = MPOOL_PRI_HIGH; + warmth = MPOOL_CLOCK_HIGH; break; case DB_PRIORITY_VERY_HIGH: - pfactor = MPOOL_PRI_VERY_HIGH; + warmth = MPOOL_CLOCK_VERY_HIGH; + break; + default: + case DB_PRIORITY_UNCHANGED: + case DB_PRIORITY_DEFAULT: + warmth = (mfp->priority == MPOOL_PRI_LOW) ? + MPOOL_CLOCK_LOW : MPOOL_CLOCK_DEFAULT; break; } - - adjust = 0; - if (pfactor != 0) - adjust = (int)c_mp->pages / pfactor; - - if (F_ISSET(bhp, BH_DIRTY)) - adjust += (int)c_mp->pages / MPOOL_PRI_DIRTY; - - if (adjust > 0) { - if (MPOOL_LRU_REDZONE - bhp->priority >= - (u_int32_t)adjust) - bhp->priority += adjust; - } else if (adjust < 0) - if (bhp->priority > (u_int32_t)-adjust) - bhp->priority += adjust; + if (F_ISSET(bhp, BH_DIRTY) && + warmth <= MPOOL_CLOCK_MAX - MPOOL_CLOCK_DIRTY_BOOST) + warmth += MPOOL_CLOCK_DIRTY_BOOST; } + if (bhp->priority < warmth) + bhp->priority = warmth; /* * __memp_pgwrite only has a shared lock while it clears the @@ -261,78 +249,9 @@ __memp_fput(dbmfp, ip, pgaddr, priority) F_CLR(bhp, BH_EXCLUSIVE); MUTEX_UNLOCK(env, bhp->mtx_buf); - /* - * On every buffer put we update the cache lru priority and check - * for wraparound. The increment doesn't need to be atomic: occasional - * lost increments are okay; __memp_reset_lru handles race conditions. - */ - if (++c_mp->lru_priority >= MPOOL_LRU_REDZONE && - (t_ret = __memp_reset_lru(env, infop)) != 0 && ret == 0) - ret = t_ret; - return (ret); } -/* - * __memp_reset_lru -- - * Reset the cache LRU priority when it reaches the upper limit. 
- */ -static int -__memp_reset_lru(env, infop) - ENV *env; - REGINFO *infop; -{ - BH *bhp, *tbhp; - DB_MPOOL_HASH *hp; - MPOOL *c_mp; - u_int32_t bucket; - int reset; - - /* - * Update the priority so all future allocations will start at the - * bottom. Lock this cache region to ensure that exactly one thread - * will reset this cache's buffers. - */ - c_mp = infop->primary; - MPOOL_REGION_LOCK(env, infop); - reset = c_mp->lru_priority >= MPOOL_LRU_DECREMENT; - if (reset) { - c_mp->lru_priority -= MPOOL_LRU_DECREMENT; - c_mp->lru_generation++; - } - MPOOL_REGION_UNLOCK(env, infop); - - if (!reset) - return (0); - - /* Reduce the priority of every buffer in this cache region. */ - for (hp = R_ADDR(infop, c_mp->htab), - bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) { - /* - * Skip empty buckets. - * - * We can check for empty buckets before locking as we - * only care if the pointer is zero or non-zero. - */ - if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL) - continue; - - MUTEX_LOCK(env, hp->mtx_hash); - SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh) { - for (tbhp = bhp; tbhp != NULL; - tbhp = SH_CHAIN_PREV(tbhp, vc, __bh)) { - if (tbhp->priority > MPOOL_LRU_DECREMENT) - tbhp->priority -= MPOOL_LRU_DECREMENT; - else - tbhp->priority = 0; - } - } - MUTEX_UNLOCK(env, hp->mtx_hash); - } - - COMPQUIET(env, NULL); - return (0); -} /* * __memp_unpin_buffers -- diff --git a/src/mp/mp_mvcc.c b/src/mp/mp_mvcc.c index 770bad813..d578ccb8b 100644 --- a/src/mp/mp_mvcc.c +++ b/src/mp/mp_mvcc.c @@ -565,7 +565,7 @@ __memp_bh_thaw(dbmp, infop, hp, frozen_bhp, alloc_bhp) */ MUTEX_REQUIRED(env, hp->mtx_hash); if (alloc_bhp != NULL) { - alloc_bhp->priority = c_mp->lru_priority; + alloc_bhp->priority = MPOOL_CLOCK_DEFAULT; SH_CHAIN_INSERT_AFTER(frozen_bhp, alloc_bhp, vc, __bh); if (!SH_CHAIN_HASNEXT(alloc_bhp, vc)) { From 86729a506baf76a7bd54e43aadfb210f3b757683 Mon Sep 17 00:00:00 2001 From: Greg Burd Date: Wed, 17 Jun 2026 10:49:01 -0400 Subject: [PATCH 2/4] 
perf(mpool): remove dead timestamp-LRU machinery; fix db_stat output Follow-up cleanup so the code matches the CLOCK replacement: - Remove the now-unused MPOOL fields lru_priority/lru_generation, the MPOOL_LRU_MAX/REDZONE/BASE/DECREMENT macros, the dead MPOOL_ALLOC_SEARCH_DYN blocks, and the obsolete lru_generation victim-retry (reset_lru is gone). - db_stat -m: drop the 'Hash table LRU priority/generation' dump lines (those reported timestamp-LRU scalars that have no CLOCK equivalent -- there is no global counter or reset sweep by design). The CLOCK hand is still reported as 'Hash table last-checked', per-buffer warmth is still dumped per-BH (column relabeled priority->warmth), and eviction effectiveness remains in the unchanged aggregate stats (cache hit/miss, clean/dirty evictions). - Update stale comments referencing LRU priority / MPOOL_LRU_MAX. Rebuilt clean; forced-eviction integrity test + TCL test001 green. --- src/dbinc/mp.h | 17 ++--------------- src/mp/mp_alloc.c | 36 +++++++----------------------------- src/mp/mp_stat.c | 4 +--- 3 files changed, 10 insertions(+), 47 deletions(-) diff --git a/src/dbinc/mp.h b/src/dbinc/mp.h index 717427810..99d75a9b3 100644 --- a/src/dbinc/mp.h +++ b/src/dbinc/mp.h @@ -179,14 +179,12 @@ struct __mpool { /* SHARED */ * The htab and htab_buckets fields are not thread protected as they * are initialized during mpool creation, and not modified again. * - * The last_checked, lru_priority, and lru_generation fields are thread - * protected by the region lock. + * The last_checked field (the CLOCK eviction hand) is thread protected + * by the region lock. */ roff_t htab; /* Hash table offset. */ u_int32_t htab_buckets; /* Number of hash table entries. */ u_int32_t last_checked; /* Last bucket checked for free. */ - u_int32_t lru_priority; /* Priority counter for buffer LRU. */ - u_int32_t lru_generation; /* Allocation race condition detector. */ u_int32_t htab_mutexes; /* Number of hash mutexes per region. 
*/ /* @@ -366,17 +364,6 @@ struct __db_mpool_fstat_int { /* SHARED */ #endif }; -/* - * The base mpool priority is 1/4th of the name space, or just under 2^30. When - * the LRU priority counter is about to wrap (within a 128-entry 'red zone' - * area) we adjust everybody down so that no one is larger than the new LRU - * priority. - */ -#define MPOOL_LRU_MAX UINT32_MAX -#define MPOOL_LRU_REDZONE (MPOOL_LRU_MAX - 128) -#define MPOOL_LRU_BASE (MPOOL_LRU_MAX / 4) -#define MPOOL_LRU_DECREMENT (MPOOL_LRU_MAX - MPOOL_LRU_BASE) - /* * Mpool priorities from low to high. Defined in terms of fractions of the * buffers in the pool. diff --git a/src/mp/mp_alloc.c b/src/mp/mp_alloc.c index 0743badce..61c97d6b3 100644 --- a/src/mp/mp_alloc.c +++ b/src/mp/mp_alloc.c @@ -47,7 +47,7 @@ __memp_alloc(dbmp, infop, mfp, len, offsetp, retp) size_t freed_space; u_int32_t buckets, bucket_priority, buffers; u_int32_t dirty_eviction, high_priority, priority, versions; - u_int32_t priority_saved, put_counter, lru_generation, total_buckets; + u_int32_t priority_saved, put_counter, total_buckets; int aggressive, alloc_freeze, b_lock, giveup; int h_locked, need_free, obsolete, ret, write_error; u_int8_t *endp; @@ -155,7 +155,6 @@ found: if (offsetp != NULL) * it frees the coldest buffer it can find regardless of warmth. */ high_priority = MPOOL_CLOCK_MAX + 1; - lru_generation = c_mp->lru_generation; ret = 0; MAX_LSN(oldest_reader); @@ -224,9 +223,9 @@ found: if (offsetp != NULL) aggressive++; /* - * Once aggressive, we consider all buffers. By setting - * this to MPOOL_LRU_MAX, we'll still select a victim - * even if all buffers have the highest normal priority. + * Once aggressive, we consider all buffers. Setting the + * ceiling above MPOOL_CLOCK_MAX lets us still select a + * victim even if every buffer is at maximum warmth. 
*/ high_priority = MPOOL_CLOCK_MAX + 1; PERFMON4(env, mpool, alloc_wrap, @@ -291,7 +290,7 @@ found: if (offsetp != NULL) * don't want to free a buffer out of the middle of an MVCC * chain, since that requires I/O. So, walk the buffers, * looking for an obsolete buffer at the end of an MVCC chain. - * Once a buffer becomes obsolete, its LRU priority is + * Once a buffer becomes obsolete, its warmth is * irrelevant because that version can never be accessed again. * * If we don't find any obsolete MVCC buffers, we will get @@ -311,12 +310,6 @@ retry_search: bhp = NULL; * aggressive), and is better than the best candidate * we have found so far in this bucket. */ -#ifdef MPOOL_ALLOC_SEARCH_DYN - if (aggressive == 0 && - ++high_priority >= c_mp->lru_priority) - aggressive = 1; -#endif - if (SH_CHAIN_SINGLETON(current_bhp, vc)) { u_int32_t warmth; @@ -356,11 +349,6 @@ retry_search: bhp = NULL; mvcc_bhp != NULL; oldest_bhp = mvcc_bhp, mvcc_bhp = SH_CHAIN_PREV(mvcc_bhp, vc, __bh)) { -#ifdef MPOOL_ALLOC_SEARCH_DYN - if (aggressive == 0 && - ++high_priority >= c_mp->lru_priority) - aggressive = 1; -#endif DB_ASSERT(env, mvcc_bhp != SH_CHAIN_PREV(mvcc_bhp, vc, __bh)); if ((aggressive < 2 && @@ -474,19 +462,9 @@ retry_search: bhp = NULL; } /* - * If another thread has called __memp_reset_lru() while we were - * looking for this buffer, it is possible that we've picked a - * poor choice for a victim. If so toss it and start over. + * No LRU-generation recheck is needed here: CLOCK has no + * reset sweep that could invalidate the chosen victim. */ - if (lru_generation != c_mp->lru_generation) { - DB_ASSERT(env, BH_REFCOUNT(bhp) > 0); - atomic_dec(env, &bhp->ref); - MUTEX_UNLOCK(env, hp->mtx_hash); - MPOOL_REGION_LOCK(env, infop); - hp_saved = NULL; - goto search; - } - this_buffer: /* * Discard any previously remembered hash bucket, we've got * a winner. 
diff --git a/src/mp/mp_stat.c b/src/mp/mp_stat.c index 273c5c938..a019fa891 100644 --- a/src/mp/mp_stat.c +++ b/src/mp/mp_stat.c @@ -709,15 +709,13 @@ __memp_print_hash(env, dbmp, reginfo, fmap, flags) c_mp = reginfo->primary; DB_MSGBUF_INIT(&mb); STAT_ULONG("Hash table last-checked", c_mp->last_checked); - STAT_ULONG("Hash table LRU priority", c_mp->lru_priority); - STAT_ULONG("Hash table LRU generation", c_mp->lru_generation); STAT_ULONG("Put counter", c_mp->put_counter); /* Display the hash table list of BH's. */ __db_msg(env, "BH hash table (%lu hash slots)", (u_long)c_mp->htab_buckets); __db_msg(env, "bucket #: priority, I/O wait, [mutex]"); - __db_msg(env, "\tpageno, file, ref, LSN, address, priority, flags"); + __db_msg(env, "\tpageno, file, ref, LSN, address, warmth, flags"); for (hp = R_ADDR(reginfo, c_mp->htab), bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) { From 59e7a04575101dc228eebd02ed55a26926b15dab Mon Sep 17 00:00:00 2001 From: Greg Burd Date: Wed, 17 Jun 2026 11:45:38 -0400 Subject: [PATCH 3/4] perf(mpool): make CLOCK warmth frequency-based for scan resistance The fixed-target refill warmed hot and scan-touched buffers to the same level, so a large scan evicted the hot set as readily as plain LRU (measured: equal hot-set page-ins after a scan). Make warmth climb one step per access toward a hint-derived cap (default cap = MPOOL_CLOCK_MAX), so a frequently-accessed buffer reaches the ceiling while a scan-once buffer only reaches warmth 1 and ages out first. The climb is read-first, so a saturated hot buffer's put still performs no store (write-free hot reads preserved). HIGH/VERY_HIGH hints pin at the ceiling; VERY_LOW evicts ASAP. 
--- src/mp/mp_fput.c | 56 +++++++++++++++++++++--------------------------- 1 file changed, 25 insertions(+), 31 deletions(-) diff --git a/src/mp/mp_fput.c b/src/mp/mp_fput.c index 3f0129b1a..e75152d67 100644 --- a/src/mp/mp_fput.c +++ b/src/mp/mp_fput.c @@ -203,42 +203,36 @@ __memp_fput(dbmfp, ip, pgaddr, priority) #endif /* - * CLOCK warmth refill. Choose a target warmth from the access priority - * hint and refill read-first: only store when the buffer is colder than - * the target, so a hot (already-warm) buffer's put performs no shared - * store. The eviction hand ages warmth and frees at 0. We don't lock - * the warmth; a benign race only mis-warms a buffer briefly. + * CLOCK warmth update. Warmth encodes access frequency: each access + * climbs the buffer one step toward a cap derived from the access + * priority hint, and the eviction hand decrements warmth as it sweeps. + * A frequently-accessed (hot) buffer climbs to its cap and then stays + * there with no further store (the climb is read-first), so the hot + * read path is free of shared writes; a buffer touched once by a scan + * only reaches warmth 1 and is aged out well before the hot set. We + * don't lock the warmth; a benign race only mis-warms a buffer briefly. */ if (priority == DB_PRIORITY_VERY_LOW || mfp->priority == MPOOL_PRI_VERY_LOW) - warmth = MPOOL_CLOCK_VERY_LOW; - else { - switch (priority) { - case DB_PRIORITY_VERY_LOW: - warmth = MPOOL_CLOCK_VERY_LOW; - break; - case DB_PRIORITY_LOW: - warmth = MPOOL_CLOCK_LOW; - break; - case DB_PRIORITY_HIGH: - warmth = MPOOL_CLOCK_HIGH; - break; - case DB_PRIORITY_VERY_HIGH: - warmth = MPOOL_CLOCK_VERY_HIGH; - break; - default: - case DB_PRIORITY_UNCHANGED: - case DB_PRIORITY_DEFAULT: - warmth = (mfp->priority == MPOOL_PRI_LOW) ? 
- MPOOL_CLOCK_LOW : MPOOL_CLOCK_DEFAULT; - break; - } + bhp->priority = MPOOL_CLOCK_VERY_LOW; + else if (priority == DB_PRIORITY_HIGH || + priority == DB_PRIORITY_VERY_HIGH) { + /* Explicit high-priority hint pins the buffer at the ceiling. */ + if (bhp->priority < MPOOL_CLOCK_MAX) + bhp->priority = MPOOL_CLOCK_MAX; + } else { + u_int32_t cap; + + cap = (priority == DB_PRIORITY_LOW || + mfp->priority == MPOOL_PRI_LOW) ? + MPOOL_CLOCK_LOW : MPOOL_CLOCK_MAX; if (F_ISSET(bhp, BH_DIRTY) && - warmth <= MPOOL_CLOCK_MAX - MPOOL_CLOCK_DIRTY_BOOST) - warmth += MPOOL_CLOCK_DIRTY_BOOST; + cap <= MPOOL_CLOCK_MAX - MPOOL_CLOCK_DIRTY_BOOST) + cap += MPOOL_CLOCK_DIRTY_BOOST; + warmth = bhp->priority; + if (warmth < cap) + bhp->priority = warmth + 1; } - if (bhp->priority < warmth) - bhp->priority = warmth; /* * __memp_pgwrite only has a shared lock while it clears the From 1fc97d216266fec9039db7cf1eff3b07be31bebe Mon Sep 17 00:00:00 2001 From: Greg Burd Date: Wed, 17 Jun 2026 14:36:48 -0400 Subject: [PATCH 4/4] perf(mpool): Stage 0.5 - probationary admission + COOL-first eviction Make scan resistance robust (LeanStore/Umbra cooling model, validated against the sibling sqlxtc bufmgr.c and noxu evictor). Warmth is split into a COOL band [0, MPOOL_CLOCK_HOT) and a HOT band. A freshly read/created buffer is admitted COOL (MPOOL_CLOCK_ADMIT=0; the pin, not warmth, protects it in transit), so a page touched once by a scan stays COOL while a re-referenced page climbs into the HOT band. The eviction hand ages and reclaims COOL-band buffers and leaves HOT-band buffers untouched -- so a scan of any length, which keeps supplying COOL victims, never ages the hot working set. HOT buffers are cooled only when a full sweep finds no COOL victim (existing aggressive path). The COOL aging selects in the same pass it reaches 0 (no wasted sweep). 
Validated: NOSYNC forced-eviction integrity (50k/1MB, all verified, 1s); TCL test001 btree+hash, test003, recd001 (recovery verified); fsync write timing equal to master (no write regression -- both fsync-bound). Single-CLOCK gave only ~19% scan-resistance gain; this targets robust resistance (meh measurement next). --- src/dbinc/mp.h | 17 +++++++++++++++++ src/mp/mp_alloc.c | 31 +++++++++++++++++++++---------- src/mp/mp_fget.c | 2 +- 3 files changed, 39 insertions(+), 11 deletions(-) diff --git a/src/dbinc/mp.h b/src/dbinc/mp.h index 99d75a9b3..bef9e824b 100644 --- a/src/dbinc/mp.h +++ b/src/dbinc/mp.h @@ -398,6 +398,23 @@ struct __db_mpool_fstat_int { /* SHARED */ #define MPOOL_CLOCK_VERY_HIGH MPOOL_CLOCK_MAX #define MPOOL_CLOCK_DIRTY_BOOST 1 /* Dirty pages get +1 warmth. */ +/* + * Scan resistance (probationary admission + COOL-first eviction). + * + * Warmth is split into a COOL band [0, MPOOL_CLOCK_HOT) and a HOT band + * [MPOOL_CLOCK_HOT, MPOOL_CLOCK_MAX]. A freshly read/created buffer is + * admitted COOL (MPOOL_CLOCK_ADMIT); the access that read it climbs it one + * step, so a page touched only once (e.g. by a sequential scan) stays in the + * COOL band, while a re-referenced page crosses into the HOT band. + * + * The eviction hand ages and reclaims COOL-band buffers and leaves HOT-band + * buffers untouched, so a scan of any length -- which keeps supplying COOL + * victims -- never ages the hot working set. HOT-band buffers are evicted + * only when a full sweep finds no COOL victim (the "aggressive" path). + */ +#define MPOOL_CLOCK_HOT MPOOL_CLOCK_DEFAULT /* >= this is protected */ +#define MPOOL_CLOCK_ADMIT MPOOL_CLOCK_VERY_LOW /* probationary warmth */ + /* * MPOOLFILE -- * Shared DB_MPOOLFILE information. 
diff --git a/src/mp/mp_alloc.c b/src/mp/mp_alloc.c index 61c97d6b3..4766b2828 100644 --- a/src/mp/mp_alloc.c +++ b/src/mp/mp_alloc.c @@ -318,17 +318,28 @@ retry_search: bhp = NULL; buffers++; warmth = current_bhp->priority; /* - * Second chance: a warm buffer is aged (warmth - * decremented) and skipped; only a buffer whose - * warmth has reached 0 is a victim -- unless we - * are aggressive, in which case the coldest - * buffer we can find will do. The warmth store - * races benignly with concurrent puts/scans, the - * same tolerance the old LRU priority had. + * COOL-first, scan-resistant selection. In the + * normal (non-aggressive) sweep we only touch the + * COOL band: a HOT-band buffer is protected (not + * aged, not selected), a COOL buffer above 0 is + * aged one step (second chance), and a buffer at + * warmth 0 is the victim. A scan keeps supplying + * warmth-0 COOL pages, so the hot working set is + * never aged out from under it. When aggressive + * (a full sweep found no COOL victim) we consider + * every buffer and take the coldest, aging none. + * The warmth store races benignly with concurrent + * puts, the same tolerance the old LRU had. */ - if (warmth != 0 && !aggressive) { - current_bhp->priority = warmth - 1; - continue; + if (!aggressive) { + if (warmth >= MPOOL_CLOCK_HOT) + continue; + if (warmth != 0) { + warmth--; + current_bhp->priority = warmth; + if (warmth != 0) + continue; + } } if (bucket_priority > warmth) { bucket_priority = warmth; diff --git a/src/mp/mp_fget.c b/src/mp/mp_fget.c index 3045f2fb7..63a1791c7 100644 --- a/src/mp/mp_fget.c +++ b/src/mp/mp_fget.c @@ -789,7 +789,7 @@ reuse: if ((makecopy || F_ISSET(bhp, BH_FROZEN)) && * * Append the buffer to the tail of the bucket list. */ - bhp->priority = MPOOL_CLOCK_MAX; + bhp->priority = MPOOL_CLOCK_ADMIT; bhp->pgno = *pgnoaddr; bhp->mf_offset = mf_offset; bhp->bucket = bucket;