diff --git a/src/dbinc/mp.h b/src/dbinc/mp.h index e00055feb..bef9e824b 100644 --- a/src/dbinc/mp.h +++ b/src/dbinc/mp.h @@ -179,14 +179,12 @@ struct __mpool { /* SHARED */ * The htab and htab_buckets fields are not thread protected as they * are initialized during mpool creation, and not modified again. * - * The last_checked, lru_priority, and lru_generation fields are thread - * protected by the region lock. + * The last_checked field (the CLOCK eviction hand) is thread protected + * by the region lock. */ roff_t htab; /* Hash table offset. */ u_int32_t htab_buckets; /* Number of hash table entries. */ u_int32_t last_checked; /* Last bucket checked for free. */ - u_int32_t lru_priority; /* Priority counter for buffer LRU. */ - u_int32_t lru_generation; /* Allocation race condition detector. */ u_int32_t htab_mutexes; /* Number of hash mutexes per region. */ /* @@ -366,17 +364,6 @@ struct __db_mpool_fstat_int { /* SHARED */ #endif }; -/* - * The base mpool priority is 1/4th of the name space, or just under 2^30. When - * the LRU priority counter is about to wrap (within a 128-entry 'red zone' - * area) we adjust everybody down so that no one is larger than the new LRU - * priority. - */ -#define MPOOL_LRU_MAX UINT32_MAX -#define MPOOL_LRU_REDZONE (MPOOL_LRU_MAX - 128) -#define MPOOL_LRU_BASE (MPOOL_LRU_MAX / 4) -#define MPOOL_LRU_DECREMENT (MPOOL_LRU_MAX - MPOOL_LRU_BASE) - /* * Mpool priorities from low to high. Defined in terms of fractions of the * buffers in the pool. @@ -388,6 +375,46 @@ struct __db_mpool_fstat_int { /* SHARED */ #define MPOOL_PRI_DIRTY 10 /* Dirty gets a 10% boost. */ #define MPOOL_PRI_VERY_HIGH 1 /* Add number of buffers in pool. */ +/* + * CLOCK / second-chance buffer "warmth" (scalability Stage 0). + * + * bhp->priority is reused as a small saturating warmth counter in the range + * [0, MPOOL_CLOCK_MAX]. 
A buffer access refills the warmth toward a level + * chosen from the access priority hint -- done read-first, so an already-warm + * (hot) buffer is not written at all, keeping the read path free of shared + * stores. The eviction hand (__memp_alloc) decrements warmth as it sweeps and + * evicts a buffer once its warmth reaches 0 (second chance). + * + * This replaces the previous timestamp LRU, which wrote bhp->priority and + * advanced a shared c_mp->lru_priority counter on every __memp_fput, and swept + * the whole cache in __memp_reset_lru on wraparound. It is also scan-resistant: + * bulk-scanned pages refill to a low warmth and age out before the hot set. + */ +#define MPOOL_CLOCK_MAX 4 /* Sticky-hot ceiling. */ +#define MPOOL_CLOCK_VERY_LOW 0 +#define MPOOL_CLOCK_LOW 1 +#define MPOOL_CLOCK_DEFAULT 2 +#define MPOOL_CLOCK_HIGH 3 +#define MPOOL_CLOCK_VERY_HIGH MPOOL_CLOCK_MAX +#define MPOOL_CLOCK_DIRTY_BOOST 1 /* Dirty pages get +1 warmth. */ + +/* + * Scan resistance (probationary admission + COOL-first eviction). + * + * Warmth is split into a COOL band [0, MPOOL_CLOCK_HOT) and a HOT band + * [MPOOL_CLOCK_HOT, MPOOL_CLOCK_MAX]. A freshly read/created buffer is + * admitted COOL (MPOOL_CLOCK_ADMIT); the access that read it climbs it one + * step, so a page touched only once (e.g. by a sequential scan) stays in the + * COOL band, while a re-referenced page crosses into the HOT band. + * + * The eviction hand ages and reclaims COOL-band buffers and leaves HOT-band + * buffers untouched, so a scan of any length -- which keeps supplying COOL + * victims -- never ages the hot working set. HOT-band buffers are not aged by + * the hand; they only become victims, coldest first, on the "aggressive" path. + */ +#define MPOOL_CLOCK_HOT MPOOL_CLOCK_DEFAULT /* >= this is protected */ +#define MPOOL_CLOCK_ADMIT MPOOL_CLOCK_VERY_LOW /* probationary warmth */ + /* * MPOOLFILE -- * Shared DB_MPOOLFILE information.
diff --git a/src/mp/mp_alloc.c b/src/mp/mp_alloc.c index f7b937afa..4766b2828 100644 --- a/src/mp/mp_alloc.c +++ b/src/mp/mp_alloc.c @@ -45,9 +45,9 @@ __memp_alloc(dbmp, infop, mfp, len, offsetp, retp) MPOOL *c_mp; MPOOLFILE *bh_mfp; size_t freed_space; - u_int32_t buckets, bucket_priority, buffers, cache_reduction; + u_int32_t buckets, bucket_priority, buffers; u_int32_t dirty_eviction, high_priority, priority, versions; - u_int32_t priority_saved, put_counter, lru_generation, total_buckets; + u_int32_t priority_saved, put_counter, total_buckets; int aggressive, alloc_freeze, b_lock, giveup; int h_locked, need_free, obsolete, ret, write_error; u_int8_t *endp; @@ -149,13 +149,12 @@ found: if (offsetp != NULL) search: /* - * Anything newer than 1/10th of the buffer pool is ignored during the - * first MPOOL_SEARCH_ALLOC_LIMIT buckets worth of allocation. + * CLOCK replacement: consider buffers of any warmth. The hash-bucket + * scan below ages (decrements) each unreferenced singleton it passes and + * frees one whose warmth has reached 0 (second chance); when aggressive, + * it frees the coldest buffer it can find regardless of warmth. */ - cache_reduction = c_mp->pages / 10; - high_priority = aggressive ? MPOOL_LRU_MAX : - c_mp->lru_priority - cache_reduction; - lru_generation = c_mp->lru_generation; + high_priority = MPOOL_CLOCK_MAX + 1; ret = 0; MAX_LSN(oldest_reader); @@ -224,11 +223,11 @@ found: if (offsetp != NULL) aggressive++; /* - * Once aggressive, we consider all buffers. By setting - * this to MPOOL_LRU_MAX, we'll still select a victim - * even if all buffers have the highest normal priority. + * Once aggressive, we consider all buffers. Setting the + * ceiling above MPOOL_CLOCK_MAX lets us still select a + * victim even if every buffer is at maximum warmth. 
*/ - high_priority = MPOOL_LRU_MAX; + high_priority = MPOOL_CLOCK_MAX + 1; PERFMON4(env, mpool, alloc_wrap, len, infop->id, aggressive, c_mp->put_counter); switch (aggressive) { @@ -270,7 +269,7 @@ found: if (offsetp != NULL) if (aggressive == 0 && buckets >= MPOOL_ALLOC_SEARCH_LIMIT) { aggressive = 1; /* Once aggressive, we consider all buffers. */ - high_priority = MPOOL_LRU_MAX; + high_priority = MPOOL_CLOCK_MAX + 1; } /* Unlock the region and lock the hash bucket. */ @@ -291,7 +290,7 @@ found: if (offsetp != NULL) * don't want to free a buffer out of the middle of an MVCC * chain, since that requires I/O. So, walk the buffers, * looking for an obsolete buffer at the end of an MVCC chain. - * Once a buffer becomes obsolete, its LRU priority is + * Once a buffer becomes obsolete, its warmth is * irrelevant because that version can never be accessed again. * * If we don't find any obsolete MVCC buffers, we will get @@ -311,18 +310,39 @@ retry_search: bhp = NULL; * aggressive), and is better than the best candidate * we have found so far in this bucket. */ -#ifdef MPOOL_ALLOC_SEARCH_DYN - if (aggressive == 0 && - ++high_priority >= c_mp->lru_priority) - aggressive = 1; -#endif - if (SH_CHAIN_SINGLETON(current_bhp, vc)) { + u_int32_t warmth; + if (BH_REFCOUNT(current_bhp) != 0) continue; buffers++; - if (bucket_priority > current_bhp->priority) { - bucket_priority = current_bhp->priority; + warmth = current_bhp->priority; + /* + * COOL-first, scan-resistant selection. In the + * normal (non-aggressive) sweep we only touch the + * COOL band: a HOT-band buffer is protected (not + * aged, not selected), a COOL buffer above 0 is + * aged one step (second chance), and a buffer at + * warmth 0 is the victim. A scan keeps supplying + * warmth-0 COOL pages, so the hot working set is + * never aged out from under it. When aggressive + * (a full sweep found no COOL victim) we consider + * every buffer and take the coldest, without aging.
+ * The warmth store races benignly with concurrent + * puts, the same tolerance the old LRU had. + */ + if (!aggressive) { + if (warmth >= MPOOL_CLOCK_HOT) + continue; + if (warmth != 0) { + warmth--; + current_bhp->priority = warmth; + if (warmth != 0) + continue; + } + } + if (bucket_priority > warmth) { + bucket_priority = warmth; if (bhp != NULL) atomic_dec(env, &bhp->ref); bhp = current_bhp; @@ -340,11 +360,6 @@ retry_search: bhp = NULL; mvcc_bhp != NULL; oldest_bhp = mvcc_bhp, mvcc_bhp = SH_CHAIN_PREV(mvcc_bhp, vc, __bh)) { -#ifdef MPOOL_ALLOC_SEARCH_DYN - if (aggressive == 0 && - ++high_priority >= c_mp->lru_priority) - aggressive = 1; -#endif DB_ASSERT(env, mvcc_bhp != SH_CHAIN_PREV(mvcc_bhp, vc, __bh)); if ((aggressive < 2 && @@ -458,19 +473,9 @@ retry_search: bhp = NULL; } /* - * If another thread has called __memp_reset_lru() while we were - * looking for this buffer, it is possible that we've picked a - * poor choice for a victim. If so toss it and start over. + * Discard any previously remembered hash bucket, we've got + * a winner. */ - if (lru_generation != c_mp->lru_generation) { - DB_ASSERT(env, BH_REFCOUNT(bhp) > 0); - atomic_dec(env, &bhp->ref); - MUTEX_UNLOCK(env, hp->mtx_hash); - MPOOL_REGION_LOCK(env, infop); - hp_saved = NULL; - goto search; - } - this_buffer: /* * Discard any previously remembered hash bucket, we've got * a winner. @@ -525,7 +530,7 @@ retry_search: bhp = NULL; __memp_fns(dbmp, bh_mfp), bhp->pgno, ret); } - bhp->priority = MPOOL_LRU_REDZONE; + bhp->priority = MPOOL_CLOCK_MAX; goto next_hb; } diff --git a/src/mp/mp_fget.c b/src/mp/mp_fget.c index 16de69515..63a1791c7 100644 --- a/src/mp/mp_fget.c +++ b/src/mp/mp_fget.c @@ -789,7 +789,7 @@ reuse: if ((makecopy || F_ISSET(bhp, BH_FROZEN)) && * * Append the buffer to the tail of the bucket list. 
*/ - bhp->priority = MPOOL_LRU_REDZONE; + bhp->priority = MPOOL_CLOCK_ADMIT; bhp->pgno = *pgnoaddr; bhp->mf_offset = mf_offset; bhp->bucket = bucket; @@ -1013,7 +1013,7 @@ reuse: if ((makecopy || F_ISSET(bhp, BH_FROZEN)) && h_locked = 0; DB_ASSERT(env, b_incr && BH_REFCOUNT(bhp) > 0); if (atomic_dec(env, &bhp->ref) == 0) { - bhp->priority = c_mp->lru_priority; + bhp->priority = MPOOL_CLOCK_DEFAULT; MVCC_MPROTECT(bhp->buf, mfp->pagesize, 0); } F_CLR(bhp, BH_EXCLUSIVE); diff --git a/src/mp/mp_fput.c b/src/mp/mp_fput.c index 8cb4e837b..e75152d67 100644 --- a/src/mp/mp_fput.c +++ b/src/mp/mp_fput.c @@ -12,7 +12,6 @@ #include "dbinc/log.h" #include "dbinc/mp.h" -static int __memp_reset_lru __P((ENV *, REGINFO *)); /* * __memp_fput_pp -- @@ -75,7 +74,8 @@ __memp_fput(dbmfp, ip, pgaddr, priority) REGINFO *infop, *reginfo; roff_t b_ref; int region; - int adjust, pfactor, ret, t_ret; + int ret; + u_int32_t warmth; char buf[DB_THREADID_STRLEN]; env = dbmfp->env; @@ -202,54 +202,36 @@ __memp_fput(dbmfp, ip, pgaddr, priority) MUTEX_UNLOCK(env, hp->mtx_hash); #endif - /* Update priority values. */ + /* + * CLOCK warmth update. Warmth encodes access frequency: each access + * climbs the buffer one step toward a cap derived from the access + * priority hint, and the eviction hand decrements warmth as it sweeps. + * A frequently-accessed (hot) buffer climbs to its cap and then stays + * there with no further store (the climb is read-first), so the hot + * read path is free of shared writes; a buffer touched once by a scan + * only reaches warmth 1 and is aged out well before the hot set. We + * don't lock the warmth; a benign race only mis-warms a buffer briefly. + */ if (priority == DB_PRIORITY_VERY_LOW || mfp->priority == MPOOL_PRI_VERY_LOW) - bhp->priority = 0; - else { - /* - * We don't lock the LRU priority or the pages field, if - * we get garbage (which won't happen on a 32-bit machine), it - * only means a buffer has the wrong priority. 
- */ - bhp->priority = c_mp->lru_priority; - - switch (priority) { - default: - case DB_PRIORITY_UNCHANGED: - pfactor = mfp->priority; - break; - case DB_PRIORITY_VERY_LOW: - pfactor = MPOOL_PRI_VERY_LOW; - break; - case DB_PRIORITY_LOW: - pfactor = MPOOL_PRI_LOW; - break; - case DB_PRIORITY_DEFAULT: - pfactor = MPOOL_PRI_DEFAULT; - break; - case DB_PRIORITY_HIGH: - pfactor = MPOOL_PRI_HIGH; - break; - case DB_PRIORITY_VERY_HIGH: - pfactor = MPOOL_PRI_VERY_HIGH; - break; - } - - adjust = 0; - if (pfactor != 0) - adjust = (int)c_mp->pages / pfactor; - - if (F_ISSET(bhp, BH_DIRTY)) - adjust += (int)c_mp->pages / MPOOL_PRI_DIRTY; - - if (adjust > 0) { - if (MPOOL_LRU_REDZONE - bhp->priority >= - (u_int32_t)adjust) - bhp->priority += adjust; - } else if (adjust < 0) - if (bhp->priority > (u_int32_t)-adjust) - bhp->priority += adjust; + bhp->priority = MPOOL_CLOCK_VERY_LOW; + else if (priority == DB_PRIORITY_HIGH || + priority == DB_PRIORITY_VERY_HIGH) { + /* Explicit high-priority hint pins the buffer at the ceiling. */ + if (bhp->priority < MPOOL_CLOCK_MAX) + bhp->priority = MPOOL_CLOCK_MAX; + } else { + u_int32_t cap; + + cap = (priority == DB_PRIORITY_LOW || + mfp->priority == MPOOL_PRI_LOW) ? + MPOOL_CLOCK_LOW : MPOOL_CLOCK_MAX; + if (F_ISSET(bhp, BH_DIRTY) && + cap <= MPOOL_CLOCK_MAX - MPOOL_CLOCK_DIRTY_BOOST) + cap += MPOOL_CLOCK_DIRTY_BOOST; + warmth = bhp->priority; + if (warmth < cap) + bhp->priority = warmth + 1; } /* @@ -261,78 +243,9 @@ __memp_fput(dbmfp, ip, pgaddr, priority) F_CLR(bhp, BH_EXCLUSIVE); MUTEX_UNLOCK(env, bhp->mtx_buf); - /* - * On every buffer put we update the cache lru priority and check - * for wraparound. The increment doesn't need to be atomic: occasional - * lost increments are okay; __memp_reset_lru handles race conditions. 
- */ - if (++c_mp->lru_priority >= MPOOL_LRU_REDZONE && - (t_ret = __memp_reset_lru(env, infop)) != 0 && ret == 0) - ret = t_ret; - return (ret); } -/* - * __memp_reset_lru -- - * Reset the cache LRU priority when it reaches the upper limit. - */ -static int -__memp_reset_lru(env, infop) - ENV *env; - REGINFO *infop; -{ - BH *bhp, *tbhp; - DB_MPOOL_HASH *hp; - MPOOL *c_mp; - u_int32_t bucket; - int reset; - - /* - * Update the priority so all future allocations will start at the - * bottom. Lock this cache region to ensure that exactly one thread - * will reset this cache's buffers. - */ - c_mp = infop->primary; - MPOOL_REGION_LOCK(env, infop); - reset = c_mp->lru_priority >= MPOOL_LRU_DECREMENT; - if (reset) { - c_mp->lru_priority -= MPOOL_LRU_DECREMENT; - c_mp->lru_generation++; - } - MPOOL_REGION_UNLOCK(env, infop); - - if (!reset) - return (0); - - /* Reduce the priority of every buffer in this cache region. */ - for (hp = R_ADDR(infop, c_mp->htab), - bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) { - /* - * Skip empty buckets. - * - * We can check for empty buckets before locking as we - * only care if the pointer is zero or non-zero. 
- */ - if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL) - continue; - - MUTEX_LOCK(env, hp->mtx_hash); - SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh) { - for (tbhp = bhp; tbhp != NULL; - tbhp = SH_CHAIN_PREV(tbhp, vc, __bh)) { - if (tbhp->priority > MPOOL_LRU_DECREMENT) - tbhp->priority -= MPOOL_LRU_DECREMENT; - else - tbhp->priority = 0; - } - } - MUTEX_UNLOCK(env, hp->mtx_hash); - } - - COMPQUIET(env, NULL); - return (0); -} /* * __memp_unpin_buffers -- diff --git a/src/mp/mp_mvcc.c b/src/mp/mp_mvcc.c index 770bad813..d578ccb8b 100644 --- a/src/mp/mp_mvcc.c +++ b/src/mp/mp_mvcc.c @@ -565,7 +565,7 @@ __memp_bh_thaw(dbmp, infop, hp, frozen_bhp, alloc_bhp) */ MUTEX_REQUIRED(env, hp->mtx_hash); if (alloc_bhp != NULL) { - alloc_bhp->priority = c_mp->lru_priority; + alloc_bhp->priority = MPOOL_CLOCK_DEFAULT; SH_CHAIN_INSERT_AFTER(frozen_bhp, alloc_bhp, vc, __bh); if (!SH_CHAIN_HASNEXT(alloc_bhp, vc)) { diff --git a/src/mp/mp_stat.c b/src/mp/mp_stat.c index 273c5c938..a019fa891 100644 --- a/src/mp/mp_stat.c +++ b/src/mp/mp_stat.c @@ -709,15 +709,13 @@ __memp_print_hash(env, dbmp, reginfo, fmap, flags) c_mp = reginfo->primary; DB_MSGBUF_INIT(&mb); STAT_ULONG("Hash table last-checked", c_mp->last_checked); - STAT_ULONG("Hash table LRU priority", c_mp->lru_priority); - STAT_ULONG("Hash table LRU generation", c_mp->lru_generation); STAT_ULONG("Put counter", c_mp->put_counter); /* Display the hash table list of BH's. */ __db_msg(env, "BH hash table (%lu hash slots)", (u_long)c_mp->htab_buckets); __db_msg(env, "bucket #: priority, I/O wait, [mutex]"); - __db_msg(env, "\tpageno, file, ref, LSN, address, priority, flags"); + __db_msg(env, "\tpageno, file, ref, LSN, address, warmth, flags"); for (hp = R_ADDR(reginfo, c_mp->htab), bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) {