Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 42 additions & 15 deletions src/dbinc/mp.h
Original file line number Diff line number Diff line change
Expand Up @@ -179,14 +179,12 @@ struct __mpool { /* SHARED */
* The htab and htab_buckets fields are not thread protected as they
* are initialized during mpool creation, and not modified again.
*
* The last_checked, lru_priority, and lru_generation fields are thread
* protected by the region lock.
* The last_checked field (the CLOCK eviction hand) is thread protected
* by the region lock.
*/
roff_t htab; /* Hash table offset. */
u_int32_t htab_buckets; /* Number of hash table entries. */
u_int32_t last_checked; /* Last bucket checked for free. */
u_int32_t lru_priority; /* Priority counter for buffer LRU. */
u_int32_t lru_generation; /* Allocation race condition detector. */
u_int32_t htab_mutexes; /* Number of hash mutexes per region. */

/*
Expand Down Expand Up @@ -366,17 +364,6 @@ struct __db_mpool_fstat_int { /* SHARED */
#endif
};

/*
* The base mpool priority is 1/4th of the name space, or just under 2^30. When
* the LRU priority counter is about to wrap (within a 128-entry 'red zone'
* area) we adjust everybody down so that no one is larger than the new LRU
* priority.
*/
#define MPOOL_LRU_MAX UINT32_MAX
#define MPOOL_LRU_REDZONE (MPOOL_LRU_MAX - 128)
#define MPOOL_LRU_BASE (MPOOL_LRU_MAX / 4)
#define MPOOL_LRU_DECREMENT (MPOOL_LRU_MAX - MPOOL_LRU_BASE)

/*
* Mpool priorities from low to high. Defined in terms of fractions of the
* buffers in the pool.
Expand All @@ -388,6 +375,46 @@ struct __db_mpool_fstat_int { /* SHARED */
#define MPOOL_PRI_DIRTY 10 /* Dirty gets a 10% boost. */
#define MPOOL_PRI_VERY_HIGH 1 /* Add number of buffers in pool. */

/*
* CLOCK / second-chance buffer "warmth" (scalability Stage 0).
*
* bhp->priority is reused as a small saturating warmth counter in the range
* [0, MPOOL_CLOCK_MAX]. A buffer access refills the warmth toward a level
* chosen from the access priority hint -- done read-first, so an already-warm
* (hot) buffer is not written at all, keeping the read path free of shared
* stores. The eviction hand (__memp_alloc) decrements warmth as it sweeps and
* evicts a buffer once its warmth reaches 0 (second chance).
*
* This replaces the previous timestamp LRU, which wrote bhp->priority and
* advanced a shared c_mp->lru_priority counter on every __memp_fput, and swept
* the whole cache in __memp_reset_lru on wraparound. It is also scan-resistant:
* bulk-scanned pages refill to a low warmth and age out before the hot set.
*/
#define MPOOL_CLOCK_MAX 4 /* Sticky-hot ceiling. */
#define MPOOL_CLOCK_VERY_LOW 0 /* Lowest warmth; used as MPOOL_CLOCK_ADMIT. */
#define MPOOL_CLOCK_LOW 1 /* Below MPOOL_CLOCK_HOT: still COOL band. */
#define MPOOL_CLOCK_DEFAULT 2 /* Used as MPOOL_CLOCK_HOT: first HOT level. */
#define MPOOL_CLOCK_HIGH 3 /* HOT band, one below the ceiling. */
#define MPOOL_CLOCK_VERY_HIGH MPOOL_CLOCK_MAX /* Same value as the ceiling. */
#define MPOOL_CLOCK_DIRTY_BOOST 1 /* Dirty pages get +1 warmth. */

/*
* Scan resistance (probationary admission + COOL-first eviction).
*
* Warmth is split into a COOL band [0, MPOOL_CLOCK_HOT) and a HOT band
* [MPOOL_CLOCK_HOT, MPOOL_CLOCK_MAX]. A freshly read/created buffer is
* admitted COOL (MPOOL_CLOCK_ADMIT); the access that read it climbs it one
* step, so a page touched only once (e.g. by a sequential scan) stays in the
* COOL band, while a re-referenced page crosses into the HOT band.
*
* The eviction hand ages and reclaims COOL-band buffers and leaves HOT-band
* buffers untouched, so a scan of any length -- which keeps supplying COOL
* victims -- never ages the hot working set. HOT-band buffers only become
* eviction candidates once __memp_alloc turns "aggressive" (the existing
* path, e.g. when a full sweep finds no COOL victim), where the hand picks
* the lowest-warmth unreferenced buffer regardless of band.
*/
#define MPOOL_CLOCK_HOT MPOOL_CLOCK_DEFAULT /* >= this is protected */
#define MPOOL_CLOCK_ADMIT MPOOL_CLOCK_VERY_LOW /* probationary warmth */

/*
* MPOOLFILE --
* Shared DB_MPOOLFILE information.
Expand Down
85 changes: 45 additions & 40 deletions src/mp/mp_alloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,9 @@ __memp_alloc(dbmp, infop, mfp, len, offsetp, retp)
MPOOL *c_mp;
MPOOLFILE *bh_mfp;
size_t freed_space;
u_int32_t buckets, bucket_priority, buffers, cache_reduction;
u_int32_t buckets, bucket_priority, buffers;
u_int32_t dirty_eviction, high_priority, priority, versions;
u_int32_t priority_saved, put_counter, lru_generation, total_buckets;
u_int32_t priority_saved, put_counter, total_buckets;
int aggressive, alloc_freeze, b_lock, giveup;
int h_locked, need_free, obsolete, ret, write_error;
u_int8_t *endp;
Expand Down Expand Up @@ -149,13 +149,12 @@ found: if (offsetp != NULL)

search:
/*
* Anything newer than 1/10th of the buffer pool is ignored during the
* first MPOOL_SEARCH_ALLOC_LIMIT buckets worth of allocation.
* CLOCK replacement: the priority ceiling admits buffers of any warmth.
* In the normal sweep the hash-bucket scan below skips HOT-band
* singletons (warmth >= MPOOL_CLOCK_HOT), ages (decrements) each
* unreferenced COOL-band singleton it passes, and frees one whose warmth
* has reached 0 (second chance); when aggressive, it does no aging and
* frees the coldest buffer it can find regardless of warmth.
*/
cache_reduction = c_mp->pages / 10;
high_priority = aggressive ? MPOOL_LRU_MAX :
c_mp->lru_priority - cache_reduction;
lru_generation = c_mp->lru_generation;
high_priority = MPOOL_CLOCK_MAX + 1;

ret = 0;
MAX_LSN(oldest_reader);
Expand Down Expand Up @@ -224,11 +223,11 @@ found: if (offsetp != NULL)

aggressive++;
/*
* Once aggressive, we consider all buffers. By setting
* this to MPOOL_LRU_MAX, we'll still select a victim
* even if all buffers have the highest normal priority.
* Once aggressive, we consider all buffers. Setting the
* ceiling above MPOOL_CLOCK_MAX lets us still select a
* victim even if every buffer is at maximum warmth.
*/
high_priority = MPOOL_LRU_MAX;
high_priority = MPOOL_CLOCK_MAX + 1;
PERFMON4(env, mpool, alloc_wrap,
len, infop->id, aggressive, c_mp->put_counter);
switch (aggressive) {
Expand Down Expand Up @@ -270,7 +269,7 @@ found: if (offsetp != NULL)
if (aggressive == 0 && buckets >= MPOOL_ALLOC_SEARCH_LIMIT) {
aggressive = 1;
/* Once aggressive, we consider all buffers. */
high_priority = MPOOL_LRU_MAX;
high_priority = MPOOL_CLOCK_MAX + 1;
}

/* Unlock the region and lock the hash bucket. */
Expand All @@ -291,7 +290,7 @@ found: if (offsetp != NULL)
* don't want to free a buffer out of the middle of an MVCC
* chain, since that requires I/O. So, walk the buffers,
* looking for an obsolete buffer at the end of an MVCC chain.
* Once a buffer becomes obsolete, its LRU priority is
* Once a buffer becomes obsolete, its warmth is
* irrelevant because that version can never be accessed again.
*
* If we don't find any obsolete MVCC buffers, we will get
Expand All @@ -311,18 +310,39 @@ retry_search: bhp = NULL;
* aggressive), and is better than the best candidate
* we have found so far in this bucket.
*/
#ifdef MPOOL_ALLOC_SEARCH_DYN
if (aggressive == 0 &&
++high_priority >= c_mp->lru_priority)
aggressive = 1;
#endif

if (SH_CHAIN_SINGLETON(current_bhp, vc)) {
u_int32_t warmth;

if (BH_REFCOUNT(current_bhp) != 0)
continue;
buffers++;
if (bucket_priority > current_bhp->priority) {
bucket_priority = current_bhp->priority;
warmth = current_bhp->priority;
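/*
* COOL-first, scan-resistant selection. In the
* normal (non-aggressive) sweep we only touch the
* COOL band: a HOT-band buffer is protected (not
* aged, not selected); a COOL buffer above 0 is
* aged one step and, if that step brings it to 0,
* is a victim candidate in this same pass; a
* buffer already at warmth 0 is a candidate as is.
* NOTE(review): with MPOOL_CLOCK_HOT == 2 the COOL
* band is only {0, 1}, so the inner "warmth != 0"
* skip is never taken and a warmth-1 buffer gets
* no extra pass -- confirm this is intended.
* A scan keeps supplying COOL pages, so the hot
* working set is never aged out from under it.
* When aggressive (e.g. a full sweep found no COOL
* victim) we skip both the band check and the
* aging step and compare raw warmth, so any
* unreferenced singleton, HOT or COOL, may be
* selected; no warmth is stored on that path here.
* The warmth store races benignly with concurrent
* puts, the same tolerance the old LRU had.
*/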
if (!aggressive) {
if (warmth >= MPOOL_CLOCK_HOT)
continue;
if (warmth != 0) {
warmth--;
current_bhp->priority = warmth;
if (warmth != 0)
continue;
}
}
if (bucket_priority > warmth) {
bucket_priority = warmth;
if (bhp != NULL)
atomic_dec(env, &bhp->ref);
bhp = current_bhp;
Expand All @@ -340,11 +360,6 @@ retry_search: bhp = NULL;
mvcc_bhp != NULL;
oldest_bhp = mvcc_bhp,
mvcc_bhp = SH_CHAIN_PREV(mvcc_bhp, vc, __bh)) {
#ifdef MPOOL_ALLOC_SEARCH_DYN
if (aggressive == 0 &&
++high_priority >= c_mp->lru_priority)
aggressive = 1;
#endif
DB_ASSERT(env, mvcc_bhp !=
SH_CHAIN_PREV(mvcc_bhp, vc, __bh));
if ((aggressive < 2 &&
Expand Down Expand Up @@ -458,19 +473,9 @@ retry_search: bhp = NULL;
}

/*
* If another thread has called __memp_reset_lru() while we were
* looking for this buffer, it is possible that we've picked a
* poor choice for a victim. If so toss it and start over.
* Discard any previously remembered hash bucket, we've got
* a winner.
*/
if (lru_generation != c_mp->lru_generation) {
DB_ASSERT(env, BH_REFCOUNT(bhp) > 0);
atomic_dec(env, &bhp->ref);
MUTEX_UNLOCK(env, hp->mtx_hash);
MPOOL_REGION_LOCK(env, infop);
hp_saved = NULL;
goto search;
}

this_buffer: /*
* Discard any previously remembered hash bucket, we've got
* a winner.
Expand Down Expand Up @@ -525,7 +530,7 @@ retry_search: bhp = NULL;
__memp_fns(dbmp, bh_mfp),
bhp->pgno, ret);
}
bhp->priority = MPOOL_LRU_REDZONE;
bhp->priority = MPOOL_CLOCK_MAX;

goto next_hb;
}
Expand Down
4 changes: 2 additions & 2 deletions src/mp/mp_fget.c
Original file line number Diff line number Diff line change
Expand Up @@ -789,7 +789,7 @@ reuse: if ((makecopy || F_ISSET(bhp, BH_FROZEN)) &&
*
* Append the buffer to the tail of the bucket list.
*/
bhp->priority = MPOOL_LRU_REDZONE;
bhp->priority = MPOOL_CLOCK_ADMIT;
bhp->pgno = *pgnoaddr;
bhp->mf_offset = mf_offset;
bhp->bucket = bucket;
Expand Down Expand Up @@ -1013,7 +1013,7 @@ reuse: if ((makecopy || F_ISSET(bhp, BH_FROZEN)) &&
h_locked = 0;
DB_ASSERT(env, b_incr && BH_REFCOUNT(bhp) > 0);
if (atomic_dec(env, &bhp->ref) == 0) {
bhp->priority = c_mp->lru_priority;
bhp->priority = MPOOL_CLOCK_DEFAULT;
MVCC_MPROTECT(bhp->buf, mfp->pagesize, 0);
}
F_CLR(bhp, BH_EXCLUSIVE);
Expand Down
Loading
Loading