From eba2a0e9fc2c6748869b070b203fe44e409ced85 Mon Sep 17 00:00:00 2001 From: Greg Burd Date: Fri, 19 Jun 2026 21:09:03 -0400 Subject: [PATCH 1/4] fix(bench): size lock subsystem and log buffer for scale The TPROC drivers opened the environment with default lock-region sizing (~1000 locks/objects/lockers) and a tiny default log buffer. A batched bulk load or a many-thread run exhausts those entries and fails mid-run with ENOMEM (BDB2055 'Lock table is out of available lock entries', BDB1501 'Logging region out of memory'), and an unchecked failure during populate could leave a partially built environment that crashes on reuse. Size the lock subsystem (200k locks/objects/lockers) and the log buffer (16MB) when the corresponding subsystems are enabled. Verified populate + run at scale 5 and 50. --- lab/bench/bdb_bench.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/lab/bench/bdb_bench.h b/lab/bench/bdb_bench.h index 55815625d..d59f8c9bb 100644 --- a/lab/bench/bdb_bench.h +++ b/lab/bench/bdb_bench.h @@ -176,6 +176,26 @@ bb_env_open(bb_config *c, DB_ENV **envp) if (c->use_lock || c->use_txn) (void)env->set_lk_detect(env, DB_LOCK_DEFAULT); + /* + * Size the lock subsystem generously. The default region holds only + * ~1000 locks/lockers/objects; a batched bulk load or a many-thread + * run needs far more (each held lock and each active transaction + * consumes entries), and exhausting them returns ENOMEM mid-run. + */ + if (c->use_lock || c->use_txn) { + (void)env->set_lk_max_locks(env, 200000); + (void)env->set_lk_max_objects(env, 200000); + (void)env->set_lk_max_lockers(env, 200000); + } + + /* + * Size the in-memory log buffer so a write-heavy run does not stall + * rolling tiny (default) log segments. Durability is governed + * separately by the -d toggle below. 
+ */ + if (c->use_log || c->use_txn) + (void)env->set_lg_bsize(env, 16 * 1024 * 1024); + if (c->use_txn) { if (c->durability == BB_NOSYNC) (void)env->set_flags(env, DB_TXN_NOSYNC, 1); From 332a1ed3205ec1434b773255bcc01f3dcbc2944e Mon Sep 17 00:00:00 2001 From: Greg Burd Date: Fri, 19 Jun 2026 21:23:02 -0400 Subject: [PATCH 2/4] bench: add -D deadlock-detection mode knob Add a -D N toggle to the shared harness: 0 (default) keeps BDB's detect-on-every-conflict behavior; N>0 disables inline detection and runs a background deadlock detector every N ms instead. Lets a run A/B the cost of synchronous vs periodic deadlock detection. Measurement tooling only; no engine change. (A/B on a 12-core box found the two modes within noise on the contended debit/credit workload.) --- lab/README.md | 1 + lab/bench/bdb_bench.h | 49 +++++++++++++++++++++++++++++++++++++++++-- lab/bench/tproc_b.c | 21 ++++++++++++------- lab/bench/tproc_c.c | 25 ++++++++++++++-------- lab/bench/tproc_h.c | 21 ++++++++++++------- 5 files changed, 92 insertions(+), 25 deletions(-) diff --git a/lab/README.md b/lab/README.md index caf1b8282..14e19e3be 100644 --- a/lab/README.md +++ b/lab/README.md @@ -38,6 +38,7 @@ individual protections removed, to measure their cost: | `-d sync\|wnosync\|nosync` | commit durability (default `nosync`) | | `-m` | MVCC / snapshot isolation (`DB_MULTIVERSION`) | | `-C` | Concurrent Data Store (`DB_INIT_CDB`) instead of full txns | +| `-D N` | deadlock detection: `0` (default) detects on every conflict; `N>0` runs a background detector every `N` ms and leaves the hot path free of detection | | `-c` `-t` `-S` `-s` `-i` | cache bytes, threads, scale, seconds, init | Example: diff --git a/lab/bench/bdb_bench.h b/lab/bench/bdb_bench.h index d59f8c9bb..8eba38c84 100644 --- a/lab/bench/bdb_bench.h +++ b/lab/bench/bdb_bench.h @@ -59,6 +59,9 @@ typedef struct { int use_log; /* DB_INIT_LOG */ int use_mvcc; /* DB_MULTIVERSION + DB_TXN_SNAPSHOT readers */ int use_cdb; /* DB_INIT_CDB 
(concurrent data store) */ + int dd_periodic; /* deadlock detect: 0=on every conflict (default), + * else run a background detector every N ms and + * set_lk_detect(NONE) on the hot path */ enum bb_durability durability; } bb_config; @@ -95,7 +98,7 @@ bb_getopt(int argc, char **argv, bb_config *c) int ch; extern char *optarg; - while ((ch = getopt(argc, argv, "h:c:t:S:s:imCd:X:R:")) != EOF) + while ((ch = getopt(argc, argv, "h:c:t:S:s:imCd:X:R:D:")) != EOF) switch (ch) { case 'h': c->home = optarg; break; case 'c': c->cachebytes = strtoull(optarg, NULL, 10); break; @@ -106,6 +109,7 @@ bb_getopt(int argc, char **argv, bb_config *c) case 'm': c->use_mvcc = 1; break; case 'C': c->use_cdb = 1; break; case 'R': c->seed = (unsigned)strtoul(optarg, NULL, 10); break; + case 'D': c->dd_periodic = atoi(optarg); break; /* dd interval ms */ case 'd': if (strcmp(optarg, "sync") == 0) c->durability = BB_SYNC; else if (strcmp(optarg, "wnosync") == 0) @@ -173,7 +177,15 @@ bb_env_open(bb_config *c, DB_ENV **envp) * victim and returns DB_LOCK_DEADLOCK) instead of blocking forever -- * the workloads here intentionally contend on shared rows. */ - if (c->use_lock || c->use_txn) + /* + * Deadlock detection. By default BDB can run the detector on every + * lock conflict (set_lk_detect) -- correct, but every blocked acquire + * pays the detector's cost inline. With -D N we instead leave the hot + * path free of detection and run a background detector every N ms + * (started by bb_start_dd after open); a victim is chosen at the next + * sweep rather than immediately. This A/B isolates the detector cost. + */ + if ((c->use_lock || c->use_txn) && c->dd_periodic == 0) (void)env->set_lk_detect(env, DB_LOCK_DEFAULT); /* @@ -307,4 +319,37 @@ bb_print_config(const bb_config *c, const char *name) c->durability == BB_WRITE_NOSYNC ? 
"wnosync" : "nosync"); } +/* ---- background deadlock detector (for -D N) -------------------- */ +struct bb_dd_arg { DB_ENV *env; int interval_ms; volatile int *stop; }; + +static void * +bb_dd_thread(void *a) +{ + struct bb_dd_arg *arg = a; + int rejected; + + while (!*arg->stop) { + usleep((useconds_t)arg->interval_ms * 1000); + (void)arg->env->lock_detect(arg->env, 0, + DB_LOCK_YOUNGEST, &rejected); + } + return NULL; +} + +/* + * bb_start_dd / bb_stop_dd -- run a periodic deadlock detector when + * c->dd_periodic > 0. No-ops otherwise. *stop must outlive the thread. + */ +static int +bb_start_dd(const bb_config *c, DB_ENV *env, pthread_t *tid, + struct bb_dd_arg *arg, volatile int *stop) +{ + if (c->dd_periodic <= 0) + return 0; + arg->env = env; + arg->interval_ms = c->dd_periodic; + arg->stop = stop; + return pthread_create(tid, NULL, bb_dd_thread, arg); +} + #endif /* BDB_BENCH_H */ diff --git a/lab/bench/tproc_b.c b/lab/bench/tproc_b.c index 845aeff19..e451a2aeb 100644 --- a/lab/bench/tproc_b.c +++ b/lab/bench/tproc_b.c @@ -235,13 +235,20 @@ main(int argc, char **argv) } g_stop = 0; - t0 = bb_now_ms(); - for (t = 0; t < g_cfg.threads; t++) - pthread_create(&tids[t], NULL, worker_main, &workers[t]); - usleep((useconds_t)g_cfg.seconds * 1000000); - g_stop = 1; - for (t = 0; t < g_cfg.threads; t++) - pthread_join(tids[t], NULL); + { + pthread_t ddtid; struct bb_dd_arg ddarg; int dd_on; + dd_on = (bb_start_dd(&g_cfg, g_env, &ddtid, &ddarg, &g_stop) == 0 + && g_cfg.dd_periodic > 0); + t0 = bb_now_ms(); + for (t = 0; t < g_cfg.threads; t++) + pthread_create(&tids[t], NULL, worker_main, &workers[t]); + usleep((useconds_t)g_cfg.seconds * 1000000); + g_stop = 1; + for (t = 0; t < g_cfg.threads; t++) + pthread_join(tids[t], NULL); + if (dd_on) + pthread_join(ddtid, NULL); + } elapsed = (bb_now_ms() - t0) / 1000.0; okall = retryall = 0; diff --git a/lab/bench/tproc_c.c b/lab/bench/tproc_c.c index a25892214..a2f131d5c 100644 --- a/lab/bench/tproc_c.c +++ 
b/lab/bench/tproc_c.c @@ -534,15 +534,22 @@ main(int argc, char **argv) } g_stop = 0; - t0 = bb_now_ms(); - for (t = 0; t < g_cfg.threads; t++) - pthread_create(&tids[t], NULL, worker_main, &workers[t]); - - /* Run for the requested wall-clock, then signal stop. */ - usleep((useconds_t)g_cfg.seconds * 1000000); - g_stop = 1; - for (t = 0; t < g_cfg.threads; t++) - pthread_join(tids[t], NULL); + { + pthread_t ddtid; struct bb_dd_arg ddarg; int dd_on; + dd_on = (bb_start_dd(&g_cfg, g_env, &ddtid, &ddarg, &g_stop) == 0 + && g_cfg.dd_periodic > 0); + t0 = bb_now_ms(); + for (t = 0; t < g_cfg.threads; t++) + pthread_create(&tids[t], NULL, worker_main, &workers[t]); + + /* Run for the requested wall-clock, then signal stop. */ + usleep((useconds_t)g_cfg.seconds * 1000000); + g_stop = 1; + for (t = 0; t < g_cfg.threads; t++) + pthread_join(tids[t], NULL); + if (dd_on) + pthread_join(ddtid, NULL); + } elapsed = (bb_now_ms() - t0) / 1000.0; memset(total, 0, sizeof(total)); diff --git a/lab/bench/tproc_h.c b/lab/bench/tproc_h.c index bbe576786..2aefb4d27 100644 --- a/lab/bench/tproc_h.c +++ b/lab/bench/tproc_h.c @@ -399,13 +399,20 @@ main(int argc, char **argv) } g_stop = 0; - t0 = bb_now_ms(); - for (t = 0; t < nthreads; t++) - pthread_create(&tids[t], NULL, worker_main, &workers[t]); - usleep((useconds_t)g_cfg.seconds * 1000000); - g_stop = 1; - for (t = 0; t < nthreads; t++) - pthread_join(tids[t], NULL); + { + pthread_t ddtid; struct bb_dd_arg ddarg; int dd_on; + dd_on = (bb_start_dd(&g_cfg, g_env, &ddtid, &ddarg, &g_stop) == 0 + && g_cfg.dd_periodic > 0); + t0 = bb_now_ms(); + for (t = 0; t < nthreads; t++) + pthread_create(&tids[t], NULL, worker_main, &workers[t]); + usleep((useconds_t)g_cfg.seconds * 1000000); + g_stop = 1; + for (t = 0; t < nthreads; t++) + pthread_join(tids[t], NULL); + if (dd_on) + pthread_join(ddtid, NULL); + } elapsed = (bb_now_ms() - t0) / 1000.0; memset(total, 0, sizeof(total)); From ec259f1b9bcf8597bcd2ea3cc974e058e1acab8f Mon Sep 17 00:00:00 
2001 From: Greg Burd Date: Fri, 19 Jun 2026 21:41:38 -0400 Subject: [PATCH 3/4] bench: add lock_bench direct lock-manager probe A micro-benchmark that exercises the lock subsystem in isolation: each thread allocates its own locker and loops lock_get/lock_put on either per-thread (distinct, no-conflict) or shared read objects, with no access method or buffer pool in the path. This exposes lock-manager scaling that btree-bound workloads (e.g. scale_bench rrand) hide behind page cache misses. On a 24-thread box it shows the per-op global locker mutex plateauing throughput at ~8 threads. --- lab/README.md | 6 ++ lab/bench/Makefile | 5 +- lab/bench/lock_bench.c | 164 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 174 insertions(+), 1 deletion(-) create mode 100644 lab/bench/lock_bench.c diff --git a/lab/README.md b/lab/README.md index 14e19e3be..1ddf9fe83 100644 --- a/lab/README.md +++ b/lab/README.md @@ -16,6 +16,12 @@ cd ../lab/bench && make BDB=../../build_unix # build the drivers drives a shared environment from N threads and reports ops/sec plus region-contention signals. +- **`lock_bench`** — direct lock-manager probe. Each thread allocates its own + locker and calls `lock_get`/`lock_put` in a tight loop on `distinct` + (per-thread, no-conflict) or `shared` (read-lock the same objects) keys, + bypassing the access methods and buffer pool so the lock subsystem's own + scaling is measured in isolation. 
+ - **`tproc_c` / `tproc_b` / `tproc_h`** — HammerDB-style workloads (independently implemented; **not** the TPC benchmarks and not comparable to TPC results): diff --git a/lab/bench/Makefile b/lab/bench/Makefile index 8bb383db6..1d42fec15 100644 --- a/lab/bench/Makefile +++ b/lab/bench/Makefile @@ -8,13 +8,16 @@ CFLAGS ?= -O2 -pthread INCLUDES = -I$(BDB) LIBS = -L$(BDB)/.libs -ldb-5.3 -BENCHES = scale_bench tproc_c tproc_b tproc_h +BENCHES = scale_bench tproc_c tproc_b tproc_h lock_bench all: $(BENCHES) scale_bench: scale_bench.c $(CC) $(CFLAGS) $(INCLUDES) scale_bench.c $(LIBS) -o $@ +lock_bench: lock_bench.c + $(CC) $(CFLAGS) $(INCLUDES) lock_bench.c $(LIBS) -o $@ + tproc_c: tproc_c.c bdb_bench.h $(CC) $(CFLAGS) $(INCLUDES) tproc_c.c $(LIBS) -o $@ diff --git a/lab/bench/lock_bench.c b/lab/bench/lock_bench.c new file mode 100644 index 000000000..69ab7efc2 --- /dev/null +++ b/lab/bench/lock_bench.c @@ -0,0 +1,164 @@ +/*- + * See the file LICENSE for redistribution information. + * + * lock_bench -- direct lock-manager throughput/scaling probe. + * + * Bypasses the access methods and the buffer pool entirely: each thread + * allocates its own locker id, then in a tight loop calls DB_ENV->lock_get + * followed by DB_ENV->lock_put on a chosen object. This isolates the lock + * manager's own machinery (partition mutexes, object hash, lock/object free + * lists, locker lookup, per-op counters) from B-tree search and page-pin + * cache misses that dominate a real DB->get and mask the lock layer. + * + * ./lock_bench <secs> <nobj> <mode> <t1> [t2 ...] + * + * mode: + * distinct - each thread locks objects from its own disjoint key range + * (no conflicts; pure throughput / partition scaling) + * shared - all threads lock READ over the SAME small set of nobj objects + * (read locks don't conflict, but they share object hash slots + * + lock-object refcount cache lines -> measures true sharing) + * + * Prints ops/sec per thread count. 
+ */ +#include <sys/types.h> +#include <sys/time.h> +#include <errno.h> +#include <pthread.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include <db.h> + +static DB_ENV *g_env; +static int g_secs, g_nobj, g_shared, g_maxthreads; +static volatile int g_stop; + +typedef struct { + int tid; + u_int32_t locker; + uint64_t ops; +} worker; + +static double +now_ms(void) +{ + struct timeval tv; + (void)gettimeofday(&tv, NULL); + return (double)tv.tv_sec * 1000.0 + (double)tv.tv_usec / 1000.0; +} + +static void * +worker_main(void *arg) +{ + worker *w = arg; + DBT obj; + DB_LOCK lock; + uint32_t key, base; + uint64_t i = 0; + int ret; + + /* distinct: thread t owns keys [t*nobj, (t+1)*nobj). shared: [0,nobj). */ + base = g_shared ? 0 : (uint32_t)w->tid * (uint32_t)g_nobj; + memset(&obj, 0, sizeof(obj)); + obj.size = sizeof(key); + obj.data = &key; + + while (!g_stop) { + key = base + (uint32_t)(i++ % (uint64_t)g_nobj); + ret = g_env->lock_get(g_env, w->locker, 0, &obj, + g_shared ? DB_LOCK_READ : DB_LOCK_WRITE, &lock); + if (ret != 0) { + fprintf(stderr, "lock_get: %s\n", db_strerror(ret)); + return NULL; + } + if ((ret = g_env->lock_put(g_env, &lock)) != 0) { + fprintf(stderr, "lock_put: %s\n", db_strerror(ret)); + return NULL; + } + w->ops++; + } + return NULL; +} + +int +main(int argc, char **argv) +{ + pthread_t *tids; + worker *workers; + const char *home = "LOCKBENCHDIR"; + double t0, elapsed; + int ai, t, nthreads, ret; + + if (argc < 5) { + fprintf(stderr, + "usage: %s <secs> <nobj> <distinct|shared> <t1> [t2 ...]\n", + argv[0]); + return 1; + } + g_secs = atoi(argv[1]); + g_nobj = atoi(argv[2]); + g_shared = strcmp(argv[3], "shared") == 0; + if (g_nobj < 1) g_nobj = 1; + + for (ai = 4; ai < argc; ai++) + if (atoi(argv[ai]) > g_maxthreads) g_maxthreads = atoi(argv[ai]); + + if ((ret = db_env_create(&g_env, 0)) != 0) { + fprintf(stderr, "env_create: %s\n", db_strerror(ret)); + return 1; + } + g_env->set_errfile(g_env, stderr); + /* Size the lock subsystem for many lockers/objects/locks. 
*/ + (void)g_env->set_lk_max_locks(g_env, 500000); + (void)g_env->set_lk_max_objects(g_env, 500000); + (void)g_env->set_lk_max_lockers(g_env, 500000); + if ((ret = g_env->open(g_env, home, + DB_CREATE | DB_INIT_LOCK | DB_THREAD | DB_PRIVATE, 0)) != 0) { + g_env->err(g_env, ret, "env open (mkdir %s first)", home); + return 1; + } + + printf("# lock_bench mode=%s nobj=%d secs=%d\n", + g_shared ? "shared" : "distinct", g_nobj, g_secs); + printf("# threads ops/sec\n"); + + for (ai = 4; ai < argc; ai++) { + nthreads = atoi(argv[ai]); + tids = calloc((size_t)nthreads, sizeof(*tids)); + workers = calloc((size_t)nthreads, sizeof(*workers)); + for (t = 0; t < nthreads; t++) { + workers[t].tid = t; + if ((ret = g_env->lock_id(g_env, &workers[t].locker)) != 0) { + fprintf(stderr, "lock_id: %s\n", db_strerror(ret)); + return 1; + } + } + g_stop = 0; + t0 = now_ms(); + for (t = 0; t < nthreads; t++) + pthread_create(&tids[t], NULL, worker_main, &workers[t]); + usleep((useconds_t)g_secs * 1000000); + g_stop = 1; + for (t = 0; t < nthreads; t++) + pthread_join(tids[t], NULL); + elapsed = (now_ms() - t0) / 1000.0; + + { + uint64_t total = 0; + for (t = 0; t < nthreads; t++) { + total += workers[t].ops; + (void)g_env->lock_id_free(g_env, workers[t].locker); + } + printf("%-12d %12.0f\n", nthreads, + (double)total / elapsed); + } + free(tids); free(workers); + } + + (void)g_env->close(g_env, 0); + return 0; +} From b3190af6af30f40264eebc1f196aa38eab3372e8 Mon Sep 17 00:00:00 2001 From: Greg Burd Date: Fri, 19 Jun 2026 21:50:31 -0400 Subject: [PATCH 4/4] perf(lock): take the locker mutex shared on the lock-get hot path Every DB_ENV->lock_get / lock_put resolves its locker through __lock_getlocker_int under the region-global locker mutex (mtx_lockers). 
On the lock-get path the lookup is create=0 -- a read-only walk of the locker hash bucket -- yet it was held *exclusive*, serializing every lock acquisition across all cores even when objects are fully partitioned and there is no lock conflict. Make mtx_lockers a DB_MUTEX_SHARED latch and take it in shared mode for the read-only locker lookup on the hot path (__lock_get_api). Locker create, free, the deadlock detector's locker-list walk, failchk, and stat continue to hold it exclusive, so they never run concurrently with a reader. Measured with lab/bench/lock_bench (distinct mode, no lock conflict, on a 24-thread box): master plateaus and then declines past 8 threads (~3.0M ops/s peak, 2.6M at 24t); the shared latch scales to 7.0M at 24t -- 2.1x at 8 threads, 2.7x at 24. It captures roughly half the upper bound of removing the mutex entirely; the remainder is the shared latch's own reference-count cache line, which would require partitioning the locker hash to recover (left for later -- this is the low-risk 80/20). A small single-thread regression (~8%) reflects the shared latch's slightly higher uncontended cost and is dwarfed by the multi-core gain. Verified: TCL lock001/002/003 (incl. multi-process), txn001/002, test001, ssi001/002 pass; concurrent shared read-lock acquisition (lock_bench shared) runs clean. 
--- src/dbinc/lock.h | 2 ++ src/lock/lock.c | 2 +- src/lock/lock_region.c | 10 +++++++++- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/dbinc/lock.h b/src/dbinc/lock.h index d6133e800..1090779ef 100644 --- a/src/dbinc/lock.h +++ b/src/dbinc/lock.h @@ -329,6 +329,8 @@ struct __db_lock { /* SHARED */ MUTEX_UNLOCK(env, (region)->mtx_dd) #define LOCK_LOCKERS(env, region) \ MUTEX_LOCK(env, (region)->mtx_lockers) +#define RDLOCK_LOCKERS(env, region) \ + MUTEX_READLOCK(env, (region)->mtx_lockers) #define UNLOCK_LOCKERS(env, region) \ MUTEX_UNLOCK(env, (region)->mtx_lockers) diff --git a/src/lock/lock.c b/src/lock/lock.c index 73400d92d..7bd16e870 100644 --- a/src/lock/lock.c +++ b/src/lock/lock.c @@ -596,7 +596,7 @@ __lock_get_api(env, locker, flags, obj, lock_mode, lock) region = env->lk_handle->reginfo.primary; - LOCK_LOCKERS(env, region); + RDLOCK_LOCKERS(env, region); ret = __lock_getlocker_int(env->lk_handle, locker, 0, &sh_locker); UNLOCK_LOCKERS(env, region); LOCK_SYSTEM_LOCK(env->lk_handle, region); diff --git a/src/lock/lock_region.c b/src/lock/lock_region.c index bd5a7a170..29695cdfb 100644 --- a/src/lock/lock_region.c +++ b/src/lock/lock_region.c @@ -253,8 +253,16 @@ __lock_region_init(env, lt) env, MTX_LOCK_REGION, 0, &region->mtx_dd)) != 0) return (ret); + /* + * The locker mutex is a SHARED latch: the hot lock-get path looks up + * an existing locker (a read-only hash walk) and takes it shared, so + * many cores can resolve their locker concurrently; locker create, + * free, the deadlock detector's locker-list walk, failchk, and stat + * take it exclusive. This removes the per-operation global + * serialization on lock_get/lock_put. + */ if ((ret = __mutex_alloc( - env, MTX_LOCK_REGION, 0, &region->mtx_lockers)) != 0) + env, MTX_LOCK_REGION, DB_MUTEX_SHARED, &region->mtx_lockers)) != 0) return (ret); /* Allocate room for the locker hash table and initialize it. */