From eba2a0e9fc2c6748869b070b203fe44e409ced85 Mon Sep 17 00:00:00 2001
From: Greg Burd <greg@burd.me>
Date: Fri, 19 Jun 2026 21:09:03 -0400
Subject: [PATCH 1/4] fix(bench): size lock subsystem and log buffer for scale

The TPROC drivers opened the environment with default lock-region sizing
(~1000 locks/objects/lockers) and a tiny default log buffer.  A batched
bulk load or a many-thread run exhausts those entries and fails mid-run
with ENOMEM (BDB2055 'Lock table is out of available lock entries',
BDB1501 'Logging region out of memory'), and an unchecked failure during
populate could leave a partially built environment that crashes on reuse.

Size the lock subsystem (200k locks/objects/lockers) and the log buffer
(16MB) when the corresponding subsystems are enabled.  Verified populate +
run at scale 5 and 50.
---
 lab/bench/bdb_bench.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/lab/bench/bdb_bench.h b/lab/bench/bdb_bench.h
index 55815625d..d59f8c9bb 100644
--- a/lab/bench/bdb_bench.h
+++ b/lab/bench/bdb_bench.h
@@ -176,6 +176,26 @@ bb_env_open(bb_config *c, DB_ENV **envp)
 	if (c->use_lock || c->use_txn)
 		(void)env->set_lk_detect(env, DB_LOCK_DEFAULT);
 
+	/*
+	 * Size the lock subsystem generously.  The default region holds only
+	 * ~1000 locks/lockers/objects; a batched bulk load or a many-thread
+	 * run needs far more (each held lock and each active transaction
+	 * consumes entries), and exhausting them returns ENOMEM mid-run.
+	 */
+	if (c->use_lock || c->use_txn) {
+		(void)env->set_lk_max_locks(env, 200000);
+		(void)env->set_lk_max_objects(env, 200000);
+		(void)env->set_lk_max_lockers(env, 200000);
+	}
+
+	/*
+	 * Size the in-memory log buffer so a write-heavy run does not stall
+	 * flushing the small default buffer every time it fills.  Durability
+	 * is governed separately by the -d toggle below.
+	 */
+	if (c->use_log || c->use_txn)
+		(void)env->set_lg_bsize(env, 16 * 1024 * 1024);
+
 	if (c->use_txn) {
 		if (c->durability == BB_NOSYNC)
 			(void)env->set_flags(env, DB_TXN_NOSYNC, 1);

From 332a1ed3205ec1434b773255bcc01f3dcbc2944e Mon Sep 17 00:00:00 2001
From: Greg Burd <greg@burd.me>
Date: Fri, 19 Jun 2026 21:23:02 -0400
Subject: [PATCH 2/4] bench: add -D deadlock-detection mode knob

Add a -D N toggle to the shared harness: 0 (default) keeps BDB's
detect-on-every-conflict behavior; N>0 disables inline detection and runs a
background deadlock detector every N ms instead.  Lets a run A/B the cost of
synchronous vs periodic deadlock detection.

Measurement tooling only; no engine change.  (A/B on a 12-core box found the
two modes within noise on the contended debit/credit workload.)
---
 lab/README.md         |  1 +
 lab/bench/bdb_bench.h | 49 +++++++++++++++++++++++++++++++++++++++++--
 lab/bench/tproc_b.c   | 21 ++++++++++++-------
 lab/bench/tproc_c.c   | 25 ++++++++++++++--------
 lab/bench/tproc_h.c   | 21 ++++++++++++-------
 5 files changed, 92 insertions(+), 25 deletions(-)

diff --git a/lab/README.md b/lab/README.md
index caf1b8282..14e19e3be 100644
--- a/lab/README.md
+++ b/lab/README.md
@@ -38,6 +38,7 @@ individual protections removed, to measure their cost:
 | `-d sync\|wnosync\|nosync` | commit durability (default `nosync`) |
 | `-m` | MVCC / snapshot isolation (`DB_MULTIVERSION`) |
 | `-C` | Concurrent Data Store (`DB_INIT_CDB`) instead of full txns |
+| `-D N` | deadlock detection: `0` (default) detects on every conflict; `N>0` runs a background detector every `N` ms and leaves the hot path free of detection |
 | `-c` `-t` `-S` `-s` `-i` | cache bytes, threads, scale, seconds, init |
 
 Example:
diff --git a/lab/bench/bdb_bench.h b/lab/bench/bdb_bench.h
index d59f8c9bb..8eba38c84 100644
--- a/lab/bench/bdb_bench.h
+++ b/lab/bench/bdb_bench.h
@@ -59,6 +59,9 @@ typedef struct {
 	int use_log;		/* DB_INIT_LOG */
 	int use_mvcc;		/* DB_MULTIVERSION + DB_TXN_SNAPSHOT readers */
 	int use_cdb;		/* DB_INIT_CDB (concurrent data store) */
+	int dd_periodic;	/* deadlock detect: 0=on every conflict (default),
+				 * else run a background detector every N ms and
+				 * set_lk_detect(NONE) on the hot path */
 	enum bb_durability durability;
 } bb_config;
 
@@ -95,7 +98,7 @@ bb_getopt(int argc, char **argv, bb_config *c)
 	int ch;
 	extern char *optarg;
 
-	while ((ch = getopt(argc, argv, "h:c:t:S:s:imCd:X:R:")) != EOF)
+	while ((ch = getopt(argc, argv, "h:c:t:S:s:imCd:X:R:D:")) != EOF)
 		switch (ch) {
 		case 'h': c->home = optarg; break;
 		case 'c': c->cachebytes = strtoull(optarg, NULL, 10); break;
@@ -106,6 +109,7 @@ bb_getopt(int argc, char **argv, bb_config *c)
 		case 'm': c->use_mvcc = 1; break;
 		case 'C': c->use_cdb = 1; break;
 		case 'R': c->seed = (unsigned)strtoul(optarg, NULL, 10); break;
+		case 'D': c->dd_periodic = atoi(optarg); break;	/* dd interval ms */
 		case 'd':
 			if (strcmp(optarg, "sync") == 0) c->durability = BB_SYNC;
 			else if (strcmp(optarg, "wnosync") == 0)
@@ -173,7 +177,15 @@ bb_env_open(bb_config *c, DB_ENV **envp)
 	 * victim and returns DB_LOCK_DEADLOCK) instead of blocking forever --
 	 * the workloads here intentionally contend on shared rows.
 	 */
-	if (c->use_lock || c->use_txn)
+	/*
+	 * Deadlock detection.  By default (-D 0) the harness has BDB run the
+	 * detector on every lock conflict (set_lk_detect) -- correct, but every
+	 * blocked acquire pays the detector's cost inline.  With -D N we leave
+	 * the hot path free of detection and run a background detector every
+	 * N ms (started by bb_start_dd after open); a victim is chosen at the
+	 * next sweep rather than immediately.  This A/B isolates detector cost.
+	 */
+	if ((c->use_lock || c->use_txn) && c->dd_periodic == 0)
 		(void)env->set_lk_detect(env, DB_LOCK_DEFAULT);
 
 	/*
@@ -307,4 +319,37 @@ bb_print_config(const bb_config *c, const char *name)
 	    c->durability == BB_WRITE_NOSYNC ? "wnosync" : "nosync");
 }
 
+/* ---- periodic deadlock detector thread (used by -D N) ----------- */
+struct bb_dd_arg { DB_ENV *env; int interval_ms; volatile int *stop; };
+
+static void *
+bb_dd_thread(void *a)
+{
+	struct bb_dd_arg *dd = a;
+	int nrejected;	/* filled in by lock_detect; never reported */
+
+	while (*dd->stop == 0) {	/* re-checked once per interval */
+		usleep((useconds_t)dd->interval_ms * 1000);
+		(void)dd->env->lock_detect(dd->env, 0, DB_LOCK_YOUNGEST,
+		    &nrejected);
+	}
+	return NULL;
+}
+
+/* bb_start_dd -- start the -D N detector thread (N > 0).  N == 0 is a no-op;
+ * N < 0 falls back to inline detection.  *arg, *stop outlive the thread. */
+static int
+bb_start_dd(const bb_config *c, DB_ENV *env, pthread_t *tid,
+    struct bb_dd_arg *arg, volatile int *stop)
+{
+	if (c->dd_periodic == 0)
+		return 0;
+	if (c->dd_periodic < 0)	/* bb_env_open skipped set_lk_detect */
+		return env->set_lk_detect(env, DB_LOCK_DEFAULT);
+	arg->env = env;
+	arg->interval_ms = c->dd_periodic;
+	arg->stop = stop;
+	return pthread_create(tid, NULL, bb_dd_thread, arg);
+}
+
 #endif /* BDB_BENCH_H */
diff --git a/lab/bench/tproc_b.c b/lab/bench/tproc_b.c
index 845aeff19..e451a2aeb 100644
--- a/lab/bench/tproc_b.c
+++ b/lab/bench/tproc_b.c
@@ -235,13 +235,20 @@ main(int argc, char **argv)
 	}
 
 	g_stop = 0;
-	t0 = bb_now_ms();
-	for (t = 0; t < g_cfg.threads; t++)
-		pthread_create(&tids[t], NULL, worker_main, &workers[t]);
-	usleep((useconds_t)g_cfg.seconds * 1000000);
-	g_stop = 1;
-	for (t = 0; t < g_cfg.threads; t++)
-		pthread_join(tids[t], NULL);
+	{
+		pthread_t ddtid; struct bb_dd_arg ddarg; int dd_on;
+		dd_on = (bb_start_dd(&g_cfg, g_env, &ddtid, &ddarg, &g_stop) == 0
+		    && g_cfg.dd_periodic > 0);
+		t0 = bb_now_ms();
+		for (t = 0; t < g_cfg.threads; t++)
+			pthread_create(&tids[t], NULL, worker_main, &workers[t]);
+		usleep((useconds_t)g_cfg.seconds * 1000000);
+		g_stop = 1;
+		for (t = 0; t < g_cfg.threads; t++)
+			pthread_join(tids[t], NULL);
+		if (dd_on)
+			pthread_join(ddtid, NULL);
+	}
 	elapsed = (bb_now_ms() - t0) / 1000.0;
 
 	okall = retryall = 0;
diff --git a/lab/bench/tproc_c.c b/lab/bench/tproc_c.c
index a25892214..a2f131d5c 100644
--- a/lab/bench/tproc_c.c
+++ b/lab/bench/tproc_c.c
@@ -534,15 +534,22 @@ main(int argc, char **argv)
 	}
 
 	g_stop = 0;
-	t0 = bb_now_ms();
-	for (t = 0; t < g_cfg.threads; t++)
-		pthread_create(&tids[t], NULL, worker_main, &workers[t]);
-
-	/* Run for the requested wall-clock, then signal stop. */
-	usleep((useconds_t)g_cfg.seconds * 1000000);
-	g_stop = 1;
-	for (t = 0; t < g_cfg.threads; t++)
-		pthread_join(tids[t], NULL);
+	{
+		pthread_t ddtid; struct bb_dd_arg ddarg; int dd_on;
+		dd_on = (bb_start_dd(&g_cfg, g_env, &ddtid, &ddarg, &g_stop) == 0
+		    && g_cfg.dd_periodic > 0);
+		t0 = bb_now_ms();
+		for (t = 0; t < g_cfg.threads; t++)
+			pthread_create(&tids[t], NULL, worker_main, &workers[t]);
+
+		/* Run for the requested wall-clock, then signal stop. */
+		usleep((useconds_t)g_cfg.seconds * 1000000);
+		g_stop = 1;
+		for (t = 0; t < g_cfg.threads; t++)
+			pthread_join(tids[t], NULL);
+		if (dd_on)
+			pthread_join(ddtid, NULL);
+	}
 	elapsed = (bb_now_ms() - t0) / 1000.0;
 
 	memset(total, 0, sizeof(total));
diff --git a/lab/bench/tproc_h.c b/lab/bench/tproc_h.c
index bbe576786..2aefb4d27 100644
--- a/lab/bench/tproc_h.c
+++ b/lab/bench/tproc_h.c
@@ -399,13 +399,20 @@ main(int argc, char **argv)
 	}
 
 	g_stop = 0;
-	t0 = bb_now_ms();
-	for (t = 0; t < nthreads; t++)
-		pthread_create(&tids[t], NULL, worker_main, &workers[t]);
-	usleep((useconds_t)g_cfg.seconds * 1000000);
-	g_stop = 1;
-	for (t = 0; t < nthreads; t++)
-		pthread_join(tids[t], NULL);
+	{
+		pthread_t ddtid; struct bb_dd_arg ddarg; int dd_on;
+		dd_on = (bb_start_dd(&g_cfg, g_env, &ddtid, &ddarg, &g_stop) == 0
+		    && g_cfg.dd_periodic > 0);
+		t0 = bb_now_ms();
+		for (t = 0; t < nthreads; t++)
+			pthread_create(&tids[t], NULL, worker_main, &workers[t]);
+		usleep((useconds_t)g_cfg.seconds * 1000000);
+		g_stop = 1;
+		for (t = 0; t < nthreads; t++)
+			pthread_join(tids[t], NULL);
+		if (dd_on)
+			pthread_join(ddtid, NULL);
+	}
 	elapsed = (bb_now_ms() - t0) / 1000.0;
 
 	memset(total, 0, sizeof(total));

From ec259f1b9bcf8597bcd2ea3cc974e058e1acab8f Mon Sep 17 00:00:00 2001
From: Greg Burd <greg@burd.me>
Date: Fri, 19 Jun 2026 21:41:38 -0400
Subject: [PATCH 3/4] bench: add lock_bench direct lock-manager probe

A micro-benchmark that exercises the lock subsystem in isolation: each
thread allocates its own locker and loops lock_get/lock_put on either
per-thread (distinct, no-conflict) or shared read objects, with no access
method or buffer pool in the path.  This exposes lock-manager scaling that
btree-bound workloads (e.g. scale_bench rrand) hide behind page cache
misses.  On a 24-thread box it shows the per-op global locker mutex
plateauing throughput at ~8 threads.
---
 lab/README.md          |   6 ++
 lab/bench/Makefile     |   5 +-
 lab/bench/lock_bench.c | 164 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 174 insertions(+), 1 deletion(-)
 create mode 100644 lab/bench/lock_bench.c

diff --git a/lab/README.md b/lab/README.md
index 14e19e3be..1ddf9fe83 100644
--- a/lab/README.md
+++ b/lab/README.md
@@ -16,6 +16,12 @@ cd ../lab/bench && make BDB=../../build_unix       # build the drivers
   drives a shared environment from N threads and reports ops/sec plus
   region-contention signals.
 
+- **`lock_bench`** — direct lock-manager probe. Each thread allocates its own
+  locker and calls `lock_get`/`lock_put` in a tight loop on `distinct`
+  (per-thread, no-conflict) or `shared` (read-lock the same objects) keys,
+  bypassing the access methods and buffer pool so the lock subsystem's own
+  scaling is measured in isolation.
+
 - **`tproc_c` / `tproc_b` / `tproc_h`** — HammerDB-style workloads
   (independently implemented; **not** the TPC benchmarks and not comparable to
   TPC results):
diff --git a/lab/bench/Makefile b/lab/bench/Makefile
index 8bb383db6..1d42fec15 100644
--- a/lab/bench/Makefile
+++ b/lab/bench/Makefile
@@ -8,13 +8,16 @@ CFLAGS ?= -O2 -pthread
 INCLUDES = -I$(BDB)
 LIBS = -L$(BDB)/.libs -ldb-5.3
 
-BENCHES = scale_bench tproc_c tproc_b tproc_h
+BENCHES = scale_bench tproc_c tproc_b tproc_h lock_bench
 
 all: $(BENCHES)
 
 scale_bench: scale_bench.c
 	$(CC) $(CFLAGS) $(INCLUDES) scale_bench.c $(LIBS) -o $@
 
+lock_bench: lock_bench.c
+	$(CC) $(CFLAGS) $(INCLUDES) lock_bench.c $(LIBS) -o $@
+
 tproc_c: tproc_c.c bdb_bench.h
 	$(CC) $(CFLAGS) $(INCLUDES) tproc_c.c $(LIBS) -o $@
 
diff --git a/lab/bench/lock_bench.c b/lab/bench/lock_bench.c
new file mode 100644
index 000000000..69ab7efc2
--- /dev/null
+++ b/lab/bench/lock_bench.c
@@ -0,0 +1,164 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * lock_bench -- direct lock-manager throughput/scaling probe.
+ *
+ * Bypasses the access methods and the buffer pool entirely: each thread
+ * allocates its own locker id, then in a tight loop calls DB_ENV->lock_get
+ * followed by DB_ENV->lock_put on a chosen object.  This isolates the lock
+ * manager's own machinery (partition mutexes, object hash, lock/object free
+ * lists, locker lookup, per-op counters) from B-tree search and page-pin
+ * cache misses that dominate a real DB->get and mask the lock layer.
+ *
+ *   ./lock_bench <secs> <nobj> <mode> <t1> [t2 ...]
+ *
+ * mode:
+ *   distinct  - each thread locks objects from its own disjoint key range
+ *               (no conflicts; pure throughput / partition scaling)
+ *   shared    - all threads lock READ over the SAME small set of nobj objects
+ *               (read locks don't conflict, but they share object hash slots
+ *               + lock-object refcount cache lines -> measures true sharing)
+ *
+ * Prints ops/sec per thread count.
+ */
+#include <sys/types.h>
+#include <errno.h>
+#include <pthread.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include <unistd.h>
+
+#include <db.h>
+
+static DB_ENV *g_env;
+static int g_secs, g_nobj, g_shared, g_maxthreads;
+static volatile int g_stop;
+
+typedef struct {
+	int tid;	/* thread index; selects the distinct-mode key range */
+	u_int32_t locker;	/* locker id obtained from DB_ENV->lock_id */
+	uint64_t ops;	/* completed lock_get/lock_put pairs */
+} worker;
+
+static double
+now_ms(void)
+{
+	struct timeval now;	/* wall clock via gettimeofday() */
+	(void)gettimeofday(&now, NULL);
+	return (double)now.tv_usec / 1000.0 + (double)now.tv_sec * 1000.0;
+}
+
+static void *
+worker_main(void *arg)
+{
+	worker *w = arg;
+	DBT obj;
+	DB_LOCK lock;
+	uint32_t key, base;
+	uint64_t done = 0;	/* completed lock_get/lock_put pairs */
+	int ret;
+
+	/* distinct: thread t owns keys [t*nobj, (t+1)*nobj). shared: [0,nobj). */
+	base = g_shared ? 0 : (uint32_t)w->tid * (uint32_t)g_nobj;
+	memset(&obj, 0, sizeof(obj));
+	obj.size = sizeof(key);
+	obj.data = &key;
+
+	for (; !g_stop; done++) {
+		key = base + (uint32_t)(done % (uint64_t)g_nobj);
+		if ((ret = g_env->lock_get(g_env, w->locker, 0, &obj,
+		    g_shared ? DB_LOCK_READ : DB_LOCK_WRITE, &lock)) != 0) {
+			fprintf(stderr, "lock_get: %s\n", db_strerror(ret));
+			break;
+		}
+		if ((ret = g_env->lock_put(g_env, &lock)) != 0) {
+			fprintf(stderr, "lock_put: %s\n", db_strerror(ret));
+			break;
+		}
+	}
+	/* One store at exit: a per-op w->ops++ false-shares packed workers[]. */
+	w->ops += done;
+	return NULL;
+}
+
+int
+main(int argc, char **argv)
+{
+	pthread_t *tids;
+	worker *workers;
+	const char *home = "LOCKBENCHDIR";
+	double t0, elapsed;
+	uint64_t total;
+	int ai, t, nthreads, ret;
+
+	if (argc < 5) {
+		fprintf(stderr,
+		    "usage: %s <secs> <nobj> <distinct|shared> <t1> [t2 ...]\n",
+		    argv[0]);
+		return 1;
+	}
+	g_secs = atoi(argv[1]);
+	g_nobj = atoi(argv[2]);
+	g_shared = strcmp(argv[3], "shared") == 0;
+	if (g_nobj < 1) g_nobj = 1;
+
+	if ((ret = db_env_create(&g_env, 0)) != 0) {
+		fprintf(stderr, "env_create: %s\n", db_strerror(ret));
+		return 1;
+	}
+	g_env->set_errfile(g_env, stderr);
+	/* Size the lock subsystem for many lockers/objects/locks. */
+	(void)g_env->set_lk_max_locks(g_env, 500000);
+	(void)g_env->set_lk_max_objects(g_env, 500000);
+	(void)g_env->set_lk_max_lockers(g_env, 500000);
+	if ((ret = g_env->open(g_env, home,
+	    DB_CREATE | DB_INIT_LOCK | DB_THREAD | DB_PRIVATE, 0)) != 0) {
+		g_env->err(g_env, ret, "env open (mkdir %s first)", home);
+		return 1;
+	}
+
+	printf("# lock_bench mode=%s nobj=%d secs=%d\n",
+	    g_shared ? "shared" : "distinct", g_nobj, g_secs);
+	printf("# threads      ops/sec\n");
+
+	for (ai = 4; ai < argc; ai++) {	/* one timed pass per thread count */
+		if ((nthreads = atoi(argv[ai])) < 1 ||
+		    (tids = calloc((size_t)nthreads, sizeof(*tids))) == NULL ||
+		    (workers = calloc((size_t)nthreads, sizeof(*workers))) == NULL) {
+			fprintf(stderr, "cannot run %s threads\n", argv[ai]);
+			return 1;
+		}
+		for (t = 0; t < nthreads; t++) {
+			workers[t].tid = t;
+			if ((ret = g_env->lock_id(g_env, &workers[t].locker)) != 0) {
+				fprintf(stderr, "lock_id: %s\n", db_strerror(ret));
+				return 1;
+			}
+		}
+		g_stop = 0;
+		t0 = now_ms();
+		for (t = 0; t < nthreads; t++)
+			if ((ret = pthread_create(&tids[t], NULL,
+			    worker_main, &workers[t])) != 0) {
+				fprintf(stderr, "pthread_create: %s\n", strerror(ret));
+				return 1;	/* never join a thread that did not start */
+			}
+		sleep((unsigned)g_secs);	/* usleep() may reject >= 1s */
+		g_stop = 1;
+		for (t = 0; t < nthreads; t++)
+			pthread_join(tids[t], NULL);
+		elapsed = (now_ms() - t0) / 1000.0;
+		for (total = 0, t = 0; t < nthreads; t++) {
+			total += workers[t].ops;
+			(void)g_env->lock_id_free(g_env, workers[t].locker);
+		}
+		printf("%-12d %12.0f\n", nthreads, (double)total / elapsed);
+		free(tids); free(workers);
+	}
+
+	(void)g_env->close(g_env, 0);
+	return 0;
+}

From b3190af6af30f40264eebc1f196aa38eab3372e8 Mon Sep 17 00:00:00 2001
From: Greg Burd <greg@burd.me>
Date: Fri, 19 Jun 2026 21:50:31 -0400
Subject: [PATCH 4/4] perf(lock): take the locker mutex shared on the lock-get
 hot path

Every DB_ENV->lock_get / lock_put resolves its locker through
__lock_getlocker_int under the region-global locker mutex (mtx_lockers).
On the lock-get path the lookup is create=0 -- a read-only walk of the
locker hash bucket -- yet it was held *exclusive*, serializing every lock
acquisition across all cores even when objects are fully partitioned and
there is no lock conflict.

Make mtx_lockers a DB_MUTEX_SHARED latch and take it in shared mode for the
read-only locker lookup on the hot path (__lock_get_api).  Locker create,
free, the deadlock detector's locker-list walk, failchk, and stat continue
to hold it exclusive, so they never run concurrently with a reader.

Measured with lab/bench/lock_bench (distinct mode, no lock conflict, on a
24-thread box): master plateaus and then declines past 8 threads
(~3.0M ops/s peak, 2.6M at 24t); the shared latch scales to 7.0M at 24t --
2.1x at 8 threads, 2.7x at 24.  It captures roughly half the upper bound of
removing the mutex entirely; the remainder is the shared latch's own
reference-count cache line, which would require partitioning the locker
hash to recover (left for later -- this is the low-risk 80/20).  A small
single-thread regression (~8%) reflects the shared latch's slightly higher
uncontended cost and is dwarfed by the multi-core gain.

Verified: TCL lock001/002/003 (incl. multi-process), txn001/002, test001,
ssi001/002 pass; concurrent shared read-lock acquisition (lock_bench shared)
runs clean.
---
 src/dbinc/lock.h       |  2 ++
 src/lock/lock.c        |  2 +-
 src/lock/lock_region.c | 10 +++++++++-
 3 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/dbinc/lock.h b/src/dbinc/lock.h
index d6133e800..1090779ef 100644
--- a/src/dbinc/lock.h
+++ b/src/dbinc/lock.h
@@ -329,6 +329,8 @@ struct __db_lock { /* SHARED */
 	MUTEX_UNLOCK(env, (region)->mtx_dd)
 #define	LOCK_LOCKERS(env, region)					\
 	MUTEX_LOCK(env, (region)->mtx_lockers)
+#define	RDLOCK_LOCKERS(env, region)					\
+	MUTEX_READLOCK(env, (region)->mtx_lockers)
 #define	UNLOCK_LOCKERS(env, region)					\
 	MUTEX_UNLOCK(env, (region)->mtx_lockers)
 
diff --git a/src/lock/lock.c b/src/lock/lock.c
index 73400d92d..7bd16e870 100644
--- a/src/lock/lock.c
+++ b/src/lock/lock.c
@@ -596,7 +596,7 @@ __lock_get_api(env, locker, flags, obj, lock_mode, lock)
 
 	region = env->lk_handle->reginfo.primary;
 
-	LOCK_LOCKERS(env, region);
+	RDLOCK_LOCKERS(env, region);
 	ret = __lock_getlocker_int(env->lk_handle, locker, 0, &sh_locker);
 	UNLOCK_LOCKERS(env, region);
 	LOCK_SYSTEM_LOCK(env->lk_handle, region);
diff --git a/src/lock/lock_region.c b/src/lock/lock_region.c
index bd5a7a170..29695cdfb 100644
--- a/src/lock/lock_region.c
+++ b/src/lock/lock_region.c
@@ -253,8 +253,16 @@ __lock_region_init(env, lt)
 	    env, MTX_LOCK_REGION, 0, &region->mtx_dd)) != 0)
 		return (ret);
 
+	/*
+	 * The locker mutex is a SHARED latch: the hot lock-get path looks up
+	 * an existing locker (a read-only hash walk) and takes it shared, so
+	 * many cores can resolve their locker concurrently; locker create,
+	 * free, the deadlock detector's locker-list walk, failchk, and stat
+	 * take it exclusive.  This removes the per-operation global
+	 * serialization on lock_get/lock_put.
+	 */
 	if ((ret = __mutex_alloc(
-	    env, MTX_LOCK_REGION, 0, &region->mtx_lockers)) != 0)
+	    env, MTX_LOCK_REGION, DB_MUTEX_SHARED, &region->mtx_lockers)) != 0)
 		return (ret);
 
 	/* Allocate room for the locker hash table and initialize it. */