kamepoolalloc/allocator.cpp at master · northriv/kamepoolalloc · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/***************************************************************************
        Copyright (C) 2002-2026 Kentaro Kitagawa
                           kitag@issp.u-tokyo.ac.jp

        This file is dual-licensed under your choice of EITHER:

          * Apache License, Version 2.0
            (http://www.apache.org/licenses/LICENSE-2.0, or see
            LICENSE-APACHE-2.0 in this directory)

        -- OR --

          * GNU General Public License, version 2 of the License,
            or (at your option) any later version
            (http://www.gnu.org/licenses/old-licenses/gpl-2.0.html,
            or see LICENSE-GPL-2.0 in this directory).

        Pick whichever license suits your project.  Unless required
        by applicable law or agreed to in writing, this file is
        distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
        CONDITIONS OF ANY KIND, either express or implied
***************************************************************************/

//#define GUARDIAN 0xaaaaaaaauLL
//#define FILLING_AFTER_ALLOC 0x55555555uLL
// per-thread floor on `owner_release`.  Stop releasing
// when this thread's DLL has fewer than this many chunks for the given
// (ALIGN, FS) template — avoids release / re-mmap thrashing on bursty
// workloads.
//
// Value tuning history:
//   * 2 — fine for the original `s_tls.my_chunk` + DLL design.
//   * 16 — bumped as a workaround for the bucket34_repro
//     33.5 → 0.24 M/s Linux regression, on the (incorrect) theory
//     that aggressive release / re-mmap was the cause.
//   * REAL fix landed — `s_tls.dll_cursor` / `s_tls.dll_exhausted`
//     was the culprit, not the floor.  Three direct
//     `batch_return_to_bitmap` sites now reset the cursor so the
//     next walk finds the revived chunks.
//   * This commit: 16 → 2.  With an earlier change the floor=16 bloat is
//     unnecessary; bucket34_repro 1t actually IMPROVES at floor=2
//     (15-22 → 27 M/s) because empty chunks release sooner,
//     improving region locality and reducing post-workers RSS.
//     All other workloads parity.
#define LEAVE_VACANT_CHUNKS_PER_THREAD 2

#include "allocator.h"
#include "kame_pool.h"        // C-API stats struct + version macro
#if defined(__linux__)
#  include <dirent.h>          // /sys/devices/system/node walk (§14C)
#  include <sched.h>           // sched_getcpu                  (§14C)
#  include <sys/syscall.h>     // SYS_mbind                     (§14C)
#  include <unistd.h>          // syscall                       (§14C)
#endif

#ifndef USE_STD_ALLOCATOR

#include "atomic_mfence.h"   // readBarrier / writeBarrier / pause4spin
                             // (kamepoolalloc-internal, mirrors kame/atomic.h's
                             //  arch-select chain but drops atomic_shared_ptr et al)

#include <algorithm>
#include <assert.h>
#include <cerrno>
#include <chrono>           // (§28.1) lazy-drain wall clock for LRC_MMAP push
#include <cstdio>
#include <cstdlib>
#include <cstring>          // std::memset / std::memcpy
#include <limits>           // (§30) numeric_limits for the realtime-mode preset
#include <new>              // std::get_new_handler / std::bad_alloc (§18 OOM)
                            // (Re `<cstring>` above: glibc's `<string.h>`
                            //  puts memset / memcpy in the global namespace
                            //  only — libc++/Apple pull them into `std::`
                            //  transitively but libstdc++ does not.
                            //  `<cstring>` is the portable C++ way.)
#include <type_traits>
#if defined(__linux__)
    #include <dlfcn.h>           // RTLD_NEXT resolve of libc
                                 // malloc_usable_size (strong-symbol
                                 // co-interpose forwards foreign pointers)
#endif
#if defined(__APPLE__)
    #include <malloc/malloc.h>   // for malloc_zone_from_ptr / malloc_zone_free
#elif defined(_WIN32) || defined(__WIN32__) || defined(WINDOWS)
    #include <malloc.h>          // for _aligned_malloc / _aligned_free
                                 // (over-aligned alloc fallback when
                                 // alignment exceeds the pool's 16-B
                                 // guarantee)
    // VirtualAlloc / VirtualFree / MEM_COMMIT etc. for the radix L2 node
    // allocator (the Windows counterpart to mmap()).  WIN32_LEAN_AND_MEAN
    // keeps the symbol load small; we only need the memory + handle API.
    #ifndef WIN32_LEAN_AND_MEAN
    #  define WIN32_LEAN_AND_MEAN
    #endif
    #ifndef NOMINMAX
    #  define NOMINMAX   // keep windows.h's min/max macros from clobbering
    #                    // std::numeric_limits<>::max() etc. (breaks MSVC)
    #endif
    #include <windows.h>
    #include <intrin.h>          // __readgsqword — PEB walk in the §31
                                 // free-redirect (loader-lock-safe module
                                 // enumeration; see kame_patch_all_modules)
#endif
#if KAME_FAST_TSD
    #include <pthread.h>
#endif

// (§32) Drop-in default: the standalone DYLIB build interposes the FULL libc
// malloc family by default on macOS, so kamepoolalloc is a real
// DYLD_INSERT_LIBRARIES / drop-in allocator (like mimalloc/jemalloc) out of the
// box.  Safe because the `malloc_size` co-interpose (see the macOS FULL block
// near the bottom) returns the true capacity of pool pointers — without it the
// Swift runtime (`__StringStorage`) and ObjC class realization corrupt.  Soak:
// Foundation, libswiftCore (CPython), QtCore, C++ STL, 2000-thread stress — all
// clean.  Opt out with -DKAMEPOOLALLOC_CONSERVATIVE_INTERCEPT (free+realloc only).
// Linux dylibs keep the explicit -DKAMEPOOLALLOC_FULL_INTERCEPT opt-in pending
// their own soak; Windows is default-on too now (soaked — see the §31 block).
// The inline kame.app (MH_EXECUTE) is unaffected — dyld
// honours __interpose only from MH_DYLIB, so its interpose set is inert either way.
#if defined(KAMEPOOLALLOC_DYLIB) && defined(__APPLE__) \
    && !defined(KAMEPOOLALLOC_FULL_INTERCEPT) \
    && !defined(KAMEPOOLALLOC_CONSERVATIVE_INTERCEPT)
#  define KAMEPOOLALLOC_FULL_INTERCEPT 1
#endif

// Windows: full malloc-family interception is the default too — matching
// the macOS dylib drop-in.  NOTE there is no `KAMEPOOLALLOC_DYLIB` gate
// here, unlike macOS: the §31 IAT redirect below patches imports of any
// loaded module from the *inline-compiled* kame.exe, so this covers the
// production executable (on macOS only an MH_DYLIB can interpose, so the
// inline kame.app stays conservative).  Ruby (msvcrt heap) is excluded by
// `kame_is_crt_dll` (ucrtbase / api-ms-win-crt-heap only), so only the
// UCRT-family (kame.exe, Qt, libc++) is pooled; the `_msize` co-redirect
// (the Windows analog of macOS `malloc_size`) keeps size-queries correct.
// Opt out with -DKAMEPOOLALLOC_CONSERVATIVE_INTERCEPT (free-family only).
#if (defined(_WIN32) || defined(__WIN32__) || defined(WINDOWS)) \
    && !defined(KAMEPOOLALLOC_FULL_INTERCEPT) \
    && !defined(KAMEPOOLALLOC_CONSERVATIVE_INTERCEPT)
#  define KAMEPOOLALLOC_FULL_INTERCEPT 1
#endif

#if defined(_WIN32) || defined(__WIN32__) || defined(WINDOWS)
// ===================================================================
// (§31) Windows free-family redirect — genuine-CRT bypass pointers.
//
// On PE/COFF there is no cross-module interposition of the replaceable
// `operator new` / `operator delete` (or `free`) the way ELF strong
// symbols and Mach-O `__DATA,__interpose` provide.  Each prebuilt DLL
// (Qt6*.dll, libc++.dll) binds `free` / `operator delete` to the UCRT
// at *its* link time, so a Qt object that KAME pool-allocated and then
// hands back to Qt is freed by Qt via the UCRT's `free()` — heap
// corruption, because a pool pointer is not a CRT pointer.  (Confirmed
// on-target: the Create-Driver dialog's child widgets, pool-allocated
// in kame.exe, are deleted by QObjectPrivate::deleteChildren() inside
// Qt6Core.dll → libc++ free → int3 in ntdll's heap path.)
//
// We close the gap the way mimalloc-redirect.dll does.  In the conservative
// model (`-DKAMEPOOLALLOC_CONSERVATIVE_INTERCEPT`) we patch only the
// *deallocation* family (`free` / `realloc` / `_msize`): KAME allocs stay
// in the pool (kame.exe `operator new` override), Qt / Ruby / Python allocs
// stay in the CRT, and frees are reconciled wherever they happen via
// `kame_free`'s pool-or-foreign dispatcher.  See
// `kame_pool_win_install_redirect` near the bottom.
//
// FULL interception (`KAMEPOOLALLOC_FULL_INTERCEPT`, default-ON on Windows
// per the §32 block above) extends the IAT patch table to `malloc` and
// `calloc`, so EVERY UCRT-family alloc-family call (kame.exe, Qt, libc++)
// is routed through the pool — a true mimalloc-style drop-in, matching the
// macOS dylib default.  Ruby (msvcrt heap) stays excluded by
// `kame_is_crt_dll`, and `_msize` is co-redirected (the Windows analog of
// macOS `malloc_size`) so size-queries see pool pointers' true capacity.
// Soaked on-target (MinGW64 + lld): kame.exe — Qt + Ruby + Python +
// Create-Driver dialog teardown + load test — clean exit (176 slots / 58
// modules patched); alloc_stress_test 2000-thread / 42 M-op stress PASSES.
// Opt OUT with `-DKAMEPOOLALLOC_CONSERVATIVE_INTERCEPT` (free-family
// reconcile only — the prior 20-yr-stable Qt/Ruby/Python conservative model).
//
// These pointers hold the *genuine* UCRT entry points, resolved once
// from ucrtbase.dll.  The pool's own "forward a foreign pointer to the
// real heap" paths (`libsystem_*_for_pool`) and its region release
// (`free_munmap`) MUST call these — NOT `std::free` / `std::realloc` —
// because once the IAT redirect is installed, kame.exe's own `free`
// import is patched to route back into the pool; a plain `std::free`
// would recurse forever.  Until resolution/install happens these stay
// null and the call sites fall back to `std::*` (still genuine then,
// since nothing is patched yet).
typedef void        (*kame_real_free_fn)(void *);
typedef void       *(*kame_real_realloc_fn)(void *, std::size_t);
typedef void       *(*kame_real_calloc_fn)(std::size_t, std::size_t);
typedef std::size_t (*kame_real_msize_fn)(void *);
typedef void       *(*kame_real_malloc_fn)(std::size_t);
static kame_real_free_fn    g_real_free    = nullptr;
static kame_real_realloc_fn g_real_realloc = nullptr;
static kame_real_calloc_fn  g_real_calloc  = nullptr;
static kame_real_msize_fn   g_real_msize   = nullptr;
static kame_real_malloc_fn  g_real_malloc  = nullptr;

static void kame_resolve_real_crt() noexcept {
    if(g_real_free) return;  // idempotent
    HMODULE h = GetModuleHandleA("ucrtbase.dll");
    if( !h) h = LoadLibraryA("ucrtbase.dll");
    if( !h) h = GetModuleHandleA("msvcrt.dll");
    if( !h) return;
    g_real_free    = reinterpret_cast<kame_real_free_fn>   (GetProcAddress(h, "free"));
    g_real_realloc = reinterpret_cast<kame_real_realloc_fn>(GetProcAddress(h, "realloc"));
    g_real_calloc  = reinterpret_cast<kame_real_calloc_fn> (GetProcAddress(h, "calloc"));
    g_real_msize   = reinterpret_cast<kame_real_msize_fn>  (GetProcAddress(h, "_msize"));
    g_real_malloc  = reinterpret_cast<kame_real_malloc_fn> (GetProcAddress(h, "malloc"));
}
// Installed at pool activation (see `activateAllocator`).  Defined far
// below, after the pool dispatchers (`kame_free` / `kame_realloc`) it
// routes the patched imports to.
extern "C" void kame_pool_win_install_redirect() noexcept;
#endif // _WIN32 redirect bypass pointers

// Per-thread flag: set to true when AllocThreadExitCleanup has run, signalling
// that pool-allocator TLS (s_tls.my_chunk, freelists, pin counts) is no
// longer valid.  Trivially destructible (`ALLOC_TLS` = `__thread`) so it
// survives past all thread_local / pthread_key destructors.  Checked in
// `new_redirected()` to fall back to malloc for any heap operations
// that occur during later TLS cleanup phases (e.g. pthread_key dtors
// like RunnerCounterRegistration).
// (§23) IE-TLS: this flag is on the hot path of `new_redirected_large`
// (every large alloc reads it).  Under default global-dynamic TLS the
// access triggers `__tls_get_addr`, which perf-record measured at ~17 %
// of the 65 KiB tight-loop CPU.  IE-TLS bypasses the GOT round-trip and
// reads via fs:offset directly.  Same single-bool fits the IE budget
// easily; updates happen exactly once per thread (at TLS teardown).
ALLOC_TLS_IE bool s_alloc_tls_off = false;

// Per-thread owner id for the deallocate owner-check fast path.  A
// chunk stamps `m_owner_id = s_tls_owner_id` at allocate_chunk; a
// freeing thread compares its own id to decide owner-side
// (chunk-local freelist push, no atomics) vs cross-thread (batch).
// Assigned once per thread from a global counter on first use; 0 is
// reserved for "unassigned" so a never-allocated thread's frees never
// spuriously match a chunk (chunks always carry a non-zero id).
// (§hot-tls) `g_tls_page` (KameTlsPage: last_region_base + owner_id + m_slots[])
// is defined further down.  We use `kame_page()->owner_id` here.
// See allocator_prv.h for the rationale.
// Global source of per-thread owner ids; starts at 1 because 0 means
// "unassigned".
static std::atomic<uint32_t> s_owner_id_next{1};

// Return the calling thread's owner id, assigning one on first use.
//
// The id is cached in the thread's TLS page (`kame_page()->owner_id`).
// When the cached value is still 0, a fresh id is drawn from the global
// counter; a draw of 0 (the counter wrapped around 32 bits) is discarded
// and retried so the reserved value is never handed out.
static inline uint32_t kame_owner_id() noexcept {
    const uint32_t cached = kame_page()->owner_id;
    if(__builtin_expect(cached != 0, 1))
        return cached;

    uint32_t fresh;
    for(;;) {
        fresh = s_owner_id_next.fetch_add(1, std::memory_order_relaxed);
        if(fresh != 0)
            break;   // 0 is reserved — draw again after a wrap
    }
    kame_page()->owner_id = fresh;
    return fresh;
}

// (§S7) The §36 orphan Treiber-stack packing (biased chunk ptr + 18-bit ABA
// tag in s_orphan_head, ORPHAN_PTR_BIAS / ORPHAN_TAG_MASK) is retired with the
// stack itself — the atomic_shared_ptr orphan chain needs no ABA tag (the
// smart-ptr refcount keeps a popped node alive across the CAS).
static_assert(((size_t)ALLOC_MIN_CHUNK_SIZE % ((size_t)1 << 18)) == 0,
              "ALLOC_MIN_CHUNK_SIZE must be a multiple of 2^18 so the unit "
              "boundary (= biased PoolAllocator ptr) has 18 low zero bits");

// (§28.2 / §28.4 / §28.5) Tier-attribution counters for kame_pool_get_stats.
//
// HISTORY:
//   §28.2 single global atomic per counter — 10x MT regression
//         (cache-line bouncing on every alloc/free).
//   §28.4 LRC_STATS_SHARDS=64 cache-line-aligned shards — fixed up to 64T,
//         but 128T re-introduces 2-way coherence collisions on the dedicated
//         tier's hot path (≈ 17 % drop at 64 KiB / 128T on Ohtaka).
//   §28.5 (this commit) DROP the running counters entirely — they were pure
//         telemetry for `kame_pool_get_stats()`, never used by allocator
//         logic.
//
//   * `dedicated_chunk_bytes` is recomputed on demand by walking the region
//     bitmap + back_offset table (already walked for `chunks_live`).  A
//     bit-7 dedicated marker on a base-unit's back_offset selects the
//     chunk; its DEDICATED_SIZE header field gives the size.  Includes
//     cache-parked dedicated chunks too — see header doc.
//   * `large_alloc_count` / `large_alloc_bytes` use 2 plain global atomics.
//     Large allocs (4..32 MiB) are rare (multi-MiB ⇒ ~kHz/thread at most),
//     so a single cache line is fine — no measurable contention.
// Pointer-width counters so i486 (no CMPXCHG8B) doesn't need libatomic.
// Live values are bounded by VA size on 32-bit (≤ 4 GiB), so size_t fits;
// the 32-bit "transiently negative" concern is replaced by unsigned-wrap
// semantics — a fetch_sub that briefly underflows produces ~SIZE_MAX, which
// the "> cap" pre-push gate naturally rejects (push refused), exactly the
// effect the prior signed `int64_t` clamp produced.
// Number of large allocations currently accounted for, and their total
// mapped size in bytes.  Both are pointer-width and wrap on underflow.
static std::atomic<size_t> g_large_alloc_count{0};
static std::atomic<size_t> g_large_alloc_bytes{0};

// Record one more large allocation of `mmap_size` bytes.
// Both updates are relaxed: the counters are independent tallies.
static inline void stats_inc_large(std::size_t mmap_size) noexcept {
    constexpr std::memory_order kRelaxed = std::memory_order_relaxed;
    g_large_alloc_count.fetch_add(std::size_t{1}, kRelaxed);
    g_large_alloc_bytes.fetch_add(mmap_size, kRelaxed);
}

// Record the removal of one large allocation of `mmap_size` bytes.
// Mirror image of `stats_inc_large`; subtracting more than was added
// wraps around (unsigned arithmetic) rather than going negative.
static inline void stats_dec_large(std::size_t mmap_size) noexcept {
    constexpr std::memory_order kRelaxed = std::memory_order_relaxed;
    g_large_alloc_count.fetch_sub(std::size_t{1}, kRelaxed);
    g_large_alloc_bytes.fetch_sub(mmap_size, kRelaxed);
}

#if KAME_FAST_TSD
// Fast pthread-TSD bypass of macOS TLV thunk for KameTlsPage.
// See header for the design overview.  This global carries the discovered
// byte offset within the pthread struct (= `kame_thread_pointer()`) where
// our pthread_key's TSD slot lives.  Zero means "not yet initialised";
// the hot accessor (`kame_page()` in the header) falls to `tls_page_ie`
// or calls `kame_page_cold()` in that state.
std::size_t s_kame_page_tsd_offset = 0;

namespace {
// pthread key backing the fast-TSD slot.  It is created without a
// destructor (nullptr) and its per-thread value is set to the address of
// that thread's `g_tls_page` — here for the thread running the constructor,
// and in `kame_page_cold` for threads that arrive later.
pthread_key_t s_kame_page_key;

// Constructor priority 101: runs early but after libc/libpthread
// constructors at priorities <= 100.  If pthread_key_create or the
// sentinel scan fails, the offset stays 0 and the allocator stays on
// the TLV path with no further runtime overhead (degraded mode).
//
// Inter-TU ordering: other TUs' constructor(101)s may run before this
// one and call operator new; they hit the TLV/IE fallback (offset == 0),
// which is safe.  Once we run, subsequent allocations on the main
// thread go through fast TSD.  Other threads plant their own TSD slot
// lazily on their first allocation via `kame_page_cold` below.
//
// Summary of what this function does: create a pthread key, discover the
// byte offset (relative to `kame_thread_pointer()`) at which that key's
// value is stored, publish the offset in `s_kame_page_tsd_offset`, and
// point the calling thread's slot at its `g_tls_page`.
__attribute__((constructor(101)))
void kame_tls_init_fast() noexcept {
    char *tp = kame_thread_pointer();
    // Without a thread pointer there is nothing to scan.  The fixed-slot
    // build has no fallback and aborts; the runtime-offset build simply
    // returns, leaving `s_kame_page_tsd_offset` at 0.
    if( !tp) {
#if defined(KAME_FIXED_TSD_SLOT) && (KAME_FIXED_TSD_SLOT)
        fprintf(stderr, "kamepoolalloc FATAL: KAME_FIXED_TSD_SLOT build but "
            "no thread pointer at init.\n");
        abort();
#else
        return;
#endif
    }

#if defined(KAME_FIXED_TSD_SLOT) && (KAME_FIXED_TSD_SLOT)
    // Fixed-slot build (opt-in; see kame_page()).  The hot path baked
    // KAME_FIXED_TSD_SLOT as the TSD byte offset (no runtime
    // `s_kame_page_tsd_offset` load, no offset guard — mimalloc-parity).
    // Force OUR OWN pthread key to land exactly at that slot: allocate
    // keys until the sentinel scan reports the baked offset.  Held probe
    // keys must NOT be deleted mid-spin — `pthread_key_create` hands out
    // the lowest free slot, so deleting one would let the next create
    // reuse it and never advance; delete them only AFTER the hit, to
    // return them to the PTHREAD_KEYS_MAX budget.  No runtime fallback
    // exists (a graceful fast/slow switch costs the hot path, and a
    // dlopen'd interposer does NOT retroactively rebind malloc — both
    // measured), so on overshoot / key exhaustion, fail loudly.
    {
        const std::size_t WANT = (std::size_t)(KAME_FIXED_TSD_SLOT);
        enum { MAX_SPIN = 480 };               // < PTHREAD_KEYS_MAX (512)
        pthread_key_t held[MAX_SPIN];          // probe keys that missed WANT
        int  nheld = 0;
        bool hit = false;
        for(int i = 0; i < (int)MAX_SPIN; ++i) {
            pthread_key_t k;
            if(pthread_key_create(&k, nullptr) != 0) break;   // key exhaustion
            // Unique sentinel per iteration so the scan can never match a
            // previously-held key's slot.
            const uintptr_t sent =
                (uintptr_t)0xDEAD600D11AA0000ull ^ (uintptr_t)(unsigned)i;
            pthread_setspecific(k, (void *)sent);
            // Look for this iteration's sentinel in the first 4 KiB past
            // the thread pointer, one pointer-sized (8-byte) step at a time.
            std::size_t off = 0; bool got = false;
            for(std::size_t o = 0; o < 4096; o += 8)
                if(*reinterpret_cast<uintptr_t *>(tp + o) == sent) {
                    off = o; got = true; break;
                }
            // Exact hit: keep this key as ours (it is NOT added to `held`,
            // so the cleanup loop below does not delete it).
            if(got && off == WANT) { s_kame_page_key = k; hit = true; break; }
            held[nheld++] = k;                 // hold to advance the allocator
            if(got && off > WANT) break;        // overshot — cannot go back
        }
        // Release every probe key that did not land on WANT.
        for(int i = 0; i < nheld; ++i) pthread_key_delete(held[i]);
        if( !hit) {
            fprintf(stderr,
                "kamepoolalloc FATAL: built with -DKAME_FIXED_TSD_SLOT=%zu, but "
                "could not place a pthread TSD key at that slot (overshoot or "
                "key exhaustion) on this runtime. Rebuild with a reachable "
                "KAME_FIXED_TSD_SLOT (probe s_kame_page_tsd_offset for this "
                "OS), or drop the flag for the robust runtime-offset build.\n",
                WANT);
            abort();
        }
        s_kame_page_tsd_offset = WANT;          // cold-path readers (teardown) use it
        // Replace the sentinel with the real per-thread page pointer.
        pthread_setspecific(s_kame_page_key, &g_tls_page);
        tls_page_ie = &g_tls_page;
    }
#else
    if(pthread_key_create(&s_kame_page_key, nullptr) != 0) return;

    // Sentinel scan: plant a magic value via the POSIX API, then walk
    // the pthread struct to find which byte offset received it.  POSIX
    // doesn't expose the layout, but the implementation must store the
    // value somewhere reachable from the thread pointer for
    // `pthread_getspecific` to be fast — we rely on it being a fixed
    // offset, true for both Apple's libc and glibc.
    const uintptr_t sent1 = (uintptr_t)0xDEAD600D11AA1234ull;
    pthread_setspecific(s_kame_page_key, (void *)sent1);

    // NOTE: 0 doubles as "not found" here — a sentinel match at offset 0
    // leaves `off1` at 0, so the loop keeps scanning and, if no later
    // offset matches, the scan is treated as failed.
    std::size_t off1 = 0;
    // 4 KiB upper bound covers all libc TSD layouts we know about
    // (Apple reserves slots 0..N, then user keys start; offsets are
    // typically < 2 KiB).  Stride 8 — slot is a pointer.
    for(std::size_t off = 0; off < 4096 && !off1; off += 8) {
        uintptr_t v = *reinterpret_cast<uintptr_t *>(tp + off);
        if(v == sent1) off1 = off;
    }

    if(off1) {
        s_kame_page_tsd_offset = off1;
        // Plant THIS thread's (= typically the main thread's) TSD slot
        // now so the next allocation hits the fast path on the first try.
        // Touching the __thread struct triggers TLV lazy init; the
        // resulting address is stable for this thread's life.
        pthread_setspecific(s_kame_page_key, &g_tls_page);
        tls_page_ie = &g_tls_page;
    }
    else {
        // Scan failed — leave offset at 0 (degraded TLV-only mode).
        // The sentinel is cleared from the slot; the key itself is not
        // deleted here.
        pthread_setspecific(s_kame_page_key, nullptr);
    }
#endif
}
} // anon namespace

// Cold path for the fast-TSD accessor in the header.  Called when
// either guard branch fails (offset == 0 → pre-init; or TSD slot
// null → first allocation on this thread, plant the pointer).
// `preserve_most` (matching the header decl) tells the caller that
// this call preserves nearly all caller-saved registers.
[[clang::preserve_most]]
__attribute__((cold, noinline))
KameTlsPage *kame_page_cold() noexcept {
    // Step 1 — park the fast-TSD slot (dylib TLV-bootstrap leak fix).
    //
    // The `&g_tls_page` access in step 2 is this thread's first
    // general-dynamic thread_local touch.  In a DYLIB build that makes dyld
    // lazily instantiate the image's whole per-thread TLV block (g_tls_page,
    // the 16 KiB tls_cross_dealloc_batch, the per-ALIGN s_tls and
    // tls_alloc_thread_exit_cleanup — ~32 KiB in total) with one `malloc`,
    // and the dylib interposes that malloc.  If it were served by the pool,
    // a ~32 KiB chunk would end up holding the process's OWN per-thread TLS;
    // dyld frees that block at thread exit outside the pool's per-thread
    // reclaim discipline, so the chunk would never come back — ~8 units
    // leaked PER THREAD, unbounded under thread churn.  (Observed in lldb:
    // deallocate -> kame_page() -> kame_page_cold -> _tlv_get_addr -> dyld
    // instantiateVariable -> malloc -> kame_pool_malloc.)
    //
    // With the slot pointing at g_teardown_page, the re-entrant malloc sees
    // kame_thread_torn_down()==true in cold_first_access /
    // new_redirected_large and is served by libsystem_malloc_for_pool (the
    // real heap).  The TLV block is therefore never pooled and its later
    // free passes straight through (radix ABSENT -> libsystem free).
    //
    // Inline/static builds (production kame.app / kame.exe) reach g_tls_page
    // through initial-exec / static TLS, which the kernel sets up at thread
    // creation without malloc; no re-entry happens there and the parking is
    // inert.
    //
    // Parking is only possible once the slot offset has been discovered.
    char *thread_base = nullptr;
    if(s_kame_page_tsd_offset != 0)
        thread_base = kame_thread_pointer();
    if(thread_base != nullptr) {
        *reinterpret_cast<KameTlsPage **>(thread_base + s_kame_page_tsd_offset)
            = &g_teardown_page;
    }

    // Step 2 — the single GD TLV access; cold, paid once per thread.
    KameTlsPage *const page = &g_tls_page;
    tls_page_ie = page;

    // Pre-init (offset still unknown): there is no TSD slot to plant.
    if(s_kame_page_tsd_offset == 0)
        return page;

    // Step 3 — publish the real page through the POSIX API, then undo the
    // parking from step 1 so the outer caller sees the real page.
    pthread_setspecific(s_kame_page_key, page);
    if(thread_base != nullptr) {
        *reinterpret_cast<KameTlsPage **>(thread_base + s_kame_page_tsd_offset)
            = page;
    }
    return page;
}
#endif // KAME_FAST_TSD

// Forward decl for `drain_thread_slot_freelists` — now a retained no-op
// stub (see its definition).  Owner-thread freelists are no longer in a
// global `g_thread_slots[]` array; each chunk's freelist is chunk-local
// (`m_freelist_head[]`) and is drained per-chunk by
// `release_dll_chunks_for_thread` before that chunk's BIT_OWNED clear.
// Kept as a symbol so the `~AllocThreadExitCleanup` call site stays valid.
namespace { void drain_thread_slot_freelists() noexcept; }

// (§22) Unified per-thread large-recycle cache, shared by BOTH large
// tiers so a tight alloc/free loop of 64 KiB–32 MiB reuses warm VA+pages
// instead of paying the per-cycle release every time:
//   - LRC_CHUNK : §15 dedicated multi-unit chunks (64 KiB–4 MiB).  The
//                 claim bits stay SET while cached (so no other thread can
//                 re-claim the units); reuse returns the payload directly
//                 with the chunk_header intact — NO claim_chunk, NO madvise.
//                 True release (on eviction / thread-exit) = the N-bit
//                 bitmap-CAS claim-clear + madvise inside deallocate_chunk.
//   - LRC_MMAP  : §19 single-mmap large allocs (4 MiB–32 MiB).  The VA
//                 stays mapped while cached (radix CLEARED for double-free
//                 routing); reuse re-registers the radix.  Release = munmap.
// The freeing thread wins the kind's single-winner clearing CAS (bitmap
// N-bit for chunk, radix for mmap) and is thus the unique owner, so the
// deferred release is race-free regardless of thread-exit ordering.  The
// `kind` tag selects the release backend on eviction.  Cache + helpers are
// defined far below (after deallocate_chunk / the mmap helpers); these are
// forward decls so the earlier §15 dedicated-chunk paths can reach them.
namespace {
// Kind tags for entries of the per-thread large-recycle cache:
//   LRC_MMAP  — a single-mmap large allocation,
//   LRC_CHUNK — a dedicated multi-unit chunk.
// The tag is passed to both helpers below as `kind`.
enum { LRC_MMAP = 0, LRC_CHUNK = 1 };
// Forward declaration; the definition is further down in this file.
// NOTE(review): presumably returns a cached block of at least `need` bytes
// of the given `kind`, or null when none is cached — confirm against the
// definition.
char *large_recycle_pop(std::size_t need, unsigned kind) noexcept;
// Forward declaration; the definition is further down in this file.
// NOTE(review): presumably offers the block [`base`, `base` + `size`) of
// the given `kind` to the cache and reports whether it was accepted —
// confirm against the definition.
bool  large_recycle_push(char *base, std::size_t size, unsigned kind) noexcept;
}

// Per-thread cleanup at thread exit.  chunks are no longer
// pinned via atomic counters; this destructor instead walks each
// (ALIGN, FS) template's per-thread DLL (via the registered
// `release_dll_chunks_for_thread` callbacks) and either releases
// empty chunks directly or marks non-empty chunks with
// `BIT_OWNER_EXITED` so cross-thread last-slot-returners can release
// them later.  Capacity covers the count of distinct PoolAllocator
// template instantiations actually in use by this thread.
namespace {
// Per-thread registry of (ALIGN, FS) template teardown callbacks.  Its
// destructor runs at thread exit and performs the pool-allocator TLS
// teardown in a fixed order (see the step comments in the destructor).
struct AllocThreadExitCleanup {
    //! Capacity of `release_fns`; `add()` ignores registrations beyond it.
    static constexpr int MAX = 32;
    // `noexcept` is part of the function-pointer type since C++17 — the
    // dylib + tests + production builds (cmake `-std=gnu++17`, qmake
    // `CONFIG += c++17`) compile at C++17 so this is well-formed and
    // matches the implementation's `noexcept` declaration.
    using ReleaseDllFn = void (*)() noexcept;
    //! Registered callbacks; entries [0, count) are valid.
    ReleaseDllFn release_fns[MAX] = {};
    //! Number of valid entries in `release_fns`.
    int count = 0;
    //! Register a per-template DLL teardown callback.  Called once per
    //! thread per (ALIGN, FS) template from `allocate_chunk_path` on
    //! the first mmap-fresh path entry.  Dedup'd so repeated calls
    //! are O(count) but idempotent.
    //! NOTE: when `count` has already reached MAX the callback is
    //! silently dropped — it will then not run at thread exit.
    void add(ReleaseDllFn fn) noexcept {
        for(int i = 0; i < count; ++i)
            if(release_fns[i] == fn) return;
        if(count < MAX) release_fns[count++] = fn;
    }
    //! Thread-exit teardown.  Steps, in order:
    //!   1. call the (no-op) `drain_thread_slot_freelists()` stub;
    //!   2. null every per-bucket `freelist_head` in the TLS page;
    //!   3. run every registered per-template DLL teardown callback;
    //!   4. set `s_alloc_tls_off`;
    //!   5. (KAME_FAST_TSD only) redirect the page slot to the teardown
    //!      sentinel page.
    ~AllocThreadExitCleanup() noexcept {
        // Step 1.  `drain_thread_slot_freelists()` is a retained no-op
        // stub now (see its definition); the per-chunk freelist drain
        // has been folded into the per-template DLL walk below
        // (`release_dll_chunks_for_thread`), which drains each chunk's
        // freelist right before clearing its BIT_OWNED.  Call kept for
        // call-site / ABI stability.
        drain_thread_slot_freelists();
        // Step 2.  (§12.3 / §hot-tls) Clear every per-bucket
        // freelist-ptr slot in the TLS page BEFORE the DLL teardown
        // walk.  Otherwise a later TLS destructor that allocates could
        // route through a chunk that's about to be released.  After
        // this loop a null entry makes `new_redirected` fall to
        // `cold_first_access`, which observes `s_alloc_tls_off == true`
        // (set in step 4 below) and returns `std::malloc(size)`.
        // Use tls_page_ie for the drain path (TSD slot may already be
        // cleared at thread-exit; IE fallback is safe here).
        {
#if KAME_FAST_TSD
            KameTlsPage *pg = tls_page_ie ? tls_page_ie : &g_tls_page;
#else
            KameTlsPage *pg = &g_tls_page;
#endif
            for(int b = 0; b < ALLOC_NUM_BUCKETS; ++b)
                pg->m_slots[b].freelist_head = nullptr;
        }
        // Step 3.  Walk each registered template's per-thread DLL.  Each
        // callback wipes its own `s_tls.my_chunk` / `s_tls.dll_head` / `s_tls.dll_tail`
        // first, then iterates with cached-next, setting BIT_OWNER_EXITED
        // on non-empty chunks and releasing empties directly via
        // BIT_RELEASED CAS.  See
        // `PoolAllocator<>::release_dll_chunks_for_thread` for details.
        for(int i = 0; i < count; ++i)
            release_fns[i]();
        // Step 4.  Signal that pool-allocator TLS is dead.  Read by
        // `is_allocator_thread_active()` from later (pthread_key) TLS
        // dtors.  `new_redirected` itself no longer checks this flag —
        // the per-bucket slot rewrite above is its analogue.
        s_alloc_tls_off = true;
        // Step 5.  (§hot-tls teardown sentinel) Point this thread's
        // fast-TSD page slot at the static teardown sentinel.  After
        // this, any later pthread_key dtor (e.g. libc++
        // ~__thread_struct) that frees a pool pointer reaches
        // `deallocate` → owner-id mismatch (sentinel owner_id == 0) →
        // cold `deallocate_pooled`, which identity-compares
        // `kame_page() == &g_teardown_page` and takes a TLS-free path
        // WITHOUT re-touching `s_tls` / `&s_tls.dll_head` — whose TLV may
        // already be finalized, so a `_tlv_get_addr` re-instantiation would
        // `malloc` mid-teardown and trap.  This write is value-only (a
        // pthread TSD slot store, legal during cleanup) — no TLV deref.
        //
        // macOS-only: the sentinel exists solely to give the fast-TSD
        // `kame_page()` a teardown-safe value to return.  On Linux/Windows
        // `tls_page_ie` does not exist (the page is read directly as IE TLS)
        // and `kame_thread_torn_down()` uses the teardown-safe `s_alloc_tls_off`
        // flag set above — nothing to redirect here.
#if KAME_FAST_TSD
        tls_page_ie = &g_teardown_page;
        if(s_kame_page_tsd_offset != 0) {
            // Store the sentinel both through the pthread API and
            // directly into the thread-pointer-relative slot.
            pthread_setspecific(s_kame_page_key, &g_teardown_page);
            char *tp = kame_thread_pointer();
            if(tp)
                *reinterpret_cast<KameTlsPage **>(tp + s_kame_page_tsd_offset)
                    = &g_teardown_page;
        }
#endif
    }
};
// Raw `thread_local` — the kamepoolalloc dylib boundary already
// ensures a single shared instance across all plugin DLLs/dylibs
// that link against us, so the cross-DLL slot-sharing concern that
// motivated `XThreadLocal` upstream is gone.
//
// First-touch re-entry safety: C++ thread_local lazy init on macOS
// uses `tlv_allocate_and_initialize_for_key` (libsystem) for the
// storage, and `__cxa_thread_atexit` registers the dtor via
// libcxxabi's `malloc` — both libsystem-malloc paths.  Neither
// recurses into our pool, so first-touch from `allocate()` is safe.
//
// Destruction order: C++ destroys thread_locals in reverse order of
// construction completion.  `AllocThreadExitCleanup` is touched first
// (via `tls_alloc_thread_exit_cleanup.add(...)` in the allocate() hot path),
// `CrossDeallocBatch` second (via `push(...)` in deallocate); so the
// batch is flushed before AllocThreadExitCleanup tears down chunks — the
// ordering invariant the previous XThreadLocal PerThread LIFO chain
// guaranteed.
// NOTE(review): this relies on the thread touching
// `tls_alloc_thread_exit_cleanup` before `tls_cross_dealloc_batch`.  A
// thread whose first pool operation is a cross-thread free would
// presumably touch the batch first and reverse the order — confirm
// against the deallocate path.
thread_local AllocThreadExitCleanup tls_alloc_thread_exit_cleanup;

// Cross-thread dealloc batch — a per-thread array of
// `CrossDeallocEntry {chunk, slot}` pairs (a slot pointer together with
// its owning chunk).  After sorting, each chunk's entries form one
// contiguous run of the array, which is handed to
// `chunk->batch_return_to_bitmap` in place, without an intermediate
// copy.  (Earlier revisions of this comment described a parallel-array /
// SoA layout; the struct below stores the pairs in a single array.)
//
// On flush:
//   1. Sort the entries by (chunk, slot) lexicographically — chunk
//      primary key for grouping, slot pointer secondary key so each
//      per-chunk run is pointer-sorted (= m_flags-word-index-sorted).
//      `flush()` does this with `std::sort`.  (An earlier revision used
//      an in-place, swap-based insertion sort, which was the right
//      choice at CAP=16: O(n²/2) ≈ 128 compares worst, but
//      branch-friendly and cache-warm on the tiny arrays.)
//   2. Walk chunk runs, handing each `chunk->batch_return_to_bitmap`
//      the start of its run (`&buf[i]`).  The run ends at the next
//      chunk or at the trailing sentinel entry, and the call returns
//      the number of entries it consumed.  The chunk's bitmap clear
//      (in `batch_clear_impl`) walks the sorted slots once, merging
//      adjacent same-word slots into one CAS — O(n) total, no
//      temporary allocation, no m_count-proportional bookkeeping.
//
// Why batching beats CAP=1 here despite the earlier ohtaka result:
// the old `batch_clear_impl` paid O(m_count) bookkeeping per call
// regardless of n, so n=1 calls were ~150 cycles of pure overhead
// per slot.  Now the bookkeeping is O(n) (slot-walk + adjacent
// same-word merge), so n>1 wins purely from coalesced CAS reduction
// whenever slots happen to share an m_flags word.
//
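// CAP=16 was chosen by an earlier sweep (HWM trade-off — see git log).
// The O(n) impl removes the throughput cost curve, which made the value
// re-tunable; the value currently in effect is `CrossDeallocBatch::CAP`
// (1024), documented inside the struct below.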
// Per-thread accumulator for cross-thread frees.  Entries are
// `{chunk, slot}` pairs held in `buf`; `flush()` sorts them and returns
// each chunk's run to that chunk via `batch_return_to_bitmap`.  The
// destructor flushes whatever is still held.
struct CrossDeallocBatch {
    // FS=true-only small-slot batch (FS=false bypasses
    // cross-batch entirely in its `deallocate_pooled` — see that
    // function for rationale).  FS=true buckets are ALIGN==SIZE
    // (16..240 B), one bit per slot in m_flags ⇒ up to 64 slots per
    // FUINT word.  Cross-thread frees of small slots are numerous AND
    // their chunks tend to repeat (a few hot per-size-class chunks
    // serve most allocs), so a deep accumulation window catches
    // same-chunk same-word "buddies" arriving over time → at flush,
    // sort + adjacent-merge coalesces them into one CAS per word.
    //
    // CAP=1024 chosen for L1d-resident accumulation:
    //   16 B / entry × 1025 entries = 16400 B ≈ 16 KiB.
    // Most modern L1d is 32-64 KiB; the buf fits with room for other
    // working set.  Per-thread; 128 threads × 16 KiB = 2 MiB total —
    // acceptable for the throughput win expected on NUMA.
    //
    // Sort cost (~20000 cycles for 1024 entries) amortised over
    // 1024 pushes ≈ 20 cycles/push — break-even with CAP=1
    // direct dispatch IF average coalescing factor > 1.08 (saves >
    // 8 % of CAS, which at ~250 cycles per cross-socket CAS = 20
    // cycles/push).  Realistic FS=true workload (STM Payload deep-
    // copies, identical-size objects from a few chunks) should
    // comfortably exceed this.
    static constexpr int CAP = 1024;
    //! Held entries live in [0, count); `buf[count]` receives the
    //! nullptr sentinel during `flush()`.
    CrossDeallocEntry buf[CAP + 1];   // +1 = sentinel slot
    //! Number of held entries.
    int               count = 0;

    //! FS=true path: hold and batch.  Caller passes its own `this`
    //! as `c` (the chunk).  Flushes first when the buffer is full, so
    //! `count` never exceeds CAP.
    void push(PoolAllocatorBase *c, void *s) noexcept {
        if(count == CAP) flush();
        buf[count++] = {c, s};
    }

    //! Direct/adaptive dispatch path — FS=true only (FS=false bypasses
    //! cross-batch entirely in `deallocate_pooled` and never reaches
    //! this template).
    //!
    //! FS=true: adaptive.  Reads the chunk's `m_last_coalesce_x16`
    //! hint (relaxed); routes to hold when ≥ per-ALIGN threshold
    //! (compile-time folded), else direct.  Epsilon-greedy explore
    //! force-holds once per `EXPLORE_PERIOD` to re-measure chunks
    //! whose hint dropped below threshold.
    //!
    //! FS=true thresholds (compile-time tiers):
    //!
    //!   ALIGN ≤  64  → 20  (1.25×)
    //!   ALIGN ≤ 128  → 24  (1.50×)
    //!   ALIGN ≤ 256  → 29  (1.81×)
    //!   ALIGN >  256 → 35  (2.19×)
    //!
    //! Not static — the explore counter lives in the per-thread
    //! batch instance, naturally TLS-local.
    static constexpr int EXPLORE_PERIOD = 128;
    //! Calls to `push_direct` since the last forced hold.
    int explore_counter = 0;

    //! Returns slot `s` of chunk `c` either by holding it in the batch
    //! (`push`) or by an immediate single-entry `batch_return_to_bitmap`.
    //! \tparam ALIGN bucket alignment; selects the hold threshold and
    //!         the `PoolAllocator<ALIGN, true, true>` instantiation used
    //!         for the DLL-walk reset.
    template <unsigned ALIGN>
    void push_direct(PoolAllocatorBase *c, void *s) noexcept {
        constexpr uint8_t threshold_x16 =
            (ALIGN <=  64) ? 20 :
            (ALIGN <= 128) ? 24 :
            (ALIGN <= 256) ? 29 : 35;
        bool hold;
        if(++explore_counter >= EXPLORE_PERIOD) {
            explore_counter = 0;
            hold = true;                                // explore
        }
        else {
            hold = c->m_last_coalesce_x16.load(
                       std::memory_order_relaxed) >= threshold_x16;
        }
        if(hold) {
            push(c, s);
            return;
        }
        // Direct path: one real entry followed by the nullptr sentinel
        // that terminates the chunk-side walk.
        CrossDeallocEntry tmp[2] = {{c, s}, {nullptr, nullptr}};
        // (§20) Cache the dll-cursor-reset addresses BEFORE
        // batch_return_to_bitmap.  If this is `c`'s last slot AND
        // BIT_OWNED is clear (owner exited), batch_return releases the
        // chunk: the placement-new destructor runs, and `c` becomes a
        // stale pointer — accessing `c->m_owner_dll_head_addr` /
        // `c->m_owner_dll_force_walk_ptr` afterwards is UB by C++'s
        // object-lifetime rule (UBSAN's vptr check fires under
        // -fsanitize=undefined).  The fields are write-once at chunk
        // construction so the cached values are safe across the call.
        void *cached_dll_head_addr = c->m_owner_dll_head_addr;
        auto *cached_force_walk =
            c->m_owner_dll_force_walk_ptr.load(std::memory_order_acquire);
        c->batch_return_to_bitmap(tmp);
        // (§20) `c` may be destructed past this point — use cached values only.
        // If the chunk's recorded DLL head address is this thread's own,
        // reset this thread's walk state directly; otherwise poke the
        // owner's force-walk flag.
        if(cached_dll_head_addr ==
           PoolAllocator<ALIGN, true, true>::dll_head_tls_addr())
            PoolAllocator<ALIGN, true, true>::reset_dll_walk_state();
        else if(cached_force_walk)
            // Acquire load (above) synchronises with owner-exit's
            // release-store of nullptr in
            // `release_dll_chunks_for_thread`.  Null after owner exit
            // → skip deref; non-null means owner's TLS storage is
            // still live (owner-exit nullifies BEFORE thread teardown).
            cached_force_walk->store(true, std::memory_order_relaxed);
    }

    //! Sorts the held entries, returns each chunk's run to that chunk,
    //! pokes each owner's force-walk flag, and empties the batch.
    //! No-op when nothing is held.
    void flush() noexcept {
        if(count == 0) return;
        // Sort by (chunk, slot) lex — chunk primary key for grouping,
        // slot pointer secondary key so each chunk run is pointer-
        // ascending (= m_flags-word-ascending).  std::sort introsort,
        // no heap, in-place swap-based.
        std::sort(buf, buf + count,
                  [](const CrossDeallocEntry &a, const CrossDeallocEntry &b) {
                      if(a.chunk != b.chunk) return a.chunk < b.chunk;
                      return a.slot < b.slot;
                  });
        // Plant the sentinel after the live count so the chunk-side
        // walk terminates by `entries[k].chunk == this` failing,
        // without a length check.
        buf[count] = {nullptr, nullptr};
        // Walk chunk runs.  `batch_return_to_bitmap` consumes the run
        // starting at `&buf[i]` (entries[k].chunk == this until
        // sentinel / next chunk), returns the count, caller advances.
        //
        // For each unique chunk the batch returned slots to: signal the
        // OWNER thread's "force walk from head" hint flag.  Without this
        // poke, the owner's `allocate_chunk_path` Phase 2 DLL walk stays
        // gated by its own `dll_exhausted` flag (set after the previous
        // walk found no space) and keeps mmap'ing fresh chunks instead
        // of reusing the slots we just returned.  The single-slot
        // `push_direct` path already does this; the batched flush path
        // skipped it — caught by `bench_xthread_pool -w2 -s64` where the
        // pool inflated +32 regions (1 GiB VA) over a 5-second run.
        //
        // Cache `m_owner_dll_force_walk_ptr` BEFORE
        // `batch_return_to_bitmap`: the call may release the chunk on
        // last-slot return + owner-exit, after which `chunk` is a stale
        // pointer.  The owner's TLS storage that the cached ptr targets
        // lives independently of the chunk; null after owner-exit's
        // release-store, so the post-call deref is safe-or-skipped.
        int i = 0;
        while(i < count) {
            PoolAllocatorBase *chunk = buf[i].chunk;
            std::atomic<bool> *cached_force_walk =
                chunk->m_owner_dll_force_walk_ptr.load(
                    std::memory_order_acquire);
            i += chunk->batch_return_to_bitmap(&buf[i]);
            if(cached_force_walk)
                cached_force_walk->store(true, std::memory_order_relaxed);
        }
        count = 0;
    }
    //! Flushes any entries still held when the thread-local instance is
    //! destroyed.
    ~CrossDeallocBatch() noexcept { flush(); }
};
// Per-thread batch instance.  Entries still held when the thread exits
// are flushed by `~CrossDeallocBatch`.
thread_local CrossDeallocBatch tls_cross_dealloc_batch;

// drain_thread_slot_freelists (defined below) is a retained no-op stub.
// Owner-thread freelists are chunk-local and drained per-chunk by
// `release_dll_chunks_for_thread` (per-template DLL walk) before each
// chunk's BIT_OWNED clear — see that function and the stub's own comment.
//
// Historical note (from when this function still returned slots via
// `batch_return_to_bitmap`): each touched chunk still had
// `BIT_OWNER_EXITED == 0` at this point (the per-template DLL walk that
// sets it runs AFTER `drain_thread_slot_freelists` in
// `~AllocThreadExitCleanup`), so the cross_release inside
// batch_return_to_bitmap returned false — the owner thread (us) was
// still alive, no release allowed.
void drain_thread_slot_freelists() noexcept {
    // Intentionally empty — a no-op since follow-up "(1)": owner-thread
    // freelists are now chunk-local (PoolAllocatorBase::m_freelist_head[]),
    // not in the global g_thread_slots[] array.  Each chunk's freelist is
    // drained by `release_dll_chunks_for_thread` (per-template DLL walk)
    // right before that chunk's BIT_OWNED clear — see there.  Kept as an
    // empty symbol so the ~AllocThreadExitCleanup call site and any
    // external references stay valid; the compiler elides it.
    //
    // Historical notes on the removed implementation (nothing below is
    // executed any more):
    //   - It used a single-slot scratch + trailing nullptr sentinel to
    //     satisfy `batch_return_to_bitmap`'s `entries[k].chunk == this`
    //     walk contract (one matching entry, then the sentinel
    //     terminates).
    //   - freelists hold p_user pointers for BOTH FS=true and FS=false
    //     (the "borrow scheme" puts FS=false's user pointer at
    //     slot_start, same convention as FS=true).
    //     `batch_return_to_bitmap` and its MaskFn both work on
    //     `entries[k].slot == p_user` directly — for FS=false they read
    //     the `{bucket, SIZE}` header from `p_user - 8` (chunk_header
    //     pad for slot 0, predecessor's reserved tail otherwise).  No
    //     per-FS conversion needed.
}

} // anon namespace

// Atomic helpers moved to allocator_prv.h so the header-inlined
// `batch_clear_impl` template member of PoolAllocator can use them.
#if defined __WIN32__ || defined WINDOWS || defined _WIN32
#else
    #include <sys/mman.h>
#endif
#include <sys/types.h>

// `count_bits` and `find_zero_forward` are now in allocator_prv.h
// (header-visible for inline use by FS=false bucket-freelist push).
// Reference: H. S. Warren, Jr., "Beautiful Code", O'Reilly.

//! Isolates the lowest set bit of \a x.
//! \return a value with only the least-significant 1 bit of \a x set,
//!         or 0 when \a x is 0.
template <typename T>
inline T find_one_forward(T x) {
	// Two's-complement negation: every bit above the lowest 1 is
	// inverted, so AND-ing with x leaves that single bit.
	const auto negated = ~x + 1u;
	return x & negated;
}

//! Smears every set bit of \a x downwards so that bit i of the result
//! is the OR of bits i .. i+X-1 of \a x.  O(log X) shift/OR steps.
//! \param X      width of the window (number of zeros to be looked for).
//! \param SHIFTS initial shift distance; callers start with 1.
//! \param x      the word to fold.
//! \return the folded word.
template<typename T>
inline T fold_bits(unsigned int X, unsigned int SHIFTS, T x) {
	// Each round doubles the shift distance; a round is skipped (and the
	// fold ends) as soon as the window is narrower than twice the shift.
	for(unsigned int step = SHIFTS; X >= 2 * step; step *= 2) {
		x = (x >> step) | x;
		// An extra fold for this power of two when X has that bit set.
		if(X & step)
			x = (x >> step) | x;
		// Never shift by the full word width or more.
		if( !(2 * step < sizeof(T) * 8))
			break;
	}
	return x;
}

//! Bit scan forward: the number of zero bits below the lowest set bit.
//! \param x should be 2^n (a single set bit); must not be 0.
//! \return the index n of that bit.
//! \sa find_zero_forward(), find_one_forward().
//!
//! On GCC/Clang this is `__builtin_ctzll` (`bsf`/`tzcnt` on x86,
//! `rbit;clz` on ARM64), which covers every arch the pool allocator
//! supports.  The former x86 inline-asm form and the portable
//! `count_bits(x - 1)` form remain as fallbacks for other toolchains.
template <typename T>
inline unsigned int count_zeros_forward(T x) {
#if defined(__GNUC__) || defined(__clang__)
    // Widen first so one builtin serves every integer width.
    const auto widened = static_cast<unsigned long long>(x);
    return __builtin_ctzll(widened);
#elif defined __i386__ || defined __i486__ || defined __i586__ || defined __i686__ || defined __x86_64__
	T ret;
	asm ("bsf %1,%0": "=q" (ret) : "r" (x) :);
	return ret;
#else
	// For a single set bit, x - 1 has exactly n ones.
	return count_bits(x - 1);
#endif
}

//template <int X, typename T>
//inline T find_training_zeros_tedious(T x) {
//	T ret = ((T)1u << X) - 1u;
//	while(x & ret)
//		ret = ret << 1;
//	ret = find_one_forward(ret);
//	if(ret > (T)1u << (sizeof(T) * 8 - X)) return 0; //checking if T has enough space in MSBs.
//	return ret;
//}

//! Finds a run of \a X consecutive zero bits in \a x, searching from the
//! LSB, using an O(log n) fold.
//! \param X number of zeros to be looked for.
//! \param x the word to search.
//! \return one bit at the LSB of the run if enough zeros are found,
//!         otherwise 0.
template<typename T>
inline T find_training_zeros (int X, T x) {
	if(X == sizeof(T) * 8) {
		//a trivial case: the whole word has to be zero.
		return !x ? 1u : 0u;
	}
	//after folding, a zero bit marks the start of X zeros in the input.
	const T folded = fold_bits(X, 1, x);
	if(folded == ~(T)0u)
		return 0; //already filled.
	const T candidate = find_zero_forward(folded); //picking the first zero from LSB.
	//checking if T has enough space in MSBs.
	const auto highest_start = (T)1u << (sizeof(T) * 8 - X);
	if(candidate > highest_start)
		return 0;
	return candidate;
}

//! Allocates a region with \a size usable bytes, preceded by an
//! ALLOC_ALIGNMENT-byte header whose first `size_t` records the total
//! length (`size + ALLOC_ALIGNMENT`).  Release with free_munmap().
//! \param size number of usable bytes requested.
//! \return pointer just past the header, or nullptr when the request
//!         overflows `size_t` or the backing allocation fails.  (Builds
//!         with assertions enabled still abort on an mmap failure.)
inline void *malloc_mmap(size_t size) {
//		fprintf(stderr, "mmap(), %d\n", (int)size);
		// Header + payload; reject requests whose total wraps around.
		const size_t total = size + ALLOC_ALIGNMENT;
		if(total < size)
			return nullptr;
#if defined __WIN32__ || defined WINDOWS || defined _WIN32
        // Genuine UCRT malloc — NOT the redirected `malloc`.  This IS the
        // pool's region-backing allocator; under KAMEPOOLALLOC_FULL_INTERCEPT
        // a plain `malloc` is IAT-patched to route back into the pool, which
        // would recurse infinitely here (pool → region claim → malloc_mmap →
        // malloc → pool ...).  Mirrors free_munmap's g_real_free.
        // g_real_malloc is resolved before the redirect installs, so the only
        // pre-resolution callers (none on the region path) fall to std::malloc.
        //
        // Allocate `total`, not `size`: the returned pointer is offset by
        // ALLOC_ALIGNMENT, so the header needs its own room — same as the
        // mmap branch.
        void *p = g_real_malloc ? g_real_malloc(total) : malloc(total);
        if( !p)
            return nullptr;
#else
		void *p = (
			mmap(0, total, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0));
		assert(p != MAP_FAILED);
		// With NDEBUG the assert vanishes; never write through MAP_FAILED.
		if(p == MAP_FAILED)
			return nullptr;
#endif
		*static_cast<size_t *>(p) = total;
		return static_cast<char *>(p) + ALLOC_ALIGNMENT;
}
//! Releases a region obtained from malloc_mmap().
//! \param p pointer returned by malloc_mmap(); nullptr is accepted and
//!          ignored, like `free(nullptr)`.
inline void free_munmap(void *p) {
		if( !p)
			return;
		// Step back over the ALLOC_ALIGNMENT-byte header to the real base.
		p = static_cast<void *>(static_cast<char *>(p) - ALLOC_ALIGNMENT);
	//	fprintf(stderr, "unmmap(), %d\n", (int)size);
#if defined __WIN32__ || defined WINDOWS || defined _WIN32
        // Genuine UCRT free — NOT the redirected `free`.  This is pool-
        // backing region memory (malloc'd in mmap_new_region); routing it
        // through the pool dispatcher post-redirect would misclassify it.
        // g_real_free is resolved before the redirect is ever installed.
        if(g_real_free) g_real_free(p);
        else free(p);
#else
        // The header's first size_t holds the total mapped length.
        const size_t size = *static_cast<size_t *>(p);
        int ret = munmap(p, size);
		assert( !ret);
		(void)ret; // silence the unused-variable warning under NDEBUG.
#endif
}

// Pool activation flag; starts false.  In KAMEPOOLALLOC_DYLIB builds it
// is set to true by the load-time activators defined below.
// NOTE(review): how it is set in non-dylib builds is not visible here
// (presumably by `activateAllocator()`) — confirm.
bool g_sys_image_loaded = false;

#if defined(KAMEPOOLALLOC_DYLIB)
// Dylib mode: auto-activate at dylib load.  `__attribute__((constructor))`
// with the priority slot we already use for `kame_tls_init_fast` (101)
// runs after libc/libpthread (which use ≤100) but before any consumer
// image's static-init — so by the time `main()` is reached, every
// `operator new` call is fully pool-routed.  No `activateAllocator()`
// call from user code is necessary; `KamePooledAllocGuard` and the
// per-test `tests/allocator.cpp` activator shim are correspondingly
// elided in dylib builds (see `KAMEPOOLALLOC_DYLIB` branches in
// `allocator.h`, and the dropped `support_SRCS` entry in
// `tests/CMakeLists.txt`).
//
// `[[gnu::used]]` keeps the symbol against `-fdata-sections / -ffunction-
// sections` + GC, and also against lld's static-internal-with-no-explicit-
// reference pruning.  Reported (MinGW64 + lld): the DLL would build
// fine, but `g_sys_image_loaded` stayed `false` at runtime — pool fell
// through to libc malloc and no `Reserve swap space` ever printed.  The
// backup static-init activator below also covers the case where lld
// emits the constructor record but Windows CRT init doesn't pick it up.
// Load-time activator (constructor priority 101): on Windows it first
// installs the free-family redirect, then sets `g_sys_image_loaded`.
[[gnu::used]] __attribute__((constructor(101)))
static void kamepoolalloc_auto_activate() noexcept {
#if defined(_WIN32) || defined(__WIN32__) || defined(WINDOWS)
    // (§31) Hook the free-family across all modules BEFORE flipping the
    // flag, so the very first pool pointer the pool hands out is already
    // safe to free from any module (Qt / libc++ / kamestm).  Safe to call
    // from this constructor even when it runs in DllMain under the loader
    // lock: the module sweep walks the PEB list (no loader re-entry) — see
    // kame_patch_all_modules.  Idempotent + KAME_POOL_WIN_REDIRECT=0.
    kame_pool_win_install_redirect();
#endif
    // Order matters: the flag is flipped only after the redirect (if any)
    // is in place.
    g_sys_image_loaded = true;
}
// Backup: a file-scope global whose default constructor unconditionally
// flips the flag.  Static-init ordering is unspecified relative to other
// TUs, but `g_sys_image_loaded = true;` has no other-init dependency
// (just a plain bool store).  Static-init runs reliably on every linker
// we care about — including the Windows path where the priority-tagged
// `__attribute__((constructor))` record may be silently dropped.
namespace {
struct KamePoolAutoActivator {
    KamePoolAutoActivator() noexcept {
#if defined(_WIN32) || defined(__WIN32__) || defined(WINDOWS)
        kame_pool_win_install_redirect();  // see kamepoolalloc_auto_activate
#endif