From 63aa7b01c8b27d9c0079129585b48d4a38bc3489 Mon Sep 17 00:00:00 2001 From: Arun KV <65647132+arun-kv@users.noreply.github.com> Date: Wed, 14 May 2025 09:24:36 +0530 Subject: [PATCH 01/13] SSV-24782:Create MSVC-specific wrapper library for reading CR8 register (#101) * Create MSVC-specific wrapper library for reading CR8 register Added a new C file that defines read_cr8_msvc(), which wraps the MSVC intrinsic __readcr8() to read the CR8 register, since Clang and GCC do not provide __readcr8(). This function is now compiled into a separate library spl_cr_wrappers, allowing Clang projects to call read_cr8_msvc() without directly depending on MSVC-specific intrinsics in Clang code. --- module/os/windows/spl/CMakeLists.txt | 36 ++++++++++++++++++++++++- module/os/windows/spl/spl-windows.c | 2 +- module/os/windows/spl/spl_cr_wrappers.c | 21 +++++++++++++++ module/os/windows/zfs/CMakeLists.txt | 2 +- 4 files changed, 58 insertions(+), 3 deletions(-) create mode 100644 module/os/windows/spl/spl_cr_wrappers.c diff --git a/module/os/windows/spl/CMakeLists.txt b/module/os/windows/spl/CMakeLists.txt index b63c54e57921..b1840072271d 100644 --- a/module/os/windows/spl/CMakeLists.txt +++ b/module/os/windows/spl/CMakeLists.txt @@ -40,6 +40,40 @@ wdk_add_library(splkern ${TMH_FILE_LIST} ) +target_include_directories(splkern BEFORE PUBLIC "${CMAKE_SOURCE_DIR}/include/os/windows/" PUBLIC "${CMAKE_SOURCE_DIR}/include/os/windows/spl") # set(CMAKE_TOOLCHAIN_FILE $ENV{CMAKE_TOOLCHAIN_FILE}) -target_include_directories(splkern BEFORE PUBLIC "${CMAKE_SOURCE_DIR}/include/os/windows/" PUBLIC "${CMAKE_SOURCE_DIR}/include/os/windows/spl") +find_program(MSVC_CL_EXECUTABLE cl.exe) +if(NOT MSVC_CL_EXECUTABLE) + message(FATAL_ERROR "MSVC_CL_EXECUTABLE is not set! 
Make sure to run CMake with a configured compiler.") +else() + message(STATUS "MSVC_CL_EXECUTABLE is set to: ${MSVC_CL_EXECUTABLE}") +endif() + +find_program(MSVC_LIB_EXECUTABLE lib.exe) +if(NOT MSVC_LIB_EXECUTABLE) + message(FATAL_ERROR "MSVC lib.exe not found in PATH. Ensure you're using Visual Studio Command Prompt or MSVC toolchain.") +else() +message(STATUS "MSVC_LIB_EXECUTABLE is set to: ${MSVC_LIB_EXECUTABLE}") +endif() + +set(CR_WRAPPERS_DIR "${CMAKE_BINARY_DIR}/module/os/windows/spl") +file(MAKE_DIRECTORY "${CR_WRAPPERS_DIR}") + +set(CR_WRAPPERS_OBJ "${CR_WRAPPERS_DIR}/spl_cr_wrappers.obj") +set(CR_WRAPPERS_LIB "${CR_WRAPPERS_DIR}/spl_cr_wrappers.lib") + +add_custom_command( + OUTPUT ${CR_WRAPPERS_LIB} + COMMAND ${MSVC_CL_EXECUTABLE} /nologo /c /Fo"${CR_WRAPPERS_OBJ}" ${CMAKE_SOURCE_DIR}/module/os/windows/spl/spl_cr_wrappers.c + COMMAND ${MSVC_LIB_EXECUTABLE} /nologo /OUT:${CR_WRAPPERS_LIB} /MACHINE:x64 ${CR_WRAPPERS_OBJ} + WORKING_DIRECTORY ${CR_WRAPPERS_DIR} + DEPENDS ${CMAKE_SOURCE_DIR}/module/os/windows/spl/spl_cr_wrappers.c + COMMENT "Building ${CR_WRAPPERS_LIB} in ${CR_WRAPPERS_DIR} using MSVC" +) + +add_custom_target(cr_wrappers ALL DEPENDS ${CR_WRAPPERS_LIB}) +add_library(spl_cr_wrappers STATIC IMPORTED GLOBAL) +set_target_properties(spl_cr_wrappers PROPERTIES + IMPORTED_LOCATION ${CR_WRAPPERS_LIB}) + diff --git a/module/os/windows/spl/spl-windows.c b/module/os/windows/spl/spl-windows.c index ae3135208fef..e6b31edac923 100644 --- a/module/os/windows/spl/spl-windows.c +++ b/module/os/windows/spl/spl-windows.c @@ -73,7 +73,7 @@ uint32_t spl_hostid = 0; uint64_t __readcr8(void) { - return (0ULL); + return (read_cr8_msvc()); } unsigned long diff --git a/module/os/windows/spl/spl_cr_wrappers.c b/module/os/windows/spl/spl_cr_wrappers.c new file mode 100644 index 000000000000..529fe45df92c --- /dev/null +++ b/module/os/windows/spl/spl_cr_wrappers.c @@ -0,0 +1,21 @@ + +#ifdef _MSC_VER +#include +#include + +// we can use the MSVC-specific intrinsic 
__readcr8() to read the value of the CR8 +// register directly. This intrinsic is part of MSVC's built-in functions, +// which allows us to access hardware-level registers without writing assembly code. + +// Clang does not support the __readcr8 intrinsic, as it is specific to MSVC. +// Clang does not have a direct equivalent +// for accessing the CR8 register via a built-in function. Therefore, if we are using +// Clang, we must either use inline assembly or a different method to access the register. +// https://learn.microsoft.com/en-us/cpp/intrinsics/readcr8?view=msvc-170 + +__declspec(dllexport) uint64_t read_cr8_msvc(void) { + return __readcr8(); +} +#else +#error "_MSC_VER not defined" +#endif diff --git a/module/os/windows/zfs/CMakeLists.txt b/module/os/windows/zfs/CMakeLists.txt index 4a778d957f8f..ce832e0bb43c 100644 --- a/module/os/windows/zfs/CMakeLists.txt +++ b/module/os/windows/zfs/CMakeLists.txt @@ -47,4 +47,4 @@ zvol_os.c ${TMH_FILE_LIST} ) -target_link_libraries(zfskern_os PRIVATE splkern icpkern) +target_link_libraries(zfskern_os PRIVATE splkern icpkern spl_cr_wrappers) From b461b51a10098b7784907cf28b5f707790fc09ca Mon Sep 17 00:00:00 2001 From: Arun KV Date: Tue, 6 Jan 2026 14:09:28 +0530 Subject: [PATCH 02/13] Added support to prealloc the abd cache Registry 'zfs_abd_prealloc_percent' can be used to set the percent of prealloc wrt zfs_arc_max --- include/os/windows/zfs/sys/kstat_windows.h | 2 + module/os/windows/spl/spl-kmem.c | 55 ++++++++++++++++++++++ module/os/windows/zfs/arc_os.c | 2 + module/os/windows/zfs/zfs_kstat_windows.c | 7 ++- module/os/windows/zfs/zvol_os.c | 1 + 5 files changed, 66 insertions(+), 1 deletion(-) diff --git a/include/os/windows/zfs/sys/kstat_windows.h b/include/os/windows/zfs/sys/kstat_windows.h index 644b01354177..3d1a510b9d06 100644 --- a/include/os/windows/zfs/sys/kstat_windows.h +++ b/include/os/windows/zfs/sys/kstat_windows.h @@ -150,6 +150,7 @@ typedef struct windows_kstat { kstat_named_t 
zfs_removal_suspend_progress; kstat_named_t cpu_avx_supported; kstat_named_t zvol_io_threads; + kstat_named_t zfs_abd_prealloc_percent; } windows_kstat_t; @@ -261,6 +262,7 @@ extern int zfs_autoimport_disable; extern int zfs_removal_suspend_progress; extern int cpu_avx_supported; extern int zvol_threads; +extern int zfs_abd_prealloc_percent; int kstat_windows_init(void *); void kstat_windows_fini(void); diff --git a/module/os/windows/spl/spl-kmem.c b/module/os/windows/spl/spl-kmem.c index 3a57b55ea90d..1ebd03d777c9 100644 --- a/module/os/windows/spl/spl-kmem.c +++ b/module/os/windows/spl/spl-kmem.c @@ -76,6 +76,7 @@ static volatile _Atomic int64_t spl_free = 0; int64_t spl_free_delta_ema; static boolean_t spl_event_thread_exit = FALSE; +static boolean_t spl_abd_prealloc_thread_exit = FALSE; PKEVENT low_mem_event = NULL; static volatile _Atomic int64_t spl_free_manual_pressure = 0; @@ -4914,6 +4915,56 @@ spl_event_thread(void *notused) thread_exit(); } +extern kmem_cache_t *abd_chunk_cache; +extern uint64_t zfs_arc_max; +extern int zfs_abd_prealloc_percent; + +static void +spl_abd_prealloc_thread(void *notused) +{ + NTSTATUS Status; + + typedef struct abd_prealloc_node { + list_node_t node; + } abd_prealloc_node_t; + + abd_prealloc_node_t *node; + list_t abd_prealloc_list; + + KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_ERROR_LEVEL, "SPL: abd prealloc start segkmem_total_mem_allocated: %lld total_memory: %lld\n", + segkmem_total_mem_allocated, total_memory)); + + dprintf("SPL: beginning spl_abd_prealloc_thread() loop\n"); + + list_create(&abd_prealloc_list, sizeof (abd_prealloc_node_t), offsetof(abd_prealloc_node_t, node)); + + while (!spl_abd_prealloc_thread_exit) { + + if (!abd_chunk_cache || !zfs_arc_max) { + delay(hz); + continue; + } + node = (abd_prealloc_node_t *)kmem_cache_alloc(abd_chunk_cache, KM_SLEEP); + list_insert_tail(&abd_prealloc_list, node); + + if (segkmem_total_mem_allocated >= + (zfs_arc_max * zfs_abd_prealloc_percent) / 100) { + break; + } + } + + 
while ((node = list_remove_head(&abd_prealloc_list)) != NULL) { + kmem_cache_free(abd_chunk_cache, node); + } + + spl_abd_prealloc_thread_exit = FALSE; + dprintf("SPL: %s thread_exit\n", __func__); + + KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_ERROR_LEVEL, "SPL: abd prealloc done segkmem_total_mem_allocated: %lld total_memory: %lld zfs_prealloc_percent: %d%\n", + segkmem_total_mem_allocated, total_memory, zfs_abd_prealloc_percent)); + thread_exit(); +} + static int spl_kstat_update(kstat_t *ksp, int rw) @@ -5344,6 +5395,9 @@ spl_kmem_thread_init(void) spl_event_thread_exit = FALSE; (void) thread_create(NULL, 0, spl_event_thread, 0, 0, 0, 0, 92); + + spl_abd_prealloc_thread_exit = FALSE; + (void) thread_create(NULL, 0, spl_abd_prealloc_thread, 0, 0, 0, 0, 92); } void @@ -5351,6 +5405,7 @@ spl_kmem_thread_fini(void) { shutting_down = 1; + spl_abd_prealloc_thread_exit = TRUE; if (low_mem_event != NULL) { dprintf("SPL: stopping spl_event_thread\n"); spl_event_thread_exit = TRUE; diff --git a/module/os/windows/zfs/arc_os.c b/module/os/windows/zfs/arc_os.c index 41eef66413f3..60cca3b399ed 100644 --- a/module/os/windows/zfs/arc_os.c +++ b/module/os/windows/zfs/arc_os.c @@ -696,6 +696,7 @@ arc_kstat_update_windows(kstat_t *ksp, int rw) zfs_arc_average_blocksize = ks->arc_zfs_arc_average_blocksize.value.ui64; zvol_threads = ks->zvol_io_threads.value.ui32; + zfs_abd_prealloc_percent = ks->zfs_abd_prealloc_percent.value.ui32; #ifdef _KERNEL if (ks->zfs_total_memory_limit.value.ui64 > total_memory && @@ -731,6 +732,7 @@ arc_kstat_update_windows(kstat_t *ksp, int rw) ks->arc_zfs_arc_average_blocksize.value.ui64 = zfs_arc_average_blocksize; ks->zvol_io_threads.value.ui32 = zvol_threads; + ks->zfs_abd_prealloc_percent.value.ui32 = zfs_abd_prealloc_percent; #ifdef _KERNEL ks->zfs_total_memory_limit.value.ui64 = total_memory; diff --git a/module/os/windows/zfs/zfs_kstat_windows.c b/module/os/windows/zfs/zfs_kstat_windows.c index 36799e2fdd40..1a11d72524d4 100644 --- 
a/module/os/windows/zfs/zfs_kstat_windows.c +++ b/module/os/windows/zfs/zfs_kstat_windows.c @@ -173,7 +173,8 @@ windows_kstat_t windows_kstat = { { "zfs_total_memory_limit", KSTAT_DATA_UINT64 }, { "zfs_removal_suspend_progress", KSTAT_DATA_INT32 }, { "cpu_avx_supported", KSTAT_DATA_UINT32 }, - { "zvol_io_threads", KSTAT_DATA_UINT32 } + { "zvol_io_threads", KSTAT_DATA_UINT32 }, + { "zfs_abd_prealloc_percent", KSTAT_DATA_UINT32 }, }; @@ -383,6 +384,8 @@ windows_kstat_update(kstat_t *ksp, int rw) ks->zfs_removal_suspend_progress.value.i32; cpu_avx_supported = ks->cpu_avx_supported.value.ui32; + zfs_abd_prealloc_percent = + ks->zfs_abd_prealloc_percent.value.ui32; } else { /* kstat READ */ @@ -573,6 +576,8 @@ windows_kstat_update(kstat_t *ksp, int rw) cpu_avx_supported; ks->zvol_io_threads.value.ui32 = zvol_threads; + ks->zfs_abd_prealloc_percent.value.ui32 = + zfs_abd_prealloc_percent; } arc_kstat_update_windows(ksp, rw); return (0); diff --git a/module/os/windows/zfs/zvol_os.c b/module/os/windows/zfs/zvol_os.c index 0500d2fdffc7..c52a08f6a814 100644 --- a/module/os/windows/zfs/zvol_os.c +++ b/module/os/windows/zfs/zvol_os.c @@ -46,6 +46,7 @@ unsigned int zvol_request_sync = 0; unsigned int zvol_prefetch_bytes = (128 * 1024); unsigned long zvol_max_discard_blocks = 16384; int zvol_threads = 0; +int zfs_abd_prealloc_percent = 10; taskq_t *zvol_taskq; From df523e26f0f50d68534ddd02608c1680668d77ee Mon Sep 17 00:00:00 2001 From: Arun KV Date: Tue, 6 Jan 2026 18:53:05 +0530 Subject: [PATCH 03/13] Disabled memory pressure thread --- module/os/windows/spl/spl-kmem.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/module/os/windows/spl/spl-kmem.c b/module/os/windows/spl/spl-kmem.c index 1ebd03d777c9..31126c7ce532 100644 --- a/module/os/windows/spl/spl-kmem.c +++ b/module/os/windows/spl/spl-kmem.c @@ -4304,7 +4304,10 @@ spl_free_set_pressure(int64_t new_p) // and any spl_free_set_and_wait_pressure() threads cv_broadcast(&spl_free_thread_cv); } 
- spl_free_last_pressure = zfs_lbolt(); + if (new_p > 0) + spl_free_last_pressure = zfs_lbolt(); + else + spl_free_last_pressure = zfs_lbolt(); } void @@ -4480,6 +4483,7 @@ spl_free_thread() spl_vm_pressure_level != MAGIC_PRESSURE_UNAVAILABLE) { /* there is pressure */ lowmem = true; + KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_ERROR_LEVEL, "spl_vm_pressure_level: %lu\n",spl_vm_pressure_level)); new_spl_free = -(2LL * PAGE_SIZE * spl_vm_pages_wanted); if (spl_vm_pressure_level > 1) { emergency_lowmem = true; @@ -4534,6 +4538,7 @@ spl_free_thread() int64_t old_pressure = spl_free_manual_pressure; new_spl_free -= old_pressure * 2LL; lowmem = true; + KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_ERROR_LEVEL, "spl_free_manual_pressure: %llu\n",spl_free_manual_pressure)); if (spl_free_fast_pressure) { emergency_lowmem = true; new_spl_free -= old_pressure * 4LL; @@ -4631,6 +4636,7 @@ spl_free_thread() new_spl_free += bminus; lowmem = true; emergency_lowmem = true; + KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_ERROR_LEVEL, "spl_vm_pages_wanted %lu\n", spl_vm_pages_wanted)); // atomic swaps to set these variables used in arc.c int64_t previous_highest_pressure = 0; int64_t new_p = -bminus; @@ -4651,6 +4657,7 @@ spl_free_thread() new_spl_free -= bytes_wanted; if (reserve_low && !early_lots_free) { lowmem = true; + KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_ERROR_LEVEL, "spl_vm_pages_wanted: %lu reserve_low: %lu early_lots_free: %lu\n", spl_vm_pages_wanted, reserve_low, early_lots_free)); if (recent_lowmem == 0) { recent_lowmem = time_now; } @@ -4761,6 +4768,7 @@ spl_free_thread() real_total_memory) > 75) { new_spl_free -= total_mem_used / 32; lowmem = true; + KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_ERROR_LEVEL, "segkmem_total_mem_allocated: %llu real_total_memory: %llu\n", segkmem_total_mem_allocated, real_total_memory)); } } @@ -5393,8 +5401,8 @@ spl_kmem_thread_init(void) (void) thread_create(NULL, 0, spl_free_thread, 0, 0, 0, 0, 92); spl_free_thread_running = TRUE; - spl_event_thread_exit = 
FALSE; - (void) thread_create(NULL, 0, spl_event_thread, 0, 0, 0, 0, 92); + //spl_event_thread_exit = FALSE; + //(void) thread_create(NULL, 0, spl_event_thread, 0, 0, 0, 0, 92); spl_abd_prealloc_thread_exit = FALSE; (void) thread_create(NULL, 0, spl_abd_prealloc_thread, 0, 0, 0, 0, 92); From db57b31ca204e21ca2de8d19bdd5bd207fb34395 Mon Sep 17 00:00:00 2001 From: Arun KV Date: Wed, 7 Jan 2026 11:27:40 +0530 Subject: [PATCH 04/13] Changed abd_cache alloc size --- module/os/windows/zfs/abd_os.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/os/windows/zfs/abd_os.c b/module/os/windows/zfs/abd_os.c index 08cefa3c57e1..c0a181d9f395 100644 --- a/module/os/windows/zfs/abd_os.c +++ b/module/os/windows/zfs/abd_os.c @@ -87,7 +87,7 @@ struct { * will cause the machine to panic if you change it and try to access the data * within a scattered ABD. */ -size_t zfs_abd_chunk_size = 4096; +size_t zfs_abd_chunk_size = 65536; kmem_cache_t *abd_chunk_cache; static kstat_t *abd_ksp; From 85312c2f79459224aa8ededd96ef65dc627de48c Mon Sep 17 00:00:00 2001 From: Arun KV Date: Thu, 8 Jan 2026 10:33:24 +0530 Subject: [PATCH 05/13] skip source free when the reserved memory is not crossing arc_max --- module/os/windows/spl/spl-vmem.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/module/os/windows/spl/spl-vmem.c b/module/os/windows/spl/spl-vmem.c index 2540dca395aa..1bc95dc7b355 100644 --- a/module/os/windows/spl/spl-vmem.c +++ b/module/os/windows/spl/spl-vmem.c @@ -448,6 +448,7 @@ uint64_t spl_frag_walk_cnt = 0; extern void spl_free_set_emergency_pressure(int64_t p); extern uint64_t segkmem_total_mem_allocated; extern uint64_t total_memory; +extern uint64_t zfs_arc_max; /* * Get a vmem_seg_t from the global segfree list. 
@@ -1732,12 +1733,23 @@ vmem_xfree(vmem_t *vmp, void *vaddr, size_t size) vsp = vprev; } + // calling vm_source_free will free the memory to windows, we + // don;t want to do this unless we are near the arc limit + boolean_t skip_sfree = true; + if (segkmem_total_mem_allocated > + (zfs_arc_max * 90) / 100) { + KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_ERROR_LEVEL, + "vmem_xfree: skip source free segkmem_total_mem_allocated: %llu zfs_arc_max: %llu\n", + segkmem_total_mem_allocated, zfs_arc_max)); + skip_sfree = false; + } + /* * If the entire span is free, return it to the source. */ if (vsp->vs_aprev->vs_import && vmp->vm_source_free != NULL && vsp->vs_aprev->vs_type == VMEM_SPAN && - vsp->vs_anext->vs_type == VMEM_SPAN) { + vsp->vs_anext->vs_type == VMEM_SPAN && !skip_sfree) { vaddr = (void *)vsp->vs_start; size = VS_SIZE(vsp); ASSERT(size == VS_SIZE(vsp->vs_aprev)); From 3a718c01ef4f30299833e537b9baf324f13bbb80 Mon Sep 17 00:00:00 2001 From: Arun KV Date: Thu, 8 Jan 2026 18:36:04 +0530 Subject: [PATCH 06/13] skip source free when the reserved memory is not crossing arc_max --- module/os/windows/spl/spl-vmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/os/windows/spl/spl-vmem.c b/module/os/windows/spl/spl-vmem.c index 1bc95dc7b355..169150550298 100644 --- a/module/os/windows/spl/spl-vmem.c +++ b/module/os/windows/spl/spl-vmem.c @@ -1737,7 +1737,7 @@ vmem_xfree(vmem_t *vmp, void *vaddr, size_t size) // don;t want to do this unless we are near the arc limit boolean_t skip_sfree = true; if (segkmem_total_mem_allocated > - (zfs_arc_max * 90) / 100) { + (zfs_arc_max * 100) / 100) { KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_ERROR_LEVEL, "vmem_xfree: skip source free segkmem_total_mem_allocated: %llu zfs_arc_max: %llu\n", segkmem_total_mem_allocated, zfs_arc_max)); From cd0c019e143e5305b4e1f9644d2e457c2ac23557 Mon Sep 17 00:00:00 2001 From: Arun KV Date: Fri, 9 Jan 2026 10:09:42 +0530 Subject: [PATCH 07/13] skip source free when the reserved 
memory is not crossing arc_max --- module/os/windows/spl/spl-kmem.c | 4 ++-- module/os/windows/spl/spl-vmem.c | 5 +---- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/module/os/windows/spl/spl-kmem.c b/module/os/windows/spl/spl-kmem.c index 31126c7ce532..148b208e2b49 100644 --- a/module/os/windows/spl/spl-kmem.c +++ b/module/os/windows/spl/spl-kmem.c @@ -4968,8 +4968,8 @@ spl_abd_prealloc_thread(void *notused) spl_abd_prealloc_thread_exit = FALSE; dprintf("SPL: %s thread_exit\n", __func__); - KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_ERROR_LEVEL, "SPL: abd prealloc done segkmem_total_mem_allocated: %lld total_memory: %lld zfs_prealloc_percent: %d%\n", - segkmem_total_mem_allocated, total_memory, zfs_abd_prealloc_percent)); + KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_ERROR_LEVEL, "SPL: abd prealloc done segkmem_total_mem_allocated: %lld total_memory: %lld zfs_arc_max: %llu zfs_prealloc_percent: %d%\n", + segkmem_total_mem_allocated, total_memory, zfs_arc_max, zfs_abd_prealloc_percent)); thread_exit(); } diff --git a/module/os/windows/spl/spl-vmem.c b/module/os/windows/spl/spl-vmem.c index 169150550298..8bfcc0966193 100644 --- a/module/os/windows/spl/spl-vmem.c +++ b/module/os/windows/spl/spl-vmem.c @@ -1737,10 +1737,7 @@ vmem_xfree(vmem_t *vmp, void *vaddr, size_t size) // don;t want to do this unless we are near the arc limit boolean_t skip_sfree = true; if (segkmem_total_mem_allocated > - (zfs_arc_max * 100) / 100) { - KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_ERROR_LEVEL, - "vmem_xfree: skip source free segkmem_total_mem_allocated: %llu zfs_arc_max: %llu\n", - segkmem_total_mem_allocated, zfs_arc_max)); + (zfs_arc_max * 102) / 100) { skip_sfree = false; } From 04d51c1cfab01ae93e1d44b533747c3c79d26473 Mon Sep 17 00:00:00 2001 From: Arun KV Date: Fri, 9 Jan 2026 19:57:52 +0530 Subject: [PATCH 08/13] Changed the registry name from zfs_abd_prealloc_percent to zfs_prealloc_percent Setting zfs_prealloc_percent=0 will disable the prealloc feature --- 
include/os/windows/zfs/sys/kstat_windows.h | 4 +- module/os/windows/spl/spl-kmem.c | 34 ++++---- module/os/windows/spl/spl-vmem.c | 18 +++-- module/os/windows/spl/spl-windows.c | 93 ++++++++++++++++++++++ module/os/windows/zfs/arc_os.c | 4 +- module/os/windows/zfs/zfs_kstat_windows.c | 10 +-- module/os/windows/zfs/zvol_os.c | 2 +- 7 files changed, 131 insertions(+), 34 deletions(-) diff --git a/include/os/windows/zfs/sys/kstat_windows.h b/include/os/windows/zfs/sys/kstat_windows.h index 3d1a510b9d06..842cc4e006d7 100644 --- a/include/os/windows/zfs/sys/kstat_windows.h +++ b/include/os/windows/zfs/sys/kstat_windows.h @@ -150,7 +150,7 @@ typedef struct windows_kstat { kstat_named_t zfs_removal_suspend_progress; kstat_named_t cpu_avx_supported; kstat_named_t zvol_io_threads; - kstat_named_t zfs_abd_prealloc_percent; + kstat_named_t zfs_prealloc_percent; } windows_kstat_t; @@ -262,7 +262,7 @@ extern int zfs_autoimport_disable; extern int zfs_removal_suspend_progress; extern int cpu_avx_supported; extern int zvol_threads; -extern int zfs_abd_prealloc_percent; +extern int zfs_prealloc_percent; int kstat_windows_init(void *); void kstat_windows_fini(void); diff --git a/module/os/windows/spl/spl-kmem.c b/module/os/windows/spl/spl-kmem.c index 148b208e2b49..a976337e213f 100644 --- a/module/os/windows/spl/spl-kmem.c +++ b/module/os/windows/spl/spl-kmem.c @@ -132,6 +132,10 @@ extern uint64_t zfs_active_rwlock; extern uint64_t total_memory; extern uint64_t real_total_memory; +extern kmem_cache_t *abd_chunk_cache; +extern uint64_t zfs_arc_max; +extern int zfs_prealloc_percent; + #define MULT 1 static const char *KMEM_VA_PREFIX = "kmem_va"; @@ -4304,10 +4308,7 @@ spl_free_set_pressure(int64_t new_p) // and any spl_free_set_and_wait_pressure() threads cv_broadcast(&spl_free_thread_cv); } - if (new_p > 0) - spl_free_last_pressure = zfs_lbolt(); - else - spl_free_last_pressure = zfs_lbolt(); + spl_free_last_pressure = zfs_lbolt(); } void @@ -4923,10 +4924,6 @@ spl_event_thread(void 
*notused) thread_exit(); } -extern kmem_cache_t *abd_chunk_cache; -extern uint64_t zfs_arc_max; -extern int zfs_abd_prealloc_percent; - static void spl_abd_prealloc_thread(void *notused) { @@ -4952,13 +4949,14 @@ spl_abd_prealloc_thread(void *notused) delay(hz); continue; } - node = (abd_prealloc_node_t *)kmem_cache_alloc(abd_chunk_cache, KM_SLEEP); - list_insert_tail(&abd_prealloc_list, node); if (segkmem_total_mem_allocated >= - (zfs_arc_max * zfs_abd_prealloc_percent) / 100) { + (zfs_arc_max * zfs_prealloc_percent) / 100) { break; } + + node = (abd_prealloc_node_t *)kmem_cache_alloc(abd_chunk_cache, KM_SLEEP); + list_insert_tail(&abd_prealloc_list, node); } while ((node = list_remove_head(&abd_prealloc_list)) != NULL) { @@ -4969,7 +4967,7 @@ spl_abd_prealloc_thread(void *notused) dprintf("SPL: %s thread_exit\n", __func__); KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_ERROR_LEVEL, "SPL: abd prealloc done segkmem_total_mem_allocated: %lld total_memory: %lld zfs_arc_max: %llu zfs_prealloc_percent: %d%\n", - segkmem_total_mem_allocated, total_memory, zfs_arc_max, zfs_abd_prealloc_percent)); + segkmem_total_mem_allocated, total_memory, zfs_arc_max, zfs_prealloc_percent)); thread_exit(); } @@ -5401,11 +5399,13 @@ spl_kmem_thread_init(void) (void) thread_create(NULL, 0, spl_free_thread, 0, 0, 0, 0, 92); spl_free_thread_running = TRUE; - //spl_event_thread_exit = FALSE; - //(void) thread_create(NULL, 0, spl_event_thread, 0, 0, 0, 0, 92); - - spl_abd_prealloc_thread_exit = FALSE; - (void) thread_create(NULL, 0, spl_abd_prealloc_thread, 0, 0, 0, 0, 92); + if (zfs_prealloc_percent) { + spl_abd_prealloc_thread_exit = FALSE; + (void) thread_create(NULL, 0, spl_abd_prealloc_thread, 0, 0, 0, 0, 92); + } else { + spl_event_thread_exit = FALSE; + (void) thread_create(NULL, 0, spl_event_thread, 0, 0, 0, 0, 92); + } } void diff --git a/module/os/windows/spl/spl-vmem.c b/module/os/windows/spl/spl-vmem.c index 8bfcc0966193..4c3d0a645aea 100644 --- a/module/os/windows/spl/spl-vmem.c +++ 
b/module/os/windows/spl/spl-vmem.c @@ -449,6 +449,7 @@ extern void spl_free_set_emergency_pressure(int64_t p); extern uint64_t segkmem_total_mem_allocated; extern uint64_t total_memory; extern uint64_t zfs_arc_max; +extern int zfs_prealloc_percent; /* * Get a vmem_seg_t from the global segfree list. @@ -1734,19 +1735,22 @@ vmem_xfree(vmem_t *vmp, void *vaddr, size_t size) } // calling vm_source_free will free the memory to windows, we - // don;t want to do this unless we are near the arc limit - boolean_t skip_sfree = true; - if (segkmem_total_mem_allocated > - (zfs_arc_max * 102) / 100) { - skip_sfree = false; - } + // don't want to do this unless we are crossing the arc limit when + // zfs_prealloc_percent is enabled. + boolean_t allow_vm_source_free = true; + if (zfs_prealloc_percent) { + if (segkmem_total_mem_allocated < + (zfs_arc_max * 102) / 100) { + allow_vm_source_free = false; + } + } /* * If the entire span is free, return it to the source. */ if (vsp->vs_aprev->vs_import && vmp->vm_source_free != NULL && vsp->vs_aprev->vs_type == VMEM_SPAN && - vsp->vs_anext->vs_type == VMEM_SPAN && !skip_sfree) { + vsp->vs_anext->vs_type == VMEM_SPAN && allow_vm_source_free) { vaddr = (void *)vsp->vs_start; size = VS_SIZE(vsp); ASSERT(size == VS_SIZE(vsp->vs_aprev)); diff --git a/module/os/windows/spl/spl-windows.c b/module/os/windows/spl/spl-windows.c index e6b31edac923..423a6e02c124 100644 --- a/module/os/windows/spl/spl-windows.c +++ b/module/os/windows/spl/spl-windows.c @@ -53,12 +53,14 @@ volatile unsigned int vm_page_speculative_count = 5500; uint64_t spl_GetPhysMem(void); uint64_t spl_GetZfsTotalMemory(PUNICODE_STRING RegistryPath); +uint64_t spl_getZfsPreallocSize(PUNICODE_STRING RegistryPath); #include #include // Size in bytes of the memory allocated in seg_kmem extern uint64_t segkmem_total_mem_allocated; +extern int zfs_prealloc_percent; #define MAXHOSTNAMELEN 64 extern char hostname[MAXHOSTNAMELEN]; @@ -528,6 +530,9 @@ spl_start(PUNICODE_STRING 
RegistryPath) spl_mutex_subsystem_init(); spl_kmem_init(total_memory); + // lets get the registry value now, because the zfs loads the registry little later + zfs_prealloc_percent = spl_getZfsPreallocSize(RegistryPath); + spl_vnode_init(); spl_kmem_thread_init(); spl_kmem_mp_init(); @@ -755,3 +760,91 @@ spl_GetZfsTotalMemory(PUNICODE_STRING RegistryPath) ZwClose(h); return (newvalue); } + +uint64_t +spl_getZfsPreallocSize(PUNICODE_STRING RegistryPath) +{ + OBJECT_ATTRIBUTES ObjectAttributes; + HANDLE h; + NTSTATUS status; + uint64_t newvalue = 0; + + InitializeObjectAttributes(&ObjectAttributes, + RegistryPath, + OBJ_KERNEL_HANDLE | OBJ_CASE_INSENSITIVE, + NULL, + NULL); + + status = ZwOpenKey(&h, // KeyHandle + KEY_ALL_ACCESS, // DesiredAccess + &ObjectAttributes); // ObjectAttributes + + if (!NT_SUCCESS(status)) { + dprintf("%s: Unable to open Registry %wZ: 0x%x. " + "Going with defaults.\n", __func__, RegistryPath, status); + return (0); + } + + ULONG index = 0; + ULONG length = 0; + PKEY_VALUE_FULL_INFORMATION regBuffer = NULL; + + for (index = 0; status != STATUS_NO_MORE_ENTRIES; index++) { + // Get the buffer size necessary + status = ZwEnumerateValueKey(h, index, KeyValueFullInformation, + NULL, 0, &length); + + if ((status != STATUS_BUFFER_TOO_SMALL) && + (status != STATUS_BUFFER_OVERFLOW)) + break; // Something is wrong - or we finished + + // Allocate space to hold + regBuffer = (PKEY_VALUE_FULL_INFORMATION)ExAllocatePoolWithTag( + NonPagedPoolNx, length, 'zfsr'); + + if (regBuffer == NULL) + break; + + status = ZwEnumerateValueKey(h, index, KeyValueFullInformation, + regBuffer, length, &length); + if (!NT_SUCCESS(status)) { + break; + } + // Convert name to straight ascii so we compare with kstat + ULONG outlen = 0; + char keyname[KSTAT_STRLEN + 1] = { 0 }; + status = RtlUnicodeToUTF8N(keyname, KSTAT_STRLEN, &outlen, + regBuffer->Name, regBuffer->NameLength); + + // Conversion failed? move along.. 
+ if (status != STATUS_SUCCESS && status + != STATUS_SOME_NOT_MAPPED) + break; + + // Output string is only null terminated if input is, + // so do so now. + keyname[outlen] = 0; + if (strcasecmp("zfs_prealloc_percent", keyname) == 0) { + if (regBuffer->Type != REG_DWORD || + regBuffer->DataLength != sizeof (uint32_t)) { + dprintf("%s: registry '%s' did not match. " + "Type needs to be REG_QWORD. (8 bytes)\n", + __func__, keyname); + } else { + newvalue = *(uint32_t *)((uint8_t *)regBuffer + + regBuffer->DataOffset); + dprintf("%s: zfs_prealloc_percent is set to:" + " %llu\n", __func__, newvalue); + } + break; + } + ExFreePool(regBuffer); + regBuffer = NULL; + } + + if (regBuffer) + ExFreePool(regBuffer); + + ZwClose(h); + return (newvalue); +} diff --git a/module/os/windows/zfs/arc_os.c b/module/os/windows/zfs/arc_os.c index 60cca3b399ed..6a90d7604c70 100644 --- a/module/os/windows/zfs/arc_os.c +++ b/module/os/windows/zfs/arc_os.c @@ -696,7 +696,7 @@ arc_kstat_update_windows(kstat_t *ksp, int rw) zfs_arc_average_blocksize = ks->arc_zfs_arc_average_blocksize.value.ui64; zvol_threads = ks->zvol_io_threads.value.ui32; - zfs_abd_prealloc_percent = ks->zfs_abd_prealloc_percent.value.ui32; + zfs_prealloc_percent = ks->zfs_prealloc_percent.value.ui32; #ifdef _KERNEL if (ks->zfs_total_memory_limit.value.ui64 > total_memory && @@ -732,7 +732,7 @@ arc_kstat_update_windows(kstat_t *ksp, int rw) ks->arc_zfs_arc_average_blocksize.value.ui64 = zfs_arc_average_blocksize; ks->zvol_io_threads.value.ui32 = zvol_threads; - ks->zfs_abd_prealloc_percent.value.ui32 = zfs_abd_prealloc_percent; + ks->zfs_prealloc_percent.value.ui32 = zfs_prealloc_percent; #ifdef _KERNEL ks->zfs_total_memory_limit.value.ui64 = total_memory; diff --git a/module/os/windows/zfs/zfs_kstat_windows.c b/module/os/windows/zfs/zfs_kstat_windows.c index 1a11d72524d4..8b0c4d9823a3 100644 --- a/module/os/windows/zfs/zfs_kstat_windows.c +++ b/module/os/windows/zfs/zfs_kstat_windows.c @@ -174,7 +174,7 @@ 
windows_kstat_t windows_kstat = { { "zfs_removal_suspend_progress", KSTAT_DATA_INT32 }, { "cpu_avx_supported", KSTAT_DATA_UINT32 }, { "zvol_io_threads", KSTAT_DATA_UINT32 }, - { "zfs_abd_prealloc_percent", KSTAT_DATA_UINT32 }, + { "zfs_prealloc_percent", KSTAT_DATA_UINT32 }, }; @@ -384,8 +384,8 @@ windows_kstat_update(kstat_t *ksp, int rw) ks->zfs_removal_suspend_progress.value.i32; cpu_avx_supported = ks->cpu_avx_supported.value.ui32; - zfs_abd_prealloc_percent = - ks->zfs_abd_prealloc_percent.value.ui32; + zfs_prealloc_percent = + ks->zfs_prealloc_percent.value.ui32; } else { /* kstat READ */ @@ -576,8 +576,8 @@ windows_kstat_update(kstat_t *ksp, int rw) cpu_avx_supported; ks->zvol_io_threads.value.ui32 = zvol_threads; - ks->zfs_abd_prealloc_percent.value.ui32 = - zfs_abd_prealloc_percent; + ks->zfs_prealloc_percent.value.ui32 = + zfs_prealloc_percent; } arc_kstat_update_windows(ksp, rw); return (0); diff --git a/module/os/windows/zfs/zvol_os.c b/module/os/windows/zfs/zvol_os.c index c52a08f6a814..edcf319f0c07 100644 --- a/module/os/windows/zfs/zvol_os.c +++ b/module/os/windows/zfs/zvol_os.c @@ -46,7 +46,7 @@ unsigned int zvol_request_sync = 0; unsigned int zvol_prefetch_bytes = (128 * 1024); unsigned long zvol_max_discard_blocks = 16384; int zvol_threads = 0; -int zfs_abd_prealloc_percent = 10; +int zfs_prealloc_percent = 0; taskq_t *zvol_taskq; From 1b5a18caff3f72807b0dc9fc5be9ad75b3d7cea7 Mon Sep 17 00:00:00 2001 From: Arun KV Date: Sat, 10 Jan 2026 16:35:20 +0530 Subject: [PATCH 09/13] Reverted zfs_abd_chunk_size from 65536 to 4096 which was causing read performace issue --- module/os/windows/zfs/abd_os.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/os/windows/zfs/abd_os.c b/module/os/windows/zfs/abd_os.c index c0a181d9f395..08cefa3c57e1 100644 --- a/module/os/windows/zfs/abd_os.c +++ b/module/os/windows/zfs/abd_os.c @@ -87,7 +87,7 @@ struct { * will cause the machine to panic if you change it and try to access the data * within a 
scattered ABD. */ -size_t zfs_abd_chunk_size = 65536; +size_t zfs_abd_chunk_size = 4096; kmem_cache_t *abd_chunk_cache; static kstat_t *abd_ksp; From d95a9ef3302a98949ddf9eaa1a957ba577da050a Mon Sep 17 00:00:00 2001 From: Arun KV Date: Wed, 1 Apr 2026 15:41:41 +0530 Subject: [PATCH 10/13] Fix spl_thread_create not setting thread priority --- include/os/windows/spl/sys/sysmacros.h | 6 +- module/os/windows/spl/spl-thread.c | 82 +++++++++++++++++++++++--- module/zfs/txg.c | 3 +- 3 files changed, 79 insertions(+), 12 deletions(-) diff --git a/include/os/windows/spl/sys/sysmacros.h b/include/os/windows/spl/sys/sysmacros.h index cba1f5fc2d26..e53ae751953d 100644 --- a/include/os/windows/spl/sys/sysmacros.h +++ b/include/os/windows/spl/sys/sysmacros.h @@ -81,9 +81,9 @@ extern unsigned int num_ecores; * swap priority is at 92. Most ZFS priorities should probably * stay below this, but kmem_reap needs to be higher. */ -#define minclsyspri 81 /* BASEPRI_KERNEL */ -#define defclsyspri 81 /* BASEPRI_KERNEL */ -#define maxclsyspri 89 +#define minclsyspri 8 /* BASEPRI_KERNEL */ +#define defclsyspri 8 /* BASEPRI_KERNEL */ +#define maxclsyspri 12 #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) diff --git a/module/os/windows/spl/spl-thread.c b/module/os/windows/spl/spl-thread.c index 6ef65414c6d4..93cb51557fd8 100644 --- a/module/os/windows/spl/spl-thread.c +++ b/module/os/windows/spl/spl-thread.c @@ -39,7 +39,78 @@ uint64_t zfs_threads = 0; -kthread_t * + +kthread_t* +spl_thread_create( + caddr_t stk, + size_t stksize, + void (*proc)(void*), + void* arg, + size_t len, + int state, +#ifdef SPL_DEBUG_THREAD + char* filename, + int line, +#endif + pri_t pri) +{ + NTSTATUS status; + HANDLE hThread = NULL; + PETHREAD eThread = NULL; + +#ifdef SPL_DEBUG_THREAD + dprintf("Start thread pri %d\n", pri); +#endif + + status = PsCreateSystemThread( + &hThread, + THREAD_ALL_ACCESS, + NULL, + NULL, + NULL, + proc, + arg); + + if 
(!NT_SUCCESS(status)) + return NULL; + + /* Convert HANDLE to ETHREAD */ + status = ObReferenceObjectByHandle( + hThread, + THREAD_ALL_ACCESS, + *PsThreadType, + KernelMode, + (PVOID*)&eThread, + NULL); + + /* We no longer need the handle */ + ZwClose(hThread); + + if (!NT_SUCCESS(status)) + return NULL; + + /* Clamp priority to safe Windows range */ + KPRIORITY newPri = (KPRIORITY)pri; + + if (newPri > maxclsyspri) + newPri = maxclsyspri; + + if (newPri < minclsyspri) + newPri = minclsyspri; + + /* Set absolute priority */ + KeSetPriorityThread((PKTHREAD)eThread, newPri); + +#ifdef SPL_DEBUG_THREAD + dprintf("Thread created with priority %d\n", newPri); +#endif + + atomic_inc_64(&zfs_threads); + + return (kthread_t*)eThread; +} + +/*kthread_t* spl_thread_create( caddr_t stk, size_t stksize, @@ -72,12 +143,7 @@ spl_thread_create( if (result != STATUS_SUCCESS) return (NULL); - /* - * Improve the priority when asked to do so - * Thread priorities range from 0 to 31, where 0 is the lowest - * priority and 31 is the highest - */ - + if (pri > minclsyspri) { // thread_precedence_policy_data_t policy; // policy.importance = pri - minclsyspri; @@ -102,7 +168,7 @@ spl_thread_create( ObDereferenceObject(eThread); ZwClose(thread); return ((kthread_t *)eThread); -} +}*/ kthread_t * spl_current_thread(void) diff --git a/module/zfs/txg.c b/module/zfs/txg.c index 46c3bb1bcd0b..ead8d7bc8832 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -213,8 +213,9 @@ txg_sync_start(dsl_pool_t *dp) * 32-bit x86. This is due in part to nested pools and * scrub_visitbp() recursion. 
*/ + tx->tx_sync_thread = thread_create(NULL, 0, txg_sync_thread, - dp, 0, &p0, TS_RUN, defclsyspri); + dp, 0, &p0, TS_RUN, maxclsyspri); mutex_exit(&tx->tx_sync_lock); } From d773f2f5d5db37073c7d7bdde264ad40a9a53a8e Mon Sep 17 00:00:00 2001 From: Pankaj Sharma Date: Thu, 16 Apr 2026 22:44:53 -0700 Subject: [PATCH 11/13] Change memory available logic and manual pressure --- module/os/windows/spl/spl-kmem.c | 2 +- module/os/windows/zfs/arc_os.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/module/os/windows/spl/spl-kmem.c b/module/os/windows/spl/spl-kmem.c index a976337e213f..48c680ee2672 100644 --- a/module/os/windows/spl/spl-kmem.c +++ b/module/os/windows/spl/spl-kmem.c @@ -4235,7 +4235,7 @@ spl_free_wrapper(void) int64_t spl_free_manual_pressure_wrapper(void) { - return (spl_free_manual_pressure); + return (0); } uint64_t diff --git a/module/os/windows/zfs/arc_os.c b/module/os/windows/zfs/arc_os.c index 6a90d7604c70..0d23f80677e2 100644 --- a/module/os/windows/zfs/arc_os.c +++ b/module/os/windows/zfs/arc_os.c @@ -59,6 +59,7 @@ #include extern arc_stats_t arc_stats; +extern uint64_t zfs_arc_max; static kmutex_t arc_reclaim_lock; static kcondvar_t arc_reclaim_thread_cv; @@ -127,7 +128,7 @@ arc_free_memory(void) int64_t arc_available_memory(void) { - return (arc_free_memory() - arc_sys_free); + return (zfs_arc_max - aggsum_value(&arc_sums.arcstat_size)); } int @@ -137,8 +138,7 @@ arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg) /* possibly wake up arc reclaim thread */ if (arc_reclaim_in_loop == B_FALSE) { - if (spl_free_manual_pressure_wrapper() != 0 || - !spl_minimal_physmem_p() || + if (!spl_minimal_physmem_p() || arc_reclaim_needed()) { cv_signal(&arc_reclaim_thread_cv); kpreempt(KPREEMPT_SYNC); @@ -798,7 +798,7 @@ arc_prune_async(int64_t adjust) int64_t arc_available_memory(void) { - return (arc_free_memory() - arc_sys_free); + return (zfs_arc_max - aggsum_value(&arc_sums.arcstat_size)); } int From 
5f7c0c2d47f6f1aa5a402a31332d90bbb94bc48b Mon Sep 17 00:00:00 2001 From: Pankaj Sharma Date: Tue, 28 Apr 2026 02:35:59 -0700 Subject: [PATCH 12/13] Add dynamic zfs_dirty_data_max adjustments --- module/zfs/txg.c | 210 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 210 insertions(+) diff --git a/module/zfs/txg.c b/module/zfs/txg.c index ead8d7bc8832..75957b765f84 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -113,6 +113,203 @@ static void txg_quiesce_thread(void *arg); int zfs_txg_timeout = 1; /* max seconds worth of delta per txg */ +#define DIRTY_FLOOR_BYTES (153ULL << 20) +#define DIRTY_CEIL_BYTES (4ULL << 30) +uint_t zfs_adc_target_sync_pct = 75; +/* PID gains, all scaled ×1000 to avoid floating point */ +int zfs_adc_kp = 200; /* Proportional: main corrective force */ +int zfs_adc_ki = 15; /* Integral: eliminates steady-state bias */ +int zfs_adc_kd = 100; /* Derivative: damping against oscillation */ + +/* EMA smoothing window in TXG count */ +uint_t zfs_adc_ema_alpha_pct = 25; /* 25% weight on new sample */ +/* Minimum TXGs between dirty_max updates (anti-flapping) */ +uint_t zfs_adc_holdoff_txgs = 2; +/* Master enable — 0 reverts to stock ZFS behavior instantly */ +int zfs_adc_enable = 1; + +typedef struct { + /* PID state */ + int64_t adc_integral; /* accumulated error (I term) */ + int64_t adc_prev_error; /* last error sample (D term) */ + clock_t adc_ema_delta; /* smoothed spa_sync duration */ + uint64_t adc_last_txg; /* TXG of last adjustment */ + + /* Bounds in bytes, computed once from physmem */ + uint64_t adc_min_dirty; + uint64_t adc_max_dirty; + + /* Diagnostics / kstat shadow */ + uint64_t adc_n_syncs; /* total TXGs observed */ + uint64_t adc_n_raised; /* times dirty_max was raised */ + uint64_t adc_n_lowered; /* times dirty_max was lowered */ + uint64_t adc_n_clamped; /* times a bound was hit */ + int64_t adc_last_p; /* last P term for debug */ + int64_t adc_last_i; /* last I term for debug */ + int64_t adc_last_d; /* 
last D term for debug */ +} txg_adc_t; + +static inline clock_t +adc_ema(clock_t prev, clock_t sample, uint_t alpha_pct) +{ + return (prev + (clock_t)(((int64_t)sample - prev) + * alpha_pct / 100)); +} + +static void +adc_init(txg_adc_t* adc, clock_t target_ticks) +{ + bzero(adc, sizeof(*adc)); + + adc->adc_min_dirty = DIRTY_FLOOR_BYTES; + adc->adc_max_dirty = DIRTY_CEIL_BYTES; + + /* Defensive: ensure min < max regardless of tunable misconfiguration */ + if (adc->adc_min_dirty >= adc->adc_max_dirty) + adc->adc_min_dirty = adc->adc_max_dirty / 8; + + /* Seed EMA at target — controller starts in steady state */ + adc->adc_ema_delta = target_ticks; + adc->adc_prev_error = 0; + adc->adc_integral = 0; +} + +static void +adc_update(txg_adc_t* adc, uint64_t txg, + clock_t raw_delta, clock_t target_ticks) +{ + int64_t error; /* normalized error × 1000 */ + int64_t p_term; /* proportional correction */ + int64_t i_term; /* integral correction */ + int64_t d_term; /* derivative correction */ + int64_t pid_out; /* combined PID output */ + int64_t adjustment; /* byte delta for dirty_max */ + uint64_t cur, proposed, next; + + adc->adc_n_syncs++; + + /* Step 1: Update smoothed sync duration via EMA */ + adc->adc_ema_delta = adc_ema(adc->adc_ema_delta, + raw_delta, zfs_adc_ema_alpha_pct); + + /* Step 2: Enforce adjustment holdoff (anti-flapping) */ + if (adc->adc_last_txg != 0 && + (txg - adc->adc_last_txg) < zfs_adc_holdoff_txgs) + return; + + /* Step 3: Compute normalized error + * + * error = (ema_delta - target) / target × 1000 + * + * > 0 : sync taking too long → dirty_max too high → must shrink + * < 0 : sync finishing early → dirty_max too low → can grow + * = 0 : perfect operating point + * + * Example: ema=6s, target=5s → error = +200 (20% over) + * Example: ema=3s, target=5s → error = -400 (40% under) + */ + if (target_ticks == 0) + return; /* Safety: avoid divide-by-zero on misconfiguration */ + + error = ((int64_t)adc->adc_ema_delta - (int64_t)target_ticks) + * 
1000LL / (int64_t)target_ticks; + + /* Step 4: P term — immediate response to current error */ + p_term = (int64_t)zfs_adc_kp * error / 1000LL; + + /* Step 5: I term — accumulate to eliminate steady-state offset + * + * Anti-windup clamp: prevents integral from growing unboundedly + * during sustained overload (e.g., pool degraded, resilver running). + * Clamped at ±(30 × Kp) which limits I contribution to ≤3× P max. + */ + adc->adc_integral += error; + { + int64_t windup_limit = 30LL * (int64_t)zfs_adc_kp; + if (adc->adc_integral > windup_limit) adc->adc_integral = windup_limit; + if (adc->adc_integral < -windup_limit) adc->adc_integral = -windup_limit; + } + i_term = (int64_t)zfs_adc_ki * adc->adc_integral / 1000LL; + + /* Step 6: D term — dampen oscillation via rate-of-change */ + d_term = (int64_t)zfs_adc_kd + * (error - adc->adc_prev_error) / 1000LL; + adc->adc_prev_error = error; + + /* Save for diagnostics */ + adc->adc_last_p = p_term; + adc->adc_last_i = i_term; + adc->adc_last_d = d_term; + + /* Step 7: Combine PID output + * + * pid_out > 0 → sync was slow → DECREASE dirty_max + * pid_out < 0 → sync was fast → INCREASE dirty_max + * (sign inversion applied in Step 8) + */ + pid_out = p_term + i_term + d_term; + + if (pid_out == 0) + return; + + /* Step 8: Convert PID output to byte adjustment + * + * adjustment = -(pid_out / 1000) × dirty_max × step_scale + * + * pid_out is in units of 0.1% so dividing by 1000 gives fraction. + * Maximum single-step is capped at 20% of current dirty_max to + * prevent catastrophic collapse from one anomalous TXG. 
+ */ + cur = zfs_dirty_data_max; + + adjustment = -((int64_t)cur / 1000LL) * pid_out; + + /* Cap single-step adjustment at ±20% of current dirty_max */ + { + int64_t max_step = (int64_t)(cur / 5); + if (adjustment > max_step) adjustment = max_step; + if (adjustment < -max_step) adjustment = -max_step; + } + + /* Step 9: Apply bounds */ + proposed = (int64_t)cur + adjustment; + + if (proposed <= adc->adc_min_dirty) { + next = adc->adc_min_dirty; + adc->adc_n_clamped++; + } + else if (proposed >= adc->adc_max_dirty) { + next = adc->adc_max_dirty; + adc->adc_n_clamped++; + } + else { + next = proposed; + } + + /* Step 10: Commit — single store, visible to txg_delay() immediately */ + if (next != cur) { + zfs_dirty_data_max = next; + adc->adc_last_txg = txg; + if (next > cur) adc->adc_n_raised++; + else adc->adc_n_lowered++; + + zfs_dbgmsg("txg_adc txg=%llu ema_delta=%ldms target=%ldms " + "err=%lld P=%lld I=%lld D=%lld " + "dirty_max %lluMB→%lluMB", + (u_longlong_t)txg, + (long)(((uint64_t)adc->adc_ema_delta * 1000ULL) / hz), + (long)(((uint64_t)target_ticks * 1000ULL) / hz), + (longlong_t)error, + (longlong_t)p_term, + (longlong_t)i_term, + (longlong_t)d_term, + (u_longlong_t)(cur >> 20), + (u_longlong_t)(next >> 20)); + } +} + + + /* * Prepare the txg subsystem. 
*/ @@ -531,11 +728,18 @@ txg_sync_thread(void *arg) tx_state_t *tx = &dp->dp_tx; callb_cpr_t cpr; clock_t start, delta; + txg_adc_t adc; (void) spl_fstrans_mark(); txg_thread_enter(tx, &cpr); start = delta = 0; + clock_t adc_target = (clock_t)(zfs_txg_timeout * hz) + * zfs_adc_target_sync_pct / 100; + + if (zfs_adc_enable) + adc_init(&adc, adc_target); + for (;;) { clock_t timeout = zfs_txg_timeout * hz; clock_t timer; @@ -599,6 +803,12 @@ txg_sync_thread(void *arg) delta = ddi_get_lbolt() - start; spa_txg_history_fini_io(spa, ts); + if (zfs_adc_enable) { + adc_target = (clock_t)(zfs_txg_timeout * hz) + * zfs_adc_target_sync_pct / 100; + adc_update(&adc, txg, delta, adc_target); + } + mutex_enter(&tx->tx_sync_lock); tx->tx_synced_txg = txg; tx->tx_syncing_txg = 0; From 1987bd9998de261c8ed58fa86b9e6a03249f5bd3 Mon Sep 17 00:00:00 2001 From: Pankaj Sharma Date: Tue, 28 Apr 2026 02:37:35 -0700 Subject: [PATCH 13/13] Remove extra line --- module/zfs/txg.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/module/zfs/txg.c b/module/zfs/txg.c index 75957b765f84..a940770c45bf 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -307,9 +307,6 @@ adc_update(txg_adc_t* adc, uint64_t txg, (u_longlong_t)(next >> 20)); } } - - - /* * Prepare the txg subsystem. */