diff --git a/include/os/windows/spl/sys/sysmacros.h b/include/os/windows/spl/sys/sysmacros.h index cba1f5fc2d26..e53ae751953d 100644 --- a/include/os/windows/spl/sys/sysmacros.h +++ b/include/os/windows/spl/sys/sysmacros.h @@ -81,9 +81,9 @@ extern unsigned int num_ecores; * swap priority is at 92. Most ZFS priorities should probably * stay below this, but kmem_reap needs to be higher. */ -#define minclsyspri 81 /* BASEPRI_KERNEL */ -#define defclsyspri 81 /* BASEPRI_KERNEL */ -#define maxclsyspri 89 +#define minclsyspri 8 /* BASEPRI_KERNEL */ +#define defclsyspri 8 /* BASEPRI_KERNEL */ +#define maxclsyspri 12 #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) diff --git a/include/os/windows/zfs/sys/kstat_windows.h b/include/os/windows/zfs/sys/kstat_windows.h index 644b01354177..e8d7f2249c62 100644 --- a/include/os/windows/zfs/sys/kstat_windows.h +++ b/include/os/windows/zfs/sys/kstat_windows.h @@ -88,6 +88,7 @@ typedef struct windows_kstat { kstat_named_t spa_mode_global; kstat_named_t zfs_flags; kstat_named_t zfs_txg_timeout; + kstat_named_t zfs_adc_enable; kstat_named_t zfs_vdev_cache_max; kstat_named_t zfs_vdev_cache_size; kstat_named_t zfs_vdev_cache_bshift; @@ -150,6 +151,8 @@ typedef struct windows_kstat { kstat_named_t zfs_removal_suspend_progress; kstat_named_t cpu_avx_supported; kstat_named_t zvol_io_threads; + kstat_named_t zfs_prealloc_percent; + kstat_named_t zfs_adc_target_sync_pct; } windows_kstat_t; @@ -261,6 +264,8 @@ extern int zfs_autoimport_disable; extern int zfs_removal_suspend_progress; extern int cpu_avx_supported; extern int zvol_threads; +extern int zfs_prealloc_percent; +extern uint_t zfs_adc_target_sync_pct; int kstat_windows_init(void *); void kstat_windows_fini(void); @@ -268,5 +273,6 @@ void kstat_windows_fini(void); int arc_kstat_update(kstat_t *ksp, int rw); int arc_kstat_update_windows(kstat_t *ksp, int rw); int spl_kstat_registry(void *pRegistryPath, kstat_t *ksp); +int dynamic_dirty_data_kstat_update(kstat_t* ksp, int rw); #endif diff --git a/include/sys/arc.h b/include/sys/arc.h index f58fa53b6003..6fd9e5cc397f 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -78,6 +78,7 @@ typedef void arc_prune_func_t(int64_t bytes, void *priv); /* Shared module parameters */ extern int zfs_arc_average_blocksize; +extern uint64_t dirty_ceil_bytes; /* generic arc_done_func_t's which you can use */ arc_read_done_func_t arc_bcopy_func; diff --git a/include/sys/txg.h b/include/sys/txg.h index 22158bd1a5e6..b0a8851281d2 100644 --- a/include/sys/txg.h +++ b/include/sys/txg.h @@ -139,7 +139,15 @@ extern void *txg_list_next(txg_list_t *tl, void *p, uint64_t txg); /* Global tuning */ extern int zfs_txg_timeout; - +extern int zfs_adc_enable; +extern uint_t zfs_adc_target_sync_pct; + +typedef struct dynamic_dirty_data_stats { + kstat_named_t adc_target; + kstat_named_t spa_sync_time; + kstat_named_t data_flushed_per_sync; + kstat_named_t total_dirty_data; +} dynamic_dirty_data_stats_t; #ifdef ZFS_DEBUG #define TXG_VERIFY(spa, txg) txg_verify(spa, txg) diff --git a/module/os/windows/spl/spl-kmem.c b/module/os/windows/spl/spl-kmem.c index 3a57b55ea90d..48c680ee2672 100644 --- a/module/os/windows/spl/spl-kmem.c +++ b/module/os/windows/spl/spl-kmem.c @@ -76,6 +76,7 @@ static volatile _Atomic int64_t spl_free = 0; int64_t spl_free_delta_ema; static boolean_t spl_event_thread_exit = FALSE; +static boolean_t spl_abd_prealloc_thread_exit = FALSE; PKEVENT low_mem_event = NULL; static volatile _Atomic int64_t 
spl_free_manual_pressure = 0; @@ -131,6 +132,10 @@ extern uint64_t zfs_active_rwlock; extern uint64_t total_memory; extern uint64_t real_total_memory; +extern kmem_cache_t *abd_chunk_cache; +extern uint64_t zfs_arc_max; +extern int zfs_prealloc_percent; + #define MULT 1 static const char *KMEM_VA_PREFIX = "kmem_va"; @@ -4230,7 +4235,7 @@ spl_free_wrapper(void) int64_t spl_free_manual_pressure_wrapper(void) { - return (spl_free_manual_pressure); + return (0); } uint64_t @@ -4479,6 +4484,7 @@ spl_free_thread() spl_vm_pressure_level != MAGIC_PRESSURE_UNAVAILABLE) { /* there is pressure */ lowmem = true; + KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_ERROR_LEVEL, "spl_vm_pressure_level: %lu\n", spl_vm_pressure_level)); new_spl_free = -(2LL * PAGE_SIZE * spl_vm_pages_wanted); if (spl_vm_pressure_level > 1) { emergency_lowmem = true; @@ -4533,6 +4539,7 @@ spl_free_thread() int64_t old_pressure = spl_free_manual_pressure; new_spl_free -= old_pressure * 2LL; lowmem = true; + KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_ERROR_LEVEL, "spl_free_manual_pressure: %lld\n", spl_free_manual_pressure)); if (spl_free_fast_pressure) { emergency_lowmem = true; new_spl_free -= old_pressure * 4LL; @@ -4630,6 +4637,7 @@ spl_free_thread() new_spl_free += bminus; lowmem = true; emergency_lowmem = true; + KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_ERROR_LEVEL, "spl_vm_pages_wanted %lu\n", spl_vm_pages_wanted)); // atomic swaps to set these variables used in arc.c int64_t previous_highest_pressure = 0; int64_t new_p = -bminus; @@ -4650,6 +4658,7 @@ spl_free_thread() new_spl_free -= bytes_wanted; if (reserve_low && !early_lots_free) { lowmem = true; + KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_ERROR_LEVEL, "spl_vm_pages_wanted: %lu reserve_low: %lu early_lots_free: %lu\n", spl_vm_pages_wanted, reserve_low, early_lots_free)); if (recent_lowmem == 0) { recent_lowmem = time_now; } @@ -4760,6 +4769,7 @@ spl_free_thread() real_total_memory) > 75) { new_spl_free -= total_mem_used / 32; lowmem = true; + KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_ERROR_LEVEL, "segkmem_total_mem_allocated: %llu real_total_memory: %llu\n", segkmem_total_mem_allocated, real_total_memory)); } } @@ -4914,6 +4924,53 @@ spl_event_thread(void *notused) thread_exit(); } +static void +spl_abd_prealloc_thread(void *notused) +{ + NTSTATUS Status; + + typedef struct abd_prealloc_node { + list_node_t node; + } abd_prealloc_node_t; + + abd_prealloc_node_t *node; + list_t abd_prealloc_list; + + KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_ERROR_LEVEL, "SPL: abd prealloc start segkmem_total_mem_allocated: %lld total_memory: %lld\n", + segkmem_total_mem_allocated, total_memory)); + + dprintf("SPL: beginning spl_abd_prealloc_thread() loop\n"); + + list_create(&abd_prealloc_list, sizeof (abd_prealloc_node_t), offsetof(abd_prealloc_node_t, node)); + + while (!spl_abd_prealloc_thread_exit) { + + if (!abd_chunk_cache || !zfs_arc_max) { + delay(hz); + continue; + } + + if (segkmem_total_mem_allocated >= + (zfs_arc_max * zfs_prealloc_percent) / 100) { + break; + } + + node = (abd_prealloc_node_t *)kmem_cache_alloc(abd_chunk_cache, KM_SLEEP); + list_insert_tail(&abd_prealloc_list, node); + } + + while ((node = list_remove_head(&abd_prealloc_list)) != NULL) { + kmem_cache_free(abd_chunk_cache, node); + } + + spl_abd_prealloc_thread_exit = FALSE; + dprintf("SPL: %s thread_exit\n", __func__); + + KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_ERROR_LEVEL, "SPL: abd prealloc done segkmem_total_mem_allocated: %lld total_memory: %lld zfs_arc_max: %llu zfs_prealloc_percent: %d%%\n", + segkmem_total_mem_allocated, 
total_memory, zfs_arc_max, zfs_prealloc_percent)); + thread_exit(); +} + static int spl_kstat_update(kstat_t *ksp, int rw) @@ -5342,8 +5399,13 @@ spl_kmem_thread_init(void) (void) thread_create(NULL, 0, spl_free_thread, 0, 0, 0, 0, 92); spl_free_thread_running = TRUE; - spl_event_thread_exit = FALSE; - (void) thread_create(NULL, 0, spl_event_thread, 0, 0, 0, 0, 92); + if (zfs_prealloc_percent) { + spl_abd_prealloc_thread_exit = FALSE; + (void) thread_create(NULL, 0, spl_abd_prealloc_thread, 0, 0, 0, 0, 92); + } else { + spl_event_thread_exit = FALSE; + (void) thread_create(NULL, 0, spl_event_thread, 0, 0, 0, 0, 92); + } } void @@ -5351,6 +5413,7 @@ spl_kmem_thread_fini(void) { shutting_down = 1; + spl_abd_prealloc_thread_exit = TRUE; if (low_mem_event != NULL) { dprintf("SPL: stopping spl_event_thread\n"); spl_event_thread_exit = TRUE; diff --git a/module/os/windows/spl/spl-thread.c b/module/os/windows/spl/spl-thread.c index 6ef65414c6d4..93cb51557fd8 100644 --- a/module/os/windows/spl/spl-thread.c +++ b/module/os/windows/spl/spl-thread.c @@ -39,7 +39,78 @@ uint64_t zfs_threads = 0; -kthread_t * + +kthread_t* +spl_thread_create( + caddr_t stk, + size_t stksize, + void (*proc)(void*), + void* arg, + size_t len, + int state, +#ifdef SPL_DEBUG_THREAD + char* filename, + int line, +#endif + pri_t pri) +{ + NTSTATUS status; + HANDLE hThread = NULL; + PETHREAD eThread = NULL; + +#ifdef SPL_DEBUG_THREAD + dprintf("Start thread pri %d\n", pri); +#endif + + status = PsCreateSystemThread( + &hThread, + THREAD_ALL_ACCESS, + NULL, + NULL, + NULL, + proc, + arg); + + if (!NT_SUCCESS(status)) + return NULL; + + /* Convert HANDLE ETHREAD */ + status = ObReferenceObjectByHandle( + hThread, + THREAD_ALL_ACCESS, + *PsThreadType, + KernelMode, + (PVOID*)&eThread, + NULL); + + /* We no longer need the handle */ + ZwClose(hThread); + + if (!NT_SUCCESS(status)) + return NULL; + + /* Clamp priority to safe Windows range */ + KPRIORITY newPri = (KPRIORITY)pri; + + if (newPri > maxclsyspri) + newPri = maxclsyspri; + + if (newPri < minclsyspri) + newPri = minclsyspri; + + /* Set absolute priority */ + KeSetPriorityThread((PKTHREAD)eThread, newPri); + +#ifdef SPL_DEBUG_THREAD + dprintf("Thread created with priority %d\n", newPri); +#endif + + atomic_inc_64(&zfs_threads); + + return (kthread_t*)eThread; +} + +/*kthread_t* spl_thread_create( caddr_t stk, size_t stksize, @@ -72,12 +143,7 @@ spl_thread_create( if (result != STATUS_SUCCESS) return (NULL); - /* - * Improve the priority when asked to do so - * Thread priorities range from 0 to 31, where 0 is the lowest - * priority and 31 is the highest - */ - + if (pri > minclsyspri) { // thread_precedence_policy_data_t policy; // policy.importance = pri - minclsyspri; @@ -102,7 +168,7 @@ spl_thread_create( ObDereferenceObject(eThread); ZwClose(thread); return ((kthread_t *)eThread); -} +}*/ kthread_t * spl_current_thread(void) diff --git a/module/os/windows/spl/spl-vmem.c b/module/os/windows/spl/spl-vmem.c index 2540dca395aa..4c3d0a645aea 100644 --- a/module/os/windows/spl/spl-vmem.c +++ b/module/os/windows/spl/spl-vmem.c @@ -448,6 +448,8 @@ uint64_t spl_frag_walk_cnt = 0; extern void spl_free_set_emergency_pressure(int64_t p); extern uint64_t segkmem_total_mem_allocated; extern uint64_t total_memory; +extern uint64_t zfs_arc_max; +extern int zfs_prealloc_percent; /* * Get a vmem_seg_t from the global segfree list. 
@@ -1732,12 +1734,23 @@ vmem_xfree(vmem_t *vmp, void *vaddr, size_t size) vsp = vprev; } + // Calling vm_source_free would release this memory back to Windows; + // when zfs_prealloc_percent is enabled we only allow that once the + // allocation crosses the ARC limit. + boolean_t allow_vm_source_free = true; + if (zfs_prealloc_percent) { + if (segkmem_total_mem_allocated < + (zfs_arc_max * 102) / 100) { + allow_vm_source_free = false; + } + } + /* * If the entire span is free, return it to the source. */ if (vsp->vs_aprev->vs_import && vmp->vm_source_free != NULL && vsp->vs_aprev->vs_type == VMEM_SPAN && - vsp->vs_anext->vs_type == VMEM_SPAN) { + vsp->vs_anext->vs_type == VMEM_SPAN && allow_vm_source_free) { vaddr = (void *)vsp->vs_start; size = VS_SIZE(vsp); ASSERT(size == VS_SIZE(vsp->vs_aprev)); diff --git a/module/os/windows/spl/spl-windows.c b/module/os/windows/spl/spl-windows.c index e6b31edac923..3aabbaa91116 100644 --- a/module/os/windows/spl/spl-windows.c +++ b/module/os/windows/spl/spl-windows.c @@ -53,12 +53,14 @@ volatile unsigned int vm_page_speculative_count = 5500; uint64_t spl_GetPhysMem(void); uint64_t spl_GetZfsTotalMemory(PUNICODE_STRING RegistryPath); +uint64_t spl_getZfsPreallocSize(PUNICODE_STRING RegistryPath); #include #include // Size in bytes of the memory allocated in seg_kmem extern uint64_t segkmem_total_mem_allocated; +extern int zfs_prealloc_percent; #define MAXHOSTNAMELEN 64 extern char hostname[MAXHOSTNAMELEN]; @@ -528,6 +530,11 @@ spl_start(PUNICODE_STRING RegistryPath) spl_mutex_subsystem_init(); spl_kmem_init(total_memory); + // Get the registry value now, because ZFS loads its registry settings a little later + int reg_val = spl_getZfsPreallocSize(RegistryPath); + if (reg_val != 0) + zfs_prealloc_percent = reg_val; + spl_vnode_init(); spl_kmem_thread_init(); spl_kmem_mp_init(); @@ -755,3 +762,91 @@ spl_GetZfsTotalMemory(PUNICODE_STRING RegistryPath) ZwClose(h); return (newvalue); } + +uint64_t +spl_getZfsPreallocSize(PUNICODE_STRING RegistryPath) +{ + OBJECT_ATTRIBUTES ObjectAttributes; + HANDLE h; + NTSTATUS status; + uint64_t newvalue = 0; + + InitializeObjectAttributes(&ObjectAttributes, + RegistryPath, + OBJ_KERNEL_HANDLE | OBJ_CASE_INSENSITIVE, + NULL, + NULL); + + status = ZwOpenKey(&h, // KeyHandle + KEY_ALL_ACCESS, // DesiredAccess + &ObjectAttributes); // ObjectAttributes + + if (!NT_SUCCESS(status)) { + dprintf("%s: Unable to open Registry %wZ: 0x%x. " + "Going with defaults.\n", __func__, RegistryPath, status); + return (0); + } + + ULONG index = 0; + ULONG length = 0; + PKEY_VALUE_FULL_INFORMATION regBuffer = NULL; + + for (index = 0; status != STATUS_NO_MORE_ENTRIES; index++) { + // Get the buffer size necessary + status = ZwEnumerateValueKey(h, index, KeyValueFullInformation, + NULL, 0, &length); + + if ((status != STATUS_BUFFER_TOO_SMALL) && + (status != STATUS_BUFFER_OVERFLOW)) + break; // Something is wrong - or we finished + + // Allocate space to hold the value + regBuffer = (PKEY_VALUE_FULL_INFORMATION)ExAllocatePoolWithTag( + NonPagedPoolNx, length, 'zfsr'); + + if (regBuffer == NULL) + break; + + status = ZwEnumerateValueKey(h, index, KeyValueFullInformation, + regBuffer, length, &length); + if (!NT_SUCCESS(status)) { + break; + } + // Convert name to straight ASCII so we can compare with kstat names + ULONG outlen = 0; + char keyname[KSTAT_STRLEN + 1] = { 0 }; + status = RtlUnicodeToUTF8N(keyname, KSTAT_STRLEN, &outlen, + regBuffer->Name, regBuffer->NameLength); + + // Conversion failed? move along.. 
+ if (status != STATUS_SUCCESS && status + != STATUS_SOME_NOT_MAPPED) + break; + + // Output string is only null terminated if input is, + // so do so now. + keyname[outlen] = 0; + if (strcasecmp("zfs_prealloc_percent", keyname) == 0) { + if (regBuffer->Type != REG_DWORD || + regBuffer->DataLength != sizeof (uint32_t)) { + dprintf("%s: registry '%s' did not match. " + "Type needs to be REG_DWORD. (4 bytes)\n", + __func__, keyname); + } else { + newvalue = *(uint32_t *)((uint8_t *)regBuffer + + regBuffer->DataOffset); + dprintf("%s: zfs_prealloc_percent is set to:" + " %llu\n", __func__, newvalue); + } + break; + } + ExFreePool(regBuffer); + regBuffer = NULL; + } + + if (regBuffer) + ExFreePool(regBuffer); + + ZwClose(h); + return (newvalue); +} diff --git a/module/os/windows/zfs/arc_os.c b/module/os/windows/zfs/arc_os.c index 41eef66413f3..6443f9acd095 100644 --- a/module/os/windows/zfs/arc_os.c +++ b/module/os/windows/zfs/arc_os.c @@ -59,6 +59,7 @@ #include extern arc_stats_t arc_stats; +extern uint64_t zfs_arc_max; static kmutex_t arc_reclaim_lock; static kcondvar_t arc_reclaim_thread_cv; @@ -127,7 +128,7 @@ arc_free_memory(void) int64_t arc_available_memory(void) { - return (arc_free_memory() - arc_sys_free); + return (zfs_arc_max - aggsum_value(&arc_sums.arcstat_size)); } int @@ -137,8 +138,7 @@ arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg) /* possibly wake up arc reclaim thread */ if (arc_reclaim_in_loop == B_FALSE) { - if (spl_free_manual_pressure_wrapper() != 0 || - !spl_minimal_physmem_p() || + if (!spl_minimal_physmem_p() || arc_reclaim_needed()) { cv_signal(&arc_reclaim_thread_cv); kpreempt(KPREEMPT_SYNC); @@ -696,6 +696,8 @@ arc_kstat_update_windows(kstat_t *ksp, int rw) zfs_arc_average_blocksize = ks->arc_zfs_arc_average_blocksize.value.ui64; zvol_threads = ks->zvol_io_threads.value.ui32; + zfs_prealloc_percent = ks->zfs_prealloc_percent.value.ui32; + zfs_adc_target_sync_pct = ks->zfs_adc_target_sync_pct.value.ui32; #ifdef _KERNEL if (ks->zfs_total_memory_limit.value.ui64 > total_memory && @@ -731,6 +733,8 @@ arc_kstat_update_windows(kstat_t *ksp, int rw) ks->arc_zfs_arc_average_blocksize.value.ui64 = zfs_arc_average_blocksize; ks->zvol_io_threads.value.ui32 = zvol_threads; + ks->zfs_prealloc_percent.value.ui32 = zfs_prealloc_percent; + ks->zfs_adc_target_sync_pct.value.ui32 = zfs_adc_target_sync_pct; #ifdef _KERNEL ks->zfs_total_memory_limit.value.ui64 = total_memory; @@ -796,7 +800,7 @@ arc_prune_async(int64_t adjust) int64_t arc_available_memory(void) { - return (arc_free_memory() - arc_sys_free); + return (zfs_arc_max - aggsum_value(&arc_sums.arcstat_size)); } int diff --git a/module/os/windows/zfs/zfs_kstat_windows.c b/module/os/windows/zfs/zfs_kstat_windows.c index 36799e2fdd40..5c52268cd502 100644 --- a/module/os/windows/zfs/zfs_kstat_windows.c +++ b/module/os/windows/zfs/zfs_kstat_windows.c @@ -113,6 +113,7 @@ windows_kstat_t windows_kstat = { {"spa_mode_global", KSTAT_DATA_INT64 }, {"zfs_flags", KSTAT_DATA_INT64 }, {"zfs_txg_timeout", KSTAT_DATA_INT64 }, + {"zfs_adc_enable", KSTAT_DATA_INT64 }, {"zfs_vdev_cache_max", KSTAT_DATA_INT64 }, {"zfs_vdev_cache_size", KSTAT_DATA_INT64 }, {"zfs_vdev_cache_bshift", KSTAT_DATA_INT64 }, @@ -173,7 +174,9 @@ windows_kstat_t windows_kstat = { { "zfs_total_memory_limit", KSTAT_DATA_UINT64 }, { "zfs_removal_suspend_progress", KSTAT_DATA_INT32 }, { "cpu_avx_supported", KSTAT_DATA_UINT32 }, - { "zvol_io_threads", KSTAT_DATA_UINT32 } + { "zvol_io_threads", KSTAT_DATA_UINT32 }, + { "zfs_prealloc_percent", 
KSTAT_DATA_UINT32 }, + { "zfs_adc_target_sync_pct", KSTAT_DATA_UINT32 }, }; @@ -290,6 +293,8 @@ windows_kstat_update(kstat_t *ksp, int rw) ks->zfs_flags.value.i64; zfs_txg_timeout = ks->zfs_txg_timeout.value.i64; + zfs_adc_enable = + ks->zfs_adc_enable.value.i64; zfs_vdev_cache_max = ks->zfs_vdev_cache_max.value.i64; zfs_vdev_cache_size = @@ -383,6 +388,10 @@ windows_kstat_update(kstat_t *ksp, int rw) ks->zfs_removal_suspend_progress.value.i32; cpu_avx_supported = ks->cpu_avx_supported.value.ui32; + zfs_prealloc_percent = + ks->zfs_prealloc_percent.value.ui32; + zfs_adc_target_sync_pct = + ks->zfs_adc_target_sync_pct.value.ui32; } else { /* kstat READ */ @@ -482,6 +491,8 @@ windows_kstat_update(kstat_t *ksp, int rw) zfs_flags; ks->zfs_txg_timeout.value.i64 = zfs_txg_timeout; + ks->zfs_adc_enable.value.i64 = + zfs_adc_enable; ks->zfs_vdev_cache_max.value.i64 = zfs_vdev_cache_max; ks->zfs_vdev_cache_size.value.i64 = @@ -573,6 +584,10 @@ windows_kstat_update(kstat_t *ksp, int rw) cpu_avx_supported; ks->zvol_io_threads.value.ui32 = zvol_threads; + ks->zfs_prealloc_percent.value.ui32 = + zfs_prealloc_percent; + ks->zfs_adc_target_sync_pct.value.ui32 = + zfs_adc_target_sync_pct; } arc_kstat_update_windows(ksp, rw); return (0); diff --git a/module/os/windows/zfs/zvol_os.c b/module/os/windows/zfs/zvol_os.c index 0500d2fdffc7..359a185b13cd 100644 --- a/module/os/windows/zfs/zvol_os.c +++ b/module/os/windows/zfs/zvol_os.c @@ -46,6 +46,7 @@ unsigned int zvol_request_sync = 0; unsigned int zvol_prefetch_bytes = (128 * 1024); unsigned long zvol_max_discard_blocks = 16384; int zvol_threads = 0; +int zfs_prealloc_percent = 100; taskq_t *zvol_taskq; diff --git a/module/zfs/arc.c b/module/zfs/arc.c index e29e81201e50..5790035d4664 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -462,6 +462,7 @@ int zfs_arc_meta_prune = 10000; int zfs_arc_meta_strategy = ARC_STRATEGY_META_BALANCED; int zfs_arc_meta_adjust_restarts = 4096; int zfs_arc_lotsfree_percent = 10; +uint64_t dirty_ceil_bytes; /* The 6 states: */ arc_state_t ARC_anon; @@ -7998,6 +7999,8 @@ arc_init(void) zfs_dirty_data_max = MIN(zfs_dirty_data_max, zfs_dirty_data_max_max); } + + dirty_ceil_bytes = zfs_dirty_data_max; } void diff --git a/module/zfs/txg.c b/module/zfs/txg.c index 46c3bb1bcd0b..a37f79b422c5 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -34,6 +34,7 @@ #include #include #include +#include /* * ZFS Transaction Groups @@ -113,6 +114,286 @@ static void txg_quiesce_thread(void *arg); int zfs_txg_timeout = 1; /* max seconds worth of delta per txg */ +#define DIRTY_FLOOR_BYTES (153ULL << 20) +kstat_t* dd_ksp; +int64_t kstat_adc_target = 0; +int64_t kstat_spa_sync_time = 0; +uint64_t kstat_data_flushed_per_sync = 0; +uint64_t kstat_total_dirty_data = 0; + +dynamic_dirty_data_stats_t dynamic_dirty_data_stats = { + { "adc_target", KSTAT_DATA_INT64 }, + { "spa_sync_time", KSTAT_DATA_INT64 }, + { "data_flushed_per_sync", KSTAT_DATA_UINT64 }, + { "total_dirty_data", KSTAT_DATA_UINT64 }, + +}; +/* + * Target spa_sync duration as a fraction of zfs_txg_timeout. + * 75 = target 75% of timeout. This headroom allows for burst + * without immediately hitting throttle. 
+ * + * Lower → more headroom, lower peak throughput + * Higher → more throughput, less burst tolerance + */ +uint_t zfs_adc_target_sync_pct = 75; + +/* PID gains, all scaled ×1000 to avoid floating point */ +int zfs_adc_kp = 200; /* Proportional: main corrective force */ +int zfs_adc_ki = 15; /* Integral: eliminates steady-state bias */ +int zfs_adc_kd = 100; /* Derivative: damping against oscillation */ + +/* EMA smoothing window in TXG count */ +uint_t zfs_adc_ema_alpha_pct = 25; /* 25% weight on new sample */ + +/* Minimum TXGs between dirty_max updates (anti-flapping) */ +uint_t zfs_adc_holdoff_txgs = 2; + +/* Master enable — 0 reverts to stock ZFS behavior instantly */ +int zfs_adc_enable = 1; + +/* ============================================================ + * ADC STATE — one instance on the stack of txg_sync_thread + * No heap allocation, no lock needed (single-threaded use) + * ============================================================ */ + +typedef struct { + /* PID state */ + int64_t adc_integral; /* accumulated error (I term) */ + int64_t adc_prev_error; /* last error sample (D term) */ + clock_t adc_ema_delta; /* smoothed spa_sync duration */ + uint64_t adc_last_txg; /* TXG of last adjustment */ + + /* Bounds in bytes, computed once from physmem */ + uint64_t adc_min_dirty; + uint64_t adc_max_dirty; + + /* Diagnostics / kstat shadow */ + uint64_t adc_n_syncs; /* total TXGs observed */ + uint64_t adc_n_raised; /* times dirty_max was raised */ + uint64_t adc_n_lowered; /* times dirty_max was lowered */ + uint64_t adc_n_clamped; /* times a bound was hit */ + int64_t adc_last_p; /* last P term for debug */ + int64_t adc_last_i; /* last I term for debug */ + int64_t adc_last_d; /* last D term for debug */ +} txg_adc_t; + +static int +dynamic_dirty_data_kstat_update(kstat_t* ksp, int rw) { + dynamic_dirty_data_stats_t* as = ksp->ks_data; + + if (rw == KSTAT_WRITE) + return (SET_ERROR(EACCES)); + as->adc_target.value.i64 = + kstat_adc_target; + as->spa_sync_time.value.i64 = + kstat_spa_sync_time; + as->data_flushed_per_sync.value.ui64 = + kstat_data_flushed_per_sync; + as->total_dirty_data.value.ui64 = + kstat_total_dirty_data; + + return (0); +} +/* + * adc_ema — Exponential moving average, integer arithmetic. + * + * new_ema = prev × (1 - α) + sample × α + * = prev + (sample - prev) × alpha_pct / 100 + * + * alpha_pct=25 means 25% weight on the newest sample, giving + * a smoothing time constant of ~3 TXGs — fast enough to track + * load changes, slow enough to ignore one-off scrub/snapshot bursts. + */ +static inline clock_t +adc_ema(clock_t prev, clock_t sample, uint_t alpha_pct) +{ + return (prev + (clock_t)(((int64_t)sample - prev) + * alpha_pct / 100)); +} + +/* + * adc_init — called once before the txg_sync_thread loop. + * + * Seeds the EMA at the target so the first TXG doesn't trigger + * an aggressive correction from a cold zero baseline. 
+ */ +static void +adc_init(txg_adc_t* adc, clock_t target_ticks) +{ + bzero(adc, sizeof(*adc)); + + adc->adc_min_dirty = DIRTY_FLOOR_BYTES; + adc->adc_max_dirty = dirty_ceil_bytes; + + /* Defensive: ensure min < max regardless of tunable misconfiguration */ + if (adc->adc_min_dirty >= adc->adc_max_dirty) + adc->adc_min_dirty = adc->adc_max_dirty / 8; + + /* Seed EMA at target — controller starts in steady state */ + adc->adc_ema_delta = target_ticks; + adc->adc_prev_error = 0; + adc->adc_integral = 0; + + dd_ksp = kstat_create("zfs", 0, "dynamic_dirty_data_stats", "misc", KSTAT_TYPE_NAMED, + sizeof(dynamic_dirty_data_stats) / sizeof(kstat_named_t), KSTAT_FLAG_VIRTUAL); + + if (dd_ksp != NULL) { + dd_ksp->ks_data = &dynamic_dirty_data_stats; + dd_ksp->ks_update = dynamic_dirty_data_kstat_update; + kstat_install(dd_ksp); + } +} + +/* + * adc_update — core PID + dirty_max adjustment. + * + * Called every TXG with the just-measured spa_sync duration. + * Modifies zfs_dirty_data_max in place. + * + * @adc: controller state + * @txg: current TXG id (for holdoff tracking) + * @raw_delta: ddi_get_lbolt() delta from this spa_sync + * @target_ticks: desired spa_sync duration in lbolt ticks + */ +static void +adc_update(txg_adc_t* adc, uint64_t txg, + clock_t raw_delta, clock_t target_ticks, uint64_t data_flushed, uint64_t total_dirty) +{ + int64_t error; /* normalized error × 1000 */ + int64_t p_term; /* proportional correction */ + int64_t i_term; /* integral correction */ + int64_t d_term; /* derivative correction */ + int64_t pid_out; /* combined PID output */ + int64_t adjustment; /* byte delta for dirty_max */ + uint64_t cur, proposed, next; + + kstat_adc_target = (long)(((uint64_t)target_ticks * 1000ULL) / hz); + kstat_spa_sync_time = (long)(((uint64_t)raw_delta * 1000ULL) / hz); + kstat_data_flushed_per_sync = data_flushed; + kstat_total_dirty_data = total_dirty; + + adc->adc_n_syncs++; + + /* Step 1: Update smoothed sync duration via EMA */ + adc->adc_ema_delta = adc_ema(adc->adc_ema_delta, + raw_delta, zfs_adc_ema_alpha_pct); + + /* Step 2: Enforce adjustment holdoff (anti-flapping) */ + if (adc->adc_last_txg != 0 && + (txg - adc->adc_last_txg) < zfs_adc_holdoff_txgs) + return; + + /* Step 3: Compute normalized error + * + * error = (ema_delta - target) / target × 1000 + * + * > 0 : sync taking too long → dirty_max too high → must shrink + * < 0 : sync finishing early → dirty_max too low → can grow + * = 0 : perfect operating point + * + * Example: ema=6s, target=5s → error = +200 (20% over) + * Example: ema=3s, target=5s → error = -400 (40% under) + */ + if (target_ticks == 0) + return; /* Safety: avoid divide-by-zero on misconfiguration */ + + error = ((int64_t)adc->adc_ema_delta - (int64_t)target_ticks) + * 1000LL / (int64_t)target_ticks; + + /* Step 4: P term — immediate response to current error */ + p_term = (int64_t)zfs_adc_kp * error / 1000LL; + + /* Step 5: I term — accumulate to eliminate steady-state offset + * + * Anti-windup clamp: prevents integral from growing unboundedly + * during sustained overload (e.g., pool degraded, resilver running). + * Clamped at ±(30 × Kp) which limits I contribution to ≤3× P max. 
+ */ + adc->adc_integral += error; + { + int64_t windup_limit = 30LL * (int64_t)zfs_adc_kp; + if (adc->adc_integral > windup_limit) adc->adc_integral = windup_limit; + if (adc->adc_integral < -windup_limit) adc->adc_integral = -windup_limit; + } + i_term = (int64_t)zfs_adc_ki * adc->adc_integral / 1000LL; + + /* Step 6: D term — dampen oscillation via rate-of-change */ + d_term = (int64_t)zfs_adc_kd + * (error - adc->adc_prev_error) / 1000LL; + adc->adc_prev_error = error; + + /* Save for diagnostics */ + adc->adc_last_p = p_term; + adc->adc_last_i = i_term; + adc->adc_last_d = d_term; + + /* Step 7: Combine PID output + * + * pid_out > 0 → sync was slow → DECREASE dirty_max + * pid_out < 0 → sync was fast → INCREASE dirty_max + * (sign inversion applied in Step 8) + */ + pid_out = p_term + i_term + d_term; + + if (pid_out == 0) + return; + + /* Step 8: Convert PID output to byte adjustment + * + * adjustment = -(pid_out / 1000) × dirty_max × step_scale + * + * pid_out is in units of 0.1% so dividing by 1000 gives fraction. + * Maximum single-step is capped at 20% of current dirty_max to + * prevent catastrophic collapse from one anomalous TXG. + */ + cur = zfs_dirty_data_max; + + adjustment = -((int64_t)cur / 1000LL) * pid_out; + + /* Cap single-step adjustment at ±20% of current dirty_max */ + { + int64_t max_step = (int64_t)(cur / 5); + if (adjustment > max_step) adjustment = max_step; + if (adjustment < -max_step) adjustment = -max_step; + } + + /* Step 9: Apply bounds */ + proposed = (int64_t)cur + adjustment; + + if (proposed <= adc->adc_min_dirty) { + next = adc->adc_min_dirty; + adc->adc_n_clamped++; + } + else if (proposed >= adc->adc_max_dirty) { + next = adc->adc_max_dirty; + adc->adc_n_clamped++; + } + else { + next = proposed; + } + + /* Step 10: Commit — single store, visible to txg_delay() immediately */ + if (next != cur) { + zfs_dirty_data_max = next; + adc->adc_last_txg = txg; + if (next > cur) adc->adc_n_raised++; + else adc->adc_n_lowered++; + + zfs_dbgmsg("txg_adc txg=%llu ema_delta=%ldms target=%ldms " + "err=%lld P=%lld I=%lld D=%lld " + "dirty_max %lluMB→%lluMB", + (u_longlong_t)txg, + (long)(((uint64_t)adc->adc_ema_delta * 1000ULL) / hz), + (long)(((uint64_t)target_ticks * 1000ULL) / hz), + (longlong_t)error, + (longlong_t)p_term, + (longlong_t)i_term, + (longlong_t)d_term, + (u_longlong_t)(cur >> 20), + (u_longlong_t)(next >> 20)); + } +} /* * Prepare the txg subsystem. */ @@ -213,8 +494,9 @@ txg_sync_start(dsl_pool_t *dp) * 32-bit x86. This is due in part to nested pools and * scrub_visitbp() recursion. */ + tx->tx_sync_thread = thread_create(NULL, 0, txg_sync_thread, - dp, 0, &p0, TS_RUN, defclsyspri); + dp, 0, &p0, TS_RUN, maxclsyspri); mutex_exit(&tx->tx_sync_lock); } @@ -530,11 +812,23 @@ txg_sync_thread(void *arg) tx_state_t *tx = &dp->dp_tx; callb_cpr_t cpr; clock_t start, delta; + /* ADC: declare controller state — stack allocated, + * zero overhead when zfs_adc_enable == 0 */ + txg_adc_t adc; (void) spl_fstrans_mark(); txg_thread_enter(tx, &cpr); start = delta = 0; + /* ADC: compute target once; recomputed if timeout changes. 
+ * target = zfs_txg_timeout × target_pct / 100 */ + clock_t adc_target = (clock_t)(zfs_txg_timeout * hz) + * zfs_adc_target_sync_pct / 100; + + /* ADC: initialize controller — seeds EMA, computes bounds */ + if (zfs_adc_enable) + adc_init(&adc, adc_target); + for (;;) { clock_t timeout = zfs_txg_timeout * hz; clock_t timer; @@ -573,7 +867,7 @@ txg_sync_thread(void *arg) } if (tx->tx_exiting) - txg_thread_exit(tx, &cpr, &tx->tx_sync_thread); + txg_thread_exit(tx, &cpr, &tx->tx_sync_thread); /* * Consume the quiesced txg which has been handed off to @@ -593,11 +887,24 @@ txg_sync_thread(void *arg) mutex_exit(&tx->tx_sync_lock); txg_stat_t *ts = spa_txg_history_init_io(spa, txg, dp); + uint64_t dirty_flushed = spa->spa_dsl_pool->dp_dirty_pertxg[txg & TXG_MASK]; + uint64_t total_dirty = dp->dp_dirty_total; start = ddi_get_lbolt(); spa_sync(spa, txg); delta = ddi_get_lbolt() - start; spa_txg_history_fini_io(spa, ts); + /* ADC: feed measured delta into controller. + * This is the ONLY net-new call in the hot path. + * adc_update() is O(1), no allocation, no lock. + * Recompute target here to pick up runtime tunable changes + * (operator can adjust zfs_txg_timeout or target_pct live). */ + if (zfs_adc_enable) { + adc_target = (clock_t)(zfs_txg_timeout * hz) + * zfs_adc_target_sync_pct / 100; + adc_update(&adc, txg, delta, adc_target, dirty_flushed, total_dirty); + } + mutex_enter(&tx->tx_sync_lock); tx->tx_synced_txg = txg; tx->tx_syncing_txg = 0;
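
Reviewer note: the PID arithmetic in adc_update() is easiest to sanity-check outside the kernel. The sketch below is an illustrative, user-space simulation of that math only (EMA smoothing, normalized error, P/I/D terms, the 20% step cap, and the floor/ceiling clamps) using the patch defaults Kp=200, Ki=15, Kd=100, alpha=25%. It is not part of the patch; HZ, DIRTY_CEIL, and the sim_* names are assumptions chosen for the example, and the holdoff and kstat bookkeeping are omitted.

/*
 * Illustrative user-space sketch of the adc_update() arithmetic.
 * Assumptions (not from the patch): HZ=100 ticks/s, a 4 GiB ceiling,
 * and the sim_* names. Holdoff and kstat bookkeeping are omitted.
 */
#include <stdio.h>
#include <stdint.h>

#define HZ		100		/* assumed lbolt ticks per second */
#define KP		200		/* zfs_adc_kp default */
#define KI		15		/* zfs_adc_ki default */
#define KD		100		/* zfs_adc_kd default */
#define ALPHA_PCT	25		/* zfs_adc_ema_alpha_pct default */
#define DIRTY_FLOOR	(153LL << 20)	/* DIRTY_FLOOR_BYTES from the patch */
#define DIRTY_CEIL	(4096LL << 20)	/* assumed dirty_ceil_bytes */

static int64_t ema, integral, prev_error;
static int64_t dirty_max = DIRTY_CEIL;

static void
sim_adc_update(int64_t raw_delta, int64_t target)
{
	/* EMA: new = prev + (sample - prev) * alpha / 100 */
	ema += (raw_delta - ema) * ALPHA_PCT / 100;

	/* normalized error, in units of 0.1% of the target */
	int64_t error = (ema - target) * 1000 / target;

	int64_t p = KP * error / 1000;

	integral += error;		/* I term with anti-windup clamp */
	if (integral > 30 * KP)
		integral = 30 * KP;
	if (integral < -30 * KP)
		integral = -30 * KP;
	int64_t i = KI * integral / 1000;

	int64_t d = KD * (error - prev_error) / 1000;
	prev_error = error;

	int64_t pid_out = p + i + d;
	if (pid_out == 0)
		return;

	/* slow sync (pid_out > 0) shrinks dirty_max; fast sync grows it */
	int64_t adj = -(dirty_max / 1000) * pid_out;
	int64_t max_step = dirty_max / 5;	/* +/- 20% per step */
	if (adj > max_step)
		adj = max_step;
	if (adj < -max_step)
		adj = -max_step;

	int64_t proposed = dirty_max + adj;
	if (proposed < DIRTY_FLOOR)
		proposed = DIRTY_FLOOR;
	if (proposed > DIRTY_CEIL)
		proposed = DIRTY_CEIL;
	dirty_max = proposed;
}

int
main(void)
{
	/* mirror the patch's Step 3 example: target 5 s, syncs taking 6 s */
	int64_t target = 5 * HZ;

	ema = target;		/* adc_init() seeds the EMA at the target */
	for (int t = 1; t <= 10; t++) {
		sim_adc_update(6 * HZ, target);
		printf("txg %2d: dirty_max = %lld MiB\n",
		    t, (long long)(dirty_max >> 20));
	}
	return (0);
}

With a sustained 6 s sync against a 5 s target, dirty_max walks down from the ceiling in capped steps until either the error clears or the 153 MiB floor is reached, which matches the intended shrink-on-slow-sync behavior described in the Step 7/Step 8 comments.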