From cc510a36e735f7565a7a6e4fa2f06b0adcc38082 Mon Sep 17 00:00:00 2001
From: Ben van Werkhoven
Date: Thu, 4 Jun 2026 17:36:13 +0200
Subject: [PATCH 1/4] add support for setting sm count in nvcuda backend

---
 kernel_tuner/backends/nvcuda.py | 62 ++++++++++++++++++++++++++++++++-
 kernel_tuner/utils/nvcuda.py    |  9 +++++
 2 files changed, 70 insertions(+), 1 deletion(-)

diff --git a/kernel_tuner/backends/nvcuda.py b/kernel_tuner/backends/nvcuda.py
index c4598816..69001190 100644
--- a/kernel_tuner/backends/nvcuda.py
+++ b/kernel_tuner/backends/nvcuda.py
@@ -8,7 +8,7 @@
 from kernel_tuner.backends.backend import GPUBackend
 from kernel_tuner.observers.nvcuda import CudaRuntimeObserver
 from kernel_tuner.util import SkippableFailure
-from kernel_tuner.utils.nvcuda import cuda_error_check, to_valid_nvrtc_gpu_arch_cc, find_cuda_home
+from kernel_tuner.utils.nvcuda import cuda_error_check, to_valid_nvrtc_gpu_arch_cc, find_cuda_home, _check
 
 # embedded in try block to be able to generate documentation
 # and run tests without cuda-python installed
@@ -84,6 +84,7 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
         cuda_error_check(err)
         err, self.end = driver.cuEventCreate(0)
         cuda_error_check(err)
+        self.green_ctx = None
 
         # default dynamically allocated shared memory size, can be overwritten using smem_args
         self.smem_size = 0
@@ -115,6 +116,61 @@ def __del__(self):
                 err = driver.cuMemFree(device_memory)
                 cuda_error_check(err)
 
+
+    def set_sm_percentage(self, sm_percentage):
+        """
+        Create a CUDA green context owning ~`sm_percentage` of the device's SMs
+        and return a stream bound to it. Kernels launched on the returned stream
+        are restricted to that SM partition.
+
+        Returns: (green_ctx, stream, num_sms_assigned)
+        Requires: CUDA >= 12.4 and a GPU that supports SM partitioning.
+        """
+
+        if not 0 < sm_percentage <= 100:
+            raise ValueError("sm_percentage must be in (0, 100]")
+
+        # Cleanup old stream and green context if any
+        _check(driver.cuStreamDestroy(self.stream))
+        if self.green_ctx:
+            _check(driver.cuGreenCtxDestroy(self.green_ctx))
+
+        # Get total SMs and desired percentage
+        total_sms = _check(driver.cuDeviceGetAttribute(
+            driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, self.device))
+        want = max(1, round(total_sms * sm_percentage / 100.0))
+
+        # Full SM resource pool of the device.
+        sm_resource = _check(driver.cuDeviceGetDevResource(
+            self.device, driver.CUdevResourceType.CU_DEV_RESOURCE_TYPE_SM))
+
+        # Split off one group of at least `want` SMs. The driver rounds up to the
+        # device's partitioning granularity, so the actual count may be larger.
+        groups, _nb, _remaining = _check(driver.cuDevSmResourceSplitByCount(
+            1,  # number of groups requested
+            sm_resource,  # input resource
+            0,  # useFlags (0 = default)
+            want,  # minCount of SMs per group
+        ))
+        group = groups[0]
+        assigned = group.sm.smCount
+
+        # Descriptor -> green context.
+        desc = _check(driver.cuDevResourceGenerateDesc([group], 1))
+        green_ctx = _check(driver.cuGreenCtxCreate(
+            desc, self.device, driver.CUgreenCtxCreate_flags.CU_GREEN_CTX_DEFAULT_STREAM))
+
+        # A stream from the green context confines launches to its SMs.
+        stream = _check(driver.cuGreenCtxStreamCreate(
+            green_ctx,
+            driver.CUstream_flags.CU_STREAM_NON_BLOCKING,
+            0,  # priority
+        ))
+        self.green_ctx = green_ctx
+        self.stream = stream
+        self.assigned_sm_count = assigned
+
+
     def ready_argument_list(self, arguments):
         """Ready argument list to be passed to the kernel, allocates gpu mem.
 
@@ -140,6 +196,7 @@ def ready_argument_list(self, arguments):
                 gpu_args.append(arg)
         return gpu_args
 
+
     def compile(self, kernel_instance):
         """Call the CUDA compiler to compile the kernel, return the device function.
 
@@ -223,6 +280,9 @@ def compile(self, kernel_instance):
             print(log.decode("utf-8"))
             raise re
 
+        if "CUDA_SM_PERCENTAGE" in kernel_instance.params:
+            self.set_sm_percentage(kernel_instance.params["CUDA_SM_PERCENTAGE"])
+
         return self.func
 
     def start_event(self):
diff --git a/kernel_tuner/utils/nvcuda.py b/kernel_tuner/utils/nvcuda.py
index cffbfea0..83dca98a 100644
--- a/kernel_tuner/utils/nvcuda.py
+++ b/kernel_tuner/utils/nvcuda.py
@@ -62,6 +62,15 @@ def cuda_error_check(error):
             raise RuntimeError(f"NVRTC error: {desc.decode()}")
 
 
+def _check(call_result):
+    """Unwrap a cuda-python (CUresult, ...) return tuple and raise on error."""
+    err, *rest = call_result
+    cuda_error_check(err)
+    if not rest:
+        return None
+    return rest[0] if len(rest) == 1 else tuple(rest)
+
+
 def to_valid_nvrtc_gpu_arch_cc(compute_capability: str) -> str:
     """Returns a valid Compute Capability for NVRTC `--gpu-architecture=`, as per https://docs.nvidia.com/cuda/nvrtc/index.html#group__options."""
     return max(NVRTC_VALID_CC[NVRTC_VALID_CC <= compute_capability], default="75")

From 92acaedad3203115112b74fe3496eecefd6c2d7c Mon Sep 17 00:00:00 2001
From: Ben van Werkhoven
Date: Fri, 12 Jun 2026 15:34:30 +0200
Subject: [PATCH 2/4] add cache for green contexts, set sm count on benchmark
 instead of compile

---
 kernel_tuner/backends/nvcuda.py | 31 ++++++++++++++++++++++---------
 kernel_tuner/core.py            |  8 ++++++++
 2 files changed, 30 insertions(+), 9 deletions(-)

diff --git a/kernel_tuner/backends/nvcuda.py b/kernel_tuner/backends/nvcuda.py
index 69001190..831102e7 100644
--- a/kernel_tuner/backends/nvcuda.py
+++ b/kernel_tuner/backends/nvcuda.py
@@ -84,6 +84,8 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
         cuda_error_check(err)
         err, self.end = driver.cuEventCreate(0)
         cuda_error_check(err)
+        self.current_sm_percentage = 100
+        self.green_ctx_cache = {}
         self.green_ctx = None
 
         # default dynamically allocated shared memory size, can be overwritten using smem_args
@@ -111,10 +113,17 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
         self.name = env["device_name"]
 
     def __del__(self):
+        # Cleanup streams and green contexts, if any. The cache attribute is
+        # missing when __init__ raised before reaching its assignment.
+        green_ctx_cache = getattr(self, "green_ctx_cache", {})
+        for green_ctx, stream, _ in green_ctx_cache.values():
+            _check(driver.cuStreamDestroy(stream))
+            _check(driver.cuGreenCtxDestroy(green_ctx))
+
+        # Cleanup
         for device_memory in self.allocations:
             if isinstance(device_memory, driver.CUdeviceptr):
-                err = driver.cuMemFree(device_memory)
-                cuda_error_check(err)
+                _check(driver.cuMemFree(device_memory))
 
 
     def set_sm_percentage(self, sm_percentage):
@@ -130,10 +139,15 @@ def set_sm_percentage(self, sm_percentage):
         if not 0 < sm_percentage <= 100:
             raise ValueError("sm_percentage must be in (0, 100]")
 
-        # Cleanup old stream and green context if any
-        _check(driver.cuStreamDestroy(self.stream))
-        if self.green_ctx:
-            _check(driver.cuGreenCtxDestroy(self.green_ctx))
+        # Check if sm_percentage is already applied
+        if sm_percentage == self.current_sm_percentage:
+            return
+
+        # Check if this sm_percentage has been requested before
+        if sm_percentage in self.green_ctx_cache:
+            self.green_ctx, self.stream, self.assigned_sm_count = self.green_ctx_cache[sm_percentage]
+            self.current_sm_percentage = sm_percentage
+            return
 
         # Get total SMs and desired percentage
         total_sms = _check(driver.cuDeviceGetAttribute(
@@ -166,9 +180,11 @@ def set_sm_percentage(self, sm_percentage):
             driver.CUstream_flags.CU_STREAM_NON_BLOCKING,
             0,  # priority
         ))
+        self.green_ctx_cache[sm_percentage] = (green_ctx, stream, assigned)
         self.green_ctx = green_ctx
         self.stream = stream
         self.assigned_sm_count = assigned
+        self.current_sm_percentage = sm_percentage
 
 
     def ready_argument_list(self, arguments):
@@ -280,9 +296,6 @@ def compile(self, kernel_instance):
             print(log.decode("utf-8"))
             raise re
 
-        if "CUDA_SM_PERCENTAGE" in kernel_instance.params:
-            self.set_sm_percentage(kernel_instance.params["CUDA_SM_PERCENTAGE"])
-
         return self.func
 
     def start_event(self):
diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py
index 69d95254..6779983a 100644
--- a/kernel_tuner/core.py
+++ b/kernel_tuner/core.py
@@ -470,8 +470,16 @@ def benchmark(self, func, gpu_args, instance, verbose, objective, skip_nvml_sett
         logging.debug("thread block dimensions x,y,z=%d,%d,%d", *instance.threads)
         logging.debug("grid dimensions x,y,z=%d,%d,%d", *instance.grid)
 
+        # Set execution parameters
         if self.use_nvml and not skip_nvml_setting:
             self.set_nvml_parameters(instance)
+        if "cuda_sm_percentage" in instance.params:
+            # Currently only supported on cuda-python (NVCUDA)
+            if not hasattr(self.dev, "set_sm_percentage"):
+                raise NotImplementedError(
+                    "cuda_sm_percentage is only supported by the cuda-python (nvcuda) backend"
+                )
+            self.dev.set_sm_percentage(instance.params["cuda_sm_percentage"])
 
         # Call the observers to register the configuration to be benchmarked
         for obs in self.dev.observers:

From cf5986ab115701687a5fb30ab7cd6a00a33bf3ca Mon Sep 17 00:00:00 2001
From: Ben van Werkhoven
Date: Fri, 12 Jun 2026 15:43:27 +0200
Subject: [PATCH 3/4] update documentation

---
 doc/source/vocabulary.rst       | 25 +++++++++++++++----------
 kernel_tuner/backends/nvcuda.py | 13 +++++++++----
 2 files changed, 24 insertions(+), 14 deletions(-)

diff --git a/doc/source/vocabulary.rst b/doc/source/vocabulary.rst
index 084d09cb..a35db6d6 100644
--- a/doc/source/vocabulary.rst
+++ b/doc/source/vocabulary.rst
@@ -13,19 +13,24 @@ In general, it is best to avoid using these parameter names for purposes other t
 
 .. code-block:: python
 
-    kernel_tuner #is inserted by Kernel Tuner to signal the code is compiled using the tuner
+    kernel_tuner # is inserted by Kernel Tuner to signal the code is compiled using the tuner
 
-    block_size_* #reserved for thread block dimensions
-    grid_size_* #reserved for grid dimensions, if you want to tune these use problem_size
+    block_size_* # reserved for thread block dimensions
+    grid_size_* # reserved for grid dimensions, if you want to tune these use problem_size
 
-    compiler_opt_* #reserved for future support for tuning compiler options
+    compiler_opt_* # reserved for future support for tuning compiler options
+
+    loop_unroll_factor_* # reserved for tunable parameters that specify loop unrolling factors
+
+    nvml_* # reserved for tunable parameters and outputs related to NVML
+    nvml_pwr_limit # use NVML to set power limit
+    nvml_gr_clock # use NVML to set graphics clock
+    nvml_mem_clock # use NVML to set memory clock
+
+    cuda_* # reserved for setting parameters related to CUDA kernel execution
+    cuda_sm_percentage # set the percentage of active SMs (requires cuda-python)
 
-    loop_unroll_factor_* #reserved for tunable parameters that specify loop unrolling factors
 
-    nvml_* #reserved for tunable parameters and outputs related to NVML
-    nvml_pwr_limit #use NVML to set power limit
-    nvml_gr_clock #use NVML to set graphics clock
-    nvml_mem_clock #use NVML to set memory clock
 
 
 There are also a number of names that Kernel Tuner uses for reporting benchmarking results.
@@ -33,7 +38,7 @@ Because these are reported along with the tunable parameters, it is generally a
 
 .. code-block:: python
 
-    time* #reserved for time measurements
+    time* # reserved for time measurements
 
     # Information that can be observed using kernel_tuner.nvml.NVMLObserver:
     nvml_energy
diff --git a/kernel_tuner/backends/nvcuda.py b/kernel_tuner/backends/nvcuda.py
index 831102e7..1d1ecc39 100644
--- a/kernel_tuner/backends/nvcuda.py
+++ b/kernel_tuner/backends/nvcuda.py
@@ -127,12 +127,17 @@ def __del__(self):
 
 
     def set_sm_percentage(self, sm_percentage):
-        """
+        """Set the active SM percentage.
+
         Create a CUDA green context owning ~`sm_percentage` of the device's SMs
-        and return a stream bound to it. Kernels launched on the returned stream
-        are restricted to that SM partition.
+        and a stream bound to it. Kernels launched afterwards are restricted
+        to that SM partition. Green contexts are cached in self.green_ctx_cache.
+        The actual number of SMs in the partition may not exactly match the
+        requested percentage. An observer may be used to query:
+
+        * Currently assigned number of SMs: self.assigned_sm_count
+        * Currently requested SM percentage: self.current_sm_percentage
 
-        Returns: (green_ctx, stream, num_sms_assigned)
         Requires: CUDA >= 12.4 and a GPU that supports SM partitioning.
         """
 

From d2a552c8f1d86a19a2126cd1cf3fd135b915d329 Mon Sep 17 00:00:00 2001
From: Ben van Werkhoven
Date: Fri, 12 Jun 2026 15:53:11 +0200
Subject: [PATCH 4/4] add test

---
 test/test_cuda_functions.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/test/test_cuda_functions.py b/test/test_cuda_functions.py
index 1fe509c1..f6bd3e8b 100644
--- a/test/test_cuda_functions.py
+++ b/test/test_cuda_functions.py
@@ -56,6 +56,23 @@ def test_compile():
     dev = nvcuda.CudaFunctions(0)
     dev.compile(kernel_instance)
 
+
+@skip_if_no_cuda
+def test_set_sm_percentage():
+
+    dev = nvcuda.CudaFunctions(0)
+    default_stream = dev.stream
+
+    test_value = 50
+    dev.set_sm_percentage(test_value)
+
+    assert dev.current_sm_percentage == test_value
+    assert test_value in dev.green_ctx_cache
+    assert dev.green_ctx is not None
+    assert not dev.stream == default_stream
+    assert dev.assigned_sm_count
+
+
 
 @skip_if_no_cuda
 def test_compile_template():