KernelTuner · benvanwerkhoven · Jun 12, 2026 · Jun 4, 2026 · Jun 12, 2026 · Jun 12, 2026
diff --git a/doc/source/vocabulary.rst b/doc/source/vocabulary.rst
@@ -13,27 +13,32 @@ In general, it is best to avoid using these parameter names for purposes other t
 
 .. code-block:: python
 
-    kernel_tuner #is inserted by Kernel Tuner to signal the code is compiled using the tuner
+    kernel_tuner # is inserted by Kernel Tuner to signal the code is compiled using the tuner
 
-    block_size_* #reserved for thread block dimensions
-    grid_size_* #reserved for grid dimensions, if you want to tune these use problem_size
+    block_size_* # reserved for thread block dimensions
+    grid_size_* # reserved for grid dimensions; to tune these, use problem_size
 
-    compiler_opt_* #reserved for future support for tuning compiler options
+    compiler_opt_* # reserved for future support for tuning compiler options
+
+    loop_unroll_factor_* # reserved for tunable parameters that specify loop unrolling factors
+
+    nvml_* # reserved for tunable parameters and outputs related to NVML
+    nvml_pwr_limit # use NVML to set power limit
+    nvml_gr_clock # use NVML to set graphics clock
+    nvml_mem_clock # use NVML to set memory clock
+
+    cuda_* # reserved for setting parameters related to CUDA kernel execution
+    cuda_sm_percentage # set the percentage of active SMs (requires cuda-python)
 
-    loop_unroll_factor_* #reserved for tunable parameters that specify loop unrolling factors
 
-    nvml_* #reserved for tunable parameters and outputs related to NVML
-    nvml_pwr_limit #use NVML to set power limit
-    nvml_gr_clock #use NVML to set graphics clock
-    nvml_mem_clock #use NVML to set memory clock
 
 
 There are also a number of names that Kernel Tuner uses for reporting benchmarking results. 
 Because these are reported along with the tunable parameters, it is generally a good idea to not use these names for any tunable parameters.
 
 .. code-block:: python
 
-    time* #reserved for time measurements
+    time* # reserved for time measurements
 
     # Information that can be observed using kernel_tuner.nvml.NVMLObserver:
     nvml_energy

diff --git a/kernel_tuner/backends/nvcuda.py b/kernel_tuner/backends/nvcuda.py
@@ -8,7 +8,7 @@
 from kernel_tuner.backends.backend import GPUBackend
 from kernel_tuner.observers.nvcuda import CudaRuntimeObserver
 from kernel_tuner.util import SkippableFailure
-from kernel_tuner.utils.nvcuda import cuda_error_check, to_valid_nvrtc_gpu_arch_cc, find_cuda_home
+from kernel_tuner.utils.nvcuda import cuda_error_check, to_valid_nvrtc_gpu_arch_cc, find_cuda_home, _check
 
 # embedded in try block to be able to generate documentation
 # and run tests without cuda-python installed
@@ -84,6 +84,9 @@
         cuda_error_check(err)
         err, self.end = driver.cuEventCreate(0)
         cuda_error_check(err)
+        self.current_sm_percentage = 100
+        self.green_ctx_cache = {}
+        self.green_ctx = None
 
         # default dynamically allocated shared memory size, can be overwritten using smem_args
         self.smem_size = 0
@@ -116,10 +119,84 @@
             observer.register_device(self)
 
     def __del__(self):
+        """Release the cached green contexts, their streams, and device allocations."""
+        # __init__ can raise before green_ctx_cache is assigned; getattr keeps this
+        # from failing with AttributeError. Streams go before their green context.
+        for green_ctx, stream, _ in getattr(self, "green_ctx_cache", {}).values():
+            _check(driver.cuStreamDestroy(stream))
+            _check(driver.cuGreenCtxDestroy(green_ctx))
+
+        # Free the device memory tracked in self.allocations
         for device_memory in self.allocations:
             if isinstance(device_memory, driver.CUdeviceptr):
-                err = driver.cuMemFree(device_memory)
-                cuda_error_check(err)
+                _check(driver.cuMemFree(device_memory))
+
+
+    def set_sm_percentage(self, sm_percentage):
+        """Restrict subsequent kernel launches to a percentage of the device's SMs.
+
+        Creates a CUDA green context owning ~``sm_percentage`` percent of the SMs
+        and a stream bound to it, and installs both as ``self.green_ctx`` and
+        ``self.stream``. Green contexts are cached per requested percentage in
+        ``self.green_ctx_cache`` and reused. The actual number of SMs may not
+        exactly match the request; observers can read the assigned count from
+        ``self.assigned_sm_count`` and the requested percentage from
+        ``self.current_sm_percentage``.
+
+        Requires CUDA >= 12.4 and a GPU that supports SM partitioning.
+
+        :param sm_percentage: Percentage of the device's SMs to use, in (0, 100].
+
+        :raises ValueError: If ``sm_percentage`` is not in (0, 100].
+        """
+        if not 0 < sm_percentage <= 100:
+            raise ValueError("sm_percentage must be in (0, 100]")
+
+        # Nothing to do if this percentage is already applied
+        if sm_percentage == self.current_sm_percentage:
+            return
+
+        # Reuse the green context of an earlier request for this percentage
+        if sm_percentage in self.green_ctx_cache:
+            self.green_ctx, self.stream, self.assigned_sm_count = self.green_ctx_cache[sm_percentage]
+            self.current_sm_percentage = sm_percentage
+            return
+
+        # Convert the percentage into an SM count, requesting at least one SM
+        total_sms = _check(driver.cuDeviceGetAttribute(
+            driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, self.device))
+        want = max(1, round(total_sms * sm_percentage / 100.0))
+
+        # Split one group of at least `want` SMs off the device's full SM pool. The
+        # driver rounds up to its partitioning granularity, so it may be larger.
+        sm_resource = _check(driver.cuDeviceGetDevResource(
+            self.device, driver.CUdevResourceType.CU_DEV_RESOURCE_TYPE_SM))
+        groups, _nb, _remaining = _check(driver.cuDevSmResourceSplitByCount(
+            1, sm_resource, 0, want))  # nbGroups, input, useFlags (0 = default), minCount
+        group = groups[0]
+        assigned = group.sm.smCount
+
+        # Descriptor -> green context
+        desc = _check(driver.cuDevResourceGenerateDesc([group], 1))
+        green_ctx = _check(driver.cuGreenCtxCreate(
+            desc, self.device, driver.CUgreenCtxCreate_flags.CU_GREEN_CTX_DEFAULT_STREAM))
+
+        # A stream from the green context confines launches to its SMs. If creating
+        # it fails, the green context is not cached yet and nothing would destroy
+        # it later, so release it here before re-raising.
+        try:
+            stream = _check(driver.cuGreenCtxStreamCreate(
+                green_ctx, driver.CUstream_flags.CU_STREAM_NON_BLOCKING, 0))  # 0 = priority
+        except Exception:
+            driver.cuGreenCtxDestroy(green_ctx)  # best effort; keep the original error
+            raise
+
+        self.green_ctx_cache[sm_percentage] = (green_ctx, stream, assigned)
+        self.green_ctx = green_ctx
+        self.stream = stream
+        self.assigned_sm_count = assigned
+        self.current_sm_percentage = sm_percentage
+
 
     def ready_argument_list(self, arguments):
         """Ready argument list to be passed to the kernel, allocates gpu mem.
@@ -146,6 +223,7 @@
                 gpu_args.append(arg)
         return gpu_args
 
+
     def compile(self, kernel_instance):
         """Call the CUDA compiler to compile the kernel, return the device function.
 

diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py
@@ -470,8 +470,12 @@ def benchmark(self, func, gpu_args, instance, verbose, objective, skip_nvml_sett
         logging.debug("thread block dimensions x,y,z=%d,%d,%d", *instance.threads)
         logging.debug("grid dimensions x,y,z=%d,%d,%d", *instance.grid)
 
+        # Set execution parameters
         if self.use_nvml and not skip_nvml_setting:
             self.set_nvml_parameters(instance)
+        if "cuda_sm_percentage" in instance.params:
+            # Currently only supported on cuda-python (NVCUDA)
+            self.dev.set_sm_percentage(instance.params["cuda_sm_percentage"])
 
         # Call the observers to register the configuration to be benchmarked
         for obs in self.dev.observers:

diff --git a/kernel_tuner/utils/nvcuda.py b/kernel_tuner/utils/nvcuda.py
@@ -62,6 +62,15 @@ def cuda_error_check(error):
             raise RuntimeError(f"NVRTC error: {desc.decode()}")
 
 
+def _check(call_result):
+    """Raise if a cuda-python call failed, else return its payload (None, one value, or a tuple)."""
+    status, *payload = call_result
+    cuda_error_check(status)
+    if len(payload) > 1:
+        return tuple(payload)
+    return payload[0] if payload else None
+
+
 def to_valid_nvrtc_gpu_arch_cc(compute_capability: str) -> str:
     """Returns a valid Compute Capability for NVRTC `--gpu-architecture=`, as per https://docs.nvidia.com/cuda/nvrtc/index.html#group__options."""
     return max(NVRTC_VALID_CC[NVRTC_VALID_CC <= compute_capability], default="75")

diff --git a/test/test_cuda_functions.py b/test/test_cuda_functions.py
@@ -56,6 +56,23 @@ def test_compile():
     dev = nvcuda.CudaFunctions(0)
     dev.compile(kernel_instance)
 
+
+@skip_if_no_cuda
+def test_set_sm_percentage():
+    """Requesting 50% of the SMs activates a cached green context with its own stream."""
+    backend = nvcuda.CudaFunctions(0)
+    initial_stream = backend.stream
+    requested = 50
+
+    backend.set_sm_percentage(requested)
+
+    assert backend.current_sm_percentage == requested
+    assert requested in backend.green_ctx_cache
+    assert backend.green_ctx is not None
+    assert not (backend.stream == initial_stream)  # a new stream was installed
+    assert backend.assigned_sm_count
+
+
 @skip_if_no_cuda
 def test_compile_template():