Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 15 additions & 10 deletions doc/source/vocabulary.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,27 +13,32 @@ In general, it is best to avoid using these parameter names for purposes other t

.. code-block:: python

kernel_tuner #is inserted by Kernel Tuner to signal the code is compiled using the tuner
kernel_tuner # is inserted by Kernel Tuner to signal the code is compiled using the tuner

block_size_* #reserved for thread block dimensions
grid_size_* #reserved for grid dimensions, if you want to tune these use problem_size
block_size_* # reserved for thread block dimensions
grid_size_* # reserved for grid dimensions, if you want to tune these use problem_size

compiler_opt_* #reserved for future support for tuning compiler options
compiler_opt_* # reserved for future support for tuning compiler options

loop_unroll_factor_* # reserved for tunable parameters that specify loop unrolling factors

nvml_* # reserved for tunable parameters and outputs related to NVML
nvml_pwr_limit # use NVML to set power limit
nvml_gr_clock # use NVML to set graphics clock
nvml_mem_clock # use NVML to set memory clock

cuda_* # reserved for setting parameters related to CUDA kernel execution
cuda_sm_percentage # set the percentage of active SMs (requires cuda-python)

loop_unroll_factor_* #reserved for tunable parameters that specify loop unrolling factors

nvml_* #reserved for tunable parameters and outputs related to NVML
nvml_pwr_limit #use NVML to set power limit
nvml_gr_clock #use NVML to set graphics clock
nvml_mem_clock #use NVML to set memory clock


There are also a number of names that Kernel Tuner uses for reporting benchmarking results.
Because these are reported along with the tunable parameters, it is generally a good idea to not use these names for any tunable parameters.

.. code-block:: python

time* #reserved for time measurements
time* # reserved for time measurements

# Information that can be observed using kernel_tuner.nvml.NVMLObserver:
nvml_energy
Expand Down
84 changes: 81 additions & 3 deletions kernel_tuner/backends/nvcuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from kernel_tuner.backends.backend import GPUBackend
from kernel_tuner.observers.nvcuda import CudaRuntimeObserver
from kernel_tuner.util import SkippableFailure
from kernel_tuner.utils.nvcuda import cuda_error_check, to_valid_nvrtc_gpu_arch_cc, find_cuda_home
from kernel_tuner.utils.nvcuda import cuda_error_check, to_valid_nvrtc_gpu_arch_cc, find_cuda_home, _check

# embedded in try block to be able to generate documentation
# and run tests without cuda-python installed
Expand Down Expand Up @@ -84,6 +84,9 @@
cuda_error_check(err)
err, self.end = driver.cuEventCreate(0)
cuda_error_check(err)
self.current_sm_percentage = 100
self.green_ctx_cache = {}
self.green_ctx = None

# default dynamically allocated shared memory size, can be overwritten using smem_args
self.smem_size = 0
Expand Down Expand Up @@ -116,10 +119,84 @@
observer.register_device(self)

def __del__(self):
    """Release the CUDA resources owned by this backend instance.

    Destroys every cached green context together with the stream that was
    created for it, then frees all device allocations tracked in
    ``self.allocations``. Each resource is released exactly once.

    Errors reported by the driver are propagated through ``_check``.
    """
    # __del__ is also invoked when __init__ raised before these attributes
    # were assigned, so fall back to empty containers instead of failing
    # with an AttributeError inside the finalizer.
    green_ctx_cache = getattr(self, "green_ctx_cache", None) or {}
    allocations = getattr(self, "allocations", None) or ()

    # Cleanup streams and green contexts, if any. The stream is destroyed
    # before the green context it was created from.
    for green_ctx, stream, _ in green_ctx_cache.values():
        _check(driver.cuStreamDestroy(stream))
        _check(driver.cuGreenCtxDestroy(green_ctx))

    # Cleanup device memory; entries that are not device pointers are skipped.
    for device_memory in allocations:
        if isinstance(device_memory, driver.CUdeviceptr):
            _check(driver.cuMemFree(device_memory))


def set_sm_percentage(self, sm_percentage):
    """Restrict subsequent kernel launches to a percentage of the device's SMs.

    Creates a CUDA green context owning roughly ``sm_percentage`` percent of
    the device's SMs and a stream bound to it, and installs that stream as
    ``self.stream``. Kernels launched afterwards on that stream are
    restricted to the SM partition. Green contexts are cached in
    ``self.green_ctx_cache`` (keyed by the requested percentage), so each
    partition is created only once.

    The driver rounds the partition up to the device's partitioning
    granularity, so the number of SMs actually assigned may not exactly
    match the requested percentage. After a partition has been applied, an
    observer may query:

    * Currently assigned number of SMs: ``self.assigned_sm_count``
    * Currently requested SM percentage: ``self.current_sm_percentage``

    Requesting the percentage that is already active is a no-op.

    Requires: CUDA >= 12.4 and a GPU that supports SM partitioning.

    :param sm_percentage: Percentage of SMs to make available, in (0, 100].
    :type sm_percentage: int or float

    :raises ValueError: If ``sm_percentage`` is outside (0, 100].
    """
    if not 0 < sm_percentage <= 100:
        raise ValueError("sm_percentage must be in (0, 100]")

    # Nothing to do when the requested percentage is already applied.
    if sm_percentage == self.current_sm_percentage:
        return

    # Reuse the partition if this percentage has been requested before.
    cached = self.green_ctx_cache.get(sm_percentage)
    if cached is not None:
        self.green_ctx, self.stream, self.assigned_sm_count = cached
        self.current_sm_percentage = sm_percentage
        return

    # Translate the percentage into an SM count, always asking for at least one SM.
    total_sms = _check(driver.cuDeviceGetAttribute(
        driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, self.device))
    want = max(1, round(total_sms * sm_percentage / 100.0))

    # Full SM resource pool of the device.
    sm_resource = _check(driver.cuDeviceGetDevResource(
        self.device, driver.CUdevResourceType.CU_DEV_RESOURCE_TYPE_SM))

    # Split off one group of at least `want` SMs. The driver rounds up to the
    # device's partitioning granularity, so the actual count may be larger.
    # Arguments: number of groups requested, input resource,
    # useFlags (0 = default), minCount of SMs per group.
    groups, _nb_groups, _remaining = _check(
        driver.cuDevSmResourceSplitByCount(1, sm_resource, 0, want))
    group = groups[0]
    assigned = group.sm.smCount

    # Descriptor -> green context.
    desc = _check(driver.cuDevResourceGenerateDesc([group], 1))
    green_ctx = _check(driver.cuGreenCtxCreate(
        desc, self.device, driver.CUgreenCtxCreate_flags.CU_GREEN_CTX_DEFAULT_STREAM))

    # A stream from the green context confines launches to its SMs.
    try:
        # The last argument is the stream priority.
        stream = _check(driver.cuGreenCtxStreamCreate(
            green_ctx, driver.CUstream_flags.CU_STREAM_NON_BLOCKING, 0))
    except Exception:
        # The green context is not in the cache yet, so nothing else would
        # ever destroy it. The result of the destroy call is deliberately
        # not checked so that the original error is the one that propagates.
        driver.cuGreenCtxDestroy(green_ctx)
        raise

    # Only publish the new partition once every driver call has succeeded.
    self.green_ctx_cache[sm_percentage] = (green_ctx, stream, assigned)
    self.green_ctx = green_ctx
    self.stream = stream
    self.assigned_sm_count = assigned
    self.current_sm_percentage = sm_percentage


def ready_argument_list(self, arguments):
"""Ready argument list to be passed to the kernel, allocates gpu mem.
Expand All @@ -146,6 +223,7 @@
gpu_args.append(arg)
return gpu_args


def compile(self, kernel_instance):
"""Call the CUDA compiler to compile the kernel, return the device function.

Expand Down
4 changes: 4 additions & 0 deletions kernel_tuner/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -470,8 +470,12 @@ def benchmark(self, func, gpu_args, instance, verbose, objective, skip_nvml_sett
logging.debug("thread block dimensions x,y,z=%d,%d,%d", *instance.threads)
logging.debug("grid dimensions x,y,z=%d,%d,%d", *instance.grid)

# Set execution parameters
if self.use_nvml and not skip_nvml_setting:
self.set_nvml_parameters(instance)
if "cuda_sm_percentage" in instance.params:
# Currently only supported on cuda-python (NVCUDA)
self.dev.set_sm_percentage(instance.params["cuda_sm_percentage"])

# Call the observers to register the configuration to be benchmarked
for obs in self.dev.observers:
Expand Down
9 changes: 9 additions & 0 deletions kernel_tuner/utils/nvcuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,15 @@ def cuda_error_check(error):
raise RuntimeError(f"NVRTC error: {desc.decode()}")


def _check(call_result):
    """Validate a cuda-python return tuple and hand back its payload.

    The first element of ``call_result`` is treated as the status code and
    passed to ``cuda_error_check``, which raises on failure. The remaining
    elements are returned: ``None`` when there are none, the bare value
    when there is exactly one, and a tuple otherwise.
    """
    status, *payload = call_result
    cuda_error_check(status)
    if len(payload) > 1:
        return tuple(payload)
    return payload[0] if payload else None


def to_valid_nvrtc_gpu_arch_cc(compute_capability: str) -> str:
    """Return a Compute Capability accepted by NVRTC's `--gpu-architecture=` option.

    Picks the largest entry of ``NVRTC_VALID_CC`` that compares less than or
    equal to ``compute_capability``, and falls back to ``"75"`` when no entry
    qualifies. See https://docs.nvidia.com/cuda/nvrtc/index.html#group__options.
    """
    # NOTE(review): relies on NVRTC_VALID_CC supporting element-wise
    # comparison and boolean-mask indexing (array-like) -- confirm at its definition.
    not_newer = NVRTC_VALID_CC[NVRTC_VALID_CC <= compute_capability]
    return max(not_newer, default="75")
Expand Down
17 changes: 17 additions & 0 deletions test/test_cuda_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,23 @@ def test_compile():
dev = nvcuda.CudaFunctions(0)
dev.compile(kernel_instance)


@skip_if_no_cuda
def test_set_sm_percentage():
    """Check that set_sm_percentage installs, reuses and validates SM partitions."""
    dev = nvcuda.CudaFunctions(0)
    default_stream = dev.stream

    test_value = 50
    dev.set_sm_percentage(test_value)

    # A new partition was created, cached and made active.
    assert dev.current_sm_percentage == test_value
    assert test_value in dev.green_ctx_cache
    assert dev.green_ctx is not None
    assert dev.stream != default_stream
    assert dev.assigned_sm_count > 0

    # Requesting the active percentage again must not create anything new.
    partition_stream = dev.stream
    dev.set_sm_percentage(test_value)
    assert dev.stream is partition_stream
    assert len(dev.green_ctx_cache) == 1

    # Values outside (0, 100] are rejected and leave the state untouched.
    for invalid in (0, -1, 101):
        try:
            dev.set_sm_percentage(invalid)
        except ValueError:
            pass
        else:
            raise AssertionError(f"sm_percentage={invalid} should raise ValueError")
    assert dev.current_sm_percentage == test_value
    assert dev.stream is partition_stream


@skip_if_no_cuda
def test_compile_template():

Expand Down
Loading