diff --git a/.github/workflows/build-wheel.yml b/.github/workflows/build-wheel.yml index e289f3da69..0d6ca87df1 100644 --- a/.github/workflows/build-wheel.yml +++ b/.github/workflows/build-wheel.yml @@ -456,6 +456,16 @@ jobs: path: ${{ env.CUDA_CORE_CYTHON_TESTS_DIR }}/test_*${{ env.PY_EXT_SUFFIX }} if-no-files-found: error + - name: Build cuda.core test binaries + run: bash ${{ env.CUDA_CORE_TEST_BINARIES_DIR }}/build_test_binaries.sh + + - name: Upload cuda.core test binaries + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: ${{ env.CUDA_CORE_ARTIFACT_NAME }}-test-binaries + path: ${{ env.CUDA_CORE_TEST_BINARIES_DIR }}/*.o + if-no-files-found: error + # Note: This overwrites CUDA_PATH etc - name: Set up mini CTK uses: ./.github/actions/fetch_ctk diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml index 4d64242ea0..3a6e2eb972 100644 --- a/.github/workflows/test-wheel-linux.yml +++ b/.github/workflows/test-wheel-linux.yml @@ -279,6 +279,19 @@ jobs: pwd ls -lahR $CUDA_CORE_CYTHON_TESTS_DIR + - name: Download cuda.core test binaries + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 + with: + name: ${{ env.CUDA_CORE_ARTIFACT_NAME }}-test-binaries + path: ${{ env.CUDA_CORE_TEST_BINARIES_DIR }} + run-id: ${{ inputs.run-id || github.run_id }} + github-token: ${{ secrets.GITHUB_TOKEN }} + + - name: Display structure of downloaded cuda.core test binaries + run: | + pwd + ls -lahR $CUDA_CORE_TEST_BINARIES_DIR + - name: Set up Python ${{ matrix.PY_VER }} uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 with: diff --git a/.github/workflows/test-wheel-windows.yml b/.github/workflows/test-wheel-windows.yml index 6ccacfff8f..6db50b89da 100644 --- a/.github/workflows/test-wheel-windows.yml +++ b/.github/workflows/test-wheel-windows.yml @@ -259,6 +259,19 @@ jobs: Get-Location Get-ChildItem -Recurse -Force $env:CUDA_CORE_CYTHON_TESTS_DIR | Select-Object Mode, LastWriteTime, Length, FullName + - name: Download cuda.core test binaries + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 + with: + name: ${{ env.CUDA_CORE_ARTIFACT_NAME }}-test-binaries + path: ${{ env.CUDA_CORE_TEST_BINARIES_DIR }} + run-id: ${{ inputs.run-id || github.run_id }} + github-token: ${{ secrets.GITHUB_TOKEN }} + + - name: Display structure of downloaded cuda.core test binaries + run: | + Get-Location + Get-ChildItem -Recurse -Force $env:CUDA_CORE_TEST_BINARIES_DIR | Select-Object Mode, LastWriteTime, Length, FullName + - name: Set up Python ${{ matrix.PY_VER }} uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 with: diff --git a/.gitignore b/.gitignore index c55480b755..d2a5bf6e52 100644 --- a/.gitignore +++ b/.gitignore @@ -19,6 +19,9 @@ cache_driver cache_runtime cache_nvrtc +# cuda.core test object fixtures built locally / downloaded as CI artifacts +cuda_core/tests/test_binaries/*.o + # CUDA Python specific (auto-generated) cuda_bindings/cuda/bindings/_bindings/cyruntime.pxd cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx diff --git a/ci/tools/env-vars b/ci/tools/env-vars index 30fac1cdce..8ffbfa1347 100755 --- a/ci/tools/env-vars +++ b/ci/tools/env-vars @@ -34,6 +34,7 @@ CUDA_CORE_ARTIFACT_BASENAME="cuda-core-python${PYTHON_VERSION_FORMATTED}-${HOST_ echo "CUDA_CORE_ARTIFACT_NAME=${CUDA_CORE_ARTIFACT_BASENAME}-${SHA}" echo "CUDA_CORE_ARTIFACTS_DIR=$(realpath "${REPO_DIR}/cuda_core/dist")" echo "CUDA_CORE_CYTHON_TESTS_DIR=$(realpath "${REPO_DIR}/cuda_core/tests/cython")" + echo "CUDA_CORE_TEST_BINARIES_DIR=$(realpath "${REPO_DIR}/cuda_core/tests/test_binaries")" echo "PYTHON_VERSION_FORMATTED=${PYTHON_VERSION_FORMATTED}" } >> $GITHUB_ENV diff --git a/cuda_core/tests/test_binaries/build_test_binaries.sh b/cuda_core/tests/test_binaries/build_test_binaries.sh new file mode 100755 index 0000000000..6e4de10b26 --- /dev/null +++ b/cuda_core/tests/test_binaries/build_test_binaries.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +set -euo pipefail + +# Build .o test fixtures. Invoked at CI build stage + +SCRIPTPATH=$(dirname "$(realpath "$0")") + +nvcc -dc -o "${SCRIPTPATH}/saxpy.o" "${SCRIPTPATH}/saxpy.cu" + +ls -lah "${SCRIPTPATH}/saxpy.o" diff --git a/cuda_core/tests/test_binaries/saxpy.cu b/cuda_core/tests/test_binaries/saxpy.cu new file mode 100644 index 0000000000..f0afc58637 --- /dev/null +++ b/cuda_core/tests/test_binaries/saxpy.cu @@ -0,0 +1,16 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include + +__device__ float saxpy_step(float a, float x, float y) { + return a * x + y; +} + +template +__global__ void saxpy(const T a, const T* x, const T* y, T* out, size_t N) { + const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x; + for (size_t i = tid; i < N; i += gridDim.x * blockDim.x) { + out[tid] = a * x[tid] + y[tid]; + } +} diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py index 3a438f825a..f82777a95f 100644 --- a/cuda_core/tests/test_module.py +++ b/cuda_core/tests/test_module.py @@ -8,7 +8,7 @@ import pytest import cuda.core -from cuda.core import Device, Kernel, ObjectCode, Program, ProgramOptions +from cuda.core import Device, Kernel, Linker, LinkerOptions, ObjectCode, Program, ProgramOptions from cuda.core._program import _can_load_generated_ptx from cuda.core._utils.cuda_utils import CUDAError, driver, handle_return from cuda.core._utils.version import binding_version, driver_version @@ -172,6 +172,36 @@ def get_saxpy_fatbin(init_cuda): return bytes(fatbin), sym_map +@pytest.fixture(scope="module") +def get_saxpy_object(): + """Read the pre-built saxpy.o. + + In CI: produced by build stage into a test wheel file. + In local dev: auto-built on demand if nvcc is available; if you edit + saxpy.cu, remove the stale saxpy.o to force a rebuild. + """ + import subprocess + from pathlib import Path + + from cuda.pathfinder import find_nvidia_binary_utility + + binaries_dir = Path(__file__).parent / "test_binaries" + obj_path = binaries_dir / "saxpy.o" + + if not obj_path.is_file(): + if find_nvidia_binary_utility("nvcc") is None: + pytest.skip( + f"saxpy.o not found at {obj_path} and nvcc is unavailable. " + "In CI this is downloaded from the build stage." + ) + subprocess.run( # noqa: S603 + ["bash", str(binaries_dir / "build_test_binaries.sh")], # noqa: S607 + check=True, + ) + + return obj_path.read_bytes() + + def test_get_kernel(init_cuda): kernel = """extern "C" __global__ void ABC() { }""" @@ -330,6 +360,67 @@ def test_object_code_load_fatbin_from_file(get_saxpy_fatbin, tmp_path, convert_p mod_obj.get_kernel("saxpy") # force loading +def test_object_code_load_object(get_saxpy_object): + obj = get_saxpy_object + assert isinstance(obj, bytes) + mod_obj = ObjectCode.from_object(obj) + assert mod_obj.code == obj + assert mod_obj.code_type == "object" + with pytest.raises(RuntimeError, match=r'Unsupported code type "object"'): + mod_obj.get_kernel("saxpy") + + +def test_object_code_load_object_from_file(get_saxpy_object, tmp_path): + obj_file = tmp_path / "test.o" + obj_file.write_bytes(get_saxpy_object) + arg = str(obj_file) + mod_obj = ObjectCode.from_object(arg) + assert mod_obj.code == arg + assert mod_obj.code_type == "object" + + +def test_object_code_load_object_with_linker(get_saxpy_object, init_cuda): + arch = "sm_" + "".join(f"{i}" for i in init_cuda.compute_capability) + kernel_ptx = Program( + r""" + extern __device__ float saxpy_step(float a, float x, float y); + extern "C" __global__ void linked_kernel(float a, float x, float y, float* out) { + if (threadIdx.x == 0 && blockIdx.x == 0) *out = saxpy_step(a, x, y); + } + """, + "c++", + ProgramOptions(relocatable_device_code=True, arch=arch), + ).compile("ptx") + linked = Linker( + kernel_ptx, + ObjectCode.from_object(get_saxpy_object), + options=LinkerOptions(arch=arch), + ).link("cubin") + kernel = linked.get_kernel("linked_kernel") + + import numpy as np + + stream = init_cuda.create_stream() + host_buf = cuda.core.LegacyPinnedMemoryResource().allocate(4) + result = np.from_dlpack(host_buf).view(np.float32) + result[:] = 0.0 + dev_buf = init_cuda.memory_resource.allocate(4, stream=init_cuda.default_stream) + + cuda.core.launch( + stream, + cuda.core.LaunchConfig(grid=1, block=1), + kernel, + np.float32(2.0), + np.float32(3.0), + np.float32(4.0), + dev_buf, + ) + dev_buf.copy_to(host_buf, stream=stream) + stream.sync() + + assert result[0] == 10.0 + + def test_saxpy_arguments(get_saxpy_kernel_cubin, cuda12_4_prerequisite_check): krn, _ = get_saxpy_kernel_cubin