NVIDIA · seberg · Jun 2, 2026 · Jun 2, 2026 · Jun 10, 2026 · Jun 10, 2026
diff --git a/ci/tools/run-tests b/ci/tools/run-tests
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0
 
@@ -20,6 +20,14 @@ fi
 
 test_module=${1}
 
+# Detect a free-threaded (GIL-disabled) interpreter.  The probe exits non-zero
+# when the GIL is enabled or when sys._is_gil_enabled() is unavailable; stderr
+# is discarded so that this expected failure stays out of the log.
+FREE_THREADING=""
+PYTEST_PARALLEL_ARGS=()
+if python -c 'import sys; assert not sys._is_gil_enabled()' 2> /dev/null; then
+  FREE_THREADING="-ft"
+  # Extra arguments expanded into the pathfinder, bindings and core pytest
+  # invocations below; the array stays empty on GIL-enabled builds.
+  PYTEST_PARALLEL_ARGS=(--parallel-threads=4)
+  # Installed only on free-threaded builds, where --parallel-threads is passed.
+  pip install pytest-run-parallel
+fi
+
 # For standard modes, install pathfinder up front (it is a direct dependency
 # of bindings, and a transitive dependency of core).  Nightly modes install
 # all wheels together in a single pip call further below.
@@ -36,7 +44,7 @@ if [[ "${test_module}" == "pathfinder" ]]; then
     "LD:${CUDA_PATHFINDER_TEST_LOAD_NVIDIA_DYNAMIC_LIB_STRICTNESS} " \
     "FH:${CUDA_PATHFINDER_TEST_FIND_NVIDIA_HEADERS_STRICTNESS} " \
     "BC:${CUDA_PATHFINDER_TEST_FIND_NVIDIA_BITCODE_LIB_STRICTNESS}"
-  pytest -ra -s -v --durations=0 tests/ |& tee /tmp/pathfinder_test_log.txt
+  pytest -ra -s -v --durations=0 "${PYTEST_PARALLEL_ARGS[@]}" tests/ |& tee /tmp/pathfinder_test_log.txt
   # Report the number of "INFO test_" lines (including zero)
   # to support quick validations based on GHA log archives.
   line_count=$(awk '/^INFO test_/ {count++} END {print count+0}' /tmp/pathfinder_test_log.txt)
@@ -51,21 +59,16 @@ elif [[ "${test_module}" == "bindings" ]]; then
     pip install $(ls "${CUDA_BINDINGS_ARTIFACTS_DIR}"/*.whl)[all] --group test
   fi
   echo "Running bindings tests"
-  ${SANITIZER_CMD} pytest -rxXs -v --durations=0 --randomly-dont-reorganize tests/
+  ${SANITIZER_CMD} pytest -rxXs -v --durations=0 --randomly-dont-reorganize "${PYTEST_PARALLEL_ARGS[@]}" tests/
   if [[ "${SKIP_CYTHON_TEST}" == 0 ]]; then
-    ${SANITIZER_CMD} pytest -rxXs -v --durations=0 --randomly-dont-reorganize tests/cython
+    ${SANITIZER_CMD} pytest -rxXs -v --durations=0 --randomly-dont-reorganize "${PYTEST_PARALLEL_ARGS[@]}" tests/cython
   fi
   popd
 elif [[ "${test_module}" == "core" || "${test_module}" == nightly-* ]]; then
   # Shared setup for core and nightly modes.
   TEST_CUDA_MAJOR="$(cut -d '.' -f 1 <<< ${CUDA_VER})"
   CUDA_VER_MINOR="$(cut -d '.' -f 1-2 <<< "${CUDA_VER}")"
 
-  FREE_THREADING=""
-  if python -c 'import sys; assert not sys._is_gil_enabled()' 2> /dev/null; then
-    FREE_THREADING+="-ft"
-  fi
-
   # Resolve bindings based on BINDINGS_SOURCE (set by env-vars):
   #   main/backport → local wheel from artifacts dir
   #   published     → install from PyPI by version
@@ -106,11 +109,11 @@ elif [[ "${test_module}" == "core" || "${test_module}" == nightly-* ]]; then
     echo "Installed packages before core tests:"
     pip list
     echo "Running core tests"
-    ${SANITIZER_CMD} pytest -rxXs -v --durations=0 --randomly-dont-reorganize tests/
+    ${SANITIZER_CMD} pytest -rxXs -v --durations=0 --randomly-dont-reorganize "${PYTEST_PARALLEL_ARGS[@]}" tests/
     # Currently our CI always installs the latest bindings (from either major version).
     # This is not compatible with the test requirements.
     if [[ "${SKIP_CYTHON_TEST}" == 0 ]]; then
-      ${SANITIZER_CMD} pytest -rxXs -v --durations=0 --randomly-dont-reorganize tests/cython
+      ${SANITIZER_CMD} pytest -rxXs -v --durations=0 --randomly-dont-reorganize "${PYTEST_PARALLEL_ARGS[@]}" tests/cython
     fi
   else
     # Nightly optional-dependency testing.

diff --git a/cuda_bindings/tests/conftest.py b/cuda_bindings/tests/conftest.py
@@ -1,8 +1,11 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
+import functools
+import inspect
 import pathlib
 import sys
+from contextlib import contextmanager
 from importlib.metadata import PackageNotFoundError, distribution
 
 import pytest
@@ -25,6 +28,84 @@
         sys.path.insert(0, test_helpers_root)
 
 
+def _parallel_threads_enabled(config):
+    parallel_threads = getattr(config.option, "parallel_threads", 0)
+    if parallel_threads == "auto":
+        return True
+    return parallel_threads is not None and int(parallel_threads) > 0
+
+
+def pytest_configure(config):
+    if _parallel_threads_enabled(config):
+        config.pluginmanager.register(_CudaBindingsParallelPlugin(), name="_cuda_bindings_parallel_plugin")
+
+
+@contextmanager
+def _thread_context():
+    # Context manager yielding a (device, ctx) pair for the calling thread.
+    # If a CUDA context is already current it yields (None, existing_ctx) and
+    # does not destroy that context on exit; otherwise it creates a context on
+    # device 0 and destroys it when the with-block exits.
+    #
+    # Defensive: if this worker thread already has an active context (e.g. from
+    # double-wrapping), reuse it rather than pushing another one.
+    # Note: fixtures never run on the test thread; this is purely a safety net.
+    err, existing = cuda.cuCtxGetCurrent()
+    if err == cuda.CUresult.CUDA_SUCCESS and existing and int(existing) != 0:
+        # device is None on this path because no device was looked up here.
+        yield None, existing
+        return
+
+    # cuInit(0) is idempotent; safe to call even if cuda_driver fixture already ran.
+    (err,) = cuda.cuInit(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    err, device = cuda.cuDeviceGet(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    err, ctx = cuda.cuCtxCreate(None, 0, device)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    try:
+        yield device, ctx
+    finally:
+        # Runs even when the with-block body raised, so the context created
+        # above is always destroyed.
+        (err,) = cuda.cuCtxDestroy(ctx)
+        assert err == cuda.CUresult.CUDA_SUCCESS
+
+
+def _wrap_worker_cuda_test(func):
+    if getattr(func, "_cuda_bindings_worker_cuda_wrapped", False):
+        return func
+
+    sig = inspect.signature(func)
+    wants_device = "device" in sig.parameters
+    wants_ctx = "ctx" in sig.parameters
+
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        with _thread_context() as (device, ctx):
+            # device is None when reusing an existing context (defensive path);
+            # keep whatever the fixture provided in kwargs as-is.
+            if wants_device and device is not None:
+                kwargs["device"] = device
+            if wants_ctx:
+                kwargs["ctx"] = ctx
+            return func(*args, **kwargs)
+
+    wrapper._cuda_bindings_worker_cuda_wrapped = True
+    return wrapper
+
+
+def _item_needs_thread_ctx(item):
+    fixturenames = getattr(item, "fixturenames", ())
+    # 'device' is present when the module-level ctx(device) autouse chain is
+    # active (test_cuda.py, test_kernelParams.py, nvml tests, …).
+    # 'driver' is present for test_cufile.py tests that use the local driver
+    # fixture; their local ctx() shadows the parent ctx(device) so 'device'
+    # does not appear in their fixture chain, but they still need a per-thread
+    # CUDA context for cuMemAlloc and similar calls made inside the test.
+    return "device" in fixturenames or "driver" in fixturenames
+
+
+class _CudaBindingsParallelPlugin:
+    @pytest.hookimpl(tryfirst=True)
+    def pytest_collection_modifyitems(self, config, items):
+        for item in items:
+            if _item_needs_thread_ctx(item):
+                item.obj = _wrap_worker_cuda_test(item.obj)
+
+
 @pytest.fixture(scope="module")
 def cuda_driver():
     (err,) = cuda.cuInit(0)

diff --git a/cuda_bindings/tests/nvml/test_init.py b/cuda_bindings/tests/nvml/test_init.py
@@ -42,6 +42,7 @@ def get_architecture_name(arch):
 
 
 @pytest.mark.skipif(sys.platform == "win32", reason="Test not supported on Windows")
+@pytest.mark.thread_unsafe(reason="nvml init affects other threads")
 def test_init_ref_count():
     """
     Verifies that we can call NVML shutdown and init(2) multiple times, and that ref counting works

diff --git a/cuda_bindings/tests/test_cuda.py b/cuda_bindings/tests/test_cuda.py
@@ -456,6 +456,7 @@ def test_cuda_mem_range_attr(device):
 
 
 @pytest.mark.skipif(driverVersionLessThan(11040) or not supportsMemoryPool(), reason="Mempool for graphs not supported")
+@pytest.mark.thread_unsafe(reason="used high memory can be higher if threaded.")
 def test_cuda_graphMem_attr(device):
     err, stream = cuda.cuStreamCreate(0)
     assert err == cuda.CUresult.CUDA_SUCCESS