From 3694e06171e9b2316396377103e31ba605eaef6e Mon Sep 17 00:00:00 2001
From: Keith Kraus <keith.j.kraus@gmail.com>
Date: Wed, 10 Jun 2026 23:08:46 -0400
Subject: [PATCH] Regenerate cuda-bindings docs anchors

---
 .../cuda/bindings/_bindings/cydriver.pxd.in   |   2 +-
 .../cuda/bindings/_bindings/cydriver.pyx.in   |   2 +-
 .../cuda/bindings/_bindings/cyruntime.pxd.in  |   2 +-
 .../cuda/bindings/_bindings/cyruntime.pyx.in  |   2 +-
 .../bindings/_bindings/cyruntime_ptds.pxd.in  |   2 +-
 .../bindings/_bindings/cyruntime_ptds.pyx.in  |   2 +-
 cuda_bindings/cuda/bindings/cydriver.pxd.in   |   2 +-
 cuda_bindings/cuda/bindings/cydriver.pyx.in   |   2 +-
 cuda_bindings/cuda/bindings/cyruntime.pxd.in  |   2 +-
 cuda_bindings/cuda/bindings/cyruntime.pyx.in  |   2 +-
 .../cuda/bindings/cyruntime_functions.pxi.in  |   2 +-
 .../cuda/bindings/cyruntime_types.pxi.in      |   2 +-
 cuda_bindings/cuda/bindings/driver.pxd.in     |  74 +--
 cuda_bindings/cuda/bindings/driver.pyx.in     | 260 ++++----
 cuda_bindings/cuda/bindings/runtime.pxd.in    |  79 +--
 cuda_bindings/cuda/bindings/runtime.pyx.in    | 595 +++++++++---------
 cuda_bindings/docs/source/module/driver.rst   | 314 ++++++---
 cuda_bindings/docs/source/module/runtime.rst  | 324 ++++++----
 18 files changed, 944 insertions(+), 726 deletions(-)

diff --git a/cuda_bindings/cuda/bindings/_bindings/cydriver.pxd.in b/cuda_bindings/cuda/bindings/_bindings/cydriver.pxd.in
index 85107bb0fe..e8115cf2a3 100644
--- a/cuda_bindings/cuda/bindings/_bindings/cydriver.pxd.in
+++ b/cuda_bindings/cuda/bindings/_bindings/cydriver.pxd.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1711+g875fec45. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1740+g3731b17a1. Do not modify it directly.
 from cuda.bindings.cydriver cimport *
 
 {{if 'cuGetErrorString' in found_functions}}
diff --git a/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in b/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in
index 6cd5fd689b..3d8bbb0724 100644
--- a/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in
+++ b/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1711+g875fec45. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1740+g3731b17a1. Do not modify it directly.
 {{if 'Windows' == platform.system()}}
 import os
 cimport cuda.bindings._lib.windll as windll
diff --git a/cuda_bindings/cuda/bindings/_bindings/cyruntime.pxd.in b/cuda_bindings/cuda/bindings/_bindings/cyruntime.pxd.in
index 178ba2022a..a0e0e8eb23 100644
--- a/cuda_bindings/cuda/bindings/_bindings/cyruntime.pxd.in
+++ b/cuda_bindings/cuda/bindings/_bindings/cyruntime.pxd.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1740+g3731b17a1. Do not modify it directly.
 include "../cyruntime_types.pxi"
 
 include "../_lib/cyruntime/cyruntime.pxd"
diff --git a/cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx.in b/cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx.in
index 0a7de77221..e508ed1443 100644
--- a/cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx.in
+++ b/cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1740+g3731b17a1. Do not modify it directly.
 include "../cyruntime_functions.pxi"
 
 import os
diff --git a/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pxd.in b/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pxd.in
index 08e14a023d..65b23e5e7f 100644
--- a/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pxd.in
+++ b/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pxd.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1740+g3731b17a1. Do not modify it directly.
 cdef extern from "":
     """
     #define CUDA_API_PER_THREAD_DEFAULT_STREAM
diff --git a/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pyx.in b/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pyx.in
index c771cf89de..798f53f8d7 100644
--- a/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pyx.in
+++ b/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pyx.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1740+g3731b17a1. Do not modify it directly.
 cdef extern from "":
     """
     #define CUDA_API_PER_THREAD_DEFAULT_STREAM
diff --git a/cuda_bindings/cuda/bindings/cydriver.pxd.in b/cuda_bindings/cuda/bindings/cydriver.pxd.in
index 416f428b7b..48b18aa48e 100644
--- a/cuda_bindings/cuda/bindings/cydriver.pxd.in
+++ b/cuda_bindings/cuda/bindings/cydriver.pxd.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1711+g875fec45. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1740+g3731b17a1. Do not modify it directly.
 
 from libc.stdint cimport uint32_t, uint64_t
 
diff --git a/cuda_bindings/cuda/bindings/cydriver.pyx.in b/cuda_bindings/cuda/bindings/cydriver.pyx.in
index aa552f17f6..86b3b60ae2 100644
--- a/cuda_bindings/cuda/bindings/cydriver.pyx.in
+++ b/cuda_bindings/cuda/bindings/cydriver.pyx.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1711+g875fec45. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1740+g3731b17a1. Do not modify it directly.
 cimport cuda.bindings._bindings.cydriver as cydriver
 
 {{if 'cuGetErrorString' in found_functions}}
diff --git a/cuda_bindings/cuda/bindings/cyruntime.pxd.in b/cuda_bindings/cuda/bindings/cyruntime.pxd.in
index 453011b2ba..8e7d85ddea 100644
--- a/cuda_bindings/cuda/bindings/cyruntime.pxd.in
+++ b/cuda_bindings/cuda/bindings/cyruntime.pxd.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1740+g3731b17a1. Do not modify it directly.
 
 from libc.stdint cimport uint32_t, uint64_t
 
diff --git a/cuda_bindings/cuda/bindings/cyruntime.pyx.in b/cuda_bindings/cuda/bindings/cyruntime.pyx.in
index 230b5d8f84..982f45c185 100644
--- a/cuda_bindings/cuda/bindings/cyruntime.pyx.in
+++ b/cuda_bindings/cuda/bindings/cyruntime.pyx.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1740+g3731b17a1. Do not modify it directly.
 cimport cuda.bindings._bindings.cyruntime as cyruntime
 cimport cython
 
diff --git a/cuda_bindings/cuda/bindings/cyruntime_functions.pxi.in b/cuda_bindings/cuda/bindings/cyruntime_functions.pxi.in
index 981b55fb29..05a5f27f59 100644
--- a/cuda_bindings/cuda/bindings/cyruntime_functions.pxi.in
+++ b/cuda_bindings/cuda/bindings/cyruntime_functions.pxi.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1740+g3731b17a1. Do not modify it directly.
 cdef extern from "cuda_runtime_api.h":
 
     {{if 'cudaDeviceReset' in found_functions}}
diff --git a/cuda_bindings/cuda/bindings/cyruntime_types.pxi.in b/cuda_bindings/cuda/bindings/cyruntime_types.pxi.in
index a7ad5839ac..e33ed606c9 100644
--- a/cuda_bindings/cuda/bindings/cyruntime_types.pxi.in
+++ b/cuda_bindings/cuda/bindings/cyruntime_types.pxi.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1740+g3731b17a1. Do not modify it directly.
 
 cdef extern from "vector_types.h":
 
diff --git a/cuda_bindings/cuda/bindings/driver.pxd.in b/cuda_bindings/cuda/bindings/driver.pxd.in
index 76997b5269..4999986973 100644
--- a/cuda_bindings/cuda/bindings/driver.pxd.in
+++ b/cuda_bindings/cuda/bindings/driver.pxd.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1711+g875fec45. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1740+g3731b17a1. Do not modify it directly.
 cimport cuda.bindings.cydriver as cydriver
 
 include "_lib/utils.pxd"
@@ -1968,8 +1968,8 @@ cdef class CUlaunchAttributeValue_union:
     {{if 'CUlaunchAttributeValue_union.programmaticEvent' in found_struct}}
     programmaticEvent : anon_struct2
         Value of launch attribute CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT
-        with the following fields: - `CUevent` event - Event to fire when
-        all blocks trigger it.    - `Event` record flags, see
+        with the following fields: - `CUevent` event - Event to fire when
+        all blocks trigger it.    - `Event` record flags, see
         cuEventRecordWithFlags. Does not accept :CU_EVENT_RECORD_EXTERNAL.
         - `triggerAtBlockStart` - If this is set to non-0, each block
         launch will automatically trigger the event.
@@ -1978,8 +1978,8 @@ cdef class CUlaunchAttributeValue_union:
     launchCompletionEvent : anon_struct3
         Value of launch attribute
         CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT with the following
-        fields: - `CUevent` event - Event to fire when the last block
-        launches    - `int` flags; - Event record flags, see
+        fields: - `CUevent` event - Event to fire when the last block
+        launches    - `int` flags; - Event record flags, see
         cuEventRecordWithFlags. Does not accept CU_EVENT_RECORD_EXTERNAL.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.priority' in found_struct}}
@@ -2018,8 +2018,8 @@ cdef class CUlaunchAttributeValue_union:
         CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE. with the
         following fields: - `int` deviceUpdatable - Whether or not the
         resulting kernel node should be device-updatable.    -
-        `CUgraphDeviceNode` devNode - Returns a handle to pass to the
-        various device-side update functions.
+        `CUgraphDeviceNode` devNode - Returns a handle to pass to the
+        various device-side update functions.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.sharedMemCarveout' in found_struct}}
     sharedMemCarveout : unsigned int
@@ -7083,8 +7083,8 @@ cdef class CUlaunchAttributeValue(CUlaunchAttributeValue_union):
     {{if 'CUlaunchAttributeValue_union.programmaticEvent' in found_struct}}
     programmaticEvent : anon_struct2
         Value of launch attribute CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT
-        with the following fields: - `CUevent` event - Event to fire when
-        all blocks trigger it.    - `Event` record flags, see
+        with the following fields: - `CUevent` event - Event to fire when
+        all blocks trigger it.    - `Event` record flags, see
         cuEventRecordWithFlags. Does not accept :CU_EVENT_RECORD_EXTERNAL.
         - `triggerAtBlockStart` - If this is set to non-0, each block
         launch will automatically trigger the event.
@@ -7093,8 +7093,8 @@ cdef class CUlaunchAttributeValue(CUlaunchAttributeValue_union):
     launchCompletionEvent : anon_struct3
         Value of launch attribute
         CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT with the following
-        fields: - `CUevent` event - Event to fire when the last block
-        launches    - `int` flags; - Event record flags, see
+        fields: - `CUevent` event - Event to fire when the last block
+        launches    - `int` flags; - Event record flags, see
         cuEventRecordWithFlags. Does not accept CU_EVENT_RECORD_EXTERNAL.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.priority' in found_struct}}
@@ -7133,8 +7133,8 @@ cdef class CUlaunchAttributeValue(CUlaunchAttributeValue_union):
         CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE. with the
         following fields: - `int` deviceUpdatable - Whether or not the
         resulting kernel node should be device-updatable.    -
-        `CUgraphDeviceNode` devNode - Returns a handle to pass to the
-        various device-side update functions.
+        `CUgraphDeviceNode` devNode - Returns a handle to pass to the
+        various device-side update functions.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.sharedMemCarveout' in found_struct}}
     sharedMemCarveout : unsigned int
@@ -7294,8 +7294,8 @@ cdef class CUkernelNodeAttrValue_v1(CUlaunchAttributeValue):
     {{if 'CUlaunchAttributeValue_union.programmaticEvent' in found_struct}}
     programmaticEvent : anon_struct2
         Value of launch attribute CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT
-        with the following fields: - `CUevent` event - Event to fire when
-        all blocks trigger it.    - `Event` record flags, see
+        with the following fields: - `CUevent` event - Event to fire when
+        all blocks trigger it.    - `Event` record flags, see
         cuEventRecordWithFlags. Does not accept :CU_EVENT_RECORD_EXTERNAL.
         - `triggerAtBlockStart` - If this is set to non-0, each block
         launch will automatically trigger the event.
@@ -7304,8 +7304,8 @@ cdef class CUkernelNodeAttrValue_v1(CUlaunchAttributeValue):
     launchCompletionEvent : anon_struct3
         Value of launch attribute
         CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT with the following
-        fields: - `CUevent` event - Event to fire when the last block
-        launches    - `int` flags; - Event record flags, see
+        fields: - `CUevent` event - Event to fire when the last block
+        launches    - `int` flags; - Event record flags, see
         cuEventRecordWithFlags. Does not accept CU_EVENT_RECORD_EXTERNAL.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.priority' in found_struct}}
@@ -7344,8 +7344,8 @@ cdef class CUkernelNodeAttrValue_v1(CUlaunchAttributeValue):
         CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE. with the
         following fields: - `int` deviceUpdatable - Whether or not the
         resulting kernel node should be device-updatable.    -
-        `CUgraphDeviceNode` devNode - Returns a handle to pass to the
-        various device-side update functions.
+        `CUgraphDeviceNode` devNode - Returns a handle to pass to the
+        various device-side update functions.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.sharedMemCarveout' in found_struct}}
     sharedMemCarveout : unsigned int
@@ -7425,8 +7425,8 @@ cdef class CUkernelNodeAttrValue(CUkernelNodeAttrValue_v1):
     {{if 'CUlaunchAttributeValue_union.programmaticEvent' in found_struct}}
     programmaticEvent : anon_struct2
         Value of launch attribute CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT
-        with the following fields: - `CUevent` event - Event to fire when
-        all blocks trigger it.    - `Event` record flags, see
+        with the following fields: - `CUevent` event - Event to fire when
+        all blocks trigger it.    - `Event` record flags, see
         cuEventRecordWithFlags. Does not accept :CU_EVENT_RECORD_EXTERNAL.
         - `triggerAtBlockStart` - If this is set to non-0, each block
         launch will automatically trigger the event.
@@ -7435,8 +7435,8 @@ cdef class CUkernelNodeAttrValue(CUkernelNodeAttrValue_v1):
     launchCompletionEvent : anon_struct3
         Value of launch attribute
         CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT with the following
-        fields: - `CUevent` event - Event to fire when the last block
-        launches    - `int` flags; - Event record flags, see
+        fields: - `CUevent` event - Event to fire when the last block
+        launches    - `int` flags; - Event record flags, see
         cuEventRecordWithFlags. Does not accept CU_EVENT_RECORD_EXTERNAL.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.priority' in found_struct}}
@@ -7475,8 +7475,8 @@ cdef class CUkernelNodeAttrValue(CUkernelNodeAttrValue_v1):
         CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE. with the
         following fields: - `int` deviceUpdatable - Whether or not the
         resulting kernel node should be device-updatable.    -
-        `CUgraphDeviceNode` devNode - Returns a handle to pass to the
-        various device-side update functions.
+        `CUgraphDeviceNode` devNode - Returns a handle to pass to the
+        various device-side update functions.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.sharedMemCarveout' in found_struct}}
     sharedMemCarveout : unsigned int
@@ -7556,8 +7556,8 @@ cdef class CUstreamAttrValue_v1(CUlaunchAttributeValue):
     {{if 'CUlaunchAttributeValue_union.programmaticEvent' in found_struct}}
     programmaticEvent : anon_struct2
         Value of launch attribute CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT
-        with the following fields: - `CUevent` event - Event to fire when
-        all blocks trigger it.    - `Event` record flags, see
+        with the following fields: - `CUevent` event - Event to fire when
+        all blocks trigger it.    - `Event` record flags, see
         cuEventRecordWithFlags. Does not accept :CU_EVENT_RECORD_EXTERNAL.
         - `triggerAtBlockStart` - If this is set to non-0, each block
         launch will automatically trigger the event.
@@ -7566,8 +7566,8 @@ cdef class CUstreamAttrValue_v1(CUlaunchAttributeValue):
     launchCompletionEvent : anon_struct3
         Value of launch attribute
         CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT with the following
-        fields: - `CUevent` event - Event to fire when the last block
-        launches    - `int` flags; - Event record flags, see
+        fields: - `CUevent` event - Event to fire when the last block
+        launches    - `int` flags; - Event record flags, see
         cuEventRecordWithFlags. Does not accept CU_EVENT_RECORD_EXTERNAL.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.priority' in found_struct}}
@@ -7606,8 +7606,8 @@ cdef class CUstreamAttrValue_v1(CUlaunchAttributeValue):
         CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE. with the
         following fields: - `int` deviceUpdatable - Whether or not the
         resulting kernel node should be device-updatable.    -
-        `CUgraphDeviceNode` devNode - Returns a handle to pass to the
-        various device-side update functions.
+        `CUgraphDeviceNode` devNode - Returns a handle to pass to the
+        various device-side update functions.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.sharedMemCarveout' in found_struct}}
     sharedMemCarveout : unsigned int
@@ -7687,8 +7687,8 @@ cdef class CUstreamAttrValue(CUstreamAttrValue_v1):
     {{if 'CUlaunchAttributeValue_union.programmaticEvent' in found_struct}}
     programmaticEvent : anon_struct2
         Value of launch attribute CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT
-        with the following fields: - `CUevent` event - Event to fire when
-        all blocks trigger it.    - `Event` record flags, see
+        with the following fields: - `CUevent` event - Event to fire when
+        all blocks trigger it.    - `Event` record flags, see
         cuEventRecordWithFlags. Does not accept :CU_EVENT_RECORD_EXTERNAL.
         - `triggerAtBlockStart` - If this is set to non-0, each block
         launch will automatically trigger the event.
@@ -7697,8 +7697,8 @@ cdef class CUstreamAttrValue(CUstreamAttrValue_v1):
     launchCompletionEvent : anon_struct3
         Value of launch attribute
         CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT with the following
-        fields: - `CUevent` event - Event to fire when the last block
-        launches    - `int` flags; - Event record flags, see
+        fields: - `CUevent` event - Event to fire when the last block
+        launches    - `int` flags; - Event record flags, see
         cuEventRecordWithFlags. Does not accept CU_EVENT_RECORD_EXTERNAL.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.priority' in found_struct}}
@@ -7737,8 +7737,8 @@ cdef class CUstreamAttrValue(CUstreamAttrValue_v1):
         CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE. with the
         following fields: - `int` deviceUpdatable - Whether or not the
         resulting kernel node should be device-updatable.    -
-        `CUgraphDeviceNode` devNode - Returns a handle to pass to the
-        various device-side update functions.
+        `CUgraphDeviceNode` devNode - Returns a handle to pass to the
+        various device-side update functions.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.sharedMemCarveout' in found_struct}}
     sharedMemCarveout : unsigned int
diff --git a/cuda_bindings/cuda/bindings/driver.pyx.in b/cuda_bindings/cuda/bindings/driver.pyx.in
index 206c2557fc..0c1ce25f71 100644
--- a/cuda_bindings/cuda/bindings/driver.pyx.in
+++ b/cuda_bindings/cuda/bindings/driver.pyx.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1711+g875fec45. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1740+g3731b17a1. Do not modify it directly.
 from typing import Any, Optional
 import cython
 import ctypes
@@ -52,16 +52,16 @@ CU_IPC_HANDLE_SIZE = cydriver.CU_IPC_HANDLE_SIZE
 
 #: Legacy stream handle
 #:
-#: Stream handle that can be passed as a CUstream to use an implicit stream
-#: with legacy synchronization behavior.
+#: Stream handle that can be passed as a :py:obj:`~.CUstream` to use an
+#: implicit stream with legacy synchronization behavior.
 #:
 #: See details of the \link_sync_behavior
 CU_STREAM_LEGACY = cydriver.CU_STREAM_LEGACY
 
 #: Per-thread stream handle
 #:
-#: Stream handle that can be passed as a CUstream to use an implicit stream
-#: with per-thread synchronization behavior.
+#: Stream handle that can be passed as a :py:obj:`~.CUstream` to use an
+#: implicit stream with per-thread synchronization behavior.
 #:
 #: See details of the \link_sync_behavior
 CU_STREAM_PER_THREAD = cydriver.CU_STREAM_PER_THREAD
@@ -229,8 +229,8 @@ CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC = cydriver.CUDA_COOPERA
 
 #: If set, the CUDA array is a collection of layers, where each layer is
 #: either a 1D or a 2D array and the Depth member of
-#: CUDA_ARRAY3D_DESCRIPTOR specifies the number of layers, not the depth of
-#: a 3D array.
+#: :py:obj:`~.CUDA_ARRAY3D_DESCRIPTOR` specifies the number of layers, not
+#: the depth of a 3D array.
 CUDA_ARRAY3D_LAYERED = cydriver.CUDA_ARRAY3D_LAYERED
 
 #: Deprecated, use CUDA_ARRAY3D_LAYERED
@@ -421,7 +421,8 @@ class CUctx_flags(_FastEnum):
 
     CU_CTX_BLOCKING_SYNC = (
         cydriver.CUctx_flags_enum.CU_CTX_BLOCKING_SYNC,
-        'Set blocking synchronization as default scheduling [Deprecated]\n'
+        'Set blocking synchronization as default scheduling\n'
+        '[Deprecated]\n'
     ){{endif}}
     {{if 'CU_CTX_SCHED_MASK' in found_values}}
     CU_CTX_SCHED_MASK = cydriver.CUctx_flags_enum.CU_CTX_SCHED_MASK{{endif}}
@@ -2746,7 +2747,7 @@ class CUfunction_attribute(_FastEnum):
     CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE = (
         cydriver.CUfunction_attribute_enum.CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE,
         'The block scheduling policy of a function. The value type is\n'
-        'CUclusterSchedulingPolicy / cudaClusterSchedulingPolicy. See\n'
+        ':py:obj:`~.CUclusterSchedulingPolicy` / cudaClusterSchedulingPolicy. See\n'
         ':py:obj:`~.cuFuncSetAttribute`, :py:obj:`~.cuKernelSetAttribute`\n'
     ){{endif}}
     {{if 'CU_FUNC_ATTRIBUTE_DEVICE_NODE_UPDATE_SUPPORTED' in found_values}}
@@ -4793,7 +4794,8 @@ class CUresult(_FastEnum):
     CUDA_ERROR_CONTEXT_ALREADY_CURRENT = (
         cydriver.cudaError_enum.CUDA_ERROR_CONTEXT_ALREADY_CURRENT,
         'This indicated that the context being supplied as a parameter to the API\n'
-        'call was already the active context. [Deprecated]\n'
+        'call was already the active context.\n'
+        '[Deprecated]\n'
     ){{endif}}
     {{if 'CUDA_ERROR_MAP_FAILED' in found_values}}
 
@@ -6028,7 +6030,8 @@ class CUmemAllocationHandleType(_FastEnum):
 
     CU_MEM_HANDLE_TYPE_FABRIC = (
         cydriver.CUmemAllocationHandleType_enum.CU_MEM_HANDLE_TYPE_FABRIC,
-        'Allows a fabric handle to be used for exporting. (CUmemFabricHandle)\n'
+        'Allows a fabric handle to be used for exporting.\n'
+        '(:py:obj:`~.CUmemFabricHandle`)\n'
     ){{endif}}
     {{if 'CU_MEM_HANDLE_TYPE_MAX' in found_values}}
     CU_MEM_HANDLE_TYPE_MAX = cydriver.CUmemAllocationHandleType_enum.CU_MEM_HANDLE_TYPE_MAX{{endif}}
@@ -6360,54 +6363,55 @@ class CUmemPool_attribute(_FastEnum):
 
     CU_MEMPOOL_ATTR_RELEASE_THRESHOLD = (
         cydriver.CUmemPool_attribute_enum.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
-        '(value type = cuuint64_t) Amount of reserved memory in bytes to hold onto\n'
-        'before trying to release memory back to the OS. When more than the release\n'
-        'threshold bytes of memory are held by the memory pool, the allocator will\n'
-        'try to release memory back to the OS on the next call to stream, event or\n'
-        'context synchronize. (default 0)\n'
+        '(value type = :py:obj:`~.cuuint64_t`) Amount of reserved memory in bytes to\n'
+        'hold onto before trying to release memory back to the OS. When more than\n'
+        'the release threshold bytes of memory are held by the memory pool, the\n'
+        'allocator will try to release memory back to the OS on the next call to\n'
+        'stream, event or context synchronize. (default 0)\n'
     ){{endif}}
     {{if 'CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT' in found_values}}
 
     CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT = (
         cydriver.CUmemPool_attribute_enum.CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT,
-        '(value type = cuuint64_t) Amount of backing memory currently allocated for\n'
-        'the mempool.\n'
+        '(value type = :py:obj:`~.cuuint64_t`) Amount of backing memory currently\n'
+        'allocated for the mempool.\n'
     ){{endif}}
     {{if 'CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH' in found_values}}
 
     CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH = (
         cydriver.CUmemPool_attribute_enum.CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH,
-        '(value type = cuuint64_t) High watermark of backing memory allocated for\n'
-        'the mempool since the last time it was reset. High watermark can only be\n'
-        'reset to zero.\n'
+        '(value type = :py:obj:`~.cuuint64_t`) High watermark of backing memory\n'
+        'allocated for the mempool since the last time it was reset. High watermark\n'
+        'can only be reset to zero.\n'
     ){{endif}}
     {{if 'CU_MEMPOOL_ATTR_USED_MEM_CURRENT' in found_values}}
 
     CU_MEMPOOL_ATTR_USED_MEM_CURRENT = (
         cydriver.CUmemPool_attribute_enum.CU_MEMPOOL_ATTR_USED_MEM_CURRENT,
-        '(value type = cuuint64_t) Amount of memory from the pool that is currently\n'
-        'in use by the application.\n'
+        '(value type = :py:obj:`~.cuuint64_t`) Amount of memory from the pool that\n'
+        'is currently in use by the application.\n'
     ){{endif}}
     {{if 'CU_MEMPOOL_ATTR_USED_MEM_HIGH' in found_values}}
 
     CU_MEMPOOL_ATTR_USED_MEM_HIGH = (
         cydriver.CUmemPool_attribute_enum.CU_MEMPOOL_ATTR_USED_MEM_HIGH,
-        '(value type = cuuint64_t) High watermark of the amount of memory from the\n'
-        'pool that was in use by the application since the last time it was reset.\n'
-        'High watermark can only be reset to zero.\n'
+        '(value type = :py:obj:`~.cuuint64_t`) High watermark of the amount of\n'
+        'memory from the pool that was in use by the application since the last time\n'
+        'it was reset. High watermark can only be reset to zero.\n'
     ){{endif}}
     {{if 'CU_MEMPOOL_ATTR_ALLOCATION_TYPE' in found_values}}
 
     CU_MEMPOOL_ATTR_ALLOCATION_TYPE = (
         cydriver.CUmemPool_attribute_enum.CU_MEMPOOL_ATTR_ALLOCATION_TYPE,
-        '(value type = CUmemAllocationType) The allocation type of the mempool\n'
+        '(value type = :py:obj:`~.CUmemAllocationType`) The allocation type of the\n'
+        'mempool\n'
     ){{endif}}
     {{if 'CU_MEMPOOL_ATTR_EXPORT_HANDLE_TYPES' in found_values}}
 
     CU_MEMPOOL_ATTR_EXPORT_HANDLE_TYPES = (
         cydriver.CUmemPool_attribute_enum.CU_MEMPOOL_ATTR_EXPORT_HANDLE_TYPES,
-        '(value type = CUmemAllocationHandleType) Available export handle types for\n'
-        'the mempool. For imported pools this value is always\n'
+        '(value type = :py:obj:`~.CUmemAllocationHandleType`) Available export\n'
+        'handle types for the mempool. For imported pools this value is always\n'
         'CU_MEM_HANDLE_TYPE_NONE as an imported pool cannot be re-exported\n'
     ){{endif}}
     {{if 'CU_MEMPOOL_ATTR_LOCATION_ID' in found_values}}
@@ -6422,18 +6426,18 @@ class CUmemPool_attribute(_FastEnum):
 
     CU_MEMPOOL_ATTR_LOCATION_TYPE = (
         cydriver.CUmemPool_attribute_enum.CU_MEMPOOL_ATTR_LOCATION_TYPE,
-        '(value type = CUmemLocationType) The location type for the mempool. For\n'
-        'imported memory pools where the device is not directly visible to the\n'
-        'importing process or pools imported via fabric handles across nodes this\n'
-        'will be CU_MEM_LOCATION_TYPE_INVISIBLE.\n'
+        '(value type = :py:obj:`~.CUmemLocationType`) The location type for the\n'
+        'mempool. For imported memory pools where the device is not directly visible\n'
+        'to the importing process or pools imported via fabric handles across nodes\n'
+        'this will be CU_MEM_LOCATION_TYPE_INVISIBLE.\n'
     ){{endif}}
     {{if 'CU_MEMPOOL_ATTR_MAX_POOL_SIZE' in found_values}}
 
     CU_MEMPOOL_ATTR_MAX_POOL_SIZE = (
         cydriver.CUmemPool_attribute_enum.CU_MEMPOOL_ATTR_MAX_POOL_SIZE,
-        '(value type = cuuint64_t) Maximum size of the pool in bytes, this value may\n'
-        'be higher than what was initially passed to cuMemPoolCreate due to\n'
-        'alignment requirements. A value of 0 indicates no maximum size. For\n'
+        '(value type = :py:obj:`~.cuuint64_t`) Maximum size of the pool in bytes,\n'
+        'this value may be higher than what was initially passed to cuMemPoolCreate\n'
+        'due to alignment requirements. A value of 0 indicates no maximum size. For\n'
         'CU_MEM_ALLOCATION_TYPE_MANAGED and IPC imported pools this value will be\n'
         'system dependent.\n'
     ){{endif}}
@@ -6529,7 +6533,7 @@ class CUmemcpy3DOperandType(_FastEnum):
 
     CU_MEMCPY_OPERAND_TYPE_ARRAY = (
         cydriver.CUmemcpy3DOperandType_enum.CU_MEMCPY_OPERAND_TYPE_ARRAY,
-        'Memcpy operand is a CUarray.\n'
+        'Memcpy operand is a :py:obj:`~.CUarray`.\n'
     ){{endif}}
     {{if 'CU_MEMCPY_OPERAND_TYPE_MAX' in found_values}}
     CU_MEMCPY_OPERAND_TYPE_MAX = cydriver.CUmemcpy3DOperandType_enum.CU_MEMCPY_OPERAND_TYPE_MAX{{endif}}
@@ -6545,30 +6549,30 @@ class CUgraphMem_attribute(_FastEnum):
 
     CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT = (
         cydriver.CUgraphMem_attribute_enum.CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT,
-        '(value type = cuuint64_t) Amount of memory, in bytes, currently associated\n'
-        'with graphs\n'
+        '(value type = :py:obj:`~.cuuint64_t`) Amount of memory, in bytes, currently\n'
+        'associated with graphs\n'
     ){{endif}}
     {{if 'CU_GRAPH_MEM_ATTR_USED_MEM_HIGH' in found_values}}
 
     CU_GRAPH_MEM_ATTR_USED_MEM_HIGH = (
         cydriver.CUgraphMem_attribute_enum.CU_GRAPH_MEM_ATTR_USED_MEM_HIGH,
-        '(value type = cuuint64_t) High watermark of memory, in bytes, associated\n'
-        'with graphs since the last time it was reset. High watermark can only be\n'
-        'reset to zero.\n'
+        '(value type = :py:obj:`~.cuuint64_t`) High watermark of memory, in bytes,\n'
+        'associated with graphs since the last time it was reset. High watermark can\n'
+        'only be reset to zero.\n'
     ){{endif}}
     {{if 'CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT' in found_values}}
 
     CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT = (
         cydriver.CUgraphMem_attribute_enum.CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT,
-        '(value type = cuuint64_t) Amount of memory, in bytes, currently allocated\n'
-        'for use by the CUDA graphs asynchronous allocator.\n'
+        '(value type = :py:obj:`~.cuuint64_t`) Amount of memory, in bytes, currently\n'
+        'allocated for use by the CUDA graphs asynchronous allocator.\n'
     ){{endif}}
     {{if 'CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH' in found_values}}
 
     CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH = (
         cydriver.CUgraphMem_attribute_enum.CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH,
-        '(value type = cuuint64_t) High watermark of memory, in bytes, currently\n'
-        'allocated for use by the CUDA graphs asynchronous allocator.\n'
+        '(value type = :py:obj:`~.cuuint64_t`) High watermark of memory, in bytes,\n'
+        'currently allocated for use by the CUDA graphs asynchronous allocator.\n'
     ){{endif}}
 
 {{endif}}
@@ -6718,49 +6722,49 @@ class CUgraphDebugDot_flags(_FastEnum):
 
     CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_PARAMS = (
         cydriver.CUgraphDebugDot_flags_enum.CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_PARAMS,
-        'Adds CUDA_KERNEL_NODE_PARAMS values to output\n'
+        'Adds :py:obj:`~.CUDA_KERNEL_NODE_PARAMS` values to output\n'
     ){{endif}}
     {{if 'CU_GRAPH_DEBUG_DOT_FLAGS_MEMCPY_NODE_PARAMS' in found_values}}
 
     CU_GRAPH_DEBUG_DOT_FLAGS_MEMCPY_NODE_PARAMS = (
         cydriver.CUgraphDebugDot_flags_enum.CU_GRAPH_DEBUG_DOT_FLAGS_MEMCPY_NODE_PARAMS,
-        'Adds CUDA_MEMCPY3D values to output\n'
+        'Adds :py:obj:`~.CUDA_MEMCPY3D` values to output\n'
     ){{endif}}
     {{if 'CU_GRAPH_DEBUG_DOT_FLAGS_MEMSET_NODE_PARAMS' in found_values}}
 
     CU_GRAPH_DEBUG_DOT_FLAGS_MEMSET_NODE_PARAMS = (
         cydriver.CUgraphDebugDot_flags_enum.CU_GRAPH_DEBUG_DOT_FLAGS_MEMSET_NODE_PARAMS,
-        'Adds CUDA_MEMSET_NODE_PARAMS values to output\n'
+        'Adds :py:obj:`~.CUDA_MEMSET_NODE_PARAMS` values to output\n'
     ){{endif}}
     {{if 'CU_GRAPH_DEBUG_DOT_FLAGS_HOST_NODE_PARAMS' in found_values}}
 
     CU_GRAPH_DEBUG_DOT_FLAGS_HOST_NODE_PARAMS = (
         cydriver.CUgraphDebugDot_flags_enum.CU_GRAPH_DEBUG_DOT_FLAGS_HOST_NODE_PARAMS,
-        'Adds CUDA_HOST_NODE_PARAMS values to output\n'
+        'Adds :py:obj:`~.CUDA_HOST_NODE_PARAMS` values to output\n'
     ){{endif}}
     {{if 'CU_GRAPH_DEBUG_DOT_FLAGS_EVENT_NODE_PARAMS' in found_values}}
 
     CU_GRAPH_DEBUG_DOT_FLAGS_EVENT_NODE_PARAMS = (
         cydriver.CUgraphDebugDot_flags_enum.CU_GRAPH_DEBUG_DOT_FLAGS_EVENT_NODE_PARAMS,
-        'Adds CUevent handle from record and wait nodes to output\n'
+        'Adds :py:obj:`~.CUevent` handle from record and wait nodes to output\n'
     ){{endif}}
     {{if 'CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_SIGNAL_NODE_PARAMS' in found_values}}
 
     CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_SIGNAL_NODE_PARAMS = (
         cydriver.CUgraphDebugDot_flags_enum.CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_SIGNAL_NODE_PARAMS,
-        'Adds CUDA_EXT_SEM_SIGNAL_NODE_PARAMS values to output\n'
+        'Adds :py:obj:`~.CUDA_EXT_SEM_SIGNAL_NODE_PARAMS` values to output\n'
     ){{endif}}
     {{if 'CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_WAIT_NODE_PARAMS' in found_values}}
 
     CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_WAIT_NODE_PARAMS = (
         cydriver.CUgraphDebugDot_flags_enum.CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_WAIT_NODE_PARAMS,
-        'Adds CUDA_EXT_SEM_WAIT_NODE_PARAMS values to output\n'
+        'Adds :py:obj:`~.CUDA_EXT_SEM_WAIT_NODE_PARAMS` values to output\n'
     ){{endif}}
     {{if 'CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_ATTRIBUTES' in found_values}}
 
     CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_ATTRIBUTES = (
         cydriver.CUgraphDebugDot_flags_enum.CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_ATTRIBUTES,
-        'Adds CUkernelNodeAttrValue values to output\n'
+        'Adds :py:obj:`~.CUkernelNodeAttrValue` values to output\n'
     ){{endif}}
     {{if 'CU_GRAPH_DEBUG_DOT_FLAGS_HANDLES' in found_values}}
 
@@ -14307,8 +14311,8 @@ cdef class CUlaunchAttributeValue_union:
     {{if 'CUlaunchAttributeValue_union.programmaticEvent' in found_struct}}
     programmaticEvent : anon_struct2
         Value of launch attribute CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT
-        with the following fields: - `CUevent` event - Event to fire when
-        all blocks trigger it.    - `Event` record flags, see
+        with the following fields: - `CUevent` event - Event to fire when
+        all blocks trigger it.    - `Event` record flags, see
         cuEventRecordWithFlags. Does not accept :CU_EVENT_RECORD_EXTERNAL.
         - `triggerAtBlockStart` - If this is set to non-0, each block
         launch will automatically trigger the event.
@@ -14317,8 +14321,8 @@ cdef class CUlaunchAttributeValue_union:
     launchCompletionEvent : anon_struct3
         Value of launch attribute
         CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT with the following
-        fields: - `CUevent` event - Event to fire when the last block
-        launches    - `int` flags; - Event record flags, see
+        fields: - `CUevent` event - Event to fire when the last block
+        launches    - `int` flags; - Event record flags, see
         cuEventRecordWithFlags. Does not accept CU_EVENT_RECORD_EXTERNAL.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.priority' in found_struct}}
@@ -14357,8 +14361,8 @@ cdef class CUlaunchAttributeValue_union:
         CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE. with the
         following fields: - `int` deviceUpdatable - Whether or not the
         resulting kernel node should be device-updatable.    -
-        `CUgraphDeviceNode` devNode - Returns a handle to pass to the
-        various device-side update functions.
+        `CUgraphDeviceNode` devNode - Returns a handle to pass to the
+        various device-side update functions.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.sharedMemCarveout' in found_struct}}
     sharedMemCarveout : unsigned int
@@ -29765,17 +29769,17 @@ def cuModuleGetGlobal(hmod, char* name):
 def cuLinkCreate(unsigned int numOptions, options : Optional[tuple[CUjit_option] | list[CUjit_option]], optionValues : Optional[tuple[Any] | list[Any]]):
     """ Creates a pending JIT linker invocation.
 
-    If the call is successful, the caller owns the returned CUlinkState,
-    which should eventually be destroyed with :py:obj:`~.cuLinkDestroy`.
-    The device code machine size (32 or 64 bit) will match the calling
-    application.
+    If the call is successful, the caller owns the returned
+    :py:obj:`~.CUlinkState`, which should eventually be destroyed with
+    :py:obj:`~.cuLinkDestroy`. The device code machine size (32 or 64 bit)
+    will match the calling application.
 
     Both linker and compiler options may be specified. Compiler options
     will be applied to inputs to this linker action which must be compiled
     from PTX. The options :py:obj:`~.CU_JIT_WALL_TIME`,
     :py:obj:`~.CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES`, and
     :py:obj:`~.CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES` will accumulate data
-    until the CUlinkState is destroyed.
+    until the :py:obj:`~.CUlinkState` is destroyed.
 
     The data passed in via :py:obj:`~.cuLinkAddData` and
     :py:obj:`~.cuLinkAddFile` will be treated as relocatable (-rdc=true to
@@ -29783,9 +29787,9 @@ def cuLinkCreate(unsigned int numOptions, options : Optional[tuple[CUjit_option]
     and will have similar consequences as offline relocatable device code
     linking.
 
-    `optionValues` must remain valid for the life of the CUlinkState if
-    output options are used. No other references to inputs are maintained
-    after this call returns.
+    `optionValues` must remain valid for the life of the
+    :py:obj:`~.CUlinkState` if output options are used. No other references
+    to inputs are maintained after this call returns.
 
     Parameters
     ----------
@@ -29801,8 +29805,8 @@ def cuLinkCreate(unsigned int numOptions, options : Optional[tuple[CUjit_option]
     CUresult
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`, :py:obj:`~.CUDA_ERROR_JIT_COMPILER_NOT_FOUND`
     stateOut : :py:obj:`~.CUlinkState`
-        On success, this will contain a CUlinkState to specify and complete
-        this action
+        On success, this will contain a :py:obj:`~.CUlinkState` to specify
+        and complete this action
 
     See Also
     --------
@@ -30875,7 +30879,7 @@ def cuKernelGetAttribute(attrib not None : CUfunction_attribute, kernel, dev):
 
     - :py:obj:`~.CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE`:
       The block scheduling policy of a function. The value type is
-      CUclusterSchedulingPolicy.
+      :py:obj:`~.CUclusterSchedulingPolicy`.
 
     Parameters
     ----------
@@ -30993,7 +30997,7 @@ def cuKernelSetAttribute(attrib not None : CUfunction_attribute, int val, kernel
 
     - :py:obj:`~.CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE`:
       The block scheduling policy of a function. The value type is
-      CUclusterSchedulingPolicy.
+      :py:obj:`~.CUclusterSchedulingPolicy`.
 
     Parameters
     ----------
@@ -32197,8 +32201,8 @@ def cuIpcGetEventHandle(event):
     CUresult
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`, :py:obj:`~.CUDA_ERROR_MAP_FAILED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     pHandle : :py:obj:`~.CUipcEventHandle`
-        Pointer to a user allocated CUipcEventHandle in which to return the
-        opaque event handle
+        Pointer to a user allocated :py:obj:`~.CUipcEventHandle` in which
+        to return the opaque event handle
 
     See Also
     --------
@@ -37302,11 +37306,11 @@ def cuMemPoolSetAttribute(pool, attr not None : CUmemPool_attribute, value):
     Supported attributes are:
 
     - :py:obj:`~.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD`: (value type =
-      cuuint64_t) Amount of reserved memory in bytes to hold onto before
-      trying to release memory back to the OS. When more than the release
-      threshold bytes of memory are held by the memory pool, the allocator
-      will try to release memory back to the OS on the next call to stream,
-      event or context synchronize. (default 0)
+      :py:obj:`~.cuuint64_t`) Amount of reserved memory in bytes to hold
+      onto before trying to release memory back to the OS. When more than
+      the release threshold bytes of memory are held by the memory pool,
+      the allocator will try to release memory back to the OS on the next
+      call to stream, event or context synchronize. (default 0)
 
     - :py:obj:`~.CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES`: (value
       type = int) Allow :py:obj:`~.cuMemAllocAsync` to use memory
@@ -37326,13 +37330,13 @@ def cuMemPoolSetAttribute(pool, attr not None : CUmemPool_attribute, value):
       (default enabled).
 
     - :py:obj:`~.CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH`: (value type =
-      cuuint64_t) Reset the high watermark that tracks the amount of
-      backing memory that was allocated for the memory pool. It is illegal
-      to set this attribute to a non-zero value.
+      :py:obj:`~.cuuint64_t`) Reset the high watermark that tracks the
+      amount of backing memory that was allocated for the memory pool. It
+      is illegal to set this attribute to a non-zero value.
 
-    - :py:obj:`~.CU_MEMPOOL_ATTR_USED_MEM_HIGH`: (value type = cuuint64_t)
-      Reset the high watermark that tracks the amount of used memory that
-      was allocated for the memory pool.
+    - :py:obj:`~.CU_MEMPOOL_ATTR_USED_MEM_HIGH`: (value type =
+      :py:obj:`~.cuuint64_t`) Reset the high watermark that tracks the
+      amount of used memory that was allocated for the memory pool.
 
     Parameters
     ----------
@@ -37377,11 +37381,11 @@ def cuMemPoolGetAttribute(pool, attr not None : CUmemPool_attribute):
     Supported attributes are:
 
     - :py:obj:`~.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD`: (value type =
-      cuuint64_t) Amount of reserved memory in bytes to hold onto before
-      trying to release memory back to the OS. When more than the release
-      threshold bytes of memory are held by the memory pool, the allocator
-      will try to release memory back to the OS on the next call to stream,
-      event or context synchronize. (default 0)
+      :py:obj:`~.cuuint64_t`) Amount of reserved memory in bytes to hold
+      onto before trying to release memory back to the OS. When more than
+      the release threshold bytes of memory are held by the memory pool,
+      the allocator will try to release memory back to the OS on the next
+      call to stream, event or context synchronize. (default 0)
 
     - :py:obj:`~.CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES`: (value
       type = int) Allow :py:obj:`~.cuMemAllocAsync` to use memory
@@ -37401,30 +37405,30 @@ def cuMemPoolGetAttribute(pool, attr not None : CUmemPool_attribute):
       (default enabled).
 
     - :py:obj:`~.CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT`: (value type =
-      cuuint64_t) Amount of backing memory currently allocated for the
-      mempool
+      :py:obj:`~.cuuint64_t`) Amount of backing memory currently allocated
+      for the mempool
 
     - :py:obj:`~.CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH`: (value type =
-      cuuint64_t) High watermark of backing memory allocated for the
-      mempool since the last time it was reset.
+      :py:obj:`~.cuuint64_t`) High watermark of backing memory allocated
+      for the mempool since the last time it was reset.
 
     - :py:obj:`~.CU_MEMPOOL_ATTR_USED_MEM_CURRENT`: (value type =
-      cuuint64_t) Amount of memory from the pool that is currently in use
-      by the application.
+      :py:obj:`~.cuuint64_t`) Amount of memory from the pool that is
+      currently in use by the application.
 
-    - :py:obj:`~.CU_MEMPOOL_ATTR_USED_MEM_HIGH`: (value type = cuuint64_t)
-      High watermark of the amount of memory from the pool that was in use
-      by the application.
+    - :py:obj:`~.CU_MEMPOOL_ATTR_USED_MEM_HIGH`: (value type =
+      :py:obj:`~.cuuint64_t`) High watermark of the amount of memory from
+      the pool that was in use by the application.
 
     The following properties can be also be queried on imported and default
     pools:
 
     - :py:obj:`~.CU_MEMPOOL_ATTR_ALLOCATION_TYPE`: (value type =
-      CUmemAllocationType) The allocation type of the mempool
+      :py:obj:`~.CUmemAllocationType`) The allocation type of the mempool
 
     - :py:obj:`~.CU_MEMPOOL_ATTR_EXPORT_HANDLE_TYPES`: (value type =
-      CUmemAllocationHandleType) Available export handle types for the
-      mempool. For imported pools this value is always
+      :py:obj:`~.CUmemAllocationHandleType`) Available export handle types
+      for the mempool. For imported pools this value is always
       CU_MEM_HANDLE_TYPE_NONE as an imported pool cannot be re-exported
 
     - :py:obj:`~.CU_MEMPOOL_ATTR_LOCATION_ID`: (value type = int) The
@@ -37432,16 +37436,16 @@ def cuMemPoolGetAttribute(pool, attr not None : CUmemPool_attribute):
       CU_MEM_LOCATION_TYPE_INVISIBLE then ID will be CU_DEVICE_INVALID.
 
     - :py:obj:`~.CU_MEMPOOL_ATTR_LOCATION_TYPE`: (value type =
-      CUmemLocationType) The location type for the mempool. For imported
-      memory pools where the device is not directly visible to the
+      :py:obj:`~.CUmemLocationType`) The location type for the mempool. For
+      imported memory pools where the device is not directly visible to the
       importing process or pools imported via fabric handles across nodes
       this will be CU_MEM_LOCATION_TYPE_INVISIBLE.
 
-    - :py:obj:`~.CU_MEMPOOL_ATTR_MAX_POOL_SIZE`: (value type = cuuint64_t)
-      Maximum size of the pool in bytes, this value may be higher than what
-      was initially passed to cuMemPoolCreate due to alignment
-      requirements. A value of 0 indicates no maximum size. For
-      CU__MEM_ALLOCATION_TYPE_MANAGED and IPC imported pools this value
+    - :py:obj:`~.CU_MEMPOOL_ATTR_MAX_POOL_SIZE`: (value type =
+      :py:obj:`~.cuuint64_t`) Maximum size of the pool in bytes, this value
+      may be higher than what was initially passed to cuMemPoolCreate due
+      to alignment requirements. A value of 0 indicates no maximum size.
+      For CU_MEM_ALLOCATION_TYPE_MANAGED and IPC imported pools this value
       will be system dependent.
 
     - :py:obj:`~.CU_MEMPOOL_ATTR_HW_DECOMPRESS_ENABLED`: (value type = int)
@@ -37984,7 +37988,7 @@ def cuMemPoolExportToShareableHandle(pool, handleType not None : CUmemAllocation
 
     Notes
     -----
-    : To create an IPC capable mempool, create a mempool with a CUmemAllocationHandleType other than CU_MEM_HANDLE_TYPE_NONE.
+    : To create an IPC capable mempool, create a mempool with a :py:obj:`~.CUmemAllocationHandleType` other than CU_MEM_HANDLE_TYPE_NONE.
     """
     cdef cydriver.CUmemoryPool cypool
     if pool is None:
@@ -38908,7 +38912,7 @@ def cuLogicalEndpointAddDevice(leId, dev):
     Associates a device to a logical endpoint. The type of the logical
     endpoint must be :py:obj:`~.CU_LOGICAL_ENDPOINT_TYPE_MULTICAST`. The
     added device will be a part of the multicast team of size specified by
-    CUlogicalEndpointProp::multicast::numDevices during
+    :py:obj:`~.CUlogicalEndpointProp.multicast.numDevices` during
     :py:obj:`~.cuLogicalEndpointCreate`. The association of the device to
     the multicast logical endpoint is permanent during the life time of the
     multicast logical endpoint. All devices must be added to the multicast
@@ -39559,7 +39563,7 @@ def cuPointerGetAttribute(attribute not None : CUpointer_attribute, ptr):
 
     - Returns in `*data` the device pointer value through which `ptr` may
       be accessed by kernels running in the current :py:obj:`~.CUcontext`.
-      The type of `data` must be CUdeviceptr *.
+      The type of `data` must be :py:obj:`~.CUdeviceptr` *.
 
     - If there exists no device pointer value through which kernels running
       in the current :py:obj:`~.CUcontext` may access `ptr` then
@@ -39586,7 +39590,7 @@ def cuPointerGetAttribute(attribute not None : CUpointer_attribute, ptr):
 
     - Returns in `*data` two tokens for use with the nv-p2p.h Linux kernel
       interface. `data` must be a struct of type
-      CUDA_POINTER_ATTRIBUTE_P2P_TOKENS.
+      :py:obj:`~.CUDA_POINTER_ATTRIBUTE_P2P_TOKENS`.
 
     - `ptr` must be a pointer to memory obtained from
-      :py:obj:`~.py`:obj:`~.cuMemAlloc()`. Note that p2pToken and
+      :py:obj:`~.cuMemAlloc`. Note that p2pToken and
@@ -44148,7 +44152,7 @@ def cuFuncGetAttribute(attrib not None : CUfunction_attribute, hfunc):
 
     - :py:obj:`~.CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE`:
       The block scheduling policy of a function. The value type is
-      CUclusterSchedulingPolicy.
+      :py:obj:`~.CUclusterSchedulingPolicy`.
 
     With a few execeptions, function attributes may also be queried on
     unloaded function handles returned from
@@ -44261,7 +44265,7 @@ def cuFuncSetAttribute(hfunc, attrib not None : CUfunction_attribute, int value)
 
     - :py:obj:`~.CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE`:
       The block scheduling policy of a function. The value type is
-      CUclusterSchedulingPolicy.
+      :py:obj:`~.CUclusterSchedulingPolicy`.
 
     Parameters
     ----------
@@ -45718,7 +45722,7 @@ def cuLaunchGridAsync(f, int grid_width, int grid_height, hStream):
 
     Notes
     -----
-    In certain cases where cubins are created with no ABI (i.e., using `ptxas`  `no`), this function may serialize kernel launches. The CUDA driver retains asynchronous behavior by growing the per-thread stack as needed per launch and not shrinking it afterwards.
+    In certain cases where cubins are created with no ABI (i.e., using `ptxas` `--abi-compile` `no`), this function may serialize kernel launches. The CUDA driver retains asynchronous behavior by growing the per-thread stack as needed per launch and not shrinking it afterwards.
     """
     cdef cydriver.CUstream cyhStream
     if hStream is None:
@@ -46082,7 +46086,7 @@ def cuGraphAddKernelNode(hGraph, dependencies : Optional[tuple[CUgraphNode] | li
     root of the graph. `dependencies` may not have any duplicate entries. A
     handle to the new node will be returned in `phGraphNode`.
 
-    The CUDA_KERNEL_NODE_PARAMS structure is defined as:
+    The :py:obj:`~.CUDA_KERNEL_NODE_PARAMS` structure is defined as:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -50764,8 +50768,8 @@ def cuGraphDebugDotPrint(hGraph, char* path, unsigned int flags):
     path : bytes
         The path to write the DOT file to
     flags : unsigned int
-        Flags from CUgraphDebugDot_flags for specifying which additional
-        node information to write
+        Flags from :py:obj:`~.CUgraphDebugDot_flags` for specifying which
+        additional node information to write
 
     Returns
     -------
@@ -53310,8 +53314,8 @@ def cuTexObjectCreate(pResDesc : Optional[CUDA_RESOURCE_DESC], pTexDesc : Option
       supported address mode is :py:obj:`~.CU_TR_ADDRESS_MODE_CLAMP`.
 
     - :py:obj:`~.CUDA_TEXTURE_DESC.filterMode` specifies the filtering mode
-      to be used when fetching from the texture. CUfilter_mode is defined
-      as:
+      to be used when fetching from the texture. :py:obj:`~.CUfilter_mode`
+      is defined as:
 
     - **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -56374,12 +56378,12 @@ def cuGreenCtxCreate(desc, dev, unsigned int flags):
 
     The API does not set the green context current. In order to set it
     current, you need to explicitly set it current by first converting the
-    green context to a CUcontext using :py:obj:`~.cuCtxFromGreenCtx` and
-    subsequently calling :py:obj:`~.cuCtxSetCurrent` /
-    :py:obj:`~.cuCtxPushCurrent`. It should be noted that a green context
-    can be current to only one thread at a time. There is no internal
-    synchronization to make API calls accessing the same green context from
-    multiple threads work.
+    green context to a :py:obj:`~.CUcontext` using
+    :py:obj:`~.cuCtxFromGreenCtx` and subsequently calling
+    :py:obj:`~.cuCtxSetCurrent` / :py:obj:`~.cuCtxPushCurrent`. It should
+    be noted that a green context can be current to only one thread at a
+    time. There is no internal synchronization to make API calls accessing
+    the same green context from multiple threads work.
 
     Note: The API is not supported on 32-bit platforms.
 
@@ -57989,7 +57993,7 @@ def cuEGLStreamConsumerConnectWithFlags(stream, unsigned int flags):
     """ Connect CUDA to EGLStream as a consumer with given flags.
 
     Connect CUDA as a consumer to EGLStreamKHR specified by `stream` with
-    specified `flags` defined by CUeglResourceLocationFlags.
+    specified `flags` defined by :py:obj:`~.CUeglResourceLocationFlags`.
 
     The flags specify whether the consumer wants to access frames from
     system memory or video memory. Default is
diff --git a/cuda_bindings/cuda/bindings/runtime.pxd.in b/cuda_bindings/cuda/bindings/runtime.pxd.in
index 323fc99e46..4aa61e200c 100644
--- a/cuda_bindings/cuda/bindings/runtime.pxd.in
+++ b/cuda_bindings/cuda/bindings/runtime.pxd.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1740+g3731b17a1. Do not modify it directly.
 cimport cuda.bindings.cyruntime as cyruntime
 
 include "_lib/utils.pxd"
@@ -1624,7 +1624,7 @@ cdef class cudaMemAllocNodeParams:
     {{endif}}
     {{if 'cudaMemAllocNodeParams.accessDescCount' in found_struct}}
     accessDescCount : size_t
-        in: Number of `accessDescs`s
+        in: Number of `accessDescs`s
     {{endif}}
     {{if 'cudaMemAllocNodeParams.bytesize' in found_struct}}
     bytesize : size_t
@@ -1675,7 +1675,7 @@ cdef class cudaMemAllocNodeParamsV2:
     {{endif}}
     {{if 'cudaMemAllocNodeParamsV2.accessDescCount' in found_struct}}
     accessDescCount : size_t
-        in: Number of `accessDescs`s
+        in: Number of `accessDescs`s
     {{endif}}
     {{if 'cudaMemAllocNodeParamsV2.bytesize' in found_struct}}
     bytesize : size_t
@@ -3134,8 +3134,9 @@ cdef class cudaDevSmResourceGroupParams_st:
     {{endif}}
     {{if 'cudaDevSmResourceGroupParams_st.flags' in found_struct}}
     flags : unsigned int
-        Combination of `cudaDevSmResourceGroup_flags` values to indicate
-        this this group is created.
+        Combination of
+        `cudaDevSmResourceGroup_flags` values
+        to indicate this this group is created.
     {{endif}}
     {{if 'cudaDevSmResourceGroupParams_st.reserved' in found_struct}}
     reserved : list[unsigned int]
@@ -4245,11 +4246,12 @@ cdef class cudaLaunchAttributeValue:
     {{if 'cudaLaunchAttributeValue.programmaticEvent' in found_struct}}
     programmaticEvent : anon_struct18
         Value of launch attribute cudaLaunchAttributeProgrammaticEvent with
-        the following fields: - `cudaEvent_t` event - Event to fire when
-        all blocks trigger it.    - `int` flags; - Event record flags, see
-        cudaEventRecordWithFlags. Does not accept cudaEventRecordExternal.
-        - `int` triggerAtBlockStart - If this is set to non-0, each block
-        launch will automatically trigger the event.
+        the following fields: - `cudaEvent_t` event - Event to
+        fire when all blocks trigger it.    - `int` flags; - Event record
+        flags, see cudaEventRecordWithFlags. Does not accept
+        cudaEventRecordExternal.    - `int` triggerAtBlockStart - If this
+        is set to non-0, each block launch will automatically trigger the
+        event.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.priority' in found_struct}}
     priority : int
@@ -4284,9 +4286,9 @@ cdef class cudaLaunchAttributeValue:
     {{if 'cudaLaunchAttributeValue.launchCompletionEvent' in found_struct}}
     launchCompletionEvent : anon_struct20
         Value of launch attribute cudaLaunchAttributeLaunchCompletionEvent
-        with the following fields: - `cudaEvent_t` event - Event to fire
-        when the last block launches.    - `int` flags - Event record
-        flags, see cudaEventRecordWithFlags. Does not accept
+        with the following fields: - `cudaEvent_t` event - Event to fire
+        when the last block launches.    - `int` flags - Event record
+        flags, see cudaEventRecordWithFlags. Does not accept
         cudaEventRecordExternal.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.deviceUpdatableKernelNode' in found_struct}}
@@ -4295,8 +4297,8 @@ cdef class cudaLaunchAttributeValue:
         cudaLaunchAttributeDeviceUpdatableKernelNode with the following
         fields: - `int` deviceUpdatable - Whether or not the resulting
         kernel node should be device-updatable.    -
-        `cudaGraphDeviceNode_t` devNode - Returns a handle to pass to the
-        various device-side update functions.
+        `cudaGraphDeviceNode_t` devNode - Returns a handle to pass to the
+        various device-side update functions.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.sharedMemCarveout' in found_struct}}
     sharedMemCarveout : unsigned int
@@ -4781,8 +4783,9 @@ cdef class cudaDevSmResourceGroupParams(cudaDevSmResourceGroupParams_st):
     {{endif}}
     {{if 'cudaDevSmResourceGroupParams_st.flags' in found_struct}}
     flags : unsigned int
-        Combination of `cudaDevSmResourceGroup_flags` values to indicate
-        this this group is created.
+        Combination of
+        `cudaDevSmResourceGroup_flags` values
+        to indicate this this group is created.
     {{endif}}
     {{if 'cudaDevSmResourceGroupParams_st.reserved' in found_struct}}
     reserved : list[unsigned int]
@@ -5098,11 +5101,12 @@ cdef class cudaStreamAttrValue(cudaLaunchAttributeValue):
     {{if 'cudaLaunchAttributeValue.programmaticEvent' in found_struct}}
     programmaticEvent : anon_struct18
         Value of launch attribute cudaLaunchAttributeProgrammaticEvent with
-        the following fields: - `cudaEvent_t` event - Event to fire when
-        all blocks trigger it.    - `int` flags; - Event record flags, see
-        cudaEventRecordWithFlags. Does not accept cudaEventRecordExternal.
-        - `int` triggerAtBlockStart - If this is set to non-0, each block
-        launch will automatically trigger the event.
+        the following fields: - `cudaEvent_t` event - Event to
+        fire when all blocks trigger it.    - `int` flags - Event record
+        flags, see cudaEventRecordWithFlags. Does not accept
+        cudaEventRecordExternal.    - `int` triggerAtBlockStart - If this
+        is set to non-0, each block launch will automatically trigger the
+        event.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.priority' in found_struct}}
     priority : int
@@ -5137,9 +5141,9 @@ cdef class cudaStreamAttrValue(cudaLaunchAttributeValue):
     {{if 'cudaLaunchAttributeValue.launchCompletionEvent' in found_struct}}
     launchCompletionEvent : anon_struct20
         Value of launch attribute cudaLaunchAttributeLaunchCompletionEvent
-        with the following fields: - `cudaEvent_t` event - Event to fire
-        when the last block launches.    - `int` flags - Event record
-        flags, see cudaEventRecordWithFlags. Does not accept
+        with the following fields: - `cudaEvent_t` event - Event
+        to fire when the last block launches.    - `int` flags - Event
+        record flags, see cudaEventRecordWithFlags. Does not accept
         cudaEventRecordExternal.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.deviceUpdatableKernelNode' in found_struct}}
@@ -5148,8 +5152,8 @@ cdef class cudaStreamAttrValue(cudaLaunchAttributeValue):
         cudaLaunchAttributeDeviceUpdatableKernelNode with the following
         fields: - `int` deviceUpdatable - Whether or not the resulting
         kernel node should be device-updatable.    -
-        `cudaGraphDeviceNode_t` devNode - Returns a handle to pass to the
-        various device-side update functions.
+        `cudaGraphDeviceNode_t` devNode - Returns a
+        handle to pass to the various device-side update functions.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.sharedMemCarveout' in found_struct}}
     sharedMemCarveout : unsigned int
@@ -5229,11 +5233,12 @@ cdef class cudaKernelNodeAttrValue(cudaLaunchAttributeValue):
     {{if 'cudaLaunchAttributeValue.programmaticEvent' in found_struct}}
     programmaticEvent : anon_struct18
         Value of launch attribute cudaLaunchAttributeProgrammaticEvent with
-        the following fields: - `cudaEvent_t` event - Event to fire when
-        all blocks trigger it.    - `int` flags; - Event record flags, see
-        cudaEventRecordWithFlags. Does not accept cudaEventRecordExternal.
-        - `int` triggerAtBlockStart - If this is set to non-0, each block
-        launch will automatically trigger the event.
+        the following fields: - `cudaEvent_t` event - Event to
+        fire when all blocks trigger it.    - `int` flags - Event record
+        flags, see cudaEventRecordWithFlags. Does not accept
+        cudaEventRecordExternal.    - `int` triggerAtBlockStart - If this
+        is set to non-0, each block launch will automatically trigger the
+        event.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.priority' in found_struct}}
     priority : int
@@ -5268,9 +5273,9 @@ cdef class cudaKernelNodeAttrValue(cudaLaunchAttributeValue):
     {{if 'cudaLaunchAttributeValue.launchCompletionEvent' in found_struct}}
     launchCompletionEvent : anon_struct20
         Value of launch attribute cudaLaunchAttributeLaunchCompletionEvent
-        with the following fields: - `cudaEvent_t` event - Event to fire
-        when the last block launches.    - `int` flags - Event record
-        flags, see cudaEventRecordWithFlags. Does not accept
+        with the following fields: - `cudaEvent_t` event - Event
+        to fire when the last block launches.    - `int` flags - Event
+        record flags, see cudaEventRecordWithFlags. Does not accept
         cudaEventRecordExternal.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.deviceUpdatableKernelNode' in found_struct}}
@@ -5279,8 +5284,8 @@ cdef class cudaKernelNodeAttrValue(cudaLaunchAttributeValue):
         cudaLaunchAttributeDeviceUpdatableKernelNode with the following
         fields: - `int` deviceUpdatable - Whether or not the resulting
         kernel node should be device-updatable.    -
-        `cudaGraphDeviceNode_t` devNode - Returns a handle to pass to the
-        various device-side update functions.
+        `cudaGraphDeviceNode_t` devNode - Returns a
+        handle to pass to the various device-side update functions.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.sharedMemCarveout' in found_struct}}
     sharedMemCarveout : unsigned int
diff --git a/cuda_bindings/cuda/bindings/runtime.pyx.in b/cuda_bindings/cuda/bindings/runtime.pyx.in
index 5c38d5c0a2..5bb20a6642 100644
--- a/cuda_bindings/cuda/bindings/runtime.pyx.in
+++ b/cuda_bindings/cuda/bindings/runtime.pyx.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1740+g3731b17a1. Do not modify it directly.
 from typing import Any, Optional
 import cython
 import ctypes
@@ -83,16 +83,16 @@ cudaStreamNonBlocking = cyruntime.cudaStreamNonBlocking
 
 #: Legacy stream handle
 #:
-#: Stream handle that can be passed as a cudaStream_t to use an implicit
-#: stream with legacy synchronization behavior.
+#: Stream handle that can be passed as a :py:obj:`~.cudaStream_t` to use an
+#: implicit stream with legacy synchronization behavior.
 #:
 #: See details of the \link_sync_behavior
 cudaStreamLegacy = cyruntime.cudaStreamLegacy
 
 #: Per-thread stream handle
 #:
-#: Stream handle that can be passed as a cudaStream_t to use an implicit
-#: stream with per-thread synchronization behavior.
+#: Stream handle that can be passed as a :py:obj:`~.cudaStream_t` to use an
+#: implicit stream with per-thread synchronization behavior.
 #:
 #: See details of the \link_sync_behavior
 cudaStreamPerThread = cyruntime.cudaStreamPerThread
@@ -136,7 +136,9 @@ cudaDeviceScheduleYield = cyruntime.cudaDeviceScheduleYield
 #: Device flag - Use blocking synchronization
 cudaDeviceScheduleBlockingSync = cyruntime.cudaDeviceScheduleBlockingSync
 
-#: Device flag - Use blocking synchronization [Deprecated]
+#: Device flag - Use blocking synchronization
+#:
+#: [Deprecated]
 cudaDeviceBlockingSync = cyruntime.cudaDeviceBlockingSync
 
 #: Device schedule flags mask
@@ -443,14 +445,16 @@ class cudaError_t(_FastEnum):
     cudaErrorInvalidHostPointer = (
         cyruntime.cudaError.cudaErrorInvalidHostPointer,
         'This indicates that at least one host pointer passed to the API call is not\n'
-        'a valid host pointer. [Deprecated]\n'
+        'a valid host pointer.\n'
+        '[Deprecated]\n'
     ){{endif}}
     {{if 'cudaErrorInvalidDevicePointer' in found_values}}
 
     cudaErrorInvalidDevicePointer = (
         cyruntime.cudaError.cudaErrorInvalidDevicePointer,
         'This indicates that at least one device pointer passed to the API call is\n'
-        'not a valid device pointer. [Deprecated]\n'
+        'not a valid device pointer.\n'
+        '[Deprecated]\n'
     ){{endif}}
     {{if 'cudaErrorInvalidTexture' in found_values}}
 
@@ -486,28 +490,32 @@ class cudaError_t(_FastEnum):
     cudaErrorAddressOfConstant = (
         cyruntime.cudaError.cudaErrorAddressOfConstant,
         'This indicated that the user has taken the address of a constant variable,\n'
-        'which was forbidden up until the CUDA 3.1 release. [Deprecated]\n'
+        'which was forbidden up until the CUDA 3.1 release.\n'
+        '[Deprecated]\n'
     ){{endif}}
     {{if 'cudaErrorTextureFetchFailed' in found_values}}
 
     cudaErrorTextureFetchFailed = (
         cyruntime.cudaError.cudaErrorTextureFetchFailed,
         'This indicated that a texture fetch was not able to be performed. This was\n'
-        'previously used for device emulation of texture operations. [Deprecated]\n'
+        'previously used for device emulation of texture operations.\n'
+        '[Deprecated]\n'
     ){{endif}}
     {{if 'cudaErrorTextureNotBound' in found_values}}
 
     cudaErrorTextureNotBound = (
         cyruntime.cudaError.cudaErrorTextureNotBound,
         'This indicated that a texture was not bound for access. This was previously\n'
-        'used for device emulation of texture operations. [Deprecated]\n'
+        'used for device emulation of texture operations.\n'
+        '[Deprecated]\n'
     ){{endif}}
     {{if 'cudaErrorSynchronizationError' in found_values}}
 
     cudaErrorSynchronizationError = (
         cyruntime.cudaError.cudaErrorSynchronizationError,
         'This indicated that a synchronization operation had failed. This was\n'
-        'previously used for some device emulation functions. [Deprecated]\n'
+        'previously used for some device emulation functions.\n'
+        '[Deprecated]\n'
     ){{endif}}
     {{if 'cudaErrorInvalidFilterSetting' in found_values}}
 
@@ -527,21 +535,24 @@ class cudaError_t(_FastEnum):
 
     cudaErrorMixedDeviceExecution = (
         cyruntime.cudaError.cudaErrorMixedDeviceExecution,
-        'Mixing of device and device emulation code was not allowed. [Deprecated]\n'
+        'Mixing of device and device emulation code was not allowed.\n'
+        '[Deprecated]\n'
     ){{endif}}
     {{if 'cudaErrorNotYetImplemented' in found_values}}
 
     cudaErrorNotYetImplemented = (
         cyruntime.cudaError.cudaErrorNotYetImplemented,
         'This indicates that the API call is not yet implemented. Production\n'
-        'releases of CUDA will never return this error. [Deprecated]\n'
+        'releases of CUDA will never return this error.\n'
+        '[Deprecated]\n'
     ){{endif}}
     {{if 'cudaErrorMemoryValueTooLarge' in found_values}}
 
     cudaErrorMemoryValueTooLarge = (
         cyruntime.cudaError.cudaErrorMemoryValueTooLarge,
         'This indicated that an emulated device pointer exceeded the 32-bit address\n'
-        'range. [Deprecated]\n'
+        'range.\n'
+        '[Deprecated]\n'
     ){{endif}}
     {{if 'cudaErrorStubLibrary' in found_values}}
 
@@ -618,7 +629,7 @@ class cudaError_t(_FastEnum):
         'Driver context was created using an older version of the API, because the\n'
         'Runtime API call expects a primary driver context and the Driver context is\n'
         'not primary, or because the Driver context has been destroyed. Please see\n'
-        ':py:obj:`~.Interactions`with the CUDA Driver API" for more information.\n'
+        '"Interactions with the CUDA Driver API" for more information.\n'
     ){{endif}}
     {{if 'cudaErrorMissingConfiguration' in found_values}}
 
@@ -633,7 +644,8 @@ class cudaError_t(_FastEnum):
     cudaErrorPriorLaunchFailure = (
         cyruntime.cudaError.cudaErrorPriorLaunchFailure,
         'This indicated that a previous kernel launch failed. This was previously\n'
-        'used for device emulation of kernel launches. [Deprecated]\n'
+        'used for device emulation of kernel launches.\n'
+        '[Deprecated]\n'
     ){{endif}}
     {{if 'cudaErrorLaunchMaxDepthExceeded' in found_values}}
 
@@ -1606,41 +1618,41 @@ class cudaLaunchAttributeID(_FastEnum):
     cudaLaunchAttributeAccessPolicyWindow = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeAccessPolicyWindow,
         'Valid for streams, graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::accessPolicyWindow.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.accessPolicyWindow`.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeCooperative' in found_values}}
 
     cudaLaunchAttributeCooperative = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeCooperative,
         'Valid for graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::cooperative.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.cooperative`.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeSynchronizationPolicy' in found_values}}
 
     cudaLaunchAttributeSynchronizationPolicy = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeSynchronizationPolicy,
-        'Valid for streams. See :py:obj:`~.cudaLaunchAttributeValue`::syncPolicy.\n'
+        'Valid for streams. See :py:obj:`~.cudaLaunchAttributeValue.syncPolicy`.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeClusterDimension' in found_values}}
 
     cudaLaunchAttributeClusterDimension = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeClusterDimension,
         'Valid for graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::clusterDim.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.clusterDim`.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeClusterSchedulingPolicyPreference' in found_values}}
 
     cudaLaunchAttributeClusterSchedulingPolicyPreference = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeClusterSchedulingPolicyPreference,
         'Valid for graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::clusterSchedulingPolicyPreference.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.clusterSchedulingPolicyPreference`.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeProgrammaticStreamSerialization' in found_values}}
 
     cudaLaunchAttributeProgrammaticStreamSerialization = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeProgrammaticStreamSerialization,
         'Valid for launches. Setting\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::programmaticStreamSerializationAllowed\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.programmaticStreamSerializationAllowed`\n'
         'to non-0 signals that the kernel will use programmatic means to resolve its\n'
         'stream dependency, so that the CUDA runtime should opportunistically allow\n'
         "the grid's execution to overlap with the previous kernel in the stream, if\n"
@@ -1653,11 +1665,11 @@ class cudaLaunchAttributeID(_FastEnum):
     cudaLaunchAttributeProgrammaticEvent = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeProgrammaticEvent,
         'Valid for launches. Set\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::programmaticEvent to record the\n'
-        'event. Event recorded through this launch attribute is guaranteed to only\n'
-        'trigger after all block in the associated kernel trigger the event. A block\n'
-        'can trigger the event programmatically in a future CUDA release. A trigger\n'
-        "can also be inserted at the beginning of each block's execution if\n"
+        ':py:obj:`~.cudaLaunchAttributeValue.programmaticEvent` to record the event.\n'
+        'Event recorded through this launch attribute is guaranteed to only trigger\n'
+        'after all blocks in the associated kernel trigger the event. A block can\n'
+        'trigger the event programmatically in a future CUDA release. A trigger can\n'
+        "also be inserted at the beginning of each block's execution if\n"
         'triggerAtBlockStart is set to non-0. The dependent launches can choose to\n'
         'wait on the dependency using the programmatic sync\n'
         '(cudaGridDependencySynchronize() or equivalent PTX instructions). Note that\n'
@@ -1678,28 +1690,28 @@ class cudaLaunchAttributeID(_FastEnum):
     cudaLaunchAttributePriority = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributePriority,
         'Valid for streams, graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::priority.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.priority`.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeMemSyncDomainMap' in found_values}}
 
     cudaLaunchAttributeMemSyncDomainMap = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeMemSyncDomainMap,
         'Valid for streams, graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::memSyncDomainMap.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.memSyncDomainMap`.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeMemSyncDomain' in found_values}}
 
     cudaLaunchAttributeMemSyncDomain = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeMemSyncDomain,
         'Valid for streams, graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::memSyncDomain.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.memSyncDomain`.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributePreferredClusterDimension' in found_values}}
 
     cudaLaunchAttributePreferredClusterDimension = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributePreferredClusterDimension,
         'Valid for graph nodes and launches. Set\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::preferredClusterDim to allow the\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.preferredClusterDim` to allow the\n'
         'kernel launch to specify a preferred substitute cluster dimension. Blocks\n'
         'may be grouped according to either the dimensions specified with this\n'
         'attribute (grouped into a "preferred substitute cluster"), or the one\n'
@@ -1733,7 +1745,7 @@ class cudaLaunchAttributeID(_FastEnum):
     cudaLaunchAttributeLaunchCompletionEvent = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeLaunchCompletionEvent,
         'Valid for launches. Set\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::launchCompletionEvent to record the\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.launchCompletionEvent` to record the\n'
         'event.\n'
         ' Nominally, the event is triggered once all blocks of the kernel have begun\n'
         'execution. Currently this is a best effort. If a kernel B has a launch\n'
@@ -1760,7 +1772,7 @@ class cudaLaunchAttributeID(_FastEnum):
         'only be set to 0 or 1. Setting the field to 1 indicates that the\n'
         'corresponding kernel node should be device-updatable. On success, a handle\n'
         'will be returned via\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::deviceUpdatableKernelNode::devNode\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.deviceUpdatableKernelNode.devNode`\n'
         'which can be passed to the various device-side update functions to update\n'
         "the node's kernel parameters from within another kernel. For more\n"
         'information on the types of device updates that can be made, as well as the\n'
@@ -1788,7 +1800,7 @@ class cudaLaunchAttributeID(_FastEnum):
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributePreferredSharedMemoryCarveout,
         'Valid for launches. On devices where the L1 cache and shared memory use the\n'
         'same hardware resources, setting\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::sharedMemCarveout to a percentage\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.sharedMemCarveout` to a percentage\n'
         'between 0-100 signals sets the shared memory carveout preference in percent\n'
         'of the total shared memory for that kernel launch. This attribute takes\n'
         'precedence over :py:obj:`~.cudaFuncAttributePreferredSharedMemoryCarveout`.\n'
@@ -1814,7 +1826,7 @@ class cudaLaunchAttributeID(_FastEnum):
         'not improve the performance of either the targeted kernel or the\n'
         'encapsulating application.\n'
         ' Valid values for\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::nvlinkUtilCentricScheduling are 0\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.nvlinkUtilCentricScheduling` are 0\n'
         '(disabled) and 1 (enabled).\n'
     ){{endif}}
     {{if 'cudaLaunchAttributePortableClusterSizeMode' in found_values}}
@@ -1823,8 +1835,8 @@ class cudaLaunchAttributeID(_FastEnum):
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributePortableClusterSizeMode,
         'Valid for graph nodes, launches. This indicates whether the kernel launch\n'
         'is allowed to use a non-portable cluster size. Valid values for\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::portableClusterSizeMode are values\n'
-        'for :py:obj:`~.cudaLaunchAttributePortableClusterMode` Any other value will\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.portableClusterSizeMode` are values for\n'
+        ':py:obj:`~.cudaLaunchAttributePortableClusterMode`. Any other value will\n'
         'return :py:obj:`~.cudaErrorInvalidValue`\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeSharedMemoryMode' in found_values}}
@@ -3871,7 +3883,7 @@ class cudaFuncCache(_FastEnum):
 
 class cudaSharedMemConfig(_FastEnum):
     """
-    CUDA shared memory configuration [Deprecated]
+    CUDA shared memory configuration [Deprecated]
     """
     {{if 'cudaSharedMemBankSizeDefault' in found_values}}
     cudaSharedMemBankSizeDefault = cyruntime.cudaSharedMemConfig.cudaSharedMemBankSizeDefault{{endif}}
@@ -5074,14 +5086,15 @@ class cudaMemPoolAttr(_FastEnum):
 
     cudaMemPoolAttrAllocationType = (
         cyruntime.cudaMemPoolAttr.cudaMemPoolAttrAllocationType,
-        '(value type = cudaMemAllocationType) The allocation type of the mempool\n'
+        '(value type = :py:obj:`~.cudaMemAllocationType`) The allocation type of the\n'
+        'mempool\n'
     ){{endif}}
     {{if 'cudaMemPoolAttrExportHandleTypes' in found_values}}
 
     cudaMemPoolAttrExportHandleTypes = (
         cyruntime.cudaMemPoolAttr.cudaMemPoolAttrExportHandleTypes,
-        '(value type = cudaMemAllocationHandleType) Available export handle types\n'
-        'for the mempool. For imported pools this value is always\n'
+        '(value type = :py:obj:`~.cudaMemAllocationHandleType`) Available export\n'
+        'handle types for the mempool. For imported pools this value is always\n'
         'cudaMemHandleTypeNone as an imported pool cannot be re-exported\n'
     ){{endif}}
     {{if 'cudaMemPoolAttrLocationId' in found_values}}
@@ -5096,10 +5109,10 @@ class cudaMemPoolAttr(_FastEnum):
 
     cudaMemPoolAttrLocationType = (
         cyruntime.cudaMemPoolAttr.cudaMemPoolAttrLocationType,
-        '(value type = cudaMemLocationType) The location type for the mempool. For\n'
-        'imported memory pools where the device is not directly visible to the\n'
-        'importing process or pools imported via fabric handles across nodes this\n'
-        'will be cudaMemLocationTypeInvisible\n'
+        '(value type = :py:obj:`~.cudaMemLocationType`) The location type for the\n'
+        'mempool. For imported memory pools where the device is not directly visible\n'
+        'to the importing process or pools imported via fabric handles across nodes\n'
+        'this will be cudaMemLocationTypeInvisible\n'
     ){{endif}}
     {{if 'cudaMemPoolAttrMaxPoolSize' in found_values}}
 
@@ -5255,7 +5268,8 @@ class cudaMemAllocationHandleType(_FastEnum):
 
     cudaMemHandleTypeFabric = (
         cyruntime.cudaMemAllocationHandleType.cudaMemHandleTypeFabric,
-        'Allows a fabric handle to be used for exporting. (cudaMemFabricHandle_t)\n'
+        'Allows a fabric handle to be used for exporting.\n'
+        '(:py:obj:`~.cudaMemFabricHandle_t`)\n'
     ){{endif}}
 
 {{endif}}
@@ -5976,13 +5990,13 @@ class cudaKernelFunctionType(_FastEnum):
 
     cudaKernelFunctionTypeKernel = (
         cyruntime.cudaKernelFunctionType.cudaKernelFunctionTypeKernel,
-        'Function handle is a cudaKernel_t\n'
+        'Function handle is a :py:obj:`~.cudaKernel_t`\n'
     ){{endif}}
     {{if 'cudaKernelFunctionTypeFunction' in found_values}}
 
     cudaKernelFunctionTypeFunction = (
         cyruntime.cudaKernelFunctionType.cudaKernelFunctionTypeFunction,
-        'Function handle is a cudaFunction_t\n'
+        'Function handle is a :py:obj:`~.cudaFunction_t`\n'
     ){{endif}}
 
 {{endif}}
@@ -6376,7 +6390,7 @@ class cudaGraphDebugDotFlags(_FastEnum):
 
     cudaGraphDebugDotFlagsEventNodeParams = (
         cyruntime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsEventNodeParams,
-        'Adds cudaEvent_t handle from record and wait nodes to output\n'
+        'Adds :py:obj:`~.cudaEvent_t` handle from record and wait nodes to output\n'
     ){{endif}}
     {{if 'cudaGraphDebugDotFlagsExtSemasSignalNodeParams' in found_values}}
 
@@ -6710,41 +6724,41 @@ class cudaStreamAttrID(_FastEnum):
     cudaLaunchAttributeAccessPolicyWindow = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeAccessPolicyWindow,
         'Valid for streams, graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::accessPolicyWindow.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.accessPolicyWindow`.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeCooperative' in found_values}}
 
     cudaLaunchAttributeCooperative = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeCooperative,
         'Valid for graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::cooperative.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.cooperative`.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeSynchronizationPolicy' in found_values}}
 
     cudaLaunchAttributeSynchronizationPolicy = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeSynchronizationPolicy,
-        'Valid for streams. See :py:obj:`~.cudaLaunchAttributeValue`::syncPolicy.\n'
+        'Valid for streams. See :py:obj:`~.cudaLaunchAttributeValue.syncPolicy`.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeClusterDimension' in found_values}}
 
     cudaLaunchAttributeClusterDimension = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeClusterDimension,
         'Valid for graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::clusterDim.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.clusterDim`.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeClusterSchedulingPolicyPreference' in found_values}}
 
     cudaLaunchAttributeClusterSchedulingPolicyPreference = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeClusterSchedulingPolicyPreference,
         'Valid for graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::clusterSchedulingPolicyPreference.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.clusterSchedulingPolicyPreference`.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeProgrammaticStreamSerialization' in found_values}}
 
     cudaLaunchAttributeProgrammaticStreamSerialization = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeProgrammaticStreamSerialization,
         'Valid for launches. Setting\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::programmaticStreamSerializationAllowed\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.programmaticStreamSerializationAllowed`\n'
         'to non-0 signals that the kernel will use programmatic means to resolve its\n'
         'stream dependency, so that the CUDA runtime should opportunistically allow\n'
         "the grid's execution to overlap with the previous kernel in the stream, if\n"
@@ -6757,11 +6771,11 @@ class cudaStreamAttrID(_FastEnum):
     cudaLaunchAttributeProgrammaticEvent = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeProgrammaticEvent,
         'Valid for launches. Set\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::programmaticEvent to record the\n'
-        'event. Event recorded through this launch attribute is guaranteed to only\n'
-        'trigger after all block in the associated kernel trigger the event. A block\n'
-        'can trigger the event programmatically in a future CUDA release. A trigger\n'
-        "can also be inserted at the beginning of each block's execution if\n"
+        ':py:obj:`~.cudaLaunchAttributeValue.programmaticEvent` to record the event.\n'
+        'Event recorded through this launch attribute is guaranteed to only trigger\n'
+        'after all blocks in the associated kernel trigger the event. A block can\n'
+        'trigger the event programmatically in a future CUDA release. A trigger can\n'
+        "also be inserted at the beginning of each block's execution if\n"
         'triggerAtBlockStart is set to non-0. The dependent launches can choose to\n'
         'wait on the dependency using the programmatic sync\n'
         '(cudaGridDependencySynchronize() or equivalent PTX instructions). Note that\n'
@@ -6782,28 +6796,28 @@ class cudaStreamAttrID(_FastEnum):
     cudaLaunchAttributePriority = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributePriority,
         'Valid for streams, graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::priority.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.priority`.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeMemSyncDomainMap' in found_values}}
 
     cudaLaunchAttributeMemSyncDomainMap = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeMemSyncDomainMap,
         'Valid for streams, graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::memSyncDomainMap.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.memSyncDomainMap`.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeMemSyncDomain' in found_values}}
 
     cudaLaunchAttributeMemSyncDomain = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeMemSyncDomain,
         'Valid for streams, graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::memSyncDomain.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.memSyncDomain`.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributePreferredClusterDimension' in found_values}}
 
     cudaLaunchAttributePreferredClusterDimension = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributePreferredClusterDimension,
         'Valid for graph nodes and launches. Set\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::preferredClusterDim to allow the\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.preferredClusterDim` to allow the\n'
         'kernel launch to specify a preferred substitute cluster dimension. Blocks\n'
         'may be grouped according to either the dimensions specified with this\n'
         'attribute (grouped into a "preferred substitute cluster"), or the one\n'
@@ -6837,7 +6851,7 @@ class cudaStreamAttrID(_FastEnum):
     cudaLaunchAttributeLaunchCompletionEvent = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeLaunchCompletionEvent,
         'Valid for launches. Set\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::launchCompletionEvent to record the\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.launchCompletionEvent` to record the\n'
         'event.\n'
         ' Nominally, the event is triggered once all blocks of the kernel have begun\n'
         'execution. Currently this is a best effort. If a kernel B has a launch\n'
@@ -6864,7 +6878,7 @@ class cudaStreamAttrID(_FastEnum):
         'only be set to 0 or 1. Setting the field to 1 indicates that the\n'
         'corresponding kernel node should be device-updatable. On success, a handle\n'
         'will be returned via\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::deviceUpdatableKernelNode::devNode\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.deviceUpdatableKernelNode.devNode`\n'
         'which can be passed to the various device-side update functions to update\n'
         "the node's kernel parameters from within another kernel. For more\n"
         'information on the types of device updates that can be made, as well as the\n'
@@ -6892,7 +6906,7 @@ class cudaStreamAttrID(_FastEnum):
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributePreferredSharedMemoryCarveout,
         'Valid for launches. On devices where the L1 cache and shared memory use the\n'
         'same hardware resources, setting\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::sharedMemCarveout to a percentage\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.sharedMemCarveout` to a percentage\n'
         'between 0-100 signals sets the shared memory carveout preference in percent\n'
         'of the total shared memory for that kernel launch. This attribute takes\n'
         'precedence over :py:obj:`~.cudaFuncAttributePreferredSharedMemoryCarveout`.\n'
@@ -6918,7 +6932,7 @@ class cudaStreamAttrID(_FastEnum):
         'not improve the performance of either the targeted kernel or the\n'
         'encapsulating application.\n'
         ' Valid values for\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::nvlinkUtilCentricScheduling are 0\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.nvlinkUtilCentricScheduling` are 0\n'
         '(disabled) and 1 (enabled).\n'
     ){{endif}}
     {{if 'cudaLaunchAttributePortableClusterSizeMode' in found_values}}
@@ -6927,8 +6941,8 @@ class cudaStreamAttrID(_FastEnum):
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributePortableClusterSizeMode,
         'Valid for graph nodes, launches. This indicates whether the kernel launch\n'
         'is allowed to use a non-portable cluster size. Valid values for\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::portableClusterSizeMode are values\n'
-        'for :py:obj:`~.cudaLaunchAttributePortableClusterMode` Any other value will\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.portableClusterSizeMode` are values for\n'
+        ':py:obj:`~.cudaLaunchAttributePortableClusterMode` Any other value will\n'
         'return :py:obj:`~.cudaErrorInvalidValue`\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeSharedMemoryMode' in found_values}}
@@ -6958,41 +6972,41 @@ class cudaKernelNodeAttrID(_FastEnum):
     cudaLaunchAttributeAccessPolicyWindow = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeAccessPolicyWindow,
         'Valid for streams, graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::accessPolicyWindow.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.accessPolicyWindow`.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeCooperative' in found_values}}
 
     cudaLaunchAttributeCooperative = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeCooperative,
         'Valid for graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::cooperative.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.cooperative`.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeSynchronizationPolicy' in found_values}}
 
     cudaLaunchAttributeSynchronizationPolicy = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeSynchronizationPolicy,
-        'Valid for streams. See :py:obj:`~.cudaLaunchAttributeValue`::syncPolicy.\n'
+        'Valid for streams. See :py:obj:`~.cudaLaunchAttributeValue.syncPolicy`.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeClusterDimension' in found_values}}
 
     cudaLaunchAttributeClusterDimension = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeClusterDimension,
         'Valid for graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::clusterDim.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.clusterDim`.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeClusterSchedulingPolicyPreference' in found_values}}
 
     cudaLaunchAttributeClusterSchedulingPolicyPreference = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeClusterSchedulingPolicyPreference,
         'Valid for graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::clusterSchedulingPolicyPreference.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.clusterSchedulingPolicyPreference`.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeProgrammaticStreamSerialization' in found_values}}
 
     cudaLaunchAttributeProgrammaticStreamSerialization = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeProgrammaticStreamSerialization,
         'Valid for launches. Setting\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::programmaticStreamSerializationAllowed\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.programmaticStreamSerializationAllowed`\n'
         'to non-0 signals that the kernel will use programmatic means to resolve its\n'
         'stream dependency, so that the CUDA runtime should opportunistically allow\n'
         "the grid's execution to overlap with the previous kernel in the stream, if\n"
@@ -7005,11 +7019,11 @@ class cudaKernelNodeAttrID(_FastEnum):
     cudaLaunchAttributeProgrammaticEvent = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeProgrammaticEvent,
         'Valid for launches. Set\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::programmaticEvent to record the\n'
-        'event. Event recorded through this launch attribute is guaranteed to only\n'
-        'trigger after all block in the associated kernel trigger the event. A block\n'
-        'can trigger the event programmatically in a future CUDA release. A trigger\n'
-        "can also be inserted at the beginning of each block's execution if\n"
+        ':py:obj:`~.cudaLaunchAttributeValue.programmaticEvent` to record the event.\n'
+        'Event recorded through this launch attribute is guaranteed to only trigger\n'
+        'after all block in the associated kernel trigger the event. A block can\n'
+        'trigger the event programmatically in a future CUDA release. A trigger can\n'
+        "also be inserted at the beginning of each block's execution if\n"
         'triggerAtBlockStart is set to non-0. The dependent launches can choose to\n'
         'wait on the dependency using the programmatic sync\n'
         '(cudaGridDependencySynchronize() or equivalent PTX instructions). Note that\n'
@@ -7030,28 +7044,28 @@ class cudaKernelNodeAttrID(_FastEnum):
     cudaLaunchAttributePriority = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributePriority,
         'Valid for streams, graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::priority.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.priority`.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeMemSyncDomainMap' in found_values}}
 
     cudaLaunchAttributeMemSyncDomainMap = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeMemSyncDomainMap,
         'Valid for streams, graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::memSyncDomainMap.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.memSyncDomainMap`.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeMemSyncDomain' in found_values}}
 
     cudaLaunchAttributeMemSyncDomain = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeMemSyncDomain,
         'Valid for streams, graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::memSyncDomain.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.memSyncDomain`.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributePreferredClusterDimension' in found_values}}
 
     cudaLaunchAttributePreferredClusterDimension = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributePreferredClusterDimension,
         'Valid for graph nodes and launches. Set\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::preferredClusterDim to allow the\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.preferredClusterDim` to allow the\n'
         'kernel launch to specify a preferred substitute cluster dimension. Blocks\n'
         'may be grouped according to either the dimensions specified with this\n'
         'attribute (grouped into a "preferred substitute cluster"), or the one\n'
@@ -7085,7 +7099,7 @@ class cudaKernelNodeAttrID(_FastEnum):
     cudaLaunchAttributeLaunchCompletionEvent = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeLaunchCompletionEvent,
         'Valid for launches. Set\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::launchCompletionEvent to record the\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.launchCompletionEvent` to record the\n'
         'event.\n'
         ' Nominally, the event is triggered once all blocks of the kernel have begun\n'
         'execution. Currently this is a best effort. If a kernel B has a launch\n'
@@ -7112,7 +7126,7 @@ class cudaKernelNodeAttrID(_FastEnum):
         'only be set to 0 or 1. Setting the field to 1 indicates that the\n'
         'corresponding kernel node should be device-updatable. On success, a handle\n'
         'will be returned via\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::deviceUpdatableKernelNode::devNode\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.deviceUpdatableKernelNode.devNode`\n'
         'which can be passed to the various device-side update functions to update\n'
         "the node's kernel parameters from within another kernel. For more\n"
         'information on the types of device updates that can be made, as well as the\n'
@@ -7140,7 +7154,7 @@ class cudaKernelNodeAttrID(_FastEnum):
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributePreferredSharedMemoryCarveout,
         'Valid for launches. On devices where the L1 cache and shared memory use the\n'
         'same hardware resources, setting\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::sharedMemCarveout to a percentage\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.sharedMemCarveout` to a percentage\n'
         'between 0-100 signals sets the shared memory carveout preference in percent\n'
         'of the total shared memory for that kernel launch. This attribute takes\n'
         'precedence over :py:obj:`~.cudaFuncAttributePreferredSharedMemoryCarveout`.\n'
@@ -7166,7 +7180,7 @@ class cudaKernelNodeAttrID(_FastEnum):
         'not improve the performance of either the targeted kernel or the\n'
         'encapsulating application.\n'
         ' Valid values for\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::nvlinkUtilCentricScheduling are 0\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.nvlinkUtilCentricScheduling` are 0\n'
         '(disabled) and 1 (enabled).\n'
     ){{endif}}
     {{if 'cudaLaunchAttributePortableClusterSizeMode' in found_values}}
@@ -7175,8 +7189,8 @@ class cudaKernelNodeAttrID(_FastEnum):
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributePortableClusterSizeMode,
         'Valid for graph nodes, launches. This indicates whether the kernel launch\n'
         'is allowed to use a non-portable cluster size. Valid values for\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::portableClusterSizeMode are values\n'
-        'for :py:obj:`~.cudaLaunchAttributePortableClusterMode` Any other value will\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.portableClusterSizeMode` are values for\n'
+        ':py:obj:`~.cudaLaunchAttributePortableClusterMode` Any other value will\n'
         'return :py:obj:`~.cudaErrorInvalidValue`\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeSharedMemoryMode' in found_values}}
@@ -11654,7 +11668,7 @@ cdef class cudaMemAllocNodeParams:
     {{endif}}
     {{if 'cudaMemAllocNodeParams.accessDescCount' in found_struct}}
     accessDescCount : size_t
-        in: Number of `accessDescs`s
+        in: Number of `accessDescs`s
     {{endif}}
     {{if 'cudaMemAllocNodeParams.bytesize' in found_struct}}
     bytesize : size_t
@@ -11805,7 +11819,7 @@ cdef class cudaMemAllocNodeParamsV2:
     {{endif}}
     {{if 'cudaMemAllocNodeParamsV2.accessDescCount' in found_struct}}
     accessDescCount : size_t
-        in: Number of `accessDescs`s
+        in: Number of `accessDescs`s
     {{endif}}
     {{if 'cudaMemAllocNodeParamsV2.bytesize' in found_struct}}
     bytesize : size_t
@@ -16419,8 +16433,9 @@ cdef class cudaDevSmResourceGroupParams_st:
     {{endif}}
     {{if 'cudaDevSmResourceGroupParams_st.flags' in found_struct}}
     flags : unsigned int
-        Combination of `cudaDevSmResourceGroup_flags` values to indicate
-        this this group is created.
+        Combination of
+        `cudaDevSmResourceGroup_flags` values
+        to indicate this this group is created.
     {{endif}}
     {{if 'cudaDevSmResourceGroupParams_st.reserved' in found_struct}}
     reserved : list[unsigned int]
@@ -19664,11 +19679,12 @@ cdef class cudaLaunchAttributeValue:
     {{if 'cudaLaunchAttributeValue.programmaticEvent' in found_struct}}
     programmaticEvent : anon_struct18
         Value of launch attribute cudaLaunchAttributeProgrammaticEvent with
-        the following fields: - `cudaEvent_t` event - Event to fire when
-        all blocks trigger it.    - `int` flags; - Event record flags, see
-        cudaEventRecordWithFlags. Does not accept cudaEventRecordExternal.
-        - `int` triggerAtBlockStart - If this is set to non-0, each block
-        launch will automatically trigger the event.
+        the following fields: - `cudaEvent_t` event - Event to
+        fire when all blocks trigger it.    - `int` flags; - Event record
+        flags, see cudaEventRecordWithFlags. Does not accept
+        cudaEventRecordExternal.    - `int` triggerAtBlockStart - If this
+        is set to non-0, each block launch will automatically trigger the
+        event.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.priority' in found_struct}}
     priority : int
@@ -19703,9 +19719,9 @@ cdef class cudaLaunchAttributeValue:
     {{if 'cudaLaunchAttributeValue.launchCompletionEvent' in found_struct}}
     launchCompletionEvent : anon_struct20
         Value of launch attribute cudaLaunchAttributeLaunchCompletionEvent
-        with the following fields: - `cudaEvent_t` event - Event to fire
-        when the last block launches.    - `int` flags - Event record
-        flags, see cudaEventRecordWithFlags. Does not accept
+        with the following fields: - `cudaEvent_t` event - Event
+        to fire when the last block launches.    - `int` flags - Event
+        record flags, see cudaEventRecordWithFlags. Does not accept
         cudaEventRecordExternal.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.deviceUpdatableKernelNode' in found_struct}}
@@ -19714,8 +19730,8 @@ cdef class cudaLaunchAttributeValue:
         cudaLaunchAttributeDeviceUpdatableKernelNode with the following
         fields: - `int` deviceUpdatable - Whether or not the resulting
         kernel node should be device-updatable.    -
-        `cudaGraphDeviceNode_t` devNode - Returns a handle to pass to the
-        various device-side update functions.
+        `cudaGraphDeviceNode_t` devNode - Returns a
+        handle to pass to the various device-side update functions.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.sharedMemCarveout' in found_struct}}
     sharedMemCarveout : unsigned int
@@ -22131,9 +22147,11 @@ def cudaDeviceFlushGPUDirectRDMAWrites(target not None : cudaFlushGPUDirectRDMAW
     Parameters
     ----------
     target : :py:obj:`~.cudaFlushGPUDirectRDMAWritesTarget`
-        The target of the operation, see cudaFlushGPUDirectRDMAWritesTarget
+        The target of the operation, see
+        :py:obj:`~.cudaFlushGPUDirectRDMAWritesTarget`
     scope : :py:obj:`~.cudaFlushGPUDirectRDMAWritesScope`
-        The scope of the operation, see cudaFlushGPUDirectRDMAWritesScope
+        The scope of the operation, see
+        :py:obj:`~.cudaFlushGPUDirectRDMAWritesScope`
 
     Returns
     -------
@@ -25223,88 +25241,83 @@ def cudaImportExternalMemory(memHandleDesc : Optional[cudaExternalMemoryHandleDe
 
     If :py:obj:`~.cudaExternalMemoryHandleDesc.type` is
     :py:obj:`~.cudaExternalMemoryHandleTypeOpaqueFd`, then
-    :py:obj:`~.cudaExternalMemoryHandleDesc`::handle::fd must be a valid
-    file descriptor referencing a memory object. Ownership of the file
+    :py:obj:`~.cudaExternalMemoryHandleDesc.handle.fd` must be a valid file
+    descriptor referencing a memory object. Ownership of the file
     descriptor is transferred to the CUDA driver when the handle is
     imported successfully. Performing any operations on the file descriptor
     after it is imported results in undefined behavior.
 
     If :py:obj:`~.cudaExternalMemoryHandleDesc.type` is
     :py:obj:`~.cudaExternalMemoryHandleTypeOpaqueWin32`, then exactly one
-    of :py:obj:`~.cudaExternalMemoryHandleDesc`::handle::win32::handle and
-    :py:obj:`~.cudaExternalMemoryHandleDesc`::handle::win32::name must not
-    be NULL. If
-    :py:obj:`~.cudaExternalMemoryHandleDesc`::handle::win32::handle is not
-    NULL, then it must represent a valid shared NT handle that references a
-    memory object. Ownership of this handle is not transferred to CUDA
-    after the import operation, so the application must release the handle
-    using the appropriate system call. If
-    :py:obj:`~.cudaExternalMemoryHandleDesc`::handle::win32::name is not
-    NULL, then it must point to a NULL-terminated array of UTF-16
-    characters that refers to a memory object.
+    of :py:obj:`~.cudaExternalMemoryHandleDesc.handle.win32.handle` and
+    :py:obj:`~.cudaExternalMemoryHandleDesc.handle.win32.name` must not be
+    NULL. If :py:obj:`~.cudaExternalMemoryHandleDesc.handle.win32.handle`
+    is not NULL, then it must represent a valid shared NT handle that
+    references a memory object. Ownership of this handle is not transferred
+    to CUDA after the import operation, so the application must release the
+    handle using the appropriate system call. If
+    :py:obj:`~.cudaExternalMemoryHandleDesc.handle.win32.name` is not NULL,
+    then it must point to a NULL-terminated array of UTF-16 characters that
+    refers to a memory object.
 
     If :py:obj:`~.cudaExternalMemoryHandleDesc.type` is
     :py:obj:`~.cudaExternalMemoryHandleTypeOpaqueWin32Kmt`, then
-    :py:obj:`~.cudaExternalMemoryHandleDesc`::handle::win32::handle must be
-    non-NULL and
-    :py:obj:`~.cudaExternalMemoryHandleDesc`::handle::win32::name must be
-    NULL. The handle specified must be a globally shared KMT handle. This
-    handle does not hold a reference to the underlying object, and thus
-    will be invalid when all references to the memory object are destroyed.
+    :py:obj:`~.cudaExternalMemoryHandleDesc.handle.win32.handle` must be
+    non-NULL and :py:obj:`~.cudaExternalMemoryHandleDesc.handle.win32.name`
+    must be NULL. The handle specified must be a globally shared KMT
+    handle. This handle does not hold a reference to the underlying object,
+    and thus will be invalid when all references to the memory object are
+    destroyed.
 
     If :py:obj:`~.cudaExternalMemoryHandleDesc.type` is
     :py:obj:`~.cudaExternalMemoryHandleTypeD3D12Heap`, then exactly one of
-    :py:obj:`~.cudaExternalMemoryHandleDesc`::handle::win32::handle and
-    :py:obj:`~.cudaExternalMemoryHandleDesc`::handle::win32::name must not
-    be NULL. If
-    :py:obj:`~.cudaExternalMemoryHandleDesc`::handle::win32::handle is not
-    NULL, then it must represent a valid shared NT handle that is returned
-    by ID3D12Device::CreateSharedHandle when referring to a ID3D12Heap
-    object. This handle holds a reference to the underlying object. If
-    :py:obj:`~.cudaExternalMemoryHandleDesc`::handle::win32::name is not
-    NULL, then it must point to a NULL-terminated array of UTF-16
+    :py:obj:`~.cudaExternalMemoryHandleDesc.handle.win32.handle` and
+    :py:obj:`~.cudaExternalMemoryHandleDesc.handle.win32.name` must not be
+    NULL. If :py:obj:`~.cudaExternalMemoryHandleDesc.handle.win32.handle`
+    is not NULL, then it must represent a valid shared NT handle that is
+    returned by ID3D12Device::CreateSharedHandle when referring to a
+    ID3D12Heap object. This handle holds a reference to the underlying
+    object. If :py:obj:`~.cudaExternalMemoryHandleDesc.handle.win32.name`
+    is not NULL, then it must point to a NULL-terminated array of UTF-16
     characters that refers to a ID3D12Heap object.
 
     If :py:obj:`~.cudaExternalMemoryHandleDesc.type` is
     :py:obj:`~.cudaExternalMemoryHandleTypeD3D12Resource`, then exactly one
-    of :py:obj:`~.cudaExternalMemoryHandleDesc`::handle::win32::handle and
-    :py:obj:`~.cudaExternalMemoryHandleDesc`::handle::win32::name must not
-    be NULL. If
-    :py:obj:`~.cudaExternalMemoryHandleDesc`::handle::win32::handle is not
-    NULL, then it must represent a valid shared NT handle that is returned
-    by ID3D12Device::CreateSharedHandle when referring to a ID3D12Resource
-    object. This handle holds a reference to the underlying object. If
-    :py:obj:`~.cudaExternalMemoryHandleDesc`::handle::win32::name is not
-    NULL, then it must point to a NULL-terminated array of UTF-16
+    of :py:obj:`~.cudaExternalMemoryHandleDesc.handle.win32.handle` and
+    :py:obj:`~.cudaExternalMemoryHandleDesc.handle.win32.name` must not be
+    NULL. If :py:obj:`~.cudaExternalMemoryHandleDesc.handle.win32.handle`
+    is not NULL, then it must represent a valid shared NT handle that is
+    returned by ID3D12Device::CreateSharedHandle when referring to a
+    ID3D12Resource object. This handle holds a reference to the underlying
+    object. If :py:obj:`~.cudaExternalMemoryHandleDesc.handle.win32.name`
+    is not NULL, then it must point to a NULL-terminated array of UTF-16
     characters that refers to a ID3D12Resource object.
 
     If :py:obj:`~.cudaExternalMemoryHandleDesc.type` is
     :py:obj:`~.cudaExternalMemoryHandleTypeD3D11Resource`,then exactly one
-    of :py:obj:`~.cudaExternalMemoryHandleDesc`::handle::win32::handle and
-    :py:obj:`~.cudaExternalMemoryHandleDesc`::handle::win32::name must not
-    be NULL. If
-    :py:obj:`~.cudaExternalMemoryHandleDesc`::handle::win32::handle is
-    not NULL, then it must represent a valid shared NT handle that is
+    of :py:obj:`~.cudaExternalMemoryHandleDesc.handle.win32.handle` and
+    :py:obj:`~.cudaExternalMemoryHandleDesc.handle.win32.name` must not be
+    NULL. If :py:obj:`~.cudaExternalMemoryHandleDesc.handle.win32.handle`
+    is not NULL, then it must represent a valid shared NT handle that is
     returned by IDXGIResource1::CreateSharedHandle when referring to a
     ID3D11Resource object. If
-    :py:obj:`~.cudaExternalMemoryHandleDesc`::handle::win32::name is not
-    NULL, then it must point to a NULL-terminated array of UTF-16
-    characters that refers to a ID3D11Resource object.
+    :py:obj:`~.cudaExternalMemoryHandleDesc.handle.win32.name` is not NULL,
+    then it must point to a NULL-terminated array of UTF-16 characters that
+    refers to a ID3D11Resource object.
 
     If :py:obj:`~.cudaExternalMemoryHandleDesc.type` is
     :py:obj:`~.cudaExternalMemoryHandleTypeD3D11ResourceKmt`, then
-    :py:obj:`~.cudaExternalMemoryHandleDesc`::handle::win32::handle must be
-    non-NULL and
-    :py:obj:`~.cudaExternalMemoryHandleDesc`::handle::win32::name must be
-    NULL. The handle specified must be a valid shared KMT handle that is
-    returned by IDXGIResource::GetSharedHandle when referring to a
+    :py:obj:`~.cudaExternalMemoryHandleDesc.handle.win32.handle` must be
+    non-NULL and :py:obj:`~.cudaExternalMemoryHandleDesc.handle.win32.name`
+    must be NULL. The handle specified must be a valid shared KMT handle
+    that is returned by IDXGIResource::GetSharedHandle when referring to a
     ID3D11Resource object.
 
     If :py:obj:`~.cudaExternalMemoryHandleDesc.type` is
     :py:obj:`~.cudaExternalMemoryHandleTypeNvSciBuf`, then
-    :py:obj:`~.cudaExternalMemoryHandleDesc`::handle::nvSciBufObject must
-    be NON-NULL and reference a valid NvSciBuf object. If the NvSciBuf
-    object imported into CUDA is also mapped by other drivers, then the
+    :py:obj:`~.cudaExternalMemoryHandleDesc.handle.nvSciBufObject` must be
+    NON-NULL and reference a valid NvSciBuf object. If the NvSciBuf object
+    imported into CUDA is also mapped by other drivers, then the
     application must use :py:obj:`~.cudaWaitExternalSemaphoresAsync` or
     :py:obj:`~.cudaSignalExternalSemaphoresAsync` as approprriate barriers
     to maintain coherence between CUDA and the other drivers. See
@@ -25555,7 +25568,7 @@ def cudaImportExternalSemaphore(semHandleDesc : Optional[cudaExternalSemaphoreHa
 
     If :py:obj:`~.cudaExternalSemaphoreHandleDesc.type` is
     :py:obj:`~.cudaExternalSemaphoreHandleTypeOpaqueFd`, then
-    :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::fd must be a valid
+    :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.fd` must be a valid
     file descriptor referencing a synchronization object. Ownership of the
     file descriptor is transferred to the CUDA driver when the handle is
     imported successfully. Performing any operations on the file descriptor
@@ -25563,80 +25576,78 @@ def cudaImportExternalSemaphore(semHandleDesc : Optional[cudaExternalSemaphoreHa
 
     If :py:obj:`~.cudaExternalSemaphoreHandleDesc.type` is
     :py:obj:`~.cudaExternalSemaphoreHandleTypeOpaqueWin32`, then exactly
-    one of
-    :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::win32::handle and
-    :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::win32::name must
+    one of :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.win32.handle`
+    and :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.win32.name` must
     not be NULL. If
-    :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::win32::handle is
-    not NULL, then it must represent a valid shared NT handle that
-    references a synchronization object. Ownership of this handle is not
-    transferred to CUDA after the import operation, so the application must
-    release the handle using the appropriate system call. If
-    :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::win32::name is not
+    :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.win32.handle` is not
+    NULL, then it must represent a valid shared NT handle that references a
+    synchronization object. Ownership of this handle is not transferred to
+    CUDA after the import operation, so the application must release the
+    handle using the appropriate system call. If
+    :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.win32.name` is not
     NULL, then it must name a valid synchronization object.
 
     If :py:obj:`~.cudaExternalSemaphoreHandleDesc.type` is
     :py:obj:`~.cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt`, then
-    :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::win32::handle must
-    be non-NULL and
-    :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::win32::name must
-    be NULL. The handle specified must be a globally shared KMT handle.
-    This handle does not hold a reference to the underlying object, and
-    thus will be invalid when all references to the synchronization object
-    are destroyed.
+    :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.win32.handle` must be
+    non-NULL and
+    :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.win32.name` must be
+    NULL. The handle specified must be a globally shared KMT handle. This
+    handle does not hold a reference to the underlying object, and thus
+    will be invalid when all references to the synchronization object are
+    destroyed.
 
     If :py:obj:`~.cudaExternalSemaphoreHandleDesc.type` is
     :py:obj:`~.cudaExternalSemaphoreHandleTypeD3D12Fence`, then exactly one
-    of :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::win32::handle
-    and :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::win32::name
-    must not be NULL. If
-    :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::win32::handle is
-    not NULL, then it must represent a valid shared NT handle that is
-    returned by ID3D12Device::CreateSharedHandle when referring to a
-    ID3D12Fence object. This handle holds a reference to the underlying
-    object. If
-    :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::win32::name is not
+    of :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.win32.handle` and
+    :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.win32.name` must not
+    be NULL. If
+    :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.win32.handle` is not
+    NULL, then it must represent a valid shared NT handle that is returned
+    by ID3D12Device::CreateSharedHandle when referring to a ID3D12Fence
+    object. This handle holds a reference to the underlying object. If
+    :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.win32.name` is not
     NULL, then it must name a valid synchronization object that refers to a
     valid ID3D12Fence object.
 
     If :py:obj:`~.cudaExternalSemaphoreHandleDesc.type` is
     :py:obj:`~.cudaExternalSemaphoreHandleTypeD3D11Fence`, then exactly one
-    of :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::win32::handle
-    and :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::win32::name
-    must not be NULL. If
-    :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::win32::handle is
-    not NULL, then it must represent a valid shared NT handle that is
-    returned by ID3D11Fence::CreateSharedHandle. If
-    :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::win32::name is not
+    of :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.win32.handle` and
+    :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.win32.name` must not
+    be NULL. If
+    :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.win32.handle` is not
+    NULL, then it must represent a valid shared NT handle that is returned
+    by ID3D11Fence::CreateSharedHandle. If
+    :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.win32.name` is not
     NULL, then it must name a valid synchronization object that refers to a
     valid ID3D11Fence object.
 
     If :py:obj:`~.cudaExternalSemaphoreHandleDesc.type` is
     :py:obj:`~.cudaExternalSemaphoreHandleTypeNvSciSync`, then
-    :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::nvSciSyncObj
+    :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.nvSciSyncObj`
     represents a valid NvSciSyncObj.
 
     :py:obj:`~.cudaExternalSemaphoreHandleTypeKeyedMutex`, then exactly one
-    of :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::win32::handle
-    and :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::win32::name
-    must not be NULL. If
-    :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::win32::handle is
-    not NULL, then it represent a valid shared NT handle that is returned
-    by IDXGIResource1::CreateSharedHandle when referring to a
-    IDXGIKeyedMutex object.
+    of :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.win32.handle` and
+    :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.win32.name` must not
+    be NULL. If
+    :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.win32.handle` is not
+    NULL, then it represents a valid shared NT handle that is returned by
+    IDXGIResource1::CreateSharedHandle when referring to a IDXGIKeyedMutex
+    object.
 
     If :py:obj:`~.cudaExternalSemaphoreHandleDesc.type` is
     :py:obj:`~.cudaExternalSemaphoreHandleTypeKeyedMutexKmt`, then
-    :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::win32::handle must
-    be non-NULL and
-    :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::win32::name must
-    be NULL. The handle specified must represent a valid KMT handle that is
+    :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.win32.handle` must be
+    non-NULL and
+    :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.win32.name` must be
+    NULL. The handle specified must represent a valid KMT handle that is
     returned by IDXGIResource::GetSharedHandle when referring to a
     IDXGIKeyedMutex object.
 
     If :py:obj:`~.cudaExternalSemaphoreHandleDesc.type` is
     :py:obj:`~.cudaExternalSemaphoreHandleTypeTimelineSemaphoreFd`, then
-    :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::fd must be a valid
+    :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.fd` must be a valid
     file descriptor referencing a synchronization object. Ownership of the
     file descriptor is transferred to the CUDA driver when the handle is
     imported successfully. Performing any operations on the file descriptor
@@ -25645,15 +25656,15 @@ def cudaImportExternalSemaphore(semHandleDesc : Optional[cudaExternalSemaphoreHa
     If :py:obj:`~.cudaExternalSemaphoreHandleDesc.type` is
     :py:obj:`~.cudaExternalSemaphoreHandleTypeTimelineSemaphoreWin32`, then
     exactly one of
-    :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::win32::handle and
-    :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::win32::name must
-    not be NULL. If
-    :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::win32::handle is
-    not NULL, then it must represent a valid shared NT handle that
-    references a synchronization object. Ownership of this handle is not
-    transferred to CUDA after the import operation, so the application must
-    release the handle using the appropriate system call. If
-    :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::win32::name is not
+    :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.win32.handle` and
+    :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.win32.name` must not
+    be NULL. If
+    :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.win32.handle` is not
+    NULL, then it must represent a valid shared NT handle that references a
+    synchronization object. Ownership of this handle is not transferred to
+    CUDA after the import operation, so the application must release the
+    handle using the appropriate system call. If
+    :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.win32.name` is not
     NULL, then it must name a valid synchronization object.
 
     Parameters
@@ -25706,15 +25717,15 @@ def cudaSignalExternalSemaphoresAsync(extSemArray : Optional[tuple[cudaExternalS
     :py:obj:`~.cudaExternalSemaphoreHandleTypeTimelineSemaphoreFd`,
     :py:obj:`~.cudaExternalSemaphoreHandleTypeTimelineSemaphoreWin32` then
     the semaphore will be set to the value specified in
-    :py:obj:`~.cudaExternalSemaphoreSignalParams`::params::fence::value.
+    :py:obj:`~.cudaExternalSemaphoreSignalParams.params.fence.value`.
 
     If the semaphore object is of the type
     :py:obj:`~.cudaExternalSemaphoreHandleTypeNvSciSync` this API sets
-    :py:obj:`~.cudaExternalSemaphoreSignalParams`::params::nvSciSync::fence
-    to a value that can be used by subsequent waiters of the same NvSciSync
+    :py:obj:`~.cudaExternalSemaphoreSignalParams.params.nvSciSync.fence` to
+    a value that can be used by subsequent waiters of the same NvSciSync
     object to order operations with those currently submitted in `stream`.
     Such an update will overwrite previous contents of
-    :py:obj:`~.cudaExternalSemaphoreSignalParams`::params::nvSciSync::fence.
+    :py:obj:`~.cudaExternalSemaphoreSignalParams.params.nvSciSync.fence`.
     By default, signaling such an external semaphore object causes
     appropriate memory synchronization operations to be performed over all
     the external memory objects that are imported as
@@ -25731,7 +25742,7 @@ def cudaSignalExternalSemaphoresAsync(extSemArray : Optional[tuple[cudaExternalS
     in :py:obj:`~.cudaDeviceGetNvSciSyncAttributes` to
     cudaNvSciSyncAttrSignal, this API will return cudaErrorNotSupported.
 
-    :py:obj:`~.cudaExternalSemaphoreSignalParams`::params::nvSciSync::fence
+    :py:obj:`~.cudaExternalSemaphoreSignalParams.params.nvSciSync.fence`
     associated with semaphore object of the type
     :py:obj:`~.cudaExternalSemaphoreHandleTypeNvSciSync` can be
     deterministic. For this the NvSciSyncAttrList used to create the
@@ -25750,7 +25761,7 @@ def cudaSignalExternalSemaphoresAsync(extSemArray : Optional[tuple[cudaExternalS
     with deterministic fence support enabled in different streams or by
     adding explicit dependency amongst such streams so that the semaphore
     is signaled in order.
-    :py:obj:`~.cudaExternalSemaphoreSignalParams`::params::nvSciSync::fence
+    :py:obj:`~.cudaExternalSemaphoreSignalParams.params.nvSciSync.fence`
     associated with semaphore object of the type
     :py:obj:`~.cudaExternalSemaphoreHandleTypeNvSciSync` can be timestamp
     enabled. For this the NvSciSyncAttrList used to create the object must
@@ -25771,7 +25782,7 @@ def cudaSignalExternalSemaphoresAsync(extSemArray : Optional[tuple[cudaExternalS
     :py:obj:`~.cudaExternalSemaphoreHandleTypeKeyedMutex`,
     :py:obj:`~.cudaExternalSemaphoreHandleTypeKeyedMutexKmt`, then the
     keyed mutex will be released with the key specified in
-    :py:obj:`~.cudaExternalSemaphoreSignalParams`::params::keyedmutex::key.
+    :py:obj:`~.cudaExternalSemaphoreSignalParams.params.keyedmutex.key`.
 
     Parameters
     ----------
@@ -25866,14 +25877,14 @@ def cudaWaitExternalSemaphoresAsync(extSemArray : Optional[tuple[cudaExternalSem
     :py:obj:`~.cudaExternalSemaphoreHandleTypeTimelineSemaphoreWin32` then
     waiting on the semaphore will wait until the value of the semaphore is
     greater than or equal to
-    :py:obj:`~.cudaExternalSemaphoreWaitParams`::params::fence::value.
+    :py:obj:`~.cudaExternalSemaphoreWaitParams.params.fence.value`.
 
     If the semaphore object is of the type
     :py:obj:`~.cudaExternalSemaphoreHandleTypeNvSciSync` then, waiting on
     the semaphore will wait until the
-    :py:obj:`~.cudaExternalSemaphoreSignalParams`::params::nvSciSync::fence
-    is signaled by the signaler of the NvSciSyncObj that was associated
-    with this semaphore object. By default, waiting on such an external
+    :py:obj:`~.cudaExternalSemaphoreSignalParams.params.nvSciSync.fence` is
+    signaled by the signaler of the NvSciSyncObj that was associated with
+    this semaphore object. By default, waiting on such an external
     semaphore object causes appropriate memory synchronization operations
     to be performed over all external memory objects that are imported as
     :py:obj:`~.cudaExternalMemoryHandleTypeNvSciBuf`. This ensures that any
@@ -25893,10 +25904,9 @@ def cudaWaitExternalSemaphoresAsync(extSemArray : Optional[tuple[cudaExternalSem
     :py:obj:`~.cudaExternalSemaphoreHandleTypeKeyedMutex`,
     :py:obj:`~.cudaExternalSemaphoreHandleTypeKeyedMutexKmt`, then the
     keyed mutex will be acquired when it is released with the key specified
-    in
-    :py:obj:`~.cudaExternalSemaphoreSignalParams`::params::keyedmutex::key
+    in :py:obj:`~.cudaExternalSemaphoreSignalParams.params.keyedmutex.key`
     or until the timeout specified by
-    :py:obj:`~.cudaExternalSemaphoreSignalParams`::params::keyedmutex::timeoutMs
+    :py:obj:`~.cudaExternalSemaphoreSignalParams.params.keyedmutex.timeoutMs`
     has lapsed. The timeout interval can either be a finite value specified
     in milliseconds or an infinite value. In case an infinite value is
     specified the timeout never elapses. The windows INFINITE macro must be
@@ -26175,7 +26185,7 @@ def cudaFuncSetAttribute(func, attr not None : cudaFuncAttribute, int value):
 
     - :py:obj:`~.cudaFuncAttributeClusterSchedulingPolicyPreference`: The
       block scheduling policy of a function. The value type is
-      cudaClusterSchedulingPolicy.
+      :py:obj:`~.cudaClusterSchedulingPolicy`.
 
     cudaLaunchKernel (C++ API), cudaFuncSetCacheConfig (C++ API),
     :py:obj:`~.cudaFuncGetAttributes (C API)`,
@@ -27529,7 +27539,7 @@ def cudaMalloc3D(extent not None : cudaExtent):
 
     See Also
     --------
-    :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaMemcpy3D`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaMallocArray`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, make_cudaPitchedPtr, make_cudaExtent, :py:obj:`~.cuMemAllocPitch`
+    :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaMemcpy3D`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaMallocArray`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, :py:obj:`~.make_cudaPitchedPtr`, :py:obj:`~.make_cudaExtent`, :py:obj:`~.cuMemAllocPitch`
     """
     cdef cudaPitchedPtr pitchedDevPtr = cudaPitchedPtr()
     with nogil:
@@ -27651,7 +27661,7 @@ def cudaMalloc3DArray(desc : Optional[cudaChannelFormatDesc], extent not None :
 
     See Also
     --------
-    :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, make_cudaExtent, :py:obj:`~.cuArray3DCreate`
+    :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, :py:obj:`~.make_cudaExtent`, :py:obj:`~.cuArray3DCreate`
     """
     cdef cudaArray_t array = cudaArray_t()
     cdef cyruntime.cudaChannelFormatDesc* cydesc_ptr = <cyruntime.cudaChannelFormatDesc*>desc._pvt_ptr if desc is not None else NULL
@@ -27777,7 +27787,7 @@ def cudaMallocMipmappedArray(desc : Optional[cudaChannelFormatDesc], extent not
 
     See Also
     --------
-    :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, make_cudaExtent, :py:obj:`~.cuMipmappedArrayCreate`
+    :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, :py:obj:`~.make_cudaExtent`, :py:obj:`~.cuMipmappedArrayCreate`
     """
     cdef cudaMipmappedArray_t mipmappedArray = cudaMipmappedArray_t()
     cdef cyruntime.cudaChannelFormatDesc* cydesc_ptr = <cyruntime.cudaChannelFormatDesc*>desc._pvt_ptr if desc is not None else NULL
@@ -27819,7 +27829,7 @@ def cudaGetMipmappedArrayLevel(mipmappedArray, unsigned int level):
 
     See Also
     --------
-    :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, make_cudaExtent, :py:obj:`~.cuMipmappedArrayGetLevel`
+    :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, :py:obj:`~.make_cudaExtent`, :py:obj:`~.cuMipmappedArrayGetLevel`
     """
     cdef cyruntime.cudaMipmappedArray_const_t cymipmappedArray
     if mipmappedArray is None:
@@ -27913,7 +27923,7 @@ def cudaMemcpy3D(p : Optional[cudaMemcpy3DParms]):
 
     See Also
     --------
-    :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMemcpy3DAsync`, :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpy2DFromArray`, :py:obj:`~.cudaMemcpy2DArrayToArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, make_cudaExtent, make_cudaPos, :py:obj:`~.cuMemcpy3D`
+    :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMemcpy3DAsync`, :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpy2DFromArray`, :py:obj:`~.cudaMemcpy2DArrayToArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, :py:obj:`~.make_cudaExtent`, :py:obj:`~.make_cudaPos`, :py:obj:`~.cuMemcpy3D`
     """
     cdef cyruntime.cudaMemcpy3DParms* cyp_ptr = <cyruntime.cudaMemcpy3DParms*>p._pvt_ptr if p is not None else NULL
     with nogil:
@@ -28047,7 +28057,7 @@ def cudaMemcpy3DAsync(p : Optional[cudaMemcpy3DParms], stream):
 
     See Also
     --------
-    :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMemcpy3D`, :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DToArray`, ::::py:obj:`~.cudaMemcpy2DFromArray`, :py:obj:`~.cudaMemcpy2DArrayToArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, make_cudaExtent, make_cudaPos, :py:obj:`~.cuMemcpy3DAsync`
+    :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMemcpy3D`, :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpy2DFromArray`, :py:obj:`~.cudaMemcpy2DArrayToArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, :py:obj:`~.make_cudaExtent`, :py:obj:`~.make_cudaPos`, :py:obj:`~.cuMemcpy3DAsync`
     """
     cdef cyruntime.cudaStream_t cystream
     if stream is None:
@@ -29146,31 +29156,31 @@ def cudaMemcpy3DBatchAsync(size_t numOps, opList : Optional[tuple[cudaMemcpy3DBa
 
     For a given operand, if :py:obj:`~.cudaMemcpy3DOperand.type` is
     specified as :py:obj:`~.cudaMemcpyOperandTypePointer`, then
-    :py:obj:`~.cudaMemcpy3DOperand`::op::ptr will be used. The
-    :py:obj:`~.cudaMemcpy3DOperand`::op::ptr::ptr field must contain the
+    :py:obj:`~.cudaMemcpy3DOperand.op.ptr` will be used. The
+    :py:obj:`~.cudaMemcpy3DOperand.op.ptr.ptr` field must contain the
     pointer where the copy should begin. The
-    :py:obj:`~.cudaMemcpy3DOperand`::op::ptr::rowLength field specifies the
+    :py:obj:`~.cudaMemcpy3DOperand.op.ptr.rowLength` field specifies the
     length of each row in elements and must either be zero or be greater
     than or equal to the width of the copy specified in
-    :py:obj:`~.cudaMemcpy3DBatchOp`::extent::width. The
-    :py:obj:`~.cudaMemcpy3DOperand`::op::ptr::layerHeight field specifies
-    the height of each layer and must either be zero or be greater than or
+    :py:obj:`~.cudaMemcpy3DBatchOp.extent.width`. The
+    :py:obj:`~.cudaMemcpy3DOperand.op.ptr.layerHeight` field specifies the
+    height of each layer and must either be zero or be greater than or
     equal to the height of the copy specified in
-    :py:obj:`~.cudaMemcpy3DBatchOp`::extent::height. When either of these
+    :py:obj:`~.cudaMemcpy3DBatchOp.extent.height`. When either of these
     values is zero, that aspect of the operand is considered to be tightly
     packed according to the copy extent. For managed memory pointers on
     devices where :py:obj:`~.cudaDevAttrConcurrentManagedAccess` is true or
     system-allocated pageable memory on devices where
     :py:obj:`~.cudaDevAttrPageableMemoryAccess` is true, the
-    :py:obj:`~.cudaMemcpy3DOperand`::op::ptr::locHint field can be used to
+    :py:obj:`~.cudaMemcpy3DOperand.op.ptr.locHint` field can be used to
     hint the location of the operand.
 
     If an operand's type is specified as
     :py:obj:`~.cudaMemcpyOperandTypeArray`, then
-    :py:obj:`~.cudaMemcpy3DOperand`::op::array will be used. The
-    :py:obj:`~.cudaMemcpy3DOperand`::op::array::array field specifies the
-    CUDA array and :py:obj:`~.cudaMemcpy3DOperand`::op::array::offset
-    specifies the 3D offset into that array where the copy begins.
+    :py:obj:`~.cudaMemcpy3DOperand.op.array` will be used. The
+    :py:obj:`~.cudaMemcpy3DOperand.op.array.array` field specifies the CUDA
+    array and :py:obj:`~.cudaMemcpy3DOperand.op.array.offset` specifies the
+    3D offset into that array where the copy begins.
 
     The :py:obj:`~.cudaMemcpyAttributes.srcAccessOrder` indicates the
     source access ordering to be observed for copies associated with the
@@ -29746,7 +29756,7 @@ def cudaMemset3D(pitchedDevPtr not None : cudaPitchedPtr, int value, extent not
 
     See Also
     --------
-    :py:obj:`~.cudaMemset`, :py:obj:`~.cudaMemset2D`, :py:obj:`~.cudaMemsetAsync`, :py:obj:`~.cudaMemset2DAsync`, :py:obj:`~.cudaMemset3DAsync`, :py:obj:`~.cudaMalloc3D`, make_cudaPitchedPtr, make_cudaExtent
+    :py:obj:`~.cudaMemset`, :py:obj:`~.cudaMemset2D`, :py:obj:`~.cudaMemsetAsync`, :py:obj:`~.cudaMemset2DAsync`, :py:obj:`~.cudaMemset3DAsync`, :py:obj:`~.cudaMalloc3D`, :py:obj:`~.make_cudaPitchedPtr`, :py:obj:`~.make_cudaExtent`
     """
     with nogil:
         err = cyruntime.cudaMemset3D(pitchedDevPtr._pvt_ptr[0], value, extent._pvt_ptr[0])
@@ -29923,7 +29933,7 @@ def cudaMemset3DAsync(pitchedDevPtr not None : cudaPitchedPtr, int value, extent
 
     See Also
     --------
-    :py:obj:`~.cudaMemset`, :py:obj:`~.cudaMemset2D`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMemsetAsync`, :py:obj:`~.cudaMemset2DAsync`, :py:obj:`~.cudaMalloc3D`, make_cudaPitchedPtr, make_cudaExtent
+    :py:obj:`~.cudaMemset`, :py:obj:`~.cudaMemset2D`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMemsetAsync`, :py:obj:`~.cudaMemset2DAsync`, :py:obj:`~.cudaMalloc3D`, :py:obj:`~.make_cudaPitchedPtr`, :py:obj:`~.make_cudaExtent`
     """
     cdef cyruntime.cudaStream_t cystream
     if stream is None:
@@ -31412,11 +31422,11 @@ def cudaMemPoolGetAttribute(memPool, attr not None : cudaMemPoolAttr):
     pools:
 
     - :py:obj:`~.cudaMemPoolAttrAllocationType`: (value type =
-      cudaMemAllocationType) The allocation type of the mempool
+      :py:obj:`~.cudaMemAllocationType`) The allocation type of the mempool
 
     - :py:obj:`~.cudaMemPoolAttrExportHandleTypes`: (value type =
-      cudaMemAllocationHandleType) Available export handle types for the
-      mempool. For imported pools this value is always
+      :py:obj:`~.cudaMemAllocationHandleType`) Available export handle
+      types for the mempool. For imported pools this value is always
       cudaMemHandleTypeNone as an imported pool cannot be re-exported
 
     - :py:obj:`~.cudaMemPoolAttrLocationId`: (value type = int) The
@@ -31424,10 +31434,10 @@ def cudaMemPoolGetAttribute(memPool, attr not None : cudaMemPoolAttr):
       cudaMemLocationTypeInvisible then ID will be cudaInvalidDeviceId.
 
     - :py:obj:`~.cudaMemPoolAttrLocationType`: (value type =
-      cudaMemLocationType) The location type for the mempool. For imported
-      memory pools where the device is not directly visible to the
-      importing process or pools imported via fabric handles across nodes
-      this will be cudaMemlocataionTypeInvisible.
+      :py:obj:`~.cudaMemLocationType`) The location type for the mempool.
+      For imported memory pools where the device is not directly visible to
+      the importing process or pools imported via fabric handles across
+      nodes this will be cudaMemLocationTypeInvisible.
 
     - :py:obj:`~.cudaMemPoolAttrMaxPoolSize`: (value type = cuuint64_t)
       Maximum size of the pool in bytes, this value may be higher than what
@@ -31584,20 +31594,20 @@ def cudaMemPoolCreate(poolProps : Optional[cudaMemPoolProps]):
 
     To create a memory pool for host memory not targeting a specific NUMA
     node, applications must set set
-    :py:obj:`~.cudaMemPoolProps`::cudaMemLocation::type to
+    :py:obj:`~.cudaMemPoolProps.cudaMemLocation.type` to
     :py:obj:`~.cudaMemLocationTypeHost`.
-    :py:obj:`~.cudaMemPoolProps`::cudaMemLocation::id is ignored for such
+    :py:obj:`~.cudaMemPoolProps.cudaMemLocation.id` is ignored for such
     pools. Pools created with the type :py:obj:`~.cudaMemLocationTypeHost`
     are not IPC capable and :py:obj:`~.cudaMemPoolProps.handleTypes` must
     be 0, any other values will result in
     :py:obj:`~.cudaErrorInvalidValue`. To create a memory pool targeting a
     specific host NUMA node, applications must set
-    :py:obj:`~.cudaMemPoolProps`::cudaMemLocation::type to
+    :py:obj:`~.cudaMemPoolProps.cudaMemLocation.type` to
     :py:obj:`~.cudaMemLocationTypeHostNuma` and
-    :py:obj:`~.cudaMemPoolProps`::cudaMemLocation::id must specify the NUMA
+    :py:obj:`~.cudaMemPoolProps.cudaMemLocation.id` must specify the NUMA
     ID of the host memory node. Specifying
     :py:obj:`~.cudaMemLocationTypeHostNumaCurrent` as the
-    :py:obj:`~.cudaMemPoolProps`::cudaMemLocation::type will result in
+    :py:obj:`~.cudaMemPoolProps.cudaMemLocation.type` will result in
     :py:obj:`~.cudaErrorInvalidValue`. By default, the pool's memory will
     be accessible from the device it is allocated on. In the case of pools
     created with :py:obj:`~.cudaMemLocationTypeHostNuma` or
@@ -31628,11 +31638,11 @@ def cudaMemPoolCreate(poolProps : Optional[cudaMemPoolProps]):
     /dev/nvidia-caps-imex-channels/channel0 c <major number> 0`
 
     To create a managed memory pool, applications must set
-    :py:obj:`~.cudaMemPoolProps`:cudaMemAllocationType to
+    :py:obj:`~.cudaMemPoolProps.cudaMemAllocationType` to
     :py:obj:`~.cudaMemAllocationTypeManaged`.
-    :py:obj:`~.cudaMemPoolProps`::cudaMemAllocationHandleType must also be
+    :py:obj:`~.cudaMemPoolProps.cudaMemAllocationHandleType` must also be
     set to :py:obj:`~.cudaMemHandleTypeNone` since IPC is not supported.
-    For managed memory pools, :py:obj:`~.cudaMemPoolProps`::cudaMemLocation
+    For managed memory pools, :py:obj:`~.cudaMemPoolProps.cudaMemLocation`
     will be treated as the preferred location for all allocations created
     from the pool. An application can also set
     :py:obj:`~.cudaMemLocationTypeNone` to indicate no preferred location.
@@ -32817,41 +32827,41 @@ def cudaCreateTextureObject(pResDesc : Optional[cudaResourceDesc], pTexDesc : Op
 
     If :py:obj:`~.cudaResourceDesc.resType` is set to
     :py:obj:`~.cudaResourceTypeArray`,
-    :py:obj:`~.cudaResourceDesc`::res::array::array must be set to a valid
+    :py:obj:`~.cudaResourceDesc.res.array.array` must be set to a valid
     CUDA array handle.
 
     If :py:obj:`~.cudaResourceDesc.resType` is set to
     :py:obj:`~.cudaResourceTypeMipmappedArray`,
-    :py:obj:`~.cudaResourceDesc`::res::mipmap::mipmap must be set to a
-    valid CUDA mipmapped array handle and
+    :py:obj:`~.cudaResourceDesc.res.mipmap.mipmap` must be set to a valid
+    CUDA mipmapped array handle and
     :py:obj:`~.cudaTextureDesc.normalizedCoords` must be set to true.
 
     If :py:obj:`~.cudaResourceDesc.resType` is set to
     :py:obj:`~.cudaResourceTypeLinear`,
-    :py:obj:`~.cudaResourceDesc`::res::linear::devPtr must be set to a
-    valid device pointer, that is aligned to
+    :py:obj:`~.cudaResourceDesc.res.linear.devPtr` must be set to a valid
+    device pointer, that is aligned to
     :py:obj:`~.cudaDeviceProp.textureAlignment`.
-    :py:obj:`~.cudaResourceDesc`::res::linear::desc describes the format
-    and the number of components per array element.
-    :py:obj:`~.cudaResourceDesc`::res::linear::sizeInBytes specifies the
-    size of the array in bytes. The total number of elements in the linear
+    :py:obj:`~.cudaResourceDesc.res.linear.desc` describes the format and
+    the number of components per array element.
+    :py:obj:`~.cudaResourceDesc.res.linear.sizeInBytes` specifies the size
+    of the array in bytes. The total number of elements in the linear
     address range cannot exceed
     :py:obj:`~.cudaDeviceGetTexture1DLinearMaxWidth()`. The number of
     elements is computed as (sizeInBytes / sizeof(desc)).
 
     If :py:obj:`~.cudaResourceDesc.resType` is set to
     :py:obj:`~.cudaResourceTypePitch2D`,
-    :py:obj:`~.cudaResourceDesc`::res::pitch2D::devPtr must be set to a
-    valid device pointer, that is aligned to
+    :py:obj:`~.cudaResourceDesc.res.pitch2D.devPtr` must be set to a valid
+    device pointer, that is aligned to
     :py:obj:`~.cudaDeviceProp.textureAlignment`.
-    :py:obj:`~.cudaResourceDesc`::res::pitch2D::desc describes the format
-    and the number of components per array element.
-    :py:obj:`~.cudaResourceDesc`::res::pitch2D::width and
-    :py:obj:`~.cudaResourceDesc`::res::pitch2D::height specify the width
-    and height of the array in elements, and cannot exceed
+    :py:obj:`~.cudaResourceDesc.res.pitch2D.desc` describes the format and
+    the number of components per array element.
+    :py:obj:`~.cudaResourceDesc.res.pitch2D.width` and
+    :py:obj:`~.cudaResourceDesc.res.pitch2D.height` specify the width and
+    height of the array in elements, and cannot exceed
     :py:obj:`~.cudaDeviceProp.maxTexture2DLinear`[0] and
     :py:obj:`~.cudaDeviceProp.maxTexture2DLinear`[1] respectively.
-    :py:obj:`~.cudaResourceDesc`::res::pitch2D::pitchInBytes specifies the
+    :py:obj:`~.cudaResourceDesc.res.pitch2D.pitchInBytes` specifies the
     pitch between two rows in bytes and has to be aligned to
     :py:obj:`~.cudaDeviceProp.texturePitchAlignment`. Pitch cannot exceed
     :py:obj:`~.cudaDeviceProp.maxTexture2DLinear`[2].
@@ -33199,7 +33209,7 @@ def cudaCreateSurfaceObject(pResDesc : Optional[cudaResourceDesc]):
     describes the data to perform surface load/stores on.
     :py:obj:`~.cudaResourceDesc.resType` must be
     :py:obj:`~.cudaResourceTypeArray` and
-    :py:obj:`~.cudaResourceDesc`::res::array::array must be set to a valid
+    :py:obj:`~.cudaResourceDesc.res.array.array` must be set to a valid
     CUDA array handle.
 
     Surface objects are only supported on devices of compute capability 3.0
@@ -38320,8 +38330,8 @@ def cudaGraphDebugDotPrint(graph, char* path, unsigned int flags):
     path : bytes
         The path to write the DOT file to
     flags : unsigned int
-        Flags from cudaGraphDebugDotFlags for specifying which additional
-        node information to write
+        Flags from :py:obj:`~.cudaGraphDebugDotFlags` for specifying which
+        additional node information to write
 
     Returns
     -------
@@ -39766,7 +39776,7 @@ def cudaKernelSetAttributeForDevice(kernel, attr not None : cudaFuncAttribute, i
 
     - :py:obj:`~.cudaFuncAttributeClusterSchedulingPolicyPreference`: The
       block scheduling policy of a function. The value type is
-      cudaClusterSchedulingPolicy.
+      :py:obj:`~.cudaClusterSchedulingPolicy`.
 
     Parameters
     ----------
@@ -39830,7 +39840,7 @@ def cudaDeviceGetDevResource(int device, typename not None : cudaDevResourceType
     cudaError_t
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorNotPermitted`, :py:obj:`~.cudaErrorInvalidDevice`, :py:obj:`~.cudaErrorInvalidResourceType`, :py:obj:`~.cudaErrorNotSupported`, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorInitializationError`
     resource : :py:obj:`~.cudaDevResource`
-        Output pointer to a cudaDevResource structure
+        Output pointer to a :py:obj:`~.cudaDevResource` structure
 
     See Also
     --------
@@ -40351,7 +40361,7 @@ def cudaExecutionCtxGetDevResource(ctx, typename not None : cudaDevResourceType)
     cudaError_t
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorNotSupported`, :py:obj:`~.cudaErrorNotPermitted`, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorInitializationError`
     resource : :py:obj:`~.cudaDevResource`
-        Output pointer to a cudaDevResource structure
+        Output pointer to a :py:obj:`~.cudaDevResource` structure
 
     See Also
     --------
@@ -40601,7 +40611,7 @@ def cudaStreamGetDevResource(hStream, typename not None : cudaDevResourceType):
     cudaError_t
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorInitializationError`, :py:obj:`~.cudaErrorDeviceUninitialized`, :py:obj:`~.cudaErrorInvalidResourceType`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidHandle`, :py:obj:`~.cudaErrorNotPermitted`, :py:obj:`~.cudaErrorCallRequiresNewerDriver`,
     resource : :py:obj:`~.cudaDevResource`
-        Output pointer to a cudaDevResource structure
+        Output pointer to a :py:obj:`~.cudaDevResource` structure
 
     See Also
     --------
@@ -40749,8 +40759,9 @@ def cudaDeviceGetExecutionCtx(int device):
 
     Returns in `ctx` the execution context for the specified device. This
     is the device's primary context. The returned context can then be
-    passed to APIs that take in a cudaExecutionContext_t enabling explicit
-    context-based programming without relying on thread-local state.
+    passed to APIs that take in a :py:obj:`~.cudaExecutionContext_t`
+    enabling explicit context-based programming without relying on
+    thread-local state.
 
     Passing the returned execution context to
     :py:obj:`~.cudaExecutionCtxDestroy()` is not allowed and will result in
diff --git a/cuda_bindings/docs/source/module/driver.rst b/cuda_bindings/docs/source/module/driver.rst
index 49c633aa07..89994ff4cb 100644
--- a/cuda_bindings/docs/source/module/driver.rst
+++ b/cuda_bindings/docs/source/module/driver.rst
@@ -146,7 +146,11 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUctx_flags.CU_CTX_BLOCKING_SYNC
 
 
-        Set blocking synchronization as default scheduling [Deprecated]
+        Set blocking synchronization as default scheduling
+
+
+
+        [Deprecated]
 
 
     .. autoattribute:: cuda.bindings.driver.CUctx_flags.CU_CTX_SCHED_MASK
@@ -406,25 +410,25 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUstreamWaitValue_flags.CU_STREAM_WAIT_VALUE_GEQ
 
 
-        Wait until (int32_t)(*addr - value) >= 0 (or int64_t for 64 bit values). Note this is a cyclic comparison which ignores wraparound. (Default behavior.)
+        Wait until (int32_t)(\*addr - value) >= 0 (or int64_t for 64 bit values). Note this is a cyclic comparison which ignores wraparound. (Default behavior.)
 
 
     .. autoattribute:: cuda.bindings.driver.CUstreamWaitValue_flags.CU_STREAM_WAIT_VALUE_EQ
 
 
-        Wait until *addr == value.
+        Wait until \*addr == value.
 
 
     .. autoattribute:: cuda.bindings.driver.CUstreamWaitValue_flags.CU_STREAM_WAIT_VALUE_AND
 
 
-        Wait until (*addr & value) != 0.
+        Wait until (\*addr & value) != 0.
 
 
     .. autoattribute:: cuda.bindings.driver.CUstreamWaitValue_flags.CU_STREAM_WAIT_VALUE_NOR
 
 
-        Wait until ~(*addr | value) != 0. Support for this operation can be queried with :py:obj:`~.cuDeviceGetAttribute()` and :py:obj:`~.CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR`.
+        Wait until ~(\*addr | value) != 0. Support for this operation can be queried with :py:obj:`~.cuDeviceGetAttribute()` and :py:obj:`~.CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR`.
 
 
     .. autoattribute:: cuda.bindings.driver.CUstreamWaitValue_flags.CU_STREAM_WAIT_VALUE_FLUSH
@@ -506,19 +510,19 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUstreamAtomicReductionOpType.CU_STREAM_ATOMIC_REDUCTION_OP_OR
 
 
-        Performs an atomic OR: *(address) = *(address) | value
+        Performs an atomic OR: \*(address) = \*(address) | value
 
 
     .. autoattribute:: cuda.bindings.driver.CUstreamAtomicReductionOpType.CU_STREAM_ATOMIC_REDUCTION_OP_AND
 
 
-        Performs an atomic AND: *(address) = *(address) & value
+        Performs an atomic AND: \*(address) = \*(address) & value
 
 
     .. autoattribute:: cuda.bindings.driver.CUstreamAtomicReductionOpType.CU_STREAM_ATOMIC_REDUCTION_OP_ADD
 
 
-        Performs an atomic ADD: *(address) = *(address) + value
+        Performs an atomic ADD: \*(address) = \*(address) + value
 
 .. autoclass:: cuda.bindings.driver.CUstreamAtomicReductionDataType
 
@@ -2081,7 +2085,7 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_HW_DECOMPRESS_CAPABLE
 
 
-        Returns in `*data` a boolean that indicates whether the pointer points to memory that is capable to be used for hardware accelerated decompression.
+        Returns in ``*data`` a boolean that indicates whether the pointer points to memory that is capable of being used for hardware accelerated decompression.
 
 .. autoclass:: cuda.bindings.driver.CUfunction_attribute
 
@@ -2118,13 +2122,13 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_PTX_VERSION
 
 
-        The PTX virtual architecture version for which the function was compiled. This value is the major PTX version * 10 + the minor PTX version, so a PTX version 1.3 function would return the value 13. Note that this may return the undefined value of 0 for cubins compiled prior to CUDA 3.0.
+        The PTX virtual architecture version for which the function was compiled. This value is the major PTX version \* 10 + the minor PTX version, so a PTX version 1.3 function would return the value 13. Note that this may return the undefined value of 0 for cubins compiled prior to CUDA 3.0.
 
 
     .. autoattribute:: cuda.bindings.driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_BINARY_VERSION
 
 
-        The binary architecture version for which the function was compiled. This value is the major binary version * 10 + the minor binary version, so a binary version 1.3 function would return the value 13. Note that this will return a value of 10 for legacy cubins that do not have a properly-encoded binary architecture version.
+        The binary architecture version for which the function was compiled. This value is the major binary version \* 10 + the minor binary version, so a binary version 1.3 function would return the value 13. Note that this will return a value of 10 for legacy cubins that do not have a properly-encoded binary architecture version.
 
 
     .. autoattribute:: cuda.bindings.driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_CACHE_MODE_CA
@@ -2206,7 +2210,7 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE
 
 
-        The block scheduling policy of a function. The value type is CUclusterSchedulingPolicy / cudaClusterSchedulingPolicy. See :py:obj:`~.cuFuncSetAttribute`, :py:obj:`~.cuKernelSetAttribute`
+        The block scheduling policy of a function. The value type is :py:obj:`~.CUclusterSchedulingPolicy` / cudaClusterSchedulingPolicy. See :py:obj:`~.cuFuncSetAttribute`, :py:obj:`~.cuKernelSetAttribute`
 
 
     .. autoattribute:: cuda.bindings.driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_DEVICE_NODE_UPDATE_SUPPORTED
@@ -2451,7 +2455,7 @@ Data types used by CUDA driver
 
         Pointer to a buffer in which to print any log messages that are informational in nature (the buffer size is specified via option :py:obj:`~.CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES`)
 
-        Option type: char *
+        Option type: char \*
 
         Applies to: compiler and linker
 
@@ -2473,7 +2477,7 @@ Data types used by CUDA driver
 
         Pointer to a buffer in which to print any log messages that reflect errors (the buffer size is specified via option :py:obj:`~.CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES`)
 
-        Option type: char *
+        Option type: char \*
 
         Applies to: compiler and linker
 
@@ -2523,7 +2527,7 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUjit_option.CU_JIT_FALLBACK_STRATEGY
 
 
-        Specifies choice of fallback strategy if matching cubin is not found. Choice is based on supplied :py:obj:`~.CUjit_fallback`. This option cannot be used with cuLink* APIs as the linker requires exact matches.
+        Specifies choice of fallback strategy if matching cubin is not found. Choice is based on supplied :py:obj:`~.CUjit_fallback`. This option cannot be used with cuLink\* APIs as the linker requires exact matches.
 
         Option type: unsigned int for enumerated type :py:obj:`~.CUjit_fallback`
 
@@ -2563,7 +2567,7 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUjit_option.CU_JIT_CACHE_MODE
 
 
-        Specifies whether to enable caching explicitly (-dlcm) 
+        Specifies whether to enable caching explicitly (-dlcm)
 
         Choice is based on supplied :py:obj:`~.CUjit_cacheMode_enum`.
 
@@ -2597,7 +2601,7 @@ Data types used by CUDA driver
 
         It is illegal to register the same device symbol at multiple addresses.
 
-        Option type: const char **
+        Option type: const char \*\*
 
         Applies to: dynamic linker only
 
@@ -2609,7 +2613,7 @@ Data types used by CUDA driver
 
         Must contain :py:obj:`~.CU_JIT_GLOBAL_SYMBOL_COUNT` entries.
 
-        Option type: void **
+        Option type: void \*\*
 
         Applies to: dynamic linker only
 
@@ -3240,7 +3244,7 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUgraphConditionalNodeType.CU_GRAPH_COND_TYPE_IF
 
 
-        Conditional 'if/else' Node. Body[0] executed if condition is non-zero. If `size` == 2, an optional ELSE graph is created and this is executed if the condition is zero.
+        Conditional 'if/else' Node. Body[0] executed if condition is non-zero. If ``size`` == 2, an optional ELSE graph is created and this is executed if the condition is zero.
 
 
     .. autoattribute:: cuda.bindings.driver.CUgraphConditionalNodeType.CU_GRAPH_COND_TYPE_WHILE
@@ -3384,7 +3388,7 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUgraphDependencyType.CU_GRAPH_DEPENDENCY_TYPE_PROGRAMMATIC
 
 
-        This dependency type allows the downstream node to use `cudaGridDependencySynchronize()`. It may only be used between kernel nodes, and must be used with either the :py:obj:`~.CU_GRAPH_KERNEL_NODE_PORT_PROGRAMMATIC` or :py:obj:`~.CU_GRAPH_KERNEL_NODE_PORT_LAUNCH_ORDER` outgoing port.
+        This dependency type allows the downstream node to use ``cudaGridDependencySynchronize()``. It may only be used between kernel nodes, and must be used with either the :py:obj:`~.CU_GRAPH_KERNEL_NODE_PORT_PROGRAMMATIC` or :py:obj:`~.CU_GRAPH_KERNEL_NODE_PORT_LAUNCH_ORDER` outgoing port.
 
 .. autoclass:: cuda.bindings.driver.CUgraphInstantiateResult
 
@@ -3553,7 +3557,7 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT
 
 
-        Valid for launches. Set :py:obj:`~.CUlaunchAttributeValue.programmaticEvent` to record the event. Event recorded through this launch attribute is guaranteed to only trigger after all block in the associated kernel trigger the event. A block can trigger the event through PTX launchdep.release or CUDA builtin function cudaTriggerProgrammaticLaunchCompletion(). A trigger can also be inserted at the beginning of each block's execution if triggerAtBlockStart is set to non-0. The dependent launches can choose to wait on the dependency using the programmatic sync (cudaGridDependencySynchronize() or equivalent PTX instructions). Note that dependents (including the CPU thread calling :py:obj:`~.cuEventSynchronize()`) are not guaranteed to observe the release precisely when it is released. For example, :py:obj:`~.cuEventSynchronize()` may only observe the event trigger long after the associated kernel has completed. This recording type is primarily meant for establishing programmatic dependency between device tasks. Note also this type of dependency allows, but does not guarantee, concurrent execution of tasks. 
+        Valid for launches. Set :py:obj:`~.CUlaunchAttributeValue.programmaticEvent` to record the event. Event recorded through this launch attribute is guaranteed to only trigger after all blocks in the associated kernel trigger the event. A block can trigger the event through PTX launchdep.release or CUDA builtin function cudaTriggerProgrammaticLaunchCompletion(). A trigger can also be inserted at the beginning of each block's execution if triggerAtBlockStart is set to non-0. The dependent launches can choose to wait on the dependency using the programmatic sync (cudaGridDependencySynchronize() or equivalent PTX instructions). Note that dependents (including the CPU thread calling :py:obj:`~.cuEventSynchronize()`) are not guaranteed to observe the release precisely when it is released. For example, :py:obj:`~.cuEventSynchronize()` may only observe the event trigger long after the associated kernel has completed. This recording type is primarily meant for establishing programmatic dependency between device tasks. Note also this type of dependency allows, but does not guarantee, concurrent execution of tasks.
 
          The event supplied must not be an interprocess or interop event. The event must disable timing (i.e. must be created with the :py:obj:`~.CU_EVENT_DISABLE_TIMING` flag set).
 
@@ -3579,21 +3583,21 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION
 
 
-        Valid for graph nodes, launches. Set :py:obj:`~.CUlaunchAttributeValue.preferredClusterDim` to allow the kernel launch to specify a preferred substitute cluster dimension. Blocks may be grouped according to either the dimensions specified with this attribute (grouped into a "preferred substitute cluster"), or the one specified with :py:obj:`~.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION` attribute (grouped into a "regular cluster"). The cluster dimensions of a "preferred substitute cluster" shall be an integer multiple greater than zero of the regular cluster dimensions. The device will attempt - on a best-effort basis - to group thread blocks into preferred clusters over grouping them into regular clusters. When it deems necessary (primarily when the device temporarily runs out of physical resources to launch the larger preferred clusters), the device may switch to launch the regular clusters instead to attempt to utilize as much of the physical device resources as possible. 
+        Valid for graph nodes, launches. Set :py:obj:`~.CUlaunchAttributeValue.preferredClusterDim` to allow the kernel launch to specify a preferred substitute cluster dimension. Blocks may be grouped according to either the dimensions specified with this attribute (grouped into a "preferred substitute cluster"), or the one specified with :py:obj:`~.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION` attribute (grouped into a "regular cluster"). The cluster dimensions of a "preferred substitute cluster" shall be an integer multiple greater than zero of the regular cluster dimensions. The device will attempt - on a best-effort basis - to group thread blocks into preferred clusters over grouping them into regular clusters. When it deems necessary (primarily when the device temporarily runs out of physical resources to launch the larger preferred clusters), the device may switch to launch the regular clusters instead to attempt to utilize as much of the physical device resources as possible.
 
-         Each type of cluster will have its enumeration / coordinate setup as if the grid consists solely of its type of cluster. For example, if the preferred substitute cluster dimensions double the regular cluster dimensions, there might be simultaneously a regular cluster indexed at (1,0,0), and a preferred cluster indexed at (1,0,0). In this example, the preferred substitute cluster (1,0,0) replaces regular clusters (2,0,0) and (3,0,0) and groups their blocks. 
+         Each type of cluster will have its enumeration / coordinate setup as if the grid consists solely of its type of cluster. For example, if the preferred substitute cluster dimensions double the regular cluster dimensions, there might be simultaneously a regular cluster indexed at (1,0,0), and a preferred cluster indexed at (1,0,0). In this example, the preferred substitute cluster (1,0,0) replaces regular clusters (2,0,0) and (3,0,0) and groups their blocks.
 
-         This attribute will only take effect when a regular cluster dimension has been specified. The preferred substitute cluster dimension must be an integer multiple greater than zero of the regular cluster dimension and must divide the grid. It must also be no more than `maxBlocksPerCluster`, if it is set in the kernel's `__launch_bounds__`. Otherwise it must be less than the maximum value the driver can support. Otherwise, setting this attribute to a value physically unable to fit on any particular device is permitted.
+         This attribute will only take effect when a regular cluster dimension has been specified. The preferred substitute cluster dimension must be an integer multiple greater than zero of the regular cluster dimension and must divide the grid. It must also be no more than ``maxBlocksPerCluster``, if it is set in the kernel's ``__launch_bounds__``. Otherwise it must be less than the maximum value the driver can support. Otherwise, setting this attribute to a value physically unable to fit on any particular device is permitted.
 
 
     .. autoattribute:: cuda.bindings.driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT
 
 
-        Valid for launches. Set :py:obj:`~.CUlaunchAttributeValue.launchCompletionEvent` to record the event. 
+        Valid for launches. Set :py:obj:`~.CUlaunchAttributeValue.launchCompletionEvent` to record the event.
 
-         Nominally, the event is triggered once all blocks of the kernel have begun execution. Currently this is a best effort. If a kernel B has a launch completion dependency on a kernel A, B may wait until A is complete. Alternatively, blocks of B may begin before all blocks of A have begun, for example if B can claim execution resources unavailable to A (e.g. they run on different GPUs) or if B is a higher priority than A. Exercise caution if such an ordering inversion could lead to deadlock. 
+         Nominally, the event is triggered once all blocks of the kernel have begun execution. Currently this is a best effort. If a kernel B has a launch completion dependency on a kernel A, B may wait until A is complete. Alternatively, blocks of B may begin before all blocks of A have begun, for example if B can claim execution resources unavailable to A (e.g. they run on different GPUs) or if B is a higher priority than A. Exercise caution if such an ordering inversion could lead to deadlock.
 
-         A launch completion event is nominally similar to a programmatic event with `triggerAtBlockStart` set except that it is not visible to `cudaGridDependencySynchronize()` and can be used with compute capability less than 9.0. 
+         A launch completion event is nominally similar to a programmatic event with ``triggerAtBlockStart`` set except that it is not visible to ``cudaGridDependencySynchronize()`` and can be used with compute capability less than 9.0.
 
          The event supplied must not be an interprocess or interop event. The event must disable timing (i.e. must be created with the :py:obj:`~.CU_EVENT_DISABLE_TIMING` flag set).
 
@@ -3601,11 +3605,11 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE
 
 
-        Valid for graph nodes, launches. This attribute is graphs-only, and passing it to a launch in a non-capturing stream will result in an error. 
+        Valid for graph nodes, launches. This attribute is graphs-only, and passing it to a launch in a non-capturing stream will result in an error.
 
-         :py:obj:`~.CUlaunchAttributeValue.deviceUpdatableKernelNode.deviceUpdatable` can only be set to 0 or 1. Setting the field to 1 indicates that the corresponding kernel node should be device-updatable. On success, a handle will be returned via :py:obj:`~.CUlaunchAttributeValue.deviceUpdatableKernelNode.devNode` which can be passed to the various device-side update functions to update the node's kernel parameters from within another kernel. For more information on the types of device updates that can be made, as well as the relevant limitations thereof, see :py:obj:`~.cudaGraphKernelNodeUpdatesApply`. 
+         :py:obj:`~.CUlaunchAttributeValue.deviceUpdatableKernelNode.deviceUpdatable` can only be set to 0 or 1. Setting the field to 1 indicates that the corresponding kernel node should be device-updatable. On success, a handle will be returned via :py:obj:`~.CUlaunchAttributeValue.deviceUpdatableKernelNode.devNode` which can be passed to the various device-side update functions to update the node's kernel parameters from within another kernel. For more information on the types of device updates that can be made, as well as the relevant limitations thereof, see :py:obj:`~.cudaGraphKernelNodeUpdatesApply`.
 
-         Nodes which are device-updatable have additional restrictions compared to regular kernel nodes. Firstly, device-updatable nodes cannot be removed from their graph via :py:obj:`~.cuGraphDestroyNode`. Additionally, once opted-in to this functionality, a node cannot opt out, and any attempt to set the deviceUpdatable attribute to 0 will result in an error. Device-updatable kernel nodes also cannot have their attributes copied to/from another kernel node via :py:obj:`~.cuGraphKernelNodeCopyAttributes`. Graphs containing one or more device-updatable nodes also do not allow multiple instantiation, and neither the graph nor its instantiated version can be passed to :py:obj:`~.cuGraphExecUpdate`. 
+         Nodes which are device-updatable have additional restrictions compared to regular kernel nodes. Firstly, device-updatable nodes cannot be removed from their graph via :py:obj:`~.cuGraphDestroyNode`. Additionally, once opted-in to this functionality, a node cannot opt out, and any attempt to set the deviceUpdatable attribute to 0 will result in an error. Device-updatable kernel nodes also cannot have their attributes copied to/from another kernel node via :py:obj:`~.cuGraphKernelNodeCopyAttributes`. Graphs containing one or more device-updatable nodes also do not allow multiple instantiation, and neither the graph nor its instantiated version can be passed to :py:obj:`~.cuGraphExecUpdate`.
 
          If a graph contains device-updatable nodes and updates those nodes from the device from within the graph, the graph must be uploaded with :py:obj:`~.cuGraphUpload` before it is launched. For such a graph, if host-side executable graph updates are made to the device-updatable nodes, the graph must be uploaded before it is launched again.
 
@@ -3619,13 +3623,13 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_NVLINK_UTIL_CENTRIC_SCHEDULING
 
 
-        Valid for streams, graph nodes, launches. This attribute is a hint to the CUDA runtime that the launch should attempt to make the kernel maximize its NVLINK utilization. 
+        Valid for streams, graph nodes, launches. This attribute is a hint to the CUDA runtime that the launch should attempt to make the kernel maximize its NVLINK utilization.
 
 
 
-         When possible to honor this hint, CUDA will assume each block in the grid launch will carry out an even amount of NVLINK traffic, and make a best-effort attempt to adjust the kernel launch based on that assumption. 
+         When possible to honor this hint, CUDA will assume each block in the grid launch will carry out an even amount of NVLINK traffic, and make a best-effort attempt to adjust the kernel launch based on that assumption.
 
-         This attribute is a hint only. CUDA makes no functional or performance guarantee. Its applicability can be affected by many different factors, including driver version (i.e. CUDA doesn't guarantee the performance characteristics will be maintained between driver versions or a driver update could alter or regress previously observed perf characteristics.) It also doesn't guarantee a successful result, i.e. applying the attribute may not improve the performance of either the targeted kernel or the encapsulating application. 
+         This attribute is a hint only. CUDA makes no functional or performance guarantee. Its applicability can be affected by many different factors, including driver version (i.e. CUDA doesn't guarantee the performance characteristics will be maintained between driver versions or a driver update could alter or regress previously observed perf characteristics.) It also doesn't guarantee a successful result, i.e. applying the attribute may not improve the performance of either the targeted kernel or the encapsulating application.
 
          Valid values for :py:obj:`~.CUlaunchAttributeValue.nvlinkUtilCentricScheduling` are 0 (disabled) and 1 (enabled).
 
@@ -3746,7 +3750,7 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUlibraryOption.CU_LIBRARY_BINARY_IS_PRESERVED
 
 
-        Specifes that the argument `code` passed to :py:obj:`~.cuLibraryLoadData()` will be preserved. Specifying this option will let the driver know that `code` can be accessed at any point until :py:obj:`~.cuLibraryUnload()`. The default behavior is for the driver to allocate and maintain its own copy of `code`. Note that this is only a memory usage optimization hint and the driver can choose to ignore it if required. Specifying this option with :py:obj:`~.cuLibraryLoadFromFile()` is invalid and will return :py:obj:`~.CUDA_ERROR_INVALID_VALUE`.
+        Specifies that the argument ``code`` passed to :py:obj:`~.cuLibraryLoadData()` will be preserved. Specifying this option will let the driver know that ``code`` can be accessed at any point until :py:obj:`~.cuLibraryUnload()`. The default behavior is for the driver to allocate and maintain its own copy of ``code``. Note that this is only a memory usage optimization hint and the driver can choose to ignore it if required. Specifying this option with :py:obj:`~.cuLibraryLoadFromFile()` is invalid and will return :py:obj:`~.CUDA_ERROR_INVALID_VALUE`.
 
 
     .. autoattribute:: cuda.bindings.driver.CUlibraryOption.CU_LIBRARY_NUM_OPTIONS
@@ -3858,7 +3862,11 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUresult.CUDA_ERROR_CONTEXT_ALREADY_CURRENT
 
 
-        This indicated that the context being supplied as a parameter to the API call was already the active context. [Deprecated]
+        This indicated that the context being supplied as a parameter to the API call was already the active context.
+
+
+
+        [Deprecated]
 
 
     .. autoattribute:: cuda.bindings.driver.CUresult.CUDA_ERROR_MAP_FAILED
@@ -4890,7 +4898,7 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_FABRIC
 
 
-        Allows a fabric handle to be used for exporting. (CUmemFabricHandle)
+        Allows a fabric handle to be used for exporting. (:py:obj:`~.CUmemFabricHandle`)
 
 
     .. autoattribute:: cuda.bindings.driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_MAX
@@ -5128,43 +5136,43 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD
 
 
-        (value type = cuuint64_t) Amount of reserved memory in bytes to hold onto before trying to release memory back to the OS. When more than the release threshold bytes of memory are held by the memory pool, the allocator will try to release memory back to the OS on the next call to stream, event or context synchronize. (default 0)
+        (value type = :py:obj:`~.cuuint64_t`) Amount of reserved memory in bytes to hold onto before trying to release memory back to the OS. When more than the release threshold bytes of memory are held by the memory pool, the allocator will try to release memory back to the OS on the next call to stream, event or context synchronize. (default 0)
 
 
     .. autoattribute:: cuda.bindings.driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT
 
 
-        (value type = cuuint64_t) Amount of backing memory currently allocated for the mempool.
+        (value type = :py:obj:`~.cuuint64_t`) Amount of backing memory currently allocated for the mempool.
 
 
     .. autoattribute:: cuda.bindings.driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH
 
 
-        (value type = cuuint64_t) High watermark of backing memory allocated for the mempool since the last time it was reset. High watermark can only be reset to zero.
+        (value type = :py:obj:`~.cuuint64_t`) High watermark of backing memory allocated for the mempool since the last time it was reset. High watermark can only be reset to zero.
 
 
     .. autoattribute:: cuda.bindings.driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_USED_MEM_CURRENT
 
 
-        (value type = cuuint64_t) Amount of memory from the pool that is currently in use by the application.
+        (value type = :py:obj:`~.cuuint64_t`) Amount of memory from the pool that is currently in use by the application.
 
 
     .. autoattribute:: cuda.bindings.driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_USED_MEM_HIGH
 
 
-        (value type = cuuint64_t) High watermark of the amount of memory from the pool that was in use by the application since the last time it was reset. High watermark can only be reset to zero.
+        (value type = :py:obj:`~.cuuint64_t`) High watermark of the amount of memory from the pool that was in use by the application since the last time it was reset. High watermark can only be reset to zero.
 
 
     .. autoattribute:: cuda.bindings.driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_ALLOCATION_TYPE
 
 
-        (value type = CUmemAllocationType) The allocation type of the mempool
+        (value type = :py:obj:`~.CUmemAllocationType`) The allocation type of the mempool
 
 
     .. autoattribute:: cuda.bindings.driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_EXPORT_HANDLE_TYPES
 
 
-        (value type = CUmemAllocationHandleType) Available export handle types for the mempool. For imported pools this value is always CU_MEM_HANDLE_TYPE_NONE as an imported pool cannot be re-exported
+        (value type = :py:obj:`~.CUmemAllocationHandleType`) Available export handle types for the mempool. For imported pools this value is always CU_MEM_HANDLE_TYPE_NONE as an imported pool cannot be re-exported
 
 
     .. autoattribute:: cuda.bindings.driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_LOCATION_ID
@@ -5176,13 +5184,13 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_LOCATION_TYPE
 
 
-        (value type = CUmemLocationType) The location type for the mempool. For imported memory pools where the device is not directly visible to the importing process or pools imported via fabric handles across nodes this will be CU_MEM_LOCATION_TYPE_INVISIBLE.
+        (value type = :py:obj:`~.CUmemLocationType`) The location type for the mempool. For imported memory pools where the device is not directly visible to the importing process or pools imported via fabric handles across nodes this will be CU_MEM_LOCATION_TYPE_INVISIBLE.
 
 
     .. autoattribute:: cuda.bindings.driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_MAX_POOL_SIZE
 
 
-        (value type = cuuint64_t) Maximum size of the pool in bytes, this value may be higher than what was initially passed to cuMemPoolCreate due to alignment requirements. A value of 0 indicates no maximum size. For CU_MEM_ALLOCATION_TYPE_MANAGED and IPC imported pools this value will be system dependent.
+        (value type = :py:obj:`~.cuuint64_t`) Maximum size of the pool in bytes, this value may be higher than what was initially passed to cuMemPoolCreate due to alignment requirements. A value of 0 indicates no maximum size. For CU_MEM_ALLOCATION_TYPE_MANAGED and IPC imported pools this value will be system dependent.
 
 
     .. autoattribute:: cuda.bindings.driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_HW_DECOMPRESS_ENABLED
@@ -5239,7 +5247,7 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUmemcpy3DOperandType.CU_MEMCPY_OPERAND_TYPE_ARRAY
 
 
-        Memcpy operand is a CUarray.
+        Memcpy operand is a :py:obj:`~.CUarray`.
 
 
     .. autoattribute:: cuda.bindings.driver.CUmemcpy3DOperandType.CU_MEMCPY_OPERAND_TYPE_MAX
@@ -5249,25 +5257,25 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUgraphMem_attribute.CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT
 
 
-        (value type = cuuint64_t) Amount of memory, in bytes, currently associated with graphs
+        (value type = :py:obj:`~.cuuint64_t`) Amount of memory, in bytes, currently associated with graphs
 
 
     .. autoattribute:: cuda.bindings.driver.CUgraphMem_attribute.CU_GRAPH_MEM_ATTR_USED_MEM_HIGH
 
 
-        (value type = cuuint64_t) High watermark of memory, in bytes, associated with graphs since the last time it was reset. High watermark can only be reset to zero.
+        (value type = :py:obj:`~.cuuint64_t`) High watermark of memory, in bytes, associated with graphs since the last time it was reset. High watermark can only be reset to zero.
 
 
     .. autoattribute:: cuda.bindings.driver.CUgraphMem_attribute.CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT
 
 
-        (value type = cuuint64_t) Amount of memory, in bytes, currently allocated for use by the CUDA graphs asynchronous allocator.
+        (value type = :py:obj:`~.cuuint64_t`) Amount of memory, in bytes, currently allocated for use by the CUDA graphs asynchronous allocator.
 
 
     .. autoattribute:: cuda.bindings.driver.CUgraphMem_attribute.CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH
 
 
-        (value type = cuuint64_t) High watermark of memory, in bytes, currently allocated for use by the CUDA graphs asynchronous allocator.
+        (value type = :py:obj:`~.cuuint64_t`) High watermark of memory, in bytes, currently allocated for use by the CUDA graphs asynchronous allocator.
 
 .. autoclass:: cuda.bindings.driver.CUgraphChildGraphNodeOwnership
 
@@ -5361,49 +5369,49 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_PARAMS
 
 
-        Adds CUDA_KERNEL_NODE_PARAMS values to output
+        Adds :py:obj:`~.CUDA_KERNEL_NODE_PARAMS` values to output
 
 
     .. autoattribute:: cuda.bindings.driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_MEMCPY_NODE_PARAMS
 
 
-        Adds CUDA_MEMCPY3D values to output
+        Adds :py:obj:`~.CUDA_MEMCPY3D` values to output
 
 
     .. autoattribute:: cuda.bindings.driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_MEMSET_NODE_PARAMS
 
 
-        Adds CUDA_MEMSET_NODE_PARAMS values to output
+        Adds :py:obj:`~.CUDA_MEMSET_NODE_PARAMS` values to output
 
 
     .. autoattribute:: cuda.bindings.driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_HOST_NODE_PARAMS
 
 
-        Adds CUDA_HOST_NODE_PARAMS values to output
+        Adds :py:obj:`~.CUDA_HOST_NODE_PARAMS` values to output
 
 
     .. autoattribute:: cuda.bindings.driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_EVENT_NODE_PARAMS
 
 
-        Adds CUevent handle from record and wait nodes to output
+        Adds :py:obj:`~.CUevent` handle from record and wait nodes to output
 
 
     .. autoattribute:: cuda.bindings.driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_SIGNAL_NODE_PARAMS
 
 
-        Adds CUDA_EXT_SEM_SIGNAL_NODE_PARAMS values to output
+        Adds :py:obj:`~.CUDA_EXT_SEM_SIGNAL_NODE_PARAMS` values to output
 
 
     .. autoattribute:: cuda.bindings.driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_WAIT_NODE_PARAMS
 
 
-        Adds CUDA_EXT_SEM_WAIT_NODE_PARAMS values to output
+        Adds :py:obj:`~.CUDA_EXT_SEM_WAIT_NODE_PARAMS` values to output
 
 
     .. autoattribute:: cuda.bindings.driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_ATTRIBUTES
 
 
-        Adds CUkernelNodeAttrValue values to output
+        Adds :py:obj:`~.CUkernelNodeAttrValue` values to output
 
 
     .. autoattribute:: cuda.bindings.driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_HANDLES
@@ -5466,7 +5474,7 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUgraphInstantiate_flags.CUDA_GRAPH_INSTANTIATE_FLAG_UPLOAD
 
 
-        Automatically upload the graph after instantiation. Only supported by :py:obj:`~.cuGraphInstantiateWithParams`. The upload will be performed using the stream provided in `instantiateParams`.
+        Automatically upload the graph after instantiation. Only supported by :py:obj:`~.cuGraphInstantiateWithParams`. The upload will be performed using the stream provided in ``instantiateParams``.
 
 
     .. autoattribute:: cuda.bindings.driver.CUgraphInstantiate_flags.CUDA_GRAPH_INSTANTIATE_FLAG_DEVICE_LAUNCH
@@ -6441,7 +6449,7 @@ Data types used by CUDA driver
 
 
 
-    Stream handle that can be passed as a CUstream to use an implicit stream with legacy synchronization behavior.
+    Stream handle that can be passed as a :py:obj:`~.CUstream` to use an implicit stream with legacy synchronization behavior.
 
 
 
@@ -6453,7 +6461,7 @@ Data types used by CUDA driver
 
 
 
-    Stream handle that can be passed as a CUstream to use an implicit stream with per-thread synchronization behavior.
+    Stream handle that can be passed as a :py:obj:`~.CUstream` to use an implicit stream with per-thread synchronization behavior.
 
 
 
@@ -6535,19 +6543,19 @@ Data types used by CUDA driver
 
 .. autoattribute:: cuda.bindings.driver.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC
 
-    When the `flags` parameter of :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS` contains this flag, it indicates that signaling an external semaphore object should skip performing appropriate memory synchronization operations over all the external memory objects that are imported as :py:obj:`~.CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF`, which otherwise are performed by default to ensure data coherency with other importers of the same NvSciBuf memory objects.
+    When the ``flags`` parameter of :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS` contains this flag, it indicates that signaling an external semaphore object should skip performing appropriate memory synchronization operations over all the external memory objects that are imported as :py:obj:`~.CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF`, which otherwise are performed by default to ensure data coherency with other importers of the same NvSciBuf memory objects.
 
 .. autoattribute:: cuda.bindings.driver.CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC
 
-    When the `flags` parameter of :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS` contains this flag, it indicates that waiting on an external semaphore object should skip performing appropriate memory synchronization operations over all the external memory objects that are imported as :py:obj:`~.CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF`, which otherwise are performed by default to ensure data coherency with other importers of the same NvSciBuf memory objects.
+    When the ``flags`` parameter of :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS` contains this flag, it indicates that waiting on an external semaphore object should skip performing appropriate memory synchronization operations over all the external memory objects that are imported as :py:obj:`~.CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF`, which otherwise are performed by default to ensure data coherency with other importers of the same NvSciBuf memory objects.
 
 .. autoattribute:: cuda.bindings.driver.CUDA_NVSCISYNC_ATTR_SIGNAL
 
-    When `flags` of :py:obj:`~.cuDeviceGetNvSciSyncAttributes` is set to this, it indicates that application needs signaler specific NvSciSyncAttr to be filled by :py:obj:`~.cuDeviceGetNvSciSyncAttributes`.
+    When ``flags`` of :py:obj:`~.cuDeviceGetNvSciSyncAttributes` is set to this, it indicates that application needs signaler specific NvSciSyncAttr to be filled by :py:obj:`~.cuDeviceGetNvSciSyncAttributes`.
 
 .. autoattribute:: cuda.bindings.driver.CUDA_NVSCISYNC_ATTR_WAIT
 
-    When `flags` of :py:obj:`~.cuDeviceGetNvSciSyncAttributes` is set to this, it indicates that application needs waiter specific NvSciSyncAttr to be filled by :py:obj:`~.cuDeviceGetNvSciSyncAttributes`.
+    When ``flags`` of :py:obj:`~.cuDeviceGetNvSciSyncAttributes` is set to this, it indicates that application needs waiter specific NvSciSyncAttr to be filled by :py:obj:`~.cuDeviceGetNvSciSyncAttributes`.
 
 .. autoattribute:: cuda.bindings.driver.CU_MEM_CREATE_USAGE_TILE_POOL
 
@@ -6571,7 +6579,7 @@ Data types used by CUDA driver
 
 .. autoattribute:: cuda.bindings.driver.CUDA_ARRAY3D_LAYERED
 
-    If set, the CUDA array is a collection of layers, where each layer is either a 1D or a 2D array and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies the number of layers, not the depth of a 3D array.
+    If set, the CUDA array is a collection of layers, where each layer is either a 1D or a 2D array and the Depth member of :py:obj:`~.CUDA_ARRAY3D_DESCRIPTOR` specifies the number of layers, not the depth of a 3D array.
 
 .. autoattribute:: cuda.bindings.driver.CUDA_ARRAY3D_2DARRAY
 
@@ -6643,7 +6651,7 @@ Data types used by CUDA driver
 
 .. autoattribute:: cuda.bindings.driver.CU_LAUNCH_PARAM_END
 
-    End of array terminator for the `extra` parameter to :py:obj:`~.cuLaunchKernel`
+    End of array terminator for the ``extra`` parameter to :py:obj:`~.cuLaunchKernel`
 
 .. autoattribute:: cuda.bindings.driver.CU_LAUNCH_PARAM_BUFFER_POINTER_AS_INT
 
@@ -6651,7 +6659,7 @@ Data types used by CUDA driver
 
 .. autoattribute:: cuda.bindings.driver.CU_LAUNCH_PARAM_BUFFER_POINTER
 
-    Indicator that the next value in the `extra` parameter to :py:obj:`~.cuLaunchKernel` will be a pointer to a buffer containing all kernel parameters used for launching kernel `f`. This buffer needs to honor all alignment/padding requirements of the individual parameters. If :py:obj:`~.CU_LAUNCH_PARAM_BUFFER_SIZE` is not also specified in the `extra` array, then :py:obj:`~.CU_LAUNCH_PARAM_BUFFER_POINTER` will have no effect.
+    Indicator that the next value in the ``extra`` parameter to :py:obj:`~.cuLaunchKernel` will be a pointer to a buffer containing all kernel parameters used for launching kernel ``f``. This buffer needs to honor all alignment/padding requirements of the individual parameters. If :py:obj:`~.CU_LAUNCH_PARAM_BUFFER_SIZE` is not also specified in the ``extra`` array, then :py:obj:`~.CU_LAUNCH_PARAM_BUFFER_POINTER` will have no effect.
 
 .. autoattribute:: cuda.bindings.driver.CU_LAUNCH_PARAM_BUFFER_SIZE_AS_INT
 
@@ -6659,7 +6667,7 @@ Data types used by CUDA driver
 
 .. autoattribute:: cuda.bindings.driver.CU_LAUNCH_PARAM_BUFFER_SIZE
 
-    Indicator that the next value in the `extra` parameter to :py:obj:`~.cuLaunchKernel` will be a pointer to a size_t which contains the size of the buffer specified with :py:obj:`~.CU_LAUNCH_PARAM_BUFFER_POINTER`. It is required that :py:obj:`~.CU_LAUNCH_PARAM_BUFFER_POINTER` also be specified in the `extra` array if the value associated with :py:obj:`~.CU_LAUNCH_PARAM_BUFFER_SIZE` is not zero.
+    Indicator that the next value in the ``extra`` parameter to :py:obj:`~.cuLaunchKernel` will be a pointer to a size_t which contains the size of the buffer specified with :py:obj:`~.CU_LAUNCH_PARAM_BUFFER_POINTER`. It is required that :py:obj:`~.CU_LAUNCH_PARAM_BUFFER_POINTER` also be specified in the ``extra`` array if the value associated with :py:obj:`~.CU_LAUNCH_PARAM_BUFFER_SIZE` is not zero.
 
 .. autoattribute:: cuda.bindings.driver.CU_PARAM_TR_DEFAULT
 
@@ -6685,6 +6693,10 @@ Data types used by CUDA driver
 Error Handling
 --------------
 
+.. MANBRIEF error handling functions of the low-level CUDA driver API (CURRENT_FILE) ENDMANBRIEF
+
+
+
 This section describes the error handling functions of the low-level CUDA driver application programming interface.
 
 .. autofunction:: cuda.bindings.driver.cuGetErrorString
@@ -6693,6 +6705,10 @@ This section describes the error handling functions of the low-level CUDA driver
 Initialization
 --------------
 
+.. MANBRIEF initialization functions of the low-level CUDA driver API (CURRENT_FILE) ENDMANBRIEF
+
+
+
 This section describes the initialization functions of the low-level CUDA driver application programming interface.
 
 .. autofunction:: cuda.bindings.driver.cuInit
@@ -6700,6 +6716,10 @@ This section describes the initialization functions of the low-level CUDA driver
 Version Management
 ------------------
 
+.. MANBRIEF version management functions of the low-level CUDA driver API (CURRENT_FILE) ENDMANBRIEF
+
+
+
 This section describes the version management functions of the low-level CUDA driver application programming interface.
 
 .. autofunction:: cuda.bindings.driver.cuDriverGetVersion
@@ -6707,6 +6727,10 @@ This section describes the version management functions of the low-level CUDA dr
 Device Management
 -----------------
 
+.. MANBRIEF device management functions of the low-level CUDA driver API (CURRENT_FILE) ENDMANBRIEF
+
+
+
 This section describes the device management functions of the low-level CUDA driver application programming interface.
 
 .. autofunction:: cuda.bindings.driver.cuDeviceGet
@@ -6728,6 +6752,10 @@ This section describes the device management functions of the low-level CUDA dri
 Primary Context Management
 --------------------------
 
+.. MANBRIEF primary context management functions of the low-level CUDA driver API (CURRENT_FILE) ENDMANBRIEF
+
+
+
 This section describes the primary context management functions of the low-level CUDA driver application programming interface.
 
 
@@ -6743,6 +6771,10 @@ The primary context is unique per device and shared with the CUDA runtime API. T
 Context Management
 ------------------
 
+.. MANBRIEF context management functions of the low-level CUDA driver API (CURRENT_FILE) ENDMANBRIEF
+
+
+
 This section describes the context management functions of the low-level CUDA driver application programming interface.
 
 
@@ -6776,6 +6808,10 @@ Please note that some functions are described in Primary Context Management sect
 Module Management
 -----------------
 
+.. MANBRIEF module management functions of the low-level CUDA driver API (CURRENT_FILE) ENDMANBRIEF
+
+
+
 This section describes the module management functions of the low-level CUDA driver application programming interface.
 
 .. autoclass:: cuda.bindings.driver.CUmoduleLoadingMode
@@ -6810,6 +6846,10 @@ This section describes the module management functions of the low-level CUDA dri
 Library Management
 ------------------
 
+.. MANBRIEF library management functions of the low-level CUDA driver API (CURRENT_FILE) ENDMANBRIEF
+
+
+
 This section describes the library management functions of the low-level CUDA driver application programming interface.
 
 .. autofunction:: cuda.bindings.driver.cuLibraryLoadData
@@ -6834,6 +6874,10 @@ This section describes the library management functions of the low-level CUDA dr
 Memory Management
 -----------------
 
+.. MANBRIEF memory management functions of the low-level CUDA driver API (CURRENT_FILE) ENDMANBRIEF
+
+
+
 This section describes the memory management functions of the low-level CUDA driver application programming interface.
 
 .. autoclass:: cuda.bindings.driver.CUmemDecompressParams_st
@@ -6944,6 +6988,10 @@ This section describes the memory management functions of the low-level CUDA dri
 Virtual Memory Management
 -------------------------
 
+.. MANBRIEF virtual memory management functions of the low-level CUDA driver API (CURRENT_FILE) ENDMANBRIEF
+
+
+
 This section describes the virtual memory management functions of the low-level CUDA driver application programming interface.
 
 .. autofunction:: cuda.bindings.driver.cuMemAddressReserve
@@ -6964,16 +7012,18 @@ This section describes the virtual memory management functions of the low-level
 Stream Ordered Memory Allocator
 -------------------------------
 
-This section describes the stream ordered memory allocator exposed by the low-level CUDA driver application programming interface.
+.. MANBRIEF Functions for performing allocation and free operations in stream order. Functions for controlling the behavior of the underlying allocator. (CURRENT_FILE) ENDMANBRIEF
 
 
 
+This section describes the stream ordered memory allocator exposed by the low-level CUDA driver application programming interface.
 
 
-**overview**
 
 
 
+**Overview**
+
 The asynchronous allocator allows the user to allocate and free in stream order. All asynchronous accesses of the allocation must happen between the stream executions of the allocation and the free. If the memory is accessed outside of the promised stream order, a use before allocation / use after free error will cause undefined behavior.
 
 The allocator is free to reallocate the memory as long as it can guarantee that compliant memory accesses will not overlap temporally. The allocator may refer to internal stream ordering as well as inter-stream dependencies (such as CUDA events and null stream dependencies) when establishing the temporal guarantee. The allocator may also insert inter-stream dependencies to establish the temporal guarantee.
@@ -6984,8 +7034,6 @@ The allocator is free to reallocate the memory as long as it can guarantee that
 
 **Supported Platforms**
 
-
-
 Whether or not a device supports the integrated stream ordered memory allocator may be queried by calling cuDeviceGetAttribute() with the device attribute CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED
 
 .. autofunction:: cuda.bindings.driver.cuMemFreeAsync
@@ -7009,16 +7057,18 @@ Whether or not a device supports the integrated stream ordered memory allocator
 Multicast Object Management
 ---------------------------
 
-This section describes the CUDA multicast object operations exposed by the low-level CUDA driver application programming interface.
+.. MANBRIEF Functions for creating multicast objects, adding devices to them and binding/unbinding memory (CURRENT_FILE) ENDMANBRIEF
 
 
 
+This section describes the CUDA multicast object operations exposed by the low-level CUDA driver application programming interface.
 
 
-**overview**
 
 
 
+**Overview**
+
 A multicast object created via cuMulticastCreate enables certain memory operations to be broadcast to a team of devices. Devices can be added to a multicast object via cuMulticastAddDevice. Memory can be bound on each participating device via cuMulticastBindMem, cuMulticastBindMem_v2, cuMulticastBindAddr, or cuMulticastBindAddr_v2. Multicast objects can be mapped into a device's virtual address space using the virtual memmory management APIs (see cuMemMap and cuMemSetAccess).
 
 
@@ -7027,8 +7077,6 @@ A multicast object created via cuMulticastCreate enables certain memory operatio
 
 **Supported Platforms**
 
-
-
 Support for multicast on a specific device can be queried using the device attribute CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED
 
 .. autofunction:: cuda.bindings.driver.cuMulticastCreate
@@ -7043,6 +7091,10 @@ Support for multicast on a specific device can be queried using the device attri
 Logical Endpoint
 ----------------
 
+.. MANBRIEF logical endpoint functions of the low-level CUDA driver API (CURRENT_FILE) ENDMANBRIEF
+
+
+
 This section describes the logical endpoint functions of the low-level CUDA driver application programming interface.
 
 .. autoclass:: cuda.bindings.driver.CUlogicalEndpointFabricHandle_st
@@ -7096,15 +7148,17 @@ This section describes the logical endpoint functions of the low-level CUDA driv
 Unified Addressing
 ------------------
 
-This section describes the unified addressing functions of the low-level CUDA driver application programming interface.
+.. MANBRIEF unified addressing functions of the low-level CUDA driver API (CURRENT_FILE) ENDMANBRIEF
+
 
 
+This section describes the unified addressing functions of the low-level CUDA driver application programming interface.
 
 
 
-**Overview**
 
 
+**Overview**
 
 CUDA devices can share a unified address space with the host. For these devices there is no distinction between a device pointer and a host pointer -- the same pointer value may be used to access memory from the host program and from a kernel running on the device (with exceptions enumerated below).
 
@@ -7114,8 +7168,6 @@ CUDA devices can share a unified address space with the host. For these devices
 
 **Supported Platforms**
 
-
-
 Whether or not a device supports unified addressing may be queried by calling cuDeviceGetAttribute() with the device attribute CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING.
 
 Unified addressing is automatically enabled in 64-bit processes
@@ -7126,8 +7178,6 @@ Unified addressing is automatically enabled in 64-bit processes
 
 **Looking Up Information from Pointer Values**
 
-
-
 It is possible to look up information about the memory which backs a pointer value. For instance, one may want to know if a pointer points to host or device memory. As another example, in the case of device memory, one may want to know on which CUDA device the memory resides. These properties may be queried using the function cuPointerGetAttribute()
 
 Since pointers are unique, it is not necessary to specify information about the pointers specified to the various copy functions in the CUDA API. The function cuMemcpy() may be used to perform a copy between two pointers, ignoring whether they point to host or device memory (making cuMemcpyHtoD(), cuMemcpyDtoD(), and cuMemcpyDtoH() unnecessary for devices supporting unified addressing). For multidimensional copies, the memory type CU_MEMORYTYPE_UNIFIED may be used to specify that the CUDA driver should infer the location of the pointer from its value.
@@ -7138,8 +7188,6 @@ Since pointers are unique, it is not necessary to specify information about the
 
 **Automatic Mapping of Host Allocated Host Memory**
 
-
-
 All host memory allocated in all contexts using cuMemAllocHost() and cuMemHostAlloc() is always directly accessible from all contexts on all devices that support unified addressing. This is the case regardless of whether or not the flags CU_MEMHOSTALLOC_PORTABLE and CU_MEMHOSTALLOC_DEVICEMAP are specified.
 
 The pointer value through which allocated host memory may be accessed in kernels on all devices that support unified addressing is the same as the pointer value through which that memory is accessed on the host, so it is not necessary to call cuMemHostGetDevicePointer() to get the device pointer for these allocations.
@@ -7152,8 +7200,6 @@ Note that this is not the case for memory allocated using the flag CU_MEMHOSTALL
 
 **Automatic Registration of Peer Memory**
 
-
-
 Upon enabling direct access from a context that supports unified addressing to another peer context that supports unified addressing using cuCtxEnablePeerAccess() all memory allocated in the peer context using cuMemAlloc() and cuMemAllocPitch() will immediately be accessible by the current context. The device pointer value through which any peer memory may be accessed in the current context is the same pointer value through which that memory may be accessed in the peer context.
 
 
@@ -7162,8 +7208,6 @@ Upon enabling direct access from a context that supports unified addressing to a
 
 **Exceptions, Disjoint Addressing**
 
-
-
 Not all memory may be accessed on devices through the same pointer value through which they are accessed on the host. These exceptions are host memory registered using cuMemHostRegister() and host memory allocated using the flag CU_MEMHOSTALLOC_WRITECOMBINED. For these exceptions, there exists a distinct host and device address for the memory. The device address is guaranteed to not overlap any valid host pointer range and is guaranteed to have the same value across all contexts that support unified addressing.
 
 This device address may be queried using cuMemHostGetDevicePointer() when a context using unified addressing is current. Either the host or the unified device pointer value may be used to refer to this memory through cuMemcpy() and similar functions using the CU_MEMORYTYPE_UNIFIED memory type.
@@ -7182,6 +7226,10 @@ This device address may be queried using cuMemHostGetDevicePointer() when a cont
 Stream Management
 -----------------
 
+.. MANBRIEF stream management functions of the low-level CUDA driver API (CURRENT_FILE) ENDMANBRIEF
+
+
+
 This section describes the stream management functions of the low-level CUDA driver application programming interface.
 
 .. autoclass:: cuda.bindings.driver.CUgraphRecaptureStatus
@@ -7235,6 +7283,10 @@ This section describes the stream management functions of the low-level CUDA dri
 Event Management
 ----------------
 
+.. MANBRIEF event management functions of the low-level CUDA driver API (CURRENT_FILE) ENDMANBRIEF
+
+
+
 This section describes the event management functions of the low-level CUDA driver application programming interface.
 
 .. autofunction:: cuda.bindings.driver.cuEventCreate
@@ -7248,6 +7300,10 @@ This section describes the event management functions of the low-level CUDA driv
 External Resource Interoperability
 ----------------------------------
 
+.. MANBRIEF External resource interoperability functions of the low-level CUDA driver API (CURRENT_FILE) ENDMANBRIEF
+
+
+
 This section describes the external resource interoperability functions of the low-level CUDA driver application programming interface.
 
 .. autofunction:: cuda.bindings.driver.cuImportExternalMemory
@@ -7262,6 +7318,10 @@ This section describes the external resource interoperability functions of the l
 Stream Memory Operations
 ------------------------
 
+.. MANBRIEF Stream memory operations of the low-level CUDA driver API (CURRENT_FILE) ENDMANBRIEF
+
+
+
 This section describes the stream memory operations of the low-level CUDA driver application programming interface.
 
 
@@ -7297,6 +7357,10 @@ Warning: Improper use of these APIs may deadlock the application. Synchronizatio
 Execution Control
 -----------------
 
+.. MANBRIEF execution control functions of the low-level CUDA driver API (CURRENT_FILE) ENDMANBRIEF
+
+
+
 This section describes the execution control functions of the low-level CUDA driver application programming interface.
 
 .. autoclass:: cuda.bindings.driver.CUfunctionLoadingState
@@ -7327,6 +7391,10 @@ This section describes the execution control functions of the low-level CUDA dri
 Graph Management
 ----------------
 
+.. MANBRIEF graph management functions of the low-level CUDA driver API (CURRENT_FILE) ENDMANBRIEF
+
+
+
 This section describes the graph management functions of the low-level CUDA driver application programming interface.
 
 .. autofunction:: cuda.bindings.driver.cuGraphCreate
@@ -7421,6 +7489,10 @@ This section describes the graph management functions of the low-level CUDA driv
 Occupancy
 ---------
 
+.. MANBRIEF occupancy calculation functions of the low-level CUDA driver API (CURRENT_FILE) ENDMANBRIEF
+
+
+
 This section describes the occupancy calculation functions of the low-level CUDA driver application programming interface.
 
 .. autofunction:: cuda.bindings.driver.cuOccupancyMaxActiveBlocksPerMultiprocessor
@@ -7434,6 +7506,10 @@ This section describes the occupancy calculation functions of the low-level CUDA
 Texture Object Management
 -------------------------
 
+.. MANBRIEF texture object management functions of the low-level CUDA driver API (CURRENT_FILE) ENDMANBRIEF
+
+
+
 This section describes the texture object management functions of the low-level CUDA driver application programming interface. The texture object API is only supported on devices of compute capability 3.0 or higher.
 
 .. autofunction:: cuda.bindings.driver.cuTexObjectCreate
@@ -7445,6 +7521,10 @@ This section describes the texture object management functions of the low-level
 Surface Object Management
 -------------------------
 
+.. MANBRIEF surface object management functions of the low-level CUDA driver API (CURRENT_FILE) ENDMANBRIEF
+
+
+
 This section describes the surface object management functions of the low-level CUDA driver application programming interface. The surface object API is only supported on devices of compute capability 3.0 or higher.
 
 .. autofunction:: cuda.bindings.driver.cuSurfObjectCreate
@@ -7454,6 +7534,10 @@ This section describes the surface object management functions of the low-level
 Tensor Map Object Managment
 ---------------------------
 
+.. MANBRIEF tensor map object management functions of the low-level CUDA driver API (CURRENT_FILE) ENDMANBRIEF
+
+
+
 This section describes the tensor map object management functions of the low-level CUDA driver application programming interface. The tensor core API is only supported on devices of compute capability 9.0 or higher.
 
 .. autofunction:: cuda.bindings.driver.cuTensorMapEncodeTiled
@@ -7464,6 +7548,10 @@ This section describes the tensor map object management functions of the low-lev
 Peer Context Memory Access
 --------------------------
 
+.. MANBRIEF direct peer context memory access functions of the low-level CUDA driver API (CURRENT_FILE) ENDMANBRIEF
+
+
+
 This section describes the direct peer context memory access functions of the low-level CUDA driver application programming interface.
 
 .. autofunction:: cuda.bindings.driver.cuDeviceCanAccessPeer
@@ -7475,6 +7563,10 @@ This section describes the direct peer context memory access functions of the lo
 Graphics Interoperability
 -------------------------
 
+.. MANBRIEF graphics interoperability functions of the low-level CUDA driver API (CURRENT_FILE) ENDMANBRIEF
+
+
+
 This section describes the graphics interoperability functions of the low-level CUDA driver application programming interface.
 
 .. autofunction:: cuda.bindings.driver.cuGraphicsUnregisterResource
@@ -7488,6 +7580,10 @@ This section describes the graphics interoperability functions of the low-level
 Driver Entry Point Access
 -------------------------
 
+.. MANBRIEF driver entry point access functions of the low-level CUDA driver API (CURRENT_FILE) ENDMANBRIEF
+
+
+
 This section describes the driver entry point access functions of the low-level CUDA driver application programming interface.
 
 .. autofunction:: cuda.bindings.driver.cuGetProcAddress
@@ -7495,6 +7591,10 @@ This section describes the driver entry point access functions of the low-level
 Coredump Attributes Control API
 -------------------------------
 
+.. MANBRIEF coredump attribute control functions for the low-level CUDA API (CURRENT_FILE) ENDMANBRIEF
+
+
+
 This section describes the coredump attribute control functions of the low-level CUDA driver application programming interface.
 
 .. autoclass:: cuda.bindings.driver.CUcoredumpSettings
@@ -7573,6 +7673,10 @@ This section describes the coredump attribute control functions of the low-level
 Green Contexts
 --------------
 
+.. MANBRIEF Driver level API for creation and manipulation of green contexts (CURRENT_FILE) ENDMANBRIEF
+
+
+
 This section describes the APIs for creation and manipulation of green contexts in the CUDA driver. Green contexts are a lightweight alternative to traditional contexts, that can be used to select a subset of device resources. This allows the developer to, for example, select SMs from distinct spatial partitions of the GPU and target them via CUDA stream operations, kernel launches, etc.
 
 
@@ -7693,9 +7797,9 @@ Workqueues
 
 
 
-For ``CU_DEV_RESOURCE_TYPE_WORKQUEUE_CONFIG``\ , the resource specifies the expected maximum number of concurrent stream-ordered workloads via the ``wqConcurrencyLimit``\  field. The ``sharingScope``\  field determines how workqueue resources are shared:
+For ``CU_DEV_RESOURCE_TYPE_WORKQUEUE_CONFIG``, the resource specifies the expected maximum number of concurrent stream-ordered workloads via the ``wqConcurrencyLimit`` field. The ``sharingScope`` field determines how workqueue resources are shared:
 
-- ``CU_WORKQUEUE_SCOPE_DEVICE_CTX:``\  Use all shared workqueue resources across all contexts (default driver behavior).
+- ``CU_WORKQUEUE_SCOPE_DEVICE_CTX``: Use all shared workqueue resources across all contexts (default driver behavior).
 
 
 
@@ -7703,7 +7807,7 @@ For ``CU_DEV_RESOURCE_TYPE_WORKQUEUE_CONFIG``\ , the resource specifies the expe
 
 
 
-- ``CU_WORKQUEUE_SCOPE_GREEN_CTX_BALANCED:``\  When possible, use non-overlapping workqueue resources with other balanced green contexts.
+- ``CU_WORKQUEUE_SCOPE_GREEN_CTX_BALANCED``: When possible, use non-overlapping workqueue resources with other balanced green contexts.
 
 
 
@@ -7719,7 +7823,7 @@ The maximum concurrency limit depends on ::CUDA_DEVICE_MAX_CONNECTIONS and can b
 
 
 
-For ``CU_DEV_RESOURCE_TYPE_WORKQUEUE``\ , the resource represents a pre-existing workqueue that can be retrieved from existing contexts or green contexts. This allows reusing workqueue resources across different green contexts.
+For ``CU_DEV_RESOURCE_TYPE_WORKQUEUE``, the resource represents a pre-existing workqueue that can be retrieved from existing contexts or green contexts. This allows reusing workqueue resources across different green contexts.
 
 
 
@@ -7737,7 +7841,7 @@ Even if the green contexts have disjoint SM partitions, it is not guaranteed tha
 
 Additionally, there are two known scenarios, where its possible for the workload to run on more SMs than was provisioned (but never less).
 
-- On Volta+ MPS: When ``CUDA_MPS_ACTIVE_THREAD_PERCENTAGE``\  is used, the set of SMs that are used for running kernels can be scaled up to the value of SMs used for the MPS client.
+- On Volta+ MPS: When ``CUDA_MPS_ACTIVE_THREAD_PERCENTAGE`` is used, the set of SMs that are used for running kernels can be scaled up to the value of SMs used for the MPS client.
 
 
 
@@ -7845,6 +7949,10 @@ Additionally, there are two known scenarios, where its possible for the workload
 Error Log Management Functions
 ------------------------------
 
+.. MANBRIEF error log management functions for the low-level CUDA API (CURRENT_FILE) ENDMANBRIEF
+
+
+
 This section describes the error log management functions of the low-level CUDA driver application programming interface.
 
 .. autoclass:: cuda.bindings.driver.CUlogLevel
@@ -7870,7 +7978,7 @@ CUDA API versioning support
 
 
 
-
+.. MANBRIEF CUDA checkpoint and restore functionality of the low-level CUDA driver API (CURRENT_FILE) ENDMANBRIEF
 
 
 
@@ -7894,6 +8002,10 @@ Checkpoint and restore capabilities are currently restricted to Linux.
 Profiler Control
 ----------------
 
+.. MANBRIEF profiler control functions of the low-level CUDA driver API (CURRENT_FILE) ENDMANBRIEF
+
+
+
 This section describes the profiler control functions of the low-level CUDA driver application programming interface.
 
 .. autofunction:: cuda.bindings.driver.cuProfilerStart
@@ -7902,6 +8014,10 @@ This section describes the profiler control functions of the low-level CUDA driv
 EGL Interoperability
 --------------------
 
+.. MANBRIEF EGL interoperability functions of the low-level CUDA driver API (CURRENT_FILE) ENDMANBRIEF
+
+
+
 This section describes the EGL interoperability functions of the low-level CUDA driver application programming interface.
 
 .. autofunction:: cuda.bindings.driver.cuGraphicsEGLRegisterImage
@@ -7920,6 +8036,10 @@ This section describes the EGL interoperability functions of the low-level CUDA
 OpenGL Interoperability
 -----------------------
 
+.. MANBRIEF OpenGL interoperability functions of the low-level CUDA driver API (CURRENT_FILE) ENDMANBRIEF
+
+
+
 This section describes the OpenGL interoperability functions of the low-level CUDA driver application programming interface. Note that mapping of OpenGL resources is performed with the graphics API agnostic, resource mapping interface described in Graphics Interoperability.
 
 .. autoclass:: cuda.bindings.driver.CUGLDeviceList
@@ -7948,6 +8068,10 @@ This section describes the OpenGL interoperability functions of the low-level CU
 VDPAU Interoperability
 ----------------------
 
+.. MANBRIEF VDPAU interoperability functions of the low-level CUDA driver API (CURRENT_FILE) ENDMANBRIEF
+
+
+
 This section describes the VDPAU interoperability functions of the low-level CUDA driver application programming interface.
 
 .. autofunction:: cuda.bindings.driver.cuVDPAUGetDevice
diff --git a/cuda_bindings/docs/source/module/runtime.rst b/cuda_bindings/docs/source/module/runtime.rst
index 29da14b40d..dcfd5eaac5 100644
--- a/cuda_bindings/docs/source/module/runtime.rst
+++ b/cuda_bindings/docs/source/module/runtime.rst
@@ -161,13 +161,21 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidHostPointer
 
 
-        This indicates that at least one host pointer passed to the API call is not a valid host pointer. [Deprecated]
+        This indicates that at least one host pointer passed to the API call is not a valid host pointer.
+
+
+
+        [Deprecated]
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidDevicePointer
 
 
-        This indicates that at least one device pointer passed to the API call is not a valid device pointer. [Deprecated]
+        This indicates that at least one device pointer passed to the API call is not a valid device pointer.
+
+
+
+        [Deprecated]
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidTexture
@@ -197,25 +205,41 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorAddressOfConstant
 
 
-        This indicated that the user has taken the address of a constant variable, which was forbidden up until the CUDA 3.1 release. [Deprecated]
+        This indicated that the user has taken the address of a constant variable, which was forbidden up until the CUDA 3.1 release.
+
+
+
+        [Deprecated]
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorTextureFetchFailed
 
 
-        This indicated that a texture fetch was not able to be performed. This was previously used for device emulation of texture operations. [Deprecated]
+        This indicated that a texture fetch was not able to be performed. This was previously used for device emulation of texture operations.
+
+
+
+        [Deprecated]
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorTextureNotBound
 
 
-        This indicated that a texture was not bound for access. This was previously used for device emulation of texture operations. [Deprecated]
+        This indicated that a texture was not bound for access. This was previously used for device emulation of texture operations.
+
+
+
+        [Deprecated]
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorSynchronizationError
 
 
-        This indicated that a synchronization operation had failed. This was previously used for some device emulation functions. [Deprecated]
+        This indicated that a synchronization operation had failed. This was previously used for some device emulation functions.
+
+
+
+        [Deprecated]
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidFilterSetting
@@ -233,19 +257,31 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorMixedDeviceExecution
 
 
-        Mixing of device and device emulation code was not allowed. [Deprecated]
+        Mixing of device and device emulation code was not allowed.
+
+
+
+        [Deprecated]
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorNotYetImplemented
 
 
-        This indicates that the API call is not yet implemented. Production releases of CUDA will never return this error. [Deprecated]
+        This indicates that the API call is not yet implemented. Production releases of CUDA will never return this error.
+
+
+
+        [Deprecated]
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorMemoryValueTooLarge
 
 
-        This indicated that an emulated device pointer exceeded the 32-bit address range. [Deprecated]
+        This indicated that an emulated device pointer exceeded the 32-bit address range.
+
+
+
+        [Deprecated]
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorStubLibrary
@@ -299,7 +335,7 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorIncompatibleDriverContext
 
 
-        This indicates that the current context is not compatible with this the CUDA Runtime. This can only occur if you are using CUDA Runtime/Driver interoperability and have created an existing Driver context using the driver API. The Driver context may be incompatible either because the Driver context was created using an older version of the API, because the Runtime API call expects a primary driver context and the Driver context is not primary, or because the Driver context has been destroyed. Please see `Interactions with the CUDA Driver API`_ for more information.
+        This indicates that the current context is not compatible with the CUDA Runtime. This can only occur if you are using CUDA Runtime/Driver interoperability and have created an existing Driver context using the driver API. The Driver context may be incompatible either because the Driver context was created using an older version of the API, because the Runtime API call expects a primary driver context and the Driver context is not primary, or because the Driver context has been destroyed. Please see `Interactions with the CUDA Driver API`_ for more information.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorMissingConfiguration
@@ -311,7 +347,11 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorPriorLaunchFailure
 
 
-        This indicated that a previous kernel launch failed. This was previously used for device emulation of kernel launches. [Deprecated]
+        This indicated that a previous kernel launch failed. This was previously used for device emulation of kernel launches.
+
+
+
+        [Deprecated]
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorLaunchMaxDepthExceeded
@@ -2870,13 +2910,13 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaMemPoolAttr.cudaMemPoolAttrAllocationType
 
 
-        (value type = cudaMemAllocationType) The allocation type of the mempool
+        (value type = :py:obj:`~.cudaMemAllocationType`) The allocation type of the mempool
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaMemPoolAttr.cudaMemPoolAttrExportHandleTypes
 
 
-        (value type = cudaMemAllocationHandleType) Available export handle types for the mempool. For imported pools this value is always cudaMemHandleTypeNone as an imported pool cannot be re-exported
+        (value type = :py:obj:`~.cudaMemAllocationHandleType`) Available export handle types for the mempool. For imported pools this value is always cudaMemHandleTypeNone as an imported pool cannot be re-exported
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaMemPoolAttr.cudaMemPoolAttrLocationId
@@ -2888,7 +2928,7 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaMemPoolAttr.cudaMemPoolAttrLocationType
 
 
-        (value type = cudaMemLocationType) The location type for the mempool. For imported memory pools where the device is not directly visible to the importing process or pools imported via fabric handles across nodes this will be cudaMemLocationTypeInvisible
+        (value type = :py:obj:`~.cudaMemLocationType`) The location type for the mempool. For imported memory pools where the device is not directly visible to the importing process or pools imported via fabric handles across nodes this will be cudaMemLocationTypeInvisible
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaMemPoolAttr.cudaMemPoolAttrMaxPoolSize
@@ -3009,7 +3049,7 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaMemAllocationHandleType.cudaMemHandleTypeFabric
 
 
-        Allows a fabric handle to be used for exporting. (cudaMemFabricHandle_t)
+        Allows a fabric handle to be used for exporting. (:py:obj:`~.cudaMemFabricHandle_t`)
 
 .. autoclass:: cuda.bindings.runtime.cudaGraphMemAttributeType
 
@@ -3384,7 +3424,7 @@ Data types used by CUDA Runtime
 
         Pointer to a buffer in which to print any log messages that are informational in nature (the buffer size is specified via option :py:obj:`~.cudaJitInfoLogBufferSizeBytes`)
 
-        Option type: char *
+        Option type: char \*
 
         Applies to: compiler and linker
 
@@ -3406,7 +3446,7 @@ Data types used by CUDA Runtime
 
         Pointer to a buffer in which to print any log messages that reflect errors (the buffer size is specified via option :py:obj:`~.cudaJitErrorLogBufferSizeBytes`)
 
-        Option type: char *
+        Option type: char \*
 
         Applies to: compiler and linker
 
@@ -3474,7 +3514,7 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaJitOption.cudaJitCacheMode
 
 
-        Specifies whether to enable caching explicitly (-dlcm) 
+        Specifies whether to enable caching explicitly (-dlcm)
 
         Choice is based on supplied :py:obj:`~.cudaJit_CacheMode`.
 
@@ -3524,7 +3564,7 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaLibraryOption.cudaLibraryBinaryIsPreserved
 
 
-        Specifes that the argument `code` passed to :py:obj:`~.cudaLibraryLoadData()` will be preserved. Specifying this option will let the driver know that `code` can be accessed at any point until :py:obj:`~.cudaLibraryUnload()`. The default behavior is for the driver to allocate and maintain its own copy of `code`. Note that this is only a memory usage optimization hint and the driver can choose to ignore it if required. Specifying this option with :py:obj:`~.cudaLibraryLoadFromFile()` is invalid and will return :py:obj:`~.cudaErrorInvalidValue`.
+        Specifies that the argument ``code`` passed to :py:obj:`~.cudaLibraryLoadData()` will be preserved. Specifying this option will let the driver know that ``code`` can be accessed at any point until :py:obj:`~.cudaLibraryUnload()`. The default behavior is for the driver to allocate and maintain its own copy of ``code``. Note that this is only a memory usage optimization hint and the driver can choose to ignore it if required. Specifying this option with :py:obj:`~.cudaLibraryLoadFromFile()` is invalid and will return :py:obj:`~.cudaErrorInvalidValue`.
 
 .. autoclass:: cuda.bindings.runtime.cudaJit_CacheMode
 
@@ -3594,13 +3634,13 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaKernelFunctionType.cudaKernelFunctionTypeKernel
 
 
-        Function handle is a cudaKernel_t
+        Function handle is a :py:obj:`~.cudaKernel_t`
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaKernelFunctionType.cudaKernelFunctionTypeFunction
 
 
-        Function handle is a cudaFunction_t
+        Function handle is a :py:obj:`~.cudaFunction_t`
 
 .. autoclass:: cuda.bindings.runtime.cudaGraphConditionalHandleFlags
 
@@ -3614,7 +3654,7 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaGraphConditionalNodeType.cudaGraphCondTypeIf
 
 
-        Conditional 'if/else' Node. Body[0] executed if condition is non-zero. If `size` == 2, an optional ELSE graph is created and this is executed if the condition is zero.
+        Conditional 'if/else' Node. Body[0] executed if condition is non-zero. If ``size`` == 2, an optional ELSE graph is created and this is executed if the condition is zero.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaGraphConditionalNodeType.cudaGraphCondTypeWhile
@@ -3778,7 +3818,7 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaGraphDependencyType.cudaGraphDependencyTypeProgrammatic
 
 
-        This dependency type allows the downstream node to use `cudaGridDependencySynchronize()`. It may only be used between kernel nodes, and must be used with either the :py:obj:`~.cudaGraphKernelNodePortProgrammatic` or :py:obj:`~.cudaGraphKernelNodePortLaunchCompletion` outgoing port.
+        This dependency type allows the downstream node to use ``cudaGridDependencySynchronize()``. It may only be used between kernel nodes, and must be used with either the :py:obj:`~.cudaGraphKernelNodePortProgrammatic` or :py:obj:`~.cudaGraphKernelNodePortLaunchCompletion` outgoing port.
 
 .. autoclass:: cuda.bindings.runtime.cudaGraphExecUpdateResult
 
@@ -3970,7 +4010,7 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsEventNodeParams
 
 
-        Adds cudaEvent_t handle from record and wait nodes to output
+        Adds :py:obj:`~.cudaEvent_t` handle from record and wait nodes to output
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsExtSemasSignalNodeParams
@@ -4013,19 +4053,19 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaGraphInstantiateFlags.cudaGraphInstantiateFlagUpload
 
 
-        Automatically upload the graph after instantiation. Only supported by 
+        Automatically upload the graph after instantiation. Only supported by
 
-         :py:obj:`~.cudaGraphInstantiateWithParams`. The upload will be performed using the 
+         :py:obj:`~.cudaGraphInstantiateWithParams`. The upload will be performed using the
 
-         stream provided in `instantiateParams`.
+         stream provided in ``instantiateParams``.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaGraphInstantiateFlags.cudaGraphInstantiateFlagDeviceLaunch
 
 
-        Instantiate the graph to be launchable from the device. This flag can only 
+        Instantiate the graph to be launchable from the device. This flag can only
 
-         be used on platforms which support unified addressing. This flag cannot be 
+         be used on platforms which support unified addressing. This flag cannot be
 
          used in conjunction with cudaGraphInstantiateFlagAutoFreeOnLaunch.
 
@@ -4078,43 +4118,43 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeAccessPolicyWindow
 
 
-        Valid for streams, graph nodes, launches. See :py:obj:`~.cudaLaunchAttributeValue`::accessPolicyWindow.
+        Valid for streams, graph nodes, launches. See :py:obj:`~.cudaLaunchAttributeValue.accessPolicyWindow`.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeCooperative
 
 
-        Valid for graph nodes, launches. See :py:obj:`~.cudaLaunchAttributeValue`::cooperative.
+        Valid for graph nodes, launches. See :py:obj:`~.cudaLaunchAttributeValue.cooperative`.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeSynchronizationPolicy
 
 
-        Valid for streams. See :py:obj:`~.cudaLaunchAttributeValue`::syncPolicy.
+        Valid for streams. See :py:obj:`~.cudaLaunchAttributeValue.syncPolicy`.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeClusterDimension
 
 
-        Valid for graph nodes, launches. See :py:obj:`~.cudaLaunchAttributeValue`::clusterDim.
+        Valid for graph nodes, launches. See :py:obj:`~.cudaLaunchAttributeValue.clusterDim`.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeClusterSchedulingPolicyPreference
 
 
-        Valid for graph nodes, launches. See :py:obj:`~.cudaLaunchAttributeValue`::clusterSchedulingPolicyPreference.
+        Valid for graph nodes, launches. See :py:obj:`~.cudaLaunchAttributeValue.clusterSchedulingPolicyPreference`.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeProgrammaticStreamSerialization
 
 
-        Valid for launches. Setting :py:obj:`~.cudaLaunchAttributeValue`::programmaticStreamSerializationAllowed to non-0 signals that the kernel will use programmatic means to resolve its stream dependency, so that the CUDA runtime should opportunistically allow the grid's execution to overlap with the previous kernel in the stream, if that kernel requests the overlap. The dependent launches can choose to wait on the dependency using the programmatic sync (cudaGridDependencySynchronize() or equivalent PTX instructions).
+        Valid for launches. Setting :py:obj:`~.cudaLaunchAttributeValue.programmaticStreamSerializationAllowed` to non-0 signals that the kernel will use programmatic means to resolve its stream dependency, so that the CUDA runtime should opportunistically allow the grid's execution to overlap with the previous kernel in the stream, if that kernel requests the overlap. The dependent launches can choose to wait on the dependency using the programmatic sync (cudaGridDependencySynchronize() or equivalent PTX instructions).
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeProgrammaticEvent
 
 
-        Valid for launches. Set :py:obj:`~.cudaLaunchAttributeValue`::programmaticEvent to record the event. Event recorded through this launch attribute is guaranteed to only trigger after all block in the associated kernel trigger the event. A block can trigger the event programmatically in a future CUDA release. A trigger can also be inserted at the beginning of each block's execution if triggerAtBlockStart is set to non-0. The dependent launches can choose to wait on the dependency using the programmatic sync (cudaGridDependencySynchronize() or equivalent PTX instructions). Note that dependents (including the CPU thread calling :py:obj:`~.cudaEventSynchronize()`) are not guaranteed to observe the release precisely when it is released. For example, :py:obj:`~.cudaEventSynchronize()` may only observe the event trigger long after the associated kernel has completed. This recording type is primarily meant for establishing programmatic dependency between device tasks. Note also this type of dependency allows, but does not guarantee, concurrent execution of tasks. 
+        Valid for launches. Set :py:obj:`~.cudaLaunchAttributeValue.programmaticEvent` to record the event. Event recorded through this launch attribute is guaranteed to only trigger after all blocks in the associated kernel trigger the event. A block can trigger the event programmatically in a future CUDA release. A trigger can also be inserted at the beginning of each block's execution if triggerAtBlockStart is set to non-0. The dependent launches can choose to wait on the dependency using the programmatic sync (cudaGridDependencySynchronize() or equivalent PTX instructions). Note that dependents (including the CPU thread calling :py:obj:`~.cudaEventSynchronize()`) are not guaranteed to observe the release precisely when it is released. For example, :py:obj:`~.cudaEventSynchronize()` may only observe the event trigger long after the associated kernel has completed. This recording type is primarily meant for establishing programmatic dependency between device tasks. Note also this type of dependency allows, but does not guarantee, concurrent execution of tasks.
 
          The event supplied must not be an interprocess or interop event. The event must disable timing (i.e. must be created with the :py:obj:`~.cudaEventDisableTiming` flag set).
 
@@ -4122,39 +4162,39 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributePriority
 
 
-        Valid for streams, graph nodes, launches. See :py:obj:`~.cudaLaunchAttributeValue`::priority.
+        Valid for streams, graph nodes, launches. See :py:obj:`~.cudaLaunchAttributeValue.priority`.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeMemSyncDomainMap
 
 
-        Valid for streams, graph nodes, launches. See :py:obj:`~.cudaLaunchAttributeValue`::memSyncDomainMap.
+        Valid for streams, graph nodes, launches. See :py:obj:`~.cudaLaunchAttributeValue.memSyncDomainMap`.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeMemSyncDomain
 
 
-        Valid for streams, graph nodes, launches. See :py:obj:`~.cudaLaunchAttributeValue`::memSyncDomain.
+        Valid for streams, graph nodes, launches. See :py:obj:`~.cudaLaunchAttributeValue.memSyncDomain`.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributePreferredClusterDimension
 
 
-        Valid for graph nodes and launches. Set :py:obj:`~.cudaLaunchAttributeValue`::preferredClusterDim to allow the kernel launch to specify a preferred substitute cluster dimension. Blocks may be grouped according to either the dimensions specified with this attribute (grouped into a "preferred substitute cluster"), or the one specified with :py:obj:`~.cudaLaunchAttributeClusterDimension` attribute (grouped into a "regular cluster"). The cluster dimensions of a "preferred substitute cluster" shall be an integer multiple greater than zero of the regular cluster dimensions. The device will attempt - on a best-effort basis - to group thread blocks into preferred clusters over grouping them into regular clusters. When it deems necessary (primarily when the device temporarily runs out of physical resources to launch the larger preferred clusters), the device may switch to launch the regular clusters instead to attempt to utilize as much of the physical device resources as possible. 
+        Valid for graph nodes and launches. Set :py:obj:`~.cudaLaunchAttributeValue.preferredClusterDim` to allow the kernel launch to specify a preferred substitute cluster dimension. Blocks may be grouped according to either the dimensions specified with this attribute (grouped into a "preferred substitute cluster"), or the one specified with :py:obj:`~.cudaLaunchAttributeClusterDimension` attribute (grouped into a "regular cluster"). The cluster dimensions of a "preferred substitute cluster" shall be an integer multiple greater than zero of the regular cluster dimensions. The device will attempt - on a best-effort basis - to group thread blocks into preferred clusters over grouping them into regular clusters. When it deems necessary (primarily when the device temporarily runs out of physical resources to launch the larger preferred clusters), the device may switch to launch the regular clusters instead to attempt to utilize as much of the physical device resources as possible.
 
-         Each type of cluster will have its enumeration / coordinate setup as if the grid consists solely of its type of cluster. For example, if the preferred substitute cluster dimensions double the regular cluster dimensions, there might be simultaneously a regular cluster indexed at (1,0,0), and a preferred cluster indexed at (1,0,0). In this example, the preferred substitute cluster (1,0,0) replaces regular clusters (2,0,0) and (3,0,0) and groups their blocks. 
+         Each type of cluster will have its enumeration / coordinate setup as if the grid consists solely of its type of cluster. For example, if the preferred substitute cluster dimensions double the regular cluster dimensions, there might be simultaneously a regular cluster indexed at (1,0,0), and a preferred cluster indexed at (1,0,0). In this example, the preferred substitute cluster (1,0,0) replaces regular clusters (2,0,0) and (3,0,0) and groups their blocks.
 
-         This attribute will only take effect when a regular cluster dimension has been specified. The preferred substitute cluster dimension must be an integer multiple greater than zero of the regular cluster dimension and must divide the grid. It must also be no more than `maxBlocksPerCluster`, if it is set in the kernel's `__launch_bounds__`. Otherwise it must be less than the maximum value the driver can support. Otherwise, setting this attribute to a value physically unable to fit on any particular device is permitted.
+         This attribute will only take effect when a regular cluster dimension has been specified. The preferred substitute cluster dimension must be an integer multiple greater than zero of the regular cluster dimension and must divide the grid. It must also be no more than ``maxBlocksPerCluster``, if it is set in the kernel's ``__launch_bounds__``. Otherwise it must be less than the maximum value the driver can support. Otherwise, setting this attribute to a value physically unable to fit on any particular device is permitted.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeLaunchCompletionEvent
 
 
-        Valid for launches. Set :py:obj:`~.cudaLaunchAttributeValue`::launchCompletionEvent to record the event. 
+        Valid for launches. Set :py:obj:`~.cudaLaunchAttributeValue.launchCompletionEvent` to record the event.
 
-         Nominally, the event is triggered once all blocks of the kernel have begun execution. Currently this is a best effort. If a kernel B has a launch completion dependency on a kernel A, B may wait until A is complete. Alternatively, blocks of B may begin before all blocks of A have begun, for example if B can claim execution resources unavailable to A (e.g. they run on different GPUs) or if B is a higher priority than A. Exercise caution if such an ordering inversion could lead to deadlock. 
+         Nominally, the event is triggered once all blocks of the kernel have begun execution. Currently this is a best effort. If a kernel B has a launch completion dependency on a kernel A, B may wait until A is complete. Alternatively, blocks of B may begin before all blocks of A have begun, for example if B can claim execution resources unavailable to A (e.g. they run on different GPUs) or if B is a higher priority than A. Exercise caution if such an ordering inversion could lead to deadlock.
 
-         A launch completion event is nominally similar to a programmatic event with `triggerAtBlockStart` set except that it is not visible to `cudaGridDependencySynchronize()` and can be used with compute capability less than 9.0. 
+         A launch completion event is nominally similar to a programmatic event with ``triggerAtBlockStart`` set except that it is not visible to ``cudaGridDependencySynchronize()`` and can be used with compute capability less than 9.0.
 
          The event supplied must not be an interprocess or interop event. The event must disable timing (i.e. must be created with the :py:obj:`~.cudaEventDisableTiming` flag set).
 
@@ -4162,11 +4202,11 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeDeviceUpdatableKernelNode
 
 
-        Valid for graph nodes, launches. This attribute is graphs-only, and passing it to a launch in a non-capturing stream will result in an error. 
+        Valid for graph nodes, launches. This attribute is graphs-only, and passing it to a launch in a non-capturing stream will result in an error.
 
-         :cudaLaunchAttributeValue::deviceUpdatableKernelNode::deviceUpdatable can only be set to 0 or 1. Setting the field to 1 indicates that the corresponding kernel node should be device-updatable. On success, a handle will be returned via :py:obj:`~.cudaLaunchAttributeValue`::deviceUpdatableKernelNode::devNode which can be passed to the various device-side update functions to update the node's kernel parameters from within another kernel. For more information on the types of device updates that can be made, as well as the relevant limitations thereof, see :py:obj:`~.cudaGraphKernelNodeUpdatesApply`. 
+         :py:obj:`~.cudaLaunchAttributeValue.deviceUpdatableKernelNode.deviceUpdatable` can only be set to 0 or 1. Setting the field to 1 indicates that the corresponding kernel node should be device-updatable. On success, a handle will be returned via :py:obj:`~.cudaLaunchAttributeValue.deviceUpdatableKernelNode.devNode` which can be passed to the various device-side update functions to update the node's kernel parameters from within another kernel. For more information on the types of device updates that can be made, as well as the relevant limitations thereof, see :py:obj:`~.cudaGraphKernelNodeUpdatesApply`.
 
-         Nodes which are device-updatable have additional restrictions compared to regular kernel nodes. Firstly, device-updatable nodes cannot be removed from their graph via :py:obj:`~.cudaGraphDestroyNode`. Additionally, once opted-in to this functionality, a node cannot opt out, and any attempt to set the deviceUpdatable attribute to 0 will result in an error. Device-updatable kernel nodes also cannot have their attributes copied to/from another kernel node via :py:obj:`~.cudaGraphKernelNodeCopyAttributes`. Graphs containing one or more device-updatable nodes also do not allow multiple instantiation, and neither the graph nor its instantiated version can be passed to :py:obj:`~.cudaGraphExecUpdate`. 
+         Nodes which are device-updatable have additional restrictions compared to regular kernel nodes. Firstly, device-updatable nodes cannot be removed from their graph via :py:obj:`~.cudaGraphDestroyNode`. Additionally, once opted-in to this functionality, a node cannot opt out, and any attempt to set the deviceUpdatable attribute to 0 will result in an error. Device-updatable kernel nodes also cannot have their attributes copied to/from another kernel node via :py:obj:`~.cudaGraphKernelNodeCopyAttributes`. Graphs containing one or more device-updatable nodes also do not allow multiple instantiation, and neither the graph nor its instantiated version can be passed to :py:obj:`~.cudaGraphExecUpdate`.
 
          If a graph contains device-updatable nodes and updates those nodes from the device from within the graph, the graph must be uploaded with :py:obj:`~.cuGraphUpload` before it is launched. For such a graph, if host-side executable graph updates are made to the device-updatable nodes, the graph must be uploaded before it is launched again.
 
@@ -4174,27 +4214,27 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributePreferredSharedMemoryCarveout
 
 
-        Valid for launches. On devices where the L1 cache and shared memory use the same hardware resources, setting :py:obj:`~.cudaLaunchAttributeValue`::sharedMemCarveout to a percentage between 0-100 signals sets the shared memory carveout preference in percent of the total shared memory for that kernel launch. This attribute takes precedence over :py:obj:`~.cudaFuncAttributePreferredSharedMemoryCarveout`. This is only a hint, and the driver can choose a different configuration if required for the launch.
+        Valid for launches. On devices where the L1 cache and shared memory use the same hardware resources, setting :py:obj:`~.cudaLaunchAttributeValue.sharedMemCarveout` to a percentage between 0-100 sets the shared memory carveout preference in percent of the total shared memory for that kernel launch. This attribute takes precedence over :py:obj:`~.cudaFuncAttributePreferredSharedMemoryCarveout`. This is only a hint, and the driver can choose a different configuration if required for the launch.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeNvlinkUtilCentricScheduling
 
 
-        Valid for streams, graph nodes, launches. This attribute is a hint to the CUDA runtime that the launch should attempt to make the kernel maximize its NVLINK utilization. 
+        Valid for streams, graph nodes, launches. This attribute is a hint to the CUDA runtime that the launch should attempt to make the kernel maximize its NVLINK utilization.
 
 
 
-         When possible to honor this hint, CUDA will assume each block in the grid launch will carry out an even amount of NVLINK traffic, and make a best-effort attempt to adjust the kernel launch based on that assumption. 
+         When possible to honor this hint, CUDA will assume each block in the grid launch will carry out an even amount of NVLINK traffic, and make a best-effort attempt to adjust the kernel launch based on that assumption.
 
-         This attribute is a hint only. CUDA makes no functional or performance guarantee. Its applicability can be affected by many different factors, including driver version (i.e. CUDA doesn't guarantee the performance characteristics will be maintained between driver versions or a driver update could alter or regress previously observed perf characteristics.) It also doesn't guarantee a successful result, i.e. applying the attribute may not improve the performance of either the targeted kernel or the encapsulating application. 
+         This attribute is a hint only. CUDA makes no functional or performance guarantee. Its applicability can be affected by many different factors, including driver version (i.e. CUDA doesn't guarantee the performance characteristics will be maintained between driver versions or a driver update could alter or regress previously observed perf characteristics.) It also doesn't guarantee a successful result, i.e. applying the attribute may not improve the performance of either the targeted kernel or the encapsulating application.
 
-         Valid values for :py:obj:`~.cudaLaunchAttributeValue`::nvlinkUtilCentricScheduling are 0 (disabled) and 1 (enabled).
+         Valid values for :py:obj:`~.cudaLaunchAttributeValue.nvlinkUtilCentricScheduling` are 0 (disabled) and 1 (enabled).
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributePortableClusterSizeMode
 
 
-        Valid for graph nodes, launches. This indicates whether the kernel launch is allowed to use a non-portable cluster size. Valid values for :py:obj:`~.cudaLaunchAttributeValue`::portableClusterSizeMode are values for :py:obj:`~.cudaLaunchAttributePortableClusterMode` Any other value will return :py:obj:`~.cudaErrorInvalidValue`
+        Valid for graph nodes, launches. This indicates whether the kernel launch is allowed to use a non-portable cluster size. Valid values for :py:obj:`~.cudaLaunchAttributeValue.portableClusterSizeMode` are values for :py:obj:`~.cudaLaunchAttributePortableClusterMode`. Any other value will return :py:obj:`~.cudaErrorInvalidValue`.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeSharedMemoryMode
@@ -5136,7 +5176,7 @@ Data types used by CUDA Runtime
 
 
 
-    Stream handle that can be passed as a cudaStream_t to use an implicit stream with legacy synchronization behavior.
+    Stream handle that can be passed as a :py:obj:`~.cudaStream_t` to use an implicit stream with legacy synchronization behavior.
 
 
 
@@ -5148,7 +5188,7 @@ Data types used by CUDA Runtime
 
 
 
-    Stream handle that can be passed as a cudaStream_t to use an implicit stream with per-thread synchronization behavior.
+    Stream handle that can be passed as a :py:obj:`~.cudaStream_t` to use an implicit stream with per-thread synchronization behavior.
 
 
 
@@ -5204,7 +5244,11 @@ Data types used by CUDA Runtime
 
 .. autoattribute:: cuda.bindings.runtime.cudaDeviceBlockingSync
 
-    Device flag - Use blocking synchronization [Deprecated]
+    Device flag - Use blocking synchronization
+
+
+
+    [Deprecated]
 
 .. autoattribute:: cuda.bindings.runtime.cudaDeviceScheduleMask
 
@@ -5389,7 +5433,7 @@ impl_private
 
 
 
-
+Device management functions of the CUDA runtime API.
 
 
 
@@ -5433,6 +5477,10 @@ This section describes the device management functions of the CUDA runtime appli
 Error Handling
 --------------
 
+Error handling functions of the CUDA runtime API.
+
+
+
 This section describes the error handling functions of the CUDA runtime application programming interface.
 
 .. autofunction:: cuda.bindings.runtime.cudaGetLastError
@@ -5443,6 +5491,10 @@ This section describes the error handling functions of the CUDA runtime applicat
 Stream Management
 -----------------
 
+Stream management functions of the CUDA runtime API.
+
+
+
 This section describes the stream management functions of the CUDA runtime application programming interface.
 
 .. autoclass:: cuda.bindings.runtime.cudaGraphRecaptureCallbackData
@@ -5477,6 +5529,10 @@ This section describes the stream management functions of the CUDA runtime appli
 Event Management
 ----------------
 
+Event management functions of the CUDA runtime API.
+
+
+
 This section describes the event management functions of the CUDA runtime application programming interface.
 
 .. autofunction:: cuda.bindings.runtime.cudaEventCreate
@@ -5491,6 +5547,10 @@ This section describes the event management functions of the CUDA runtime applic
 External Resource Interoperability
 ----------------------------------
 
+External resource interoperability functions of the CUDA runtime API.
+
+
+
 This section describes the external resource interoperability functions of the CUDA runtime application programming interface.
 
 .. autofunction:: cuda.bindings.runtime.cudaImportExternalMemory
@@ -5505,6 +5565,10 @@ This section describes the external resource interoperability functions of the C
 Execution Control
 -----------------
 
+Execution control functions of the CUDA runtime API.
+
+
+
 This section describes the execution control functions of the CUDA runtime application programming interface.
 
 
@@ -5521,6 +5585,10 @@ Some functions have overloaded C++ API template versions documented separately i
 Occupancy
 ---------
 
+Occupancy calculation functions of the CUDA runtime API.
+
+
+
 This section describes the occupancy calculation functions of the CUDA runtime application programming interface.
 
 
@@ -5538,6 +5606,10 @@ See cudaOccupancyMaxPotentialBlockSize (C++ API), cudaOccupancyMaxPotentialBlock
 Memory Management
 -----------------
 
+Memory management functions of the CUDA runtime API.
+
+
+
 This section describes the memory management functions of the CUDA runtime application programming interface.
 
 
@@ -5608,10 +5680,14 @@ Some functions have overloaded C++ API template versions documented separately i
 Stream Ordered Memory Allocator
 -------------------------------
 
-**overview**
+Functions for performing allocation and free operations in stream order. Functions for controlling the behavior of the underlying allocator.
+
 
 
 
+
+**overview**
+
 The asynchronous allocator allows the user to allocate and free in stream order. All asynchronous accesses of the allocation must happen between the stream executions of the allocation and the free. If the memory is accessed outside of the promised stream order, a use before allocation / use after free error will cause undefined behavior.
 
 The allocator is free to reallocate the memory as long as it can guarantee that compliant memory accesses will not overlap temporally. The allocator may refer to internal stream ordering as well as inter-stream dependencies (such as CUDA events and null stream dependencies) when establishing the temporal guarantee. The allocator may also insert inter-stream dependencies to establish the temporal guarantee.
@@ -5622,8 +5698,6 @@ The allocator is free to reallocate the memory as long as it can guarantee that
 
 **Supported Platforms**
 
-
-
 Whether or not a device supports the integrated stream ordered memory allocator may be queried by calling cudaDeviceGetAttribute() with the device attribute cudaDevAttrMemoryPoolsSupported.
 
 .. autofunction:: cuda.bindings.runtime.cudaMallocAsync
@@ -5647,17 +5721,19 @@ Whether or not a device supports the integrated stream ordered memory allocator
 Unified Addressing
 ------------------
 
-This section describes the unified addressing functions of the CUDA runtime application programming interface.
+Unified addressing functions of the CUDA runtime API.
 
 
 
+This section describes the unified addressing functions of the CUDA runtime application programming interface.
+
 
 
-**Overview**
 
 
+**Overview**
 
-CUDA devices can share a unified address space with the host. 
+CUDA devices can share a unified address space with the host.
 
  For these devices there is no distinction between a device pointer and a host pointer -- the same pointer value may be used to access memory from the host program and from a kernel running on the device (with exceptions enumerated below).
 
@@ -5667,8 +5743,6 @@ CUDA devices can share a unified address space with the host.
 
 **Supported Platforms**
 
-
-
 Whether or not a device supports unified addressing may be queried by calling cudaGetDeviceProperties() with the device property cudaDeviceProp::unifiedAddressing.
 
 Unified addressing is automatically enabled in 64-bit processes .
@@ -5679,11 +5753,9 @@ Unified addressing is automatically enabled in 64-bit processes .
 
 **Looking Up Information from Pointer Values**
 
-
-
 It is possible to look up information about the memory which backs a pointer value. For instance, one may want to know if a pointer points to host or device memory. As another example, in the case of device memory, one may want to know on which CUDA device the memory resides. These properties may be queried using the function cudaPointerGetAttributes()
 
-Since pointers are unique, it is not necessary to specify information about the pointers specified to cudaMemcpy() and other copy functions. 
+Since pointers are unique, it is not necessary to specify information about the pointers specified to cudaMemcpy() and other copy functions.
 
  The copy direction cudaMemcpyDefault may be used to specify that the CUDA runtime should infer the location of the pointer from its value.
 
@@ -5693,11 +5765,9 @@ Since pointers are unique, it is not necessary to specify information about the
 
 **Automatic Mapping of Host Allocated Host Memory**
 
-
-
 All host memory allocated through all devices using cudaMallocHost() and cudaHostAlloc() is always directly accessible from all devices that support unified addressing. This is the case regardless of whether or not the flags cudaHostAllocPortable and cudaHostAllocMapped are specified.
 
-The pointer value through which allocated host memory may be accessed in kernels on all devices that support unified addressing is the same as the pointer value through which that memory is accessed on the host. It is not necessary to call cudaHostGetDevicePointer() to get the device pointer for these allocations. 
+The pointer value through which allocated host memory may be accessed in kernels on all devices that support unified addressing is the same as the pointer value through which that memory is accessed on the host. It is not necessary to call cudaHostGetDevicePointer() to get the device pointer for these allocations.
 
 
 
@@ -5709,8 +5779,6 @@ Note that this is not the case for memory allocated using the flag cudaHostAlloc
 
 **Direct Access of Peer Memory**
 
-
-
 Upon enabling direct access from a device that supports unified addressing to another peer device that supports unified addressing using cudaDeviceEnablePeerAccess() all memory allocated in the peer device using cudaMalloc() and cudaMallocPitch() will immediately be accessible by the current device. The device pointer value through which any peer's memory may be accessed in the current device is the same pointer value through which that memory may be accessed from the peer device.
 
 
@@ -5719,9 +5787,7 @@ Upon enabling direct access from a device that supports unified addressing to an
 
 **Exceptions, Disjoint Addressing**
 
-
-
-Not all memory may be accessed on devices through the same pointer value through which they are accessed on the host. These exceptions are host memory registered using cudaHostRegister() and host memory allocated using the flag cudaHostAllocWriteCombined. For these exceptions, there exists a distinct host and device address for the memory. The device address is guaranteed to not overlap any valid host pointer range and is guaranteed to have the same value across all devices that support unified addressing. 
+Not all memory may be accessed on devices through the same pointer value through which they are accessed on the host. These exceptions are host memory registered using cudaHostRegister() and host memory allocated using the flag cudaHostAllocWriteCombined. For these exceptions, there exists a distinct host and device address for the memory. The device address is guaranteed to not overlap any valid host pointer range and is guaranteed to have the same value across all devices that support unified addressing.
 
 
 
@@ -5732,6 +5798,10 @@ This device address may be queried using cudaHostGetDevicePointer() when a devic
 Peer Device Memory Access
 -------------------------
 
+Peer device memory access functions of the CUDA runtime API.
+
+
+
 This section describes the peer device memory access functions of the CUDA runtime application programming interface.
 
 .. autofunction:: cuda.bindings.runtime.cudaDeviceCanAccessPeer
@@ -5819,6 +5889,10 @@ This section describes the EGL interoperability functions of the CUDA runtime ap
 Graphics Interoperability
 -------------------------
 
+Graphics interoperability functions of the CUDA runtime API.
+
+
+
 This section describes the graphics interoperability functions of the CUDA runtime application programming interface.
 
 .. autofunction:: cuda.bindings.runtime.cudaGraphicsUnregisterResource
@@ -5832,6 +5906,10 @@ This section describes the graphics interoperability functions of the CUDA runti
 Texture Object Management
 -------------------------
 
+Texture object management functions of the CUDA runtime API.
+
+
+
 This section describes the low level texture object management functions of the CUDA runtime application programming interface. The texture object API is only supported on devices of compute capability 3.0 or higher.
 
 .. autofunction:: cuda.bindings.runtime.cudaGetChannelDesc
@@ -5845,6 +5923,10 @@ This section describes the low level texture object management functions of the
 Surface Object Management
 -------------------------
 
+Surface object management functions of the CUDA runtime API.
+
+
+
 This section describes the low level texture object management functions of the CUDA runtime application programming interface. The surface object API is only supported on devices of compute capability 3.0 or higher.
 
 .. autofunction:: cuda.bindings.runtime.cudaCreateSurfaceObject
@@ -5863,6 +5945,10 @@ Version Management
 Error Log Management Functions
 ------------------------------
 
+Error log management interface for the CUDA Runtime and Driver.
+
+
+
 This section describes the error log management functions of the CUDA runtime application programming interface. The Error Log Management interface will operate on both the CUDA Driver and CUDA Runtime.
 
 .. autoclass:: cuda.bindings.runtime.cudaLogsCallback_t
@@ -5875,6 +5961,10 @@ This section describes the error log management functions of the CUDA runtime ap
 Graph Management
 ----------------
 
+Graph management functions of the CUDA runtime API.
+
+
+
 This section describes the graph management functions of CUDA runtime application programming interface.
 
 .. autofunction:: cuda.bindings.runtime.cudaGraphCreate
@@ -5970,6 +6060,10 @@ This section describes the graph management functions of CUDA runtime applicatio
 Driver Entry Point Access
 -------------------------
 
+Driver entry point access functions of the CUDA runtime API.
+
+
+
 This section describes the driver entry point access functions of CUDA runtime application programming interface.
 
 .. autofunction:: cuda.bindings.runtime.cudaGetDriverEntryPoint
@@ -5978,6 +6072,10 @@ This section describes the driver entry point access functions of CUDA runtime a
 Library Management
 ------------------
 
+Library management functions of the CUDA runtime API.
+
+
+
 This section describes the library management functions of the CUDA runtime application programming interface.
 
 .. autofunction:: cuda.bindings.runtime.cudaLibraryLoadData
@@ -5994,15 +6092,17 @@ This section describes the library management functions of the CUDA runtime appl
 Execution Context Management
 ----------------------------
 
-This section describes the execution context management functions of the CUDA runtime application programming interface.
+Execution context management functions of the CUDA runtime API.
+
 
 
+This section describes the execution context management functions of the CUDA runtime application programming interface.
 
 
 
-**Overview**
 
 
+**Overview**
 
 A CUDA execution context cudaExecutionContext_t serves as an abstraction for the contexts exposed by the CUDA Runtime, specifically green contexts and the primary context, and provides a unified programming model and API interface for contexts in the Runtime.
 
@@ -6052,7 +6152,7 @@ Once you have an execution context at hand, you can perform context-level operat
 
 
 
-- Performing context-level graph node operations via cudaGraphAddNode by specifying the context in ``nodeParams``\ . Note that individual node creation APIs, such as cudaGraphAddKernelNode, do not support specifying an execution context.
+- Performing context-level graph node operations via cudaGraphAddNode by specifying the context in ``nodeParams``. Note that individual node creation APIs, such as cudaGraphAddKernelNode, do not support specifying an execution context.
 
 
 
@@ -6072,8 +6172,6 @@ Note: Developers should treat cudaExecutionContext_t as an opaque handle and avo
 
 **Lifetime of CUDA Resources**
 
-
-
 The lifetime of CUDA resources (memory, streams, events, modules, etc) is not tied to the lifetime of the execution context. Their lifetime is tied to the device against which they were created. As such, usage of cudaDeviceReset() should be avoided to persist the lifetime of these resources.
 
 
@@ -6082,16 +6180,12 @@ The lifetime of CUDA resources (memory, streams, events, modules, etc) is not ti
 
 **APIs Operating on Current Context**
 
-
-
 The CUDA runtime does not provide a way to set an execution context as current. Since, the majority of the runtime APIs operate on the current context, we document below how the developer can work with these APIs.
 
 
 
 **APIs Operating on Device Resources**
 
-
-
 To work with these APIs (for example, cudaMalloc, cudaEventCreate, etc), developers are expected to call cudaSetDevice() prior to invoking them. Doing so does not impact functional correctness as these APIs operate on resources that are device-wide. If users have a context handle at hand, they can get the device handle from the context handle using cudaExecutionCtxGetDevice().
 
 
@@ -6100,8 +6194,6 @@ To work with these APIs (for example, cudaMalloc, cudaEventCreate, etc), develop
 
 **APIs Operating on Context Resources**
 
-
-
 These APIs (for example, cudaLaunchKernel, cudaMemcpyAsync, cudaMemsetAsync, etc) take in a stream and resources are inferred from the context bound to the stream at creation. See cudaExecutionCtxStreamCreate for more details. Developers are expected to use the stream-based APIs for context awareness and always pass an explicit stream handle to ensure context-awareness, and avoid reliance on the default NULL stream, which implicitly binds to the current context.
 
 
@@ -6112,8 +6204,6 @@ These APIs (for example, cudaLaunchKernel, cudaMemcpyAsync, cudaMemsetAsync, etc
 
 **Green Contexts**
 
-
-
 Green contexts are a lightweight alternative to traditional contexts, that can be used to select a subset of device resources. This allows the developer to, for example, select SMs from distinct spatial partitions of the GPU and target them via CUDA stream operations, kernel launches, etc.
 
 Here are the broad initial steps to follow to get started:
@@ -6216,9 +6306,9 @@ There are two possible partition operations - with cudaDevSmResourceSplitByCount
 
 Workqueues
 
-For ``cudaDevResourceTypeWorkqueueConfig``\ , the resource specifies the expected maximum number of concurrent stream-ordered workloads via the ``wqConcurrencyLimit``\  field. The ``sharingScope``\  field determines how workqueue resources are shared:
+For ``cudaDevResourceTypeWorkqueueConfig``, the resource specifies the expected maximum number of concurrent stream-ordered workloads via the ``wqConcurrencyLimit`` field. The ``sharingScope`` field determines how workqueue resources are shared:
 
-- ``cudaDevWorkqueueConfigScopeDeviceCtx:``\  Use all shared workqueue resources across all contexts (default driver behavior).
+- ``cudaDevWorkqueueConfigScopeDeviceCtx``: Use all shared workqueue resources across all contexts (default driver behavior).
 
 
 
@@ -6226,7 +6316,7 @@ For ``cudaDevResourceTypeWorkqueueConfig``\ , the resource specifies the expecte
 
 
 
-- ``cudaDevWorkqueueConfigScopeGreenCtxBalanced:``\  When possible, use non-overlapping workqueue resources with other balanced green contexts.
+- ``cudaDevWorkqueueConfigScopeGreenCtxBalanced``: When possible, use non-overlapping workqueue resources with other balanced green contexts.
 
 
 
@@ -6238,17 +6328,17 @@ For ``cudaDevResourceTypeWorkqueueConfig``\ , the resource specifies the expecte
 
 The maximum concurrency limit depends on ::CUDA_DEVICE_MAX_CONNECTIONS and can be queried from the device via cudaDeviceGetDevResource. Configurations may exceed this concurrency limit, but the driver will not guarantee that work submission remains non-overlapping.
 
-For ``cudaDevResourceTypeWorkqueue``\ , the resource represents a pre-existing workqueue that can be retrieved from existing execution contexts. This allows reusing workqueue resources across different execution contexts.
+For ``cudaDevResourceTypeWorkqueue``, the resource represents a pre-existing workqueue that can be retrieved from existing execution contexts. This allows reusing workqueue resources across different execution contexts.
 
 On Concurrency
 
-Even if the green contexts have disjoint SM partitions, it is not guaranteed that the kernels launched in them will run concurrently or have forward progress guarantees. This is due to other resources that could cause a dependency. Using a combination of disjoint SMs and ``cudaDevWorkqueueConfigScopeGreenCtxBalanced``\  workqueue configurations can provide the best chance of avoiding interference. More resources will be added in the future to provide stronger guarantees.
+Even if the green contexts have disjoint SM partitions, it is not guaranteed that the kernels launched in them will run concurrently or have forward progress guarantees. This is due to other resources that could cause a dependency. Using a combination of disjoint SMs and ``cudaDevWorkqueueConfigScopeGreenCtxBalanced`` workqueue configurations can provide the best chance of avoiding interference. More resources will be added in the future to provide stronger guarantees.
 
 Additionally, there are two known scenarios, where its possible for the workload to run on more SMs than was provisioned (but never less).
 
 
 
-- On Volta+ MPS: When ``CUDA_MPS_ACTIVE_THREAD_PERCENTAGE``\  is used, the set of SMs that are used for running kernels can be scaled up to the value of SMs used for the MPS client.
+- On Volta+ MPS: When ``CUDA_MPS_ACTIVE_THREAD_PERCENTAGE`` is used, the set of SMs that are used for running kernels can be scaled up to the value of SMs used for the MPS client.
 
 
 
@@ -6281,26 +6371,28 @@ impl_private
 
 
 
+C++ high level API functions of the CUDA runtime API.
 
 
 
-
-This section describes the C++ high level API functions of the CUDA runtime application programming interface. To use these functions, your application needs to be compiled with the ``nvcc``\  compiler.
+This section describes the C++ high level API functions of the CUDA runtime application programming interface. To use these functions, your application needs to be compiled with the ``nvcc`` compiler.
 
 
 Interactions with the CUDA Driver API
 -------------------------------------
 
-This section describes the interactions between the CUDA Driver API and the CUDA Runtime API
+Interactions between CUDA Driver API and CUDA Runtime API.
 
 
 
+This section describes the interactions between the CUDA Driver API and the CUDA Runtime API
 
 
-**Execution Contexts**
 
 
 
+**Execution Contexts**
+
 The CUDA Runtime provides cudaExecutionContext_t as an abstraction over driver-level contexts—specifically, green contexts and the primary context.
 
 There are two primary ways to obtain an execution context:
@@ -6331,8 +6423,6 @@ Note: Developers should treat cudaExecutionContext_t as an opaque handle and avo
 
 **Primary Context (aka Device Execution Context)**
 
-
-
 The primary context is the default execution context associated with a device in the Runtime. It can be obtained via a call to cudaDeviceGetExecutionCtx(). There is a one-to-one mapping between CUDA devices in the runtime and their primary contexts within a process.
 
 From the CUDA Runtime’s perspective, a device and its primary context are functionally synonymous.
@@ -6345,8 +6435,6 @@ Unless explicitly overridden, either by making a different context current via t
 
 **Initialization and Tear-Down**
 
-
-
 Unless an explicit execution context is specified (see “Execution Context Management” for APIs), CUDA Runtime API calls operate on the CUDA Driver ::CUcontext which is current to the calling host thread. If no ::CUcontext is current to the calling thread when a CUDA Runtime API call which requires an active context is made, then the primary context (device execution context) for a device will be selected, made current to the calling thread, and initialized. The context will be initialized using the parameters specified by the CUDA Runtime API functions cudaSetDeviceFlags(), ::cudaD3D9SetDirect3DDevice(), ::cudaD3D10SetDirect3DDevice(), ::cudaD3D11SetDirect3DDevice(), cudaGLSetGLDevice(), and cudaVDPAUSetVDPAUDevice(). Note that these functions will fail with cudaErrorSetOnActiveProcess if they are called when the primary context for the specified device has already been initialized, except for cudaSetDeviceFlags() which will simply overwrite the previous settings.
 
 The function cudaInitDevice() ensures that the primary context is initialized for the requested device but does not make it current to the calling thread.
@@ -6363,8 +6451,6 @@ Note that primary contexts are shared resources. It is recommended that the prim
 
 **CUcontext Interoperability**
 
-
-
 Note that the use of multiple ::CUcontext s per device within a single process will substantially degrade performance and is strongly discouraged. Instead, it is highly recommended to either use execution contexts cudaExecutionContext_t or the implicit one-to-one device-to-primary context mapping for the process provided by the CUDA Runtime API.
 
 If a non-primary ::CUcontext created by the CUDA Driver API is current to a thread then the CUDA Runtime API calls to that thread will operate on that ::CUcontext, with some exceptions listed below. Interoperability between data types is discussed in the following sections.
@@ -6381,8 +6467,6 @@ Please note that attaching to legacy CUcontext (those with a version of 3010 as
 
 **Interactions between CUstream and cudaStream_t**
 
-
-
 The types ::CUstream and cudaStream_t are identical and may be used interchangeably.
 
 
@@ -6391,8 +6475,6 @@ The types ::CUstream and cudaStream_t are identical and may be used interchangea
 
 **Interactions between CUevent and cudaEvent_t**
 
-
-
 The types ::CUevent and cudaEvent_t are identical and may be used interchangeably.
 
 
@@ -6401,13 +6483,11 @@ The types ::CUevent and cudaEvent_t are identical and may be used interchangeabl
 
 **Interactions between CUarray and cudaArray_t**
 
+The types ::CUarray and struct ::cudaArray \* represent the same data type and may be used interchangeably by casting the two types between each other.
 
+In order to use a ::CUarray in a CUDA Runtime API function which takes a struct ::cudaArray \*, it is necessary to explicitly cast the ::CUarray to a struct ::cudaArray \*.
 
-The types ::CUarray and struct ::cudaArray * represent the same data type and may be used interchangeably by casting the two types between each other.
-
-In order to use a ::CUarray in a CUDA Runtime API function which takes a struct ::cudaArray *, it is necessary to explicitly cast the ::CUarray to a struct ::cudaArray *.
-
-In order to use a struct ::cudaArray * in a CUDA Driver API function which takes a ::CUarray, it is necessary to explicitly cast the struct ::cudaArray * to a ::CUarray .
+In order to use a struct ::cudaArray \* in a CUDA Driver API function which takes a ::CUarray, it is necessary to explicitly cast the struct ::cudaArray \* to a ::CUarray.
 
 
 
@@ -6415,8 +6495,6 @@ In order to use a struct ::cudaArray * in a CUDA Driver API function which takes
 
 **Interactions between CUgraphicsResource and cudaGraphicsResource_t**
 
-
-
 The types ::CUgraphicsResource and cudaGraphicsResource_t represent the same data type and may be used interchangeably by casting the two types between each other.
 
 In order to use a ::CUgraphicsResource in a CUDA Runtime API function which takes a cudaGraphicsResource_t, it is necessary to explicitly cast the ::CUgraphicsResource to a cudaGraphicsResource_t.
@@ -6429,8 +6507,6 @@ In order to use a cudaGraphicsResource_t in a CUDA Driver API function which tak
 
 **Interactions between CUtexObject and cudaTextureObject_t**
 
-
-
 The types ::CUtexObject and cudaTextureObject_t represent the same data type and may be used interchangeably by casting the two types between each other.
 
 In order to use a ::CUtexObject in a CUDA Runtime API function which takes a cudaTextureObject_t, it is necessary to explicitly cast the ::CUtexObject to a cudaTextureObject_t.
@@ -6443,8 +6519,6 @@ In order to use a cudaTextureObject_t in a CUDA Driver API function which takes
 
 **Interactions between CUsurfObject and cudaSurfaceObject_t**
 
-
-
 The types ::CUsurfObject and cudaSurfaceObject_t represent the same data type and may be used interchangeably by casting the two types between each other.
 
 In order to use a ::CUsurfObject in a CUDA Runtime API function which takes a cudaSurfaceObject_t, it is necessary to explicitly cast the ::CUsurfObject to a cudaSurfaceObject_t.
@@ -6457,8 +6531,6 @@ In order to use a cudaSurfaceObject_t in a CUDA Driver API function which takes
 
 **Interactions between CUfunction and cudaFunction_t**
 
-
-
 The types ::CUfunction and cudaFunction_t represent the same data type and may be used interchangeably by casting the two types between each other.
 
 In order to use a cudaFunction_t in a CUDA Driver API function which takes a ::CUfunction, it is necessary to explicitly cast the cudaFunction_t to a ::CUfunction.
@@ -6469,8 +6541,6 @@ In order to use a cudaFunction_t in a CUDA Driver API function which takes a ::C
 
 **Interactions between CUkernel and cudaKernel_t**
 
-
-
 The types ::CUkernel and cudaKernel_t represent the same data type and may be used interchangeably by casting the two types between each other.
 
 In order to use a cudaKernel_t in a CUDA Driver API function which takes a ::CUkernel, it is necessary to explicitly cast the cudaKernel_t to a ::CUkernel.
@@ -6480,6 +6550,10 @@ In order to use a cudaKernel_t in a CUDA Driver API function which takes a ::CUk
 Profiler Control
 ----------------
 
+Profiler control functions of the CUDA runtime API.
+
+
+
 This section describes the profiler control functions of the CUDA runtime application programming interface.
 
 .. autofunction:: cuda.bindings.runtime.cudaProfilerStart