From e6707d9bc98098fb2554d47273bd02902b12762f Mon Sep 17 00:00:00 2001 From: Chris Fregly Date: Thu, 11 Jun 2026 15:35:54 -0700 Subject: [PATCH] docs(cute tutorials): note the per-load lowering cost of narrow TMEM_LOAD atoms; prefer wider atoms in t2r epilogues ptxas (CUDA 13.x) lowers each tcgen05.ld of the 32dp32b1x atom through a per-load LEPC + CALL.ABS.NOINC + WARPSYNC convergence-helper call: 256 loads/thread for a 128x256 fp32 accumulator = 256 helper calls per warp, which can dominate a small kernel's fixed cost. Switching one line to SM100_TMEM_LOAD_32dp32b32x (8 loads/thread) measured 1.49x on a full GEMM kernel on GB300 (sm_103), bit-identical output; x128 regresses (serialized register writeback). Comment-only change to the five Blackwell tutorials. --- examples/cute/tutorial/blackwell/01_mma_sm100.cu | 8 ++++++++ examples/cute/tutorial/blackwell/02_mma_tma_sm100.cu | 8 ++++++++ .../cute/tutorial/blackwell/03_mma_tma_multicast_sm100.cu | 8 ++++++++ examples/cute/tutorial/blackwell/04_mma_tma_2sm_sm100.cu | 8 ++++++++ examples/cute/tutorial/blackwell/05_mma_tma_epi_sm100.cu | 8 ++++++++ 5 files changed, 40 insertions(+) diff --git a/examples/cute/tutorial/blackwell/01_mma_sm100.cu b/examples/cute/tutorial/blackwell/01_mma_sm100.cu index af3391eb06..3ee938b42e 100644 --- a/examples/cute/tutorial/blackwell/01_mma_sm100.cu +++ b/examples/cute/tutorial/blackwell/01_mma_sm100.cu @@ -300,6 +300,14 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K) // Step 3: The Epilogue. // Create the tiled copy operation for the accumulator (TMEM -> RMEM) + // PERF NOTE: 32dp32b1x is the simplest TMEM_LOAD atom but issues one + // tcgen05.ld per accumulator column; ptxas (CUDA 13.x) lowers each load + // through a per-load convergence-helper call, so the t2r phase can + // dominate small/medium kernels. Prefer a wider atom (e.g. + // SM100_TMEM_LOAD_32dp32b32x: 32 columns per instruction) in real + // epilogues; on sm_103 we measured 1.49x on the whole kernel from this + // one line. Very wide atoms (x128) can regress on register-writeback + // serialization -- sweep the width for your tile shape. TiledCopy tiled_t2r_copy = make_tmem_copy(SM100_TMEM_LOAD_32dp32b1x{}, tCtAcc); ThrCopy thr_t2r_copy = tiled_t2r_copy.get_slice(threadIdx.x); diff --git a/examples/cute/tutorial/blackwell/02_mma_tma_sm100.cu b/examples/cute/tutorial/blackwell/02_mma_tma_sm100.cu index 3aaea7060a..651cbb3797 100644 --- a/examples/cute/tutorial/blackwell/02_mma_tma_sm100.cu +++ b/examples/cute/tutorial/blackwell/02_mma_tma_sm100.cu @@ -339,6 +339,14 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K) // Step 3: The Epilogue. // Create the tiled copy operation for the accumulator (TMEM -> RMEM) + // PERF NOTE: 32dp32b1x is the simplest TMEM_LOAD atom but issues one + // tcgen05.ld per accumulator column; ptxas (CUDA 13.x) lowers each load + // through a per-load convergence-helper call, so the t2r phase can + // dominate small/medium kernels. Prefer a wider atom (e.g. + // SM100_TMEM_LOAD_32dp32b32x: 32 columns per instruction) in real + // epilogues; on sm_103 we measured 1.49x on the whole kernel from this + // one line. Very wide atoms (x128) can regress on register-writeback + // serialization -- sweep the width for your tile shape. TiledCopy tiled_t2r_copy = make_tmem_copy(SM100_TMEM_LOAD_32dp32b1x{}, tCtAcc); ThrCopy thr_t2r_copy = tiled_t2r_copy.get_slice(threadIdx.x); diff --git a/examples/cute/tutorial/blackwell/03_mma_tma_multicast_sm100.cu b/examples/cute/tutorial/blackwell/03_mma_tma_multicast_sm100.cu index 420072274b..296e0156c5 100644 --- a/examples/cute/tutorial/blackwell/03_mma_tma_multicast_sm100.cu +++ b/examples/cute/tutorial/blackwell/03_mma_tma_multicast_sm100.cu @@ -377,6 +377,14 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K) // Step 3: The Epilogue. // Create the tiled copy operation for the accumulator (TMEM -> RMEM) + // PERF NOTE: 32dp32b1x is the simplest TMEM_LOAD atom but issues one + // tcgen05.ld per accumulator column; ptxas (CUDA 13.x) lowers each load + // through a per-load convergence-helper call, so the t2r phase can + // dominate small/medium kernels. Prefer a wider atom (e.g. + // SM100_TMEM_LOAD_32dp32b32x: 32 columns per instruction) in real + // epilogues; on sm_103 we measured 1.49x on the whole kernel from this + // one line. Very wide atoms (x128) can regress on register-writeback + // serialization -- sweep the width for your tile shape. TiledCopy tiled_t2r_copy = make_tmem_copy(SM100_TMEM_LOAD_32dp32b1x{}, tCtAcc); ThrCopy thr_t2r_copy = tiled_t2r_copy.get_slice(threadIdx.x); diff --git a/examples/cute/tutorial/blackwell/04_mma_tma_2sm_sm100.cu b/examples/cute/tutorial/blackwell/04_mma_tma_2sm_sm100.cu index 2d4799f9fe..005d189268 100644 --- a/examples/cute/tutorial/blackwell/04_mma_tma_2sm_sm100.cu +++ b/examples/cute/tutorial/blackwell/04_mma_tma_2sm_sm100.cu @@ -379,6 +379,14 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K) // Step 3: The Epilogue. // Create the tiled copy operation for the accumulator (TMEM -> RMEM) + // PERF NOTE: 32dp32b1x is the simplest TMEM_LOAD atom but issues one + // tcgen05.ld per accumulator column; ptxas (CUDA 13.x) lowers each load + // through a per-load convergence-helper call, so the t2r phase can + // dominate small/medium kernels. Prefer a wider atom (e.g. + // SM100_TMEM_LOAD_32dp32b32x: 32 columns per instruction) in real + // epilogues; on sm_103 we measured 1.49x on the whole kernel from this + // one line. Very wide atoms (x128) can regress on register-writeback + // serialization -- sweep the width for your tile shape. TiledCopy tiled_t2r_copy = make_tmem_copy(SM100_TMEM_LOAD_32dp32b1x{}, tCtAcc); ThrCopy thr_t2r_copy = tiled_t2r_copy.get_slice(threadIdx.x); diff --git a/examples/cute/tutorial/blackwell/05_mma_tma_epi_sm100.cu b/examples/cute/tutorial/blackwell/05_mma_tma_epi_sm100.cu index 8a058c5cf5..7fb239e4e0 100644 --- a/examples/cute/tutorial/blackwell/05_mma_tma_epi_sm100.cu +++ b/examples/cute/tutorial/blackwell/05_mma_tma_epi_sm100.cu @@ -409,6 +409,14 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K) tma_transaction_bytes = sizeof(make_tensor_like(tGS_sC)); // Partition for TMEM accumulators load (TMEM -> RMEM) + // PERF NOTE: 32dp32b1x is the simplest TMEM_LOAD atom but issues one + // tcgen05.ld per accumulator column; ptxas (CUDA 13.x) lowers each load + // through a per-load convergence-helper call, so the t2r phase can + // dominate small/medium kernels. Prefer a wider atom (e.g. + // SM100_TMEM_LOAD_32dp32b32x: 32 columns per instruction) in real + // epilogues; on sm_103 we measured 1.49x on the whole kernel from this + // one line. Very wide atoms (x128) can regress on register-writeback + // serialization -- sweep the width for your tile shape. TiledCopy t2r_copy = make_tmem_copy(SM100_TMEM_LOAD_32dp32b1x{}, tAcc_epi(_,_0{})); ThrCopy thr_t2r = t2r_copy.get_slice(threadIdx.x); Tensor tTR_tAcc = thr_t2r.partition_S(tAcc_epi); // (TmemCpy,NumTmemCpy,NumTiles)