From e6707d9bc98098fb2554d47273bd02902b12762f Mon Sep 17 00:00:00 2001
From: Chris Fregly <chris@fregly.com>
Date: Thu, 11 Jun 2026 15:35:54 -0700
Subject: [PATCH] docs(cute tutorials): note the per-load lowering cost of
 narrow TMEM_LOAD atoms; prefer wider atoms in t2r epilogues

ptxas (CUDA 13.x) lowers each tcgen05.ld of the 32dp32b1x atom through a
per-load LEPC + CALL.ABS.NOINC + WARPSYNC convergence-helper call: 256
loads/thread for a 128x256 fp32 accumulator = 256 helper calls per warp,
which can dominate a small kernel's fixed cost. Switching one line to
SM100_TMEM_LOAD_32dp32b32x (8 loads/thread) measured 1.49x on a full GEMM
kernel on GB300 (sm_103), bit-identical output; x128 regresses (serialized
register writeback). Comment-only change to the five Blackwell tutorials.
---
 examples/cute/tutorial/blackwell/01_mma_sm100.cu          | 8 ++++++++
 examples/cute/tutorial/blackwell/02_mma_tma_sm100.cu      | 8 ++++++++
 .../cute/tutorial/blackwell/03_mma_tma_multicast_sm100.cu | 8 ++++++++
 examples/cute/tutorial/blackwell/04_mma_tma_2sm_sm100.cu  | 8 ++++++++
 examples/cute/tutorial/blackwell/05_mma_tma_epi_sm100.cu  | 8 ++++++++
 5 files changed, 40 insertions(+)

diff --git a/examples/cute/tutorial/blackwell/01_mma_sm100.cu b/examples/cute/tutorial/blackwell/01_mma_sm100.cu
index af3391eb06..3ee938b42e 100644
--- a/examples/cute/tutorial/blackwell/01_mma_sm100.cu
+++ b/examples/cute/tutorial/blackwell/01_mma_sm100.cu
@@ -300,6 +300,14 @@ gemm_device(ATensor mA,                      // (Gemm_M, Gemm_K)
   // Step 3: The Epilogue.
 
   // Create the tiled copy operation for the accumulator (TMEM -> RMEM)
+  // PERF NOTE: 32dp32b1x is the simplest TMEM_LOAD atom but issues one
+  // tcgen05.ld per accumulator column; ptxas (CUDA 13.x) lowers each load
+  // through a per-load convergence-helper call, so the t2r phase can
+  // dominate small/medium kernels. Prefer a wider atom (e.g.
+  // SM100_TMEM_LOAD_32dp32b32x: 32 columns per instruction) in real
+  // epilogues; on sm_103 we measured 1.49x on the whole kernel from this
+  // one line. Very wide atoms (x128) can regress on register-writeback
+  // serialization -- sweep the width for your tile shape.
   TiledCopy tiled_t2r_copy = make_tmem_copy(SM100_TMEM_LOAD_32dp32b1x{}, tCtAcc);
   ThrCopy   thr_t2r_copy   = tiled_t2r_copy.get_slice(threadIdx.x);
 
diff --git a/examples/cute/tutorial/blackwell/02_mma_tma_sm100.cu b/examples/cute/tutorial/blackwell/02_mma_tma_sm100.cu
index 3aaea7060a..651cbb3797 100644
--- a/examples/cute/tutorial/blackwell/02_mma_tma_sm100.cu
+++ b/examples/cute/tutorial/blackwell/02_mma_tma_sm100.cu
@@ -339,6 +339,14 @@ gemm_device(ATensor mA,                      // (Gemm_M, Gemm_K)
   // Step 3: The Epilogue.
 
   // Create the tiled copy operation for the accumulator (TMEM -> RMEM)
+  // PERF NOTE: 32dp32b1x is the simplest TMEM_LOAD atom but issues one
+  // tcgen05.ld per accumulator column; ptxas (CUDA 13.x) lowers each load
+  // through a per-load convergence-helper call, so the t2r phase can
+  // dominate small/medium kernels. Prefer a wider atom (e.g.
+  // SM100_TMEM_LOAD_32dp32b32x: 32 columns per instruction) in real
+  // epilogues; on sm_103 we measured 1.49x on the whole kernel from this
+  // one line. Very wide atoms (x128) can regress on register-writeback
+  // serialization -- sweep the width for your tile shape.
   TiledCopy tiled_t2r_copy = make_tmem_copy(SM100_TMEM_LOAD_32dp32b1x{}, tCtAcc);
   ThrCopy   thr_t2r_copy   = tiled_t2r_copy.get_slice(threadIdx.x);
 
diff --git a/examples/cute/tutorial/blackwell/03_mma_tma_multicast_sm100.cu b/examples/cute/tutorial/blackwell/03_mma_tma_multicast_sm100.cu
index 420072274b..296e0156c5 100644
--- a/examples/cute/tutorial/blackwell/03_mma_tma_multicast_sm100.cu
+++ b/examples/cute/tutorial/blackwell/03_mma_tma_multicast_sm100.cu
@@ -377,6 +377,14 @@ gemm_device(ATensor mA,                      // (Gemm_M, Gemm_K)
   // Step 3: The Epilogue.
 
   // Create the tiled copy operation for the accumulator (TMEM -> RMEM)
+  // PERF NOTE: 32dp32b1x is the simplest TMEM_LOAD atom but issues one
+  // tcgen05.ld per accumulator column; ptxas (CUDA 13.x) lowers each load
+  // through a per-load convergence-helper call, so the t2r phase can
+  // dominate small/medium kernels. Prefer a wider atom (e.g.
+  // SM100_TMEM_LOAD_32dp32b32x: 32 columns per instruction) in real
+  // epilogues; on sm_103 we measured 1.49x on the whole kernel from this
+  // one line. Very wide atoms (x128) can regress on register-writeback
+  // serialization -- sweep the width for your tile shape.
   TiledCopy tiled_t2r_copy = make_tmem_copy(SM100_TMEM_LOAD_32dp32b1x{}, tCtAcc);
   ThrCopy   thr_t2r_copy   = tiled_t2r_copy.get_slice(threadIdx.x);
 
diff --git a/examples/cute/tutorial/blackwell/04_mma_tma_2sm_sm100.cu b/examples/cute/tutorial/blackwell/04_mma_tma_2sm_sm100.cu
index 2d4799f9fe..005d189268 100644
--- a/examples/cute/tutorial/blackwell/04_mma_tma_2sm_sm100.cu
+++ b/examples/cute/tutorial/blackwell/04_mma_tma_2sm_sm100.cu
@@ -379,6 +379,14 @@ gemm_device(ATensor mA,                      // (Gemm_M, Gemm_K)
   // Step 3: The Epilogue.
 
   // Create the tiled copy operation for the accumulator (TMEM -> RMEM)
+  // PERF NOTE: 32dp32b1x is the simplest TMEM_LOAD atom but issues one
+  // tcgen05.ld per accumulator column; ptxas (CUDA 13.x) lowers each load
+  // through a per-load convergence-helper call, so the t2r phase can
+  // dominate small/medium kernels. Prefer a wider atom (e.g.
+  // SM100_TMEM_LOAD_32dp32b32x: 32 columns per instruction) in real
+  // epilogues; on sm_103 we measured 1.49x on the whole kernel from this
+  // one line. Very wide atoms (x128) can regress on register-writeback
+  // serialization -- sweep the width for your tile shape.
   TiledCopy tiled_t2r_copy = make_tmem_copy(SM100_TMEM_LOAD_32dp32b1x{}, tCtAcc);
   ThrCopy   thr_t2r_copy   = tiled_t2r_copy.get_slice(threadIdx.x);
 
diff --git a/examples/cute/tutorial/blackwell/05_mma_tma_epi_sm100.cu b/examples/cute/tutorial/blackwell/05_mma_tma_epi_sm100.cu
index 8a058c5cf5..7fb239e4e0 100644
--- a/examples/cute/tutorial/blackwell/05_mma_tma_epi_sm100.cu
+++ b/examples/cute/tutorial/blackwell/05_mma_tma_epi_sm100.cu
@@ -409,6 +409,14 @@ gemm_device(ATensor mA,                      // (Gemm_M, Gemm_K)
   tma_transaction_bytes = sizeof(make_tensor_like(tGS_sC));
 
   // Partition for TMEM accumulators load (TMEM -> RMEM)
+  // PERF NOTE: 32dp32b1x is the simplest TMEM_LOAD atom but issues one
+  // tcgen05.ld per accumulator column; ptxas (CUDA 13.x) lowers each load
+  // through a per-load convergence-helper call, so the t2r phase can
+  // dominate small/medium kernels. Prefer a wider atom (e.g.
+  // SM100_TMEM_LOAD_32dp32b32x: 32 columns per instruction) in real
+  // epilogues; on sm_103 we measured 1.49x on the whole kernel from this
+  // one line. Very wide atoms (x128) can regress on register-writeback
+  // serialization -- sweep the width for your tile shape.
   TiledCopy t2r_copy = make_tmem_copy(SM100_TMEM_LOAD_32dp32b1x{}, tAcc_epi(_,_0{}));
   ThrCopy   thr_t2r  = t2r_copy.get_slice(threadIdx.x);
   Tensor tTR_tAcc = thr_t2r.partition_S(tAcc_epi);          // (TmemCpy,NumTmemCpy,NumTiles)