Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 29 additions & 1 deletion backends/vulkan/op_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -802,9 +802,36 @@ def check_conv_node(node: torch.fx.Node) -> bool:

return True

def pick_conv_storage(
    node: torch.fx.Node,
) -> Tuple[List[utils.TensorRepSet], utils.TensorRepSet]:
    """Select the storage types for a convolution node's inputs and output.

    Returns a (per-arg input storage list, output storage) pair. Only the
    first arg (the image tensor) is assigned a real storage type; every
    remaining arg (weight, bias, non-tensor params) is marked NO_STORAGE.
    """

    def _io_storage(
        rep: utils.TensorRepSet,
    ) -> Tuple[List[utils.TensorRepSet], utils.TensorRepSet]:
        # First input uses `rep`; the rest of the args carry no storage.
        return [rep] + [utils.NO_STORAGE] * (len(node.args) - 1), rep

    input_sizes = node.args[0].meta["val"].size()  # type: ignore[union-attr]
    if len(input_sizes) != 3:
        # 2D convolution: channels-packed texture path
        return _io_storage(utils.CHANNELS_PACKED_TEXTURE)

    # 1D convolution: choose between the texture and buffer implementations.
    weight_sizes = node.args[1].meta["val"].size()  # type: ignore[union-attr]
    groups_arg = node.args[8]  # type: ignore[union-attr]
    num_groups = groups_arg if isinstance(groups_arg, int) else int(groups_arg)
    # Depthwise: one filter per group, weight shaped [C, 1, K].
    depthwise = weight_sizes[1] == 1 and weight_sizes[0] == num_groups
    pointwise = weight_sizes[2] == 1
    if pointwise or depthwise:
        # Pointwise and depthwise 1D conv both have texture implementations
        # using width-packed TEXTURE_3D.
        return _io_storage(utils.WIDTH_PACKED_TEXTURE)
    # General (non-pointwise, non-depthwise) 1D convolution: buffer path
    return _io_storage(utils.CONTIGUOUS_BUFFER)

return OpFeatures(
inputs_storage=[
utils.CHANNELS_PACKED_TEXTURE, # input
utils.CHANNELS_PACKED_TEXTURE, # input (overridden by pick_conv_storage)
utils.NO_STORAGE, # weight (prepacked)
utils.NO_STORAGE, # bias (prepacked)
utils.NO_STORAGE, # stride (non tensor)
Expand All @@ -820,6 +847,7 @@ def check_conv_node(node: torch.fx.Node) -> bool:
supports_resize=True,
supports_prepacking=True,
are_node_inputs_supported_fn=check_conv_node,
pick_io_storage_fn=pick_conv_storage,
)


Expand Down
135 changes: 48 additions & 87 deletions backends/vulkan/runtime/graph/ops/glsl/conv1d.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -8,114 +8,75 @@

#version 450 core

${define_required_extensions("buffer", DTYPE)}

#define PRECISION ${PRECISION}

#define VEC4_T ${texel_type(DTYPE)}
#define T ${buffer_scalar_type(DTYPE)}

#define op(X, A, B) ${OPERATOR}

layout(std430) buffer;
${define_active_storage_type(STORAGE)}

${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}
${layout_declare_tensor(B, "r", "kernel_in", DTYPE, STORAGE)}
${layout_declare_tensor(B, "r", "bias_in", DTYPE, STORAGE)}
layout(std430) buffer;

${layout_declare_ubo(B, "ivec3", "out_limits")}
${layout_declare_ubo(B, "ivec4", "in_sizes")}
#include "indexing.glslh"

${layout_declare_ubo(B,"int", "kernel_size", "int", "stride", "int", "padding", "int", "dilation", "int", "in_group_size", "int", "out_group_size")}
${layout_declare_tensor(B, "w", "t_out", DTYPE, "buffer")}
${layout_declare_tensor(B, "r", "t_in", DTYPE, "buffer")}
${layout_declare_tensor(B, "r", "t_weight", DTYPE, "buffer")}
${layout_declare_tensor(B, "r", "t_bias", DTYPE, "buffer")}

${layout_declare_ubo(B, "BufferMetadata", "out_meta")}
${layout_declare_ubo(B, "BufferMetadata", "in_meta")}
${layout_declare_ubo(B, "ivec4", "weight_strides")}
${layout_declare_ubo(B, "int", "kernel_size", "int", "stride", "int", "padding", "int", "dilation", "int", "in_group_size", "int", "out_group_size")}
${layout_declare_ubo(B, "float", "out_min", "float", "out_max")}

#include "indexing_utils.h"

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")}
const lowp ivec4 out_axis_map = unhash_axis_map(out_layout);

${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")}
const lowp ivec4 in_axis_map = unhash_axis_map(in_layout);

${layout_declare_spec_const(C, "int", "kernel_layout", "DEFAULT_LAYOUT")}
const lowp ivec4 kernel_axis_map = unhash_axis_map(kernel_layout);

${layout_declare_spec_const(C, "int", "bias_layout", "DEFAULT_LAYOUT")}
const lowp ivec4 bias_axis_map = unhash_axis_map(bias_layout);

// Let us define
//
// input = (N, in_C, in_L),
// output = (N, out_C, out_L),
// groups = G,
// kernel = K,
//
// which results in shapes
//
// weight = (out_C, in_C / G, K),
// bias = (out_C,).
//
// This implementation performs N x out_C x out_L shader invocations, where each invocation
// calculates the rolling kernel of the length dimension for each batch, i.e.,
// computes out_L results.
/*
* Computes a 1D convolution over width-packed buffer tensors. Each shader
* invocation computes one output element at position (n, out_c, out_l).
*
* Tensor sizes/strides are in WHCN order:
* out_meta sizes: W=L_out, H=C_out, C=N
* in_meta sizes: W=L_in, H=C_in
*/
void main() {
const ivec3 lpos = ivec3(gl_GlobalInvocationID);
const int out_l = int(gl_GlobalInvocationID.x);
const int out_c = int(gl_GlobalInvocationID.y);
const int n = int(gl_GlobalInvocationID.z);

if (any(greaterThanEqual(lpos, out_limits))) {
if (out_l >= int(size_at(out_meta, 0)) ||
out_c >= int(size_at(out_meta, 1)) ||
n >= int(size_at(out_meta, 2))) {
return;
}

// "out_c" is the output's channel index where we write our result.
// Across shader invocations, this is the only value that varies.
const int out_c = lpos.y;

// "in_c" tracks the input's channel start index.
// We iterate over the input group that corresponds to the output group.
const int c_start = (out_c / out_group_size) * in_group_size;
const int c_end = c_start + in_group_size;

// "out_l" tracks the output's length index where we write our result.
const int out_l = lpos.x;

// "N" is the batch index
const int N = lpos.z;

// "in_l" tracks the input's length start index for our input-kernel overlay
// region.
const int in_l = out_l * stride - padding;
VEC4_T sum = VEC4_T(0);

const int out_c_packed_index = out_c >> 2;
const int out_c_packed_lane = out_c & 0x3;

for (int in_c = c_start; in_c < c_end; ++in_c) {
// "k" tracks the kernel's index for our input-kernel computation.
// It reads out-of-bound zeros, but trying to avoid them complicates
// for-loop conditions, which results in worse performance.

// The weight tensor is channel-packed. It may not be trival choice for
// performance reason since need to have more data fetch. The reason is
// for some sequence model, we found that the weight tensor
// (out_channel, in_channel / group, kernel) often has a large
// out_channel >> kernel, leading to non-optimal use of memory as the
// weight tensor gets very deep. As a mitigation, we use channel-packing
// for the weight tensor, yielding a 75% reduction in weight-tensor
// memory.

// It is possible to further reduce the memory footprint by swapping the
// dimensions, using x extent for out_channel, and y for kernel.
for (int k = 0; k < kernel_size; k++) {
const ivec3 w_lpos = ivec3(k, in_c % in_group_size, out_c_packed_index);
const VEC4_T weight_texel = load_texel_lpos(kernel_in, w_lpos, kernel_axis_map);
VEC4_T weight = VEC4_T(weight_texel[out_c_packed_lane]);

const ivec3 in_pos = lpos_to_pos(ivec3(in_l + k * dilation, in_c, N), in_axis_map);
sum = fma(weight, load_texel(t_in, in_pos), sum);
T sum = T(0);
for (int ic = 0; ic < in_group_size; ic++) {
const int in_c = c_start + ic;
for (int k = 0; k < kernel_size; k++) {
const int in_l = out_l * stride - padding + k * dilation;
if (in_l >= 0 && in_l < int(size_at(in_meta, 0))) {
TensorIndex4D in_tidx;
in_tidx.data = ivec4(in_l, in_c, n, 0);
const uint in_idx = tensor4d_idx_to_linear_idx(in_meta, in_tidx);
// Weight tidx (k, ic, out_c) in [C_out, C_in/g, K]: (k, ic, out_c, 0)
const int w_idx = k * weight_strides.x + ic * weight_strides.y +
out_c * weight_strides.z;
sum += t_in[in_idx] * t_weight[w_idx];
}
}
}

const VEC4_T bias = load_texel_lpos(bias_in, ivec3(out_c_packed_index, 0, 0), bias_axis_map);
const ivec3 out_lpos = ivec3(out_l, out_c, N);
write_texel_lpos(t_out, out_lpos, op(sum + bias[out_c_packed_lane], out_min, out_max), out_axis_map);
sum += T(t_bias[out_c]);

TensorIndex4D out_tidx;
out_tidx.data = ivec4(out_l, out_c, n, 0);
const uint out_idx = tensor4d_idx_to_linear_idx(out_meta, out_tidx);
t_out[out_idx] = op(sum, T(out_min), T(out_max));
}
2 changes: 1 addition & 1 deletion backends/vulkan/runtime/graph/ops/glsl/conv1d.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ conv1d:
parameter_names_with_default_values:
OPERATOR: X
DTYPE: float
STORAGE: texture3d
STORAGE: buffer
generate_variant_forall:
DTYPE:
- VALUE: half
Expand Down
75 changes: 75 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/conv1d_dw.glsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#version 450 core

${define_required_extensions("buffer", DTYPE)}

#define PRECISION ${PRECISION}

#define T ${buffer_scalar_type(DTYPE)}

#define op(X, A, B) ${OPERATOR}

${define_active_storage_type(STORAGE)}

layout(std430) buffer;

#include "indexing.glslh"

${layout_declare_tensor(B, "w", "t_out", DTYPE, "buffer")}
${layout_declare_tensor(B, "r", "t_in", DTYPE, "buffer")}
${layout_declare_tensor(B, "r", "t_weight", DTYPE, "buffer")}
${layout_declare_tensor(B, "r", "t_bias", DTYPE, "buffer")}

${layout_declare_ubo(B, "BufferMetadata", "out_meta")}
${layout_declare_ubo(B, "BufferMetadata", "in_meta")}
${layout_declare_ubo(B, "ivec4", "weight_strides")}
${layout_declare_ubo(B, "int", "kernel_size", "int", "stride", "int", "padding", "int", "dilation", "int", "in_group_size", "int", "out_group_size")}
${layout_declare_ubo(B, "float", "out_min", "float", "out_max")}

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

/*
 * Depthwise 1D convolution over width-packed buffer tensors. One invocation
 * produces the single output element at (batch n, channel c, length out_l).
 *
 * Depthwise means groups == C_in == C_out, so output channel c reads only
 * input channel c. The weight tensor is laid out as [C, 1, K].
 */
void main() {
  const ivec3 tid = ivec3(gl_GlobalInvocationID);
  const int out_l = tid.x;
  const int c = tid.y;
  const int n = tid.z;

  // Discard out-of-range invocations (the dispatch may be rounded up).
  if (out_l >= int(size_at(out_meta, 0)) ||
      c >= int(size_at(out_meta, 1)) ||
      n >= int(size_at(out_meta, 2))) {
    return;
  }

  const int in_len = int(size_at(in_meta, 0));
  // Leftmost input position touched by this output element.
  const int l_base = out_l * stride - padding;

  T acc = T(0);
  for (int k = 0; k < kernel_size; ++k) {
    const int in_l = l_base + k * dilation;
    // Positions outside the input contribute zero (implicit padding).
    if (in_l < 0 || in_l >= in_len) {
      continue;
    }
    TensorIndex4D in_tidx;
    in_tidx.data = ivec4(in_l, c, n, 0);
    // Weight element (k, 0, c) of the [C, 1, K] tensor.
    const int w_idx = k * weight_strides.x + c * weight_strides.z;
    acc += t_in[tensor4d_idx_to_linear_idx(in_meta, in_tidx)] * t_weight[w_idx];
  }

  acc += T(t_bias[c]);

  TensorIndex4D out_tidx;
  out_tidx.data = ivec4(out_l, c, n, 0);
  t_out[tensor4d_idx_to_linear_idx(out_meta, out_tidx)] = op(acc, T(out_min), T(out_max));
}
19 changes: 19 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/conv1d_dw.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Shader codegen config for the depthwise 1D convolution kernel.
conv1d_dw:
  parameter_names_with_default_values:
    OPERATOR: X
    DTYPE: float
    STORAGE: buffer
  # Emit one shader per listed DTYPE value.
  generate_variant_forall:
    DTYPE:
      - VALUE: half
      - VALUE: float
  shader_variants:
    - NAME: conv1d_dw
    # Variant with a fused output clamp: OPERATOR becomes clamp(X, A, B).
    - NAME: conv1d_dw_clamp
      OPERATOR: clamp(X, A, B)
Loading
Loading