Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 29 additions & 1 deletion backends/vulkan/op_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -802,9 +802,36 @@ def check_conv_node(node: torch.fx.Node) -> bool:

return True

def pick_conv_storage(
    node: torch.fx.Node,
) -> Tuple[List[utils.TensorRepSet], utils.TensorRepSet]:
    """Select the storage types for a convolution node's inputs and output.

    Returns a (per-arg input storage list, output storage) pair. Only the
    first arg (the image tensor) is assigned a real storage type; every
    remaining arg (weight, bias, non-tensor params) is marked NO_STORAGE.
    """

    def _io_storage(
        rep: utils.TensorRepSet,
    ) -> Tuple[List[utils.TensorRepSet], utils.TensorRepSet]:
        # First input uses `rep`; the rest of the args carry no storage.
        return [rep] + [utils.NO_STORAGE] * (len(node.args) - 1), rep

    input_sizes = node.args[0].meta["val"].size()  # type: ignore[union-attr]
    if len(input_sizes) != 3:
        # 2D convolution: channels-packed texture path
        return _io_storage(utils.CHANNELS_PACKED_TEXTURE)

    # 1D convolution: choose between the texture and buffer implementations.
    weight_sizes = node.args[1].meta["val"].size()  # type: ignore[union-attr]
    groups_arg = node.args[8]  # type: ignore[union-attr]
    num_groups = groups_arg if isinstance(groups_arg, int) else int(groups_arg)
    # Depthwise: one filter per group, weight shaped [C, 1, K].
    depthwise = weight_sizes[1] == 1 and weight_sizes[0] == num_groups
    pointwise = weight_sizes[2] == 1
    if pointwise or depthwise:
        # Pointwise and depthwise 1D conv both have texture implementations
        # using width-packed TEXTURE_3D.
        return _io_storage(utils.WIDTH_PACKED_TEXTURE)
    # General (non-pointwise, non-depthwise) 1D convolution: buffer path
    return _io_storage(utils.CONTIGUOUS_BUFFER)

return OpFeatures(
inputs_storage=[
utils.CHANNELS_PACKED_TEXTURE, # input
utils.CHANNELS_PACKED_TEXTURE, # input (overridden by pick_conv_storage)
utils.NO_STORAGE, # weight (prepacked)
utils.NO_STORAGE, # bias (prepacked)
utils.NO_STORAGE, # stride (non tensor)
Expand All @@ -820,6 +847,7 @@ def check_conv_node(node: torch.fx.Node) -> bool:
supports_resize=True,
supports_prepacking=True,
are_node_inputs_supported_fn=check_conv_node,
pick_io_storage_fn=pick_conv_storage,
)


Expand Down
135 changes: 48 additions & 87 deletions backends/vulkan/runtime/graph/ops/glsl/conv1d.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -8,114 +8,75 @@

#version 450 core

${define_required_extensions("buffer", DTYPE)}

#define PRECISION ${PRECISION}

#define VEC4_T ${texel_type(DTYPE)}
#define T ${buffer_scalar_type(DTYPE)}

#define op(X, A, B) ${OPERATOR}

layout(std430) buffer;
${define_active_storage_type(STORAGE)}

${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}
${layout_declare_tensor(B, "r", "kernel_in", DTYPE, STORAGE)}
${layout_declare_tensor(B, "r", "bias_in", DTYPE, STORAGE)}
layout(std430) buffer;

${layout_declare_ubo(B, "ivec3", "out_limits")}
${layout_declare_ubo(B, "ivec4", "in_sizes")}
#include "indexing.glslh"

${layout_declare_ubo(B,"int", "kernel_size", "int", "stride", "int", "padding", "int", "dilation", "int", "in_group_size", "int", "out_group_size")}
${layout_declare_tensor(B, "w", "t_out", DTYPE, "buffer")}
${layout_declare_tensor(B, "r", "t_in", DTYPE, "buffer")}
${layout_declare_tensor(B, "r", "t_weight", DTYPE, "buffer")}
${layout_declare_tensor(B, "r", "t_bias", DTYPE, "buffer")}

${layout_declare_ubo(B, "BufferMetadata", "out_meta")}
${layout_declare_ubo(B, "BufferMetadata", "in_meta")}
${layout_declare_ubo(B, "ivec4", "weight_strides")}
${layout_declare_ubo(B, "int", "kernel_size", "int", "stride", "int", "padding", "int", "dilation", "int", "in_group_size", "int", "out_group_size")}
${layout_declare_ubo(B, "float", "out_min", "float", "out_max")}

#include "indexing_utils.h"

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")}
const lowp ivec4 out_axis_map = unhash_axis_map(out_layout);

${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")}
const lowp ivec4 in_axis_map = unhash_axis_map(in_layout);

${layout_declare_spec_const(C, "int", "kernel_layout", "DEFAULT_LAYOUT")}
const lowp ivec4 kernel_axis_map = unhash_axis_map(kernel_layout);

${layout_declare_spec_const(C, "int", "bias_layout", "DEFAULT_LAYOUT")}
const lowp ivec4 bias_axis_map = unhash_axis_map(bias_layout);

// Let us define
//
// input = (N, in_C, in_L),
// output = (N, out_C, out_L),
// groups = G,
// kernel = K,
//
// which results in shapes
//
// weight = (out_C, in_C / G, K),
// bias = (out_C,).
//
// This implementation performs N x out_C x out_L shader invocations, where each invocation
// calculates the rolling kernel of the length dimension for each batch, i.e.,
// computes out_L results.
/*
* Computes a 1D convolution over width-packed buffer tensors. Each shader
* invocation computes one output element at position (n, out_c, out_l).
*
* Tensor sizes/strides are in WHCN order:
* out_meta sizes: W=L_out, H=C_out, C=N
* in_meta sizes: W=L_in, H=C_in
*/
void main() {
const ivec3 lpos = ivec3(gl_GlobalInvocationID);
const int out_l = int(gl_GlobalInvocationID.x);
const int out_c = int(gl_GlobalInvocationID.y);
const int n = int(gl_GlobalInvocationID.z);

if (any(greaterThanEqual(lpos, out_limits))) {
if (out_l >= int(size_at(out_meta, 0)) ||
out_c >= int(size_at(out_meta, 1)) ||
n >= int(size_at(out_meta, 2))) {
return;
}

// "out_c" is the output's channel index where we write our result.
// Across shader invocations, this is the only value that varies.
const int out_c = lpos.y;

// "in_c" tracks the input's channel start index.
// We iterate over the input group that corresponds to the output group.
const int c_start = (out_c / out_group_size) * in_group_size;
const int c_end = c_start + in_group_size;

// "out_l" tracks the output's length index where we write our result.
const int out_l = lpos.x;

// "N" is the batch index
const int N = lpos.z;

// "in_l" tracks the input's length start index for our input-kernel overlay
// region.
const int in_l = out_l * stride - padding;
VEC4_T sum = VEC4_T(0);

const int out_c_packed_index = out_c >> 2;
const int out_c_packed_lane = out_c & 0x3;

for (int in_c = c_start; in_c < c_end; ++in_c) {
// "k" tracks the kernel's index for our input-kernel computation.
// It reads out-of-bound zeros, but trying to avoid them complicates
// for-loop conditions, which results in worse performance.

// The weight tensor is channel-packed. It may not be trival choice for
// performance reason since need to have more data fetch. The reason is
// for some sequence model, we found that the weight tensor
// (out_channel, in_channel / group, kernel) often has a large
// out_channel >> kernel, leading to non-optimal use of memory as the
// weight tensor gets very deep. As a mitigation, we use channel-packing
// for the weight tensor, yielding a 75% reduction in weight-tensor
// memory.

// It is possible to further reduce the memory footprint by swapping the
// dimensions, using x extent for out_channel, and y for kernel.
for (int k = 0; k < kernel_size; k++) {
const ivec3 w_lpos = ivec3(k, in_c % in_group_size, out_c_packed_index);
const VEC4_T weight_texel = load_texel_lpos(kernel_in, w_lpos, kernel_axis_map);
VEC4_T weight = VEC4_T(weight_texel[out_c_packed_lane]);

const ivec3 in_pos = lpos_to_pos(ivec3(in_l + k * dilation, in_c, N), in_axis_map);
sum = fma(weight, load_texel(t_in, in_pos), sum);
T sum = T(0);
for (int ic = 0; ic < in_group_size; ic++) {
const int in_c = c_start + ic;
for (int k = 0; k < kernel_size; k++) {
const int in_l = out_l * stride - padding + k * dilation;
if (in_l >= 0 && in_l < int(size_at(in_meta, 0))) {
TensorIndex4D in_tidx;
in_tidx.data = ivec4(in_l, in_c, n, 0);
const uint in_idx = tensor4d_idx_to_linear_idx(in_meta, in_tidx);
// Weight tidx (k, ic, out_c) in [C_out, C_in/g, K]: (k, ic, out_c, 0)
const int w_idx = k * weight_strides.x + ic * weight_strides.y +
out_c * weight_strides.z;
sum += t_in[in_idx] * t_weight[w_idx];
}
}
}

const VEC4_T bias = load_texel_lpos(bias_in, ivec3(out_c_packed_index, 0, 0), bias_axis_map);
const ivec3 out_lpos = ivec3(out_l, out_c, N);
write_texel_lpos(t_out, out_lpos, op(sum + bias[out_c_packed_lane], out_min, out_max), out_axis_map);
sum += T(t_bias[out_c]);

TensorIndex4D out_tidx;
out_tidx.data = ivec4(out_l, out_c, n, 0);
const uint out_idx = tensor4d_idx_to_linear_idx(out_meta, out_tidx);
t_out[out_idx] = op(sum, T(out_min), T(out_max));
}
2 changes: 1 addition & 1 deletion backends/vulkan/runtime/graph/ops/glsl/conv1d.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ conv1d:
parameter_names_with_default_values:
OPERATOR: X
DTYPE: float
STORAGE: texture3d
STORAGE: buffer
generate_variant_forall:
DTYPE:
- VALUE: half
Expand Down
75 changes: 75 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/conv1d_dw.glsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#version 450 core

${define_required_extensions("buffer", DTYPE)}

#define PRECISION ${PRECISION}

#define T ${buffer_scalar_type(DTYPE)}

#define op(X, A, B) ${OPERATOR}

${define_active_storage_type(STORAGE)}

layout(std430) buffer;

#include "indexing.glslh"

${layout_declare_tensor(B, "w", "t_out", DTYPE, "buffer")}
${layout_declare_tensor(B, "r", "t_in", DTYPE, "buffer")}
${layout_declare_tensor(B, "r", "t_weight", DTYPE, "buffer")}
${layout_declare_tensor(B, "r", "t_bias", DTYPE, "buffer")}

${layout_declare_ubo(B, "BufferMetadata", "out_meta")}
${layout_declare_ubo(B, "BufferMetadata", "in_meta")}
${layout_declare_ubo(B, "ivec4", "weight_strides")}
${layout_declare_ubo(B, "int", "kernel_size", "int", "stride", "int", "padding", "int", "dilation", "int", "in_group_size", "int", "out_group_size")}
${layout_declare_ubo(B, "float", "out_min", "float", "out_max")}

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

/*
 * Depthwise 1D convolution over width-packed buffer tensors. One invocation
 * produces the single output element at (batch n, channel c, length out_l).
 *
 * Depthwise means groups == C_in == C_out, so output channel c reads only
 * input channel c. The weight tensor is laid out as [C, 1, K].
 */
void main() {
  const ivec3 tid = ivec3(gl_GlobalInvocationID);
  const int out_l = tid.x;
  const int c = tid.y;
  const int n = tid.z;

  // Discard out-of-range invocations (the dispatch may be rounded up).
  if (out_l >= int(size_at(out_meta, 0)) ||
      c >= int(size_at(out_meta, 1)) ||
      n >= int(size_at(out_meta, 2))) {
    return;
  }

  const int in_len = int(size_at(in_meta, 0));
  // Leftmost input position touched by this output element.
  const int l_base = out_l * stride - padding;

  T acc = T(0);
  for (int k = 0; k < kernel_size; ++k) {
    const int in_l = l_base + k * dilation;
    // Positions outside the input contribute zero (implicit padding).
    if (in_l < 0 || in_l >= in_len) {
      continue;
    }
    TensorIndex4D in_tidx;
    in_tidx.data = ivec4(in_l, c, n, 0);
    // Weight element (k, 0, c) of the [C, 1, K] tensor.
    const int w_idx = k * weight_strides.x + c * weight_strides.z;
    acc += t_in[tensor4d_idx_to_linear_idx(in_meta, in_tidx)] * t_weight[w_idx];
  }

  acc += T(t_bias[c]);

  TensorIndex4D out_tidx;
  out_tidx.data = ivec4(out_l, c, n, 0);
  t_out[tensor4d_idx_to_linear_idx(out_meta, out_tidx)] = op(acc, T(out_min), T(out_max));
}
19 changes: 19 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/conv1d_dw.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Shader codegen config for the depthwise 1D convolution kernel.
conv1d_dw:
  parameter_names_with_default_values:
    OPERATOR: X
    DTYPE: float
    STORAGE: buffer
  # Emit one shader per listed DTYPE value.
  generate_variant_forall:
    DTYPE:
      - VALUE: half
      - VALUE: float
  shader_variants:
    - NAME: conv1d_dw
    # Variant with a fused output clamp: OPERATOR becomes clamp(X, A, B).
    - NAME: conv1d_dw_clamp
      OPERATOR: clamp(X, A, B)
Loading
Loading