From e7160e721e203a178944151f789581c5af93ff0d Mon Sep 17 00:00:00 2001 From: ssjia Date: Wed, 24 Jun 2026 11:03:49 -0700 Subject: [PATCH] Update [ghstack-poisoned] --- .../graph/ops/glsl/choose_qparams_per_row.glsl | 4 ++-- .../graph/ops/glsl/linear_dq8ca_q4gsw_tiled.glsl | 2 +- .../ops/glsl/linear_int8_input_scales_zps_load.glslh | 3 ++- .../runtime/graph/ops/glsl/linear_q4gsw_coop.glsl | 2 +- .../glsl/quantize_and_pack_4h4w_with_group_sums.glsl | 2 +- .../vulkan/runtime/graph/ops/impl/ChooseQParams.cpp | 7 +++++++ .../test/custom_ops/test_choose_qparams_per_row.cpp | 12 ++++++------ .../vulkan/test/custom_ops/test_q4gsw_linear.cpp | 8 ++++---- 8 files changed, 24 insertions(+), 16 deletions(-) diff --git a/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_per_row.glsl b/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_per_row.glsl index 0b2cd7fef5a..e34cb2bd4ef 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_per_row.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_per_row.glsl @@ -30,7 +30,7 @@ layout(std430) buffer; #include "common.glslh" ${layout_declare_tensor(B, "w", "t_scales", DTYPE, "texture3d")} -${layout_declare_tensor(B, "w", "t_zps", "int8", "texture3d")} +${layout_declare_tensor(B, "w", "t_zps", "float", "texture3d")} ${layout_declare_tensor(B, "r", "t_input", DTYPE, STORAGE, is_scalar_array=False)} ${layout_declare_ubo(B, "ivec4", "input_sizes")} @@ -196,7 +196,7 @@ void main() { if (worker_id == 0) { imageStore(t_scales, ivec3(output_y4, 0, 0), scales_out); - imageStore(t_zps, ivec3(output_y4, 0, 0), zps_out); + imageStore(t_zps, ivec3(output_y4, 0, 0), vec4(zps_out)); } } diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_dq8ca_q4gsw_tiled.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_dq8ca_q4gsw_tiled.glsl index fa0129b65a5..e98e67776cf 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_dq8ca_q4gsw_tiled.glsl +++ 
b/backends/vulkan/runtime/graph/ops/glsl/linear_dq8ca_q4gsw_tiled.glsl @@ -46,7 +46,7 @@ ${layout_declare_tensor(B, "r", "t_input", DTYPE, IO_STORAGE, is_scalar_array=Fa ${layout_declare_tensor(B, "r", "t_packed_int8_input", "int", PACKED_INT8_INPUT_STORAGE, is_scalar_array=False)} ${layout_declare_tensor(B, "r", "t_int8_input_sums", "int", "buffer", is_scalar_array=False)} ${layout_declare_tensor(B, "r", "t_int8_input_scales", DTYPE, "texture3d")} -${layout_declare_tensor(B, "r", "t_int8_input_zps", "int8", "texture3d")} +${layout_declare_tensor(B, "r", "t_int8_input_zps", "float", "texture3d")} ${layout_declare_tensor(B, "r", "t_packed_int4_weight", "int", WEIGHT_STORAGE, is_scalar_array=False)} ${layout_declare_tensor(B, "r", "t_weight_sums", "int", "buffer", is_scalar_array=False)} ${layout_declare_tensor(B, "r", "t_weight_scales", DTYPE, "buffer", is_scalar_array=False)} diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_int8_input_scales_zps_load.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_int8_input_scales_zps_load.glslh index e1a570622c2..9b178d5c6c0 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_int8_input_scales_zps_load.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/linear_int8_input_scales_zps_load.glslh @@ -20,7 +20,8 @@ void load_int8_input_scales_and_zps( [[unroll]] for (int m4 = 0; m4 < TILE_M4; m4++) { scales.data[m4] = VEC4_T(texelFetch(t_int8_input_scales, ivec3(m4_start + m4, 0, 0), 0)); - zps.data[m4] = texelFetch(t_int8_input_zps, ivec3(m4_start + m4, 0, 0), 0); + zps.data[m4] = + ivec4(texelFetch(t_int8_input_zps, ivec3(m4_start + m4, 0, 0), 0)); } } diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_coop.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_coop.glsl index 053f27d6c9b..02bc4c28615 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_coop.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_coop.glsl @@ -40,7 +40,7 @@ $if DYNAMIC_QUANT_VARIANT: 
${layout_declare_tensor(B, "r", "t_packed_int8_input", "int", PACKED_INPUT_STORAGE, is_scalar_array=False)} ${layout_declare_tensor(B, "r", "t_int_input_sums", "int", "buffer", is_scalar_array=False)} ${layout_declare_tensor(B, "r", "t_input_scale", DTYPE, "texture3d")} - ${layout_declare_tensor(B, "r", "t_input_zp", "int", "texture3d")} + ${layout_declare_tensor(B, "r", "t_input_zp", "float", "texture3d")} ${layout_declare_tensor(B, "r", "t_packed_int4_weight", "int", WEIGHT_STORAGE, is_scalar_array=False)} ${layout_declare_tensor(B, "r", "t_weight_sums", "int", "buffer", is_scalar_array=False)} ${layout_declare_tensor(B, "r", "t_weight_scales", DTYPE, "buffer", is_scalar_array=False)} diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_4h4w_with_group_sums.glsl b/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_4h4w_with_group_sums.glsl index e4d211a95f5..388c334b9d9 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_4h4w_with_group_sums.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_4h4w_with_group_sums.glsl @@ -33,7 +33,7 @@ ${layout_declare_tensor(B, "w", "t_packed_int8_input", "int", OUTPUT_STORAGE, is ${layout_declare_tensor(B, "w", "t_int8_input_sums", "int", "buffer", is_scalar_array=False)} ${layout_declare_tensor(B, "r", "t_input", DTYPE, INPUT_STORAGE, is_scalar_array=False)} ${layout_declare_tensor(B, "r", "t_int8_input_scales", DTYPE, "texture3d")} -${layout_declare_tensor(B, "r", "t_int8_input_zps", "int8", "texture3d")} +${layout_declare_tensor(B, "r", "t_int8_input_zps", "float", "texture3d")} ${layout_declare_ubo(B, "ivec4", "input_sizes")} diff --git a/backends/vulkan/runtime/graph/ops/impl/ChooseQParams.cpp b/backends/vulkan/runtime/graph/ops/impl/ChooseQParams.cpp index 5b8615e0a70..023830c8b9e 100644 --- a/backends/vulkan/runtime/graph/ops/impl/ChooseQParams.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/ChooseQParams.cpp @@ -78,6 +78,13 @@ utils::uvec3 
pick_choose_qparams_per_row_local_wg_size( return {workers_per_output, outputs_per_wg, 1u}; } +// The per-token zero-point tensor is fp32-typed (matching torchao's serialized +// asymmetric per-token zero_point_dtype=fp32), even though its values are +// integer-valued in [-128, 127]. The shaders read it as a float texel and +// convert to int for the integer dequant-correction. Declaring the shader +// binding fp32 to match the tensor's allocation avoids the +// float-image-read-through-an-integer-binding format mismatch that corrupted +// negative zero-points on Mali. void add_choose_qparams_per_row_node( ComputeGraph& graph, const ValueRef& input, diff --git a/backends/vulkan/test/custom_ops/test_choose_qparams_per_row.cpp b/backends/vulkan/test/custom_ops/test_choose_qparams_per_row.cpp index 69f7812e4ab..8841c69069e 100644 --- a/backends/vulkan/test/custom_ops/test_choose_qparams_per_row.cpp +++ b/backends/vulkan/test/custom_ops/test_choose_qparams_per_row.cpp @@ -44,7 +44,7 @@ TestCase create_test_case_from_config( config.channel_size < kRefDimSizeLimit); std::string prefix = is_perf ? 
"PERF" : "ACCU"; std::string in_dtype = dtype_short(input_dtype); - std::string out_dtype = "f32,i8"; // pair: (scale, zero_point) + std::string out_dtype = "f32,f32"; // pair: (scale, zero_point) std::string shape_str = "[" + std::to_string(config.num_channels) + "," + std::to_string(config.channel_size) + "]"; std::string storage_str = repr_str(storage_type, utils::kWidthPacked); @@ -81,10 +81,10 @@ TestCase create_test_case_from_config( utils::kWidthPacked, DataGenType::ZEROS); - // Output zero_point tensor (int8) - [num_channels] + // Output zero_point tensor (float) - [num_channels] ValueSpec zero_point_out( {config.num_channels}, - vkapi::kChar, // int8 for quantized zero point + vkapi::kFloat, utils::kTexture3D, // Always buffer as per requirement utils::kWidthPacked, DataGenType::ZEROS); @@ -289,7 +289,7 @@ void choose_qparams_per_channel_reference_impl(TestCase& test_case) { // Prepare output data auto& scale_ref_data = scale_out_spec.get_ref_float_data(); - auto& zero_point_ref_data = zero_point_out_spec.get_ref_int8_data(); + auto& zero_point_ref_data = zero_point_out_spec.get_ref_float_data(); scale_ref_data.resize(num_channels); zero_point_ref_data.resize(num_channels); @@ -312,9 +312,9 @@ void choose_qparams_per_channel_reference_impl(TestCase& test_case) { calculate_scale_and_zero_point_reference( min_val, max_val, quant_min, quant_max, scale, zero_point); - // Store results (cast zero_point to int8) + // Store results scale_ref_data[channel] = scale; - zero_point_ref_data[channel] = static_cast<int8_t>(zero_point); + zero_point_ref_data[channel] = static_cast<float>(zero_point); } } diff --git a/backends/vulkan/test/custom_ops/test_q4gsw_linear.cpp b/backends/vulkan/test/custom_ops/test_q4gsw_linear.cpp index 7a10c9fe22a..c51bd6b9d3d 100644 --- a/backends/vulkan/test/custom_ops/test_q4gsw_linear.cpp +++ b/backends/vulkan/test/custom_ops/test_q4gsw_linear.cpp @@ -97,7 +97,7 @@ TestCase create_test_case_from_config( ValueSpec input_zero_point( {1, config.M}, // 
Per-input channel tensor - vkapi::kChar, + vkapi::kFloat, storage_type, utils::kWidthPacked, DataGenType::RANDINT); @@ -428,7 +428,7 @@ void linear_dq8ca_q4gsw_reference_impl(TestCase& test_case) { auto& input_scale_data = input_scale_spec.get_float_data(); // Per-input channel tensor auto& input_zero_point_data = - input_zeros_spec.get_int8_data(); // Per-input channel tensor + input_zeros_spec.get_float_data(); // Per-input channel tensor auto& weight_data = weight_spec.get_uint8_data(); auto& weight_sums_data = weight_sums_spec.get_int32_data(); @@ -462,8 +462,8 @@ void linear_dq8ca_q4gsw_reference_impl(TestCase& test_case) { // Use per-input channel scale and zero point - index by batch dimension float input_scale = input_scale_data[b]; // {1, M} -> index by batch - int8_t input_zero_point = - input_zero_point_data[b]; // {1, M} -> index by batch + int8_t input_zero_point = static_cast<int8_t>( + input_zero_point_data[b]); // {1, M} -> index by batch float quant_input_f = std::round(input_data[input_idx] / input_scale) + input_zero_point;