diff --git a/backends/cadence/fusion_g3/operators/op_dequantize.cpp b/backends/cadence/fusion_g3/operators/op_dequantize.cpp index c940cdcf839..31a023c350e 100644 --- a/backends/cadence/fusion_g3/operators/op_dequantize.cpp +++ b/backends/cadence/fusion_g3/operators/op_dequantize.cpp @@ -47,7 +47,7 @@ namespace { /** * Asserts that the parameters are valid. */ -void check_dequantize_per_tensor_args( +[[maybe_unused]] void check_dequantize_per_tensor_args( const Tensor& input, int64_t quant_min, int64_t quant_max, @@ -86,6 +86,18 @@ void check_dequantize_per_tensor_args( quant_max); } +// PDX aligned vector load/store (PDX_LV*) require 16-byte (128-bit) aligned +// buffers on Fusion G3; an unaligned access raises LoadStoreAlignmentCause. The +// inline SIMD fast path below is only safe when both buffers are aligned; +// otherwise the caller falls back to the alignment-safe NNLib kernel. +[[maybe_unused]] constexpr uintptr_t kSimdAlignmentBytes = 16; +[[maybe_unused]] inline bool dequant_simd_aligned( + const void* in_ptr, + const void* out_ptr) { + return (reinterpret_cast(in_ptr) % kSimdAlignmentBytes == 0) && + (reinterpret_cast(out_ptr) % kSimdAlignmentBytes == 0); +} + } // namespace /* Local function which calls the kernels based on the input datatype */ @@ -295,10 +307,8 @@ Tensor& dequantize_impl( if (zero_point_data != nullptr) { \ zero_point = zero_point_data[current_ix]; \ } \ - out_data_ptr[current_ix] = \ - static_cast( \ - input_data_ptr[current_ix] - zero_point) * \ - _scale; \ + out_data_ptr[current_ix] = static_cast( \ + (input_data_ptr[current_ix] - zero_point) * _scale); \ } \ }, \ input, \ @@ -503,10 +513,8 @@ Tensor& dequantize_impl( if (zero_point_data != nullptr) { \ zero_point = zero_point_data[current_ix]; \ } \ - out_data_ptr[current_ix] = \ - static_cast( \ - input_data_ptr[current_ix] - zero_point) * \ - _scale; \ + out_data_ptr[current_ix] = static_cast( \ + (input_data_ptr[current_ix] - zero_point) * _scale); \ } \ }, \ input, \ @@ -579,26 +587,403 @@ Tensor& dequantize_per_tensor_out( ScalarType dtype, Tensor& out) { constexpr ScalarType out_dtype = ScalarType::Float; - #ifdef OP_ARG_CHECK torch::executor::Error err = resize_tensor(out, input.sizes()); ET_CHECK_MSG( err == torch::executor::Error::Ok, "Failed to resize out Tensor in dequantize_per_tensor_out"); - check_dequantize_per_tensor_args( input, quant_min, quant_max, dtype, out_dtype, out); #endif - float scale_data = (float)scale; int zero_point_data = (int)zero_point; - + // Fast path: bypass dequantize_impl dispatch overhead for the most common + // per-tensor cases (int8/uint8 -> float32). This avoids the inp_shape copy + // loop, is_asym_dequant check, optimized flag check, and the multi-branch + // type dispatch in dequantize_impl. + if (out.scalar_type() == ScalarType::Float) { + float* __restrict__ out_data = out.mutable_data_ptr(); + // For per-tensor dequantization (axis=NULL), flatten to 1D shape. + // This avoids the shape copy loop and lets the NNLIB kernel skip + // multi-dimensional stride calculations internally. + int inp_shape[1] = {static_cast(input.numel())}; + constexpr int num_inp_dims = 1; + if (input.scalar_type() == ScalarType::Char) { + const int8_t* __restrict__ input_data = input.const_data_ptr(); +#if defined(__XTENSA__) + // Direct inline PDX SIMD dequantization for per-tensor int8->float32. + // 4x-unrolled to improve ILP: the Fusion G3 scheduler can interleave + // independent loads, converts, and FP MACs across unrolled iterations, + // hiding load-to-use latency and reducing loop overhead. + if (dequant_simd_aligned(input_data, out_data)) { + const int numel = inp_shape[0]; + auto vIn = reinterpret_cast(input_data); + auto vOut = reinterpret_cast(out_data); + const xb_vecMxf32 v_scale{ + scale_data, scale_data, scale_data, scale_data}; + int i = 0; + if (zero_point_data != 0) { + const float zp_f = static_cast(zero_point_data); + const xb_vecMxf32 v_zp{zp_f, zp_f, zp_f, zp_f}; + // 4x unrolled main loop: 16 elements per iteration + const int e16 = (numel >> 4) << 4; + for (; i < e16; i += 16) { + xb_vecMx32 vP0 = PDX_LV32_MX8_I(vIn, 0); + xb_vecMx32 vP1 = PDX_LV32_MX8_I(vIn + 1, 0); + xb_vecMx32 vP2 = PDX_LV32_MX8_I(vIn + 2, 0); + xb_vecMx32 vP3 = PDX_LV32_MX8_I(vIn + 3, 0); + xb_vecMxf32 vF0 = (xb_vecMxf32)vP0; + xb_vecMxf32 vF1 = (xb_vecMxf32)vP1; + xb_vecMxf32 vF2 = (xb_vecMxf32)vP2; + xb_vecMxf32 vF3 = (xb_vecMxf32)vP3; + vOut[0] = PDX_MUL_MXF32(PDX_SUB_MXF32(vF0, v_zp), v_scale); + vOut[1] = PDX_MUL_MXF32(PDX_SUB_MXF32(vF1, v_zp), v_scale); + vOut[2] = PDX_MUL_MXF32(PDX_SUB_MXF32(vF2, v_zp), v_scale); + vOut[3] = PDX_MUL_MXF32(PDX_SUB_MXF32(vF3, v_zp), v_scale); + vIn += 4; + vOut += 4; + } + // Scalar residual for remaining 0-15 elements + for (; i < numel; ++i) { + out_data[i] = + (static_cast(input_data[i]) - zp_f) * scale_data; + } + } else { + // 4x unrolled main loop: 16 elements per iteration + const int e16 = (numel >> 4) << 4; + for (; i < e16; i += 16) { + xb_vecMx32 vP0 = PDX_LV32_MX8_I(vIn, 0); + xb_vecMx32 vP1 = PDX_LV32_MX8_I(vIn + 1, 0); + xb_vecMx32 vP2 = PDX_LV32_MX8_I(vIn + 2, 0); + xb_vecMx32 vP3 = PDX_LV32_MX8_I(vIn + 3, 0); + xb_vecMxf32 vF0 = (xb_vecMxf32)vP0; + xb_vecMxf32 vF1 = (xb_vecMxf32)vP1; + xb_vecMxf32 vF2 = (xb_vecMxf32)vP2; + xb_vecMxf32 vF3 = (xb_vecMxf32)vP3; + vOut[0] = PDX_MUL_MXF32(vF0, v_scale); + vOut[1] = PDX_MUL_MXF32(vF1, v_scale); + vOut[2] = PDX_MUL_MXF32(vF2, v_scale); + vOut[3] = PDX_MUL_MXF32(vF3, v_scale); + vIn += 4; + vOut += 4; + } + // Scalar residual for remaining 0-15 elements + for (; i < numel; ++i) { + out_data[i] = static_cast(input_data[i]) * scale_data; + } + } + return out; + } +#endif + if (zero_point_data != 0) { + XT_KERNEL_CHECK( + context, + out, + xa_nn_elm_dequantize_asym8_f32, + out_data, + input_data, + inp_shape, + num_inp_dims, + NULL, + &zero_point_data, + &scale_data); + } else { + XT_KERNEL_CHECK( + context, + out, + xa_nn_elm_dequantize_sym8_f32, + out_data, + input_data, + inp_shape, + num_inp_dims, + NULL, + &scale_data); + } + return out; + } else if (input.scalar_type() == ScalarType::Byte) { + const uint8_t* __restrict__ input_data = input.const_data_ptr(); +#if defined(__XTENSA__) + // Direct inline PDX SIMD dequantization for per-tensor uint8->float32. + // 4x-unrolled to improve ILP: the Fusion G3 scheduler can interleave + // independent loads, converts, and FP MACs across unrolled iterations, + // hiding load-to-use latency and reducing loop overhead. + if (dequant_simd_aligned(input_data, out_data)) { + const int numel = inp_shape[0]; + auto vIn = reinterpret_cast(input_data); + auto vOut = reinterpret_cast(out_data); + const xb_vecMxf32 v_scale{ + scale_data, scale_data, scale_data, scale_data}; + int i = 0; + if (zero_point_data != 0) { + const float zp_f = static_cast(zero_point_data); + const xb_vecMxf32 v_zp{zp_f, zp_f, zp_f, zp_f}; + // 4x unrolled main loop: 16 elements per iteration + const int e16 = (numel >> 4) << 4; + for (; i < e16; i += 16) { + xb_vecMxu32 vP0 = PDX_LVU32_MX8_I(vIn, 0); + xb_vecMxu32 vP1 = PDX_LVU32_MX8_I(vIn + 1, 0); + xb_vecMxu32 vP2 = PDX_LVU32_MX8_I(vIn + 2, 0); + xb_vecMxu32 vP3 = PDX_LVU32_MX8_I(vIn + 3, 0); + xb_vecMxf32 vF0 = (xb_vecMxf32)vP0; + xb_vecMxf32 vF1 = (xb_vecMxf32)vP1; + xb_vecMxf32 vF2 = (xb_vecMxf32)vP2; + xb_vecMxf32 vF3 = (xb_vecMxf32)vP3; + vOut[0] = PDX_MUL_MXF32(PDX_SUB_MXF32(vF0, v_zp), v_scale); + vOut[1] = PDX_MUL_MXF32(PDX_SUB_MXF32(vF1, v_zp), v_scale); + vOut[2] = PDX_MUL_MXF32(PDX_SUB_MXF32(vF2, v_zp), v_scale); + vOut[3] = PDX_MUL_MXF32(PDX_SUB_MXF32(vF3, v_zp), v_scale); + vIn += 4; + vOut += 4; + } + // Scalar residual for remaining 0-15 elements + for (; i < numel; ++i) { + out_data[i] = + (static_cast(input_data[i]) - zp_f) * scale_data; + } + } else { + // 4x unrolled main loop: 16 elements per iteration + const int e16 = (numel >> 4) << 4; + for (; i < e16; i += 16) { + xb_vecMxu32 vP0 = PDX_LVU32_MX8_I(vIn, 0); + xb_vecMxu32 vP1 = PDX_LVU32_MX8_I(vIn + 1, 0); + xb_vecMxu32 vP2 = PDX_LVU32_MX8_I(vIn + 2, 0); + xb_vecMxu32 vP3 = PDX_LVU32_MX8_I(vIn + 3, 0); + xb_vecMxf32 vF0 = (xb_vecMxf32)vP0; + xb_vecMxf32 vF1 = (xb_vecMxf32)vP1; + xb_vecMxf32 vF2 = (xb_vecMxf32)vP2; + xb_vecMxf32 vF3 = (xb_vecMxf32)vP3; + vOut[0] = PDX_MUL_MXF32(vF0, v_scale); + vOut[1] = PDX_MUL_MXF32(vF1, v_scale); + vOut[2] = PDX_MUL_MXF32(vF2, v_scale); + vOut[3] = PDX_MUL_MXF32(vF3, v_scale); + vIn += 4; + vOut += 4; + } + // Scalar residual for remaining 0-15 elements + for (; i < numel; ++i) { + out_data[i] = static_cast(input_data[i]) * scale_data; + } + } + return out; + } +#endif + if (zero_point_data != 0) { + XT_KERNEL_CHECK( + context, + out, + xa_nn_elm_dequantize_asym8u_f32, + out_data, + input_data, + inp_shape, + num_inp_dims, + NULL, + &zero_point_data, + &scale_data); + } else { + XT_KERNEL_CHECK( + context, + out, + xa_nn_elm_dequantize_sym8u_f32, + out_data, + input_data, + inp_shape, + num_inp_dims, + NULL, + &scale_data); + } + return out; + } else if (input.scalar_type() == ScalarType::Short) { + const int16_t* __restrict__ input_data = input.const_data_ptr(); +#if defined(__XTENSA__) + // Direct inline PDX SIMD dequantization for per-tensor int16->float32. + // 4x-unrolled to improve ILP: the Fusion G3 scheduler can interleave + // independent loads, converts, and FP MACs across unrolled iterations, + // hiding load-to-use latency and reducing loop overhead. + if (dequant_simd_aligned(input_data, out_data)) { + const int numel = inp_shape[0]; + auto vIn = reinterpret_cast(input_data); + auto vOut = reinterpret_cast(out_data); + const xb_vecMxf32 v_scale{ + scale_data, scale_data, scale_data, scale_data}; + int i = 0; + if (zero_point_data != 0) { + const float zp_f = static_cast(zero_point_data); + const xb_vecMxf32 v_zp{zp_f, zp_f, zp_f, zp_f}; + // 4x unrolled main loop: 16 elements per iteration + const int e16 = (numel >> 4) << 4; + for (; i < e16; i += 16) { + xb_vecMx32 vP0 = PDX_LV32_MX16_I(vIn, 0); + xb_vecMx32 vP1 = PDX_LV32_MX16_I(vIn + 1, 0); + xb_vecMx32 vP2 = PDX_LV32_MX16_I(vIn + 2, 0); + xb_vecMx32 vP3 = PDX_LV32_MX16_I(vIn + 3, 0); + xb_vecMxf32 vF0 = (xb_vecMxf32)vP0; + xb_vecMxf32 vF1 = (xb_vecMxf32)vP1; + xb_vecMxf32 vF2 = (xb_vecMxf32)vP2; + xb_vecMxf32 vF3 = (xb_vecMxf32)vP3; + vOut[0] = PDX_MUL_MXF32(PDX_SUB_MXF32(vF0, v_zp), v_scale); + vOut[1] = PDX_MUL_MXF32(PDX_SUB_MXF32(vF1, v_zp), v_scale); + vOut[2] = PDX_MUL_MXF32(PDX_SUB_MXF32(vF2, v_zp), v_scale); + vOut[3] = PDX_MUL_MXF32(PDX_SUB_MXF32(vF3, v_zp), v_scale); + vIn += 4; + vOut += 4; + } + // Scalar residual for remaining 0-15 elements + for (; i < numel; ++i) { + out_data[i] = + (static_cast(input_data[i]) - zp_f) * scale_data; + } + } else { + // 4x unrolled main loop: 16 elements per iteration + const int e16 = (numel >> 4) << 4; + for (; i < e16; i += 16) { + xb_vecMx32 vP0 = PDX_LV32_MX16_I(vIn, 0); + xb_vecMx32 vP1 = PDX_LV32_MX16_I(vIn + 1, 0); + xb_vecMx32 vP2 = PDX_LV32_MX16_I(vIn + 2, 0); + xb_vecMx32 vP3 = PDX_LV32_MX16_I(vIn + 3, 0); + xb_vecMxf32 vF0 = (xb_vecMxf32)vP0; + xb_vecMxf32 vF1 = (xb_vecMxf32)vP1; + xb_vecMxf32 vF2 = (xb_vecMxf32)vP2; + xb_vecMxf32 vF3 = (xb_vecMxf32)vP3; + vOut[0] = PDX_MUL_MXF32(vF0, v_scale); + vOut[1] = PDX_MUL_MXF32(vF1, v_scale); + vOut[2] = PDX_MUL_MXF32(vF2, v_scale); + vOut[3] = PDX_MUL_MXF32(vF3, v_scale); + vIn += 4; + vOut += 4; + } + // Scalar residual for remaining 0-15 elements + for (; i < numel; ++i) { + out_data[i] = static_cast(input_data[i]) * scale_data; + } + } + return out; + } +#endif + if (zero_point_data != 0) { + XT_KERNEL_CHECK( + context, + out, + xa_nn_elm_dequantize_asym16_f32, + out_data, + input_data, + inp_shape, + num_inp_dims, + NULL, + &zero_point_data, + &scale_data); + } else { + XT_KERNEL_CHECK( + context, + out, + xa_nn_elm_dequantize_sym16_f32, + out_data, + input_data, + inp_shape, + num_inp_dims, + NULL, + &scale_data); + } + return out; + } else if (input.scalar_type() == ScalarType::UInt16) { + const uint16_t* __restrict__ input_data = + input.const_data_ptr(); +#if defined(__XTENSA__) + // Direct inline PDX SIMD dequantization for per-tensor uint16->float32. + // 4x-unrolled to improve ILP: the Fusion G3 scheduler can interleave + // independent loads, converts, and FP MACs across unrolled iterations, + // hiding load-to-use latency and reducing loop overhead. + if (dequant_simd_aligned(input_data, out_data)) { + const int numel = inp_shape[0]; + auto vIn = reinterpret_cast(input_data); + auto vOut = reinterpret_cast(out_data); + const xb_vecMxf32 v_scale{ + scale_data, scale_data, scale_data, scale_data}; + int i = 0; + if (zero_point_data != 0) { + const float zp_f = static_cast(zero_point_data); + const xb_vecMxf32 v_zp{zp_f, zp_f, zp_f, zp_f}; + // 4x unrolled main loop: 16 elements per iteration + const int e16 = (numel >> 4) << 4; + for (; i < e16; i += 16) { + xb_vecMxu32 vP0 = PDX_LVU32_MX16_I(vIn, 0); + xb_vecMxu32 vP1 = PDX_LVU32_MX16_I(vIn + 1, 0); + xb_vecMxu32 vP2 = PDX_LVU32_MX16_I(vIn + 2, 0); + xb_vecMxu32 vP3 = PDX_LVU32_MX16_I(vIn + 3, 0); + xb_vecMxf32 vF0 = (xb_vecMxf32)vP0; + xb_vecMxf32 vF1 = (xb_vecMxf32)vP1; + xb_vecMxf32 vF2 = (xb_vecMxf32)vP2; + xb_vecMxf32 vF3 = (xb_vecMxf32)vP3; + vOut[0] = PDX_MUL_MXF32(PDX_SUB_MXF32(vF0, v_zp), v_scale); + vOut[1] = PDX_MUL_MXF32(PDX_SUB_MXF32(vF1, v_zp), v_scale); + vOut[2] = PDX_MUL_MXF32(PDX_SUB_MXF32(vF2, v_zp), v_scale); + vOut[3] = PDX_MUL_MXF32(PDX_SUB_MXF32(vF3, v_zp), v_scale); + vIn += 4; + vOut += 4; + } + // Scalar residual for remaining 0-15 elements + for (; i < numel; ++i) { + out_data[i] = + (static_cast(input_data[i]) - zp_f) * scale_data; + } + } else { + // 4x unrolled main loop: 16 elements per iteration + const int e16 = (numel >> 4) << 4; + for (; i < e16; i += 16) { + xb_vecMxu32 vP0 = PDX_LVU32_MX16_I(vIn, 0); + xb_vecMxu32 vP1 = PDX_LVU32_MX16_I(vIn + 1, 0); + xb_vecMxu32 vP2 = PDX_LVU32_MX16_I(vIn + 2, 0); + xb_vecMxu32 vP3 = PDX_LVU32_MX16_I(vIn + 3, 0); + xb_vecMxf32 vF0 = (xb_vecMxf32)vP0; + xb_vecMxf32 vF1 = (xb_vecMxf32)vP1; + xb_vecMxf32 vF2 = (xb_vecMxf32)vP2; + xb_vecMxf32 vF3 = (xb_vecMxf32)vP3; + vOut[0] = PDX_MUL_MXF32(vF0, v_scale); + vOut[1] = PDX_MUL_MXF32(vF1, v_scale); + vOut[2] = PDX_MUL_MXF32(vF2, v_scale); + vOut[3] = PDX_MUL_MXF32(vF3, v_scale); + vIn += 4; + vOut += 4; + } + // Scalar residual for remaining 0-15 elements + for (; i < numel; ++i) { + out_data[i] = static_cast(input_data[i]) * scale_data; + } + } + return out; + } +#endif + if (zero_point_data != 0) { + XT_KERNEL_CHECK( + context, + out, + xa_nn_elm_dequantize_asym16u_f32, + out_data, + input_data, + inp_shape, + num_inp_dims, + NULL, + &zero_point_data, + &scale_data); + } else { + XT_KERNEL_CHECK( + context, + out, + xa_nn_elm_dequantize_sym16u_f32, + out_data, + input_data, + inp_shape, + num_inp_dims, + NULL, + &scale_data); + } + return out; + } + // Fall through to generic path for other input types + } dequantize_impl( context, out, input, &scale_data, &zero_point_data, NULL, out_dtype); - return out; } - Tensor& dequantize_per_tensor_tensor_args_out( KernelRuntimeContext& context, const Tensor& input,