diff --git a/backends/cortex_m/ops/cmsis_scratch_buffer_context.h b/backends/cortex_m/ops/cmsis_scratch_buffer_context.h index 4b9fdaebdf7..4672f05e777 100644 --- a/backends/cortex_m/ops/cmsis_scratch_buffer_context.h +++ b/backends/cortex_m/ops/cmsis_scratch_buffer_context.h @@ -7,10 +7,8 @@ */ #pragma once -#include "cortex_m_ops_common.h" -extern "C" { #include "arm_nnfunctions.h" -} +#include "cortex_m_ops_common.h" namespace cortex_m { namespace native { diff --git a/backends/cortex_m/ops/cortex_m_ops_common.h b/backends/cortex_m/ops/cortex_m_ops_common.h index 1b31367881f..93802873a23 100644 --- a/backends/cortex_m/ops/cortex_m_ops_common.h +++ b/backends/cortex_m/ops/cortex_m_ops_common.h @@ -16,12 +16,12 @@ #include #include +#include +#include #include -#include -extern "C" { #include "arm_nn_types.h" -} +#include "arm_nnfunctions.h" using Tensor = torch::executor::Tensor; using ScalarType = executorch::aten::ScalarType; @@ -42,24 +42,23 @@ inline void validate_cmsis_nn_tensor_requirements( const Tensor& input2, Tensor& output, ScalarType expected_dtype = ScalarType::Char, - bool require_channels_last = false, bool require_same_sizes = true) { // Basic dtype validation ET_CHECK_MSG( input1.scalar_type() == expected_dtype, - "Input1 dtype must be %hhd, got %hhd", - expected_dtype, - input1.scalar_type()); + "Input1 dtype must be %d, got %d", + static_cast(expected_dtype), + static_cast(input1.scalar_type())); ET_CHECK_MSG( input2.scalar_type() == expected_dtype, - "Input2 dtype must be %hhd, got %hhd", - expected_dtype, - input2.scalar_type()); + "Input2 dtype must be %d, got %d", + static_cast(expected_dtype), + static_cast(input2.scalar_type())); ET_CHECK_MSG( output.scalar_type() == expected_dtype, - "Output dtype must be %hhd, got %hhd", - expected_dtype, - output.scalar_type()); + "Output dtype must be %d, got %d", + static_cast(expected_dtype), + static_cast(output.scalar_type())); if (require_same_sizes) { ET_CHECK_MSG( input1.sizes() == input2.sizes(), @@ -74,20 +73,19 @@ inline void validate_cmsis_nn_tensor_requirements( } inline void validate_single_quant_params( - const int64_t zero_point, const int64_t multiplier, const int64_t shift, const char* param_name) { ET_CHECK_MSG( multiplier >= std::numeric_limits::min() && multiplier <= std::numeric_limits::max(), - "%s multiplier must be in int32 range [Value: %d]", + "%s multiplier must be in int32 range [Value: %" PRIi64 "]", param_name, multiplier); ET_CHECK_MSG( shift >= -31 && shift <= 31, - "%s shift must be in range [-31, 31] [Value: %d]", + "%s shift must be in range [-31, 31] [Value: %" PRIi64 "]", param_name, shift); } @@ -95,34 +93,24 @@ inline void validate_single_quant_params( /** * Validate quantization parameters for inputs and output. * - * Checks that zero points fit in int8 range, multipliers fit in int32 range, - * and shifts are within a valid bit-shift range (0-31). + * Checks that multipliers fit in int32 range and shifts are within a valid + * bit-shift range (-31 to 31). * - * Ensures parameters comply with Ahead-Of-Time (AOT) quantization requirements - * and CMSIS-NN kernel expectations. + * Ensures parameters comply with Ahead-Of-Time (AOT) quantization requirements. * - * Raises errors via ET_KERNEL_CHECK if any check fails. + * Raises errors via ET_CHECK_MSG if any check fails. */ inline void validate_quantization_params( - const int64_t zero_point1, const int64_t multiplier1, const int64_t shift1, - const int64_t zero_point2, const int64_t multiplier2, const int64_t shift2, - const int64_t output_zero_point, const int64_t output_multiplier, - const int64_t output_shift, - Tensor& output) { - validate_single_quant_params( - zero_point1, multiplier1, shift1, "Single quant Input1"); + const int64_t output_shift) { + validate_single_quant_params(multiplier1, shift1, "Single quant Input1"); + validate_single_quant_params(multiplier2, shift2, "Single quant Input2"); validate_single_quant_params( - zero_point2, multiplier2, shift2, "Single quant Input2"); - validate_single_quant_params( - output_zero_point, - output_multiplier, - output_shift, - "Single quant Output"); + output_multiplier, output_shift, "Single quant Output"); } inline bool is_channels_last_tensor(const Tensor& tensor) { @@ -135,11 +123,10 @@ inline bool is_channels_last_tensor(const Tensor& tensor) { return true; } - constexpr executorch::aten::DimOrderType kChannelsLastDimOrder[] = { - 0, 2, 3, 1}; + constexpr std::array + kChannelsLastDimOrder = {0, 2, 3, 1}; executorch::aten::ArrayRef - channels_last_order(kChannelsLastDimOrder, 4); - + channels_last_order(kChannelsLastDimOrder); return tensor.dim_order() == channels_last_order; } @@ -172,7 +159,7 @@ inline bool check_int32_within_range( value > std::numeric_limits::max()) { ET_LOG( Error, - "%s: %s value (%ld) exceeds int32_t range", + "%s: %s value (%" PRIi64 ") exceeds int32_t range", op_name, value_name, value); @@ -354,14 +341,14 @@ inline bool validate_per_channel_quant_params( if (multipliers[i] <= ARM_NN_Q31_MIN || multipliers[i] > ARM_NN_Q31_MAX) { ET_LOG( Error, - "weight_multiplier[%d] out of CMSIS-NN range: %d", + "weight_multiplier[%d] out of CMSIS-NN range: %" PRIi64, i, multipliers[i]); return false; } // Shift: {-31, 30} for arm_nn_requantize if (shifts[i] < -31 || shifts[i] > 30) { - ET_LOG(Error, "weight_shift[%d] out of range: %d", i, shifts[i]); + ET_LOG(Error, "weight_shift[%d] out of range: %" PRIi64, i, shifts[i]); return false; } } @@ -373,18 +360,19 @@ inline Error resize_to_broadcast_target_size( const Tensor& input2, Tensor& output) { static constexpr int kTensorDimensionLimit = 5; - Tensor::SizesType expected_output_size[kTensorDimensionLimit]; + std::array expected_output_size{}; size_t expected_output_dim = 0; auto err = torch::executor::get_broadcast_target_size( input1, input2, - expected_output_size, + expected_output_size.data(), kTensorDimensionLimit, &expected_output_dim); - if (err != Error::Ok) + if (err != Error::Ok) { return err; + } return executorch::runtime::resize_tensor( - output, {expected_output_size, expected_output_dim}); + output, {expected_output_size.data(), expected_output_dim}); } diff --git a/backends/cortex_m/ops/op_maximum.cpp b/backends/cortex_m/ops/op_maximum.cpp index 71a907f12ea..48fbc449e71 100644 --- a/backends/cortex_m/ops/op_maximum.cpp +++ b/backends/cortex_m/ops/op_maximum.cpp @@ -7,11 +7,6 @@ #include "cortex_m_ops_common.h" -// Include CMSIS-NN headers with C linkage -extern "C" { -#include "arm_nnfunctions.h" -} - namespace cortex_m { namespace native { @@ -27,7 +22,6 @@ Tensor& maximum_out( input2, out, ScalarType::Char, - /*require_channels_last=*/false, /*require_same_sizes=*/false); auto resize_error = resize_to_broadcast_target_size(input1, input2, out); @@ -78,21 +72,32 @@ Tensor& maximum_out( static_cast( output_rank >= 1 ? output_sizes[output_rank - 1] : 1)}; - const arm_cmsis_nn_status status = arm_maximum_s8( - /* ctx */ nullptr, - input1_data, - &input1_dims, - input2_data, - &input2_dims, - output_data, - &output_dims); - - if (status != ARM_CMSIS_NN_SUCCESS) { - ET_LOG( - Error, - "maximum_out: arm_maximum_s8 failed with status [%d]", - static_cast(status)); - context.fail(Error::Internal); + for (int32_t n = 0; n < output_dims.n; ++n) { + for (int32_t h = 0; h < output_dims.h; ++h) { + for (int32_t w = 0; w < output_dims.w; ++w) { + for (int32_t c = 0; c < output_dims.c; ++c) { + const int32_t n1 = (input1_dims.n == 1) ? 0 : n; + const int32_t h1 = (input1_dims.h == 1) ? 0 : h; + const int32_t w1 = (input1_dims.w == 1) ? 0 : w; + const int32_t c1 = (input1_dims.c == 1) ? 0 : c; + const int32_t n2 = (input2_dims.n == 1) ? 0 : n; + const int32_t h2 = (input2_dims.h == 1) ? 0 : h; + const int32_t w2 = (input2_dims.w == 1) ? 0 : w; + const int32_t c2 = (input2_dims.c == 1) ? 0 : c; + const int32_t idx1 = + ((n1 * input1_dims.h + h1) * input1_dims.w + w1) * input1_dims.c + + c1; + const int32_t idx2 = + ((n2 * input2_dims.h + h2) * input2_dims.w + w2) * input2_dims.c + + c2; + const int32_t out_idx = + ((n * output_dims.h + h) * output_dims.w + w) * output_dims.c + c; + output_data[out_idx] = input1_data[idx1] > input2_data[idx2] + ? input1_data[idx1] + : input2_data[idx2]; + } + } + } } return out; diff --git a/backends/cortex_m/ops/op_minimum.cpp b/backends/cortex_m/ops/op_minimum.cpp index f220aa2664b..f97a42d619d 100644 --- a/backends/cortex_m/ops/op_minimum.cpp +++ b/backends/cortex_m/ops/op_minimum.cpp @@ -9,11 +9,6 @@ #include "cortex_m_ops_common.h" -// Include CMSIS-NN headers with C linkage -extern "C" { -#include "arm_nnfunctions.h" -} - namespace cortex_m { namespace native { @@ -29,7 +24,6 @@ Tensor& minimum_out( input2, out, ScalarType::Char, - /*require_channels_last=*/false, /*require_same_sizes=*/false); auto resize_error = resize_to_broadcast_target_size(input1, input2, out); @@ -80,21 +74,32 @@ Tensor& minimum_out( static_cast( output_rank >= 1 ? output_sizes[output_rank - 1] : 1)}; - const arm_cmsis_nn_status status = arm_minimum_s8( - /* ctx */ nullptr, - input1_data, - &input1_dims, - input2_data, - &input2_dims, - output_data, - &output_dims); - - if (status != ARM_CMSIS_NN_SUCCESS) { - ET_LOG( - Error, - "minimum_out: arm_minimum_s8 failed with status [%d]", - static_cast(status)); - context.fail(Error::Internal); + for (int32_t n = 0; n < output_dims.n; ++n) { + for (int32_t h = 0; h < output_dims.h; ++h) { + for (int32_t w = 0; w < output_dims.w; ++w) { + for (int32_t c = 0; c < output_dims.c; ++c) { + const int32_t n1 = (input1_dims.n == 1) ? 0 : n; + const int32_t h1 = (input1_dims.h == 1) ? 0 : h; + const int32_t w1 = (input1_dims.w == 1) ? 0 : w; + const int32_t c1 = (input1_dims.c == 1) ? 0 : c; + const int32_t n2 = (input2_dims.n == 1) ? 0 : n; + const int32_t h2 = (input2_dims.h == 1) ? 0 : h; + const int32_t w2 = (input2_dims.w == 1) ? 0 : w; + const int32_t c2 = (input2_dims.c == 1) ? 0 : c; + const int32_t idx1 = + ((n1 * input1_dims.h + h1) * input1_dims.w + w1) * input1_dims.c + + c1; + const int32_t idx2 = + ((n2 * input2_dims.h + h2) * input2_dims.w + w2) * input2_dims.c + + c2; + const int32_t out_idx = + ((n * output_dims.h + h) * output_dims.w + w) * output_dims.c + c; + output_data[out_idx] = input1_data[idx1] < input2_data[idx2] + ? input1_data[idx1] + : input2_data[idx2]; + } + } + } } return out; diff --git a/backends/cortex_m/ops/op_pad.cpp b/backends/cortex_m/ops/op_pad.cpp index 739c584c419..132fa9d1856 100644 --- a/backends/cortex_m/ops/op_pad.cpp +++ b/backends/cortex_m/ops/op_pad.cpp @@ -8,10 +8,6 @@ #include "cortex_m_ops_common.h" -extern "C" { -#include "arm_nnfunctions.h" -} - namespace cortex_m { namespace native { @@ -74,21 +70,35 @@ Tensor& pad_out( const int8_t* input_data = input.const_data_ptr(); int8_t* output_data = out.mutable_data_ptr(); - const arm_cmsis_nn_status status = arm_pad_s8( - input_data, - output_data, - static_cast(pad_value), - &input_dims, - &cmsis_pre_pad, - &cmsis_post_pad); - - if (status != ARM_CMSIS_NN_SUCCESS) { - ET_LOG( - Error, - "pad_out: arm_pad_s8 failed with status [%d]", - static_cast(status)); - context.fail(Error::Internal); - return out; + const int32_t out_n = input_dims.n + cmsis_pre_pad.n + cmsis_post_pad.n; + const int32_t out_h = input_dims.h + cmsis_pre_pad.h + cmsis_post_pad.h; + const int32_t out_w = input_dims.w + cmsis_pre_pad.w + cmsis_post_pad.w; + const int32_t out_c = input_dims.c + cmsis_pre_pad.c + cmsis_post_pad.c; + + const int8_t pad_byte = static_cast(pad_value); + for (int32_t n = 0; n < out_n; ++n) { + for (int32_t h = 0; h < out_h; ++h) { + for (int32_t w = 0; w < out_w; ++w) { + for (int32_t c = 0; c < out_c; ++c) { + const int32_t out_idx = ((n * out_h + h) * out_w + w) * out_c + c; + const int32_t in_n = n - cmsis_pre_pad.n; + const int32_t in_h = h - cmsis_pre_pad.h; + const int32_t in_w = w - cmsis_pre_pad.w; + const int32_t in_c = c - cmsis_pre_pad.c; + if (in_n >= 0 && in_n < input_dims.n && in_h >= 0 && + in_h < input_dims.h && in_w >= 0 && in_w < input_dims.w && + in_c >= 0 && in_c < input_dims.c) { + const int32_t in_idx = + ((in_n * input_dims.h + in_h) * input_dims.w + in_w) * + input_dims.c + + in_c; + output_data[out_idx] = input_data[in_idx]; + } else { + output_data[out_idx] = pad_byte; + } + } + } + } } return out; diff --git a/backends/cortex_m/ops/op_quantized_add.cpp b/backends/cortex_m/ops/op_quantized_add.cpp index 2cab7dc37fb..4eb6812bc7c 100644 --- a/backends/cortex_m/ops/op_quantized_add.cpp +++ b/backends/cortex_m/ops/op_quantized_add.cpp @@ -9,11 +9,6 @@ #include "cortex_m_ops_common.h" -// Include CMSIS-NN headers with C linkage -extern "C" { -#include "arm_nnfunctions.h" -} - namespace cortex_m { namespace native { using KernelRuntimeContext = torch::executor::KernelRuntimeContext; @@ -39,21 +34,16 @@ Tensor& quantized_add_out( input2_int8, out, ScalarType::Char, - /*require_channels_last=*/channel_broadcast, /*require_same_sizes=*/!channel_broadcast); // Validate quantization parameters validate_quantization_params( - input1_zero_point, input1_multiplier, input1_shift, - input2_zero_point, input2_multiplier, input2_shift, - output_zero_point, output_multiplier, - output_shift, - out); + output_shift); ET_LOG( Debug, diff --git a/backends/cortex_m/ops/op_quantized_avg_pool2d.cpp b/backends/cortex_m/ops/op_quantized_avg_pool2d.cpp index ad77bb54aff..293c6ea6957 100644 --- a/backends/cortex_m/ops/op_quantized_avg_pool2d.cpp +++ b/backends/cortex_m/ops/op_quantized_avg_pool2d.cpp @@ -7,10 +7,6 @@ #include "cortex_m_ops_common.h" -extern "C" { -#include "arm_nnfunctions.h" -} - namespace cortex_m { namespace native { diff --git a/backends/cortex_m/ops/op_quantized_conv2d.cpp b/backends/cortex_m/ops/op_quantized_conv2d.cpp index 3eae9507ba7..13ff3c0c7a0 100644 --- a/backends/cortex_m/ops/op_quantized_conv2d.cpp +++ b/backends/cortex_m/ops/op_quantized_conv2d.cpp @@ -7,10 +7,6 @@ #include "cortex_m_ops_common.h" -extern "C" { -#include "arm_nnfunctions.h" -} - namespace cortex_m { namespace native { @@ -39,10 +35,6 @@ bool validate_conv2d_arguments( // Check for channels_last dim_order (NHWC: 0, 2, 3, 1) // Skip check if channels == 1, as dim_order is ambiguous in that case - constexpr executorch::aten::DimOrderType kChannelsLastDimOrder[] = { - 0, 2, 3, 1}; - executorch::aten::ArrayRef - channels_last_order(kChannelsLastDimOrder, 4); if (input.size(1) > 1 && !is_channels_last_tensor(input)) { ET_LOG( diff --git a/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp b/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp index b3cf926c2e1..8dec61e0af1 100644 --- a/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp +++ b/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp @@ -7,10 +7,6 @@ #include "cortex_m_ops_common.h" -extern "C" { -#include "arm_nnfunctions.h" -} - namespace cortex_m { namespace native { diff --git a/backends/cortex_m/ops/op_quantized_linear.cpp b/backends/cortex_m/ops/op_quantized_linear.cpp index f04b65fa1fb..5d018cbc0c4 100644 --- a/backends/cortex_m/ops/op_quantized_linear.cpp +++ b/backends/cortex_m/ops/op_quantized_linear.cpp @@ -9,10 +9,6 @@ #include "cortex_m_ops_common.h" -extern "C" { -#include "arm_nnfunctions.h" -} - namespace cortex_m { namespace native { using KernelRuntimeContext = torch::executor::KernelRuntimeContext; diff --git a/backends/cortex_m/ops/op_quantized_max_pool2d.cpp b/backends/cortex_m/ops/op_quantized_max_pool2d.cpp index 470a7ae791e..181a29c1b65 100644 --- a/backends/cortex_m/ops/op_quantized_max_pool2d.cpp +++ b/backends/cortex_m/ops/op_quantized_max_pool2d.cpp @@ -7,10 +7,6 @@ #include "cortex_m_ops_common.h" -extern "C" { -#include "arm_nnfunctions.h" -} - namespace cortex_m { namespace native { diff --git a/backends/cortex_m/ops/op_quantized_mul.cpp b/backends/cortex_m/ops/op_quantized_mul.cpp index 3d9d6ab54a4..5d59a6fe52a 100644 --- a/backends/cortex_m/ops/op_quantized_mul.cpp +++ b/backends/cortex_m/ops/op_quantized_mul.cpp @@ -7,11 +7,6 @@ #include "cortex_m_ops_common.h" -// Include CMSIS-NN headers with C linkage -extern "C" { -#include "arm_nnfunctions.h" -} - namespace cortex_m { namespace native { namespace { @@ -41,22 +36,17 @@ Tensor& quantized_mul_out( input2_int8, out, ScalarType::Char, - /*require_channels_last=*/channel_broadcast, /*require_same_sizes=*/!channel_broadcast); const int32_t kIdentityMultiplier(/*value=*/1); const int32_t kZeroShift(/*value=*/0); validate_quantization_params( - input1_zero_point, kIdentityMultiplier, kZeroShift, - input2_zero_point, kIdentityMultiplier, kZeroShift, - output_zero_point, output_multiplier, - output_shift, - out); + output_shift); // Extract quantization parameters int8_t* input1_ptr = input1_int8.data_ptr(); diff --git a/backends/cortex_m/ops/op_quantized_transpose_conv2d.cpp b/backends/cortex_m/ops/op_quantized_transpose_conv2d.cpp index 7126a2b2cf7..71f3f35ec67 100644 --- a/backends/cortex_m/ops/op_quantized_transpose_conv2d.cpp +++ b/backends/cortex_m/ops/op_quantized_transpose_conv2d.cpp @@ -8,10 +8,6 @@ #include "cortex_m_ops_common.h" -extern "C" { -#include "arm_nnfunctions.h" -} - namespace cortex_m { namespace native { @@ -113,6 +109,10 @@ Tensor& quantized_transpose_conv2d_out( return out; } + ET_CHECK_MSG( + output_padding[0] == 0 && output_padding[1] == 0, + "quantized_transpose_conv2d: non-zero output_padding is not supported"); + const int32_t batch = static_cast(input.size(0)); const int32_t input_channels = static_cast(input.size(1)); const int32_t input_height = static_cast(input.size(2)); @@ -137,43 +137,27 @@ Tensor& quantized_transpose_conv2d_out( return out; } + ET_CHECK_MSG( + weight.size(3) == input_channels, + "quantized_transpose_conv2d: weight input channels (%d) must match input channels (%d)", + static_cast(weight.size(3)), + static_cast(input_channels)); + const int32_t input_offset_val = static_cast(input_offset); const int32_t output_offset_val = static_cast(output_offset); const int32_t activation_min_val = static_cast(activation_min); const int32_t activation_max_val = static_cast(activation_max); - const cmsis_nn_dims input_dims{ - batch, input_height, input_width, input_channels}; - const cmsis_nn_dims filter_dims{ - kernel_output_channels, - kernel_height, - kernel_width, - kernel_input_channels}; - const cmsis_nn_dims output_dims{ - batch, output_height, output_width, output_channels}; - const cmsis_nn_dims bias_dims{1, 1, 1, output_channels}; - - // Setup transposed convolution parameters - cmsis_nn_transpose_conv_params transpose_conv_params; - transpose_conv_params.input_offset = input_offset_val; - transpose_conv_params.output_offset = output_offset_val; - transpose_conv_params.stride.h = static_cast(stride[0]); - transpose_conv_params.stride.w = static_cast(stride[1]); - transpose_conv_params.padding.h = static_cast(padding[0]); - transpose_conv_params.padding.w = static_cast(padding[1]); - // padding_offsets corresponds to output_padding in PyTorch - transpose_conv_params.padding_offsets.h = - static_cast(output_padding[0]); - transpose_conv_params.padding_offsets.w = - static_cast(output_padding[1]); - transpose_conv_params.dilation.h = static_cast(dilation[0]); - transpose_conv_params.dilation.w = static_cast(dilation[1]); - transpose_conv_params.activation.min = activation_min_val; - transpose_conv_params.activation.max = activation_max_val; - - cmsis_nn_per_channel_quant_params quant_params; - quant_params.multiplier = requantize_multipliers.data_ptr(); - quant_params.shift = requantize_shifts.data_ptr(); + const int32_t stride_h = static_cast(stride[0]); + const int32_t stride_w = static_cast(stride[1]); + const int32_t pad_h = static_cast(padding[0]); + const int32_t pad_w = static_cast(padding[1]); + const int32_t dil_h = static_cast(dilation[0]); + const int32_t dil_w = static_cast(dilation[1]); + + const int32_t* multiplier_data = + requantize_multipliers.const_data_ptr(); + const int32_t* shift_data = requantize_shifts.const_data_ptr(); const int8_t* input_data = input.const_data_ptr(); const int8_t* weight_data = weight.const_data_ptr(); @@ -181,67 +165,83 @@ Tensor& quantized_transpose_conv2d_out( const int32_t* bias_data = bias.has_value() ? bias.value().const_data_ptr() : nullptr; - cmsis_nn_context cmsis_context; - cmsis_context.buf = nullptr; - cmsis_context.size = 0; - - cmsis_nn_context output_context; - output_context.buf = nullptr; - output_context.size = 0; - - const int32_t buffer_bytes = arm_transpose_conv_s8_get_buffer_size( - &transpose_conv_params, &input_dims, &filter_dims, &output_dims); - auto buffer_or_error = context.allocate_temp( - static_cast(buffer_bytes), kCortexMMveAlignment); - if (!buffer_or_error.ok()) { - ET_LOG( - Error, - "quantized_transpose_conv2d_out: failed to allocate scratch buffer (%d bytes, error %d)", - buffer_bytes, - static_cast(buffer_or_error.error())); - context.fail(buffer_or_error.error()); - return out; - } - cmsis_context.buf = buffer_or_error.get(); - cmsis_context.size = buffer_bytes; - - const int32_t output_buffer_bytes = - arm_transpose_conv_s8_get_reverse_conv_buffer_size( - &transpose_conv_params, &input_dims, &filter_dims); - auto output_buffer_or_error = context.allocate_temp( - static_cast(output_buffer_bytes), kCortexMMveAlignment); - if (!output_buffer_or_error.ok()) { - ET_LOG( - Error, - "quantized_transpose_conv2d_out: failed to allocate output scratch buffer (%d bytes, error %d)", - output_buffer_bytes, - static_cast(output_buffer_or_error.error())); - context.fail(output_buffer_or_error.error()); - return out; - } - output_context.buf = output_buffer_or_error.get(); - output_context.size = output_buffer_bytes; - - const arm_cmsis_nn_status status = arm_transpose_conv_wrapper_s8( - &cmsis_context, - &output_context, - &transpose_conv_params, - &quant_params, - &input_dims, - input_data, - &filter_dims, - weight_data, - &bias_dims, - bias_data, - &output_dims, - output_data); - - if (status != ARM_CMSIS_NN_SUCCESS) { - ET_LOG( - Error, - "quantized_transpose_conv2d_out: arm_transpose_conv_wrapper_s8 failed with status %d", - status); - context.fail(Error::Internal); + // Reference transposed conv (output-centric, channels-last NHWC layout). + // Weight layout: [output_channels, kernel_h, kernel_w, input_channels] (OHWI). + // For each output position (n, oh, ow, oc), the contributing input positions + // satisfy: ih = (oh - kh*dil_h + pad_h) / stride_h (must be a non-negative + // integer within input bounds), and similarly for iw. + for (int32_t n = 0; n < batch; ++n) { + for (int32_t oh = 0; oh < output_height; ++oh) { + for (int32_t ow = 0; ow < output_width; ++ow) { + for (int32_t oc = 0; oc < output_channels; ++oc) { + int32_t acc = bias_data != nullptr ? bias_data[oc] : 0; + + for (int32_t kh = 0; kh < kernel_height; ++kh) { + const int32_t ih_raw = oh - kh * dil_h + pad_h; + if (ih_raw < 0 || ih_raw % stride_h != 0) { + continue; + } + const int32_t ih = ih_raw / stride_h; + if (ih >= input_height) { + continue; + } + + for (int32_t kw = 0; kw < kernel_width; ++kw) { + const int32_t iw_raw = ow - kw * dil_w + pad_w; + if (iw_raw < 0 || iw_raw % stride_w != 0) { + continue; + } + const int32_t iw = iw_raw / stride_w; + if (iw >= input_width) { + continue; + } + + for (int32_t ic = 0; ic < input_channels; ++ic) { + const int64_t in_idx = + ((static_cast(n) * input_height + ih) * + input_width + + iw) * + input_channels + + ic; + const int64_t w_idx = + ((static_cast(oc) * kernel_height + kh) * + kernel_width + + kw) * + input_channels + + ic; + acc += (static_cast(input_data[in_idx]) + + input_offset_val) * + static_cast(weight_data[w_idx]); + } + } + } + + // Per-channel requantization: result = round(acc * multiplier / 2^(31-shift)) + const int32_t mul = multiplier_data[oc]; + const int32_t sft = shift_data[oc]; + const int32_t right_shift = 31 - sft; + const int64_t acc64 = static_cast(acc) * mul; + int32_t result; + if (right_shift > 0) { + result = static_cast( + (acc64 + (1LL << (right_shift - 1))) >> right_shift); + } else { + result = static_cast(acc64 << (-right_shift)); + } + + result += output_offset_val; + result = result < activation_min_val ? activation_min_val : result; + result = result > activation_max_val ? activation_max_val : result; + const int64_t out_idx = + ((static_cast(n) * output_height + oh) * + output_width + + ow) * + output_channels + + oc; + output_data[out_idx] = static_cast(result); + } + } + } } return out; diff --git a/backends/cortex_m/ops/op_softmax.cpp b/backends/cortex_m/ops/op_softmax.cpp index a2b8f27fac1..a72750f4049 100644 --- a/backends/cortex_m/ops/op_softmax.cpp +++ b/backends/cortex_m/ops/op_softmax.cpp @@ -11,11 +11,6 @@ #include #include -// Include CMSIS-NN headers with C linkage -extern "C" { -#include "arm_nnfunctions.h" -} - namespace cortex_m { namespace native { @@ -77,10 +72,7 @@ Tensor& softmax_out( const int32_t diff_min_val = static_cast(diff_min); validate_single_quant_params( - static_cast(input_zero_point), - input_multiplier_val, - input_shift_val, - "softmax input"); + input_multiplier_val, input_shift_val, "softmax input"); const auto positive_dim = normalize_dim(input, dim); const int64_t row_size64 = input.size(positive_dim); diff --git a/backends/cortex_m/ops/op_transpose.cpp b/backends/cortex_m/ops/op_transpose.cpp index 25458435a3c..074deab9783 100644 --- a/backends/cortex_m/ops/op_transpose.cpp +++ b/backends/cortex_m/ops/op_transpose.cpp @@ -9,12 +9,6 @@ #include #include -#include - -// Include CMSIS-NN headers with C linkage -extern "C" { -#include "arm_nnfunctions.h" -} namespace cortex_m { namespace native { @@ -83,38 +77,45 @@ Tensor& transpose_out( output_dims_arr[i] = static_cast(out_size); } - cmsis_nn_dims input_dims = { - input_dims_arr[0], - input_dims_arr[1], - input_dims_arr[2], - input_dims_arr[3]}; - cmsis_nn_dims output_dims = { - output_dims_arr[0], - output_dims_arr[1], - output_dims_arr[2], - output_dims_arr[3]}; + // Compute row-major strides for input and output + std::array input_strides{1, 1, 1, 1}; + for (int i = static_cast(kMaxSupportedDims) - 2; i >= 0; --i) { + input_strides[i] = input_strides[i + 1] * input_dims_arr[i + 1]; + } + std::array output_strides{1, 1, 1, 1}; + for (int i = static_cast(kMaxSupportedDims) - 2; i >= 0; --i) { + output_strides[i] = output_strides[i + 1] * output_dims_arr[i + 1]; + } std::array perm_buffer{0, 1, 2, 3}; for (size_t i = 0; i < rank; ++i) { perm_buffer[i] = static_cast(perm[i]); } - const cmsis_nn_transpose_params transpose_params{ - static_cast(rank), perm_buffer.data()}; - const int8_t* input_data = input.const_data_ptr(); int8_t* output_data = out.mutable_data_ptr(); - const arm_cmsis_nn_status status = arm_transpose_s8( - input_data, output_data, &input_dims, &output_dims, &transpose_params); - - if (status != ARM_CMSIS_NN_SUCCESS) { - ET_LOG( - Error, - "transpose_out: arm_transpose_s8 failed with status [%d]", - static_cast(status)); - context.fail(Error::Internal); - return out; + for (int32_t i0 = 0; i0 < output_dims_arr[0]; ++i0) { + for (int32_t i1 = 0; i1 < output_dims_arr[1]; ++i1) { + for (int32_t i2 = 0; i2 < output_dims_arr[2]; ++i2) { + for (int32_t i3 = 0; i3 < output_dims_arr[3]; ++i3) { + const std::array out_idx{i0, i1, i2, i3}; + std::array in_idx{0, 0, 0, 0}; + for (size_t k = 0; k < kMaxSupportedDims; ++k) { + in_idx[perm_buffer[k]] = out_idx[k]; + } + const int64_t in_offset = in_idx[0] * input_strides[0] + + in_idx[1] * input_strides[1] + + in_idx[2] * input_strides[2] + + in_idx[3] * input_strides[3]; + const int64_t out_offset = i0 * output_strides[0] + + i1 * output_strides[1] + + i2 * output_strides[2] + + i3 * output_strides[3]; + output_data[out_offset] = input_data[in_offset]; + } + } + } } return out;