Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions backends/cortex_m/ops/cmsis_scratch_buffer_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,8 @@
*/
#pragma once

#include "cortex_m_ops_common.h"
extern "C" {
#include "arm_nnfunctions.h"
}
#include "cortex_m_ops_common.h"

namespace cortex_m {
namespace native {
Expand Down
78 changes: 33 additions & 45 deletions backends/cortex_m/ops/cortex_m_ops_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,12 @@
#include <executorch/kernels/portable/cpu/util/kernel_ops_util.h>
#include <executorch/runtime/platform/assert.h>

#include <array>
#include <cinttypes>
#include <limits>
#include <optional>

extern "C" {
#include "arm_nn_types.h"
}
#include "arm_nnfunctions.h"

using Tensor = torch::executor::Tensor;
using ScalarType = executorch::aten::ScalarType;
Expand All @@ -42,24 +42,23 @@ inline void validate_cmsis_nn_tensor_requirements(
const Tensor& input2,
Tensor& output,
ScalarType expected_dtype = ScalarType::Char,
bool require_channels_last = false,
bool require_same_sizes = true) {
// Basic dtype validation
ET_CHECK_MSG(
input1.scalar_type() == expected_dtype,
"Input1 dtype must be %hhd, got %hhd",
expected_dtype,
input1.scalar_type());
"Input1 dtype must be %d, got %d",
static_cast<int>(expected_dtype),
static_cast<int>(input1.scalar_type()));
ET_CHECK_MSG(
input2.scalar_type() == expected_dtype,
"Input2 dtype must be %hhd, got %hhd",
expected_dtype,
input2.scalar_type());
"Input2 dtype must be %d, got %d",
static_cast<int>(expected_dtype),
static_cast<int>(input2.scalar_type()));
ET_CHECK_MSG(
output.scalar_type() == expected_dtype,
"Output dtype must be %hhd, got %hhd",
expected_dtype,
output.scalar_type());
"Output dtype must be %d, got %d",
static_cast<int>(expected_dtype),
static_cast<int>(output.scalar_type()));
if (require_same_sizes) {
ET_CHECK_MSG(
input1.sizes() == input2.sizes(),
Expand All @@ -74,55 +73,44 @@ inline void validate_cmsis_nn_tensor_requirements(
}

inline void validate_single_quant_params(
const int64_t zero_point,
const int64_t multiplier,
const int64_t shift,
const char* param_name) {
ET_CHECK_MSG(
multiplier >= std::numeric_limits<int32_t>::min() &&
multiplier <= std::numeric_limits<int32_t>::max(),
"%s multiplier must be in int32 range [Value: %d]",
"%s multiplier must be in int32 range [Value: %" PRIi64 "]",
param_name,
multiplier);

ET_CHECK_MSG(
shift >= -31 && shift <= 31,
"%s shift must be in range [-31, 31] [Value: %d]",
"%s shift must be in range [-31, 31] [Value: %" PRIi64 "]",
param_name,
shift);
}

/**
* Validate quantization parameters for inputs and output.
*
* Checks that zero points fit in int8 range, multipliers fit in int32 range,
* and shifts are within a valid bit-shift range (0-31).
* Checks that multipliers fit in int32 range and shifts are within a valid
* bit-shift range (-31 to 31).
*
* Ensures parameters comply with Ahead-Of-Time (AOT) quantization requirements
* and CMSIS-NN kernel expectations.
* Ensures parameters comply with Ahead-Of-Time (AOT) quantization requirements.
*
* Raises errors via ET_KERNEL_CHECK if any check fails.
* Raises errors via ET_CHECK_MSG if any check fails.
*/
inline void validate_quantization_params(
const int64_t zero_point1,
const int64_t multiplier1,
const int64_t shift1,
const int64_t zero_point2,
const int64_t multiplier2,
const int64_t shift2,
const int64_t output_zero_point,
const int64_t output_multiplier,
const int64_t output_shift,
Tensor& output) {
validate_single_quant_params(
zero_point1, multiplier1, shift1, "Single quant Input1");
const int64_t output_shift) {
Comment on lines 103 to +109
Copy link

Copilot AI Mar 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

validate_quantization_params no longer validates zero points even though the preceding doc comment still states it does. Either update the doc comment accordingly, or re-add int8 range checks for the input/output zero points so callers still get the documented validation.

Copilot uses AI. Check for mistakes.
validate_single_quant_params(multiplier1, shift1, "Single quant Input1");
validate_single_quant_params(multiplier2, shift2, "Single quant Input2");
validate_single_quant_params(
zero_point2, multiplier2, shift2, "Single quant Input2");
validate_single_quant_params(
output_zero_point,
output_multiplier,
output_shift,
"Single quant Output");
output_multiplier, output_shift, "Single quant Output");
}

inline bool is_channels_last_tensor(const Tensor& tensor) {
Expand All @@ -135,11 +123,10 @@ inline bool is_channels_last_tensor(const Tensor& tensor) {
return true;
}

constexpr executorch::aten::DimOrderType kChannelsLastDimOrder[] = {
0, 2, 3, 1};
constexpr std::array<executorch::aten::DimOrderType, 4>
kChannelsLastDimOrder = {0, 2, 3, 1};
executorch::aten::ArrayRef<executorch::aten::DimOrderType>
channels_last_order(kChannelsLastDimOrder, 4);

channels_last_order(kChannelsLastDimOrder);
return tensor.dim_order() == channels_last_order;
}

Expand Down Expand Up @@ -172,7 +159,7 @@ inline bool check_int32_within_range(
value > std::numeric_limits<int32_t>::max()) {
ET_LOG(
Error,
"%s: %s value (%ld) exceeds int32_t range",
"%s: %s value (%" PRIi64 ") exceeds int32_t range",
op_name,
value_name,
value);
Expand Down Expand Up @@ -354,14 +341,14 @@ inline bool validate_per_channel_quant_params(
if (multipliers[i] <= ARM_NN_Q31_MIN || multipliers[i] > ARM_NN_Q31_MAX) {
ET_LOG(
Error,
"weight_multiplier[%d] out of CMSIS-NN range: %d",
"weight_multiplier[%d] out of CMSIS-NN range: %" PRIi64,
i,
multipliers[i]);
return false;
}
// Shift must lie in the inclusive range [-31, 30] for arm_nn_requantize
if (shifts[i] < -31 || shifts[i] > 30) {
ET_LOG(Error, "weight_shift[%d] out of range: %d", i, shifts[i]);
ET_LOG(Error, "weight_shift[%d] out of range: %" PRIi64, i, shifts[i]);
return false;
}
}
Expand All @@ -373,18 +360,19 @@ inline Error resize_to_broadcast_target_size(
const Tensor& input2,
Tensor& output) {
static constexpr int kTensorDimensionLimit = 5;
Tensor::SizesType expected_output_size[kTensorDimensionLimit];
std::array<Tensor::SizesType, kTensorDimensionLimit> expected_output_size{};
size_t expected_output_dim = 0;
auto err = torch::executor::get_broadcast_target_size(
input1,
input2,
expected_output_size,
expected_output_size.data(),
kTensorDimensionLimit,
&expected_output_dim);

if (err != Error::Ok)
if (err != Error::Ok) {
return err;
}

return executorch::runtime::resize_tensor(
output, {expected_output_size, expected_output_dim});
output, {expected_output_size.data(), expected_output_dim});
}
47 changes: 26 additions & 21 deletions backends/cortex_m/ops/op_maximum.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,6 @@

#include "cortex_m_ops_common.h"

// Include CMSIS-NN headers with C linkage
extern "C" {
#include "arm_nnfunctions.h"
}

namespace cortex_m {
namespace native {

Expand All @@ -27,7 +22,6 @@ Tensor& maximum_out(
input2,
out,
ScalarType::Char,
/*require_channels_last=*/false,
/*require_same_sizes=*/false);

auto resize_error = resize_to_broadcast_target_size(input1, input2, out);
Expand Down Expand Up @@ -78,21 +72,32 @@ Tensor& maximum_out(
static_cast<int32_t>(
output_rank >= 1 ? output_sizes[output_rank - 1] : 1)};

const arm_cmsis_nn_status status = arm_maximum_s8(
/* ctx */ nullptr,
input1_data,
&input1_dims,
input2_data,
&input2_dims,
output_data,
&output_dims);

if (status != ARM_CMSIS_NN_SUCCESS) {
ET_LOG(
Error,
"maximum_out: arm_maximum_s8 failed with status [%d]",
static_cast<int>(status));
context.fail(Error::Internal);
for (int32_t n = 0; n < output_dims.n; ++n) {
for (int32_t h = 0; h < output_dims.h; ++h) {
for (int32_t w = 0; w < output_dims.w; ++w) {
for (int32_t c = 0; c < output_dims.c; ++c) {
const int32_t n1 = (input1_dims.n == 1) ? 0 : n;
const int32_t h1 = (input1_dims.h == 1) ? 0 : h;
const int32_t w1 = (input1_dims.w == 1) ? 0 : w;
const int32_t c1 = (input1_dims.c == 1) ? 0 : c;
const int32_t n2 = (input2_dims.n == 1) ? 0 : n;
const int32_t h2 = (input2_dims.h == 1) ? 0 : h;
const int32_t w2 = (input2_dims.w == 1) ? 0 : w;
const int32_t c2 = (input2_dims.c == 1) ? 0 : c;
const int32_t idx1 =
((n1 * input1_dims.h + h1) * input1_dims.w + w1) * input1_dims.c +
c1;
const int32_t idx2 =
((n2 * input2_dims.h + h2) * input2_dims.w + w2) * input2_dims.c +
c2;
const int32_t out_idx =
((n * output_dims.h + h) * output_dims.w + w) * output_dims.c + c;
output_data[out_idx] = input1_data[idx1] > input2_data[idx2]
? input1_data[idx1]
: input2_data[idx2];
}
}
}
}

return out;
Expand Down
47 changes: 26 additions & 21 deletions backends/cortex_m/ops/op_minimum.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,6 @@

#include "cortex_m_ops_common.h"

// Include CMSIS-NN headers with C linkage
extern "C" {
#include "arm_nnfunctions.h"
}

namespace cortex_m {
namespace native {

Expand All @@ -29,7 +24,6 @@ Tensor& minimum_out(
input2,
out,
ScalarType::Char,
/*require_channels_last=*/false,
/*require_same_sizes=*/false);

auto resize_error = resize_to_broadcast_target_size(input1, input2, out);
Expand Down Expand Up @@ -80,21 +74,32 @@ Tensor& minimum_out(
static_cast<int32_t>(
output_rank >= 1 ? output_sizes[output_rank - 1] : 1)};

const arm_cmsis_nn_status status = arm_minimum_s8(
/* ctx */ nullptr,
input1_data,
&input1_dims,
input2_data,
&input2_dims,
output_data,
&output_dims);

if (status != ARM_CMSIS_NN_SUCCESS) {
ET_LOG(
Error,
"minimum_out: arm_minimum_s8 failed with status [%d]",
static_cast<int>(status));
context.fail(Error::Internal);
for (int32_t n = 0; n < output_dims.n; ++n) {
for (int32_t h = 0; h < output_dims.h; ++h) {
for (int32_t w = 0; w < output_dims.w; ++w) {
for (int32_t c = 0; c < output_dims.c; ++c) {
const int32_t n1 = (input1_dims.n == 1) ? 0 : n;
const int32_t h1 = (input1_dims.h == 1) ? 0 : h;
const int32_t w1 = (input1_dims.w == 1) ? 0 : w;
const int32_t c1 = (input1_dims.c == 1) ? 0 : c;
const int32_t n2 = (input2_dims.n == 1) ? 0 : n;
const int32_t h2 = (input2_dims.h == 1) ? 0 : h;
const int32_t w2 = (input2_dims.w == 1) ? 0 : w;
const int32_t c2 = (input2_dims.c == 1) ? 0 : c;
const int32_t idx1 =
((n1 * input1_dims.h + h1) * input1_dims.w + w1) * input1_dims.c +
c1;
const int32_t idx2 =
((n2 * input2_dims.h + h2) * input2_dims.w + w2) * input2_dims.c +
c2;
const int32_t out_idx =
((n * output_dims.h + h) * output_dims.w + w) * output_dims.c + c;
output_data[out_idx] = input1_data[idx1] < input2_data[idx2]
? input1_data[idx1]
: input2_data[idx2];
}
}
}
}

return out;
Expand Down
48 changes: 29 additions & 19 deletions backends/cortex_m/ops/op_pad.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,6 @@

#include "cortex_m_ops_common.h"

extern "C" {
#include "arm_nnfunctions.h"
}

namespace cortex_m {
namespace native {

Expand Down Expand Up @@ -74,21 +70,35 @@ Tensor& pad_out(
const int8_t* input_data = input.const_data_ptr<int8_t>();
int8_t* output_data = out.mutable_data_ptr<int8_t>();

const arm_cmsis_nn_status status = arm_pad_s8(
input_data,
output_data,
static_cast<int8_t>(pad_value),
&input_dims,
&cmsis_pre_pad,
&cmsis_post_pad);

if (status != ARM_CMSIS_NN_SUCCESS) {
ET_LOG(
Error,
"pad_out: arm_pad_s8 failed with status [%d]",
static_cast<int>(status));
context.fail(Error::Internal);
return out;
const int32_t out_n = input_dims.n + cmsis_pre_pad.n + cmsis_post_pad.n;
const int32_t out_h = input_dims.h + cmsis_pre_pad.h + cmsis_post_pad.h;
const int32_t out_w = input_dims.w + cmsis_pre_pad.w + cmsis_post_pad.w;
const int32_t out_c = input_dims.c + cmsis_pre_pad.c + cmsis_post_pad.c;

const int8_t pad_byte = static_cast<int8_t>(pad_value);
for (int32_t n = 0; n < out_n; ++n) {
for (int32_t h = 0; h < out_h; ++h) {
for (int32_t w = 0; w < out_w; ++w) {
for (int32_t c = 0; c < out_c; ++c) {
const int32_t out_idx = ((n * out_h + h) * out_w + w) * out_c + c;
const int32_t in_n = n - cmsis_pre_pad.n;
const int32_t in_h = h - cmsis_pre_pad.h;
const int32_t in_w = w - cmsis_pre_pad.w;
const int32_t in_c = c - cmsis_pre_pad.c;
if (in_n >= 0 && in_n < input_dims.n && in_h >= 0 &&
in_h < input_dims.h && in_w >= 0 && in_w < input_dims.w &&
in_c >= 0 && in_c < input_dims.c) {
const int32_t in_idx =
((in_n * input_dims.h + in_h) * input_dims.w + in_w) *
input_dims.c +
in_c;
output_data[out_idx] = input_data[in_idx];
} else {
output_data[out_idx] = pad_byte;
}
}
}
}
}

return out;
Expand Down
Loading
Loading