diff --git a/backends/cortex_m/ops/cmsis_scratch_buffer_context.h b/backends/cortex_m/ops/cmsis_scratch_buffer_context.h
index 4b9fdaebdf7..4672f05e777 100644
--- a/backends/cortex_m/ops/cmsis_scratch_buffer_context.h
+++ b/backends/cortex_m/ops/cmsis_scratch_buffer_context.h
@@ -7,10 +7,8 @@
  */
 #pragma once
 
-#include "cortex_m_ops_common.h"
-extern "C" {
 #include "arm_nnfunctions.h"
-}
+#include "cortex_m_ops_common.h"
 
 namespace cortex_m {
 namespace native {
diff --git a/backends/cortex_m/ops/cortex_m_ops_common.h b/backends/cortex_m/ops/cortex_m_ops_common.h
index 1b31367881f..93802873a23 100644
--- a/backends/cortex_m/ops/cortex_m_ops_common.h
+++ b/backends/cortex_m/ops/cortex_m_ops_common.h
@@ -16,12 +16,12 @@
 #include <executorch/kernels/portable/cpu/util/kernel_ops_util.h>
 #include <executorch/runtime/platform/assert.h>
 
+#include <array>
+#include <cinttypes>
 #include <limits>
-#include <optional>
 
-extern "C" {
 #include "arm_nn_types.h"
-}
+#include "arm_nnfunctions.h"
 
 using Tensor = torch::executor::Tensor;
 using ScalarType = executorch::aten::ScalarType;
@@ -42,24 +42,23 @@ inline void validate_cmsis_nn_tensor_requirements(
     const Tensor& input2,
     Tensor& output,
     ScalarType expected_dtype = ScalarType::Char,
-    bool require_channels_last = false,
     bool require_same_sizes = true) {
   // Basic dtype validation
   ET_CHECK_MSG(
       input1.scalar_type() == expected_dtype,
-      "Input1 dtype must be %hhd, got %hhd",
-      expected_dtype,
-      input1.scalar_type());
+      "Input1 dtype must be %d, got %d",
+      static_cast<int>(expected_dtype),
+      static_cast<int>(input1.scalar_type()));
   ET_CHECK_MSG(
       input2.scalar_type() == expected_dtype,
-      "Input2 dtype must be %hhd, got %hhd",
-      expected_dtype,
-      input2.scalar_type());
+      "Input2 dtype must be %d, got %d",
+      static_cast<int>(expected_dtype),
+      static_cast<int>(input2.scalar_type()));
   ET_CHECK_MSG(
       output.scalar_type() == expected_dtype,
-      "Output dtype must be %hhd, got %hhd",
-      expected_dtype,
-      output.scalar_type());
+      "Output dtype must be %d, got %d",
+      static_cast<int>(expected_dtype),
+      static_cast<int>(output.scalar_type()));
   if (require_same_sizes) {
     ET_CHECK_MSG(
         input1.sizes() == input2.sizes(),
@@ -74,20 +73,19 @@ inline void validate_cmsis_nn_tensor_requirements(
 }
 
 inline void validate_single_quant_params(
-    const int64_t zero_point,
     const int64_t multiplier,
     const int64_t shift,
     const char* param_name) {
   ET_CHECK_MSG(
       multiplier >= std::numeric_limits<int32_t>::min() &&
           multiplier <= std::numeric_limits<int32_t>::max(),
-      "%s multiplier must be in int32 range [Value: %d]",
+      "%s multiplier must be in int32 range [Value: %" PRIi64 "]",
       param_name,
       multiplier);
 
   ET_CHECK_MSG(
       shift >= -31 && shift <= 31,
-      "%s shift must be in range [-31, 31] [Value: %d]",
+      "%s shift must be in range [-31, 31] [Value: %" PRIi64 "]",
       param_name,
       shift);
 }
@@ -95,34 +93,24 @@ inline void validate_single_quant_params(
 /**
  * Validate quantization parameters for inputs and output.
  *
- * Checks that zero points fit in int8 range, multipliers fit in int32 range,
- * and shifts are within a valid bit-shift range (0-31).
+ * Checks that multipliers fit in int32 range and shifts are within a valid
+ * bit-shift range (-31 to 31).
  *
- * Ensures parameters comply with Ahead-Of-Time (AOT) quantization requirements
- * and CMSIS-NN kernel expectations.
+ * Ensures parameters comply with Ahead-Of-Time (AOT) quantization requirements.
  *
- * Raises errors via ET_KERNEL_CHECK if any check fails.
+ * Aborts via ET_CHECK_MSG (a fatal assertion) if any check fails.
  */
 inline void validate_quantization_params(
-    const int64_t zero_point1,
     const int64_t multiplier1,
     const int64_t shift1,
-    const int64_t zero_point2,
     const int64_t multiplier2,
     const int64_t shift2,
-    const int64_t output_zero_point,
     const int64_t output_multiplier,
-    const int64_t output_shift,
-    Tensor& output) {
-  validate_single_quant_params(
-      zero_point1, multiplier1, shift1, "Single quant Input1");
+    const int64_t output_shift) {
+  validate_single_quant_params(multiplier1, shift1, "Single quant Input1");
+  validate_single_quant_params(multiplier2, shift2, "Single quant Input2");
   validate_single_quant_params(
-      zero_point2, multiplier2, shift2, "Single quant Input2");
-  validate_single_quant_params(
-      output_zero_point,
-      output_multiplier,
-      output_shift,
-      "Single quant Output");
+      output_multiplier, output_shift, "Single quant Output");
 }
 
 inline bool is_channels_last_tensor(const Tensor& tensor) {
@@ -135,11 +123,10 @@ inline bool is_channels_last_tensor(const Tensor& tensor) {
     return true;
   }
 
-  constexpr executorch::aten::DimOrderType kChannelsLastDimOrder[] = {
-      0, 2, 3, 1};
+  constexpr std::array<executorch::aten::DimOrderType, 4>
+      kChannelsLastDimOrder = {0, 2, 3, 1};
   executorch::aten::ArrayRef<executorch::aten::DimOrderType>
-      channels_last_order(kChannelsLastDimOrder, 4);
-
+      channels_last_order(kChannelsLastDimOrder);
   return tensor.dim_order() == channels_last_order;
 }
 
@@ -172,7 +159,7 @@ inline bool check_int32_within_range(
       value > std::numeric_limits<int32_t>::max()) {
     ET_LOG(
         Error,
-        "%s: %s value (%ld) exceeds int32_t range",
+        "%s: %s value (%" PRIi64 ") exceeds int32_t range",
         op_name,
         value_name,
         value);
@@ -354,14 +341,14 @@ inline bool validate_per_channel_quant_params(
     if (multipliers[i] <= ARM_NN_Q31_MIN || multipliers[i] > ARM_NN_Q31_MAX) {
       ET_LOG(
           Error,
-          "weight_multiplier[%d] out of CMSIS-NN range: %d",
+          "weight_multiplier[%d] out of CMSIS-NN range: %" PRIi64,
           i,
           multipliers[i]);
       return false;
     }
     // Shift: {-31, 30} for arm_nn_requantize
     if (shifts[i] < -31 || shifts[i] > 30) {
-      ET_LOG(Error, "weight_shift[%d] out of range: %d", i, shifts[i]);
+      ET_LOG(Error, "weight_shift[%d] out of range: %" PRIi64, i, shifts[i]);
       return false;
     }
   }
@@ -373,18 +360,19 @@ inline Error resize_to_broadcast_target_size(
     const Tensor& input2,
     Tensor& output) {
   static constexpr int kTensorDimensionLimit = 5;
-  Tensor::SizesType expected_output_size[kTensorDimensionLimit];
+  std::array<Tensor::SizesType, kTensorDimensionLimit> expected_output_size{};
   size_t expected_output_dim = 0;
   auto err = torch::executor::get_broadcast_target_size(
       input1,
       input2,
-      expected_output_size,
+      expected_output_size.data(),
       kTensorDimensionLimit,
       &expected_output_dim);
 
-  if (err != Error::Ok)
+  if (err != Error::Ok) {
     return err;
+  }
 
   return executorch::runtime::resize_tensor(
-      output, {expected_output_size, expected_output_dim});
+      output, {expected_output_size.data(), expected_output_dim});
 }
diff --git a/backends/cortex_m/ops/op_maximum.cpp b/backends/cortex_m/ops/op_maximum.cpp
index 71a907f12ea..48fbc449e71 100644
--- a/backends/cortex_m/ops/op_maximum.cpp
+++ b/backends/cortex_m/ops/op_maximum.cpp
@@ -7,11 +7,6 @@
 
 #include "cortex_m_ops_common.h"
 
-// Include CMSIS-NN headers with C linkage
-extern "C" {
-#include "arm_nnfunctions.h"
-}
-
 namespace cortex_m {
 namespace native {
 
@@ -27,7 +22,6 @@ Tensor& maximum_out(
       input2,
       out,
       ScalarType::Char,
-      /*require_channels_last=*/false,
       /*require_same_sizes=*/false);
 
   auto resize_error = resize_to_broadcast_target_size(input1, input2, out);
@@ -78,21 +72,32 @@ Tensor& maximum_out(
       static_cast<int32_t>(
           output_rank >= 1 ? output_sizes[output_rank - 1] : 1)};
 
-  const arm_cmsis_nn_status status = arm_maximum_s8(
-      /* ctx */ nullptr,
-      input1_data,
-      &input1_dims,
-      input2_data,
-      &input2_dims,
-      output_data,
-      &output_dims);
-
-  if (status != ARM_CMSIS_NN_SUCCESS) {
-    ET_LOG(
-        Error,
-        "maximum_out: arm_maximum_s8 failed with status [%d]",
-        static_cast<int>(status));
-    context.fail(Error::Internal);
+  for (int32_t n = 0; n < output_dims.n; ++n) {
+    for (int32_t h = 0; h < output_dims.h; ++h) {
+      for (int32_t w = 0; w < output_dims.w; ++w) {
+        for (int32_t c = 0; c < output_dims.c; ++c) {
+          const int32_t n1 = (input1_dims.n == 1) ? 0 : n;
+          const int32_t h1 = (input1_dims.h == 1) ? 0 : h;
+          const int32_t w1 = (input1_dims.w == 1) ? 0 : w;
+          const int32_t c1 = (input1_dims.c == 1) ? 0 : c;
+          const int32_t n2 = (input2_dims.n == 1) ? 0 : n;
+          const int32_t h2 = (input2_dims.h == 1) ? 0 : h;
+          const int32_t w2 = (input2_dims.w == 1) ? 0 : w;
+          const int32_t c2 = (input2_dims.c == 1) ? 0 : c;
+          const int32_t idx1 =
+              ((n1 * input1_dims.h + h1) * input1_dims.w + w1) * input1_dims.c +
+              c1;
+          const int32_t idx2 =
+              ((n2 * input2_dims.h + h2) * input2_dims.w + w2) * input2_dims.c +
+              c2;
+          const int32_t out_idx =
+              ((n * output_dims.h + h) * output_dims.w + w) * output_dims.c + c;
+          output_data[out_idx] = input1_data[idx1] > input2_data[idx2]
+              ? input1_data[idx1]
+              : input2_data[idx2];
+        }
+      }
+    }
   }
 
   return out;
diff --git a/backends/cortex_m/ops/op_minimum.cpp b/backends/cortex_m/ops/op_minimum.cpp
index f220aa2664b..f97a42d619d 100644
--- a/backends/cortex_m/ops/op_minimum.cpp
+++ b/backends/cortex_m/ops/op_minimum.cpp
@@ -9,11 +9,6 @@
 
 #include "cortex_m_ops_common.h"
 
-// Include CMSIS-NN headers with C linkage
-extern "C" {
-#include "arm_nnfunctions.h"
-}
-
 namespace cortex_m {
 namespace native {
 
@@ -29,7 +24,6 @@ Tensor& minimum_out(
       input2,
       out,
       ScalarType::Char,
-      /*require_channels_last=*/false,
       /*require_same_sizes=*/false);
 
   auto resize_error = resize_to_broadcast_target_size(input1, input2, out);
@@ -80,21 +74,32 @@ Tensor& minimum_out(
       static_cast<int32_t>(
           output_rank >= 1 ? output_sizes[output_rank - 1] : 1)};
 
-  const arm_cmsis_nn_status status = arm_minimum_s8(
-      /* ctx */ nullptr,
-      input1_data,
-      &input1_dims,
-      input2_data,
-      &input2_dims,
-      output_data,
-      &output_dims);
-
-  if (status != ARM_CMSIS_NN_SUCCESS) {
-    ET_LOG(
-        Error,
-        "minimum_out: arm_minimum_s8 failed with status [%d]",
-        static_cast<int>(status));
-    context.fail(Error::Internal);
+  for (int32_t n = 0; n < output_dims.n; ++n) {
+    for (int32_t h = 0; h < output_dims.h; ++h) {
+      for (int32_t w = 0; w < output_dims.w; ++w) {
+        for (int32_t c = 0; c < output_dims.c; ++c) {
+          const int32_t n1 = (input1_dims.n == 1) ? 0 : n;
+          const int32_t h1 = (input1_dims.h == 1) ? 0 : h;
+          const int32_t w1 = (input1_dims.w == 1) ? 0 : w;
+          const int32_t c1 = (input1_dims.c == 1) ? 0 : c;
+          const int32_t n2 = (input2_dims.n == 1) ? 0 : n;
+          const int32_t h2 = (input2_dims.h == 1) ? 0 : h;
+          const int32_t w2 = (input2_dims.w == 1) ? 0 : w;
+          const int32_t c2 = (input2_dims.c == 1) ? 0 : c;
+          const int32_t idx1 =
+              ((n1 * input1_dims.h + h1) * input1_dims.w + w1) * input1_dims.c +
+              c1;
+          const int32_t idx2 =
+              ((n2 * input2_dims.h + h2) * input2_dims.w + w2) * input2_dims.c +
+              c2;
+          const int32_t out_idx =
+              ((n * output_dims.h + h) * output_dims.w + w) * output_dims.c + c;
+          output_data[out_idx] = input1_data[idx1] < input2_data[idx2]
+              ? input1_data[idx1]
+              : input2_data[idx2];
+        }
+      }
+    }
   }
 
   return out;
diff --git a/backends/cortex_m/ops/op_pad.cpp b/backends/cortex_m/ops/op_pad.cpp
index 739c584c419..132fa9d1856 100644
--- a/backends/cortex_m/ops/op_pad.cpp
+++ b/backends/cortex_m/ops/op_pad.cpp
@@ -8,10 +8,6 @@
 
 #include "cortex_m_ops_common.h"
 
-extern "C" {
-#include "arm_nnfunctions.h"
-}
-
 namespace cortex_m {
 namespace native {
 
@@ -74,21 +70,35 @@ Tensor& pad_out(
   const int8_t* input_data = input.const_data_ptr<int8_t>();
   int8_t* output_data = out.mutable_data_ptr<int8_t>();
 
-  const arm_cmsis_nn_status status = arm_pad_s8(
-      input_data,
-      output_data,
-      static_cast<int8_t>(pad_value),
-      &input_dims,
-      &cmsis_pre_pad,
-      &cmsis_post_pad);
-
-  if (status != ARM_CMSIS_NN_SUCCESS) {
-    ET_LOG(
-        Error,
-        "pad_out: arm_pad_s8 failed with status [%d]",
-        static_cast<int>(status));
-    context.fail(Error::Internal);
-    return out;
+  const int32_t out_n = input_dims.n + cmsis_pre_pad.n + cmsis_post_pad.n;
+  const int32_t out_h = input_dims.h + cmsis_pre_pad.h + cmsis_post_pad.h;
+  const int32_t out_w = input_dims.w + cmsis_pre_pad.w + cmsis_post_pad.w;
+  const int32_t out_c = input_dims.c + cmsis_pre_pad.c + cmsis_post_pad.c;
+
+  const int8_t pad_byte = static_cast<int8_t>(pad_value);
+  for (int32_t n = 0; n < out_n; ++n) {
+    for (int32_t h = 0; h < out_h; ++h) {
+      for (int32_t w = 0; w < out_w; ++w) {
+        for (int32_t c = 0; c < out_c; ++c) {
+          const int32_t out_idx = ((n * out_h + h) * out_w + w) * out_c + c;
+          const int32_t in_n = n - cmsis_pre_pad.n;
+          const int32_t in_h = h - cmsis_pre_pad.h;
+          const int32_t in_w = w - cmsis_pre_pad.w;
+          const int32_t in_c = c - cmsis_pre_pad.c;
+          if (in_n >= 0 && in_n < input_dims.n && in_h >= 0 &&
+              in_h < input_dims.h && in_w >= 0 && in_w < input_dims.w &&
+              in_c >= 0 && in_c < input_dims.c) {
+            const int32_t in_idx =
+                ((in_n * input_dims.h + in_h) * input_dims.w + in_w) *
+                    input_dims.c +
+                in_c;
+            output_data[out_idx] = input_data[in_idx];
+          } else {
+            output_data[out_idx] = pad_byte;
+          }
+        }
+      }
+    }
   }
 
   return out;
diff --git a/backends/cortex_m/ops/op_quantized_add.cpp b/backends/cortex_m/ops/op_quantized_add.cpp
index 2cab7dc37fb..4eb6812bc7c 100644
--- a/backends/cortex_m/ops/op_quantized_add.cpp
+++ b/backends/cortex_m/ops/op_quantized_add.cpp
@@ -9,11 +9,6 @@
 
 #include "cortex_m_ops_common.h"
 
-// Include CMSIS-NN headers with C linkage
-extern "C" {
-#include "arm_nnfunctions.h"
-}
-
 namespace cortex_m {
 namespace native {
 using KernelRuntimeContext = torch::executor::KernelRuntimeContext;
@@ -39,21 +34,16 @@ Tensor& quantized_add_out(
       input2_int8,
       out,
       ScalarType::Char,
-      /*require_channels_last=*/channel_broadcast,
       /*require_same_sizes=*/!channel_broadcast);
 
   // Validate quantization parameters
   validate_quantization_params(
-      input1_zero_point,
       input1_multiplier,
       input1_shift,
-      input2_zero_point,
       input2_multiplier,
       input2_shift,
-      output_zero_point,
       output_multiplier,
-      output_shift,
-      out);
+      output_shift);
 
   ET_LOG(
       Debug,
diff --git a/backends/cortex_m/ops/op_quantized_avg_pool2d.cpp b/backends/cortex_m/ops/op_quantized_avg_pool2d.cpp
index ad77bb54aff..293c6ea6957 100644
--- a/backends/cortex_m/ops/op_quantized_avg_pool2d.cpp
+++ b/backends/cortex_m/ops/op_quantized_avg_pool2d.cpp
@@ -7,10 +7,6 @@
 
 #include "cortex_m_ops_common.h"
 
-extern "C" {
-#include "arm_nnfunctions.h"
-}
-
 namespace cortex_m {
 namespace native {
 
diff --git a/backends/cortex_m/ops/op_quantized_conv2d.cpp b/backends/cortex_m/ops/op_quantized_conv2d.cpp
index 3eae9507ba7..13ff3c0c7a0 100644
--- a/backends/cortex_m/ops/op_quantized_conv2d.cpp
+++ b/backends/cortex_m/ops/op_quantized_conv2d.cpp
@@ -7,10 +7,6 @@
 
 #include "cortex_m_ops_common.h"
 
-extern "C" {
-#include "arm_nnfunctions.h"
-}
-
 namespace cortex_m {
 namespace native {
 
@@ -39,10 +35,6 @@ bool validate_conv2d_arguments(
 
   // Check for channels_last dim_order (NHWC: 0, 2, 3, 1)
   // Skip check if channels == 1, as dim_order is ambiguous in that case
-  constexpr executorch::aten::DimOrderType kChannelsLastDimOrder[] = {
-      0, 2, 3, 1};
-  executorch::aten::ArrayRef<executorch::aten::DimOrderType>
-      channels_last_order(kChannelsLastDimOrder, 4);
 
   if (input.size(1) > 1 && !is_channels_last_tensor(input)) {
     ET_LOG(
diff --git a/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp b/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp
index b3cf926c2e1..8dec61e0af1 100644
--- a/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp
+++ b/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp
@@ -7,10 +7,6 @@
 
 #include "cortex_m_ops_common.h"
 
-extern "C" {
-#include "arm_nnfunctions.h"
-}
-
 namespace cortex_m {
 namespace native {
 
diff --git a/backends/cortex_m/ops/op_quantized_linear.cpp b/backends/cortex_m/ops/op_quantized_linear.cpp
index f04b65fa1fb..5d018cbc0c4 100644
--- a/backends/cortex_m/ops/op_quantized_linear.cpp
+++ b/backends/cortex_m/ops/op_quantized_linear.cpp
@@ -9,10 +9,6 @@
 
 #include "cortex_m_ops_common.h"
 
-extern "C" {
-#include "arm_nnfunctions.h"
-}
-
 namespace cortex_m {
 namespace native {
 using KernelRuntimeContext = torch::executor::KernelRuntimeContext;
diff --git a/backends/cortex_m/ops/op_quantized_max_pool2d.cpp b/backends/cortex_m/ops/op_quantized_max_pool2d.cpp
index 470a7ae791e..181a29c1b65 100644
--- a/backends/cortex_m/ops/op_quantized_max_pool2d.cpp
+++ b/backends/cortex_m/ops/op_quantized_max_pool2d.cpp
@@ -7,10 +7,6 @@
 
 #include "cortex_m_ops_common.h"
 
-extern "C" {
-#include "arm_nnfunctions.h"
-}
-
 namespace cortex_m {
 namespace native {
 
diff --git a/backends/cortex_m/ops/op_quantized_mul.cpp b/backends/cortex_m/ops/op_quantized_mul.cpp
index 3d9d6ab54a4..5d59a6fe52a 100644
--- a/backends/cortex_m/ops/op_quantized_mul.cpp
+++ b/backends/cortex_m/ops/op_quantized_mul.cpp
@@ -7,11 +7,6 @@
 
 #include "cortex_m_ops_common.h"
 
-// Include CMSIS-NN headers with C linkage
-extern "C" {
-#include "arm_nnfunctions.h"
-}
-
 namespace cortex_m {
 namespace native {
 namespace {
@@ -41,22 +36,17 @@ Tensor& quantized_mul_out(
       input2_int8,
       out,
       ScalarType::Char,
-      /*require_channels_last=*/channel_broadcast,
       /*require_same_sizes=*/!channel_broadcast);
 
   const int32_t kIdentityMultiplier(/*value=*/1);
   const int32_t kZeroShift(/*value=*/0);
   validate_quantization_params(
-      input1_zero_point,
       kIdentityMultiplier,
       kZeroShift,
-      input2_zero_point,
       kIdentityMultiplier,
       kZeroShift,
-      output_zero_point,
       output_multiplier,
-      output_shift,
-      out);
+      output_shift);
 
   // Extract quantization parameters
   int8_t* input1_ptr = input1_int8.data_ptr<int8_t>();
diff --git a/backends/cortex_m/ops/op_quantized_transpose_conv2d.cpp b/backends/cortex_m/ops/op_quantized_transpose_conv2d.cpp
index 7126a2b2cf7..71f3f35ec67 100644
--- a/backends/cortex_m/ops/op_quantized_transpose_conv2d.cpp
+++ b/backends/cortex_m/ops/op_quantized_transpose_conv2d.cpp
@@ -8,10 +8,6 @@
 
 #include "cortex_m_ops_common.h"
 
-extern "C" {
-#include "arm_nnfunctions.h"
-}
-
 namespace cortex_m {
 namespace native {
 
@@ -113,6 +109,10 @@ Tensor& quantized_transpose_conv2d_out(
     return out;
   }
 
+  ET_CHECK_MSG(
+      output_padding[0] == 0 && output_padding[1] == 0,
+      "quantized_transpose_conv2d: non-zero output_padding is not supported");
+
   const int32_t batch = static_cast<int32_t>(input.size(0));
   const int32_t input_channels = static_cast<int32_t>(input.size(1));
   const int32_t input_height = static_cast<int32_t>(input.size(2));
@@ -137,43 +137,27 @@ Tensor& quantized_transpose_conv2d_out(
     return out;
   }
 
+  ET_CHECK_MSG(
+      weight.size(3) == input_channels,
+      "quantized_transpose_conv2d: weight input channels (%d) must match input channels (%d)",
+      static_cast<int>(weight.size(3)),
+      static_cast<int>(input_channels));
+
   const int32_t input_offset_val = static_cast<int32_t>(input_offset);
   const int32_t output_offset_val = static_cast<int32_t>(output_offset);
   const int32_t activation_min_val = static_cast<int32_t>(activation_min);
   const int32_t activation_max_val = static_cast<int32_t>(activation_max);
 
-  const cmsis_nn_dims input_dims{
-      batch, input_height, input_width, input_channels};
-  const cmsis_nn_dims filter_dims{
-      kernel_output_channels,
-      kernel_height,
-      kernel_width,
-      kernel_input_channels};
-  const cmsis_nn_dims output_dims{
-      batch, output_height, output_width, output_channels};
-  const cmsis_nn_dims bias_dims{1, 1, 1, output_channels};
-
-  // Setup transposed convolution parameters
-  cmsis_nn_transpose_conv_params transpose_conv_params;
-  transpose_conv_params.input_offset = input_offset_val;
-  transpose_conv_params.output_offset = output_offset_val;
-  transpose_conv_params.stride.h = static_cast<const int32_t>(stride[0]);
-  transpose_conv_params.stride.w = static_cast<const int32_t>(stride[1]);
-  transpose_conv_params.padding.h = static_cast<const int32_t>(padding[0]);
-  transpose_conv_params.padding.w = static_cast<const int32_t>(padding[1]);
-  // padding_offsets corresponds to output_padding in PyTorch
-  transpose_conv_params.padding_offsets.h =
-      static_cast<const int32_t>(output_padding[0]);
-  transpose_conv_params.padding_offsets.w =
-      static_cast<const int32_t>(output_padding[1]);
-  transpose_conv_params.dilation.h = static_cast<const int32_t>(dilation[0]);
-  transpose_conv_params.dilation.w = static_cast<const int32_t>(dilation[1]);
-  transpose_conv_params.activation.min = activation_min_val;
-  transpose_conv_params.activation.max = activation_max_val;
-
-  cmsis_nn_per_channel_quant_params quant_params;
-  quant_params.multiplier = requantize_multipliers.data_ptr<int32_t>();
-  quant_params.shift = requantize_shifts.data_ptr<int32_t>();
+  const int32_t stride_h = static_cast<int32_t>(stride[0]);
+  const int32_t stride_w = static_cast<int32_t>(stride[1]);
+  const int32_t pad_h = static_cast<int32_t>(padding[0]);
+  const int32_t pad_w = static_cast<int32_t>(padding[1]);
+  const int32_t dil_h = static_cast<int32_t>(dilation[0]);
+  const int32_t dil_w = static_cast<int32_t>(dilation[1]);
+
+  const int32_t* multiplier_data =
+      requantize_multipliers.const_data_ptr<int32_t>();
+  const int32_t* shift_data = requantize_shifts.const_data_ptr<int32_t>();
 
   const int8_t* input_data = input.const_data_ptr<int8_t>();
   const int8_t* weight_data = weight.const_data_ptr<int8_t>();
@@ -181,67 +165,83 @@ Tensor& quantized_transpose_conv2d_out(
   const int32_t* bias_data =
       bias.has_value() ? bias.value().const_data_ptr<int32_t>() : nullptr;
 
-  cmsis_nn_context cmsis_context;
-  cmsis_context.buf = nullptr;
-  cmsis_context.size = 0;
-
-  cmsis_nn_context output_context;
-  output_context.buf = nullptr;
-  output_context.size = 0;
-
-  const int32_t buffer_bytes = arm_transpose_conv_s8_get_buffer_size(
-      &transpose_conv_params, &input_dims, &filter_dims, &output_dims);
-  auto buffer_or_error = context.allocate_temp(
-      static_cast<size_t>(buffer_bytes), kCortexMMveAlignment);
-  if (!buffer_or_error.ok()) {
-    ET_LOG(
-        Error,
-        "quantized_transpose_conv2d_out: failed to allocate scratch buffer (%d bytes, error %d)",
-        buffer_bytes,
-        static_cast<int>(buffer_or_error.error()));
-    context.fail(buffer_or_error.error());
-    return out;
-  }
-  cmsis_context.buf = buffer_or_error.get();
-  cmsis_context.size = buffer_bytes;
-
-  const int32_t output_buffer_bytes =
-      arm_transpose_conv_s8_get_reverse_conv_buffer_size(
-          &transpose_conv_params, &input_dims, &filter_dims);
-  auto output_buffer_or_error = context.allocate_temp(
-      static_cast<size_t>(output_buffer_bytes), kCortexMMveAlignment);
-  if (!output_buffer_or_error.ok()) {
-    ET_LOG(
-        Error,
-        "quantized_transpose_conv2d_out: failed to allocate output scratch buffer (%d bytes, error %d)",
-        output_buffer_bytes,
-        static_cast<int>(output_buffer_or_error.error()));
-    context.fail(output_buffer_or_error.error());
-    return out;
-  }
-  output_context.buf = output_buffer_or_error.get();
-  output_context.size = output_buffer_bytes;
-
-  const arm_cmsis_nn_status status = arm_transpose_conv_wrapper_s8(
-      &cmsis_context,
-      &output_context,
-      &transpose_conv_params,
-      &quant_params,
-      &input_dims,
-      input_data,
-      &filter_dims,
-      weight_data,
-      &bias_dims,
-      bias_data,
-      &output_dims,
-      output_data);
-
-  if (status != ARM_CMSIS_NN_SUCCESS) {
-    ET_LOG(
-        Error,
-        "quantized_transpose_conv2d_out: arm_transpose_conv_wrapper_s8 failed with status %d",
-        status);
-    context.fail(Error::Internal);
+  // Reference transposed conv (output-centric, channels-last NHWC layout).
+  // Weight layout: [output_channels, kernel_h, kernel_w, input_channels] (OHWI).
+  // For each output position (n, oh, ow, oc), the contributing input positions
+  // satisfy: ih = (oh - kh*dil_h + pad_h) / stride_h (must be a non-negative
+  // integer within input bounds), and similarly for iw.
+  for (int32_t n = 0; n < batch; ++n) {
+    for (int32_t oh = 0; oh < output_height; ++oh) {
+      for (int32_t ow = 0; ow < output_width; ++ow) {
+        for (int32_t oc = 0; oc < output_channels; ++oc) {
+          int32_t acc = bias_data != nullptr ? bias_data[oc] : 0;
+
+          for (int32_t kh = 0; kh < kernel_height; ++kh) {
+            const int32_t ih_raw = oh - kh * dil_h + pad_h;
+            if (ih_raw < 0 || ih_raw % stride_h != 0) {
+              continue;
+            }
+            const int32_t ih = ih_raw / stride_h;
+            if (ih >= input_height) {
+              continue;
+            }
+
+            for (int32_t kw = 0; kw < kernel_width; ++kw) {
+              const int32_t iw_raw = ow - kw * dil_w + pad_w;
+              if (iw_raw < 0 || iw_raw % stride_w != 0) {
+                continue;
+              }
+              const int32_t iw = iw_raw / stride_w;
+              if (iw >= input_width) {
+                continue;
+              }
+
+              for (int32_t ic = 0; ic < input_channels; ++ic) {
+                const int64_t in_idx =
+                    ((static_cast<int64_t>(n) * input_height + ih) *
+                         input_width +
+                     iw) *
+                        input_channels +
+                    ic;
+                const int64_t w_idx =
+                    ((static_cast<int64_t>(oc) * kernel_height + kh) *
+                         kernel_width +
+                     kw) *
+                        input_channels +
+                    ic;
+                acc += (static_cast<int32_t>(input_data[in_idx]) +
+                        input_offset_val) *
+                       static_cast<int32_t>(weight_data[w_idx]);
+              }
+            }
+          }
+
+          // Per-channel requantization: result = round(acc * multiplier / 2^(31-shift))
+          const int32_t mul = multiplier_data[oc];
+          const int32_t right_shift = 31 - shift_data[oc];
+          const int64_t acc64 = static_cast<int64_t>(acc) * mul;
+          int32_t result;
+          if (right_shift > 0) {
+            result = static_cast<int32_t>(
+                (acc64 + (1LL << (right_shift - 1))) >> right_shift);
+          } else {
+            // Multiply instead: `<<` on a negative value is UB pre-C++20.
+            result = static_cast<int32_t>(acc64 * (1LL << (-right_shift)));
+          }
+
+          result += output_offset_val;
+          result = result < activation_min_val ? activation_min_val : result;
+          result = result > activation_max_val ? activation_max_val : result;
+          const int64_t out_idx =
+              ((static_cast<int64_t>(n) * output_height + oh) *
+                   output_width +
+               ow) *
+                  output_channels +
+              oc;
+          output_data[out_idx] = static_cast<int8_t>(result);
+        }
+      }
+    }
   }
 
   return out;
diff --git a/backends/cortex_m/ops/op_softmax.cpp b/backends/cortex_m/ops/op_softmax.cpp
index a2b8f27fac1..a72750f4049 100644
--- a/backends/cortex_m/ops/op_softmax.cpp
+++ b/backends/cortex_m/ops/op_softmax.cpp
@@ -11,11 +11,6 @@
 #include <cstdint>
 #include <limits>
 
-// Include CMSIS-NN headers with C linkage
-extern "C" {
-#include "arm_nnfunctions.h"
-}
-
 namespace cortex_m {
 namespace native {
 
@@ -77,10 +72,7 @@ Tensor& softmax_out(
   const int32_t diff_min_val = static_cast<int32_t>(diff_min);
 
   validate_single_quant_params(
-      static_cast<int32_t>(input_zero_point),
-      input_multiplier_val,
-      input_shift_val,
-      "softmax input");
+      input_multiplier_val, input_shift_val, "softmax input");
 
   const auto positive_dim = normalize_dim(input, dim);
   const int64_t row_size64 = input.size(positive_dim);
diff --git a/backends/cortex_m/ops/op_transpose.cpp b/backends/cortex_m/ops/op_transpose.cpp
index 25458435a3c..074deab9783 100644
--- a/backends/cortex_m/ops/op_transpose.cpp
+++ b/backends/cortex_m/ops/op_transpose.cpp
@@ -9,12 +9,6 @@
 
 #include <array>
 #include <limits>
-#include <vector>
-
-// Include CMSIS-NN headers with C linkage
-extern "C" {
-#include "arm_nnfunctions.h"
-}
 
 namespace cortex_m {
 namespace native {
@@ -83,38 +77,45 @@ Tensor& transpose_out(
     output_dims_arr[i] = static_cast<int32_t>(out_size);
   }
 
-  cmsis_nn_dims input_dims = {
-      input_dims_arr[0],
-      input_dims_arr[1],
-      input_dims_arr[2],
-      input_dims_arr[3]};
-  cmsis_nn_dims output_dims = {
-      output_dims_arr[0],
-      output_dims_arr[1],
-      output_dims_arr[2],
-      output_dims_arr[3]};
+  // Compute row-major strides for input and output
+  std::array<int64_t, kMaxSupportedDims> input_strides{1, 1, 1, 1};
+  for (int i = static_cast<int>(kMaxSupportedDims) - 2; i >= 0; --i) {
+    input_strides[i] = input_strides[i + 1] * input_dims_arr[i + 1];
+  }
+  std::array<int64_t, kMaxSupportedDims> output_strides{1, 1, 1, 1};
+  for (int i = static_cast<int>(kMaxSupportedDims) - 2; i >= 0; --i) {
+    output_strides[i] = output_strides[i + 1] * output_dims_arr[i + 1];
+  }
 
   std::array<uint32_t, kMaxSupportedDims> perm_buffer{0, 1, 2, 3};
   for (size_t i = 0; i < rank; ++i) {
     perm_buffer[i] = static_cast<uint32_t>(perm[i]);
   }
 
-  const cmsis_nn_transpose_params transpose_params{
-      static_cast<int32_t>(rank), perm_buffer.data()};
-
   const int8_t* input_data = input.const_data_ptr<int8_t>();
   int8_t* output_data = out.mutable_data_ptr<int8_t>();
 
-  const arm_cmsis_nn_status status = arm_transpose_s8(
-      input_data, output_data, &input_dims, &output_dims, &transpose_params);
-
-  if (status != ARM_CMSIS_NN_SUCCESS) {
-    ET_LOG(
-        Error,
-        "transpose_out: arm_transpose_s8 failed with status [%d]",
-        static_cast<int>(status));
-    context.fail(Error::Internal);
-    return out;
+  for (int32_t i0 = 0; i0 < output_dims_arr[0]; ++i0) {
+    for (int32_t i1 = 0; i1 < output_dims_arr[1]; ++i1) {
+      for (int32_t i2 = 0; i2 < output_dims_arr[2]; ++i2) {
+        for (int32_t i3 = 0; i3 < output_dims_arr[3]; ++i3) {
+          const std::array<int32_t, kMaxSupportedDims> out_idx{i0, i1, i2, i3};
+          std::array<int32_t, kMaxSupportedDims> in_idx{0, 0, 0, 0};
+          for (size_t k = 0; k < kMaxSupportedDims; ++k) {
+            in_idx[perm_buffer[k]] = out_idx[k];
+          }
+          const int64_t in_offset = in_idx[0] * input_strides[0] +
+                                     in_idx[1] * input_strides[1] +
+                                     in_idx[2] * input_strides[2] +
+                                     in_idx[3] * input_strides[3];
+          const int64_t out_offset = i0 * output_strides[0] +
+                                      i1 * output_strides[1] +
+                                      i2 * output_strides[2] +
+                                      i3 * output_strides[3];
+          output_data[out_offset] = input_data[in_offset];
+        }
+      }
+    }
   }
 
   return out;