From e529b289a1fa69c91e0783006aebba98ce29cb2b Mon Sep 17 00:00:00 2001
From: Nitin Jain <jainnitin@fb.com>
Date: Tue, 10 Mar 2026 12:11:58 -0700
Subject: [PATCH] Remove extern "C" wrapping and fix format specifiers for ARM
 embedded builds
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
Remove redundant `extern "C"` blocks wrapping CMSIS-NN header includes and fix `-Wformat` errors in format specifiers. These changes are required for the CC pipeline's FVP benchmark runner to compile on ARM embedded targets (Cortex-M55/M85 with MVE).

## Context
The `extern "C"` wrapping around CMSIS-NN headers causes ARM embedded builds targeting MVE-capable processors to fail. CMSIS-NN's `arm_nn_math_types.h` temporarily closes its inner `extern "C"` before including `arm_mve.h`, but the outer `extern "C"` from the op files remains active, forcing `arm_mve.h` into C linkage where C++ function overloading is illegal.

Additionally, format specifiers that do not match their argument types (`%hhd` for `ScalarType`, `%d`/`%ld` for `int64_t`) trigger `-Wformat` diagnostics that fail the build on ARM toolchains.

Changes:
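1. Removed the `extern "C"`-wrapped `arm_nnfunctions.h` include from all 13 op .cpp files, and dropped the `extern "C"` wrapper around the CMSIS-NN includes in `cmsis_scratch_buffer_context.h` and `cortex_m_ops_common.h`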
2. Consolidated CMSIS-NN includes in `cortex_m_ops_common.h` — added `#include "arm_nnfunctions.h"` (without `extern "C"`) so op files get it transitively
3. Added `#include <cinttypes>` for `PRIi64` macro
4. Fixed `%hhd` → `%d` with `static_cast<int>` for `ScalarType` values
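5. Fixed `%d`/`%ld` → `PRIi64` for `int64_t` values
6. Added `(void)zero_point;` in `validate_single_quant_params` to mark `zero_point` as intentionally unused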

Differential Revision: D95739935
---
 .../ops/cmsis_scratch_buffer_context.h        |  4 +--
 backends/cortex_m/ops/cortex_m_ops_common.h   | 33 ++++++++++---------
 backends/cortex_m/ops/op_maximum.cpp          |  5 ---
 backends/cortex_m/ops/op_minimum.cpp          |  5 ---
 backends/cortex_m/ops/op_pad.cpp              |  4 ---
 backends/cortex_m/ops/op_quantized_add.cpp    |  5 ---
 .../cortex_m/ops/op_quantized_avg_pool2d.cpp  |  4 ---
 backends/cortex_m/ops/op_quantized_conv2d.cpp |  4 ---
 .../ops/op_quantized_depthwise_conv2d.cpp     |  4 ---
 backends/cortex_m/ops/op_quantized_linear.cpp |  4 ---
 .../cortex_m/ops/op_quantized_max_pool2d.cpp  |  4 ---
 backends/cortex_m/ops/op_quantized_mul.cpp    |  5 ---
 .../ops/op_quantized_transpose_conv2d.cpp     |  4 ---
 backends/cortex_m/ops/op_softmax.cpp          |  5 ---
 backends/cortex_m/ops/op_transpose.cpp        |  5 ---
 15 files changed, 18 insertions(+), 77 deletions(-)
diff --git a/backends/cortex_m/ops/cmsis_scratch_buffer_context.h b/backends/cortex_m/ops/cmsis_scratch_buffer_context.h
index 4b9fdaebdf7..4672f05e777 100644
--- a/backends/cortex_m/ops/cmsis_scratch_buffer_context.h
+++ b/backends/cortex_m/ops/cmsis_scratch_buffer_context.h
@@ -7,10 +7,8 @@
  */
 #pragma once
 
-#include "cortex_m_ops_common.h"
-extern "C" {
 #include "arm_nnfunctions.h"
-}
+#include "cortex_m_ops_common.h"
 
 namespace cortex_m {
 namespace native {
diff --git a/backends/cortex_m/ops/cortex_m_ops_common.h b/backends/cortex_m/ops/cortex_m_ops_common.h
index 1b31367881f..4c0f83d6eb6 100644
--- a/backends/cortex_m/ops/cortex_m_ops_common.h
+++ b/backends/cortex_m/ops/cortex_m_ops_common.h
@@ -16,12 +16,12 @@
 #include <executorch/kernels/portable/cpu/util/kernel_ops_util.h>
 #include <executorch/runtime/platform/assert.h>
 
+#include <cinttypes>
 #include <limits>
 #include <optional>
 
-extern "C" {
 #include "arm_nn_types.h"
-}
+#include "arm_nnfunctions.h"
 
 using Tensor = torch::executor::Tensor;
 using ScalarType = executorch::aten::ScalarType;
@@ -47,19 +47,19 @@ inline void validate_cmsis_nn_tensor_requirements(
   // Basic dtype validation
   ET_CHECK_MSG(
       input1.scalar_type() == expected_dtype,
-      "Input1 dtype must be %hhd, got %hhd",
-      expected_dtype,
-      input1.scalar_type());
+      "Input1 dtype must be %d, got %d",
+      static_cast<int>(expected_dtype),
+      static_cast<int>(input1.scalar_type()));
   ET_CHECK_MSG(
       input2.scalar_type() == expected_dtype,
-      "Input2 dtype must be %hhd, got %hhd",
-      expected_dtype,
-      input2.scalar_type());
+      "Input2 dtype must be %d, got %d",
+      static_cast<int>(expected_dtype),
+      static_cast<int>(input2.scalar_type()));
   ET_CHECK_MSG(
       output.scalar_type() == expected_dtype,
-      "Output dtype must be %hhd, got %hhd",
-      expected_dtype,
-      output.scalar_type());
+      "Output dtype must be %d, got %d",
+      static_cast<int>(expected_dtype),
+      static_cast<int>(output.scalar_type()));
   if (require_same_sizes) {
     ET_CHECK_MSG(
         input1.sizes() == input2.sizes(),
@@ -78,16 +78,17 @@ inline void validate_single_quant_params(
     const int64_t multiplier,
     const int64_t shift,
     const char* param_name) {
+  (void)zero_point;
   ET_CHECK_MSG(
       multiplier >= std::numeric_limits<int32_t>::min() &&
           multiplier <= std::numeric_limits<int32_t>::max(),
-      "%s multiplier must be in int32 range [Value: %d]",
+      "%s multiplier must be in int32 range [Value: %" PRIi64 "]",
       param_name,
       multiplier);
 
   ET_CHECK_MSG(
       shift >= -31 && shift <= 31,
-      "%s shift must be in range [-31, 31] [Value: %d]",
+      "%s shift must be in range [-31, 31] [Value: %" PRIi64 "]",
       param_name,
       shift);
 }
@@ -172,7 +173,7 @@ inline bool check_int32_within_range(
       value > std::numeric_limits<int32_t>::max()) {
     ET_LOG(
         Error,
-        "%s: %s value (%ld) exceeds int32_t range",
+        "%s: %s value (%" PRIi64 ") exceeds int32_t range",
         op_name,
         value_name,
         value);
@@ -354,14 +355,14 @@ inline bool validate_per_channel_quant_params(
     if (multipliers[i] <= ARM_NN_Q31_MIN || multipliers[i] > ARM_NN_Q31_MAX) {
       ET_LOG(
           Error,
-          "weight_multiplier[%d] out of CMSIS-NN range: %d",
+          "weight_multiplier[%d] out of CMSIS-NN range: %" PRIi64,
           i,
           multipliers[i]);
       return false;
     }
     // Shift: {-31, 30} for arm_nn_requantize
     if (shifts[i] < -31 || shifts[i] > 30) {
-      ET_LOG(Error, "weight_shift[%d] out of range: %d", i, shifts[i]);
+      ET_LOG(Error, "weight_shift[%d] out of range: %" PRIi64, i, shifts[i]);
       return false;
     }
   }
diff --git a/backends/cortex_m/ops/op_maximum.cpp b/backends/cortex_m/ops/op_maximum.cpp
index 71a907f12ea..fc76f5c8c48 100644
--- a/backends/cortex_m/ops/op_maximum.cpp
+++ b/backends/cortex_m/ops/op_maximum.cpp
@@ -7,11 +7,6 @@
 
 #include "cortex_m_ops_common.h"
 
-// Include CMSIS-NN headers with C linkage
-extern "C" {
-#include "arm_nnfunctions.h"
-}
-
 namespace cortex_m {
 namespace native {
 
diff --git a/backends/cortex_m/ops/op_minimum.cpp b/backends/cortex_m/ops/op_minimum.cpp
index f220aa2664b..5a75cb8a1dc 100644
--- a/backends/cortex_m/ops/op_minimum.cpp
+++ b/backends/cortex_m/ops/op_minimum.cpp
@@ -9,11 +9,6 @@
 
 #include "cortex_m_ops_common.h"
 
-// Include CMSIS-NN headers with C linkage
-extern "C" {
-#include "arm_nnfunctions.h"
-}
-
 namespace cortex_m {
 namespace native {
 
diff --git a/backends/cortex_m/ops/op_pad.cpp b/backends/cortex_m/ops/op_pad.cpp
index 739c584c419..b400f4c7e19 100644
--- a/backends/cortex_m/ops/op_pad.cpp
+++ b/backends/cortex_m/ops/op_pad.cpp
@@ -8,10 +8,6 @@
 
 #include "cortex_m_ops_common.h"
 
-extern "C" {
-#include "arm_nnfunctions.h"
-}
-
 namespace cortex_m {
 namespace native {
 
diff --git a/backends/cortex_m/ops/op_quantized_add.cpp b/backends/cortex_m/ops/op_quantized_add.cpp
index 2cab7dc37fb..b4bbfdaffce 100644
--- a/backends/cortex_m/ops/op_quantized_add.cpp
+++ b/backends/cortex_m/ops/op_quantized_add.cpp
@@ -9,11 +9,6 @@
 
 #include "cortex_m_ops_common.h"
 
-// Include CMSIS-NN headers with C linkage
-extern "C" {
-#include "arm_nnfunctions.h"
-}
-
 namespace cortex_m {
 namespace native {
 using KernelRuntimeContext = torch::executor::KernelRuntimeContext;
diff --git a/backends/cortex_m/ops/op_quantized_avg_pool2d.cpp b/backends/cortex_m/ops/op_quantized_avg_pool2d.cpp
index ad77bb54aff..293c6ea6957 100644
--- a/backends/cortex_m/ops/op_quantized_avg_pool2d.cpp
+++ b/backends/cortex_m/ops/op_quantized_avg_pool2d.cpp
@@ -7,10 +7,6 @@
 
 #include "cortex_m_ops_common.h"
 
-extern "C" {
-#include "arm_nnfunctions.h"
-}
-
 namespace cortex_m {
 namespace native {
 
diff --git a/backends/cortex_m/ops/op_quantized_conv2d.cpp b/backends/cortex_m/ops/op_quantized_conv2d.cpp
index 3eae9507ba7..0fa6a3f8536 100644
--- a/backends/cortex_m/ops/op_quantized_conv2d.cpp
+++ b/backends/cortex_m/ops/op_quantized_conv2d.cpp
@@ -7,10 +7,6 @@
 
 #include "cortex_m_ops_common.h"
 
-extern "C" {
-#include "arm_nnfunctions.h"
-}
-
 namespace cortex_m {
 namespace native {
 
diff --git a/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp b/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp
index b3cf926c2e1..8dec61e0af1 100644
--- a/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp
+++ b/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp
@@ -7,10 +7,6 @@
 
 #include "cortex_m_ops_common.h"
 
-extern "C" {
-#include "arm_nnfunctions.h"
-}
-
 namespace cortex_m {
 namespace native {
 
diff --git a/backends/cortex_m/ops/op_quantized_linear.cpp b/backends/cortex_m/ops/op_quantized_linear.cpp
index f04b65fa1fb..5d018cbc0c4 100644
--- a/backends/cortex_m/ops/op_quantized_linear.cpp
+++ b/backends/cortex_m/ops/op_quantized_linear.cpp
@@ -9,10 +9,6 @@
 
 #include "cortex_m_ops_common.h"
 
-extern "C" {
-#include "arm_nnfunctions.h"
-}
-
 namespace cortex_m {
 namespace native {
 using KernelRuntimeContext = torch::executor::KernelRuntimeContext;
diff --git a/backends/cortex_m/ops/op_quantized_max_pool2d.cpp b/backends/cortex_m/ops/op_quantized_max_pool2d.cpp
index 470a7ae791e..181a29c1b65 100644
--- a/backends/cortex_m/ops/op_quantized_max_pool2d.cpp
+++ b/backends/cortex_m/ops/op_quantized_max_pool2d.cpp
@@ -7,10 +7,6 @@
 
 #include "cortex_m_ops_common.h"
 
-extern "C" {
-#include "arm_nnfunctions.h"
-}
-
 namespace cortex_m {
 namespace native {
 
diff --git a/backends/cortex_m/ops/op_quantized_mul.cpp b/backends/cortex_m/ops/op_quantized_mul.cpp
index 3d9d6ab54a4..524e74a6b9f 100644
--- a/backends/cortex_m/ops/op_quantized_mul.cpp
+++ b/backends/cortex_m/ops/op_quantized_mul.cpp
@@ -7,11 +7,6 @@
 
 #include "cortex_m_ops_common.h"
 
-// Include CMSIS-NN headers with C linkage
-extern "C" {
-#include "arm_nnfunctions.h"
-}
-
 namespace cortex_m {
 namespace native {
 namespace {
diff --git a/backends/cortex_m/ops/op_quantized_transpose_conv2d.cpp b/backends/cortex_m/ops/op_quantized_transpose_conv2d.cpp
index 7126a2b2cf7..e3f6135c7b9 100644
--- a/backends/cortex_m/ops/op_quantized_transpose_conv2d.cpp
+++ b/backends/cortex_m/ops/op_quantized_transpose_conv2d.cpp
@@ -8,10 +8,6 @@
 
 #include "cortex_m_ops_common.h"
 
-extern "C" {
-#include "arm_nnfunctions.h"
-}
-
 namespace cortex_m {
 namespace native {
 
diff --git a/backends/cortex_m/ops/op_softmax.cpp b/backends/cortex_m/ops/op_softmax.cpp
index a2b8f27fac1..c07a538db84 100644
--- a/backends/cortex_m/ops/op_softmax.cpp
+++ b/backends/cortex_m/ops/op_softmax.cpp
@@ -11,11 +11,6 @@
 #include <cstdint>
 #include <limits>
 
-// Include CMSIS-NN headers with C linkage
-extern "C" {
-#include "arm_nnfunctions.h"
-}
-
 namespace cortex_m {
 namespace native {
 
diff --git a/backends/cortex_m/ops/op_transpose.cpp b/backends/cortex_m/ops/op_transpose.cpp
index 25458435a3c..7fcbc034283 100644
--- a/backends/cortex_m/ops/op_transpose.cpp
+++ b/backends/cortex_m/ops/op_transpose.cpp
@@ -11,11 +11,6 @@
 #include <limits>
 #include <vector>
 
-// Include CMSIS-NN headers with C linkage
-extern "C" {
-#include "arm_nnfunctions.h"
-}
-
 namespace cortex_m {
 namespace native {