diff --git a/backends/cadence/fusion_g3/operators/op_dequantize.cpp b/backends/cadence/fusion_g3/operators/op_dequantize.cpp
index c940cdcf839..31a023c350e 100644
--- a/backends/cadence/fusion_g3/operators/op_dequantize.cpp
+++ b/backends/cadence/fusion_g3/operators/op_dequantize.cpp
@@ -47,7 +47,7 @@ namespace {
 /**
  * Asserts that the parameters are valid.
  */
-void check_dequantize_per_tensor_args(
+[[maybe_unused]] void check_dequantize_per_tensor_args(
     const Tensor& input,
     int64_t quant_min,
     int64_t quant_max,
@@ -86,6 +86,18 @@ void check_dequantize_per_tensor_args(
       quant_max);
 }
 
+// PDX aligned vector load/store (PDX_LV*) require 16-byte (128-bit) aligned
+// buffers on Fusion G3; an unaligned access raises LoadStoreAlignmentCause. The
+// inline SIMD fast path below is only safe when both buffers are aligned;
+// otherwise the caller falls back to the alignment-safe NNLib kernel.
+[[maybe_unused]] constexpr uintptr_t kSimdAlignmentBytes = 16;
+[[maybe_unused]] inline bool dequant_simd_aligned(
+    const void* in_ptr,
+    const void* out_ptr) {
+  return (reinterpret_cast<uintptr_t>(in_ptr) % kSimdAlignmentBytes == 0) &&
+      (reinterpret_cast<uintptr_t>(out_ptr) % kSimdAlignmentBytes == 0);
+}
+
 } // namespace
 
 /* Local function which calls the kernels based on the input datatype */
@@ -295,10 +307,8 @@ Tensor& dequantize_impl(
               if (zero_point_data != nullptr) {                               \
                 zero_point = zero_point_data[current_ix];                     \
               }                                                               \
-              out_data_ptr[current_ix] =                                      \
-                  static_cast<CTYPE_OUT>(                                     \
-                      input_data_ptr[current_ix] - zero_point) *              \
-                  _scale;                                                     \
+              out_data_ptr[current_ix] = static_cast<CTYPE_OUT>( \
+                  (input_data_ptr[current_ix] - zero_point) * _scale); \
             }                                                                 \
           },                                                                  \
           input,                                                              \
@@ -503,10 +513,8 @@ Tensor& dequantize_impl(
               if (zero_point_data != nullptr) {                               \
                 zero_point = zero_point_data[current_ix];                     \
               }                                                               \
-              out_data_ptr[current_ix] =                                      \
-                  static_cast<CTYPE_OUT>(                                     \
-                      input_data_ptr[current_ix] - zero_point) *              \
-                  _scale;                                                     \
+              out_data_ptr[current_ix] = static_cast<CTYPE_OUT>( \
+                  (input_data_ptr[current_ix] - zero_point) * _scale); \
             }                                                                 \
           },                                                                  \
           input,                                                              \
@@ -579,26 +587,403 @@ Tensor& dequantize_per_tensor_out(
     ScalarType dtype,
     Tensor& out) {
   constexpr ScalarType out_dtype = ScalarType::Float;
-
 #ifdef OP_ARG_CHECK
   torch::executor::Error err = resize_tensor(out, input.sizes());
   ET_CHECK_MSG(
       err == torch::executor::Error::Ok,
       "Failed to resize out Tensor in dequantize_per_tensor_out");
-
   check_dequantize_per_tensor_args(
       input, quant_min, quant_max, dtype, out_dtype, out);
 #endif
-
   float scale_data = (float)scale;
   int zero_point_data = (int)zero_point;
-
+  // Fast path: bypass dequantize_impl dispatch overhead for the most common
+  // per-tensor cases (int8/uint8 -> float32). This avoids the inp_shape copy
+  // loop, is_asym_dequant check, optimized flag check, and the multi-branch
+  // type dispatch in dequantize_impl.
+  if (out.scalar_type() == ScalarType::Float) {
+    float* __restrict__ out_data = out.mutable_data_ptr<float>();
+    // For per-tensor dequantization (axis=NULL), flatten to 1D shape.
+    // This avoids the shape copy loop and lets the NNLIB kernel skip
+    // multi-dimensional stride calculations internally.
+    int inp_shape[1] = {static_cast<int>(input.numel())};
+    constexpr int num_inp_dims = 1;
+    if (input.scalar_type() == ScalarType::Char) {
+      const int8_t* __restrict__ input_data = input.const_data_ptr<int8_t>();
+#if defined(__XTENSA__)
+      // Direct inline PDX SIMD dequantization for per-tensor int8->float32.
+      // 4x-unrolled to improve ILP: the Fusion G3 scheduler can interleave
+      // independent loads, converts, and FP MACs across unrolled iterations,
+      // hiding load-to-use latency and reducing loop overhead.
+      if (dequant_simd_aligned(input_data, out_data)) {
+        const int numel = inp_shape[0];
+        auto vIn = reinterpret_cast<const xb_vecMx8*>(input_data);
+        auto vOut = reinterpret_cast<xb_vecMxf32*>(out_data);
+        const xb_vecMxf32 v_scale{
+            scale_data, scale_data, scale_data, scale_data};
+        int i = 0;
+        if (zero_point_data != 0) {
+          const float zp_f = static_cast<float>(zero_point_data);
+          const xb_vecMxf32 v_zp{zp_f, zp_f, zp_f, zp_f};
+          // 4x unrolled main loop: 16 elements per iteration
+          const int e16 = (numel >> 4) << 4;
+          for (; i < e16; i += 16) {
+            xb_vecMx32 vP0 = PDX_LV32_MX8_I(vIn, 0);
+            xb_vecMx32 vP1 = PDX_LV32_MX8_I(vIn + 1, 0);
+            xb_vecMx32 vP2 = PDX_LV32_MX8_I(vIn + 2, 0);
+            xb_vecMx32 vP3 = PDX_LV32_MX8_I(vIn + 3, 0);
+            xb_vecMxf32 vF0 = (xb_vecMxf32)vP0;
+            xb_vecMxf32 vF1 = (xb_vecMxf32)vP1;
+            xb_vecMxf32 vF2 = (xb_vecMxf32)vP2;
+            xb_vecMxf32 vF3 = (xb_vecMxf32)vP3;
+            vOut[0] = PDX_MUL_MXF32(PDX_SUB_MXF32(vF0, v_zp), v_scale);
+            vOut[1] = PDX_MUL_MXF32(PDX_SUB_MXF32(vF1, v_zp), v_scale);
+            vOut[2] = PDX_MUL_MXF32(PDX_SUB_MXF32(vF2, v_zp), v_scale);
+            vOut[3] = PDX_MUL_MXF32(PDX_SUB_MXF32(vF3, v_zp), v_scale);
+            vIn += 4;
+            vOut += 4;
+          }
+          // Scalar residual for remaining 0-15 elements
+          for (; i < numel; ++i) {
+            out_data[i] =
+                (static_cast<float>(input_data[i]) - zp_f) * scale_data;
+          }
+        } else {
+          // 4x unrolled main loop: 16 elements per iteration
+          const int e16 = (numel >> 4) << 4;
+          for (; i < e16; i += 16) {
+            xb_vecMx32 vP0 = PDX_LV32_MX8_I(vIn, 0);
+            xb_vecMx32 vP1 = PDX_LV32_MX8_I(vIn + 1, 0);
+            xb_vecMx32 vP2 = PDX_LV32_MX8_I(vIn + 2, 0);
+            xb_vecMx32 vP3 = PDX_LV32_MX8_I(vIn + 3, 0);
+            xb_vecMxf32 vF0 = (xb_vecMxf32)vP0;
+            xb_vecMxf32 vF1 = (xb_vecMxf32)vP1;
+            xb_vecMxf32 vF2 = (xb_vecMxf32)vP2;
+            xb_vecMxf32 vF3 = (xb_vecMxf32)vP3;
+            vOut[0] = PDX_MUL_MXF32(vF0, v_scale);
+            vOut[1] = PDX_MUL_MXF32(vF1, v_scale);
+            vOut[2] = PDX_MUL_MXF32(vF2, v_scale);
+            vOut[3] = PDX_MUL_MXF32(vF3, v_scale);
+            vIn += 4;
+            vOut += 4;
+          }
+          // Scalar residual for remaining 0-15 elements
+          for (; i < numel; ++i) {
+            out_data[i] = static_cast<float>(input_data[i]) * scale_data;
+          }
+        }
+        return out;
+      }
+#endif
+      if (zero_point_data != 0) {
+        XT_KERNEL_CHECK(
+            context,
+            out,
+            xa_nn_elm_dequantize_asym8_f32,
+            out_data,
+            input_data,
+            inp_shape,
+            num_inp_dims,
+            NULL,
+            &zero_point_data,
+            &scale_data);
+      } else {
+        XT_KERNEL_CHECK(
+            context,
+            out,
+            xa_nn_elm_dequantize_sym8_f32,
+            out_data,
+            input_data,
+            inp_shape,
+            num_inp_dims,
+            NULL,
+            &scale_data);
+      }
+      return out;
+    } else if (input.scalar_type() == ScalarType::Byte) {
+      const uint8_t* __restrict__ input_data = input.const_data_ptr<uint8_t>();
+#if defined(__XTENSA__)
+      // Direct inline PDX SIMD dequantization for per-tensor uint8->float32.
+      // 4x-unrolled to improve ILP: the Fusion G3 scheduler can interleave
+      // independent loads, converts, and FP MACs across unrolled iterations,
+      // hiding load-to-use latency and reducing loop overhead.
+      if (dequant_simd_aligned(input_data, out_data)) {
+        const int numel = inp_shape[0];
+        auto vIn = reinterpret_cast<const xb_vecMxu8*>(input_data);
+        auto vOut = reinterpret_cast<xb_vecMxf32*>(out_data);
+        const xb_vecMxf32 v_scale{
+            scale_data, scale_data, scale_data, scale_data};
+        int i = 0;
+        if (zero_point_data != 0) {
+          const float zp_f = static_cast<float>(zero_point_data);
+          const xb_vecMxf32 v_zp{zp_f, zp_f, zp_f, zp_f};
+          // 4x unrolled main loop: 16 elements per iteration
+          const int e16 = (numel >> 4) << 4;
+          for (; i < e16; i += 16) {
+            xb_vecMxu32 vP0 = PDX_LVU32_MX8_I(vIn, 0);
+            xb_vecMxu32 vP1 = PDX_LVU32_MX8_I(vIn + 1, 0);
+            xb_vecMxu32 vP2 = PDX_LVU32_MX8_I(vIn + 2, 0);
+            xb_vecMxu32 vP3 = PDX_LVU32_MX8_I(vIn + 3, 0);
+            xb_vecMxf32 vF0 = (xb_vecMxf32)vP0;
+            xb_vecMxf32 vF1 = (xb_vecMxf32)vP1;
+            xb_vecMxf32 vF2 = (xb_vecMxf32)vP2;
+            xb_vecMxf32 vF3 = (xb_vecMxf32)vP3;
+            vOut[0] = PDX_MUL_MXF32(PDX_SUB_MXF32(vF0, v_zp), v_scale);
+            vOut[1] = PDX_MUL_MXF32(PDX_SUB_MXF32(vF1, v_zp), v_scale);
+            vOut[2] = PDX_MUL_MXF32(PDX_SUB_MXF32(vF2, v_zp), v_scale);
+            vOut[3] = PDX_MUL_MXF32(PDX_SUB_MXF32(vF3, v_zp), v_scale);
+            vIn += 4;
+            vOut += 4;
+          }
+          // Scalar residual for remaining 0-15 elements
+          for (; i < numel; ++i) {
+            out_data[i] =
+                (static_cast<float>(input_data[i]) - zp_f) * scale_data;
+          }
+        } else {
+          // 4x unrolled main loop: 16 elements per iteration
+          const int e16 = (numel >> 4) << 4;
+          for (; i < e16; i += 16) {
+            xb_vecMxu32 vP0 = PDX_LVU32_MX8_I(vIn, 0);
+            xb_vecMxu32 vP1 = PDX_LVU32_MX8_I(vIn + 1, 0);
+            xb_vecMxu32 vP2 = PDX_LVU32_MX8_I(vIn + 2, 0);
+            xb_vecMxu32 vP3 = PDX_LVU32_MX8_I(vIn + 3, 0);
+            xb_vecMxf32 vF0 = (xb_vecMxf32)vP0;
+            xb_vecMxf32 vF1 = (xb_vecMxf32)vP1;
+            xb_vecMxf32 vF2 = (xb_vecMxf32)vP2;
+            xb_vecMxf32 vF3 = (xb_vecMxf32)vP3;
+            vOut[0] = PDX_MUL_MXF32(vF0, v_scale);
+            vOut[1] = PDX_MUL_MXF32(vF1, v_scale);
+            vOut[2] = PDX_MUL_MXF32(vF2, v_scale);
+            vOut[3] = PDX_MUL_MXF32(vF3, v_scale);
+            vIn += 4;
+            vOut += 4;
+          }
+          // Scalar residual for remaining 0-15 elements
+          for (; i < numel; ++i) {
+            out_data[i] = static_cast<float>(input_data[i]) * scale_data;
+          }
+        }
+        return out;
+      }
+#endif
+      if (zero_point_data != 0) {
+        XT_KERNEL_CHECK(
+            context,
+            out,
+            xa_nn_elm_dequantize_asym8u_f32,
+            out_data,
+            input_data,
+            inp_shape,
+            num_inp_dims,
+            NULL,
+            &zero_point_data,
+            &scale_data);
+      } else {
+        XT_KERNEL_CHECK(
+            context,
+            out,
+            xa_nn_elm_dequantize_sym8u_f32,
+            out_data,
+            input_data,
+            inp_shape,
+            num_inp_dims,
+            NULL,
+            &scale_data);
+      }
+      return out;
+    } else if (input.scalar_type() == ScalarType::Short) {
+      const int16_t* __restrict__ input_data = input.const_data_ptr<int16_t>();
+#if defined(__XTENSA__)
+      // Direct inline PDX SIMD dequantization for per-tensor int16->float32.
+      // 4x-unrolled to improve ILP: the Fusion G3 scheduler can interleave
+      // independent loads, converts, and FP MACs across unrolled iterations,
+      // hiding load-to-use latency and reducing loop overhead.
+      if (dequant_simd_aligned(input_data, out_data)) {
+        const int numel = inp_shape[0];
+        auto vIn = reinterpret_cast<const xb_vecMx16*>(input_data);
+        auto vOut = reinterpret_cast<xb_vecMxf32*>(out_data);
+        const xb_vecMxf32 v_scale{
+            scale_data, scale_data, scale_data, scale_data};
+        int i = 0;
+        if (zero_point_data != 0) {
+          const float zp_f = static_cast<float>(zero_point_data);
+          const xb_vecMxf32 v_zp{zp_f, zp_f, zp_f, zp_f};
+          // 4x unrolled main loop: 16 elements per iteration
+          const int e16 = (numel >> 4) << 4;
+          for (; i < e16; i += 16) {
+            xb_vecMx32 vP0 = PDX_LV32_MX16_I(vIn, 0);
+            xb_vecMx32 vP1 = PDX_LV32_MX16_I(vIn + 1, 0);
+            xb_vecMx32 vP2 = PDX_LV32_MX16_I(vIn + 2, 0);
+            xb_vecMx32 vP3 = PDX_LV32_MX16_I(vIn + 3, 0);
+            xb_vecMxf32 vF0 = (xb_vecMxf32)vP0;
+            xb_vecMxf32 vF1 = (xb_vecMxf32)vP1;
+            xb_vecMxf32 vF2 = (xb_vecMxf32)vP2;
+            xb_vecMxf32 vF3 = (xb_vecMxf32)vP3;
+            vOut[0] = PDX_MUL_MXF32(PDX_SUB_MXF32(vF0, v_zp), v_scale);
+            vOut[1] = PDX_MUL_MXF32(PDX_SUB_MXF32(vF1, v_zp), v_scale);
+            vOut[2] = PDX_MUL_MXF32(PDX_SUB_MXF32(vF2, v_zp), v_scale);
+            vOut[3] = PDX_MUL_MXF32(PDX_SUB_MXF32(vF3, v_zp), v_scale);
+            vIn += 4;
+            vOut += 4;
+          }
+          // Scalar residual for remaining 0-15 elements
+          for (; i < numel; ++i) {
+            out_data[i] =
+                (static_cast<float>(input_data[i]) - zp_f) * scale_data;
+          }
+        } else {
+          // 4x unrolled main loop: 16 elements per iteration
+          const int e16 = (numel >> 4) << 4;
+          for (; i < e16; i += 16) {
+            xb_vecMx32 vP0 = PDX_LV32_MX16_I(vIn, 0);
+            xb_vecMx32 vP1 = PDX_LV32_MX16_I(vIn + 1, 0);
+            xb_vecMx32 vP2 = PDX_LV32_MX16_I(vIn + 2, 0);
+            xb_vecMx32 vP3 = PDX_LV32_MX16_I(vIn + 3, 0);
+            xb_vecMxf32 vF0 = (xb_vecMxf32)vP0;
+            xb_vecMxf32 vF1 = (xb_vecMxf32)vP1;
+            xb_vecMxf32 vF2 = (xb_vecMxf32)vP2;
+            xb_vecMxf32 vF3 = (xb_vecMxf32)vP3;
+            vOut[0] = PDX_MUL_MXF32(vF0, v_scale);
+            vOut[1] = PDX_MUL_MXF32(vF1, v_scale);
+            vOut[2] = PDX_MUL_MXF32(vF2, v_scale);
+            vOut[3] = PDX_MUL_MXF32(vF3, v_scale);
+            vIn += 4;
+            vOut += 4;
+          }
+          // Scalar residual for remaining 0-15 elements
+          for (; i < numel; ++i) {
+            out_data[i] = static_cast<float>(input_data[i]) * scale_data;
+          }
+        }
+        return out;
+      }
+#endif
+      if (zero_point_data != 0) {
+        XT_KERNEL_CHECK(
+            context,
+            out,
+            xa_nn_elm_dequantize_asym16_f32,
+            out_data,
+            input_data,
+            inp_shape,
+            num_inp_dims,
+            NULL,
+            &zero_point_data,
+            &scale_data);
+      } else {
+        XT_KERNEL_CHECK(
+            context,
+            out,
+            xa_nn_elm_dequantize_sym16_f32,
+            out_data,
+            input_data,
+            inp_shape,
+            num_inp_dims,
+            NULL,
+            &scale_data);
+      }
+      return out;
+    } else if (input.scalar_type() == ScalarType::UInt16) {
+      const uint16_t* __restrict__ input_data =
+          input.const_data_ptr<uint16_t>();
+#if defined(__XTENSA__)
+      // Direct inline PDX SIMD dequantization for per-tensor uint16->float32.
+      // 4x-unrolled to improve ILP: the Fusion G3 scheduler can interleave
+      // independent loads, converts, and FP MACs across unrolled iterations,
+      // hiding load-to-use latency and reducing loop overhead.
+      if (dequant_simd_aligned(input_data, out_data)) {
+        const int numel = inp_shape[0];
+        auto vIn = reinterpret_cast<const xb_vecMxu16*>(input_data);
+        auto vOut = reinterpret_cast<xb_vecMxf32*>(out_data);
+        const xb_vecMxf32 v_scale{
+            scale_data, scale_data, scale_data, scale_data};
+        int i = 0;
+        if (zero_point_data != 0) {
+          const float zp_f = static_cast<float>(zero_point_data);
+          const xb_vecMxf32 v_zp{zp_f, zp_f, zp_f, zp_f};
+          // 4x unrolled main loop: 16 elements per iteration
+          const int e16 = (numel >> 4) << 4;
+          for (; i < e16; i += 16) {
+            xb_vecMxu32 vP0 = PDX_LVU32_MX16_I(vIn, 0);
+            xb_vecMxu32 vP1 = PDX_LVU32_MX16_I(vIn + 1, 0);
+            xb_vecMxu32 vP2 = PDX_LVU32_MX16_I(vIn + 2, 0);
+            xb_vecMxu32 vP3 = PDX_LVU32_MX16_I(vIn + 3, 0);
+            xb_vecMxf32 vF0 = (xb_vecMxf32)vP0;
+            xb_vecMxf32 vF1 = (xb_vecMxf32)vP1;
+            xb_vecMxf32 vF2 = (xb_vecMxf32)vP2;
+            xb_vecMxf32 vF3 = (xb_vecMxf32)vP3;
+            vOut[0] = PDX_MUL_MXF32(PDX_SUB_MXF32(vF0, v_zp), v_scale);
+            vOut[1] = PDX_MUL_MXF32(PDX_SUB_MXF32(vF1, v_zp), v_scale);
+            vOut[2] = PDX_MUL_MXF32(PDX_SUB_MXF32(vF2, v_zp), v_scale);
+            vOut[3] = PDX_MUL_MXF32(PDX_SUB_MXF32(vF3, v_zp), v_scale);
+            vIn += 4;
+            vOut += 4;
+          }
+          // Scalar residual for remaining 0-15 elements
+          for (; i < numel; ++i) {
+            out_data[i] =
+                (static_cast<float>(input_data[i]) - zp_f) * scale_data;
+          }
+        } else {
+          // 4x unrolled main loop: 16 elements per iteration
+          const int e16 = (numel >> 4) << 4;
+          for (; i < e16; i += 16) {
+            xb_vecMxu32 vP0 = PDX_LVU32_MX16_I(vIn, 0);
+            xb_vecMxu32 vP1 = PDX_LVU32_MX16_I(vIn + 1, 0);
+            xb_vecMxu32 vP2 = PDX_LVU32_MX16_I(vIn + 2, 0);
+            xb_vecMxu32 vP3 = PDX_LVU32_MX16_I(vIn + 3, 0);
+            xb_vecMxf32 vF0 = (xb_vecMxf32)vP0;
+            xb_vecMxf32 vF1 = (xb_vecMxf32)vP1;
+            xb_vecMxf32 vF2 = (xb_vecMxf32)vP2;
+            xb_vecMxf32 vF3 = (xb_vecMxf32)vP3;
+            vOut[0] = PDX_MUL_MXF32(vF0, v_scale);
+            vOut[1] = PDX_MUL_MXF32(vF1, v_scale);
+            vOut[2] = PDX_MUL_MXF32(vF2, v_scale);
+            vOut[3] = PDX_MUL_MXF32(vF3, v_scale);
+            vIn += 4;
+            vOut += 4;
+          }
+          // Scalar residual for remaining 0-15 elements
+          for (; i < numel; ++i) {
+            out_data[i] = static_cast<float>(input_data[i]) * scale_data;
+          }
+        }
+        return out;
+      }
+#endif
+      if (zero_point_data != 0) {
+        XT_KERNEL_CHECK(
+            context,
+            out,
+            xa_nn_elm_dequantize_asym16u_f32,
+            out_data,
+            input_data,
+            inp_shape,
+            num_inp_dims,
+            NULL,
+            &zero_point_data,
+            &scale_data);
+      } else {
+        XT_KERNEL_CHECK(
+            context,
+            out,
+            xa_nn_elm_dequantize_sym16u_f32,
+            out_data,
+            input_data,
+            inp_shape,
+            num_inp_dims,
+            NULL,
+            &scale_data);
+      }
+      return out;
+    }
+    // Fall through to generic path for other input types
+  }
   dequantize_impl(
       context, out, input, &scale_data, &zero_point_data, NULL, out_dtype);
-
   return out;
 }
-
 Tensor& dequantize_per_tensor_tensor_args_out(
     KernelRuntimeContext& context,
     const Tensor& input,