diff --git a/.github/workflows/mlx.yml b/.github/workflows/mlx.yml
index 5a4ccbb4952..167ceb7da83 100644
--- a/.github/workflows/mlx.yml
+++ b/.github/workflows/mlx.yml
@@ -66,7 +66,11 @@ jobs:
         echo "::endgroup::"
 
         echo "::group::Build test runners"
-        ${CONDA_RUN} cmake --build cmake-out --target op_test_runner multi_thread_test_runner -j$(( $(sysctl -n hw.ncpu) - 1 ))
+        ${CONDA_RUN} cmake --build cmake-out --target op_test_runner multi_thread_test_runner mlx_mutable_state_test -j$(( $(sysctl -n hw.ncpu) - 1 ))
+        echo "::endgroup::"
+
+        echo "::group::Run mutable-state (multi-session) unit test"
+        ./cmake-out/backends/mlx/test/mlx_mutable_state_test
         echo "::endgroup::"
 
         echo "::group::Run op unit tests"
diff --git a/backends/arm/quantizer/quantization_annotator.py b/backends/arm/quantizer/quantization_annotator.py
index 3b713659e84..13693bd235d 100644
--- a/backends/arm/quantizer/quantization_annotator.py
+++ b/backends/arm/quantizer/quantization_annotator.py
@@ -631,6 +631,16 @@ def _get_fixed_qparams_qspec(
 if _transpose_dimname is not None:
     _one_to_one_shared_input_qspec.add(_transpose_dimname)
 
+for _op in (
+    getattr(torch.ops.aten.moveaxis, "int", None),
+    getattr(torch.ops.aten.moveaxis, "intlist", None),
+    getattr(torch.ops.aten.movedim, "int", None),
+    getattr(torch.ops.aten.movedim, "intlist", None),
+):
+    if _op is not None:
+        _one_to_one_shared_input_qspec.add(_op)
+
+
 _one_to_one_shared_input_or_input_act_qspec: set[OpOverload] = {
     torch.ops.aten.alias.default,
     torch.ops.aten.clone.default,
diff --git a/backends/arm/test/ops/test_permute.py b/backends/arm/test/ops/test_permute.py
index 8864324dbd5..6819929104e 100644
--- a/backends/arm/test/ops/test_permute.py
+++ b/backends/arm/test/ops/test_permute.py
@@ -78,6 +78,12 @@ def forward(self, x):
         return torch.permute(x, self.dims)
 
 
+class SimpleMoveAxis(torch.nn.Module):
+
+    def forward(self, x):
+        return torch.moveaxis(x, 1, -1)
+
+
 @common.parametrize(
     "test_data", test_data_suite | test_data_suite_fp16 | test_data_suite_bf16
 )
@@ -118,6 +124,17 @@ def test_permute_u55_INT(test_data):
     pipeline.run()
 
 
+def test_moveaxis_u55_INT():
+    pipeline = EthosU55PipelineINT[input_t1](
+        SimpleMoveAxis(),
+        (torch.rand(1, 4, 5, 6),),
+        "torch.ops.aten.moveaxis.int",
+        exir_ops="executorch_exir_dialects_edge__ops_aten_permute_copy_default",
+        run_on_fvp=False,
+    )
+    pipeline.run()
+
+
 @common.parametrize("test_data", test_data_suite_u55_reject)
 def test_permute_u55_INT_not_delegated(test_data: torch.Tensor):
     test_data, dims = test_data()
diff --git a/backends/arm/test/quantizer/test_generic_annotater.py b/backends/arm/test/quantizer/test_generic_annotater.py
index dd883e72b1f..b5cfd1efdc6 100644
--- a/backends/arm/test/quantizer/test_generic_annotater.py
+++ b/backends/arm/test/quantizer/test_generic_annotater.py
@@ -89,6 +89,41 @@ def test_transpose_tosa_INT():
     )
 
 
+def test_moveaxis_movedim_tosa_INT():
+    check_annotation(
+        SingleOpModel(
+            torch.moveaxis,
+            (torch.randn(2, 3, 4),),
+            source=1,
+            destination=-1,
+        ),
+    )
+    check_annotation(
+        SingleOpModel(
+            torch.moveaxis,
+            (torch.randn(2, 3, 4),),
+            source=(0, 1),
+            destination=(-1, -2),
+        ),
+    )
+    check_annotation(
+        SingleOpModel(
+            torch.movedim,
+            (torch.randn(2, 3, 4),),
+            source=1,
+            destination=-1,
+        ),
+    )
+    check_annotation(
+        SingleOpModel(
+            torch.movedim,
+            (torch.randn(2, 3, 4),),
+            source=(0, 1),
+            destination=(-1, -2),
+        ),
+    )
+
+
 def test_tile_tosa_INT():
     check_annotation(
         SingleOpModel(torch.tile, (torch.randn(4, 4),), dims=(2,)),
diff --git a/backends/mlx/CMakeLists.txt b/backends/mlx/CMakeLists.txt
index 43968d09b5d..acb96fb1ed9 100644
--- a/backends/mlx/CMakeLists.txt
+++ b/backends/mlx/CMakeLists.txt
@@ -255,8 +255,10 @@ option(ET_MLX_ALLOW_CUSTOM_KERNEL_EXECUTION
        ON
 )
 
-set(_mlx_backend__srcs ${CMAKE_CURRENT_SOURCE_DIR}/runtime/MLXLoader.cpp
-                       ${CMAKE_CURRENT_SOURCE_DIR}/runtime/MLXBackend.cpp
+set(_mlx_backend__srcs
+    ${CMAKE_CURRENT_SOURCE_DIR}/runtime/MLXLoader.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/runtime/MLXBackend.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/runtime/mlx_mutable_state.cpp
 )
 
 add_library(mlxdelegate ${_mlx_backend__srcs})
diff --git a/backends/mlx/custom_kernel_ops/gated_delta_rule.py b/backends/mlx/custom_kernel_ops/gated_delta_rule.py
index 423ffd0b034..41eb8ce7b98 100644
--- a/backends/mlx/custom_kernel_ops/gated_delta_rule.py
+++ b/backends/mlx/custom_kernel_ops/gated_delta_rule.py
@@ -53,6 +53,15 @@ def gated_delta_rule(
     B, T_len, Hk, Dk = q.shape
     Hv, Dv = v.shape[-2:]
 
+    # The Metal kernel maps each v-head to its k-head group
+    # (hk_idx = hv_idx / (Hv / Hk)); mirror that here so the eager reference also
+    # supports Hk != Hv (GQA) instead of relying on broadcasting, which requires
+    # Hk == Hv. repeat_interleave on the head dim reproduces that index mapping.
+    if Hk != Hv:
+        q = q.repeat_interleave(Hv // Hk, dim=2)
+        k = k.repeat_interleave(Hv // Hk, dim=2)
+        Hk = Hv
+
     s = state.clone()
 
     ys = []
@@ -101,6 +110,7 @@ def gated_delta_rule_fake(
     IntOrVid,
     MetalKernelNode,
     MultiplyNode,
+    RepeatNode,
     ScanNode,
     SubtractNode,
     SumNode,
@@ -450,6 +460,33 @@ def _emit_scan(self, P: MLXProgramBuilder, n: Node) -> Slot:
             ]
         )
 
+        # GQA: q/k carry Hk heads but the recurrence state/v have Hv heads. Expand
+        # q/k to Hv (repeat_interleave on the head axis) so the per-step broadcasts
+        # match, mirroring the Metal kernel's hk_idx = hv_idx / (Hv / Hk).
+        Hk = int(self.q_node.meta["val"].shape[-2])
+        Hv = int(self.v_node.meta["val"].shape[-2])
+        if Hk != Hv:
+            rep = IntOrVid.from_literal(Hv // Hk)
+            _, q_exp = P.make_tmp_slot()
+            P.emit(
+                RepeatNode(
+                    x=P.slot_to_tid(q_slot),
+                    out=P.slot_to_tid(q_exp),
+                    repeats=rep,
+                    axis=2,
+                )
+            )
+            _, k_exp = P.make_tmp_slot()
+            P.emit(
+                RepeatNode(
+                    x=P.slot_to_tid(k_slot),
+                    out=P.slot_to_tid(k_exp),
+                    repeats=rep,
+                    axis=2,
+                )
+            )
+            q_slot, k_slot = q_exp, k_exp
+
         # Carry needs a writable slot. This is node n's persistent output (the
         # mutated state), so it must be a node-owned slot — not a temp slot, whose
         # id is reclaimed on tmp_scope exit and would be read as dead by a later
diff --git a/backends/mlx/custom_kernel_ops/test/test_gated_delta_rule.py b/backends/mlx/custom_kernel_ops/test/test_gated_delta_rule.py
index 0a7e6a687f9..dfee111e74b 100644
--- a/backends/mlx/custom_kernel_ops/test/test_gated_delta_rule.py
+++ b/backends/mlx/custom_kernel_ops/test/test_gated_delta_rule.py
@@ -96,9 +96,8 @@ def forward(
         g: torch.Tensor,  # [B, T, Hv]
         beta: torch.Tensor,  # [B, T, Hv]
     ) -> torch.Tensor:
-        if self.head_repeat > 1:
-            q = q.repeat_interleave(self.head_repeat, dim=2)
-            k = k.repeat_interleave(self.head_repeat, dim=2)
+        # Pass native Hk (no repeat_interleave): the op itself must handle
+        # GQA head expansion (kernel via hk_idx mapping, scan/eager internally).
         return torch.ops.mlx.gated_delta_rule(
             q, k, v, g, beta, self.state, use_custom_kernel=self.use_custom_kernel
         )
diff --git a/backends/mlx/ops.py b/backends/mlx/ops.py
index 44536e675da..e3a636466c1 100644
--- a/backends/mlx/ops.py
+++ b/backends/mlx/ops.py
@@ -163,6 +163,8 @@
 from executorch.exir.dialects._ops import ops as exir_ops
 from torch.fx.node import Node
 
+_LEAKY_RELU_DEFAULT_NEGATIVE_SLOPE = 0.01
+
 
 def require_static_int(value: Any, param_name: str, op_name: str) -> None:
     """
@@ -2786,6 +2788,63 @@ def _relu_handler(P: MLXProgramBuilder, n: Node) -> Slot:
     return out
 
 
+@REGISTRY.register(target=[torch.ops.aten.leaky_relu.default])
+def _leaky_relu_handler(P: MLXProgramBuilder, n: Node) -> Slot:
+    """Handle aten.leaky_relu.default - leaky rectified linear unit.
+
+    leaky_relu(x) = x          if x >= 0
+                  = slope * x  otherwise
+
+    Implemented as where(x >= 0, x, slope * x) so it stays correct for any
+    negative_slope (including values > 1), matching eager PyTorch.
+    """
+    args = P.args(n)
+    require_args(args, 1, 2, "aten.leaky_relu")
+    require_kwargs(P.kwargs(n), set(), "aten.leaky_relu")
+
+    x = args[0]
+    negative_slope = _LEAKY_RELU_DEFAULT_NEGATIVE_SLOPE
+    if len(args) > 1 and args[1] is not None:
+        negative_slope = float(args[1])
+
+    x_meta = n.args[0].meta.get("val")
+    if x_meta is None:
+        raise ValueError("Input tensor metadata not found for leaky_relu")
+    dtype = x_meta.dtype
+
+    zero_slot = emit_lifted_constant(P, 0.0, dtype)
+    slope_slot = emit_lifted_constant(P, negative_slope, dtype)
+
+    _, cond_slot = P.make_tmp_slot()
+    P.emit(
+        GreaterEqualNode(
+            a=P.slot_to_tid(x),
+            b=P.slot_to_tid(zero_slot),
+            out=P.slot_to_tid(cond_slot),
+        )
+    )
+
+    _, scaled_slot = P.make_tmp_slot()
+    P.emit(
+        MultiplyNode(
+            a=P.slot_to_tid(slope_slot),
+            b=P.slot_to_tid(x),
+            out=P.slot_to_tid(scaled_slot),
+        )
+    )
+
+    out = P.make_or_get_slot(n)
+    P.emit(
+        WhereNode(
+            condition=P.slot_to_tid(cond_slot),
+            x=P.slot_to_tid(x),
+            y=P.slot_to_tid(scaled_slot),
+            out=P.slot_to_tid(out),
+        )
+    )
+    return out
+
+
 @REGISTRY.register(target=[torch.ops.aten._log_softmax.default])
 def _log_softmax_handler(P: MLXProgramBuilder, n: Node) -> Slot:
     """Handle aten._log_softmax.default - log of softmax.
diff --git a/backends/mlx/runtime/MLXBackend.cpp b/backends/mlx/runtime/MLXBackend.cpp
index 5bd3bf263d1..0dbdec22436 100644
--- a/backends/mlx/runtime/MLXBackend.cpp
+++ b/backends/mlx/runtime/MLXBackend.cpp
@@ -9,6 +9,7 @@
 #include "MLXExecutor.h"
 #include "MLXInterpreter.h"
 #include "MLXLoader.h"
+#include "mlx_mutable_state.h"
 
 #include <executorch/runtime/backend/interface.h>
 #include <executorch/runtime/core/error.h>
@@ -277,6 +278,12 @@ class MLXBackend final : public ::executorch::runtime::BackendInterface {
         eval(handle->constants.tensors);
       }
 
+      // Register the handle with the per-session mutable-state manager. This is
+      // a no-op unless a multi-session owner is active for this load (see
+      // mlx_mutable_state.h); single-session execution is unaffected.
+      mutable_state_note_handle(
+          handle, &handle->program, &handle->mutable_buffers);
+
     } catch (const std::exception& e) {
       ET_LOG(Error, "Failed to load MLX program: %s", e.what());
       handle->~MLXHandle();
@@ -366,6 +373,14 @@ class MLXBackend final : public ::executorch::runtime::BackendInterface {
           }
         }
 
+        // Select the active session's mutable buffers (KV cache, recurrent/conv
+        // state) before running. No-op for single-session handles; weights stay
+        // shared via ExecutionState::constants.
+        if (Error rebind_err = mutable_state_rebind_for_execute(h, h->state);
+            rebind_err != Error::Ok) {
+          return rebind_err;
+        }
+
         // Run the MLX program (builds lazy computation graph)
         h->interpreter.run(program, h->state, h->stream);
 
@@ -431,6 +446,7 @@ class MLXBackend final : public ::executorch::runtime::BackendInterface {
   void destroy(DelegateHandle* handle) const override {
     std::lock_guard<std::mutex> lock(mlx_global_mutex());
     if (handle != nullptr) {
+      mutable_state_forget_handle(handle);
       auto* mlx_handle = static_cast<MLXHandle*>(handle);
       mlx_handle->~MLXHandle();
     }
diff --git a/backends/mlx/runtime/mlx_mutable_state.cpp b/backends/mlx/runtime/mlx_mutable_state.cpp
new file mode 100644
index 00000000000..2f00d917136
--- /dev/null
+++ b/backends/mlx/runtime/mlx_mutable_state.cpp
@@ -0,0 +1,339 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "mlx_mutable_state.h"
+
+#include "MLXExecutor.h"
+#include "MLXLoader.h"
+
+#include <executorch/runtime/platform/log.h>
+
+#include <mutex>
+#include <unordered_map>
+
+namespace executorch {
+namespace backends {
+namespace mlx {
+
+using ::executorch::runtime::Error;
+using ::executorch::runtime::Result;
+
+namespace {
+
+struct HandleInfo {
+  const MLXProgram* program{nullptr};
+  MutableBufferData* default_buffers{nullptr};
+};
+
+struct Context {
+  // Delegate handles associated with this loaded program (one per loaded
+  // method). Keyed by opaque MLXHandle pointer.
+  std::unordered_map<const void*, HandleInfo> handles;
+  // Per-session mutable buffers: token -> (handle -> buffers). Allocated lazily
+  // on first execute for a given (session, handle).
+  std::unordered_map<int, std::unordered_map<const void*, MutableBufferData>>
+      sessions;
+  int next_token{0};
+  // Sticky setup failure. Once set (e.g. by nested load scopes), available(),
+  // validate_coverage(), create_session(), and rebind fail consistently.
+  Error build_error{Error::Ok};
+};
+
+// Process-global registry. MLX serializes execution via its own global mutex
+// and the engine serializes per session, but the registry itself is guarded
+// here so context/session lifecycle calls from other threads are safe.
+std::mutex& registry_mutex() {
+  static std::mutex m;
+  return m;
+}
+
+std::unordered_map<MutableStateContext, Context>& contexts() {
+  static std::unordered_map<MutableStateContext, Context> c;
+  return c;
+}
+
+std::unordered_map<const void*, MutableStateContext>& handle_ctx() {
+  static std::unordered_map<const void*, MutableStateContext> m;
+  return m;
+}
+
+MutableStateContext g_next_ctx = 1; // 0 is reserved as invalid.
+
+// Thread-local load scope and active (ctx, session) selection.
+thread_local MutableStateContext tl_loading_ctx = kInvalidMutableContext;
+thread_local MutableStateContext tl_active_ctx = kInvalidMutableContext;
+thread_local int tl_active_token = kNoMutableSession;
+
+} // namespace
+
+namespace detail {
+
+MutableStateContext mutable_state_create_context() {
+  std::lock_guard<std::mutex> g(registry_mutex());
+  MutableStateContext ctx = g_next_ctx++;
+  if (ctx == kInvalidMutableContext) {
+    ctx = g_next_ctx++;
+  }
+  contexts()[ctx];
+  return ctx;
+}
+
+void mutable_state_destroy_context(MutableStateContext ctx) {
+  std::lock_guard<std::mutex> g(registry_mutex());
+  auto it = contexts().find(ctx);
+  if (it == contexts().end()) {
+    return;
+  }
+  for (const auto& kv : it->second.handles) {
+    handle_ctx().erase(kv.first);
+  }
+  contexts().erase(it);
+}
+
+void mutable_state_begin_load(MutableStateContext ctx) {
+  if (tl_loading_ctx != kInvalidMutableContext) {
+    // Nested load scopes would silently overwrite the thread-local association.
+    // Mark both the already-active and the new context invalid instead.
+    std::lock_guard<std::mutex> g(registry_mutex());
+    auto active = contexts().find(tl_loading_ctx);
+    if (active != contexts().end()) {
+      active->second.build_error = Error::InvalidState;
+    }
+    auto nested = contexts().find(ctx);
+    if (nested != contexts().end()) {
+      nested->second.build_error = Error::InvalidState;
+    }
+    ET_LOG(Error, "mutable_state: nested load scopes are not supported");
+    tl_loading_ctx = kInvalidMutableContext;
+    return;
+  }
+  tl_loading_ctx = ctx;
+}
+
+void mutable_state_end_load() {
+  tl_loading_ctx = kInvalidMutableContext;
+}
+
+bool mutable_state_available(MutableStateContext ctx) {
+  if (ctx == kInvalidMutableContext) {
+    return false;
+  }
+  std::lock_guard<std::mutex> g(registry_mutex());
+  auto it = contexts().find(ctx);
+  return it != contexts().end() && it->second.build_error == Error::Ok &&
+      !it->second.handles.empty();
+}
+
+int64_t mutable_state_bytes_per_session(MutableStateContext ctx) {
+  std::lock_guard<std::mutex> g(registry_mutex());
+  auto it = contexts().find(ctx);
+  if (it == contexts().end()) {
+    return 0;
+  }
+  int64_t total = 0;
+  for (const auto& kv : it->second.handles) {
+    const MutableBufferData* bufs = kv.second.default_buffers;
+    if (bufs == nullptr) {
+      continue;
+    }
+    for (const auto& t : bufs->tensors) {
+      if (t.has_value()) {
+        total += static_cast<int64_t>(t->nbytes());
+      }
+    }
+  }
+  return total;
+}
+
+Error mutable_state_validate_coverage(MutableStateContext ctx) {
+  std::lock_guard<std::mutex> g(registry_mutex());
+  auto it = contexts().find(ctx);
+  if (it == contexts().end()) {
+    return Error::InvalidArgument;
+  }
+  if (it->second.build_error != Error::Ok) {
+    return it->second.build_error;
+  }
+  // MLX clones all mutable buffers by tid; there is no FQN coverage to verify.
+  return Error::Ok;
+}
+
+Result<int> mutable_state_create_session(MutableStateContext ctx) {
+  std::lock_guard<std::mutex> g(registry_mutex());
+  auto it = contexts().find(ctx);
+  if (it == contexts().end()) {
+    ET_LOG(Error, "mutable_state_create_session: unknown context %d", ctx);
+    return Error::InvalidState;
+  }
+  Context& c = it->second;
+  if (c.build_error != Error::Ok) {
+    return c.build_error;
+  }
+  if (c.handles.empty()) {
+    ET_LOG(
+        Error, "mutable_state_create_session: no backend handles registered");
+    return Error::NotSupported;
+  }
+  int token = c.next_token++;
+  // Per-handle buffers are allocated lazily on first execute.
+  c.sessions[token];
+  return token;
+}
+
+void mutable_state_destroy_session(MutableStateContext ctx, int token) {
+  std::lock_guard<std::mutex> g(registry_mutex());
+  auto it = contexts().find(ctx);
+  if (it == contexts().end()) {
+    return;
+  }
+  it->second.sessions.erase(token);
+}
+
+void mutable_state_set_active(MutableStateContext ctx, int token) {
+  tl_active_ctx = ctx;
+  tl_active_token = token;
+}
+
+} // namespace detail
+
+void mutable_state_note_handle(
+    const void* handle,
+    const MLXProgram* program,
+    MutableBufferData* default_buffers) {
+  if (tl_loading_ctx == kInvalidMutableContext) {
+    return; // No multi-session owner active during this load: single-session.
+  }
+  std::lock_guard<std::mutex> g(registry_mutex());
+  auto it = contexts().find(tl_loading_ctx);
+  if (it == contexts().end()) {
+    return;
+  }
+  it->second.handles[handle] = HandleInfo{program, default_buffers};
+  handle_ctx()[handle] = tl_loading_ctx;
+}
+
+void mutable_state_forget_handle(const void* handle) {
+  std::lock_guard<std::mutex> g(registry_mutex());
+  auto hit = handle_ctx().find(handle);
+  if (hit == handle_ctx().end()) {
+    return;
+  }
+  auto cit = contexts().find(hit->second);
+  if (cit != contexts().end()) {
+    cit->second.handles.erase(handle);
+    for (auto& session : cit->second.sessions) {
+      session.second.erase(handle);
+    }
+  }
+  handle_ctx().erase(hit);
+}
+
+Error mutable_state_rebind_for_execute(
+    const void* handle,
+    ExecutionState& state) {
+  std::lock_guard<std::mutex> g(registry_mutex());
+  auto hit = handle_ctx().find(handle);
+  if (hit == handle_ctx().end()) {
+    if (tl_active_token != kNoMutableSession) {
+      ET_LOG(
+          Error,
+          "mutable_state_rebind_for_execute: active session set but handle has "
+          "no mutable-state context");
+      return Error::Internal;
+    }
+    // Handle was not loaded under a multi-session owner: keep default buffers.
+    return Error::Ok;
+  }
+  auto cit = contexts().find(hit->second);
+  if (cit == contexts().end()) {
+    return Error::Ok;
+  }
+  Context& ctx = cit->second;
+  if (ctx.build_error != Error::Ok) {
+    return ctx.build_error;
+  }
+  // Invariant: a handle present in handle_ctx() is present in ctx.handles. Look
+  // it up explicitly (not operator[]) so a broken invariant fails loudly
+  // instead of inserting a {nullptr, nullptr} entry that later null-derefs in
+  // load_mutable_buffers(*info.program, ...).
+  auto info_it = ctx.handles.find(handle);
+  if (info_it == ctx.handles.end()) {
+    ET_LOG(
+        Error,
+        "mutable_state_rebind_for_execute: handle has a context but no "
+        "registered HandleInfo (invariant broken)");
+    return Error::Internal;
+  }
+  HandleInfo& info = info_it->second;
+
+  const bool has_active_session = tl_active_token != kNoMutableSession;
+  const bool active_for_this_ctx =
+      has_active_session && tl_active_ctx == hit->second;
+
+  // A session is active, but for a different context than the one this handle
+  // belongs to. Falling back to default buffers would silently execute with the
+  // wrong model/session state, so refuse instead.
+  if (has_active_session && !active_for_this_ctx) {
+    ET_LOG(
+        Error,
+        "mutable_state_rebind_for_execute: active context mismatch (a session "
+        "is active for a different loaded program than the one executing)");
+    return Error::Internal;
+  }
+
+  if (!active_for_this_ctx) {
+    // No session selected. Refuse if sessions exist (running against the
+    // default buffers here would not isolate state from created sessions).
+    if (!ctx.sessions.empty()) {
+      ET_LOG(
+          Error,
+          "mutable_state_rebind_for_execute: no active session selected but "
+          "sessions exist for this program");
+      return Error::InvalidState;
+    }
+    state.mutable_buffers = info.default_buffers;
+    return Error::Ok;
+  }
+
+  auto sit = ctx.sessions.find(tl_active_token);
+  if (sit == ctx.sessions.end()) {
+    ET_LOG(
+        Error,
+        "mutable_state_rebind_for_execute: unknown session token %d",
+        tl_active_token);
+    return Error::InvalidState;
+  }
+
+  auto& per_handle = sit->second;
+  auto bit = per_handle.find(handle);
+  if (bit == per_handle.end()) {
+    // First execute for this (session, handle): allocate fresh zeroed buffers.
+    // Constants/weights stay shared (ExecutionState::constants is untouched);
+    // only the mutable buffers are per-session.
+    MutableBufferData buffers;
+    try {
+      load_mutable_buffers(*info.program, buffers);
+    } catch (const std::exception& e) {
+      ET_LOG(
+          Error,
+          "mutable_state_rebind_for_execute: failed to allocate session "
+          "buffers: %s",
+          e.what());
+      return Error::MemoryAllocationFailed;
+    }
+    bit = per_handle.emplace(handle, std::move(buffers)).first;
+  }
+  // unordered_map never relocates elements on rehash, so this pointer stays
+  // valid during execute provided this session/handle entry is not erased.
+  state.mutable_buffers = &bit->second;
+  return Error::Ok;
+}
+
+} // namespace mlx
+} // namespace backends
+} // namespace executorch
diff --git a/backends/mlx/runtime/mlx_mutable_state.h b/backends/mlx/runtime/mlx_mutable_state.h
new file mode 100644
index 00000000000..84420812360
--- /dev/null
+++ b/backends/mlx/runtime/mlx_mutable_state.h
@@ -0,0 +1,204 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstdint>
+#include <utility>
+
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/result.h>
+#include <executorch/runtime/platform/compiler.h>
+
+// MLX-private support for running one loaded MLX program with multiple isolated
+// instances of its mutable buffers (KV cache, conv/recurrent state). Callers
+// create sessions and execute with one active session selected.
+//
+// Unlike the CUDA backend, the MLX runtime owns mutable buffers directly in a
+// swappable container (ExecutionState::mutable_buffers is a
+// MutableBufferData*), so per-session isolation is a pointer swap to a fresh
+// MutableBufferData — no FQN registration / constant-repoint hook is needed.
+
+namespace executorch {
+namespace backends {
+namespace mlx {
+
+// Forward declarations (defined in MLXLoader.h / MLXExecutor.h).
+struct MLXProgram;
+struct MutableBufferData;
+struct ExecutionState;
+
+// Opaque per-loaded-program context id (0 = invalid).
+using MutableStateContext = int;
+constexpr MutableStateContext kInvalidMutableContext = 0;
+
+// Sentinel for execution without per-session rebinding.
+constexpr int kNoMutableSession = -1;
+
+// Implementation entry points. Callers should use MutableStateContextOwner.
+namespace detail {
+
+MutableStateContext mutable_state_create_context();
+void mutable_state_destroy_context(MutableStateContext ctx);
+void mutable_state_begin_load(MutableStateContext ctx);
+void mutable_state_end_load();
+bool mutable_state_available(MutableStateContext ctx);
+int64_t mutable_state_bytes_per_session(MutableStateContext ctx);
+::executorch::runtime::Error mutable_state_validate_coverage(
+    MutableStateContext ctx);
+::executorch::runtime::Result<int> mutable_state_create_session(
+    MutableStateContext ctx);
+void mutable_state_destroy_session(MutableStateContext ctx, int token);
+void mutable_state_set_active(MutableStateContext ctx, int token);
+
+} // namespace detail
+
+// Caller-facing owner for one mutable-state context. Mirrors the CUDA backend's
+// MutableStateContextOwner so the example engine can use a symmetric API.
+class ET_EXPERIMENTAL MutableStateContextOwner final {
+  class LoadScope final {
+   public:
+    explicit LoadScope(MutableStateContext ctx) {
+      detail::mutable_state_begin_load(ctx);
+    }
+
+    ~LoadScope() {
+      detail::mutable_state_end_load();
+    }
+
+    LoadScope(const LoadScope&) = delete;
+    LoadScope& operator=(const LoadScope&) = delete;
+  };
+
+  class ActiveSessionScope final {
+   public:
+    ActiveSessionScope(MutableStateContext ctx, int token) {
+      detail::mutable_state_set_active(ctx, token);
+    }
+
+    ~ActiveSessionScope() {
+      detail::mutable_state_set_active(
+          kInvalidMutableContext, kNoMutableSession);
+    }
+
+    ActiveSessionScope(const ActiveSessionScope&) = delete;
+    ActiveSessionScope& operator=(const ActiveSessionScope&) = delete;
+  };
+
+ public:
+  MutableStateContextOwner() : ctx_(detail::mutable_state_create_context()) {}
+
+  ~MutableStateContextOwner() {
+    destroy();
+  }
+
+  MutableStateContextOwner(const MutableStateContextOwner&) = delete;
+  MutableStateContextOwner& operator=(const MutableStateContextOwner&) = delete;
+
+  MutableStateContextOwner(MutableStateContextOwner&& other) noexcept
+      : ctx_(std::exchange(other.ctx_, kInvalidMutableContext)) {}
+
+  MutableStateContextOwner& operator=(
+      MutableStateContextOwner&& other) noexcept {
+    if (this != &other) {
+      destroy();
+      ctx_ = std::exchange(other.ctx_, kInvalidMutableContext);
+    }
+    return *this;
+  }
+
+  MutableStateContext get() const {
+    return ctx_;
+  }
+
+  explicit operator bool() const {
+    return ctx_ != kInvalidMutableContext;
+  }
+
+  // Associates delegate handles created by `fn` with this context.
+  template <typename Fn>
+  auto with_load_scope(Fn&& fn) const -> decltype(std::forward<Fn>(fn)()) {
+    LoadScope scope(ctx_);
+    return std::forward<Fn>(fn)();
+  }
+
+  // Selects this context/session while `fn` executes. The caller is responsible
+  // for serializing execution that touches the same loaded program.
+  //
+  // Thread-safety contract: destroy_session()/forget_handle() only take the
+  // registry mutex, while rebind (under with_active_session) hands execute a
+  // raw pointer into Context::sessions that is dereferenced after the lock is
+  // released. The caller must therefore guarantee a session is never destroyed
+  // while it is the active session mid-execute (the engine upholds this: a
+  // session's buffers are freed only when its owning LLMSession drops, never
+  // concurrently with its own execute). Destroying *other* sessions
+  // concurrently is safe — erasing or inserting unordered_map elements never
+  // invalidates pointers to the remaining ones.
+  template <typename Fn>
+  auto with_active_session(int token, Fn&& fn) const
+      -> decltype(std::forward<Fn>(fn)()) {
+    ActiveSessionScope scope(ctx_, token);
+    return std::forward<Fn>(fn)();
+  }
+
+  // True only when this context has at least one associated MLX backend handle
+  // and no sticky setup error, i.e. when create_session() can succeed.
+  bool available() const {
+    return detail::mutable_state_available(ctx_);
+  }
+
+  int64_t bytes_per_session() const {
+    return detail::mutable_state_bytes_per_session(ctx_);
+  }
+
+  ::executorch::runtime::Error validate_coverage() const {
+    return detail::mutable_state_validate_coverage(ctx_);
+  }
+
+  // Creates an isolated mutable-buffer session for this context.
+  // Fails if no loaded MLX backend handle has been associated with the context.
+  ET_NODISCARD ::executorch::runtime::Result<int> create_session() const {
+    return detail::mutable_state_create_session(ctx_);
+  }
+
+  void destroy_session(int token) const {
+    detail::mutable_state_destroy_session(ctx_, token);
+  }
+
+ private:
+  void destroy() {
+    if (ctx_ != kInvalidMutableContext) {
+      detail::mutable_state_destroy_context(ctx_);
+      ctx_ = kInvalidMutableContext;
+    }
+  }
+
+  MutableStateContext ctx_ = kInvalidMutableContext;
+};
+
+// --- MLXBackend hooks --------------------------------------------------------
+//
+// Called from MLXBackend init/execute/destroy. `handle` is an opaque key (the
+// MLXHandle pointer). `program` and `default_buffers` are the handle's own
+// program and (init-time) mutable buffers; the manager swaps in per-session
+// buffers (or restores the default) by re-pointing `state.mutable_buffers`.
+
+void mutable_state_note_handle(
+    const void* handle,
+    const MLXProgram* program,
+    MutableBufferData* default_buffers);
+
+void mutable_state_forget_handle(const void* handle);
+
+::executorch::runtime::Error mutable_state_rebind_for_execute(
+    const void* handle,
+    ExecutionState& state);
+
+} // namespace mlx
+} // namespace backends
+} // namespace executorch
diff --git a/backends/mlx/test/CMakeLists.txt b/backends/mlx/test/CMakeLists.txt
index 39024639d1d..2d494652138 100644
--- a/backends/mlx/test/CMakeLists.txt
+++ b/backends/mlx/test/CMakeLists.txt
@@ -69,3 +69,22 @@ if(EXECUTORCH_MLX_ENABLE_SANITIZERS)
     multi_thread_test_runner PRIVATE ${_mlx_sanitizer_link_options}
   )
 endif()
+
+# Per-session mutable-state manager unit test (no model/tokenizer needed).
+add_executable(mlx_mutable_state_test mlx_mutable_state_test.cpp)
+target_include_directories(
+  mlx_mutable_state_test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../runtime
+)
+target_link_libraries(
+  mlx_mutable_state_test PRIVATE mlxdelegate mlx_schema mlx executorch_core
+)
+if(EXECUTORCH_MLX_ENABLE_SANITIZERS)
+  target_compile_options(
+    mlx_mutable_state_test PRIVATE -fsanitize=address,undefined
+                                   -fno-omit-frame-pointer
+  )
+  target_link_options(
+    mlx_mutable_state_test PRIVATE ${_mlx_sanitizer_link_options}
+  )
+endif()
+add_test(NAME mlx_mutable_state COMMAND mlx_mutable_state_test)
diff --git a/backends/mlx/test/mlx_mutable_state_test.cpp b/backends/mlx/test/mlx_mutable_state_test.cpp
new file mode 100644
index 00000000000..99a646701ef
--- /dev/null
+++ b/backends/mlx/test/mlx_mutable_state_test.cpp
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// Unit test for the MLX per-session mutable-state manager
+// (backends/mlx/runtime/mlx_mutable_state.{h,cpp}).
+//
+// Verifies that two sessions created on one loaded program get independent
+// mutable buffers: writing into session A's buffer does not leak into session
+// B's, and A's value persists across a rebind to B and back. This is the MLX
+// analogue of the CUDA "no-bleed" guarantee, exercised directly on the manager
+// (no model or tokenizer needed).
+
+#include "MLXExecutor.h"
+#include "MLXLoader.h"
+#include "mlx_mutable_state.h"
+
+#include <mlx/mlx.h>
+
+#include <cstdio>
+
+using namespace ::executorch::backends::mlx;
+
+namespace {
+
+int g_failures = 0;
+
+#define CHECK(cond)                                         \
+  do {                                                      \
+    if (!(cond)) {                                          \
+      std::printf("FAIL: %s (line %d)\n", #cond, __LINE__); \
+      ++g_failures;                                         \
+    }                                                       \
+  } while (0)
+
+// Build a minimal program with a single 1-element float mutable buffer at tid
+// 0.
+MLXProgram make_program() {
+  MLXProgram program;
+  program.num_mutable_buffer_tensors = 1;
+  program.mutable_buffer_map.push_back(SlotVariant{0, SlotType::TensorSlot});
+  TensorMeta meta;
+  meta.shape.push_back(ShapeDim{/*value=*/1});
+  meta.scalar_type = ScalarType::Float;
+  program.tensor_meta.resize(1);
+  program.tensor_meta[0] = meta;
+  return program;
+}
+
+float read0(const MutableBufferData& bufs) {
+  auto arr = bufs.get(Tid{0});
+  ::mlx::core::eval(arr);
+  return arr.item<float>();
+}
+
+} // namespace
+
+int main() {
+  MLXProgram program = make_program();
+
+  // Handle's default (init-time) mutable buffers.
+  MutableBufferData default_bufs;
+  load_mutable_buffers(program, default_bufs);
+
+  int dummy = 0;
+  const void* handle = &dummy;
+
+  MutableStateContextOwner owner;
+  CHECK(static_cast<bool>(owner));
+
+  // Associate the handle with the context (as MLXBackend::init would).
+  owner.with_load_scope(
+      [&]() { mutable_state_note_handle(handle, &program, &default_bufs); });
+
+  CHECK(owner.available());
+  CHECK(owner.bytes_per_session() == static_cast<int64_t>(sizeof(float)));
+
+  auto tokA = owner.create_session();
+  auto tokB = owner.create_session();
+  CHECK(tokA.ok() && tokB.ok());
+  if (!tokA.ok() || !tokB.ok()) {
+    return 1; // Both sessions are required below; get() needs ok().
+  }
+  CHECK(tokA.get() != tokB.get());
+  ExecutionState state;
+  // Session A: rebind, then write a marker (7.0) into its buffer.
+  owner.with_active_session(tokA.get(), [&]() {
+    auto err = mutable_state_rebind_for_execute(handle, state);
+    CHECK(err == ::executorch::runtime::Error::Ok);
+    state.mutable_buffers->set(
+        Tid{0}, ::mlx::core::full({1}, 7.0f, ::mlx::core::float32));
+    return err;
+  });
+
+  // Session B: a fresh rebind must see zeros, not A's marker.
+  owner.with_active_session(tokB.get(), [&]() {
+    auto err = mutable_state_rebind_for_execute(handle, state);
+    CHECK(err == ::executorch::runtime::Error::Ok);
+    CHECK(read0(*state.mutable_buffers) == 0.0f);
+    return err;
+  });
+
+  // Back to session A: the marker must persist (isolation, no bleed).
+  owner.with_active_session(tokA.get(), [&]() {
+    auto err = mutable_state_rebind_for_execute(handle, state);
+    CHECK(err == ::executorch::runtime::Error::Ok);
+    CHECK(read0(*state.mutable_buffers) == 7.0f);
+    return err;
+  });
+
+  // With sessions present, executing without an active session is refused
+  // (prevents running against unmanaged/shared state).
+  {
+    auto err = mutable_state_rebind_for_execute(handle, state);
+    CHECK(err == ::executorch::runtime::Error::InvalidState);
+  }
+
+  owner.destroy_session(tokA.get());
+  owner.destroy_session(tokB.get());
+  mutable_state_forget_handle(handle);
+
+  if (g_failures == 0) {
+    std::printf("OK: mlx_mutable_state isolation test passed\n");
+    return 0;
+  }
+  std::printf("FAILED: %d checks\n", g_failures);
+  return 1;
+}
diff --git a/backends/mlx/test/test_ops.py b/backends/mlx/test/test_ops.py
index 8f52116f6b8..e96c8075903 100644
--- a/backends/mlx/test/test_ops.py
+++ b/backends/mlx/test/test_ops.py
@@ -405,6 +405,60 @@ def create_inputs(self) -> Tuple[torch.Tensor, ...]:
         return (x,)
 
 
+class LeakyReLUModel(nn.Module):
+    """Model that applies leaky_relu with an optional negative slope."""
+
+    def __init__(self, negative_slope: Optional[float] = 0.01):
+        super().__init__()
+        self.negative_slope = negative_slope
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if self.negative_slope is None:
+            return torch.nn.functional.leaky_relu(x)
+        return torch.nn.functional.leaky_relu(x, negative_slope=self.negative_slope)
+
+
+@register_test
+class LeakyReLUTest(OpTestCase):
+    """Test case for leaky_relu activation with various negative slopes."""
+
+    name = "leaky_relu"
+    rtol = 1e-5
+    atol = 1e-5
+
+    def __init__(
+        self,
+        shape: Tuple[int, ...] = (2, 3, 4),
+        negative_slope: Optional[float] = 0.01,
+    ):
+        self.shape = shape
+        self.negative_slope = negative_slope
+        shape_str = "x".join(str(s) for s in shape)
+        slope_str = "default" if negative_slope is None else f"slope{negative_slope}"
+        self.name = f"leaky_relu_{slope_str}_{shape_str}"
+
+    @classmethod
+    def get_test_configs(cls) -> List["LeakyReLUTest"]:
+        return [
+            cls(shape=(2, 3, 4), negative_slope=0.01),
+            cls(shape=(2, 3, 4), negative_slope=None),
+            cls(shape=(4, 8), negative_slope=0.1),
+            cls(shape=(10,), negative_slope=0.2),
+            cls(shape=(10,), negative_slope=1.5),
+            cls(shape=(2, 8, 16), negative_slope=0.01),
+        ]
+
+    def create_model(self) -> nn.Module:
+        return LeakyReLUModel(self.negative_slope)
+
+    def create_inputs(self) -> Tuple[torch.Tensor, ...]:
+        numel = 1
+        for size in self.shape:
+            numel *= size
+        x = torch.linspace(-4.0, 4.0, steps=numel).reshape(self.shape)
+        return (x,)
+
+
 class GELUModel(nn.Module):
     """Simple model using GELU activation."""
 
diff --git a/backends/nxp/backend/edge_program_converter.py b/backends/nxp/backend/edge_program_converter.py
index ff8cbb660cb..9df8290e85d 100644
--- a/backends/nxp/backend/edge_program_converter.py
+++ b/backends/nxp/backend/edge_program_converter.py
@@ -74,13 +74,16 @@ class EdgeProgramToIRConverter:
     _default_target_spec = NeutronTargetSpec("imxrt700")
     _default_delegation_options = CustomDelegationOptions()
 
+    def __init__(self):
+        self.edge_to_tflite_map = {}
+
     def convert_program(
         self,
         edge_program: ExportedProgram,
         conversion_config: ConversionConfig = _default_conversion_config,
         neutron_target_spec: NeutronTargetSpec = _default_target_spec,
         custom_delegation_options: CustomDelegationOptions = _default_delegation_options,
-    ) -> tuple[bytes, dict[str, dict[str, DataFormat]]]:
+    ) -> tuple[bytes, dict[str, dict[str, DataFormat]], dict[int, tuple[int, ...]]]:
         """
         Convert ExportedProgram in Edge dialect to IR (TFLite flatbuffers) as bytes.
 
@@ -88,8 +91,11 @@ def convert_program(
         :param conversion_config: ConversionConfig instance.
         :param neutron_target_spec: Object for querying the target platform to retrieve its properties.
         :param custom_delegation_options: Custom user options which affect node delegation.
-        :return: TFLite flatbuffers as bytes.
+        :return: TFLite flatbuffers as bytes, I/O formats, and edge-to-tflite mapping.
         """
+        # Reset the edge to tflite map for each conversion
+        self.edge_to_tflite_map = {}
+
         parameters_mapping = self.map_inputs_to_parameters(edge_program)
         dim_order_map = self.map_nodes_to_dim_order(edge_program)
 
@@ -113,6 +119,9 @@ def convert_program(
         # Apply optimizations and finalize the model.
         internal_tflite_model = cc.tflite_builder.finish()
 
+        # Get the final edge to tflite mapping after optimization
+        self.edge_to_tflite_map = cc.tflite_builder.edge_to_tflite_map
+
         # Extract the formats of the model's inputs and outputs.
         io_formats = cc.tflite_builder.get_io_formats(edge_program.graph_signature)
 
@@ -120,7 +129,7 @@ def convert_program(
         flatbuffers_builder = flatbuffers.Builder()
         internal_tflite_model.gen_tflite(flatbuffers_builder)
 
-        return bytes(flatbuffers_builder.Output()), io_formats
+        return bytes(flatbuffers_builder.Output()), io_formats, self.edge_to_tflite_map
 
     @staticmethod
     def append_placeholders_and_tensors(nodes: list[Node], context: ConversionContext):
@@ -162,7 +171,6 @@ def _process_nodes(self, nodes: list[Node], conversion_context: ConversionContex
             exir_ops.edge.quantized_decomposed.dequantize_per_channel.default,
             exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
         ]
-
         for node in nodes:
             if node.op == "call_function":
                 if node.target in qdq_related_functions and "cluster" in node.meta:
@@ -174,7 +182,37 @@ def _process_nodes(self, nodes: list[Node], conversion_context: ConversionContex
                     # The node was already processed alongside the Q/DQ ops.
                     pass
                 elif node.target in functions_converters:
+                    # Get TFLite op count BEFORE conversion
+                    tflite_op_count_before = len(
+                        conversion_context.tflite_builder.get_operators().vector
+                    )
+                    # Convert the node
                     functions_converters[node.target](conversion_context).convert(node)
+                    # Get TFLite op count AFTER conversion
+                    tflite_op_count_after = len(
+                        conversion_context.tflite_builder.get_operators().vector
+                    )
+
+                    # Track the mapping - store edge debug handle in operators.
+                    # Get the edge debug handle so it can be associated with newly created operators.
+                    edge_debug_handle = node.meta.get("debug_handle", None)
+                    if (
+                        edge_debug_handle is not None
+                        and tflite_op_count_after > tflite_op_count_before
+                    ):
+                        operators = (
+                            conversion_context.tflite_builder.get_operators().vector
+                        )
+                        # Node converters append new operators to the TFLite builder.
+                        # Only operators added during this conversion step (from "before" to "after")
+                        # are tagged with the current edge_debug_handle.
+                        for i in range(tflite_op_count_before, tflite_op_count_after):
+                            # Store edge debug handle in operator's temporary attribute
+                            operators[i].tmp_edge_debug_handle = edge_debug_handle
+                        logger.d(
+                            f"Tagged TFLite ops {list(range(tflite_op_count_before, tflite_op_count_after))} with edge debug_handle={edge_debug_handle} for node '{node.name}'"
+                        )
+
                 else:
                     logger.e(
                         logger.Code.NOT_IMPLEMENTED,
diff --git a/backends/nxp/backend/ir/converter/builder/model_builder.py b/backends/nxp/backend/ir/converter/builder/model_builder.py
index f97a194ce87..41820c3ab61 100755
--- a/backends/nxp/backend/ir/converter/builder/model_builder.py
+++ b/backends/nxp/backend/ir/converter/builder/model_builder.py
@@ -85,6 +85,10 @@ class ModelBuilder:
 
     conversion_config: ConversionConfig
 
+    edge_to_tflite_map: dict[
+        int, tuple[int, ...]
+    ]  # Mapping edge debug handles to tuple of TFLite operator indices
+
     _default_conversion_config = ConversionConfig()
 
     def __init__(
@@ -105,6 +109,7 @@ def __init__(
         self._nchw_tensor_version = {}
         self._skipped_output_map = {}
         self._zeros_tensor_map = {}
+        self.edge_to_tflite_map = {}
 
     def create_zeros_tensor(
         self, dims: List[int], name: str, dtype: np.dtype, can_reuse: bool = False
@@ -503,6 +508,9 @@ def finish(self) -> tflite_model.Model:
             self.conversion_config.optimization_blacklist,
         )
 
+        # Create the final edge-to-tflite mapping after model optimization
+        self._create_edge_to_tflite_mapping()
+
         self._keep_one_empty_buffer()
 
         # Remove outputs, which are not produced by any node. Otherwise, there would be errors after inference.
@@ -524,6 +532,29 @@ def finish(self) -> tflite_model.Model:
 
         return self._tfl_model
 
+    def _create_edge_to_tflite_mapping(self):
+        """Rebuild `self.edge_to_tflite_map` from the handles stored on the operators.
+
+        Every operator whose `tmp_edge_debug_handle` is set contributes its index in
+        the operator vector to the entry of that handle, so one handle may map to
+        several operator indices (kept in ascending order).
+
+        This function should be called after all model optimizations have been
+        applied, so that the indices match the output TFLite model.
+        """
+
+        indices_by_handle: dict[int, list[int]] = {}
+        for position, operator in enumerate(self.get_operators().vector):
+            handle = getattr(operator, "tmp_edge_debug_handle", None)
+            if handle is not None:
+                indices_by_handle.setdefault(handle, []).append(position)
+
+        # Freeze the per-handle index lists into tuples.
+        self.edge_to_tflite_map = {h: tuple(ix) for h, ix in indices_by_handle.items()}
+        logger.i(
+            f"\nFinal edge_to_tflite_map after optimization: {self.edge_to_tflite_map}"
+        )
+
     def _assign_io_tensor_indices(self, inputs, outputs, allow_inputs_stripping: bool):
         for tensor in outputs.tmp_outputs:
             try:
diff --git a/backends/nxp/backend/ir/tflite_generator/tflite_model.py b/backends/nxp/backend/ir/tflite_generator/tflite_model.py
index 6e8e7b6c33b..d8d0bada57d 100755
--- a/backends/nxp/backend/ir/tflite_generator/tflite_model.py
+++ b/backends/nxp/backend/ir/tflite_generator/tflite_model.py
@@ -514,6 +514,9 @@ class Operator(meta.TFLiteObject):
     # If `True`, this is an extra operator added during conversion. It was not present in the original input model.
     tmp_added_extra: bool
 
+    # Edge program debug handle for mapping edge nodes to TFLite operators
+    tmp_edge_debug_handle: Optional[int]
+
     def __init__(
         self,
         inputs: OperatorInputs = None,
@@ -541,6 +544,8 @@ def __init__(
         self.tmp_version = 1
         self.tmp_added_extra = False
 
+        self.tmp_edge_debug_handle = None
+
     def uses_per_channel_quantization(self) -> bool:
         """Determine if this operator uses per-channel quantization."""
         for tensor in itertools.chain(self.tmp_inputs, self.tmp_outputs):
diff --git a/backends/nxp/backend/neutron_converter_manager.py b/backends/nxp/backend/neutron_converter_manager.py
index 0abee0cdc86..92b4e25a5de 100644
--- a/backends/nxp/backend/neutron_converter_manager.py
+++ b/backends/nxp/backend/neutron_converter_manager.py
@@ -25,6 +25,15 @@ def _build_compilation_context(compilation_opts):
     cctx.compilationOpts.dumpKernelSelectionCode = compilation_opts[
         "dumpKernelSelectionCode"
     ]
+    if (
+        hasattr(cctx.compilationOpts, "useProfiling")
+        and compilation_opts["useProfiling"]
+    ):
+        cctx.compilationOpts.useProfiling = compilation_opts["useProfiling"]
+        cctx.compilationOpts.dumpAfterImport = "console"
+        cctx.compilationOpts.dumpAfterGenerate = "console"
+        cctx.compilationOpts.verbose = compilation_opts["useProfiling"]
+
     return cctx
 
 
@@ -81,6 +90,7 @@ def convert(
         target: str,
         delegation_tag: str,
         fetch_constants_to_sram: bool = False,
+        use_profiling: bool = False,
     ) -> bytes:
         """
         Call Neutron Converter.
@@ -89,6 +99,7 @@ def convert(
         :param target: The target platform.
         :param delegation_tag: The delegation tag of model partition.
         :param fetch_constants_to_sram: Add microcode that fetches weights from external memory.
         This allows running models which do not fit into SRAM. Applies to Neutron-C only (microcontrollers).
+        :param use_profiling: Use profiling for neutron delegated model.
 
         :return: TFLite model with Neutron microcode as bytes.
@@ -102,6 +113,7 @@ def convert(
             "excludeGraphPasses": "HoistSliceAboveTranspose,MergeTranspose",
             "fetchConstantsToSRAM": fetch_constants_to_sram,
             "dumpKernelSelectionCode": self.dump_kernel_selection_code,
+            "useProfiling": use_profiling,
         }
 
         # Try to use multiprocessing for isolation, but fall back to direct execution
diff --git a/backends/nxp/backend/neutron_map.py b/backends/nxp/backend/neutron_map.py
new file mode 100644
index 00000000000..e2da653daa3
--- /dev/null
+++ b/backends/nxp/backend/neutron_map.py
@@ -0,0 +1,457 @@
+# Copyright 2026 NXP
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+import logging
+import re
+from dataclasses import dataclass
+
+# example:  Type: CONV_2D
+#               Inputs:
+#                 [0]: quantized_decomposed_quantize_per_tensor_default_4
+#                 [1]: quantized_decomposed_dequantize_per_channel_default_2
+#               Outputs:
+#                 [0]: quantized_decomposed_quantize_per_tensor_default_5
+#               Location: 4
+PATTERN_NODE = (
+    r"Type:\s+(?P<type>\w+)\s+"
+    r"Inputs:(?P<inputs>[\s\S]*?)"
+    r"Outputs:(?P<outputs>[\s\S]*?)"
+    r"Location:\s+(?P<location>\d+)"
+)
+# The pattern is very similar to operator pattern
+PATTERN_SUBGRAPH = (
+    r"^(?P<num>\d+)\s*"
+    r"Inputs:(?P<inputs>[\s\S]*?)"
+    r"Outputs:(?P<outputs>[\s\S]*?)"
+    r"Tensors:"
+)
+# example:  [0]: quantized_decomposed_quantize_per_tensor_default_4
+PATTERN_IO_TENSOR_NAME = r"\[\d+\]:\s+(?P<name>[\S]+)"
+# example: Statistics for NeutronGraph "subgraph_195":
+PATTERN_GRAPH = r"Statistics for NeutronGraph \"subgraph_(?P<num>\d+)\":"
+# example:      NeutronOperator "subgraph_001":
+#                       Operators:
+#                           PAD
+#                           CONV_2D
+#                       Kernels:
+#                           Pad
+#                           Conv2DStandardV2
+#               NeutronOperator "subgraph_002":
+PATTERN_VERBOSE_KERNELS = (
+    r"\"subgraph_(?P<subgraph>\d+)\"\:\s*"
+    r"Operators:[\s\S]*?"
+    r"Kernels:\s*(?P<kernels>[\s\S]*?)"
+    r"\s*(NeutronOperator|^$|=)"  # NOTE(review): "^$" only matches a blank line under re.MULTILINE - confirm
+)
+# example:  NeutronGraph "subgraph_074":
+PATTERN_VERBOSE_GRAPH = (
+    r"NeutronGraph\s*\"subgraph_(?P<subgraph>\d+)\":(?P<operators>[\s\S]*?)\s*(^$|=)"
+)
+# Two graphs are expected in the input log: original and converted.
+EXPECTED_GRAPHS = 2
+# List of single-input nodes that shouldn't be mapped on the same TFLite node.
+SINGLE_INPUT_NODES = [
+    "ABS",
+    "AVERAGE_POOL_2D",
+    "CAST",
+    "EXP",
+    "HARD_SWISH",
+    "LEAKY_RELU",
+    "LOG",
+    "LOGISTIC",
+    "MAX_POOL_2D",
+    "QUANTIZE",
+    "RSQRT",
+    "TANH",
+]
+
+
+@dataclass
+class Node:
+    name: str  # Operator type as printed in the dump (e.g. CONV_2D).
+    inputs: list[str]  # Names of the node's input tensors.
+    outputs: list[str]  # Names of the node's output tensors.
+    location: int  # Value of the node's "Location:" field in the dump.
+
+
+@dataclass
+class SubgraphInfo:
+    num: int  # Subgraph number (the digits following "subgraph_").
+    location: int  # Location in the final Neutron graph; created as -1 and filled in later.
+    inputs: list[str]  # Names of the subgraph's input tensors.
+    outputs: list[str]  # Names of the subgraph's output tensors.
+    kernels: int  # Number of Neutron kernels in the subgraph; created as 0 and filled in later.
+    nodes: list[Node]  # List of TFLite nodes in the Neutron subgraph.
+
+
+def get_tensors_name(tensors: str) -> list[str]:
+    """Return every tensor name found in `tensors`, one per "[i]: name" entry."""
+    return re.findall(PATTERN_IO_TENSOR_NAME, tensors)
+
+
+class NeutronMap:
+    """Mapping between Neutron, TFLite, and Edge operators based on the Neutron converter log.
+
+    Parses the Neutron converter log to extract information about TFLite nodes and Neutron subgraphs.
+    Maps TFLite operators to corresponding Neutron operators.
+    Maps Edge operators to Neutron operators via the Edge-to-TFLite mapping.
+
+    Attributes:
+        tflite_nodes (list[Node]): TFLite node information extracted from the converter log.
+        neutron_subgraphs (list[SubgraphInfo]): Neutron subgraph information extracted from the converter log.
+        neutron_graphs (list[int]): Indices of final Neutron graphs derived from neutron_subgraphs.
+        edge_to_tflite_map (dict[int, tuple[int, ...]]): Mapping from Edge operators to TFLite operators.
+        edge_to_neutron_map (dict[int, tuple[int, ...]]): Mapping from Edge operators to Neutron operators.
+        tflite_to_neutron_map (dict[int, tuple[int, ...]]): Mapping from TFLite operators to Neutron operators.
+
+    Example:
+        >>> map = NeutronMap(log_output, edge_to_tflite_map)
+        >>> neutron_to_edge_map = map.get_neutron_to_edge_map()
+    """
+
+    tflite_nodes: list[Node]
+    neutron_subgraphs: list[SubgraphInfo]
+    neutron_graphs: list[int]
+    edge_to_tflite_map: dict[int, tuple[int, ...]]
+    edge_to_neutron_map: dict[int, tuple[int, ...]]
+    tflite_to_neutron_map: dict[int, tuple[int, ...]]
+
+    def __init__(
+        self, neutron_converter_log: str, edge_to_tflite_map: dict[int, tuple[int, ...]]
+    ) -> None:
+        """Initialize neutron map from neutron converter log.
+
+        :param neutron_converter_log: Neutron converter log obtained during model conversion. It should contain the
+            original TFLite graph dump and the Neutron graph dump (dumpAfterImport/dumpAfterGenerate set to "console").
+        :param edge_to_tflite_map: Mapping from Edge operators to TFLite operators; stored as given.
+        """
+        super().__init__()
+        self.tflite_nodes = []
+        self.neutron_subgraphs = []
+        self.neutron_graphs = []
+        self.edge_to_tflite_map = edge_to_tflite_map
+        self.tflite_to_neutron_map = {}
+        self.edge_to_neutron_map = {}
+        self.neutron_kernels_num = 0
+        self._split_profiling_log(neutron_converter_log)
+
+    def _split_profiling_log(self, log: str) -> None:
+        """Process profiling log to split it into original TFLite and converted Neutron nodes.
+
+        :param log: Neutron converter log obtained during model conversion, containing the original
+            TFLite graph and Neutron graph dump.
+        :return: None. Sets tflite_nodes and neutron_subgraphs; returns early without setting them on an unexpected log.
+        """
+        graphs = log.split("Graphs:")
+        # The log must contain exactly two "Graphs:" sections; anything else is silently ignored.
+        if len(graphs) != EXPECTED_GRAPHS + 1:
+            return
+        optimization_dump, neutron_graph_dump = graphs[1:]
+
+        # The TFLite model dump is everything before the "= Optimize Graph =" marker.
+        tflite_graph_dump = optimization_dump.partition("= Optimize Graph =")[0]
+
+        # Get verbose Neutron graphs located in the Extract Graphs section.
+        extracted_graph_dump = optimization_dump.partition("= Extract Graphs =")[
+            2
+        ].partition("Generate code for NeutronGraph")[0]
+
+        # Get list of original operators from first dumped graph.
+        self.tflite_nodes = [
+            Node(
+                matched_operator.group("type"),
+                get_tensors_name(matched_operator.group("inputs")),
+                get_tensors_name(matched_operator.group("outputs")),
+                int(matched_operator.group("location")),
+            )
+            for matched_operator in re.finditer(PATTERN_NODE, tflite_graph_dump)
+        ]
+        # Get list of neutron subgraphs; the verbose info is only merged in when some were found.
+        self.neutron_subgraphs = self._get_neutron_subgraphs(neutron_graph_dump)
+        if self.neutron_subgraphs:
+            self._update_neutron_subgraphs_info(extracted_graph_dump)
+
+    def _get_neutron_subgraphs(self, graph_dump: str) -> list[SubgraphInfo]:
+        """Parse Neutron graph dump and extract subgraph information (also fills self.neutron_graphs).
+
+        :param graph_dump: String containing the Neutron graph dump from the converter log.
+        :return: List of SubgraphInfo objects containing subgraph metadata and operator nodes.
+        """
+
+        def get_subgraph_nodes(subgraph_dump: str) -> list[Node]:
+            """Parse subgraph dump and extract operator nodes.
+
+            :param subgraph_dump: String containing a single Neutron subgraph definition.
+            :return: List of Node objects representing operators in the subgraph.
+            """
+            return [
+                Node(
+                    matched_operator.group("type"),
+                    get_tensors_name(matched_operator.group("inputs")),
+                    get_tensors_name(matched_operator.group("outputs")),
+                    int(matched_operator.group("location")),
+                )
+                for matched_operator in re.finditer(PATTERN_NODE, subgraph_dump)
+            ]
+
+        subgraphs = graph_dump.split(r"Name: subgraph_")
+        if len(subgraphs) < 3:
+            return []
+
+        # Get numbers of final neutron graphs from the text after the last subgraph header.
+        self.neutron_graphs = [
+            int(matched_graphs.group("num"))
+            for matched_graphs in re.finditer(PATTERN_GRAPH, subgraphs[-1])
+        ]
+        if not self.neutron_graphs:
+            return []
+
+        # Build a SubgraphInfo for every chunk that starts with a parsable subgraph header.
+        neutron_subgraphs: list[SubgraphInfo] = []
+        for subgraph in subgraphs[1:]:
+            subgraph_match = re.search(PATTERN_SUBGRAPH, subgraph)
+            if not subgraph_match:
+                continue
+            neutron_subgraph = SubgraphInfo(
+                int(subgraph_match.group("num")),
+                -1,  # location placeholder
+                get_tensors_name(subgraph_match.group("inputs")),
+                get_tensors_name(subgraph_match.group("outputs")),
+                0,  # kernels placeholder
+                get_subgraph_nodes(subgraph),
+            )
+            neutron_subgraphs.append(neutron_subgraph)
+        return neutron_subgraphs
+
+    def _update_neutron_subgraphs_info(self, extracted_graph: str) -> None:
+        """Update Neutron subgraphs with verbose info.
+
+        - Set numbers of Neutron kernels in each Neutron subgraph. 99% of subgraphs contain only one Neutron kernel,
+        but there are some exceptions and some subgraphs can have more kernels. This number can be taken from
+        final Neutron graph info.
+        - Set Neutron subgraphs location in the final Neutron Graph. The function updates the location parameter
+        for each Neutron subgraph according to its position in the final Neutron graph. Location is calculated
+        continuously across all Neutron graphs in the model. Non-Neutron operators are skipped.
+
+        :param extracted_graph: verbose Neutron graph dump.
+        """
+        # Split the dump on every "NeutronGraph" occurrence and process the chunks in order.
+        neutron_graphs = extracted_graph.split("NeutronGraph")
+        location_shift = 0
+        for neutron_graph in neutron_graphs:
+
+            subgraph_nodes = {
+                int(matched_subgraph.group("subgraph")): {
+                    "location": i + location_shift,
+                    "kernels": [
+                        kernel.replace(" ", "")
+                        for kernel in matched_subgraph.group("kernels").split("\n")
+                        if kernel.strip()
+                    ],
+                }
+                for i, matched_subgraph in enumerate(
+                    re.finditer(PATTERN_VERBOSE_KERNELS, neutron_graph)
+                )
+            }
+            if not subgraph_nodes:
+                continue
+            # Shift the locations of the next chunk by the number of subgraphs matched in this one.
+            location_shift += len(subgraph_nodes)
+
+            # First "subgraph_N" in the chunk; presumably the number of the Neutron graph itself.
+            graph_num = -1
+            matched_graph = re.search(r"subgraph_(?P<subgraph>\d+)", neutron_graph)
+            if matched_graph:
+                graph_num = int(matched_graph.group("subgraph"))
+
+            # Matched subgraphs get their kernel count and location; the graph_num entry gets the kernel total.
+            for subgraph in self.neutron_subgraphs:
+                if subgraph.num in subgraph_nodes:
+                    subgraph.kernels = len(subgraph_nodes[subgraph.num]["kernels"])
+                    subgraph.location = subgraph_nodes[subgraph.num]["location"]
+                elif subgraph.num == graph_num:
+                    subgraph.kernels = sum(
+                        len(s["kernels"]) for s in subgraph_nodes.values()
+                    )
+                    self.neutron_kernels_num += subgraph.kernels
+
+    def _nodes_match_by_io(self, tf_node: Node, neutron_node: Node) -> bool:
+        """
+        Determine whether a TFLite node can be mapped to a Neutron node
+        based on their input and output compatibility.
+
+        :param tf_node: Source TFLite node.
+        :param neutron_node: Target Neutron node.
+        :return: True if the nodes can be considered mapped, False otherwise.
+        """
+
+        def get_name_matches(tf_names: list[str], neutron_names: list[str]) -> int:
+            # Count how many names from tf_names have a corresponding match in
+            # neutron_names. A match is defined as:
+            #   - exact equality, or
+            #   - one name followed by "/" occurring anywhere inside the other
+            #     (a substring test, e.g. "conv" matches "conv/extra").
+            result = 0
+            for tf_name in tf_names:
+                # Determine if the tensor name corresponds to a special operation input.
+                # Matches names like "perm0", "perm1", etc. used by Transpose ops,
+                # and names like "padding0", "padding1", etc. used by Pad ops.
+                special_op = (
+                    "permutation"
+                    if re.fullmatch(r"perm(\d+)?", tf_name)
+                    else (
+                        "padding"
+                        if re.fullmatch(r"padding(s)?(\d+)?", tf_name)
+                        else None
+                    )
+                )
+                for neutron_name in neutron_names:
+                    if (
+                        neutron_name == tf_name
+                        or neutron_name + "/" in tf_name
+                        or tf_name + "/" in neutron_name
+                    ):
+                        result += 1
+                        break
+
+                    # Check if the neutron input is also the special op (Pad or Transpose)
+                    if special_op and special_op in neutron_name:
+                        result += 1
+                        break
+            return result
+
+        name_matches = get_name_matches(tf_node.inputs, neutron_node.inputs)
+        # Map the node if all TFLite inputs match Neutron inputs.
+        # Note: the Neutron node may still have additional extra inputs.
+        if name_matches == len(tf_node.inputs):
+            return True
+        elif name_matches == len(tf_node.inputs) - 1:
+            # If there is only one unmatched input, check matching of outputs.
+            name_matches = get_name_matches(tf_node.outputs, neutron_node.outputs)
+            if name_matches == len(tf_node.outputs):
+                # Map the node if all TFLite outputs match Neutron outputs.
+                return True
+        return False
+
+    def get_tflite_to_neutron_map(self) -> dict[int, tuple[int, ...]]:
+        """Build and cache the TFLite-node to Neutron-subgraph mapping.
+
+        Nodes are paired by `_nodes_match_by_io`, i.e. by input and output tensor names (Neutron
+        names may equal the TFLite ones or use the "tflite_input/additional_name" format). A subgraph
+        is listed once per TFLite node and is skipped when its `num` is in `self.neutron_graphs`.
+
+        :return: Dictionary mapping TFLite node indices to tuple of Neutron subgraph indices.
+        """
+        mapping = {}
+        for tflite_index, tflite_node in enumerate(self.tflite_nodes):
+            locations = []
+            for candidate in self.neutron_subgraphs:
+                skip = (
+                    candidate.num in self.neutron_graphs
+                    or candidate.location in locations
+                )
+                if not skip and any(
+                    self._nodes_match_by_io(tflite_node, node)
+                    for node in candidate.nodes
+                ):
+                    locations.append(candidate.location)
+            # Parallel single-input nodes reading the same tensor may all match; narrow them down.
+            locations = self._filter_single_input_nodes(tflite_node.name, locations)
+            if locations:
+                mapping[tflite_index] = tuple(locations)
+
+        self.tflite_to_neutron_map = mapping
+        return self.tflite_to_neutron_map
+
+    def _filter_single_input_nodes(
+        self, node_name: str, subgraph_loc: list[int]
+    ) -> list[int]:
+        """
+        Resolve ambiguous matches of a single-input TFLite node.
+
+        Several parallel single-input Neutron nodes may read the same input tensor, so all of
+        them can end up matched to one TFLite node. The candidates are narrowed down only when
+        `node_name` is listed in SINGLE_INPUT_NODES and more than one subgraph was matched;
+        otherwise `subgraph_loc` is returned as is.
+
+        Among the matched subgraphs, every node listed in SINGLE_INPUT_NODES is collected. If
+        there is none, `subgraph_loc` is returned as is. Otherwise only the first subgraph
+        holding a node named exactly `node_name` is kept, or nothing if no such node exists.
+
+        :param node_name: Operation name of the current TFLite node.
+        :param subgraph_loc: List of Neutron subgraph indices whose inputs correspond to the
+                            input of the current TFLite node.
+        :return: Filtered list of Neutron subgraph indices to be mapped to the current TFLite node.
+        """
+        # Nothing to disambiguate for other operators or for a single match.
+        if node_name not in SINGLE_INPUT_NODES or len(subgraph_loc) <= 1:
+            return subgraph_loc
+
+        # (location, operation name) of every single-input node in the matched subgraphs.
+        candidates = [
+            (matched.location, node.name)
+            for matched in self.neutron_subgraphs
+            if matched.location in subgraph_loc
+            for node in matched.nodes
+            if node.name in SINGLE_INPUT_NODES
+        ]
+        if not candidates:
+            return subgraph_loc
+
+        # Keep the first candidate whose operation name equals the TFLite one.
+        for location, candidate_name in candidates:
+            if candidate_name == node_name:
+                return [location]
+        return []
+
+    def get_edge_to_neutron_map(self) -> dict[int, tuple[int, ...]]:
+        """Chain the Edge-to-TFLite and TFLite-to-Neutron maps and cache the result.
+
+        :return: Dictionary mapping Edge node handles to tuple of Neutron subgraph indices.
+        """
+        self.get_tflite_to_neutron_map()  # rebuild the TFLite -> Neutron map first
+        tflite_to_neutron = self.tflite_to_neutron_map
+        combined = {}
+        for handle, tflite_ids in self.edge_to_tflite_map.items():
+            # A set merges subgraphs reached through several TFLite nodes.
+            reached = set()
+            for tflite_id in tflite_ids:
+                reached.update(tflite_to_neutron.get(tflite_id, ()))
+            if reached:
+                combined[handle] = tuple(reached)
+
+        self.edge_to_neutron_map = combined
+        return self.edge_to_neutron_map
+
+    def get_neutron_to_edge_map(self) -> dict[int, tuple[int, ...]]:
+        """
+        Invert the edge-to-neutron map, building that map first if it is empty.
+
+        The result has one entry per index from 0 to `self.neutron_kernels_num` inclusive;
+        indices no Edge node maps to hold an empty tuple. An empty dict is returned when
+        nothing is mapped at all.
+
+        :return: Dictionary mapping neutron_index to tuple of edge_handles
+        """
+        if not self.edge_to_neutron_map:
+            self.get_edge_to_neutron_map()
+
+        handles_by_neutron = {}
+        for handle, neutron_ids in self.edge_to_neutron_map.items():
+            for neutron_id in neutron_ids:
+                handles_by_neutron.setdefault(neutron_id, []).append(handle)
+
+        # Without any mapped kernel there is nothing to report.
+        if not handles_by_neutron:
+            return {}
+
+        # Emit one entry per index in 0..neutron_kernels_num (inclusive); indices that
+        # were not mapped get an empty tuple. The extra last index is reserved for
+        # the Neutron Dump event.
+        dense_map = {
+            idx: tuple(handles_by_neutron.get(idx, ()))
+            for idx in range(self.neutron_kernels_num + 1)
+        }
+        logging.info(f"Neutron to Edge map was created: {dense_map}")
+        return dense_map
diff --git a/backends/nxp/nxp_backend.py b/backends/nxp/nxp_backend.py
index 1a84a418e92..ee711c34369 100644
--- a/backends/nxp/nxp_backend.py
+++ b/backends/nxp/nxp_backend.py
@@ -11,6 +11,8 @@
 import logging
 import os
 import struct
+import tempfile
+from contextlib import contextmanager
 from typing import final
 
 import numpy as np
@@ -26,6 +28,8 @@
 from executorch.backends.nxp.backend.neutron_converter_manager import (
     NeutronConverterManager,
 )
+
+from executorch.backends.nxp.backend.neutron_map import NeutronMap
 from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
 from executorch.backends.nxp.neutron_node_extraction import (
     extract_artifacts_from_neutron_node,
@@ -54,6 +58,7 @@ def __init__(self):
         self.use_neutron_for_format_conversion = True
         self.fetch_constants_to_sram = False
         self.dump_kernel_selection_code = False
+        self.use_profiling = False
 
     def _replace_colons(self, operator: str) -> str:
         """
@@ -70,6 +75,7 @@ def neutron_compile_spec(
         use_neutron_for_format_conversion: bool = True,
         fetch_constants_to_sram: bool = False,
         dump_kernel_selection_code: bool = False,
+        use_profiling: bool = False,
     ) -> "NeutronCompileSpecBuilder":
         """Generate compile spec for Neutron NPU
 
@@ -83,6 +89,7 @@ def neutron_compile_spec(
         :param fetch_constants_to_sram: If True, the Neutron Converter will insert microinstructions to prefetch weights
                                      from FLASH to SRAM. This should be used when the whole model does not fit into SRAM.
         :param dump_kernel_selection_code: Whether Neutron converter dumps kernel selection code.
+        :param use_profiling: If True, the Neutron Converter will enable profiling for the Neutron delegated model.
         :return: self for method chaining
         """
 
@@ -106,6 +113,7 @@ def neutron_compile_spec(
         self.use_neutron_for_format_conversion = use_neutron_for_format_conversion
         self.fetch_constants_to_sram = fetch_constants_to_sram
         self.dump_kernel_selection_code = dump_kernel_selection_code
+        self.use_profiling = use_profiling
 
         return self
 
@@ -135,6 +143,10 @@ def build(self):
                     "dump_kernel_selection_code",
                     f"{self.dump_kernel_selection_code}".encode(),
                 ),
+                CompileSpec(
+                    "use_profiling",
+                    f"{self.use_profiling}".encode(),
+                ),
             ]
 
         return self.compile_spec
@@ -149,6 +161,7 @@ def generate_neutron_compile_spec(
     use_neutron_for_format_conversion: bool = True,
     fetch_constants_to_sram: bool = False,
     dump_kernel_selection_code: bool = False,
+    use_profiling: bool = False,
 ) -> list[CompileSpec]:
     return (
         NeutronCompileSpecBuilder()
@@ -160,11 +173,36 @@ def generate_neutron_compile_spec(
             use_neutron_for_format_conversion=use_neutron_for_format_conversion,
             fetch_constants_to_sram=fetch_constants_to_sram,
             dump_kernel_selection_code=dump_kernel_selection_code,
+            use_profiling=use_profiling,
         )
         .build()
     )
 
 
+@contextmanager
+def capture_fd_output():
+    """Redirect OS-level stdout/stderr (fd 1 and 2) into a temporary file.
+
+    Yields the binary temporary file; read it inside the ``with`` block, because
+    the file is closed and the original descriptors are restored on exit.
+    """
+    import sys  # local import: only needed to flush the Python-level buffers
+
+    with tempfile.TemporaryFile() as tmp:
+        for stream in (sys.stdout, sys.stderr):
+            if stream is not None:
+                stream.flush()  # keep earlier buffered output out of the capture
+        saved_fds = (os.dup(1), os.dup(2))
+        try:
+            os.dup2(tmp.fileno(), 1)
+            os.dup2(tmp.fileno(), 2)
+            yield tmp  # give access to the temp file
+        finally:
+            for saved_fd, target_fd in zip(saved_fds, (1, 2)):
+                os.dup2(saved_fd, target_fd)
+                os.close(saved_fd)
+
+
 @final
 class NeutronBackend(BackendDetails):
 
@@ -185,6 +223,7 @@ def preprocess(  # noqa C901
         use_neutron_for_format_conversion = None
         fetch_constants_to_sram = False
         dump_kernel_selection_code = None
+        use_profiling = False
         for spec in compile_spec:
             if spec.key == "output_format":
                 output_format = spec.value.decode()
@@ -200,6 +239,8 @@ def preprocess(  # noqa C901
                 fetch_constants_to_sram = spec.value.decode() == "True"
             if spec.key == "dump_kernel_selection_code":
                 dump_kernel_selection_code = spec.value.decode() == "True"
+            if spec.key == "use_profiling":
+                use_profiling = spec.value.decode() == "True"
 
         # Check that the output format is set in the compile spec
         if not output_format:
@@ -229,19 +270,32 @@ def preprocess(  # noqa C901
                 if use_neutron_for_format_conversion is not None
                 else {}
             )
-            tflite_model, io_formats = EdgeProgramToIRConverter().convert_program(
+            (
+                tflite_model,
+                io_formats,
+                edge_to_tflite_map,
+            ) = EdgeProgramToIRConverter().convert_program(
                 edge_program,
                 neutron_target_spec=NeutronTargetSpec(target),
                 conversion_config=conversion_config,
                 custom_delegation_options=CustomDelegationOptions(),
             )
 
-            neutron_model = NeutronConverterManager(dump_kernel_selection_code).convert(
-                tflite_model,
-                target,
-                delegation_tag,
-                fetch_constants_to_sram,
-            )
+            with capture_fd_output() as tmp:
+                neutron_model = NeutronConverterManager(
+                    dump_kernel_selection_code
+                ).convert(
+                    tflite_model,
+                    target,
+                    delegation_tag,
+                    fetch_constants_to_sram,
+                    use_profiling,
+                )
+                tmp.seek(0)
+                log_output = tmp.read().decode()
+            # Get mapping from tflite to neutron
+            map = NeutronMap(log_output, edge_to_tflite_map)
+            neutron_to_edge_map = map.get_neutron_to_edge_map()
 
             # Dump the tflite file if intermediates_dir is set
             if intermediates_dir != "None":
@@ -265,7 +319,9 @@ def preprocess(  # noqa C901
         else:
             raise RuntimeError(f"Unknown format {output_format}")
 
-        return PreprocessResult(processed_bytes=binary)
+        return PreprocessResult(
+            processed_bytes=binary, debug_handle_map=neutron_to_edge_map
+        )
 
 
 class PayloadComposer:
diff --git a/backends/nxp/runtime/NeutronBackend.cpp b/backends/nxp/runtime/NeutronBackend.cpp
index 3ea973b7c5b..6fe0482ed89 100644
--- a/backends/nxp/runtime/NeutronBackend.cpp
+++ b/backends/nxp/runtime/NeutronBackend.cpp
@@ -10,6 +10,7 @@
 #include <executorch/runtime/backend/interface.h>
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/core/evalue.h>
+#include <executorch/runtime/core/event_tracer_hooks_delegate.h>
 #include <executorch/runtime/core/exec_aten/util/dim_order_util.h>
 
 #include "NeutronDriver.h"
@@ -25,6 +26,8 @@ namespace neutron {
 #define ALIGN_SIZE(size) \
   ((size + BUFFER_ALIGNMENT - 1) & (~(BUFFER_ALIGNMENT - 1)))
 
+#define KOPC_CALLARGS 6 // The operation for TileIR
+
 // clang-format off
 /* Header schema:
      +----------------------------+-----------------------------+------------------------+
@@ -84,6 +87,19 @@ typedef struct {
   const uint8_t* outputMap;
 } NeutronExecutorchConfig;
 
+typedef struct { // one profiling record; a start or a stop event
+  uint8_t eventCode;
+  uint8_t opCode; // compared against KOPC_CALLARGS when events are logged
+  uint8_t functionCode; // handed to the event tracer as 1-byte metadata
+  uint8_t timestampCode;
+  uint32_t time; // used as start/end time of the logged delegate events
+} NeutronSingleProfilingEvent;
+
+typedef struct { // start/stop record pair; profiling output is read as these
+  NeutronSingleProfilingEvent startEvent;
+  NeutronSingleProfilingEvent stopEvent;
+} NeutronFullProfilingEvent;
+
 #ifdef EXTERNAL_MEM
 // Neutron compute has no access to FLASH.
 // Prefetch weights from FLASH to SRAM using memcpy.
@@ -508,12 +524,11 @@ class NeutronBackend final : public PyTorchBackendInterface {
       }
     }
 
-#ifdef NEUTRON_PROFILE
-    // TODO: Use trace from BackendExecutionContext.
-    NeutronTraceConfig trace_config{.traceConfig = 0};
-    neutronSetTrace(cfg->nmh, &trace_config);
+#ifdef ET_EVENT_TRACER_ENABLED
+    // Save ticks before neutron compute to measure how much time profiling dump
+    // takes
+    et_timestamp_t start_ticks = ::executorch::runtime::pal_current_ticks();
 #endif
-
     // Run neutron compute.
     NeutronError neutronRC = neutronRunBlocking(cfg->nmh, &cfg->dcfg);
     if (neutronRC != ENONE) {
@@ -523,6 +538,11 @@ class NeutronBackend final : public PyTorchBackendInterface {
           neutronRC);
       return Error::InvalidProgram;
     }
+#ifdef ET_EVENT_TRACER_ENABLED
+    // Save ticks after neutron compute to measure how much time profiling dump
+    // takes
+    et_timestamp_t stop_ticks = ::executorch::runtime::pal_current_ticks();
+#endif
 
     // Transpose outputs.
     for (int i = 0; i < cfg->numOutputs; i++) {
@@ -558,6 +578,53 @@ class NeutronBackend final : public PyTorchBackendInterface {
         }
       }
     }
+#ifdef ET_EVENT_TRACER_ENABLED
+    // Add traced events only if model has profiling info.
+    auto profile_size = cfg->profileSize;
+    if (profile_size > 0) {
+      int events_num = static_cast<int>(profile_size / 16);
+      auto profiling_index = cfg->numOutputs + 1;
+      char* profile_info =
+          static_cast<char*>(cfg->dcfg.outputs[profiling_index]);
+      NeutronFullProfilingEvent* neutron_events =
+          (NeutronFullProfilingEvent*)profile_info;
+      executorch::runtime::EventTracer* tracer = context.event_tracer();
+      uint32_t start_time = 0;
+      int index = 0;
+      // Post log neutron events from profiling output.
+      for (int i = 0; i < events_num; i++) {
+        if (start_time == 0) {
+          start_time = neutron_events[i].startEvent.time;
+        }
+        if (neutron_events[i].stopEvent.opCode != KOPC_CALLARGS) {
+          // Only KOPC_CALLARGS events can be mapped to original .pte model.
+          continue;
+        } else {
+          event_tracer_log_profiling_delegate(
+              tracer,
+              nullptr,
+              index,
+              start_time,
+              neutron_events[i].stopEvent.time,
+              static_cast<const void*>(
+                  &neutron_events[i].startEvent.functionCode),
+              sizeof(uint8_t));
+          start_time = 0;
+          index++;
+        }
+      }
+      event_tracer_log_profiling_delegate(
+          tracer,
+          nullptr,
+          index,
+          neutron_events[events_num - 1].startEvent.time,
+          neutron_events[events_num - 1].stopEvent.time + stop_ticks -
+              start_ticks,
+          static_cast<const void*>(
+              &neutron_events[events_num - 1].startEvent.functionCode),
+          sizeof(uint8_t));
+    }
+#endif
 
     return Error::Ok;
   }
diff --git a/backends/nxp/tests/executorch_pipeline.py b/backends/nxp/tests/executorch_pipeline.py
index 44a96010593..1309e019428 100644
--- a/backends/nxp/tests/executorch_pipeline.py
+++ b/backends/nxp/tests/executorch_pipeline.py
@@ -190,6 +190,7 @@ def to_quantized_edge_program(
     use_quant_state_dict: bool = True,
     fetch_constants_to_sram: bool = False,
     dump_kernel_selection_code: bool = False,
+    use_profiling: bool = False,
     delegate_to_npu=True,
 ) -> EdgeProgramManager:
     _neutron_target_spec = NeutronTargetSpec(target)
@@ -223,6 +224,7 @@ def to_quantized_edge_program(
         use_neutron_for_format_conversion=use_neutron_for_format_conversion,
         fetch_constants_to_sram=fetch_constants_to_sram,
         dump_kernel_selection_code=dump_kernel_selection_code,
+        use_profiling=use_profiling,
     )
     post_quant_state_dict = (
         exir_program_aten__module_quant.state_dict() if use_quant_state_dict else None
@@ -244,6 +246,7 @@ def to_quantized_edge_program(
         export(exir_program_aten__module_quant, example_input, strict=True),
         transform_passes=NeutronEdgePassManager(),
         partitioner=partitioners,
+        generate_etrecord=use_profiling,
         compile_config=EdgeCompileConfig(
             _check_ir_validity=False,
             _core_aten_ops_exception_list=core_aten_ops_exception_list,
@@ -274,6 +277,7 @@ def to_quantized_executorch_program(
     use_neutron_for_format_conversion: bool = True,
     dataset_dir: str | None = None,
     delegate_to_npu=True,
+    use_profiling: bool = False,
     operators_not_to_delegate: list[str] = None,
     remove_quant_io_ops: bool = False,
 ) -> ExecutorchProgramManager:
@@ -295,6 +299,7 @@ def to_quantized_executorch_program(
         train_fn=train_fn,
         use_neutron_for_format_conversion=use_neutron_for_format_conversion,
         delegate_to_npu=delegate_to_npu,
+        use_profiling=use_profiling,
         operators_not_to_delegate=operators_not_to_delegate,
         remove_quant_io_ops=remove_quant_io_ops,
         **get_calibration_inputs_fn,
diff --git a/backends/nxp/tests/executors.py b/backends/nxp/tests/executors.py
index 319f372b5fa..94e91a31b95 100644
--- a/backends/nxp/tests/executors.py
+++ b/backends/nxp/tests/executors.py
@@ -325,7 +325,7 @@ def convert_run_compare(
 
     if tfl_model is None:
         NodeFormatInference(edge_program).identify_node_formats()
-        tfl_model, _ = EdgeProgramToIRConverter().convert_program(
+        tfl_model, *_ = EdgeProgramToIRConverter().convert_program(
             edge_program, conversion_config
         )
 
diff --git a/backends/nxp/tests/generic_tests/test_aot_example.py b/backends/nxp/tests/generic_tests/test_aot_example.py
index 893041fe372..8a1e5e49555 100644
--- a/backends/nxp/tests/generic_tests/test_aot_example.py
+++ b/backends/nxp/tests/generic_tests/test_aot_example.py
@@ -2,11 +2,13 @@
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
-
+import os
 import subprocess
 import sys
 from pathlib import Path
 
+from executorch.backends.nxp.tests.config_importer import test_config
+
 # noinspection PyProtectedMember
 from executorch.exir._serialize import _deserialize_pte_binary
 from executorch.exir.schema import DelegateCall, KernelCall
@@ -15,9 +17,8 @@
 def test_aot_example__mobilenet_v2():
     """Test that mobilenet can be lowered to Neutron backend via `aot_neutron_compile.py` and all ops are delegated."""
 
-    # Find the executorch root directory (5 levels up from this test file)
-    executorch_root = Path(__file__).parent.parent.parent.parent.parent
-    assert executorch_root.exists(), f"Executorch root not found at {executorch_root}"
+    # Set the executorch root directory.
+    executorch_root = test_config.PROJECT_DIR
 
     # Run the compilation script as a module (like run_aot_example.sh does)
     cmd = [
@@ -34,14 +35,14 @@ def test_aot_example__mobilenet_v2():
     ]
 
     # Output file will be created in executorch_root
-    pte_file = executorch_root / "mobilenetv2_nxp_delegate.pte"
+    pte_file = Path(os.path.join(executorch_root, "mobilenetv2_nxp_delegate.pte"))
 
     try:
         result = subprocess.run(
             cmd,
             capture_output=True,
             text=True,
-            timeout=300,  # 5 minute timeout just in case. On my machine, the test usually runs ~1 minute.
+            timeout=300,  # 5 minute timeout just in case. On 8-core x86 the test usually runs ~1 minute.
             cwd=str(
                 executorch_root
             ),  # Run from executorch root (like run_aot_example.sh)
@@ -95,3 +96,77 @@ def test_aot_example__mobilenet_v2():
         # Clean up the generated file
         if pte_file.exists():
             pte_file.unlink()
+
+
+def test_aot_example__mobilenet_v2__profiling():
+    """Lower mobilenet_v2 to the Neutron backend via `aot_neutron_compile.py` with profiling enabled and
+    check that the script succeeds, the model is saved, the Neutron to Edge map is logged and ETRecord exists."""
+
+    # Everything runs from, and writes into, the executorch root directory.
+    executorch_root = test_config.PROJECT_DIR
+
+    # Files the script is expected to produce.
+    pte_file = Path(
+        os.path.join(executorch_root, "mobilenetv2_nxp_delegate_profile.pte")
+    )
+    etrecord_file = Path(
+        os.path.join(executorch_root, "etrecord", "mobilenetv2_etrecord.bin")
+    )
+
+    # Invoke the compilation script as a module (like run_aot_example.sh does).
+    cmd = [
+        sys.executable,
+        "-m",
+        "examples.nxp.aot_neutron_compile",
+        "--model_name",
+        "mobilenetv2",
+        "--delegate",
+        "--quantize",
+        "--target",
+        "imxrt700",
+        "--remove-quant-io-ops",
+        "--use_channels_last_dim_order",
+        "--use_profiling",  # Generate profilable model and create ETRecord
+        "--use_random_dataset",  # Avoid downloading the dataset.
+    ]
+
+    try:
+        completed = subprocess.run(
+            cmd,
+            capture_output=True,
+            text=True,
+            timeout=300,  # 5 minute timeout just in case. On 8-core x86 the test usually runs ~1 minute.
+            cwd=str(executorch_root),
+        )
+
+        # The script must exit cleanly; include its output in the failure report.
+        failure_report = (
+            f"Script failed with return code {completed.returncode}\n"
+            f"STDOUT:\n{completed.stdout}\n"
+            f"STDERR:\n{completed.stderr}"
+        )
+        assert completed.returncode == 0, failure_report
+
+        # The delegated model has to be saved.
+        assert pte_file.exists(), f"PTE file not created at {pte_file}"
+
+        # Logs may land on either stream, so search both.
+        process_output = completed.stdout + completed.stderr
+
+        # The Neutron to Edge map is only logged when it is nonempty.
+        assert "Neutron to Edge map was created:" in process_output
+
+        # The ETRecord has to be reported and saved.
+        assert "The ETRecord for the model was saved to" in process_output
+        assert etrecord_file.exists(), f"ETRecord file not created at {etrecord_file}"
+
+    finally:
+        # Clean up the generated files.
+        if pte_file.exists():
+            pte_file.unlink()
+        if etrecord_file.exists():
+            etrecord_file.unlink()
+            # Drop the "etrecord" directory as well once it is left empty.
+            etrecord_dir = etrecord_file.parent
+            if not any(etrecord_dir.iterdir()):
+                etrecord_dir.rmdir()
diff --git a/backends/nxp/tests/generic_tests/test_move_activation_before_concatenation.py b/backends/nxp/tests/generic_tests/test_move_activation_before_concatenation.py
index 27bd675a487..6aa07dbba8d 100644
--- a/backends/nxp/tests/generic_tests/test_move_activation_before_concatenation.py
+++ b/backends/nxp/tests/generic_tests/test_move_activation_before_concatenation.py
@@ -629,7 +629,7 @@ def test_move_activation_before_concat_quantization__conv(
                 "lowered_module" in node.name for node in edge_program.graph.nodes
             )
 
-            tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value
+            tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value
             exported_program: ExportedProgram = converter_spy.calls[-1].args[0]
             input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype(
                 np.int8
@@ -668,7 +668,7 @@ def test_move_activation_before_concat_quantization__linear(
                 "lowered_module" in node.name for node in edge_program.graph.nodes
             )
 
-            tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value
+            tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value
             exported_program: ExportedProgram = converter_spy.calls[-1].args[0]
             input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype(
                 np.int8
@@ -706,7 +706,7 @@ def test_move_activation_before_concat_quantization__addmm(
                 "lowered_module" in node.name for node in edge_program.graph.nodes
             )
 
-            tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value
+            tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value
             exported_program: ExportedProgram = converter_spy.calls[-1].args[0]
             input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype(
                 np.int8
@@ -744,7 +744,7 @@ def test_move_activation_before_concat_quantization__mm(
                 "lowered_module" in node.name for node in edge_program.graph.nodes
             )
 
-            tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value
+            tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value
             exported_program: ExportedProgram = converter_spy.calls[-1].args[0]
             input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype(
                 np.int8
@@ -788,9 +788,7 @@ def test_concat_cluster_quantization__conv(
                     "lowered_module" in node.name for node in edge_program.graph.nodes
                 )
 
-                tflite_flatbuffers_model, io_formats = converter_spy.calls[
-                    -1
-                ].return_value
+                tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value
                 exported_program: ExportedProgram = converter_spy.calls[-1].args[0]
                 exir_program_aten_quant: GraphModule = quantizer_spy.calls[
                     -1
@@ -861,9 +859,7 @@ def test_concat_cluster_quantization__linear(
                     "lowered_module" in node.name for node in edge_program.graph.nodes
                 )
 
-                tflite_flatbuffers_model, io_formats = converter_spy.calls[
-                    -1
-                ].return_value
+                tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value
                 exported_program: ExportedProgram = converter_spy.calls[-1].args[0]
                 exir_program_aten_quant: GraphModule = quantizer_spy.calls[
                     -1
diff --git a/backends/nxp/tests/generic_tests/test_neutron_backend_executor.py b/backends/nxp/tests/generic_tests/test_neutron_backend_executor.py
index 8cf7dfe3dc2..52654a482b9 100644
--- a/backends/nxp/tests/generic_tests/test_neutron_backend_executor.py
+++ b/backends/nxp/tests/generic_tests/test_neutron_backend_executor.py
@@ -37,7 +37,7 @@ def test_lowered_program_and_tflite_output_match__conv2d__no_bias(mocker):
     )
 
     # Capture generated model
-    tflite_flatbuffers_model, io_formats = converter_spy.spy_return
+    tflite_flatbuffers_model, *_ = converter_spy.spy_return
 
     tflite_model = Model.GetRootAs(tflite_flatbuffers_model)
     sub_graph = tflite_model.Subgraphs(0)
@@ -84,7 +84,7 @@ def test_conv_fc__lowered_program_and_tflite_output_match(mocker):
     exported_program: ExportedProgram = converter_spy.call_args.args[1]
 
     # Capture generated model
-    tflite_flatbuffers_model, _ = converter_spy.spy_return
+    tflite_flatbuffers_model, *_ = converter_spy.spy_return
 
     # No Transpose ops in produced TFLite model
     tflite_subgraph = Model.GetRootAs(tflite_flatbuffers_model).Subgraphs(0)
@@ -148,7 +148,7 @@ def test_delegating_format_related_transpose_operators__supported_case(mocker):
     )
 
     # Capture the converted IR model.
-    tflite_flatbuffers_model, _ = converter_spy.spy_return
+    tflite_flatbuffers_model, *_ = converter_spy.spy_return
 
     # Make sure the `Transpose` ops ARE in the IR model.
     tflite_subgraph = Model.GetRootAs(tflite_flatbuffers_model).Subgraphs(0)
diff --git a/backends/nxp/tests/generic_tests/test_neutron_converter_manager.py b/backends/nxp/tests/generic_tests/test_neutron_converter_manager.py
index 0705203db06..359dfdb67e9 100644
--- a/backends/nxp/tests/generic_tests/test_neutron_converter_manager.py
+++ b/backends/nxp/tests/generic_tests/test_neutron_converter_manager.py
@@ -28,7 +28,7 @@ def test_conv2d_neutron_conversion():
 
     NodeFormatInference(edge_program_manager.exported_program()).identify_node_formats()
     edge_program_converter = EdgeProgramToIRConverter()
-    tflite_model, _ = edge_program_converter.convert_program(
+    tflite_model, *_ = edge_program_converter.convert_program(
         edge_program_manager.exported_program()
     )
 
diff --git a/backends/nxp/tests/generic_tests/test_per_channel_conversion.py b/backends/nxp/tests/generic_tests/test_per_channel_conversion.py
index 706d8ed3e14..af9ef08057b 100644
--- a/backends/nxp/tests/generic_tests/test_per_channel_conversion.py
+++ b/backends/nxp/tests/generic_tests/test_per_channel_conversion.py
@@ -153,7 +153,7 @@ def test_per_channel_convolution(self, _, use_qat: bool):
                 use_neutron_for_format_conversion=False,
             )
 
-            tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value
+            tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value
             exported_program: ExportedProgram = converter_spy.calls[-1].args[0]
 
             input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype(
diff --git a/backends/nxp/tests/generic_tests/test_profiling.py b/backends/nxp/tests/generic_tests/test_profiling.py
new file mode 100644
index 00000000000..c922eb070c3
--- /dev/null
+++ b/backends/nxp/tests/generic_tests/test_profiling.py
@@ -0,0 +1,158 @@
+# Copyright 2026 NXP
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+import ast
+import logging
+import re
+
+import numpy as np
+import pytest
+import torch
+from executorch.backends.nxp.tests.graph_verifier import BaseGraphVerifier
+from executorch.backends.nxp.tests.model_output_comparator import (
+    NumericalStatsOutputComparator,
+)
+
+from executorch.backends.nxp.tests.models import AvgPool2dModule, SoftmaxModule
+from executorch.backends.nxp.tests.nsys_testing import lower_run_compare
+
+from executorch.examples.nxp.experimental.cifar_net.cifar_net import CifarNetModel
+
+
+@pytest.fixture(autouse=True)
+def reseed_model_per_test_run():
+    torch.manual_seed(23)
+    np.random.seed(23)
+
+
+PATTERN_NEUTRON_MAP = r"Neutron to Edge map was created: (\{.*\})"
+
+
+def extract_map_from_logs(caplog):
+    for record in caplog.records:
+        msg = record.getMessage()
+        neutron_map_match = re.search(PATTERN_NEUTRON_MAP, msg)
+        if neutron_map_match:
+            dict_str = neutron_map_match.group(1)
+            return ast.literal_eval(dict_str)
+    return None
+
+
+class ParallelPoolModel(torch.nn.Module):
+    def __init__(self, channels: int):
+        super().__init__()
+        self.conv_in = torch.nn.Conv2d(channels, channels, kernel_size=3, padding=1)
+        self.max_pool2d = torch.nn.MaxPool2d(kernel_size=2, stride=2)
+        self.avg_pool2d = torch.nn.AvgPool2d(kernel_size=2, stride=2)
+        self.conv_out = torch.nn.Conv2d(2 * channels, channels, kernel_size=1)
+
+    def forward(self, x):
+        x = self.conv_in(x)
+        x = torch.cat((self.max_pool2d(x), self.avg_pool2d(x)), dim=1)
+        x = self.conv_out(x)
+        return x
+
+
+class TestProfiling:
+    @pytest.mark.xfail(reason="SoftMax support PR is not merged so far.", strict=True)
+    def test__softmax(self, caplog, request):
+        caplog.set_level(logging.INFO)
+        model = SoftmaxModule(-1)
+        lower_run_compare(
+            model,
+            (10,),
+            dlg_model_verifier=BaseGraphVerifier(1, []),
+            request=request,
+            use_profiling=True,
+            output_comparator=NumericalStatsOutputComparator(),
+        )
+
+        # Neutron map for 1D Softmax with input size 10 should contain 4 nodes:
+        # 3 Neutron kernels (pad, softmax, and slice) and 1 unmapped node used for profiling dump
+        neutron_map = extract_map_from_logs(caplog)
+        assert neutron_map == {
+            0: (2,),  # Pad
+            1: (2,),  # Softmax
+            2: (2,),  # Slice
+            3: (),  # Neutron Dump
+        }
+
+    def test__parallel_pool(self, caplog, request):
+        caplog.set_level(logging.INFO)
+        input_shape = (1, 3, 32, 32)
+        model = ParallelPoolModel(input_shape[1])
+        lower_run_compare(
+            model,
+            input_shape,
+            dlg_model_verifier=BaseGraphVerifier(1, []),
+            request=request,
+            output_comparator=NumericalStatsOutputComparator(),
+            use_neutron_for_format_conversion=False,
+            use_profiling=True,
+        )
+        neutron_map = extract_map_from_logs(caplog)
+        assert neutron_map == {
+            0: (6,),  # Conv2DStandardV2
+            1: (),  # Conv2DDepthwiseV2 (AvgPool)
+            2: (7,),  # MaxPool
+            3: (),  # TransposeCHW
+            4: (),  # TransposeCHW
+            5: (),  # TransposeCHW
+            6: (),  # Slice
+            7: (),  # Pad
+            8: (),  # Conv2DPointwise
+            9: (),  # Slice
+            10: (),  # Neutron Dump
+        }
+
+    @pytest.mark.xfail(reason="SoftMax support PR is not merged so far.", strict=True)
+    def test__cifar(self, caplog, request):
+        caplog.set_level(logging.INFO)
+        input_shape = (1, 3, 32, 32)
+        model = CifarNetModel()
+        lower_run_compare(
+            model,
+            input_shape,
+            dlg_model_verifier=BaseGraphVerifier(1, []),
+            request=request,
+            output_comparator=NumericalStatsOutputComparator(),
+            use_neutron_for_format_conversion=False,
+            use_profiling=True,
+        )
+        neutron_map = extract_map_from_logs(caplog)
+        assert neutron_map == {
+            0: (10,),  # Pad
+            1: (10, 11),  # Conv2DStandardV1 (Pad + Conv2d)
+            2: (12,),  # MaxPool
+            3: (13, 14),  # Conv2DStandardV1 (Pad + Conv2d)
+            4: (15,),  # MaxPool
+            5: (16, 17),  # Conv2DStandardV1 (Pad + Conv2d)
+            6: (18,),  # MaxPool
+            7: (20,),  # FullyConnected
+            8: (21,),  # Pad
+            9: (21,),  # Softmax
+            10: (21,),  # Slice
+            11: (),  # Neutron Dump
+        }
+
+    def test__avg_pool(self, caplog, request):
+        caplog.set_level(logging.INFO)
+        input_shape = (2, 9, 6, 15)
+        model = AvgPool2dModule(False, 0)
+        lower_run_compare(
+            model,
+            input_shape,
+            dlg_model_verifier=BaseGraphVerifier(1, []),
+            request=request,
+            output_comparator=NumericalStatsOutputComparator(),
+            use_neutron_for_format_conversion=False,
+            use_profiling=True,
+        )
+        neutron_map = extract_map_from_logs(caplog)
+        assert neutron_map == {
+            0: (2,),  # Pad
+            1: (2,),  # Conv2DDepthwiseDense
+            2: (2,),  # Slice
+            3: (),  # Neutron Dump
+        }
diff --git a/backends/nxp/tests/generic_tests/test_quantizer.py b/backends/nxp/tests/generic_tests/test_quantizer.py
index 3c23241e01e..6180d2fd9ae 100644
--- a/backends/nxp/tests/generic_tests/test_quantizer.py
+++ b/backends/nxp/tests/generic_tests/test_quantizer.py
@@ -432,7 +432,7 @@ def test_quantizer__linear_w_activation(mocker, activation, inplace, use_qat):
     )
     assert any("lowered_module" in node.name for node in edge_program.graph.nodes)
 
-    tflite_flatbuffers_model, io_formats = converter_spy.spy_return
+    tflite_flatbuffers_model, *_ = converter_spy.spy_return
     exported_program: ExportedProgram = converter_spy.call_args.args[1]
     exir_program_aten_quant: GraphModule = quantizer_spy.spy_return
 
@@ -477,7 +477,7 @@ def test_quantizer__addmm_w_activation(mocker, activation, inplace, use_qat):
     )
     assert any("lowered_module" in node.name for node in edge_program.graph.nodes)
 
-    tflite_flatbuffers_model, io_formats = converter_spy.spy_return
+    tflite_flatbuffers_model, *_ = converter_spy.spy_return
     exported_program: ExportedProgram = converter_spy.call_args.args[1]
     exir_program_aten_quant: GraphModule = quantizer_spy.spy_return
 
@@ -522,7 +522,7 @@ def test_quantizer__mm_w_activation(mocker, activation, inplace, use_qat):
     )
     assert any("lowered_module" in node.name for node in edge_program.graph.nodes)
 
-    tflite_flatbuffers_model, io_formats = converter_spy.spy_return
+    tflite_flatbuffers_model, *_ = converter_spy.spy_return
     exported_program: ExportedProgram = converter_spy.call_args.args[1]
     exir_program_aten_quant: GraphModule = quantizer_spy.spy_return
 
@@ -567,7 +567,7 @@ def test_quantizer__conv_w_activation(mocker, activation, inplace, use_qat):
     )
     assert any("lowered_module" in node.name for node in edge_program.graph.nodes)
 
-    tflite_flatbuffers_model, io_formats = converter_spy.spy_return
+    tflite_flatbuffers_model, *_ = converter_spy.spy_return
     exported_program: ExportedProgram = converter_spy.call_args.args[1]
     exir_program_aten_quant: GraphModule = quantizer_spy.spy_return
 
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_addmm_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_addmm_converter.py
index a8cdee41830..668deb28c96 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_addmm_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_addmm_converter.py
@@ -51,7 +51,7 @@ def test_addmm_conversion(self, _, use_qat: bool):
                 "lowered_module" in node.name for node in edge_program.graph.nodes
             )
 
-            tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value
+            tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value
             exported_program: ExportedProgram = converter_spy.calls[-1].args[0]
             input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype(
                 np.int8
@@ -84,7 +84,7 @@ def test_linear_conversion__with_bias(self, _, use_qat: bool):
                 "lowered_module" in node.name for node in edge_program.graph.nodes
             )
 
-            tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value
+            tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value
             exported_program: ExportedProgram = converter_spy.calls[-1].args[0]
             input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype(
                 np.int8
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_bmm_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_bmm_converter.py
index dc442a4931c..466f596bf91 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_bmm_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_bmm_converter.py
@@ -59,7 +59,7 @@ def test_convert_bmm__supported(mocker, input_shape_x1, input_shape_x2, use_qat)
 
     # Verify correct behavior of the converted NeutronIR model.
     intermediate_ep = converter_spy.call_args.args[1]
-    neutron_ir_model, _ = converter_spy.spy_return
+    neutron_ir_model, *_ = converter_spy.spy_return
 
     input_data_1 = (
         np.random.random(input_shape_x1).astype(np.float32) * 256.0 - 128.0
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_clone_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_clone_converter.py
index b4b828cd4e6..5ee3db6752f 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_clone_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_clone_converter.py
@@ -182,7 +182,7 @@ def test_conv_dropout_quant(
                 use_neutron_for_format_conversion=False,
             ).exported_program()
 
-            tflite_flatbuffers_model, _ = converter_spy.calls[-1].return_value
+            tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value
             exported_program: ExportedProgram = converter_spy.calls[-1].args[0]
 
             assert not graph_contains_any(
@@ -241,7 +241,7 @@ def test_clone_pool_view_copy_quant(
                 use_neutron_for_format_conversion=False,
             ).exported_program()
 
-            tflite_flatbuffers_model, _ = converter_spy.calls[-1].return_value
+            tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value
             exported_program: ExportedProgram = converter_spy.calls[-1].args[0]
 
             assert not graph_contains_any(
@@ -311,7 +311,7 @@ def test_clone__to_contiguous_format(self):
         ).identify_node_formats()
 
         # Convert to the IR.
-        converted_model, _ = EdgeProgramToIRConverter().convert_program(
+        converted_model, *_ = EdgeProgramToIRConverter().convert_program(
             edge_program_manager.exported_program()
         )
 
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_conv_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_conv_converter.py
index 828647d2113..7105514514a 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_conv_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_conv_converter.py
@@ -177,7 +177,7 @@ def test_conv2d_quant_conversion(mocker, model: torch.nn.Module, input_shape, us
     )
 
     # Capture generated model
-    tflite_flatbuffers_model, io_formats = converter_spy.spy_return
+    tflite_flatbuffers_model, *_ = converter_spy.spy_return
 
     # Capture converted program
     exported_program: ExportedProgram = converter_spy.call_args.args[1]
@@ -367,7 +367,7 @@ def test_conv_transpose2d_conversion__quantized(
     assert any("lowered_module" in node.name for node in edge_program.graph.nodes)
 
     # Capture generated model
-    tflite_flatbuffers_model, io_formats = converter_spy.spy_return
+    tflite_flatbuffers_model, *_ = converter_spy.spy_return
 
     # Capture converted program
     exported_program: ExportedProgram = converter_spy.call_args.args[1]
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_mm_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_mm_converter.py
index 60dbfd1b215..79fffff3b78 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_mm_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_mm_converter.py
@@ -51,7 +51,7 @@ def test_mm_conversion(self, _, use_qat: bool):
                 "lowered_module" in node.name for node in edge_program.graph.nodes
             )
 
-            tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value
+            tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value
             exported_program: ExportedProgram = converter_spy.calls[-1].args[0]
             input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype(
                 np.int8
@@ -85,7 +85,7 @@ def test_linear_conversion__without_bias(self, _, use_qat: bool):
                 "lowered_module" in node.name for node in edge_program.graph.nodes
             )
 
-            tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value
+            tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value
             exported_program: ExportedProgram = converter_spy.calls[-1].args[0]
             input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype(
                 np.int8
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_neg_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_neg_converter.py
index e0fc0d85066..2e7f9035e8a 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_neg_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_neg_converter.py
@@ -74,7 +74,7 @@ def test_convert_neg(mocker, input_shape):
 
     # Verify correct behavior of the converted NeutronIR model.
     intermediate_ep = converter_spy.call_args.args[1]
-    neutron_ir_model, _ = converter_spy.spy_return
+    neutron_ir_model, *_ = converter_spy.spy_return
 
     input_data = (
         np.random.random(input_shape).astype(np.float32) * 256.0 - 128.0
@@ -105,7 +105,7 @@ def test_convert_neg__channels_last(mocker):
 
     # Verify correct behavior of the converted NeutronIR model.
     intermediate_ep = converter_spy.call_args.args[1]
-    neutron_ir_model, _ = converter_spy.spy_return
+    neutron_ir_model, *_ = converter_spy.spy_return
 
     input_data = (
         np.random.random(input_shape).astype(np.float32) * 256.0 - 128.0
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_prelu_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_prelu_converter.py
index fb25f02785a..c5c7aa55b03 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_prelu_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_prelu_converter.py
@@ -50,7 +50,7 @@ def test_prelu_with_linear_quant_conversion(mocker, input_shape):
     ).exported_program()
 
     # Capture generated entities
-    neutron_ir_model, _ = converter_spy.spy_return
+    neutron_ir_model, *_ = converter_spy.spy_return
     exported_program: ExportedProgram = converter_spy.call_args.args[1]
 
     # Check `prelu` was not decomposed into simpler edge operators
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_softmax_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_softmax_converter.py
index 2621baf18ee..00c10bd257d 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_softmax_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_softmax_converter.py
@@ -85,7 +85,7 @@ def test_softmax_delegation(input_shape, dim: int, mocker):
 
     # Verify correct behavior of the converted NeutronIR model.
     intermediate_ep = converter_spy.call_args.args[1]
-    neutron_ir_model, _ = converter_spy.spy_return
+    neutron_ir_model, *_ = converter_spy.spy_return
     input_data = random_input_data(input_shape)
 
     # Make sure the tested program contains the `softmax`, and its input has the expected rank.
@@ -121,7 +121,7 @@ def test_softmax_delegation__channel_first(input_shape, dim: int, mocker):
 
     # Verify correct behavior of the converted NeutronIR model.
     intermediate_ep = converter_spy.call_args.args[1]
-    neutron_ir_model, _ = converter_spy.spy_return
+    neutron_ir_model, *_ = converter_spy.spy_return
     input_data = random_input_data(input_shape)
 
     # Make sure the tested program contains the `softmax`.
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_view_copy_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_view_copy_converter.py
index cb5f398fa21..276b29da142 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_view_copy_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_view_copy_converter.py
@@ -265,7 +265,7 @@ def test_view_copy_w_linear_quant_conversion(mocker, input_shape, new_shape, use
     )
 
     # Capture generated model
-    tflite_flatbuffers_model, io_formats = converter_spy.spy_return
+    tflite_flatbuffers_model, *_ = converter_spy.spy_return
 
     # Capture converted program
     edge_program: ExportedProgram = converter_spy.call_args.args[1]
@@ -299,7 +299,7 @@ def test_view_w_conv_linear_quant_conversion(
     )
 
     # Capture generated model
-    tflite_flatbuffers_model, io_formats = converter_spy.spy_return
+    tflite_flatbuffers_model, *_ = converter_spy.spy_return
 
     # Capture converted program
     edge_program: ExportedProgram = converter_spy.call_args.args[1]
diff --git a/backends/nxp/tests/ir/edge_passes/test_linear_bn_fusing.py b/backends/nxp/tests/ir/edge_passes/test_linear_bn_fusing.py
index 88ea567381f..aadef8c7731 100644
--- a/backends/nxp/tests/ir/edge_passes/test_linear_bn_fusing.py
+++ b/backends/nxp/tests/ir/edge_passes/test_linear_bn_fusing.py
@@ -251,7 +251,7 @@ def test_linear_bn_full_qat_pipeline_conversion(
     assert any("lowered_module" in node.name for node in edge_program.graph.nodes)
 
     # Capture generated model
-    tflite_flatbuffers_model, _ = converter_spy.spy_return
+    tflite_flatbuffers_model, *_ = converter_spy.spy_return
 
     # Capture converted program
     exported_program: ExportedProgram = converter_spy.call_args.args[1]
diff --git a/backends/nxp/tests/nsys_testing.py b/backends/nxp/tests/nsys_testing.py
index d5ff3680f38..ef6fe9c864c 100644
--- a/backends/nxp/tests/nsys_testing.py
+++ b/backends/nxp/tests/nsys_testing.py
@@ -101,6 +101,8 @@ def _run_delegated_executorch_program(
     mocker,
     use_qat: bool = False,
     train_fn: Callable[[torch.fx.GraphModule], None] | None = None,
+    use_profiling: bool = False,
+    use_neutron_for_format_conversion=True,
     operators_not_to_delegate: list[str] = None,
     remove_quant_io_ops: bool = False,
 ) -> tuple[ExportedProgram, str]:
@@ -129,6 +131,8 @@ def wrapper(*args, **kwargs):
             delegate_to_npu=True,
             use_qat=use_qat,
             train_fn=train_fn,
+            use_profiling=use_profiling,
+            use_neutron_for_format_conversion=use_neutron_for_format_conversion,
             operators_not_to_delegate=operators_not_to_delegate,
             remove_quant_io_ops=remove_quant_io_ops,
         )
@@ -405,6 +409,8 @@ def lower_run_compare(
     reference_model: ReferenceModel = ReferenceModel.QUANTIZED_EXECUTORCH_CPP,
     use_qat: bool = False,
     train_fn: Callable[[torch.fx.GraphModule], None] | None = None,
+    use_profiling: bool = False,
+    use_neutron_for_format_conversion=True,
     operators_not_to_delegate: list[str] = None,
     remove_quant_io_ops: bool = False,
 ):
@@ -424,6 +430,10 @@ def lower_run_compare(
     :param reference_model: Version of the model which will be run to obtain reference output data.
     :param use_qat: If True, applies quantization-aware training before conversion (without the QAT training).
     :param train_fn: Train/finetune function for QAT training. Is used only when `use_qat=True`.
+    :param use_profiling: Enable profiling for neutron delegated model.
+    :param use_neutron_for_format_conversion: If True, the EdgeProgramToIRConverter will insert `Transpose` ops to
+                                                ensure that the IO matches the executorch partition, which will be
+                                                delegated to Neutron.
     :param operators_not_to_delegate: list of operators not to delegate.
     :param remove_quant_io_ops: If true, IO q-ops are removed and verification is done on quantized
         version of dataset (quantized INT8 input samples).
@@ -468,6 +478,8 @@ def lower_run_compare(
         mocker,
         use_qat=use_qat,
         train_fn=train_fn,
+        use_profiling=use_profiling,
+        use_neutron_for_format_conversion=use_neutron_for_format_conversion,
         operators_not_to_delegate=operators_not_to_delegate,
         remove_quant_io_ops=remove_quant_io_ops,
     )
diff --git a/backends/qualcomm/aot/wrappers/TARGETS b/backends/qualcomm/aot/wrappers/TARGETS
deleted file mode 100644
index 0a42614a385..00000000000
--- a/backends/qualcomm/aot/wrappers/TARGETS
+++ /dev/null
@@ -1,5 +0,0 @@
-load(":targets.bzl", "define_common_targets")
-
-oncall("executorch")
-
-define_common_targets()
diff --git a/backends/qualcomm/builders/TARGETS b/backends/qualcomm/builders/TARGETS
deleted file mode 100644
index 0a42614a385..00000000000
--- a/backends/qualcomm/builders/TARGETS
+++ /dev/null
@@ -1,5 +0,0 @@
-load(":targets.bzl", "define_common_targets")
-
-oncall("executorch")
-
-define_common_targets()
diff --git a/codegen/test/TARGETS b/codegen/test/TARGETS
deleted file mode 100644
index 1e8cc179228..00000000000
--- a/codegen/test/TARGETS
+++ /dev/null
@@ -1,8 +0,0 @@
-# Any targets that should be shared between fbcode and xplat must be defined in
-# targets.bzl. This file can contain xplat-only targets.
-
-load(":targets.bzl", "define_common_targets")
-
-oncall("executorch")
-
-define_common_targets()
diff --git a/configurations/TARGETS b/configurations/TARGETS
deleted file mode 100644
index 2341af9282f..00000000000
--- a/configurations/TARGETS
+++ /dev/null
@@ -1,8 +0,0 @@
-# Any targets that should be shared between fbcode and xplat must be defined in
-# targets.bzl. This file can contain fbcode-only targets.
-
-load(":targets.bzl", "define_common_targets")
-
-oncall("executorch")
-
-define_common_targets()
diff --git a/docs/source/_static/img/nxp/nxp-mcuxpresso-etdump.png b/docs/source/_static/img/nxp/nxp-mcuxpresso-etdump.png
new file mode 100644
index 00000000000..50ed49f57ec
Binary files /dev/null and b/docs/source/_static/img/nxp/nxp-mcuxpresso-etdump.png differ
diff --git a/docs/source/backends/nxp/nxp-overview.md b/docs/source/backends/nxp/nxp-overview.md
index 22499aea7ad..b8739046351 100644
--- a/docs/source/backends/nxp/nxp-overview.md
+++ b/docs/source/backends/nxp/nxp-overview.md
@@ -64,6 +64,8 @@ here https://www.nxp.com/design/design-center/software/eiq-ai-development-enviro
 
 **→{doc}`nxp-kernel-selection` — Neutron Firmware Kernel Selection support.**
 
+**→{doc}`nxp-profiling` — Neutron models profiling.**
+
 ```{toctree}
 :maxdepth: 2
 :hidden:
@@ -74,4 +76,5 @@ nxp-quantization
 tutorials/nxp-tutorials
 nxp-dim-order
 nxp-kernel-selection
+nxp-profiling
 ```
diff --git a/docs/source/backends/nxp/nxp-profiling.md b/docs/source/backends/nxp/nxp-profiling.md
new file mode 100644
index 00000000000..17e352e479d
--- /dev/null
+++ b/docs/source/backends/nxp/nxp-profiling.md
@@ -0,0 +1,205 @@
+# NXP eIQ Profiling Support
+
+
+The eIQ Neutron Backend is integrated with the
+[Developer Tools](https://docs.pytorch.org/executorch/stable/delegate-debugging.html)
+to provide visibility into delegated operator execution time.
+
+There are three steps required to obtain profiling results for an NXP‑delegated model:
+
+* Convert the model with profiling support enabled.
+* Generate the artifacts consumed by the Developer Tools (`ETRecord`, `ETDump`).
+* Create and run the Inspector class to consume these artifacts and print the results.
+
+---
+
+## Convert a model with the profiling support
+
+Profiling data is generated only for a **profilable** model. 
+To convert a model with profiling enabled, the `--use_profiling` flag must be set.
+
+See the `aot_neutron_compile.py` example and its
+[README](https://github.com/pytorch/executorch/blob/main/examples/nxp/README.md)
+for additional details.
+
+The following command creates a profilable `cifar10_nxp_delegate.pte` model and the corresponding `ETRecord` for the 
+**i.MX RT700** board:
+
+```bash
+python -m examples.nxp.aot_neutron_compile --quantize \
+    --delegate -m cifar10 \
+    --use_profiling
+```
+
+For installation details, see {doc}`nxp-overview`.
+
+---
+
+## Generate ETRecord (Optional)
+
+`ETRecord` is an optional artifact that contains model graphs and metadata used to link runtime profiling results 
+back to the eager model.
+
+The recommended approach is to enable `ETRecord` generation by passing `generate_etrecord=True` to export API calls.
+After export completes, retrieve the `ETRecord` using the `get_etrecord()` method, and save it using the `save()` method:
+
+### Example
+
+```python
+from executorch.devtools.etrecord import generate_etrecord
+
+# 1. Open a model and export the model to ATEN
+model = model.eval()
+exported_program = torch.export.export(model, example_inputs, strict=True)
+module = exported_program.module()
+
+# 2. Transform and lower
+compile_spec = generate_neutron_compile_spec("imxrt700")
+partitioners = (
+    [
+        NeutronPartitioner(
+            compile_spec,
+            NeutronTargetSpec(target="imxrt700"),
+            post_quantization_state_dict=module.state_dict(),
+        )
+    ]
+)
+edge_program_manager = to_edge_transform_and_lower(
+    export(module, example_inputs, strict=True),
+    transform_passes=NeutronEdgePassManager(),
+    generate_etrecord=True,
+    partitioner=partitioners,
+    compile_config=EdgeCompileConfig(
+        _core_aten_ops_exception_list=core_aten_ops_exception_list,
+    ),
+)
+
+# 3. Export to ExecuTorch program
+exec_prog = edge_program_manager.to_executorch(
+    config=ExecutorchBackendConfig(extract_delegate_segments=False)
+)
+# Save ETRecord
+exec_prog.get_etrecord().save("etrecord.bin")
+
+```
+
+### Complete Example
+
+A full implementation is available
+in [aot_neutron_compile.py](https://github.com/pytorch/executorch/blob/main/examples/nxp/aot_neutron_compile.py).
+
+The `--use_profiling` flag is used to create a **profilable** model and the corresponding `ETRecord` file  
+(see [Convert a model with the profiling support](#convert-a-model-with-the-profiling-support) for the full command).
+
+
+---
+
+## Generate ETDump
+
+
+The next step is to generate an `ETDump`. An `ETDump` contains runtime data collected during model inference execution.
+
+To generate an `ETDump`, ensure that the ExecuTorch runtime library is integrated with the Developer Tools and built 
+with the `ET_EVENT_TRACER_ENABLED` flag enabled.
+
+Only models converted with profiling support will produce an `ETDump` containing execution times for all Neutron 
+operators. Otherwise, the dump will include only the final delegate execution time.
+
+Neutron software provides a profiling mechanism that logs individual operator execution times to a dedicated runtime 
+output. This data is then used to generate post‑time events after the inference has completed.
+
+
+### Example
+
+```c
+#include <executorch/devtools/etdump/etdump_flatcc.h>
+```
+```c
+// 1. Create ETDumpGen BEFORE inference.
+auto etdump_gen_ptr = std::make_unique<executorch::etdump::ETDumpGen>();
+executorch::etdump::ETDumpGen* etdump_gen = etdump_gen_ptr.get();
+
+// 2. Load a method from the program by name with ETDump generator for profiling.
+Result<Method> method = program->load_method(method_name, &memory_manager, etdump_gen);
+
+// 3. Input tensor setup.
+Tensor::SizesType sizes[] = {1, 1, 32, 32};
+Tensor::DimOrderType dim_order[] = {0, 2, 3, 1};
+TensorImpl impl(ScalarType::Float, 4, sizes, image_data, dim_order);
+Tensor tensor(&impl);
+Error status = method->set_input(tensor, 0);
+
+// 4. Execute.
+status = method->execute();
+
+// Get ETDump.
+if (etdump_gen != nullptr) {
+    executorch::etdump::ETDumpResult result = etdump_gen->get_etdump_data();
+    if (result.buf != nullptr && result.size > 0) {
+        PRINTF("Add a breakpoint here and run this command in Debugger Console: "
+    	       "dump binary memory trace.etdump result.buf (result.buf + result.size)\r\n");
+    }
+}
+```
+
+
+To save an `ETDump` file from the board to a PC, use the **Debug Console** in the MCUXpresso IDE:
+
+- Set a breakpoint at the `PRINTF(...)` line in the example above.
+- Enter the following command in the Debug Console and press **Enter**:
+
+  ```
+  dump binary memory trace.etdump result.buf (result.buf + result.size)
+  ```
+
+
+<figure style="border:1px solid #ccc; padding:8px; display:inline-block;">
+  <img src="../../_static/img/nxp/nxp-mcuxpresso-etdump.png" width="500" alt="Save ETDump in MCUXpresso project" />
+  <figcaption>
+        <b>Figure 1:</b> Save ETDump in MCUXpresso Project.
+  </figcaption>
+</figure>
+
+
+The resulting `ETDump` file is generated in the project folder within the MCUXpresso workspace.
+
+> **Note:**  
+> Profilable models print profiling data to the terminal. Generating this dump may take longer than executing the 
+> Neutron kernels themselves, but this overhead can be ignored as it affects only models with profiling support 
+> enabled. The dump generation time is included in the `ETDump` as the final kernel entry.
+
+---
+
+## Creating an Inspector
+
+The [Inspector](https://docs.pytorch.org/executorch/1.0/model-inspector.html) APIs provide a way to analyze the 
+contents of `ETRecord` and `ETDump`, enabling developers to gain insights into model architecture 
+and performance statistics.
+
+`ETRecord` is an optional argument used to obtain a mapping between the original model and the converted Neutron model.
+
+An `ETDump` generated on the board contains metadata for each Neutron operator, including its unique identifier.  
+To visualize this metadata in the Inspector results table, set the `include_delegate_debug_data = True` argument.
+
+### Example
+
+```python
+from executorch.devtools import Inspector
+
+inspector = Inspector(etdump_path="/path/to/etdump.etdp", etrecord="/path/to/etrecord.bin")
+inspector.print_data_tabular(include_delegate_debug_data = True)
+```
+
+### Complete Example
+
+A full implementation is available
+in [analyzing_with_inspector.py](https://github.com/pytorch/executorch/blob/main/examples/nxp/analyzing_with_inspector.py). @lint-ignore
+
+---
+
+## Summary
+
+* Build the model with the `--use_profiling` flag enabled.
+* Build the ExecuTorch runtime library with the `ET_EVENT_TRACER_ENABLED` flag and the ETDump Developer Tool.
+* Use the Debug Console in MCUXpresso to save the `ETDump` file from the board to a PC.
+* Visualize the profiling results using the Inspector.
diff --git a/examples/devtools/example_runner/TARGETS b/examples/devtools/example_runner/TARGETS
deleted file mode 100644
index 2341af9282f..00000000000
--- a/examples/devtools/example_runner/TARGETS
+++ /dev/null
@@ -1,8 +0,0 @@
-# Any targets that should be shared between fbcode and xplat must be defined in
-# targets.bzl. This file can contain fbcode-only targets.
-
-load(":targets.bzl", "define_common_targets")
-
-oncall("executorch")
-
-define_common_targets()
diff --git a/examples/models/gemma4/BUCK b/examples/models/gemma4/BUCK
index e587370ece0..19f0ff90c93 100644
--- a/examples/models/gemma4/BUCK
+++ b/examples/models/gemma4/BUCK
@@ -1,4 +1,5 @@
 load("@fbcode_macros//build_defs:build_file_migration.bzl", "fbcode_target", "non_fbcode_target")
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
 load(":targets.bzl", "define_common_targets")
 
 oncall("executorch")
@@ -6,3 +7,122 @@ oncall("executorch")
 non_fbcode_target(_kind = define_common_targets,)
 
 fbcode_target(_kind = define_common_targets,)
+
+# Text decoder module
+fbcode_target(_kind = runtime.python_library,
+    name = "text_decoder",
+    srcs = [
+        "text_decoder/__init__.py",
+        "text_decoder/convert_weights.py",
+        "text_decoder/gemma4_attention.py",
+        "text_decoder/gemma4_config.py",
+        "text_decoder/gemma4_cross_decoder.py",
+        "text_decoder/gemma4_decoder_layer.py",
+        "text_decoder/gemma4_model.py",
+        "text_decoder/gemma4_self_decoder.py",
+        "text_decoder/gemma4_transformer.py",
+    ],
+    _is_external_target = True,
+    base_module = "executorch.examples.models.gemma4",
+    resources = {
+        "config/e2b_config.json": "config/e2b_config.json",
+        "config/e4b_config.json": "config/e4b_config.json",
+    },
+    deps = [
+        "//caffe2:torch",
+        "fbsource//third-party/pypi/safetensors:safetensors",
+        "fbsource//third-party/pypi/transformers:transformers",
+    ],
+    visibility = ["PUBLIC"],
+)
+
+# Speech transform module
+fbcode_target(_kind = runtime.python_library,
+    name = "speech_transform",
+    srcs = [
+        "speech_transform.py",
+    ],
+    _is_external_target = True,
+    base_module = "executorch.examples.models.gemma4",
+    deps = [
+        "//caffe2:torch",
+    ],
+    visibility = ["PUBLIC"],
+)
+
+# Export utilities (shared quantization code)
+fbcode_target(_kind = runtime.python_library,
+    name = "quant_utils",
+    srcs = ["quant_utils.py"],
+    _is_external_target = True,
+    base_module = "executorch.examples.models.gemma4",
+    deps = [
+        "//caffe2:torch",
+        "//executorch/examples/models/llama:source_transformation",
+        "//executorch/extension/llm/export:export_lib",
+        "//pytorch/ao:torchao",
+    ],
+    visibility = ["PUBLIC"],
+)
+
+# Single PTE export
+fbcode_target(_kind = runtime.python_binary,
+    name = "export_gemma4",
+    srcs = ["export_gemma4.py"],
+    main_function = "executorch.examples.models.gemma4.export_gemma4.main",
+    preload_deps = [
+        "//pytorch/ao/torchao/csrc/cpu/shared_kernels/linear_8bit_act_xbit_weight:op_linear_8bit_act_xbit_weight_aten",
+        "//pytorch/ao/torchao/csrc/cpu/shared_kernels/embedding_xbit:op_embedding_xbit_aten",
+        "//executorch/extension/llm/custom_ops:custom_ops_aot_lib",
+        "//executorch/kernels/quantized:aot_lib",
+    ],
+    deps = [
+        ":text_decoder",
+        ":speech_transform",
+        ":quant_utils",
+        "//caffe2:torch",
+        "//executorch/exir:lib",
+        "//executorch/backends/xnnpack/partition:xnnpack_partitioner",
+        "//executorch/backends/xnnpack/quantizer:xnnpack_quantizer",
+        "//executorch/extension/llm/export:export_lib",
+        "//executorch/extension/llm/custom_ops:custom_ops_aot_lib",
+        "//executorch/extension/llm/custom_ops:custom_ops_aot_py",
+        "//executorch/kernels/quantized:aot_lib",
+        "//pytorch/ao:torchao",
+        "fbsource//third-party/pypi/safetensors:safetensors",
+        "fbsource//third-party/pypi/transformers:transformers",
+    ],
+)
+
+# Image preprocessing utilities
+fbcode_target(_kind = runtime.python_library,
+    name = "image_utils",
+    srcs = ["image_utils.py"],
+    _is_external_target = True,
+    base_module = "executorch.examples.models.gemma4",
+    deps = [
+        "//caffe2:torch",
+        "fbsource//third-party/pypi/pillow:pillow",
+    ],
+    visibility = ["PUBLIC"],
+)
+
+# Python runner (single PTE, audio + vision + text-only)
+fbcode_target(_kind = runtime.python_binary,
+    name = "run_gemma4",
+    srcs = ["run_gemma4.py"],
+    main_function = "executorch.examples.models.gemma4.run_gemma4.main",
+    preload_deps = [
+        "//executorch/backends/xnnpack:xnnpack_backend",
+        "//executorch/extension/llm/custom_ops:custom_ops_aot_lib",
+        "//executorch/kernels/quantized:aot_lib",
+        "//pytorch/ao/torchao/csrc/cpu/shared_kernels/embedding_xbit:op_embedding_xbit_aten",
+        "//pytorch/ao/torchao/csrc/cpu/shared_kernels/linear_8bit_act_xbit_weight:op_linear_8bit_act_xbit_weight_aten",
+    ],
+    deps = [
+        ":image_utils",
+        "//caffe2:torch",
+        "//executorch/runtime:runtime",
+        "fbsource//third-party/pypi/sentencepiece:sentencepiece",
+    ],
+)
diff --git a/examples/models/gemma4/text_decoder/gemma4_decoder_layer.py b/examples/models/gemma4/text_decoder/gemma4_decoder_layer.py
index e10c1c7e415..fe3e3bb94cb 100644
--- a/examples/models/gemma4/text_decoder/gemma4_decoder_layer.py
+++ b/examples/models/gemma4/text_decoder/gemma4_decoder_layer.py
@@ -34,14 +34,25 @@ class Gemma4MLP(nn.Module):
 
     def __init__(self, hidden_size: int, intermediate_size: int):
         super().__init__()
+        self.intermediate_size = intermediate_size
         self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
         self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
         self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        return self.down_proj(
-            F.gelu(self.gate_proj(x), approximate="tanh") * self.up_proj(x)
-        )
+        # If a loader fused gate_proj|up_proj into one gate_up_proj (single
+        # matmul; e.g. the GGUF loader's coalesced fusion), use it and split the
+        # [.., 2*intermediate_size] output back into gate/up. Otherwise fall back
+        # to the separate projections (unfused checkpoints / non-fusing loaders).
+        gate_up = getattr(self, "gate_up_proj", None)
+        if gate_up is not None:
+            fused = gate_up(x)
+            gate = fused[..., : self.intermediate_size]
+            up = fused[..., self.intermediate_size :]
+        else:
+            gate = self.gate_proj(x)
+            up = self.up_proj(x)
+        return self.down_proj(F.gelu(gate, approximate="tanh") * up)
 
 
 class Gemma4DecoderLayer(nn.Module):
diff --git a/examples/models/gemma4_31b/gguf_loader.py b/examples/models/gemma4_31b/gguf_loader.py
index 90839ea6f6a..6a4a70ced18 100644
--- a/examples/models/gemma4_31b/gguf_loader.py
+++ b/examples/models/gemma4_31b/gguf_loader.py
@@ -104,6 +104,89 @@ def _convert_weight(model, model_key: str, gtensor, backend: str):
     return gtensor
 
 
+# ---------------------------------------------------------------------------
+# Single-point gate/up fusion (backend-agnostic, at the raw GGUF level)
+#
+# gate_proj and up_proj share the same input, so the MLP can issue ONE matmul
+# over a [2*intermediate, hidden] weight instead of two. We fuse here -- before
+# any backend conversion (_convert_weight) -- by concatenating the two raw GGUF
+# block blobs along the output (row) dim. ExportableGGUFTensor.raw is
+# (N, row_bytes) row-major with each output row self-contained, so the concat is
+# an exact row-stack (no re-quant, no scale recompute). Both CUDA and MLX then
+# pack the already-fused weight, so there is no per-backend-type concat. The
+# model's Gemma4MLP.forward splits the [.., 2*intermediate] output back into
+# gate/up only when a fused gate_up_proj is present (graceful for unfused loads).
+
+
+def _gate_up_layer_kind(model_key: str):
+    """Match ``layers.<digits>.mlp.{gate,up}_proj.weight``: return ``(layer_idx, kind)``
+    with ``layer_idx`` as int and ``kind`` in ``{"gate", "up"}``; otherwise ``None``."""
+    prefix = "layers."
+    for kind in ("gate", "up"):
+        suffix = f".mlp.{kind}_proj.weight"
+        if model_key.startswith(prefix) and model_key.endswith(suffix):
+            mid = model_key[len(prefix) : len(model_key) - len(suffix)]
+            if mid.isdigit():
+                return int(mid), kind
+    return None
+
+
+def _gate_up_fuseable(gate, up) -> bool:
+    """True iff gate/up are the same GGUF quant type and same packed row width
+    (hence same K + block layout), so a row-concat along output N is valid."""
+    return (
+        gate.ggml_type == up.ggml_type
+        and gate.raw.shape[1] == up.raw.shape[1]
+        and int(gate.shape[1]) == int(up.shape[1])
+    )
+
+
+def _fuse_gate_up_raw(gate, up):
+    """Row-concat gate|up raw GGUF blocks (gate rows first) into one fused
+    ExportableGGUFTensor of shape (2*N, K)."""
+    from executorch.extension.llm.export.gguf import ExportableGGUFTensor
+
+    fused_raw = torch.cat([gate.raw, up.raw], dim=0)
+    return ExportableGGUFTensor.from_raw(fused_raw, gate.ggml_type, gate.orig_dtype)
+
+
+def _assign_gate_up_unfused(model, layer_idx, kind, gtensor, backend, packers):
+    """Assign a single gate/up GGUF tensor to its own projection (no fusion)."""
+    from executorch.examples.models.gemma4_31b.quant import pack_one
+
+    key = f"layers.{layer_idx}.mlp.{kind}_proj.weight"
+    pack_one(model, key, _convert_weight(model, key, gtensor, backend), packers)
+
+
+def _install_and_pack_fused_gate_up(model, layer_idx, gate, up, backend, packers):
+    """Fuse gate|up at the raw level, swap the layer's MLP to a single
+    ``gate_up_proj`` (dropping gate_proj/up_proj), then pack the fused weight."""
+    import torch.nn as nn
+
+    from executorch.examples.models.gemma4_31b.quant import pack_one
+
+    fused = _fuse_gate_up_raw(gate, up)
+    inter, hidden = int(gate.shape[0]), int(gate.shape[1])
+
+    mlp = model.get_submodule(f"layers.{layer_idx}.mlp")
+    mlp.gate_up_proj = nn.Linear(hidden, 2 * inter, bias=False, device="meta")
+    del mlp.gate_proj
+    del mlp.up_proj
+
+    key = f"layers.{layer_idx}.mlp.gate_up_proj.weight"
+    pack_one(model, key, _convert_weight(model, key, fused, backend), packers)
+
+
+def _process_gate_up_pair(model, layer_idx, gate, up, backend, packers) -> bool:
+    """Fuse gate|up if compatible (returns True), else assign them unfused."""
+    if _gate_up_fuseable(gate, up):
+        _install_and_pack_fused_gate_up(model, layer_idx, gate, up, backend, packers)
+        return True
+    _assign_gate_up_unfused(model, layer_idx, "gate", gate, backend, packers)
+    _assign_gate_up_unfused(model, layer_idx, "up", up, backend, packers)
+    return False
+
+
 def _resolve_tied_lm_head(model, lm_head_weight, packers):
     """Assign a tied lm_head (GGUF ties it to the token embedding)."""
     from executorch.examples.models.gemma4_31b.quant import pack_one
@@ -217,11 +300,32 @@ def load_gguf_model(
     n_processed = 0
 
     print(f"Streaming GGUF from {gguf_path}...")
+    pending_gate_up: dict = {}  # layer_idx -> {"gate": raw, "up": raw}
+    n_fused = 0
+    n_unfused = 0
     for gguf_name, value in iter_gguf(gguf_path):
         model_key = gguf_to_model_key(gguf_name)
         if model_key is None:
             continue
 
+        # Buffer the RAW gate/up ExportableGGUFTensor (pre-conversion) and fuse
+        # once both arrive -- the single common point upstream of _convert_weight.
+        gu = _gate_up_layer_kind(model_key)
+        if gu is not None and isinstance(value, ExportableGGUFTensor):
+            layer_idx, kind = gu
+            slot = pending_gate_up.setdefault(layer_idx, {})
+            slot[kind] = value
+            if "gate" in slot and "up" in slot:
+                if _process_gate_up_pair(
+                    model, layer_idx, slot["gate"], slot["up"], backend, packers
+                ):
+                    n_fused += 1
+                else:
+                    n_unfused += 1
+                pending_gate_up.pop(layer_idx, None)
+                n_processed += 2
+            continue
+
         if isinstance(value, ExportableGGUFTensor):
             weight = _convert_weight(model, model_key, value, backend)
             if model_key == "embed_tokens.weight":
@@ -238,6 +342,21 @@ def load_gguf_model(
         if n_processed % 100 == 0:
             print(f"  Processed {n_processed} tensors...")
 
+    # Flush any unpaired gate/up (partial/malformed) as separate unfused
+    # projections so no weight is left on meta.
+    for layer_idx, slot in pending_gate_up.items():
+        for kind in ("gate", "up"):
+            if kind in slot:
+                _assign_gate_up_unfused(
+                    model, layer_idx, kind, slot[kind], backend, packers
+                )
+                n_unfused += 1
+
+    print(
+        f"[gemma4_31b gguf] Fused gate+up on {n_fused} MLP layers"
+        + (f" ({n_unfused} left unfused)" if n_unfused else "")
+    )
+
     _resolve_tied_lm_head(model, lm_head_weight, packers)
 
     # Fill RoPE tables / KV caches / scalar constants (left on meta by the
diff --git a/examples/models/qwen3_5_moe/CMakeLists.txt b/examples/models/qwen3_5_moe/CMakeLists.txt
index 726657a3779..aeb97f76ab7 100644
--- a/examples/models/qwen3_5_moe/CMakeLists.txt
+++ b/examples/models/qwen3_5_moe/CMakeLists.txt
@@ -89,6 +89,7 @@ endif()
 
 if(TARGET mlxdelegate)
   executorch_target_copy_mlx_metallib(qwen3_5_moe_runner)
+  executorch_target_copy_mlx_metallib(qwen3_5_moe_worker)
 endif()
 
 if(EXECUTORCH_BUILD_CUDA)
diff --git a/examples/models/qwen3_5_moe/CMakePresets.json b/examples/models/qwen3_5_moe/CMakePresets.json
index 276c2116148..6adcb8aa9cb 100644
--- a/examples/models/qwen3_5_moe/CMakePresets.json
+++ b/examples/models/qwen3_5_moe/CMakePresets.json
@@ -70,9 +70,9 @@
         },
         {
             "name": "qwen3-5-moe-mlx",
-            "displayName": "Build Qwen3.5 MoE runner (MLX)",
+            "displayName": "Build Qwen3.5 MoE runner and worker (MLX)",
             "configurePreset": "qwen3-5-moe-mlx",
-            "targets": ["qwen3_5_moe_runner"]
+            "targets": ["qwen3_5_moe_runner", "qwen3_5_moe_worker"]
         }
     ],
     "workflowPresets": [
diff --git a/examples/models/qwen3_5_moe/README.md b/examples/models/qwen3_5_moe/README.md
index c275641bfd7..77f53aefcc6 100644
--- a/examples/models/qwen3_5_moe/README.md
+++ b/examples/models/qwen3_5_moe/README.md
@@ -302,6 +302,66 @@ python -m executorch.examples.models.qwen3_5_moe.run \
     --max-new-tokens 50
 ```
 
+### Serving (MLX, multi-session)
+
+The MLX worker hosts multiple isolated sessions on **one** weight load, so an
+OpenAI-compatible server can serve concurrent conversations without duplicating
+the weights. `make qwen3_5_moe-mlx` builds both `qwen3_5_moe_runner` and
+`qwen3_5_moe_worker` (each with `mlx.metallib` copied alongside).
+
+Start the server (it auto-locates the worker binary):
+
+```bash
+# tokenizer.json the C++ worker opens (resolve from the HF cache)
+TOKENIZER_JSON=$(ls "${HF_HOME:-$HOME/.cache/huggingface}"/hub/models--Qwen--Qwen3.5-35B-A3B/snapshots/*/tokenizer.json | head -n1)
+
+python -m executorch.examples.models.qwen3_5_moe.serve \
+    --model-path ./qwen35_moe_mlx/model.pte \
+    --tokenizer-path "$TOKENIZER_JSON" \
+    --hf-tokenizer Qwen/Qwen3.5-35B-A3B \
+    --max-sessions 4 \
+    --host 127.0.0.1 \
+    --port 8000
+```
+
+- `--tokenizer-path` is the raw `tokenizer.json` **file** the worker loads;
+  `--hf-tokenizer` (HF id or local dir) supplies the chat template on the Python
+  side. No `--data-path` (the MLX `.pte` is self-contained).
+- `--max-sessions N` caps physical sessions on the single weight load. One slot
+  is reserved for anonymous requests (requests sent without a session id), so
+  `N` allows `N-1` concurrent named sessions.
+
+Query it (OpenAI-compatible) from another terminal. Route each conversation to a
+session with the `session_id` header:
+
+```bash
+curl http://127.0.0.1:8000/v1/chat/completions \
+  -H "Content-Type: application/json" -H "session_id: alice" \
+  -d '{"model":"qwen3.5-moe",
+       "messages":[{"role":"user","content":"What is the capital of France?"}],
+       "max_tokens":50,"chat_template_kwargs":{"enable_thinking":false}}'
+```
+
+Endpoints: `GET /health`, `GET /v1/models`, `POST /v1/chat/completions`,
+`DELETE /v1/sessions/{id}` (free a session + its slot), `POST /v1/sessions/{id}/reset`.
+
+Session/memory semantics on MLX:
+- This server uses the standard **stateless** OpenAI contract — send the full
+  `messages` history each request. `session_id` + warm-resume is a KV-cache reuse
+  optimization for the shared prefix, not server-side memory.
+- Each session adds **one** set of mutable buffers (KV + recurrent/conv state) on
+  top of the shared weights; per-session cost scales with `max_seq_len`. Weights
+  are never duplicated.
+- KV persists across requests for a live session and is **released on close**
+  (`DELETE`/reset). Named sessions are not auto-closed — close them to free slots.
+  MLX's Metal allocator pools freed buffers (so RSS may not shrink immediately),
+  but they are reused by later sessions, keeping memory bounded.
+- Requests are processed **one at a time** (a single in-flight request per
+  worker). A request runs to completion and head-of-line-blocks every other
+  session until it finishes; there is no token-level interleaving or parallel
+  execution. This holds on both MLX and CUDA; multi-session provides memory
+  isolation and warm resume, not added throughput.
+
 ### Tiny Model Test
 
 For CI or quick pipeline validation (no model download needed):
diff --git a/examples/models/qwen3_5_moe/mlx_source_transformations.py b/examples/models/qwen3_5_moe/mlx_source_transformations.py
index 9a49f8a84f6..3c460fc9c54 100644
--- a/examples/models/qwen3_5_moe/mlx_source_transformations.py
+++ b/examples/models/qwen3_5_moe/mlx_source_transformations.py
@@ -113,12 +113,14 @@ def _full_attention_forward(self, x, input_pos):
 
     k, v = self.kv_cache.update(input_pos, k, v)
 
-    if self.n_kv_groups > 1:
-        k = k.repeat_interleave(self.n_kv_groups, dim=1)
-        v = v.repeat_interleave(self.n_kv_groups, dim=1)
-
-    attn_mask = self.mask[input_pos].unsqueeze(0).unsqueeze(0)
-    y = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
+    y = torch.ops.mlx.custom_sdpa(
+        q,
+        k,
+        v,
+        start_pos=pos,
+        dropout_p=0.0,
+        is_causal=True,
+    )
 
     y = y.transpose(1, 2).contiguous().view(B, T, -1)
 
@@ -184,10 +186,8 @@ def _exportable_gated_delta_net_forward(self, x, input_pos):
         k, (self.head_k_dim,), self._qk_rms_weight, eps=1e-6
     )
 
-    # head_repeat for k_heads != v_heads
-    if self.head_repeat > 1:
-        q = q.repeat_interleave(self.head_repeat, dim=2)
-        k = k.repeat_interleave(self.head_repeat, dim=2)
+    # GQA head expansion (k_heads != v_heads) is handled inside
+    # mlx::gated_delta_rule
 
     # Mamba-style gating
     beta = b.sigmoid()
@@ -278,17 +278,13 @@ def _swap_gated_delta_net(model, model_dtype):
 
 
 def _swap_full_attention(model, config):
-    """FullAttention → mlx::rope custom op + causal mask."""
+    """FullAttention → mlx::rope custom op"""
     rope_theta = config.rope_theta if config else 10000.0
-    max_seq_len = config.max_seq_len if config else 4096
     count = 0
     for _name, module in model.named_modules():
         if isinstance(module, FullAttention):
             module._rope_dims = module.rotary_emb.rotary_dim
             module._rope_base = rope_theta
-            mask = torch.full((max_seq_len, max_seq_len), float("-inf"))
-            mask = torch.triu(mask, diagonal=1)
-            module.register_buffer("mask", mask)
             module.forward = types.MethodType(_full_attention_forward, module)
             count += 1
     return count
diff --git a/examples/models/qwen3_5_moe/qwen35_moe_engine.cpp b/examples/models/qwen3_5_moe/qwen35_moe_engine.cpp
index 713f6211330..fd81f60c23a 100644
--- a/examples/models/qwen3_5_moe/qwen35_moe_engine.cpp
+++ b/examples/models/qwen3_5_moe/qwen35_moe_engine.cpp
@@ -183,9 +183,9 @@ class Qwen35MoESession : public LLMSession {
       ::tokenizers::Tokenizer* tokenizer,
       std::unordered_map<std::string, int64_t> metadata,
       std::unordered_set<uint64_t> eos_ids
-#ifdef EXECUTORCH_BUILD_CUDA
+#ifdef QWEN_HAS_MUTABLE_STATE
       ,
-      ::executorch::backends::cuda::MutableStateContextOwner* mutable_state,
+      MutableStateContextOwner* mutable_state,
       int session_token
 #endif
       )
@@ -195,7 +195,7 @@ class Qwen35MoESession : public LLMSession {
         tokenizer_(tokenizer),
         metadata_(std::move(metadata)),
         eos_ids_(std::move(eos_ids))
-#ifdef EXECUTORCH_BUILD_CUDA
+#ifdef QWEN_HAS_MUTABLE_STATE
         ,
         mutable_state_(mutable_state),
         session_token_(session_token)
@@ -212,9 +212,8 @@ class Qwen35MoESession : public LLMSession {
   }
 
   ~Qwen35MoESession() override {
-#ifdef EXECUTORCH_BUILD_CUDA
-    if (mutable_state_ != nullptr &&
-        session_token_ != ::executorch::backends::cuda::kNoMutableSession) {
+#ifdef QWEN_HAS_MUTABLE_STATE
+    if (mutable_state_ != nullptr && session_token_ != kNoMutableSession) {
       mutable_state_->destroy_session(session_token_);
     }
 #endif
@@ -425,8 +424,8 @@ class Qwen35MoESession : public LLMSession {
       float temperature,
       bool sync_after) {
     std::lock_guard<std::mutex> guard(*exec_mutex_);
-#ifdef EXECUTORCH_BUILD_CUDA
-    Result<std::vector<EValue>> res = mutable_state_ != nullptr
+#ifdef QWEN_HAS_MUTABLE_STATE
+    auto res = mutable_state_ != nullptr
         ? mutable_state_->with_active_session(
               session_token_,
               [&]() { return module_->execute(method, inputs); })
@@ -465,10 +464,11 @@ class Qwen35MoESession : public LLMSession {
   int64_t decode_pos_data_[1] = {0};
   TensorPtr decode_tokens_;
   TensorPtr decode_pos_;
+#ifdef QWEN_HAS_MUTABLE_STATE
+  MutableStateContextOwner* mutable_state_ = nullptr;
+  int session_token_ = kNoMutableSession;
+#endif
 #ifdef EXECUTORCH_BUILD_CUDA
-  ::executorch::backends::cuda::MutableStateContextOwner* mutable_state_ =
-      nullptr;
-  int session_token_ = ::executorch::backends::cuda::kNoMutableSession;
   float temp_val_ = 1e-6f;
   TensorPtr temp_tensor_;
 #endif
@@ -529,17 +529,17 @@ Result<std::unique_ptr<Qwen35MoEEngine>> Qwen35MoEEngine::create(
         "not stop at end of turn");
   }
 
+#ifdef QWEN_HAS_MUTABLE_STATE
+  std::unique_ptr<MutableStateContextOwner> mutable_state;
+#endif
 #ifdef EXECUTORCH_BUILD_CUDA
-  std::unique_ptr<::executorch::backends::cuda::MutableStateContextOwner>
-      mutable_state;
   if (config.enable_cuda_graph) {
     ET_LOG(
         Info,
         "Qwen35MoEEngine: CUDA graph requested; per-session rebinding disabled "
         "and serving capacity clamped to 1 session.");
   } else {
-    auto candidate = std::make_unique<
-        ::executorch::backends::cuda::MutableStateContextOwner>();
+    auto candidate = std::make_unique<MutableStateContextOwner>();
     if (Error e = register_mutable_fqns(meta_module.get(), *candidate);
         e == Error::Ok) {
       mutable_state = std::move(candidate);
@@ -550,9 +550,13 @@ Result<std::unique_ptr<Qwen35MoEEngine>> Qwen35MoEEngine::create(
           "serving capacity clamped to 1 session.");
     }
   }
+#elif defined(EXECUTORCH_BUILD_MLX)
+  // MLX owns mutable buffers directly and selects per-session storage at
+  // execute time; no FQN registration or coverage check is required.
+  mutable_state = std::make_unique<MutableStateContextOwner>();
 #endif
 
-#ifdef EXECUTORCH_BUILD_CUDA
+#ifdef QWEN_HAS_MUTABLE_STATE
   auto module_res = mutable_state != nullptr
       ? mutable_state->with_load_scope(
             [&]() { return build_qwen_module(config); })
@@ -566,16 +570,14 @@ Result<std::unique_ptr<Qwen35MoEEngine>> Qwen35MoEEngine::create(
   std::unique_ptr<Module> shared_module = std::move(module_res.get());
 
   bool rebind_available = false;
-#ifdef EXECUTORCH_BUILD_CUDA
+#ifdef QWEN_HAS_MUTABLE_STATE
   rebind_available = mutable_state != nullptr && mutable_state->available();
-  if (rebind_available) {
-    if (mutable_state->validate_coverage() != Error::Ok) {
-      ET_LOG(
-          Error,
-          "Qwen35MoEEngine: mutable-buffer coverage check failed; disabling "
-          "multi-session (capacity clamped to 1).");
-      rebind_available = false;
-    }
+  if (rebind_available && mutable_state->validate_coverage() != Error::Ok) {
+    ET_LOG(
+        Error,
+        "Qwen35MoEEngine: mutable-buffer coverage check failed; disabling "
+        "multi-session (capacity clamped to 1).");
+    rebind_available = false;
   }
   if (!rebind_available) {
     ET_LOG(
@@ -592,7 +594,7 @@ Result<std::unique_ptr<Qwen35MoEEngine>> Qwen35MoEEngine::create(
       std::move(eos_ids),
       std::move(shared_module),
       rebind_available
-#ifdef EXECUTORCH_BUILD_CUDA
+#ifdef QWEN_HAS_MUTABLE_STATE
       ,
       std::move(mutable_state)
 #endif
@@ -621,7 +623,7 @@ Result<std::unique_ptr<LLMSession>> Qwen35MoEEngine::create_session() {
   }
 
   int token = -1; // kNoMutableSession: single-session / no rebind
-#ifdef EXECUTORCH_BUILD_CUDA
+#ifdef QWEN_HAS_MUTABLE_STATE
   if (rebind_available_) {
     auto t = mutable_state_->create_session();
     if (t.error() != Error::Ok) {
@@ -638,7 +640,7 @@ Result<std::unique_ptr<LLMSession>> Qwen35MoEEngine::create_session() {
       tokenizer_.get(),
       metadata_,
       eos_ids_
-#ifdef EXECUTORCH_BUILD_CUDA
+#ifdef QWEN_HAS_MUTABLE_STATE
       ,
       mutable_state_.get(),
       token
@@ -648,7 +650,7 @@ Result<std::unique_ptr<LLMSession>> Qwen35MoEEngine::create_session() {
 
 LLMServingCapacity Qwen35MoEEngine::serving_capacity() const {
   LLMServingCapacity cap; // default: 1 session, 0 bytes (unknown)
-#ifdef EXECUTORCH_BUILD_CUDA
+#ifdef QWEN_HAS_MUTABLE_STATE
   if (rebind_available_) {
     cap.max_physical_sessions_without_weight_duplication =
         config_.max_sessions > 1 ? config_.max_sessions : 1;
diff --git a/examples/models/qwen3_5_moe/qwen35_moe_engine.h b/examples/models/qwen3_5_moe/qwen35_moe_engine.h
index c7ea53115b8..5a5e286c9c5 100644
--- a/examples/models/qwen3_5_moe/qwen35_moe_engine.h
+++ b/examples/models/qwen3_5_moe/qwen35_moe_engine.h
@@ -7,8 +7,8 @@
  */
 
 // Engine/Session adapter for the Qwen3.5 MoE exported prefill/decode methods.
-// CUDA builds can host multiple sessions on one loaded model by rebinding the
-// model's mutable buffers before each execute.
+// CUDA and MLX builds can host multiple sessions on one loaded model by
+// rebinding the model's mutable buffers before each execute.
 
 #pragma once
 
@@ -28,10 +28,28 @@
 
 #ifdef EXECUTORCH_BUILD_CUDA
 #include <executorch/backends/cuda/runtime/cuda_mutable_state.h>
+#elif defined(EXECUTORCH_BUILD_MLX)
+#include <executorch/backends/mlx/runtime/mlx_mutable_state.h>
+#endif
+
+#if defined(EXECUTORCH_BUILD_CUDA) || defined(EXECUTORCH_BUILD_MLX)
+#define QWEN_HAS_MUTABLE_STATE 1
 #endif
 
 namespace executorch::extension::llm {
 
+#if defined(EXECUTORCH_BUILD_CUDA)
+using MutableStateContextOwner =
+    ::executorch::backends::cuda::MutableStateContextOwner;
+constexpr int kNoMutableSession =
+    ::executorch::backends::cuda::kNoMutableSession;
+#elif defined(EXECUTORCH_BUILD_MLX)
+using MutableStateContextOwner =
+    ::executorch::backends::mlx::MutableStateContextOwner;
+constexpr int kNoMutableSession =
+    ::executorch::backends::mlx::kNoMutableSession;
+#endif
+
 /// Immutable configuration for a Qwen3.5 MoE engine.
 struct Qwen35MoEConfig {
   std::string model_path; // .pte
@@ -77,10 +95,9 @@ class ET_EXPERIMENTAL Qwen35MoEEngine : public LLMEngine {
       std::unordered_set<uint64_t> eos_ids,
       std::unique_ptr<Module> shared_module,
       bool rebind_available
-#ifdef EXECUTORCH_BUILD_CUDA
+#ifdef QWEN_HAS_MUTABLE_STATE
       ,
-      std::unique_ptr<::executorch::backends::cuda::MutableStateContextOwner>
-          mutable_state
+      std::unique_ptr<MutableStateContextOwner> mutable_state
 #endif
       )
       : config_(std::move(config)),
@@ -89,7 +106,7 @@ class ET_EXPERIMENTAL Qwen35MoEEngine : public LLMEngine {
         eos_ids_(std::move(eos_ids)),
         shared_module_(std::move(shared_module)),
         rebind_available_(rebind_available)
-#ifdef EXECUTORCH_BUILD_CUDA
+#ifdef QWEN_HAS_MUTABLE_STATE
         ,
         mutable_state_(std::move(mutable_state))
 #endif
@@ -104,9 +121,8 @@ class ET_EXPERIMENTAL Qwen35MoEEngine : public LLMEngine {
   std::unique_ptr<Module> shared_module_;
   std::mutex exec_mutex_;
   bool rebind_available_ = false;
-#ifdef EXECUTORCH_BUILD_CUDA
-  std::unique_ptr<::executorch::backends::cuda::MutableStateContextOwner>
-      mutable_state_;
+#ifdef QWEN_HAS_MUTABLE_STATE
+  std::unique_ptr<MutableStateContextOwner> mutable_state_;
 #endif
   std::atomic<int> live_sessions_{0};
 };
diff --git a/examples/nxp/analyzing_with_inspector.py b/examples/nxp/analyzing_with_inspector.py
new file mode 100644
index 00000000000..b339af79d6e
--- /dev/null
+++ b/examples/nxp/analyzing_with_inspector.py
@@ -0,0 +1,58 @@
+# Copyright 2026 NXP
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Print profiling table for the NXP Neutron NPU model
+
+from typing import Any, Union
+
+from executorch.devtools import Inspector
+
+
+def parse_delegate_metadata(
+    delegate_metadatas: list[bytes],
+) -> Union[list[str], dict[str, Any]]:
+    """Metadata parser for Neutron Backend metadata.
+
+    Maps each one-byte blob to "Profiling dump" (0) or "Neutron kernel <n>"; other sizes give "Invalid metadata size".
+    The returned labels are then added back to the corresponding event in the event block for user consumption.
+    """
+
+    metadata_list = []
+    for metadata_bytes in delegate_metadatas:
+        if len(metadata_bytes) == 1:
+            function_code = metadata_bytes[0]
+            if function_code == 0:
+                metadata_list.append("Profiling dump")
+            else:
+                metadata_list.append("Neutron kernel " + str(function_code))
+        else:
+            metadata_list.append("Invalid metadata size")
+    return metadata_list
+
+
+if __name__ == "__main__":
+
+    try:
+        etrecord_path = "etrecord/etrecord.bin"
+        etdump_path = "etdump/trace.etdump"
+        inspector = Inspector(
+            etdump_path=etdump_path,
+            etrecord=etrecord_path,
+            delegate_metadata_parser=parse_delegate_metadata,
+        )
+
+        # Access raw event data and filter quantized_decomposed nodes
+        for event_block in inspector.event_blocks:
+            for event in event_block.events:
+                if hasattr(event, "op_types") and isinstance(event.op_types, list):
+                    # Filter out quantized_decomposed ops from the actual list
+                    filtered = [
+                        op for op in event.op_types if "quantized_decomposed" not in op
+                    ]
+                    event.op_types = filtered if filtered else event.op_types
+
+        inspector.print_data_tabular(include_delegate_debug_data=True)
+    except Exception as e:
+        print(f"Error during inspection: {type(e).__name__}: {e}")
diff --git a/examples/nxp/aot_neutron_compile.py b/examples/nxp/aot_neutron_compile.py
index f5f92d36541..258b4c87772 100644
--- a/examples/nxp/aot_neutron_compile.py
+++ b/examples/nxp/aot_neutron_compile.py
@@ -8,6 +8,7 @@
 import argparse
 import io
 import logging
+import os
 from collections import defaultdict
 
 import executorch.extension.pybindings.portable_lib
@@ -167,6 +168,13 @@ def get_model_and_inputs_from_name(model_name: str, use_random_dataset: bool):
         default=False,
         help="Use QAT mode for quantization (performs two QAT training epochs)",
     )
+    parser.add_argument(
+        "--use_profiling",
+        action="store_true",
+        required=False,
+        default=False,
+        help="Enable profiling for eIQ Neutron NPU delegated model",
+    )
     parser.add_argument(
         "-s",
         "--so_library",
@@ -322,6 +330,7 @@ def get_model_and_inputs_from_name(model_name: str, use_random_dataset: bool):
         operators_not_to_delegate=args.operators_not_to_delegate,
         fetch_constants_to_sram=args.fetch_constants_to_sram,
         dump_kernel_selection_code=args.dump_kernel_selection_code,
+        use_profiling=args.use_profiling,
     )
     partitioners = (
         [
@@ -338,6 +347,7 @@ def get_model_and_inputs_from_name(model_name: str, use_random_dataset: bool):
     edge_program_manager = to_edge_transform_and_lower(
         export(module, example_inputs, strict=True),
         transform_passes=NeutronEdgePassManager(),
+        generate_etrecord=args.use_profiling,
         partitioner=partitioners,
         compile_config=EdgeCompileConfig(
             _core_aten_ops_exception_list=core_aten_ops_exception_list,
@@ -360,6 +370,21 @@ def get_model_and_inputs_from_name(model_name: str, use_random_dataset: bool):
         exec_prog = edge_program_manager.to_executorch(
             config=ExecutorchBackendConfig(extract_delegate_segments=False)
         )
+
+        # Generate ETRecord if profiling flag is set
+        if args.use_profiling:
+            etrecord_path = os.path.join("etrecord", f"{args.model_name}_etrecord.bin")
+            # Create directory if it doesn't exist
+            os.makedirs(os.path.dirname(etrecord_path), exist_ok=True)
+            # Save ETRecord
+            exec_prog.get_etrecord().save(etrecord_path)
+            # Notify the user about profiling enablement and ETRecord generation.
+            logging.info(
+                "The model was converted with profiling enabled. The time spent generating the profiling dump is traced as the "
+                "final delegate operation and can be ignored, as no dump is produced for non‑profilable models."
+            )
+            logging.info(f"The ETRecord for the model was saved to {etrecord_path}.")
+
     except RuntimeError as e:
         if "Missing out variants" in str(e.args[0]):
             raise RuntimeError(
@@ -378,8 +403,10 @@ def executorch_program_to_str(ep, verbose=False):
     logging.debug(f"Executorch program:\n{executorch_program_to_str(exec_prog)}")
 
     # 6. Serialize to *.pte
-    model_name = f"{args.model_name}" + (
-        "_nxp_delegate" if args.delegate is True else ""
+    model_name = (
+        f"{args.model_name}"
+        + ("_nxp_delegate" if args.delegate is True else "")
+        + ("_profile" if args.use_profiling is True else "")
     )
     save_pte_program(exec_prog, model_name)
 
diff --git a/examples/qualcomm/executor_runner/TARGETS b/examples/qualcomm/executor_runner/TARGETS
deleted file mode 100644
index 1e8cc179228..00000000000
--- a/examples/qualcomm/executor_runner/TARGETS
+++ /dev/null
@@ -1,8 +0,0 @@
-# Any targets that should be shared between fbcode and xplat must be defined in
-# targets.bzl. This file can contain xplat-only targets.
-
-load(":targets.bzl", "define_common_targets")
-
-oncall("executorch")
-
-define_common_targets()
diff --git a/extension/aten_util/TARGETS b/extension/aten_util/TARGETS
deleted file mode 100644
index 2341af9282f..00000000000
--- a/extension/aten_util/TARGETS
+++ /dev/null
@@ -1,8 +0,0 @@
-# Any targets that should be shared between fbcode and xplat must be defined in
-# targets.bzl. This file can contain fbcode-only targets.
-
-load(":targets.bzl", "define_common_targets")
-
-oncall("executorch")
-
-define_common_targets()
diff --git a/extension/aten_util/test/TARGETS b/extension/aten_util/test/TARGETS
deleted file mode 100644
index 2341af9282f..00000000000
--- a/extension/aten_util/test/TARGETS
+++ /dev/null
@@ -1,8 +0,0 @@
-# Any targets that should be shared between fbcode and xplat must be defined in
-# targets.bzl. This file can contain fbcode-only targets.
-
-load(":targets.bzl", "define_common_targets")
-
-oncall("executorch")
-
-define_common_targets()
diff --git a/extension/cuda/TARGETS b/extension/cuda/TARGETS
deleted file mode 100644
index 2341af9282f..00000000000
--- a/extension/cuda/TARGETS
+++ /dev/null
@@ -1,8 +0,0 @@
-# Any targets that should be shared between fbcode and xplat must be defined in
-# targets.bzl. This file can contain fbcode-only targets.
-
-load(":targets.bzl", "define_common_targets")
-
-oncall("executorch")
-
-define_common_targets()
diff --git a/extension/image/TARGETS b/extension/image/TARGETS
deleted file mode 100644
index 0a42614a385..00000000000
--- a/extension/image/TARGETS
+++ /dev/null
@@ -1,5 +0,0 @@
-load(":targets.bzl", "define_common_targets")
-
-oncall("executorch")
-
-define_common_targets()
diff --git a/extension/image/benchmark/TARGETS b/extension/image/benchmark/TARGETS
deleted file mode 100644
index 0a42614a385..00000000000
--- a/extension/image/benchmark/TARGETS
+++ /dev/null
@@ -1,5 +0,0 @@
-load(":targets.bzl", "define_common_targets")
-
-oncall("executorch")
-
-define_common_targets()
diff --git a/extension/image/test/TARGETS b/extension/image/test/TARGETS
deleted file mode 100644
index 0a42614a385..00000000000
--- a/extension/image/test/TARGETS
+++ /dev/null
@@ -1,5 +0,0 @@
-load(":targets.bzl", "define_common_targets")
-
-oncall("executorch")
-
-define_common_targets()
diff --git a/extension/pytree/aten_util/TARGETS b/extension/pytree/aten_util/TARGETS
deleted file mode 100644
index 77b38349334..00000000000
--- a/extension/pytree/aten_util/TARGETS
+++ /dev/null
@@ -1,7 +0,0 @@
-# Any targets that should be shared between fbcode and xplat must be defined in
-# targets.bzl. This file can contain fbcode-only targets.
-load(":targets.bzl", "define_common_targets")
-
-oncall("executorch")
-
-define_common_targets()
diff --git a/extension/pytree/aten_util/test/TARGETS b/extension/pytree/aten_util/test/TARGETS
deleted file mode 100644
index 2341af9282f..00000000000
--- a/extension/pytree/aten_util/test/TARGETS
+++ /dev/null
@@ -1,8 +0,0 @@
-# Any targets that should be shared between fbcode and xplat must be defined in
-# targets.bzl. This file can contain fbcode-only targets.
-
-load(":targets.bzl", "define_common_targets")
-
-oncall("executorch")
-
-define_common_targets()
diff --git a/extension/runner_util/TARGETS b/extension/runner_util/TARGETS
deleted file mode 100644
index 1e8cc179228..00000000000
--- a/extension/runner_util/TARGETS
+++ /dev/null
@@ -1,8 +0,0 @@
-# Any targets that should be shared between fbcode and xplat must be defined in
-# targets.bzl. This file can contain xplat-only targets.
-
-load(":targets.bzl", "define_common_targets")
-
-oncall("executorch")
-
-define_common_targets()
diff --git a/extension/tensor/TARGETS b/extension/tensor/TARGETS
deleted file mode 100644
index 2341af9282f..00000000000
--- a/extension/tensor/TARGETS
+++ /dev/null
@@ -1,8 +0,0 @@
-# Any targets that should be shared between fbcode and xplat must be defined in
-# targets.bzl. This file can contain fbcode-only targets.
-
-load(":targets.bzl", "define_common_targets")
-
-oncall("executorch")
-
-define_common_targets()
diff --git a/extension/tensor/test/TARGETS b/extension/tensor/test/TARGETS
deleted file mode 100644
index 2341af9282f..00000000000
--- a/extension/tensor/test/TARGETS
+++ /dev/null
@@ -1,8 +0,0 @@
-# Any targets that should be shared between fbcode and xplat must be defined in
-# targets.bzl. This file can contain fbcode-only targets.
-
-load(":targets.bzl", "define_common_targets")
-
-oncall("executorch")
-
-define_common_targets()
diff --git a/extension/testing_util/TARGETS b/extension/testing_util/TARGETS
deleted file mode 100644
index 2341af9282f..00000000000
--- a/extension/testing_util/TARGETS
+++ /dev/null
@@ -1,8 +0,0 @@
-# Any targets that should be shared between fbcode and xplat must be defined in
-# targets.bzl. This file can contain fbcode-only targets.
-
-load(":targets.bzl", "define_common_targets")
-
-oncall("executorch")
-
-define_common_targets()
diff --git a/extension/testing_util/test/TARGETS b/extension/testing_util/test/TARGETS
deleted file mode 100644
index 2341af9282f..00000000000
--- a/extension/testing_util/test/TARGETS
+++ /dev/null
@@ -1,8 +0,0 @@
-# Any targets that should be shared between fbcode and xplat must be defined in
-# targets.bzl. This file can contain fbcode-only targets.
-
-load(":targets.bzl", "define_common_targets")
-
-oncall("executorch")
-
-define_common_targets()
diff --git a/extension/threadpool/TARGETS b/extension/threadpool/TARGETS
deleted file mode 100644
index 2341af9282f..00000000000
--- a/extension/threadpool/TARGETS
+++ /dev/null
@@ -1,8 +0,0 @@
-# Any targets that should be shared between fbcode and xplat must be defined in
-# targets.bzl. This file can contain fbcode-only targets.
-
-load(":targets.bzl", "define_common_targets")
-
-oncall("executorch")
-
-define_common_targets()
diff --git a/extension/threadpool/test/TARGETS b/extension/threadpool/test/TARGETS
deleted file mode 100644
index 2341af9282f..00000000000
--- a/extension/threadpool/test/TARGETS
+++ /dev/null
@@ -1,8 +0,0 @@
-# Any targets that should be shared between fbcode and xplat must be defined in
-# targets.bzl. This file can contain fbcode-only targets.
-
-load(":targets.bzl", "define_common_targets")
-
-oncall("executorch")
-
-define_common_targets()
diff --git a/kernels/optimized/cpu/TARGETS b/kernels/optimized/cpu/TARGETS
deleted file mode 100644
index 2341af9282f..00000000000
--- a/kernels/optimized/cpu/TARGETS
+++ /dev/null
@@ -1,8 +0,0 @@
-# Any targets that should be shared between fbcode and xplat must be defined in
-# targets.bzl. This file can contain fbcode-only targets.
-
-load(":targets.bzl", "define_common_targets")
-
-oncall("executorch")
-
-define_common_targets()
diff --git a/kernels/optimized/test/TARGETS b/kernels/optimized/test/TARGETS
deleted file mode 100644
index 2341af9282f..00000000000
--- a/kernels/optimized/test/TARGETS
+++ /dev/null
@@ -1,8 +0,0 @@
-# Any targets that should be shared between fbcode and xplat must be defined in
-# targets.bzl. This file can contain fbcode-only targets.
-
-load(":targets.bzl", "define_common_targets")
-
-oncall("executorch")
-
-define_common_targets()
diff --git a/kernels/portable/cpu/util/TARGETS b/kernels/portable/cpu/util/TARGETS
deleted file mode 100644
index 2341af9282f..00000000000
--- a/kernels/portable/cpu/util/TARGETS
+++ /dev/null
@@ -1,8 +0,0 @@
-# Any targets that should be shared between fbcode and xplat must be defined in
-# targets.bzl. This file can contain fbcode-only targets.
-
-load(":targets.bzl", "define_common_targets")
-
-oncall("executorch")
-
-define_common_targets()
diff --git a/kernels/portable/cpu/util/test/TARGETS b/kernels/portable/cpu/util/test/TARGETS
deleted file mode 100644
index 2341af9282f..00000000000
--- a/kernels/portable/cpu/util/test/TARGETS
+++ /dev/null
@@ -1,8 +0,0 @@
-# Any targets that should be shared between fbcode and xplat must be defined in
-# targets.bzl. This file can contain fbcode-only targets.
-
-load(":targets.bzl", "define_common_targets")
-
-oncall("executorch")
-
-define_common_targets()
diff --git a/kernels/prim_ops/TARGETS b/kernels/prim_ops/TARGETS
deleted file mode 100644
index 77b38349334..00000000000
--- a/kernels/prim_ops/TARGETS
+++ /dev/null
@@ -1,7 +0,0 @@
-# Any targets that should be shared between fbcode and xplat must be defined in
-# targets.bzl. This file can contain fbcode-only targets.
-load(":targets.bzl", "define_common_targets")
-
-oncall("executorch")
-
-define_common_targets()
diff --git a/runtime/backend/TARGETS b/runtime/backend/TARGETS
deleted file mode 100644
index 2341af9282f..00000000000
--- a/runtime/backend/TARGETS
+++ /dev/null
@@ -1,8 +0,0 @@
-# Any targets that should be shared between fbcode and xplat must be defined in
-# targets.bzl. This file can contain fbcode-only targets.
-
-load(":targets.bzl", "define_common_targets")
-
-oncall("executorch")
-
-define_common_targets()
diff --git a/runtime/backend/test/TARGETS b/runtime/backend/test/TARGETS
deleted file mode 100644
index 2341af9282f..00000000000
--- a/runtime/backend/test/TARGETS
+++ /dev/null
@@ -1,8 +0,0 @@
-# Any targets that should be shared between fbcode and xplat must be defined in
-# targets.bzl. This file can contain fbcode-only targets.
-
-load(":targets.bzl", "define_common_targets")
-
-oncall("executorch")
-
-define_common_targets()
diff --git a/runtime/core/TARGETS b/runtime/core/TARGETS
deleted file mode 100644
index 2341af9282f..00000000000
--- a/runtime/core/TARGETS
+++ /dev/null
@@ -1,8 +0,0 @@
-# Any targets that should be shared between fbcode and xplat must be defined in
-# targets.bzl. This file can contain fbcode-only targets.
-
-load(":targets.bzl", "define_common_targets")
-
-oncall("executorch")
-
-define_common_targets()
diff --git a/runtime/core/exec_aten/TARGETS b/runtime/core/exec_aten/TARGETS
deleted file mode 100644
index 2341af9282f..00000000000
--- a/runtime/core/exec_aten/TARGETS
+++ /dev/null
@@ -1,8 +0,0 @@
-# Any targets that should be shared between fbcode and xplat must be defined in
-# targets.bzl. This file can contain fbcode-only targets.
-
-load(":targets.bzl", "define_common_targets")
-
-oncall("executorch")
-
-define_common_targets()
diff --git a/runtime/core/exec_aten/testing_util/TARGETS b/runtime/core/exec_aten/testing_util/TARGETS
deleted file mode 100644
index 2341af9282f..00000000000
--- a/runtime/core/exec_aten/testing_util/TARGETS
+++ /dev/null
@@ -1,8 +0,0 @@
-# Any targets that should be shared between fbcode and xplat must be defined in
-# targets.bzl. This file can contain fbcode-only targets.
-
-load(":targets.bzl", "define_common_targets")
-
-oncall("executorch")
-
-define_common_targets()
diff --git a/runtime/core/exec_aten/testing_util/test/TARGETS b/runtime/core/exec_aten/testing_util/test/TARGETS
deleted file mode 100644
index 2341af9282f..00000000000
--- a/runtime/core/exec_aten/testing_util/test/TARGETS
+++ /dev/null
@@ -1,8 +0,0 @@
-# Any targets that should be shared between fbcode and xplat must be defined in
-# targets.bzl. This file can contain fbcode-only targets.
-
-load(":targets.bzl", "define_common_targets")
-
-oncall("executorch")
-
-define_common_targets()
diff --git a/runtime/core/exec_aten/util/TARGETS b/runtime/core/exec_aten/util/TARGETS
deleted file mode 100644
index 2341af9282f..00000000000
--- a/runtime/core/exec_aten/util/TARGETS
+++ /dev/null
@@ -1,8 +0,0 @@
-# Any targets that should be shared between fbcode and xplat must be defined in
-# targets.bzl. This file can contain fbcode-only targets.
-
-load(":targets.bzl", "define_common_targets")
-
-oncall("executorch")
-
-define_common_targets()
diff --git a/runtime/platform/TARGETS b/runtime/platform/TARGETS
deleted file mode 100644
index 2341af9282f..00000000000
--- a/runtime/platform/TARGETS
+++ /dev/null
@@ -1,8 +0,0 @@
-# Any targets that should be shared between fbcode and xplat must be defined in
-# targets.bzl. This file can contain fbcode-only targets.
-
-load(":targets.bzl", "define_common_targets")
-
-oncall("executorch")
-
-define_common_targets()
diff --git a/runtime/platform/test/TARGETS b/runtime/platform/test/TARGETS
deleted file mode 100644
index 2341af9282f..00000000000
--- a/runtime/platform/test/TARGETS
+++ /dev/null
@@ -1,8 +0,0 @@
-# Any targets that should be shared between fbcode and xplat must be defined in
-# targets.bzl. This file can contain fbcode-only targets.
-
-load(":targets.bzl", "define_common_targets")
-
-oncall("executorch")
-
-define_common_targets()
diff --git a/schema/TARGETS b/schema/TARGETS
deleted file mode 100644
index 2341af9282f..00000000000
--- a/schema/TARGETS
+++ /dev/null
@@ -1,8 +0,0 @@
-# Any targets that should be shared between fbcode and xplat must be defined in
-# targets.bzl. This file can contain fbcode-only targets.
-
-load(":targets.bzl", "define_common_targets")
-
-oncall("executorch")
-
-define_common_targets()
diff --git a/schema/test/TARGETS b/schema/test/TARGETS
deleted file mode 100644
index 2341af9282f..00000000000
--- a/schema/test/TARGETS
+++ /dev/null
@@ -1,8 +0,0 @@
-# Any targets that should be shared between fbcode and xplat must be defined in
-# targets.bzl. This file can contain fbcode-only targets.
-
-load(":targets.bzl", "define_common_targets")
-
-oncall("executorch")
-
-define_common_targets()
diff --git a/test/TARGETS b/test/TARGETS
deleted file mode 100644
index 2341af9282f..00000000000
--- a/test/TARGETS
+++ /dev/null
@@ -1,8 +0,0 @@
-# Any targets that should be shared between fbcode and xplat must be defined in
-# targets.bzl. This file can contain fbcode-only targets.
-
-load(":targets.bzl", "define_common_targets")
-
-oncall("executorch")
-
-define_common_targets()