diff --git a/.github/workflows/mlx.yml b/.github/workflows/mlx.yml index 5a4ccbb4952..167ceb7da83 100644 --- a/.github/workflows/mlx.yml +++ b/.github/workflows/mlx.yml @@ -66,7 +66,11 @@ jobs: echo "::endgroup::" echo "::group::Build test runners" - ${CONDA_RUN} cmake --build cmake-out --target op_test_runner multi_thread_test_runner -j$(( $(sysctl -n hw.ncpu) - 1 )) + ${CONDA_RUN} cmake --build cmake-out --target op_test_runner multi_thread_test_runner mlx_mutable_state_test -j$(( $(sysctl -n hw.ncpu) - 1 )) + echo "::endgroup::" + + echo "::group::Run mutable-state (multi-session) unit test" + ./cmake-out/backends/mlx/test/mlx_mutable_state_test echo "::endgroup::" echo "::group::Run op unit tests" diff --git a/backends/arm/quantizer/quantization_annotator.py b/backends/arm/quantizer/quantization_annotator.py index 3b713659e84..13693bd235d 100644 --- a/backends/arm/quantizer/quantization_annotator.py +++ b/backends/arm/quantizer/quantization_annotator.py @@ -631,6 +631,16 @@ def _get_fixed_qparams_qspec( if _transpose_dimname is not None: _one_to_one_shared_input_qspec.add(_transpose_dimname) +for _op in ( + getattr(torch.ops.aten.moveaxis, "int", None), + getattr(torch.ops.aten.moveaxis, "intlist", None), + getattr(torch.ops.aten.movedim, "int", None), + getattr(torch.ops.aten.movedim, "intlist", None), +): + if _op is not None: + _one_to_one_shared_input_qspec.add(_op) + + _one_to_one_shared_input_or_input_act_qspec: set[OpOverload] = { torch.ops.aten.alias.default, torch.ops.aten.clone.default, diff --git a/backends/arm/test/ops/test_permute.py b/backends/arm/test/ops/test_permute.py index 8864324dbd5..6819929104e 100644 --- a/backends/arm/test/ops/test_permute.py +++ b/backends/arm/test/ops/test_permute.py @@ -78,6 +78,12 @@ def forward(self, x): return torch.permute(x, self.dims) +class SimpleMoveAxis(torch.nn.Module): + + def forward(self, x): + return torch.moveaxis(x, 1, -1) + + @common.parametrize( "test_data", test_data_suite | test_data_suite_fp16 | 
test_data_suite_bf16 ) @@ -118,6 +124,17 @@ def test_permute_u55_INT(test_data): pipeline.run() +def test_moveaxis_u55_INT(): + pipeline = EthosU55PipelineINT[input_t1]( + SimpleMoveAxis(), + (torch.rand(1, 4, 5, 6),), + "torch.ops.aten.moveaxis.int", + exir_ops="executorch_exir_dialects_edge__ops_aten_permute_copy_default", + run_on_fvp=False, + ) + pipeline.run() + + @common.parametrize("test_data", test_data_suite_u55_reject) def test_permute_u55_INT_not_delegated(test_data: torch.Tensor): test_data, dims = test_data() diff --git a/backends/arm/test/quantizer/test_generic_annotater.py b/backends/arm/test/quantizer/test_generic_annotater.py index dd883e72b1f..b5cfd1efdc6 100644 --- a/backends/arm/test/quantizer/test_generic_annotater.py +++ b/backends/arm/test/quantizer/test_generic_annotater.py @@ -89,6 +89,41 @@ def test_transpose_tosa_INT(): ) +def test_moveaxis_movedim_tosa_INT(): + check_annotation( + SingleOpModel( + torch.moveaxis, + (torch.randn(2, 3, 4),), + source=1, + destination=-1, + ), + ) + check_annotation( + SingleOpModel( + torch.moveaxis, + (torch.randn(2, 3, 4),), + source=(0, 1), + destination=(-1, -2), + ), + ) + check_annotation( + SingleOpModel( + torch.movedim, + (torch.randn(2, 3, 4),), + source=1, + destination=-1, + ), + ) + check_annotation( + SingleOpModel( + torch.movedim, + (torch.randn(2, 3, 4),), + source=(0, 1), + destination=(-1, -2), + ), + ) + + def test_tile_tosa_INT(): check_annotation( SingleOpModel(torch.tile, (torch.randn(4, 4),), dims=(2,)), diff --git a/backends/mlx/CMakeLists.txt b/backends/mlx/CMakeLists.txt index 43968d09b5d..acb96fb1ed9 100644 --- a/backends/mlx/CMakeLists.txt +++ b/backends/mlx/CMakeLists.txt @@ -255,8 +255,10 @@ option(ET_MLX_ALLOW_CUSTOM_KERNEL_EXECUTION ON ) -set(_mlx_backend__srcs ${CMAKE_CURRENT_SOURCE_DIR}/runtime/MLXLoader.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/runtime/MLXBackend.cpp +set(_mlx_backend__srcs + ${CMAKE_CURRENT_SOURCE_DIR}/runtime/MLXLoader.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/runtime/MLXBackend.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/runtime/mlx_mutable_state.cpp ) add_library(mlxdelegate ${_mlx_backend__srcs}) diff --git a/backends/mlx/custom_kernel_ops/gated_delta_rule.py b/backends/mlx/custom_kernel_ops/gated_delta_rule.py index 423ffd0b034..41eb8ce7b98 100644 --- a/backends/mlx/custom_kernel_ops/gated_delta_rule.py +++ b/backends/mlx/custom_kernel_ops/gated_delta_rule.py @@ -53,6 +53,15 @@ def gated_delta_rule( B, T_len, Hk, Dk = q.shape Hv, Dv = v.shape[-2:] + # The Metal kernel maps each v-head to its k-head group + # (hk_idx = hv_idx / (Hv / Hk)); mirror that here so the eager reference also + # supports Hk != Hv (GQA) instead of relying on broadcasting, which requires + # Hk == Hv. repeat_interleave on the head dim reproduces that index mapping. + if Hk != Hv: + q = q.repeat_interleave(Hv // Hk, dim=2) + k = k.repeat_interleave(Hv // Hk, dim=2) + Hk = Hv + s = state.clone() ys = [] @@ -101,6 +110,7 @@ def gated_delta_rule_fake( IntOrVid, MetalKernelNode, MultiplyNode, + RepeatNode, ScanNode, SubtractNode, SumNode, @@ -450,6 +460,33 @@ def _emit_scan(self, P: MLXProgramBuilder, n: Node) -> Slot: ] ) + # GQA: q/k carry Hk heads but the recurrence state/v have Hv heads. Expand + # q/k to Hv (repeat_interleave on the head axis) so the per-step broadcasts + # match, mirroring the Metal kernel's hk_idx = hv_idx / (Hv / Hk). + Hk = int(self.q_node.meta["val"].shape[-2]) + Hv = int(self.v_node.meta["val"].shape[-2]) + if Hk != Hv: + rep = IntOrVid.from_literal(Hv // Hk) + _, q_exp = P.make_tmp_slot() + P.emit( + RepeatNode( + x=P.slot_to_tid(q_slot), + out=P.slot_to_tid(q_exp), + repeats=rep, + axis=2, + ) + ) + _, k_exp = P.make_tmp_slot() + P.emit( + RepeatNode( + x=P.slot_to_tid(k_slot), + out=P.slot_to_tid(k_exp), + repeats=rep, + axis=2, + ) + ) + q_slot, k_slot = q_exp, k_exp + # Carry needs a writable slot. 
This is node n's persistent output (the # mutated state), so it must be a node-owned slot — not a temp slot, whose # id is reclaimed on tmp_scope exit and would be read as dead by a later diff --git a/backends/mlx/custom_kernel_ops/test/test_gated_delta_rule.py b/backends/mlx/custom_kernel_ops/test/test_gated_delta_rule.py index 0a7e6a687f9..dfee111e74b 100644 --- a/backends/mlx/custom_kernel_ops/test/test_gated_delta_rule.py +++ b/backends/mlx/custom_kernel_ops/test/test_gated_delta_rule.py @@ -96,9 +96,8 @@ def forward( g: torch.Tensor, # [B, T, Hv] beta: torch.Tensor, # [B, T, Hv] ) -> torch.Tensor: - if self.head_repeat > 1: - q = q.repeat_interleave(self.head_repeat, dim=2) - k = k.repeat_interleave(self.head_repeat, dim=2) + # Pass native Hk (no repeat_interleave): the op itself must handle + # GQA head expansion (kernel via hk_idx mapping, scan/eager internally). return torch.ops.mlx.gated_delta_rule( q, k, v, g, beta, self.state, use_custom_kernel=self.use_custom_kernel ) diff --git a/backends/mlx/ops.py b/backends/mlx/ops.py index 44536e675da..e3a636466c1 100644 --- a/backends/mlx/ops.py +++ b/backends/mlx/ops.py @@ -163,6 +163,8 @@ from executorch.exir.dialects._ops import ops as exir_ops from torch.fx.node import Node +_LEAKY_RELU_DEFAULT_NEGATIVE_SLOPE = 0.01 + def require_static_int(value: Any, param_name: str, op_name: str) -> None: """ @@ -2786,6 +2788,63 @@ def _relu_handler(P: MLXProgramBuilder, n: Node) -> Slot: return out +@REGISTRY.register(target=[torch.ops.aten.leaky_relu.default]) +def _leaky_relu_handler(P: MLXProgramBuilder, n: Node) -> Slot: + """Handle aten.leaky_relu.default - leaky rectified linear unit. + + leaky_relu(x) = x if x >= 0 + = slope * x otherwise + + Implemented as where(x >= 0, x, slope * x) so it stays correct for any + negative_slope (including values > 1), matching eager PyTorch. 
+ """ + args = P.args(n) + require_args(args, 1, 2, "aten.leaky_relu") + require_kwargs(P.kwargs(n), set(), "aten.leaky_relu") + + x = args[0] + negative_slope = _LEAKY_RELU_DEFAULT_NEGATIVE_SLOPE + if len(args) > 1 and args[1] is not None: + negative_slope = float(args[1]) + + x_meta = n.args[0].meta.get("val") + if x_meta is None: + raise ValueError("Input tensor metadata not found for leaky_relu") + dtype = x_meta.dtype + + zero_slot = emit_lifted_constant(P, 0.0, dtype) + slope_slot = emit_lifted_constant(P, negative_slope, dtype) + + _, cond_slot = P.make_tmp_slot() + P.emit( + GreaterEqualNode( + a=P.slot_to_tid(x), + b=P.slot_to_tid(zero_slot), + out=P.slot_to_tid(cond_slot), + ) + ) + + _, scaled_slot = P.make_tmp_slot() + P.emit( + MultiplyNode( + a=P.slot_to_tid(slope_slot), + b=P.slot_to_tid(x), + out=P.slot_to_tid(scaled_slot), + ) + ) + + out = P.make_or_get_slot(n) + P.emit( + WhereNode( + condition=P.slot_to_tid(cond_slot), + x=P.slot_to_tid(x), + y=P.slot_to_tid(scaled_slot), + out=P.slot_to_tid(out), + ) + ) + return out + + @REGISTRY.register(target=[torch.ops.aten._log_softmax.default]) def _log_softmax_handler(P: MLXProgramBuilder, n: Node) -> Slot: """Handle aten._log_softmax.default - log of softmax. diff --git a/backends/mlx/runtime/MLXBackend.cpp b/backends/mlx/runtime/MLXBackend.cpp index 5bd3bf263d1..0dbdec22436 100644 --- a/backends/mlx/runtime/MLXBackend.cpp +++ b/backends/mlx/runtime/MLXBackend.cpp @@ -9,6 +9,7 @@ #include "MLXExecutor.h" #include "MLXInterpreter.h" #include "MLXLoader.h" +#include "mlx_mutable_state.h" #include #include @@ -277,6 +278,12 @@ class MLXBackend final : public ::executorch::runtime::BackendInterface { eval(handle->constants.tensors); } + // Register the handle with the per-session mutable-state manager. This is + // a no-op unless a multi-session owner is active for this load (see + // mlx_mutable_state.h); single-session execution is unaffected. 
+ mutable_state_note_handle( + handle, &handle->program, &handle->mutable_buffers); + } catch (const std::exception& e) { ET_LOG(Error, "Failed to load MLX program: %s", e.what()); handle->~MLXHandle(); @@ -366,6 +373,14 @@ class MLXBackend final : public ::executorch::runtime::BackendInterface { } } + // Select the active session's mutable buffers (KV cache, recurrent/conv + // state) before running. No-op for single-session handles; weights stay + // shared via ExecutionState::constants. + if (Error rebind_err = mutable_state_rebind_for_execute(h, h->state); + rebind_err != Error::Ok) { + return rebind_err; + } + // Run the MLX program (builds lazy computation graph) h->interpreter.run(program, h->state, h->stream); @@ -431,6 +446,7 @@ class MLXBackend final : public ::executorch::runtime::BackendInterface { void destroy(DelegateHandle* handle) const override { std::lock_guard lock(mlx_global_mutex()); if (handle != nullptr) { + mutable_state_forget_handle(handle); auto* mlx_handle = static_cast(handle); mlx_handle->~MLXHandle(); } diff --git a/backends/mlx/runtime/mlx_mutable_state.cpp b/backends/mlx/runtime/mlx_mutable_state.cpp new file mode 100644 index 00000000000..2f00d917136 --- /dev/null +++ b/backends/mlx/runtime/mlx_mutable_state.cpp @@ -0,0 +1,339 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include "mlx_mutable_state.h" + +#include "MLXExecutor.h" +#include "MLXLoader.h" + +#include + +#include +#include + +namespace executorch { +namespace backends { +namespace mlx { + +using ::executorch::runtime::Error; +using ::executorch::runtime::Result; + +namespace { + +struct HandleInfo { + const MLXProgram* program{nullptr}; + MutableBufferData* default_buffers{nullptr}; +}; + +struct Context { + // Delegate handles associated with this loaded program (one per loaded + // method). Keyed by opaque MLXHandle pointer. + std::unordered_map handles; + // Per-session mutable buffers: token -> (handle -> buffers). Allocated lazily + // on first execute for a given (session, handle). + std::unordered_map> + sessions; + int next_token{0}; + // Sticky setup failure. Once set (e.g. by nested load scopes), available(), + // validate_coverage(), create_session(), and rebind fail consistently. + Error build_error{Error::Ok}; +}; + +// Process-global registry. MLX serializes execution via its own global mutex +// and the engine serializes per session, but the registry itself is guarded +// here so context/session lifecycle calls from other threads are safe. +std::mutex& registry_mutex() { + static std::mutex m; + return m; +} + +std::unordered_map& contexts() { + static std::unordered_map c; + return c; +} + +std::unordered_map& handle_ctx() { + static std::unordered_map m; + return m; +} + +MutableStateContext g_next_ctx = 1; // 0 is reserved as invalid. + +// Thread-local load scope and active (ctx, session) selection. 
+thread_local MutableStateContext tl_loading_ctx = kInvalidMutableContext; +thread_local MutableStateContext tl_active_ctx = kInvalidMutableContext; +thread_local int tl_active_token = kNoMutableSession; + +} // namespace + +namespace detail { + +MutableStateContext mutable_state_create_context() { + std::lock_guard g(registry_mutex()); + MutableStateContext ctx = g_next_ctx++; + if (ctx == kInvalidMutableContext) { + ctx = g_next_ctx++; + } + contexts()[ctx]; + return ctx; +} + +void mutable_state_destroy_context(MutableStateContext ctx) { + std::lock_guard g(registry_mutex()); + auto it = contexts().find(ctx); + if (it == contexts().end()) { + return; + } + for (const auto& kv : it->second.handles) { + handle_ctx().erase(kv.first); + } + contexts().erase(it); +} + +void mutable_state_begin_load(MutableStateContext ctx) { + if (tl_loading_ctx != kInvalidMutableContext) { + // Nested load scopes would silently overwrite the thread-local association. + // Mark both the already-active and the new context invalid instead. 
+ std::lock_guard g(registry_mutex()); + auto active = contexts().find(tl_loading_ctx); + if (active != contexts().end()) { + active->second.build_error = Error::InvalidState; + } + auto nested = contexts().find(ctx); + if (nested != contexts().end()) { + nested->second.build_error = Error::InvalidState; + } + ET_LOG(Error, "mutable_state: nested load scopes are not supported"); + tl_loading_ctx = kInvalidMutableContext; + return; + } + tl_loading_ctx = ctx; +} + +void mutable_state_end_load() { + tl_loading_ctx = kInvalidMutableContext; +} + +bool mutable_state_available(MutableStateContext ctx) { + if (ctx == kInvalidMutableContext) { + return false; + } + std::lock_guard g(registry_mutex()); + auto it = contexts().find(ctx); + return it != contexts().end() && it->second.build_error == Error::Ok && + !it->second.handles.empty(); +} + +int64_t mutable_state_bytes_per_session(MutableStateContext ctx) { + std::lock_guard g(registry_mutex()); + auto it = contexts().find(ctx); + if (it == contexts().end()) { + return 0; + } + int64_t total = 0; + for (const auto& kv : it->second.handles) { + const MutableBufferData* bufs = kv.second.default_buffers; + if (bufs == nullptr) { + continue; + } + for (const auto& t : bufs->tensors) { + if (t.has_value()) { + total += static_cast(t->nbytes()); + } + } + } + return total; +} + +Error mutable_state_validate_coverage(MutableStateContext ctx) { + std::lock_guard g(registry_mutex()); + auto it = contexts().find(ctx); + if (it == contexts().end()) { + return Error::InvalidArgument; + } + if (it->second.build_error != Error::Ok) { + return it->second.build_error; + } + // MLX clones all mutable buffers by tid; there is no FQN coverage to verify. 
+ return Error::Ok; +} + +Result mutable_state_create_session(MutableStateContext ctx) { + std::lock_guard g(registry_mutex()); + auto it = contexts().find(ctx); + if (it == contexts().end()) { + ET_LOG(Error, "mutable_state_create_session: unknown context %d", ctx); + return Error::InvalidState; + } + Context& c = it->second; + if (c.build_error != Error::Ok) { + return c.build_error; + } + if (c.handles.empty()) { + ET_LOG( + Error, "mutable_state_create_session: no backend handles registered"); + return Error::NotSupported; + } + int token = c.next_token++; + // Per-handle buffers are allocated lazily on first execute. + c.sessions[token]; + return token; +} + +void mutable_state_destroy_session(MutableStateContext ctx, int token) { + std::lock_guard g(registry_mutex()); + auto it = contexts().find(ctx); + if (it == contexts().end()) { + return; + } + it->second.sessions.erase(token); +} + +void mutable_state_set_active(MutableStateContext ctx, int token) { + tl_active_ctx = ctx; + tl_active_token = token; +} + +} // namespace detail + +void mutable_state_note_handle( + const void* handle, + const MLXProgram* program, + MutableBufferData* default_buffers) { + if (tl_loading_ctx == kInvalidMutableContext) { + return; // No multi-session owner active during this load: single-session. 
+ } + std::lock_guard g(registry_mutex()); + auto it = contexts().find(tl_loading_ctx); + if (it == contexts().end()) { + return; + } + it->second.handles[handle] = HandleInfo{program, default_buffers}; + handle_ctx()[handle] = tl_loading_ctx; +} + +void mutable_state_forget_handle(const void* handle) { + std::lock_guard g(registry_mutex()); + auto hit = handle_ctx().find(handle); + if (hit == handle_ctx().end()) { + return; + } + auto cit = contexts().find(hit->second); + if (cit != contexts().end()) { + cit->second.handles.erase(handle); + for (auto& session : cit->second.sessions) { + session.second.erase(handle); + } + } + handle_ctx().erase(hit); +} + +Error mutable_state_rebind_for_execute( + const void* handle, + ExecutionState& state) { + std::lock_guard g(registry_mutex()); + auto hit = handle_ctx().find(handle); + if (hit == handle_ctx().end()) { + if (tl_active_token != kNoMutableSession) { + ET_LOG( + Error, + "mutable_state_rebind_for_execute: active session set but handle has " + "no mutable-state context"); + return Error::Internal; + } + // Handle was not loaded under a multi-session owner: keep default buffers. + return Error::Ok; + } + auto cit = contexts().find(hit->second); + if (cit == contexts().end()) { + return Error::Ok; + } + Context& ctx = cit->second; + if (ctx.build_error != Error::Ok) { + return ctx.build_error; + } + // Invariant: a handle present in handle_ctx() is present in ctx.handles. Look + // it up explicitly (not operator[]) so a broken invariant fails loudly + // instead of inserting a {nullptr, nullptr} entry that later null-derefs in + // load_mutable_buffers(*info.program, ...). 
+ auto info_it = ctx.handles.find(handle); + if (info_it == ctx.handles.end()) { + ET_LOG( + Error, + "mutable_state_rebind_for_execute: handle has a context but no " + "registered HandleInfo (invariant broken)"); + return Error::Internal; + } + HandleInfo& info = info_it->second; + + const bool has_active_session = tl_active_token != kNoMutableSession; + const bool active_for_this_ctx = + has_active_session && tl_active_ctx == hit->second; + + // A session is active, but for a different context than the one this handle + // belongs to. Falling back to default buffers would silently execute with the + // wrong model/session state, so refuse instead. + if (has_active_session && !active_for_this_ctx) { + ET_LOG( + Error, + "mutable_state_rebind_for_execute: active context mismatch (a session " + "is active for a different loaded program than the one executing)"); + return Error::Internal; + } + + if (!active_for_this_ctx) { + // No session selected. Refuse if sessions exist (running against the + // default buffers here would not isolate state from created sessions). + if (!ctx.sessions.empty()) { + ET_LOG( + Error, + "mutable_state_rebind_for_execute: no active session selected but " + "sessions exist for this program"); + return Error::InvalidState; + } + state.mutable_buffers = info.default_buffers; + return Error::Ok; + } + + auto sit = ctx.sessions.find(tl_active_token); + if (sit == ctx.sessions.end()) { + ET_LOG( + Error, + "mutable_state_rebind_for_execute: unknown session token %d", + tl_active_token); + return Error::InvalidState; + } + + auto& per_handle = sit->second; + auto bit = per_handle.find(handle); + if (bit == per_handle.end()) { + // First execute for this (session, handle): allocate fresh zeroed buffers. + // Constants/weights stay shared (ExecutionState::constants is untouched); + // only the mutable buffers are per-session. 
+ MutableBufferData buffers; + try { + load_mutable_buffers(*info.program, buffers); + } catch (const std::exception& e) { + ET_LOG( + Error, + "mutable_state_rebind_for_execute: failed to allocate session " + "buffers: %s", + e.what()); + return Error::MemoryAllocationFailed; + } + bit = per_handle.emplace(handle, std::move(buffers)).first; + } + // unordered_map keeps element pointers stable across rehash, so this remains + // valid for the duration of the execute. + state.mutable_buffers = &bit->second; + return Error::Ok; +} + +} // namespace mlx +} // namespace backends +} // namespace executorch diff --git a/backends/mlx/runtime/mlx_mutable_state.h b/backends/mlx/runtime/mlx_mutable_state.h new file mode 100644 index 00000000000..84420812360 --- /dev/null +++ b/backends/mlx/runtime/mlx_mutable_state.h @@ -0,0 +1,204 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +#include +#include +#include + +// MLX-private support for running one loaded MLX program with multiple isolated +// instances of its mutable buffers (KV cache, conv/recurrent state). Callers +// create sessions and execute with one active session selected. +// +// Unlike the CUDA backend, the MLX runtime owns mutable buffers directly in a +// swappable container (ExecutionState::mutable_buffers is a +// MutableBufferData*), so per-session isolation is a pointer swap to a fresh +// MutableBufferData — no FQN registration / constant-repoint hook is needed. + +namespace executorch { +namespace backends { +namespace mlx { + +// Forward declarations (defined in MLXLoader.h / MLXExecutor.h). +struct MLXProgram; +struct MutableBufferData; +struct ExecutionState; + +// Opaque per-loaded-program context id (0 = invalid). 
+using MutableStateContext = int; +constexpr MutableStateContext kInvalidMutableContext = 0; + +// Sentinel for execution without per-session rebinding. +constexpr int kNoMutableSession = -1; + +// Implementation entry points. Callers should use MutableStateContextOwner. +namespace detail { + +MutableStateContext mutable_state_create_context(); +void mutable_state_destroy_context(MutableStateContext ctx); +void mutable_state_begin_load(MutableStateContext ctx); +void mutable_state_end_load(); +bool mutable_state_available(MutableStateContext ctx); +int64_t mutable_state_bytes_per_session(MutableStateContext ctx); +::executorch::runtime::Error mutable_state_validate_coverage( + MutableStateContext ctx); +::executorch::runtime::Result mutable_state_create_session( + MutableStateContext ctx); +void mutable_state_destroy_session(MutableStateContext ctx, int token); +void mutable_state_set_active(MutableStateContext ctx, int token); + +} // namespace detail + +// Caller-facing owner for one mutable-state context. Mirrors the CUDA backend's +// MutableStateContextOwner so the example engine can use a symmetric API. 
+class ET_EXPERIMENTAL MutableStateContextOwner final { + class LoadScope final { + public: + explicit LoadScope(MutableStateContext ctx) { + detail::mutable_state_begin_load(ctx); + } + + ~LoadScope() { + detail::mutable_state_end_load(); + } + + LoadScope(const LoadScope&) = delete; + LoadScope& operator=(const LoadScope&) = delete; + }; + + class ActiveSessionScope final { + public: + ActiveSessionScope(MutableStateContext ctx, int token) { + detail::mutable_state_set_active(ctx, token); + } + + ~ActiveSessionScope() { + detail::mutable_state_set_active( + kInvalidMutableContext, kNoMutableSession); + } + + ActiveSessionScope(const ActiveSessionScope&) = delete; + ActiveSessionScope& operator=(const ActiveSessionScope&) = delete; + }; + + public: + MutableStateContextOwner() : ctx_(detail::mutable_state_create_context()) {} + + ~MutableStateContextOwner() { + destroy(); + } + + MutableStateContextOwner(const MutableStateContextOwner&) = delete; + MutableStateContextOwner& operator=(const MutableStateContextOwner&) = delete; + + MutableStateContextOwner(MutableStateContextOwner&& other) noexcept + : ctx_(std::exchange(other.ctx_, kInvalidMutableContext)) {} + + MutableStateContextOwner& operator=( + MutableStateContextOwner&& other) noexcept { + if (this != &other) { + destroy(); + ctx_ = std::exchange(other.ctx_, kInvalidMutableContext); + } + return *this; + } + + MutableStateContext get() const { + return ctx_; + } + + explicit operator bool() const { + return ctx_ != kInvalidMutableContext; + } + + // Associates delegate handles created by `fn` with this context. + template + auto with_load_scope(Fn&& fn) const -> decltype(std::forward(fn)()) { + LoadScope scope(ctx_); + return std::forward(fn)(); + } + + // Selects this context/session while `fn` executes. The caller is responsible + // for serializing execution that touches the same loaded program. 
+ // + // Thread-safety contract: destroy_session()/forget_handle() only take the + // registry mutex, while rebind (under with_active_session) hands execute a + // raw pointer into Context::sessions that is dereferenced after the lock is + // released. The caller must therefore guarantee a session is never destroyed + // while it is the active session mid-execute (the engine upholds this: a + // session's buffers are freed only when its owning LLMSession drops, never + // concurrently with its own execute). Destroying *other* sessions + // concurrently is safe — unordered_map keeps element pointers stable across + // rehash. + template + auto with_active_session(int token, Fn&& fn) const + -> decltype(std::forward(fn)()) { + ActiveSessionScope scope(ctx_, token); + return std::forward(fn)(); + } + + // True only once this context has been associated with at least one loaded + // MLX backend handle, so it can create isolated mutable-buffer sessions. + bool available() const { + return detail::mutable_state_available(ctx_); + } + + int64_t bytes_per_session() const { + return detail::mutable_state_bytes_per_session(ctx_); + } + + ::executorch::runtime::Error validate_coverage() const { + return detail::mutable_state_validate_coverage(ctx_); + } + + // Creates an isolated mutable-buffer session for this context. + // Fails if no loaded MLX backend handle has been associated with the context. + ET_NODISCARD ::executorch::runtime::Result create_session() const { + return detail::mutable_state_create_session(ctx_); + } + + void destroy_session(int token) const { + detail::mutable_state_destroy_session(ctx_, token); + } + + private: + void destroy() { + if (ctx_ != kInvalidMutableContext) { + detail::mutable_state_destroy_context(ctx_); + ctx_ = kInvalidMutableContext; + } + } + + MutableStateContext ctx_ = kInvalidMutableContext; +}; + +// --- MLXBackend hooks -------------------------------------------------------- +// +// Called from MLXBackend init/execute/destroy. 
`handle` is an opaque key (the +// MLXHandle pointer). `program` and `default_buffers` are the handle's own +// program and (init-time) mutable buffers; the manager swaps in per-session +// buffers (or restores the default) by re-pointing `state.mutable_buffers`. + +void mutable_state_note_handle( + const void* handle, + const MLXProgram* program, + MutableBufferData* default_buffers); + +void mutable_state_forget_handle(const void* handle); + +::executorch::runtime::Error mutable_state_rebind_for_execute( + const void* handle, + ExecutionState& state); + +} // namespace mlx +} // namespace backends +} // namespace executorch diff --git a/backends/mlx/test/CMakeLists.txt b/backends/mlx/test/CMakeLists.txt index 39024639d1d..2d494652138 100644 --- a/backends/mlx/test/CMakeLists.txt +++ b/backends/mlx/test/CMakeLists.txt @@ -69,3 +69,22 @@ if(EXECUTORCH_MLX_ENABLE_SANITIZERS) multi_thread_test_runner PRIVATE ${_mlx_sanitizer_link_options} ) endif() + +# Per-session mutable-state manager unit test (no model/tokenizer needed). +add_executable(mlx_mutable_state_test mlx_mutable_state_test.cpp) +target_include_directories( + mlx_mutable_state_test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../runtime +) +target_link_libraries( + mlx_mutable_state_test PRIVATE mlxdelegate mlx_schema mlx executorch_core +) +if(EXECUTORCH_MLX_ENABLE_SANITIZERS) + target_compile_options( + mlx_mutable_state_test PRIVATE -fsanitize=address,undefined + -fno-omit-frame-pointer + ) + target_link_options( + mlx_mutable_state_test PRIVATE ${_mlx_sanitizer_link_options} + ) +endif() +add_test(NAME mlx_mutable_state COMMAND mlx_mutable_state_test) diff --git a/backends/mlx/test/mlx_mutable_state_test.cpp b/backends/mlx/test/mlx_mutable_state_test.cpp new file mode 100644 index 00000000000..99a646701ef --- /dev/null +++ b/backends/mlx/test/mlx_mutable_state_test.cpp @@ -0,0 +1,132 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Unit test for the MLX per-session mutable-state manager +// (backends/mlx/runtime/mlx_mutable_state.{h,cpp}). +// +// Verifies that two sessions created on one loaded program get independent +// mutable buffers: writing into session A's buffer does not leak into session +// B's, and A's value persists across a rebind to B and back. This is the MLX +// analogue of the CUDA "no-bleed" guarantee, exercised directly on the manager +// (no model or tokenizer needed). + +#include "MLXExecutor.h" +#include "MLXLoader.h" +#include "mlx_mutable_state.h" + +#include + +#include + +using namespace ::executorch::backends::mlx; + +namespace { + +int g_failures = 0; + +#define CHECK(cond) \ + do { \ + if (!(cond)) { \ + std::printf("FAIL: %s (line %d)\n", #cond, __LINE__); \ + ++g_failures; \ + } \ + } while (0) + +// Build a minimal program with a single 1-element float mutable buffer at tid +// 0. +MLXProgram make_program() { + MLXProgram program; + program.num_mutable_buffer_tensors = 1; + program.mutable_buffer_map.push_back(SlotVariant{0, SlotType::TensorSlot}); + TensorMeta meta; + meta.shape.push_back(ShapeDim{/*value=*/1}); + meta.scalar_type = ScalarType::Float; + program.tensor_meta.resize(1); + program.tensor_meta[0] = meta; + return program; +} + +float read0(const MutableBufferData& bufs) { + auto arr = bufs.get(Tid{0}); + ::mlx::core::eval(arr); + return arr.item(); +} + +} // namespace + +int main() { + MLXProgram program = make_program(); + + // Handle's default (init-time) mutable buffers. + MutableBufferData default_bufs; + load_mutable_buffers(program, default_bufs); + + int dummy = 0; + const void* handle = &dummy; + + MutableStateContextOwner owner; + CHECK(static_cast(owner)); + + // Associate the handle with the context (as MLXBackend::init would). 
+ owner.with_load_scope( + [&]() { mutable_state_note_handle(handle, &program, &default_bufs); }); + + CHECK(owner.available()); + CHECK(owner.bytes_per_session() == static_cast(sizeof(float))); + + auto tokA = owner.create_session(); + auto tokB = owner.create_session(); + CHECK(tokA.ok()); + CHECK(tokB.ok()); + CHECK(tokA.get() != tokB.get()); + + ExecutionState state; + + // Session A: rebind, then write a marker (7.0) into its buffer. + owner.with_active_session(tokA.get(), [&]() { + auto err = mutable_state_rebind_for_execute(handle, state); + CHECK(err == ::executorch::runtime::Error::Ok); + state.mutable_buffers->set( + Tid{0}, ::mlx::core::full({1}, 7.0f, ::mlx::core::float32)); + return err; + }); + + // Session B: a fresh rebind must see zeros, not A's marker. + owner.with_active_session(tokB.get(), [&]() { + auto err = mutable_state_rebind_for_execute(handle, state); + CHECK(err == ::executorch::runtime::Error::Ok); + CHECK(read0(*state.mutable_buffers) == 0.0f); + return err; + }); + + // Back to session A: the marker must persist (isolation, no bleed). + owner.with_active_session(tokA.get(), [&]() { + auto err = mutable_state_rebind_for_execute(handle, state); + CHECK(err == ::executorch::runtime::Error::Ok); + CHECK(read0(*state.mutable_buffers) == 7.0f); + return err; + }); + + // With sessions present, executing without an active session is refused + // (prevents running against unmanaged/shared state). 
+ { + auto err = mutable_state_rebind_for_execute(handle, state); + CHECK(err == ::executorch::runtime::Error::InvalidState); + } + + owner.destroy_session(tokA.get()); + owner.destroy_session(tokB.get()); + mutable_state_forget_handle(handle); + + if (g_failures == 0) { + std::printf("OK: mlx_mutable_state isolation test passed\n"); + return 0; + } + std::printf("FAILED: %d checks\n", g_failures); + return 1; +} diff --git a/backends/mlx/test/test_ops.py b/backends/mlx/test/test_ops.py index 8f52116f6b8..e96c8075903 100644 --- a/backends/mlx/test/test_ops.py +++ b/backends/mlx/test/test_ops.py @@ -405,6 +405,60 @@ def create_inputs(self) -> Tuple[torch.Tensor, ...]: return (x,) +class LeakyReLUModel(nn.Module): + """Model that applies leaky_relu with an optional negative slope.""" + + def __init__(self, negative_slope: Optional[float] = 0.01): + super().__init__() + self.negative_slope = negative_slope + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if self.negative_slope is None: + return torch.nn.functional.leaky_relu(x) + return torch.nn.functional.leaky_relu(x, negative_slope=self.negative_slope) + + +@register_test +class LeakyReLUTest(OpTestCase): + """Test case for leaky_relu activation with various negative slopes.""" + + name = "leaky_relu" + rtol = 1e-5 + atol = 1e-5 + + def __init__( + self, + shape: Tuple[int, ...] 
= (2, 3, 4), + negative_slope: Optional[float] = 0.01, + ): + self.shape = shape + self.negative_slope = negative_slope + shape_str = "x".join(str(s) for s in shape) + slope_str = "default" if negative_slope is None else f"slope{negative_slope}" + self.name = f"leaky_relu_{slope_str}_{shape_str}" + + @classmethod + def get_test_configs(cls) -> List["LeakyReLUTest"]: + return [ + cls(shape=(2, 3, 4), negative_slope=0.01), + cls(shape=(2, 3, 4), negative_slope=None), + cls(shape=(4, 8), negative_slope=0.1), + cls(shape=(10,), negative_slope=0.2), + cls(shape=(10,), negative_slope=1.5), + cls(shape=(2, 8, 16), negative_slope=0.01), + ] + + def create_model(self) -> nn.Module: + return LeakyReLUModel(self.negative_slope) + + def create_inputs(self) -> Tuple[torch.Tensor, ...]: + numel = 1 + for size in self.shape: + numel *= size + x = torch.linspace(-4.0, 4.0, steps=numel).reshape(self.shape) + return (x,) + + class GELUModel(nn.Module): """Simple model using GELU activation.""" diff --git a/backends/nxp/backend/edge_program_converter.py b/backends/nxp/backend/edge_program_converter.py index ff8cbb660cb..9df8290e85d 100644 --- a/backends/nxp/backend/edge_program_converter.py +++ b/backends/nxp/backend/edge_program_converter.py @@ -74,13 +74,16 @@ class EdgeProgramToIRConverter: _default_target_spec = NeutronTargetSpec("imxrt700") _default_delegation_options = CustomDelegationOptions() + def __init__(self): + self.edge_to_tflite_map = {} + def convert_program( self, edge_program: ExportedProgram, conversion_config: ConversionConfig = _default_conversion_config, neutron_target_spec: NeutronTargetSpec = _default_target_spec, custom_delegation_options: CustomDelegationOptions = _default_delegation_options, - ) -> tuple[bytes, dict[str, dict[str, DataFormat]]]: + ) -> tuple[bytes, dict[str, dict[str, DataFormat]], dict[int, tuple[int, ...]]]: """ Convert ExportedProgram in Edge dialect to IR (TFLite flatbuffers) as bytes. 
@@ -88,8 +91,11 @@ def convert_program( :param conversion_config: ConversionConfig instance. :param neutron_target_spec: Object for querying the target platform to retrieve its properties. :param custom_delegation_options: Custom user options which affect node delegation. - :return: TFLite flatbuffers as bytes. + :return: TFLite flatbuffers as bytes, I/O formats, and edge-to-tflite mapping. """ + # Reset the edge to tflite map for each conversion + self.edge_to_tflite_map = {} + parameters_mapping = self.map_inputs_to_parameters(edge_program) dim_order_map = self.map_nodes_to_dim_order(edge_program) @@ -113,6 +119,9 @@ def convert_program( # Apply optimizations and finalize the model. internal_tflite_model = cc.tflite_builder.finish() + # Get the final edge to tflite mapping after optimization + self.edge_to_tflite_map = cc.tflite_builder.edge_to_tflite_map + # Extract the formats of the model's inputs and outputs. io_formats = cc.tflite_builder.get_io_formats(edge_program.graph_signature) @@ -120,7 +129,7 @@ def convert_program( flatbuffers_builder = flatbuffers.Builder() internal_tflite_model.gen_tflite(flatbuffers_builder) - return bytes(flatbuffers_builder.Output()), io_formats + return bytes(flatbuffers_builder.Output()), io_formats, self.edge_to_tflite_map @staticmethod def append_placeholders_and_tensors(nodes: list[Node], context: ConversionContext): @@ -162,7 +171,6 @@ def _process_nodes(self, nodes: list[Node], conversion_context: ConversionContex exir_ops.edge.quantized_decomposed.dequantize_per_channel.default, exir_ops.edge.quantized_decomposed.quantize_per_tensor.default, ] - for node in nodes: if node.op == "call_function": if node.target in qdq_related_functions and "cluster" in node.meta: @@ -174,7 +182,37 @@ def _process_nodes(self, nodes: list[Node], conversion_context: ConversionContex # The node was already processed alongside the Q/DQ ops. 
pass elif node.target in functions_converters: + # Get TFLite op count BEFORE conversion + tflite_op_count_before = len( + conversion_context.tflite_builder.get_operators().vector + ) + # Convert the node functions_converters[node.target](conversion_context).convert(node) + # Get TFLite op count AFTER conversion + tflite_op_count_after = len( + conversion_context.tflite_builder.get_operators().vector + ) + + # Track the mapping - store edge debug handle in operators. + # Get the edge debug handle so it can be associated with newly created operators. + edge_debug_handle = node.meta.get("debug_handle", None) + if ( + edge_debug_handle is not None + and tflite_op_count_after > tflite_op_count_before + ): + operators = ( + conversion_context.tflite_builder.get_operators().vector + ) + # Node converters append new operators to the TFLite builder. + # Only operators added during this conversion step (from "before" to "after") + # are tagged with the current edge_debug_handle. + for i in range(tflite_op_count_before, tflite_op_count_after): + # Store edge debug handle in operator's temporary attribute + operators[i].tmp_edge_debug_handle = edge_debug_handle + logger.d( + f"Tagged TFLite ops {list(range(tflite_op_count_before, tflite_op_count_after))} with edge debug_handle={edge_debug_handle} for node '{node.name}'" + ) + else: logger.e( logger.Code.NOT_IMPLEMENTED, diff --git a/backends/nxp/backend/ir/converter/builder/model_builder.py b/backends/nxp/backend/ir/converter/builder/model_builder.py index f97a194ce87..41820c3ab61 100755 --- a/backends/nxp/backend/ir/converter/builder/model_builder.py +++ b/backends/nxp/backend/ir/converter/builder/model_builder.py @@ -85,6 +85,10 @@ class ModelBuilder: conversion_config: ConversionConfig + edge_to_tflite_map: dict[ + int, tuple[int, ...] 
def _create_edge_to_tflite_mapping(self):
    """Rebuild ``self.edge_to_tflite_map`` from the current operator list.

    Every operator returned by ``self.get_operators().vector`` that carries a
    non-``None`` ``tmp_edge_debug_handle`` contributes its position in that
    list to the entry of its debug handle. Call this after the model
    optimizations have run so the indices match the emitted TFLite model.
    """
    indices_by_handle = {}
    for op_index, operator in enumerate(self.get_operators().vector):
        handle = getattr(operator, "tmp_edge_debug_handle", None)
        if handle is None:
            # Operator was not tagged with an edge debug handle.
            continue
        indices_by_handle.setdefault(handle, []).append(op_index)

    # Freeze the per-handle index lists into tuples.
    self.edge_to_tflite_map = {
        handle: tuple(indices) for handle, indices in indices_by_handle.items()
    }
    logger.i(
        f"\nFinal edge_to_tflite_map after optimization: {self.edge_to_tflite_map}"
    )
tmp_added_extra: bool + # Edge program debug handle for mapping edge nodes to TFLite operators + tmp_edge_debug_handle: Optional[int] + def __init__( self, inputs: OperatorInputs = None, @@ -541,6 +544,8 @@ def __init__( self.tmp_version = 1 self.tmp_added_extra = False + self.tmp_edge_debug_handle = None + def uses_per_channel_quantization(self) -> bool: """Determine if this operator uses per-channel quantization.""" for tensor in itertools.chain(self.tmp_inputs, self.tmp_outputs): diff --git a/backends/nxp/backend/neutron_converter_manager.py b/backends/nxp/backend/neutron_converter_manager.py index 0abee0cdc86..92b4e25a5de 100644 --- a/backends/nxp/backend/neutron_converter_manager.py +++ b/backends/nxp/backend/neutron_converter_manager.py @@ -25,6 +25,15 @@ def _build_compilation_context(compilation_opts): cctx.compilationOpts.dumpKernelSelectionCode = compilation_opts[ "dumpKernelSelectionCode" ] + if ( + hasattr(cctx.compilationOpts, "useProfiling") + and compilation_opts["useProfiling"] + ): + cctx.compilationOpts.useProfiling = compilation_opts["useProfiling"] + cctx.compilationOpts.dumpAfterImport = "console" + cctx.compilationOpts.dumpAfterGenerate = "console" + cctx.compilationOpts.verbose = compilation_opts["useProfiling"] + return cctx @@ -81,6 +90,7 @@ def convert( target: str, delegation_tag: str, fetch_constants_to_sram: bool = False, + use_profiling: bool = False, ) -> bytes: """ Call Neutron Converter. @@ -89,6 +99,7 @@ def convert( :param target: The target platform. :param delegation_tag: The delegation tag of model partition. :param fetch_constants_to_sram: Add microcode that fetches weights from external memory. + :param use_profiling: Use profiling for neutron delegated model. This allows running models which do not fit into SRAM. Applies to Neutron-C only (microcontrollers). :return: TFLite model with Neutron microcode as bytes. 
# Copyright 2026 NXP
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
import logging
import re
from dataclasses import dataclass

# One operator record of a graph dump. The named groups are read below via
# ``.group("type")``, ``.group("inputs")``, ``.group("outputs")`` and
# ``.group("location")``.
# example: Type: CONV_2D
#          Inputs:
#              [0]: quantized_decomposed_quantize_per_tensor_default_4
#              [1]: quantized_decomposed_dequantize_per_channel_default_2
#          Outputs:
#              [0]: quantized_decomposed_quantize_per_tensor_default_5
#          Location: 4
PATTERN_NODE = (
    r"Type:\s+(?P<type>\w+)\s+"
    r"Inputs:(?P<inputs>[\s\S]*?)"
    r"Outputs:(?P<outputs>[\s\S]*?)"
    r"Location:\s+(?P<location>\d+)"
)
# The pattern is very similar to the operator pattern. It is applied to text
# that was split on "Name: subgraph_", so the subgraph number is at the start.
PATTERN_SUBGRAPH = (
    r"^(?P<num>\d+)\s*"
    r"Inputs:(?P<inputs>[\s\S]*?)"
    r"Outputs:(?P<outputs>[\s\S]*?)"
    r"Tensors:"
)
# example: [0]: quantized_decomposed_quantize_per_tensor_default_4
PATTERN_IO_TENSOR_NAME = r"\[\d+\]:\s+(?P<name>[\S]+)"
# example: Statistics for NeutronGraph "subgraph_195":
PATTERN_GRAPH = r"Statistics for NeutronGraph \"subgraph_(?P<num>\d+)\":"
# example: NeutronOperator "subgraph_001":
#            Operators:
#              PAD
#              CONV_2D
#            Kernels:
#              Pad
#              Conv2DStandardV2
#          NeutronOperator "subgraph_002":
# NOTE(review): the pattern is used without re.MULTILINE, so the "^$"
# alternative can only match an empty string - confirm whether a blank-line
# terminator was intended.
PATTERN_VERBOSE_KERNELS = (
    r"\"subgraph_(?P<subgraph>\d+)\"\:\s*"
    r"Operators:[\s\S]*?"
    r"Kernels:\s*(?P<kernels>[\s\S]*?)"
    r"\s*(NeutronOperator|^$|=)"
)
# example: NeutronGraph "subgraph_074":
# NOTE(review): this pattern is not referenced in this module; the group names
# "subgraph" and "operators" are descriptive choices - confirm against callers.
PATTERN_VERBOSE_GRAPH = (
    r"NeutronGraph\s*\"subgraph_(?P<subgraph>\d+)\":(?P<operators>[\s\S]*?)\s*(^$|=)"
)
# Two graphs are expected in the input log: original and converted.
EXPECTED_GRAPHS = 2
# List of single-input nodes that shouldn't be mapped on the same TFLite node.
SINGLE_INPUT_NODES = [
    "ABS",
    "AVERAGE_POOL_2D",
    "CAST",
    "EXP",
    "HARD_SWISH",
    "LEAKY_RELU",
    "LOG",
    "LOGISTIC",
    "MAX_POOL_2D",
    "QUANTIZE",
    "RSQRT",
    "TANH",
]


@dataclass
class Node:
    name: str  # Name (operator type) of the node.
    inputs: list[str]  # List of node input tensor names.
    outputs: list[str]  # List of node output tensor names.
    location: int  # Location in graph/subgraph.


@dataclass
class SubgraphInfo:
    num: int  # Subgraph number.
    location: int  # Location in neutron graph (-1 until it is resolved).
    inputs: list[str]  # List of subgraph input tensor names.
    outputs: list[str]  # List of subgraph output tensor names.
    kernels: int  # Number of neutron kernels in neutron subgraph.
    nodes: list[Node]  # List of tflite nodes in neutron subgraph.


def get_tensors_name(tensors: str) -> list[str]:
    """Split input string with tensor names into list of names."""
    return [m.group("name") for m in re.finditer(PATTERN_IO_TENSOR_NAME, tensors)]


def _parse_nodes(dump: str) -> list[Node]:
    """Return every operator record found in *dump* as a ``Node``."""
    return [
        Node(
            matched_operator.group("type"),
            get_tensors_name(matched_operator.group("inputs")),
            get_tensors_name(matched_operator.group("outputs")),
            int(matched_operator.group("location")),
        )
        for matched_operator in re.finditer(PATTERN_NODE, dump)
    ]


class NeutronMap:
    """Mapping between Neutron, TFLite, and Edge operators based on the Neutron converter log.

    Parses the Neutron converter log to extract information about TFLite nodes and Neutron subgraphs.
    Maps TFLite operators to corresponding Neutron operators.
    Maps Edge operators to Neutron operators via the Edge-to-TFLite mapping.

    Attributes:
        tflite_nodes (list[Node]): TFLite node information extracted from the converter log.
        neutron_subgraphs (list[SubgraphInfo]): Neutron subgraph information extracted from the converter log.
        neutron_graphs (list[int]): Indices of final Neutron graphs derived from neutron_subgraphs.
        edge_to_tflite_map (dict[int, tuple[int, ...]]): Mapping from Edge operators to TFLite operators.
        edge_to_neutron_map (dict[int, tuple[int, ...]]): Mapping from Edge operators to Neutron operators.
        tflite_to_neutron_map (dict[int, tuple[int, ...]]): Mapping from TFLite operators to Neutron operators.
        neutron_kernels_num (int): Sum of the kernel counts assigned to the final Neutron graphs.

    Example:
        >>> map = NeutronMap(log_output, edge_to_tflite_map)
        >>> neutron_to_edge_map = map.get_neutron_to_edge_map()
    """

    tflite_nodes: list[Node]
    neutron_subgraphs: list[SubgraphInfo]
    neutron_graphs: list[int]
    edge_to_tflite_map: dict[int, tuple[int, ...]]
    edge_to_neutron_map: dict[int, tuple[int, ...]]
    tflite_to_neutron_map: dict[int, tuple[int, ...]]
    neutron_kernels_num: int

    def __init__(
        self, neutron_converter_log: str, edge_to_tflite_map: dict[int, tuple[int, ...]]
    ) -> None:
        """Initialize neutron map from neutron converter log.

        :param neutron_converter_log: neutron converter log obtained during model conversion. It should contain
            original tflite graph and neutron graph dump. To add these dumps to converter log the dumpAfterImport and
            dumpAfterGenerate flags have to be set to "console".
        :param edge_to_tflite_map: mapping from Edge debug handles to TFLite operator indices.
        """
        super().__init__()
        self.tflite_nodes = []
        self.neutron_subgraphs = []
        self.neutron_graphs = []
        self.edge_to_tflite_map = edge_to_tflite_map
        self.tflite_to_neutron_map = {}
        self.edge_to_neutron_map = {}
        self.neutron_kernels_num = 0
        self._split_profiling_log(neutron_converter_log)

    def _split_profiling_log(self, log: str) -> None:
        """Process profiling log to split it into original TFLite and converted Neutron nodes.

        :param log: Neutron converter log obtained during model conversion, containing the original
            TFLite graph and Neutron graph dump.
        :return: None. Sets class attributes tflite_nodes and neutron_subgraphs with node information.
            Nothing is set when the log does not contain exactly EXPECTED_GRAPHS "Graphs:" sections.
        """
        graphs = log.split("Graphs:")
        # Check that there are exactly two graphs in the input dump.
        if len(graphs) != EXPECTED_GRAPHS + 1:
            return
        optimization_dump, neutron_graph_dump = graphs[1:]

        # Get tflite model dump.
        tflite_graph_dump = optimization_dump.partition("= Optimize Graph =")[0]

        # Get verbose Neutron graphs located in the Extract Graphs section.
        extracted_graph_dump = optimization_dump.partition("= Extract Graphs =")[
            2
        ].partition("Generate code for NeutronGraph")[0]

        # Get list of original operators from first dumped graph.
        self.tflite_nodes = _parse_nodes(tflite_graph_dump)
        # Get list of neutron subgraphs.
        self.neutron_subgraphs = self._get_neutron_subgraphs(neutron_graph_dump)
        if self.neutron_subgraphs:
            self._update_neutron_subgraphs_info(extracted_graph_dump)

    def _get_neutron_subgraphs(self, graph_dump: str) -> list[SubgraphInfo]:
        """Parse Neutron graph dump and extract subgraph information.

        Also stores the numbers of the final Neutron graphs in ``self.neutron_graphs``.

        :param graph_dump: String containing the Neutron graph dump from the converter log.
        :return: List of SubgraphInfo objects containing subgraph metadata and operator nodes.
        """
        subgraphs = graph_dump.split(r"Name: subgraph_")
        if len(subgraphs) < 3:
            return []

        # Get numbers of final neutron graphs in converted model.
        self.neutron_graphs = [
            int(matched_graphs.group("num"))
            for matched_graphs in re.finditer(PATTERN_GRAPH, subgraphs[-1])
        ]
        if not self.neutron_graphs:
            return []

        # Get subgraphs; chunks that do not look like a subgraph header are skipped.
        neutron_subgraphs: list[SubgraphInfo] = []
        for subgraph in subgraphs[1:]:
            subgraph_match = re.search(PATTERN_SUBGRAPH, subgraph)
            if not subgraph_match:
                continue
            neutron_subgraphs.append(
                SubgraphInfo(
                    int(subgraph_match.group("num")),
                    -1,  # Location is resolved later from the verbose dump.
                    get_tensors_name(subgraph_match.group("inputs")),
                    get_tensors_name(subgraph_match.group("outputs")),
                    0,  # Kernel count is resolved later from the verbose dump.
                    _parse_nodes(subgraph),
                )
            )
        return neutron_subgraphs

    def _update_neutron_subgraphs_info(self, extracted_graph: str) -> None:
        """Update Neutron subgraphs with verbose info.

        - Set numbers of Neutron kernels in each Neutron subgraph. 99% of subgraphs contain only one Neutron kernel,
          but there are some exceptions and some subgraphs can have more kernels. This number can be taken from
          final Neutron graph info.
        - Set Neutron subgraphs location in the final Neutron Graph. The function updates the location parameter
          for each Neutron subgraph according to its position in the final Neutron graph. Location is calculated
          continuously across all Neutron graphs in the model. Non-Neutron operators are skipped.

        :param extracted_graph: verbose Neutron graph dump.
        """
        location_shift = 0
        for neutron_graph in extracted_graph.split("NeutronGraph"):
            subgraph_nodes = {
                int(matched_subgraph.group("subgraph")): {
                    "location": i + location_shift,
                    "kernels": [
                        kernel.replace(" ", "")
                        for kernel in matched_subgraph.group("kernels").split("\n")
                        if kernel.strip()
                    ],
                }
                for i, matched_subgraph in enumerate(
                    re.finditer(PATTERN_VERBOSE_KERNELS, neutron_graph)
                )
            }
            if not subgraph_nodes:
                continue
            # Shift the location offset by the number of subgraphs found in this graph.
            location_shift += len(subgraph_nodes)

            # Number of the Neutron graph this chunk describes (first "subgraph_N" in it).
            graph_num = -1
            matched_graph = re.search(r"subgraph_(?P<subgraph>\d+)", neutron_graph)
            if matched_graph:
                graph_num = int(matched_graph.group("subgraph"))

            # Update number of kernels for all subgraphs.
            for subgraph in self.neutron_subgraphs:
                if subgraph.num in subgraph_nodes:
                    subgraph.kernels = len(subgraph_nodes[subgraph.num]["kernels"])
                    subgraph.location = subgraph_nodes[subgraph.num]["location"]
                elif subgraph.num == graph_num:
                    # The graph itself owns the kernels of all of its subgraphs.
                    subgraph.kernels = sum(
                        len(s["kernels"]) for s in subgraph_nodes.values()
                    )
                    self.neutron_kernels_num += subgraph.kernels

    def _nodes_match_by_io(self, tf_node: Node, neutron_node: Node) -> bool:
        """
        Determine whether a TFLite node can be mapped to a Neutron node
        based on their input and output compatibility.

        :param tf_node: Source TFLite node.
        :param neutron_node: Target Neutron node.
        :return: True if the nodes can be considered mapped, False otherwise.
        """

        def get_name_matches(tf_names: list[str], neutron_names: list[str]) -> int:
            # Count how many names from tf_names have a corresponding match in
            # neutron_names. A match is defined as:
            #   - exact equality, or
            #   - one name being a hierarchical variant of the other
            #     (i.e., sharing a common prefix separated by "/").
            result = 0
            for tf_name in tf_names:
                # Determine if the tensor name corresponds to a special operation input.
                # Matches names like "perm0", "perm1", etc. used by Transpose ops,
                # and names like "padding0", "padding1", etc. used by Pad ops.
                if re.fullmatch(r"perm(\d+)?", tf_name):
                    special_op = "permutation"
                elif re.fullmatch(r"padding(s)?(\d+)?", tf_name):
                    special_op = "padding"
                else:
                    special_op = None
                for neutron_name in neutron_names:
                    if (
                        neutron_name == tf_name
                        or neutron_name + "/" in tf_name
                        or tf_name + "/" in neutron_name
                    ):
                        result += 1
                        break

                    # Check if the neutron input is also the special op (Pad or Transpose)
                    if special_op and special_op in neutron_name:
                        result += 1
                        break
            return result

        name_matches = get_name_matches(tf_node.inputs, neutron_node.inputs)
        # Map the node if all TFLite inputs match Neutron inputs.
        # Note: the Neutron node may still have additional extra inputs.
        if name_matches == len(tf_node.inputs):
            return True
        if name_matches == len(tf_node.inputs) - 1:
            # If there is only one unmatched input, check matching of outputs and
            # map the node if all TFLite outputs match Neutron outputs.
            output_matches = get_name_matches(tf_node.outputs, neutron_node.outputs)
            return output_matches == len(tf_node.outputs)
        return False

    def get_tflite_to_neutron_map(self) -> dict[int, tuple[int, ...]]:
        """Map TFLite nodes from the original model to Neutron nodes in the converted model.

        The mapping is built based on input and output tensor names. Neutron tensors may have
        exactly the same names or use the format "tflite_input/additional_name".

        :return: Dictionary mapping TFLite node indices to tuple of Neutron subgraph indices.
        """
        tflite_to_neutron_dict = {}
        for tf_idx, tf_node in enumerate(self.tflite_nodes):
            subgraph_idxs = []
            for subgraph in self.neutron_subgraphs:
                # Skip the final Neutron graphs themselves and locations already recorded.
                if (
                    subgraph.num in self.neutron_graphs
                    or subgraph.location in subgraph_idxs
                ):
                    continue
                if any(
                    self._nodes_match_by_io(tf_node, neutron_node)
                    for neutron_node in subgraph.nodes
                ):
                    subgraph_idxs.append(subgraph.location)
            # Filter subgraph_idxs to avoid mapping multiple parallel single-input nodes that consume the
            # same input tensor into the same TFLite node.
            subgraph_idxs = self._filter_single_input_nodes(tf_node.name, subgraph_idxs)
            if subgraph_idxs:
                tflite_to_neutron_dict[tf_idx] = tuple(subgraph_idxs)

        self.tflite_to_neutron_map = tflite_to_neutron_dict
        return self.tflite_to_neutron_map

    def _filter_single_input_nodes(
        self, node_name: str, subgraph_loc: list[int]
    ) -> list[int]:
        """
        Filter the Neutron-to-TFLite mapping to avoid mapping multiple parallel single-input nodes
        that consume the same input tensor to a single TFLite node.

        The function checks whether the current TFLite node is a supported single-input node
        (as defined in SINGLE_INPUT_NODES) and whether it is mapped to multiple Neutron nodes.
        In such cases, it is possible that parallel single-input Neutron nodes were incorrectly
        mapped to the same TFLite node.

        If single-input Neutron nodes are among the candidates, only one is kept in the mapping:
        the Neutron node whose operation name matches the operation name of the current TFLite node.
        If none of them matches, an empty list is returned.

        :param node_name: Operation name of the current TFLite node.
        :param subgraph_loc: List of Neutron subgraph indices whose inputs correspond to the
            input of the current TFLite node.
        :return: Filtered list of Neutron subgraph indices to be mapped to the current TFLite node.
        """
        # Nothing to disambiguate unless a single-input op maps to several subgraphs.
        if node_name not in SINGLE_INPUT_NODES or len(subgraph_loc) <= 1:
            return subgraph_loc

        # Find all single-input nodes in the candidate subgraphs.
        single_in_nodes = [
            (subgraph.location, neutron_node.name)
            for subgraph in self.neutron_subgraphs
            if subgraph.location in subgraph_loc
            for neutron_node in subgraph.nodes
            if neutron_node.name in SINGLE_INPUT_NODES
        ]
        if not single_in_nodes:
            return subgraph_loc

        # Keep only the node with the matching name when single-input nodes are present.
        for subgraph_id, single_in_node_name in single_in_nodes:
            if single_in_node_name == node_name:
                return [subgraph_id]
        return []

    def get_edge_to_neutron_map(self) -> dict[int, tuple[int, ...]]:
        """Map Edge nodes to Neutron nodes.

        The Neutron indices of each entry are sorted so the result is deterministic.

        :return: Dictionary mapping Edge node handles to tuple of Neutron subgraph indices.
        """
        self.get_tflite_to_neutron_map()
        edge_to_neutron_dict = {}

        for edge_handle, tflite_indices in self.edge_to_tflite_map.items():
            neutron_nodes = set()
            for tf_node in tflite_indices:
                neutron_nodes.update(self.tflite_to_neutron_map.get(tf_node, ()))
            if neutron_nodes:
                edge_to_neutron_dict[edge_handle] = tuple(sorted(neutron_nodes))

        self.edge_to_neutron_map = edge_to_neutron_dict
        return self.edge_to_neutron_map

    def get_neutron_to_edge_map(self) -> dict[int, tuple[int, ...]]:
        """
        Transform edge-to-neutron map to neutron-to-edge map.

        :return: Dictionary mapping neutron_index to tuple of edge_handles. Indices
            0..neutron_kernels_num are all present (unmapped ones hold an empty tuple);
            an empty dictionary is returned when nothing could be mapped.
        """
        if not self.edge_to_neutron_map:
            self.get_edge_to_neutron_map()

        neutron_to_edge = {}
        for edge_handle, neutron_indices in self.edge_to_neutron_map.items():
            for neutron_idx in neutron_indices:
                neutron_to_edge.setdefault(neutron_idx, []).append(edge_handle)

        if not neutron_to_edge:
            return {}

        # Fill gaps with empty tuples and convert lists to tuples.
        # Add one more non-mapped event at the end of list for the Neutron Dump event.
        result = {
            i: tuple(neutron_to_edge.get(i, ()))
            for i in range(self.neutron_kernels_num + 1)
        }
        logging.info("Neutron to Edge map was created: %s", result)
        return result
@contextmanager
def capture_fd_output():
    """Redirect file descriptors 1 and 2 into a temporary file for the ``with`` body.

    The redirection happens at the OS level (``os.dup2``), so output written
    directly to the descriptors is captured as well as Python-level writes.
    Python's own stdout/stderr buffers are flushed before redirecting and again
    before restoring, so buffered text ends up on the correct side of the
    capture.

    Yields the temporary file object. It is intentionally left open when the
    context exits so the caller can still ``seek(0)`` and ``read()`` it
    afterwards; the caller (or garbage collection) is responsible for closing it.
    """
    import sys

    def _flush_std_streams():
        # Best effort: a stream may be missing or already closed.
        for stream in (sys.stdout, sys.stderr):
            if stream is None:
                continue
            try:
                stream.flush()
            except (OSError, ValueError):
                pass

    tmp = tempfile.TemporaryFile()

    original_stdout_fd = None
    original_stderr_fd = None
    try:
        # Save original stdout / stderr
        original_stdout_fd = os.dup(1)
        original_stderr_fd = os.dup(2)

        # Push out anything buffered before the redirect so it is not captured.
        _flush_std_streams()

        # Redirect fd=1 and fd=2 to temp file
        os.dup2(tmp.fileno(), 1)
        os.dup2(tmp.fileno(), 2)

        yield tmp  # give access to the temp file

    finally:
        # Text buffered inside the block belongs to the capture file.
        _flush_std_streams()

        # Restore original fds; only those that were actually saved.
        if original_stdout_fd is not None:
            os.dup2(original_stdout_fd, 1)
            os.close(original_stdout_fd)
        if original_stderr_fd is not None:
            os.dup2(original_stderr_fd, 2)
            os.close(original_stderr_fd)
spec.value.decode() @@ -200,6 +239,8 @@ def preprocess( # noqa C901 fetch_constants_to_sram = spec.value.decode() == "True" if spec.key == "dump_kernel_selection_code": dump_kernel_selection_code = spec.value.decode() == "True" + if spec.key == "use_profiling": + use_profiling = spec.value.decode() == "True" # Check that the output format is set in the compile spec if not output_format: @@ -229,19 +270,32 @@ def preprocess( # noqa C901 if use_neutron_for_format_conversion is not None else {} ) - tflite_model, io_formats = EdgeProgramToIRConverter().convert_program( + ( + tflite_model, + io_formats, + edge_to_tflite_map, + ) = EdgeProgramToIRConverter().convert_program( edge_program, neutron_target_spec=NeutronTargetSpec(target), conversion_config=conversion_config, custom_delegation_options=CustomDelegationOptions(), ) - neutron_model = NeutronConverterManager(dump_kernel_selection_code).convert( - tflite_model, - target, - delegation_tag, - fetch_constants_to_sram, - ) + with capture_fd_output() as tmp: + neutron_model = NeutronConverterManager( + dump_kernel_selection_code + ).convert( + tflite_model, + target, + delegation_tag, + fetch_constants_to_sram, + use_profiling, + ) + tmp.seek(0) + log_output = tmp.read().decode() + # Get mapping from tflite to neutron + map = NeutronMap(log_output, edge_to_tflite_map) + neutron_to_edge_map = map.get_neutron_to_edge_map() # Dump the tflite file if intermediates_dir is set if intermediates_dir != "None": @@ -265,7 +319,9 @@ def preprocess( # noqa C901 else: raise RuntimeError(f"Unknown format {output_format}") - return PreprocessResult(processed_bytes=binary) + return PreprocessResult( + processed_bytes=binary, debug_handle_map=neutron_to_edge_map + ) class PayloadComposer: diff --git a/backends/nxp/runtime/NeutronBackend.cpp b/backends/nxp/runtime/NeutronBackend.cpp index 3ea973b7c5b..6fe0482ed89 100644 --- a/backends/nxp/runtime/NeutronBackend.cpp +++ b/backends/nxp/runtime/NeutronBackend.cpp @@ -10,6 +10,7 @@ #include 
#include #include +#include #include #include "NeutronDriver.h" @@ -25,6 +26,8 @@ namespace neutron { #define ALIGN_SIZE(size) \ ((size + BUFFER_ALIGNMENT - 1) & (~(BUFFER_ALIGNMENT - 1))) +#define KOPC_CALLARGS 6 // The operation for TileIR + // clang-format off /* Header schema: +----------------------------+-----------------------------+------------------------+ @@ -84,6 +87,19 @@ typedef struct { const uint8_t* outputMap; } NeutronExecutorchConfig; +typedef struct { + uint8_t eventCode; + uint8_t opCode; + uint8_t functionCode; + uint8_t timestampCode; + uint32_t time; +} NeutronSingleProfilingEvent; + +typedef struct { + NeutronSingleProfilingEvent startEvent; + NeutronSingleProfilingEvent stopEvent; +} NeutronFullProfilingEvent; + #ifdef EXTERNAL_MEM // Neutron compute has no access to FLASH. // Prefetch weights from FLASH to SRAM using memcpy. @@ -508,12 +524,11 @@ class NeutronBackend final : public PyTorchBackendInterface { } } -#ifdef NEUTRON_PROFILE - // TODO: Use trace from BackendExecutionContext. - NeutronTraceConfig trace_config{.traceConfig = 0}; - neutronSetTrace(cfg->nmh, &trace_config); +#ifdef ET_EVENT_TRACER_ENABLED + // Save ticks before neutron compute to measure how much time profiling dump + // takes + et_timestamp_t start_ticks = ::executorch::runtime::pal_current_ticks(); #endif - // Run neutron compute. NeutronError neutronRC = neutronRunBlocking(cfg->nmh, &cfg->dcfg); if (neutronRC != ENONE) { @@ -523,6 +538,11 @@ class NeutronBackend final : public PyTorchBackendInterface { neutronRC); return Error::InvalidProgram; } +#ifdef ET_EVENT_TRACER_ENABLED + // Save ticks after neutron compute to measure how much time profiling dump + // takes + et_timestamp_t stop_ticks = ::executorch::runtime::pal_current_ticks(); +#endif // Transpose outputs. 
for (int i = 0; i < cfg->numOutputs; i++) { @@ -558,6 +578,53 @@ class NeutronBackend final : public PyTorchBackendInterface { } } } +#ifdef ET_EVENT_TRACER_ENABLED + // Add traced events only if model has profiling info. + auto profile_size = cfg->profileSize; + if (profile_size > 0) { + int events_num = static_cast(profile_size / 16); + auto profiling_index = cfg->numOutputs + 1; + char* profile_info = + static_cast(cfg->dcfg.outputs[profiling_index]); + NeutronFullProfilingEvent* neutron_events = + (NeutronFullProfilingEvent*)profile_info; + executorch::runtime::EventTracer* tracer = context.event_tracer(); + uint32_t start_time = 0; + int index = 0; + // Post log neutron events from profiling output. + for (int i = 0; i < events_num; i++) { + if (start_time == 0) { + start_time = neutron_events[i].startEvent.time; + } + if (neutron_events[i].stopEvent.opCode != KOPC_CALLARGS) { + // Only KOPC_CALLARGS events can be mapped to original .pte model. + continue; + } else { + event_tracer_log_profiling_delegate( + tracer, + nullptr, + index, + start_time, + neutron_events[i].stopEvent.time, + static_cast( + &neutron_events[i].startEvent.functionCode), + sizeof(uint8_t)); + start_time = 0; + index++; + } + } + event_tracer_log_profiling_delegate( + tracer, + nullptr, + index, + neutron_events[events_num - 1].startEvent.time, + neutron_events[events_num - 1].stopEvent.time + stop_ticks - + start_ticks, + static_cast( + &neutron_events[events_num - 1].startEvent.functionCode), + sizeof(uint8_t)); + } +#endif return Error::Ok; } diff --git a/backends/nxp/tests/executorch_pipeline.py b/backends/nxp/tests/executorch_pipeline.py index 44a96010593..1309e019428 100644 --- a/backends/nxp/tests/executorch_pipeline.py +++ b/backends/nxp/tests/executorch_pipeline.py @@ -190,6 +190,7 @@ def to_quantized_edge_program( use_quant_state_dict: bool = True, fetch_constants_to_sram: bool = False, dump_kernel_selection_code: bool = False, + use_profiling: bool = False,
delegate_to_npu=True, ) -> EdgeProgramManager: _neutron_target_spec = NeutronTargetSpec(target) @@ -223,6 +224,7 @@ def to_quantized_edge_program( use_neutron_for_format_conversion=use_neutron_for_format_conversion, fetch_constants_to_sram=fetch_constants_to_sram, dump_kernel_selection_code=dump_kernel_selection_code, + use_profiling=use_profiling, ) post_quant_state_dict = ( exir_program_aten__module_quant.state_dict() if use_quant_state_dict else None @@ -244,6 +246,7 @@ def to_quantized_edge_program( export(exir_program_aten__module_quant, example_input, strict=True), transform_passes=NeutronEdgePassManager(), partitioner=partitioners, + generate_etrecord=use_profiling, compile_config=EdgeCompileConfig( _check_ir_validity=False, _core_aten_ops_exception_list=core_aten_ops_exception_list, @@ -274,6 +277,7 @@ def to_quantized_executorch_program( use_neutron_for_format_conversion: bool = True, dataset_dir: str | None = None, delegate_to_npu=True, + use_profiling: bool = False, operators_not_to_delegate: list[str] = None, remove_quant_io_ops: bool = False, ) -> ExecutorchProgramManager: @@ -295,6 +299,7 @@ def to_quantized_executorch_program( train_fn=train_fn, use_neutron_for_format_conversion=use_neutron_for_format_conversion, delegate_to_npu=delegate_to_npu, + use_profiling=use_profiling, operators_not_to_delegate=operators_not_to_delegate, remove_quant_io_ops=remove_quant_io_ops, **get_calibration_inputs_fn, diff --git a/backends/nxp/tests/executors.py b/backends/nxp/tests/executors.py index 319f372b5fa..94e91a31b95 100644 --- a/backends/nxp/tests/executors.py +++ b/backends/nxp/tests/executors.py @@ -325,7 +325,7 @@ def convert_run_compare( if tfl_model is None: NodeFormatInference(edge_program).identify_node_formats() - tfl_model, _ = EdgeProgramToIRConverter().convert_program( + tfl_model, *_ = EdgeProgramToIRConverter().convert_program( edge_program, conversion_config ) diff --git a/backends/nxp/tests/generic_tests/test_aot_example.py 
b/backends/nxp/tests/generic_tests/test_aot_example.py index 893041fe372..8a1e5e49555 100644 --- a/backends/nxp/tests/generic_tests/test_aot_example.py +++ b/backends/nxp/tests/generic_tests/test_aot_example.py @@ -2,11 +2,13 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. - +import os import subprocess import sys from pathlib import Path +from executorch.backends.nxp.tests.config_importer import test_config + # noinspection PyProtectedMember from executorch.exir._serialize import _deserialize_pte_binary from executorch.exir.schema import DelegateCall, KernelCall @@ -15,9 +17,8 @@ def test_aot_example__mobilenet_v2(): """Test that mobilenet can be lowered to Neutron backend via `aot_neutron_compile.py` and all ops are delegated.""" - # Find the executorch root directory (5 levels up from this test file) - executorch_root = Path(__file__).parent.parent.parent.parent.parent - assert executorch_root.exists(), f"Executorch root not found at {executorch_root}" + # Set the executorch root directory. + executorch_root = test_config.PROJECT_DIR # Run the compilation script as a module (like run_aot_example.sh does) cmd = [ @@ -34,14 +35,14 @@ def test_aot_example__mobilenet_v2(): ] # Output file will be created in executorch_root - pte_file = executorch_root / "mobilenetv2_nxp_delegate.pte" + pte_file = Path(os.path.join(executorch_root, "mobilenetv2_nxp_delegate.pte")) try: result = subprocess.run( cmd, capture_output=True, text=True, - timeout=300, # 5 minute timeout just in case. On my machine, the test usually runs ~1 minute. + timeout=300, # 5 minute timeout just in case. On 8-core x86 the test usually runs ~1 minute. 
cwd=str( executorch_root ), # Run from executorch root (like run_aot_example.sh) @@ -95,3 +96,77 @@ def test_aot_example__mobilenet_v2(): # Clean up the generated file if pte_file.exists(): pte_file.unlink() + + +def test_aot_example__mobilenet_v2__profiling(): + """Test that mobilenet_v2 can be lowered to Neutron backend via `aot_neutron_compile.py`, all ops are delegated, + the output model is profilable and ETRecord is generated properly.""" + + # Set the executorch root directory. + executorch_root = test_config.PROJECT_DIR + + # Run the compilation script as a module (like run_aot_example.sh does) + cmd = [ + sys.executable, + "-m", + "examples.nxp.aot_neutron_compile", + "--model_name", + "mobilenetv2", + "--delegate", + "--quantize", + "--target", + "imxrt700", + "--remove-quant-io-ops", + "--use_channels_last_dim_order", + "--use_profiling", # Generate profilable model and create ETRecord + "--use_random_dataset", # Avoid downloading the dataset. + ] + + # Output files will be created in executorch_root. + pte_file = Path( + os.path.join(executorch_root, "mobilenetv2_nxp_delegate_profile.pte") + ) + etrecord_file = Path( + os.path.join(executorch_root, "etrecord", "mobilenetv2_etrecord.bin") + ) + + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=300, # 5 minute timeout just in case. On 8-core x86 the test usually runs ~1 minute. + cwd=str( + executorch_root + ), # Run from executorch root (like run_aot_example.sh) + ) + + # Check script ran successfully. + assert result.returncode == 0, ( + f"Script failed with return code {result.returncode}\n" + f"STDOUT:\n{result.stdout}\n" + f"STDERR:\n{result.stderr}" + ) + + # Check if delegated model was created and saved. + assert pte_file.exists(), f"PTE file not created at {pte_file}" + + # Combine stdout and stderr to capture all subprocess output, including logs. + process_output = result.stdout + result.stderr + + # Check if nonempty Neutron to Edge map was created. 
+ assert "Neutron to Edge map was created:" in process_output + + # Check if ETRecord was created and saved. + assert "The ETRecord for the model was saved to" in process_output + assert etrecord_file.exists(), f"ETRecord file not created at {etrecord_file}" + + finally: + # Clean up the generated files. + if pte_file.exists(): + pte_file.unlink() + if etrecord_file.exists(): + etrecord_file.unlink() + parent = etrecord_file.parent + if not any(parent.iterdir()): + parent.rmdir() diff --git a/backends/nxp/tests/generic_tests/test_move_activation_before_concatenation.py b/backends/nxp/tests/generic_tests/test_move_activation_before_concatenation.py index 27bd675a487..6aa07dbba8d 100644 --- a/backends/nxp/tests/generic_tests/test_move_activation_before_concatenation.py +++ b/backends/nxp/tests/generic_tests/test_move_activation_before_concatenation.py @@ -629,7 +629,7 @@ def test_move_activation_before_concat_quantization__conv( "lowered_module" in node.name for node in edge_program.graph.nodes ) - tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value + tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value exported_program: ExportedProgram = converter_spy.calls[-1].args[0] input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype( np.int8 @@ -668,7 +668,7 @@ def test_move_activation_before_concat_quantization__linear( "lowered_module" in node.name for node in edge_program.graph.nodes ) - tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value + tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value exported_program: ExportedProgram = converter_spy.calls[-1].args[0] input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype( np.int8 @@ -706,7 +706,7 @@ def test_move_activation_before_concat_quantization__addmm( "lowered_module" in node.name for node in edge_program.graph.nodes ) - tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value + 
tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value exported_program: ExportedProgram = converter_spy.calls[-1].args[0] input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype( np.int8 @@ -744,7 +744,7 @@ def test_move_activation_before_concat_quantization__mm( "lowered_module" in node.name for node in edge_program.graph.nodes ) - tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value + tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value exported_program: ExportedProgram = converter_spy.calls[-1].args[0] input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype( np.int8 @@ -788,9 +788,7 @@ def test_concat_cluster_quantization__conv( "lowered_module" in node.name for node in edge_program.graph.nodes ) - tflite_flatbuffers_model, io_formats = converter_spy.calls[ - -1 - ].return_value + tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value exported_program: ExportedProgram = converter_spy.calls[-1].args[0] exir_program_aten_quant: GraphModule = quantizer_spy.calls[ -1 @@ -861,9 +859,7 @@ def test_concat_cluster_quantization__linear( "lowered_module" in node.name for node in edge_program.graph.nodes ) - tflite_flatbuffers_model, io_formats = converter_spy.calls[ - -1 - ].return_value + tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value exported_program: ExportedProgram = converter_spy.calls[-1].args[0] exir_program_aten_quant: GraphModule = quantizer_spy.calls[ -1 diff --git a/backends/nxp/tests/generic_tests/test_neutron_backend_executor.py b/backends/nxp/tests/generic_tests/test_neutron_backend_executor.py index 8cf7dfe3dc2..52654a482b9 100644 --- a/backends/nxp/tests/generic_tests/test_neutron_backend_executor.py +++ b/backends/nxp/tests/generic_tests/test_neutron_backend_executor.py @@ -37,7 +37,7 @@ def test_lowered_program_and_tflite_output_match__conv2d__no_bias(mocker): ) # Capture generated model - tflite_flatbuffers_model, io_formats = 
converter_spy.spy_return + tflite_flatbuffers_model, *_ = converter_spy.spy_return tflite_model = Model.GetRootAs(tflite_flatbuffers_model) sub_graph = tflite_model.Subgraphs(0) @@ -84,7 +84,7 @@ def test_conv_fc__lowered_program_and_tflite_output_match(mocker): exported_program: ExportedProgram = converter_spy.call_args.args[1] # Capture generated model - tflite_flatbuffers_model, _ = converter_spy.spy_return + tflite_flatbuffers_model, *_ = converter_spy.spy_return # No Transpose ops in produced TFLite model tflite_subgraph = Model.GetRootAs(tflite_flatbuffers_model).Subgraphs(0) @@ -148,7 +148,7 @@ def test_delegating_format_related_transpose_operators__supported_case(mocker): ) # Capture the converted IR model. - tflite_flatbuffers_model, _ = converter_spy.spy_return + tflite_flatbuffers_model, *_ = converter_spy.spy_return # Make sure the `Transpose` ops ARE in the IR model. tflite_subgraph = Model.GetRootAs(tflite_flatbuffers_model).Subgraphs(0) diff --git a/backends/nxp/tests/generic_tests/test_neutron_converter_manager.py b/backends/nxp/tests/generic_tests/test_neutron_converter_manager.py index 0705203db06..359dfdb67e9 100644 --- a/backends/nxp/tests/generic_tests/test_neutron_converter_manager.py +++ b/backends/nxp/tests/generic_tests/test_neutron_converter_manager.py @@ -28,7 +28,7 @@ def test_conv2d_neutron_conversion(): NodeFormatInference(edge_program_manager.exported_program()).identify_node_formats() edge_program_converter = EdgeProgramToIRConverter() - tflite_model, _ = edge_program_converter.convert_program( + tflite_model, *_ = edge_program_converter.convert_program( edge_program_manager.exported_program() ) diff --git a/backends/nxp/tests/generic_tests/test_per_channel_conversion.py b/backends/nxp/tests/generic_tests/test_per_channel_conversion.py index 706d8ed3e14..af9ef08057b 100644 --- a/backends/nxp/tests/generic_tests/test_per_channel_conversion.py +++ b/backends/nxp/tests/generic_tests/test_per_channel_conversion.py @@ -153,7 +153,7 @@ def 
test_per_channel_convolution(self, _, use_qat: bool): use_neutron_for_format_conversion=False, ) - tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value + tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value exported_program: ExportedProgram = converter_spy.calls[-1].args[0] input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype( diff --git a/backends/nxp/tests/generic_tests/test_profiling.py b/backends/nxp/tests/generic_tests/test_profiling.py new file mode 100644 index 00000000000..c922eb070c3 --- /dev/null +++ b/backends/nxp/tests/generic_tests/test_profiling.py @@ -0,0 +1,158 @@ +# Copyright 2026 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +import ast +import logging +import re + +import numpy as np +import pytest +import torch +from executorch.backends.nxp.tests.graph_verifier import BaseGraphVerifier +from executorch.backends.nxp.tests.model_output_comparator import ( + NumericalStatsOutputComparator, +) + +from executorch.backends.nxp.tests.models import AvgPool2dModule, SoftmaxModule +from executorch.backends.nxp.tests.nsys_testing import lower_run_compare + +from executorch.examples.nxp.experimental.cifar_net.cifar_net import CifarNetModel + + +@pytest.fixture(autouse=True) +def reseed_model_per_test_run(): + torch.manual_seed(23) + np.random.seed(23) + + +PATTERN_NEUTRON_MAP = r"Neutron to Edge map was created: (\{.*\})" + + +def extract_map_from_logs(caplog): + for record in caplog.records: + msg = record.getMessage() + neutron_map_match = re.search(PATTERN_NEUTRON_MAP, msg) + if neutron_map_match: + dict_str = neutron_map_match.group(1) + return ast.literal_eval(dict_str) + return None + + +class ParallelPoolModel(torch.nn.Module): + def __init__(self, channels: int): + super().__init__() + self.conv_in = torch.nn.Conv2d(channels, channels, kernel_size=3, padding=1) + self.max_pool2d = 
torch.nn.MaxPool2d(kernel_size=2, stride=2) + self.avg_pool2d = torch.nn.AvgPool2d(kernel_size=2, stride=2) + self.conv_out = torch.nn.Conv2d(2 * channels, channels, kernel_size=1) + + def forward(self, x): + x = self.conv_in(x) + x = torch.cat((self.max_pool2d(x), self.avg_pool2d(x)), dim=1) + x = self.conv_out(x) + return x + + +class TestProfiling: + @pytest.mark.xfail(reason="SoftMax support PR is not merged so far.", strict=True) + def test__softmax(self, caplog, request): + caplog.set_level(logging.INFO) + model = SoftmaxModule(-1) + lower_run_compare( + model, + (10,), + dlg_model_verifier=BaseGraphVerifier(1, []), + request=request, + use_profiling=True, + output_comparator=NumericalStatsOutputComparator(), + ) + + # Neutron map for 1D Softmax with input size 10 should contain 4 nodes: + # 3 Neutron kernels (pad, softmax, and slice) and 1 unmapped node used for profiling dump + neutron_map = extract_map_from_logs(caplog) + assert neutron_map == { + 0: (2,), # Pad + 1: (2,), # Softmax + 2: (2,), # Slice + 3: (), # Neutron Dump + } + + def test__parallel_pool(self, caplog, request): + caplog.set_level(logging.INFO) + input_shape = (1, 3, 32, 32) + model = ParallelPoolModel(input_shape[1]) + lower_run_compare( + model, + input_shape, + dlg_model_verifier=BaseGraphVerifier(1, []), + request=request, + output_comparator=NumericalStatsOutputComparator(), + use_neutron_for_format_conversion=False, + use_profiling=True, + ) + neutron_map = extract_map_from_logs(caplog) + assert neutron_map == { + 0: (6,), # Conv2DStandardV2 + 1: (), # Conv2DDepthwiseV2 (AvgPool) + 2: (7,), # MaxPool + 3: (), # TransposeCHW + 4: (), # TransposeCHW + 5: (), # TransposeCHW + 6: (), # Slice + 7: (), # Pad + 8: (), # Conv2DPointwise + 9: (), # Slice + 10: (), # Neutron Dump + } + + @pytest.mark.xfail(reason="SoftMax support PR is not merged so far.", strict=True) + def test__cifar(self, caplog, request): + caplog.set_level(logging.INFO) + input_shape = (1, 3, 32, 32) + model =
CifarNetModel() + lower_run_compare( + model, + input_shape, + dlg_model_verifier=BaseGraphVerifier(1, []), + request=request, + output_comparator=NumericalStatsOutputComparator(), + use_neutron_for_format_conversion=False, + use_profiling=True, + ) + neutron_map = extract_map_from_logs(caplog) + assert neutron_map == { + 0: (10,), # Pad + 1: (10, 11), # Conv2DStandardV1 (Pad + Conv2d) + 2: (12,), # MaxPool + 3: (13, 14), # Conv2DStandardV1 (Pad + Conv2d) + 4: (15,), # MaxPool + 5: (16, 17), # Conv2DStandardV1 (Pad + Conv2d) + 6: (18,), # MaxPool + 7: (20,), # FullyConnected + 8: (21,), # Pad + 9: (21,), # Softmax + 10: (21,), # Slice + 11: (), # Neutron Dump + } + + def test__avg_pool(self, caplog, request): + caplog.set_level(logging.INFO) + input_shape = (2, 9, 6, 15) + model = AvgPool2dModule(False, 0) + lower_run_compare( + model, + input_shape, + dlg_model_verifier=BaseGraphVerifier(1, []), + request=request, + output_comparator=NumericalStatsOutputComparator(), + use_neutron_for_format_conversion=False, + use_profiling=True, + ) + neutron_map = extract_map_from_logs(caplog) + assert neutron_map == { + 0: (2,), # Pad + 1: (2,), # Conv2DDepthwiseDense + 2: (2,), # Slice + 3: (), # Neutron Dump + } diff --git a/backends/nxp/tests/generic_tests/test_quantizer.py b/backends/nxp/tests/generic_tests/test_quantizer.py index 3c23241e01e..6180d2fd9ae 100644 --- a/backends/nxp/tests/generic_tests/test_quantizer.py +++ b/backends/nxp/tests/generic_tests/test_quantizer.py @@ -432,7 +432,7 @@ def test_quantizer__linear_w_activation(mocker, activation, inplace, use_qat): ) assert any("lowered_module" in node.name for node in edge_program.graph.nodes) - tflite_flatbuffers_model, io_formats = converter_spy.spy_return + tflite_flatbuffers_model, *_ = converter_spy.spy_return exported_program: ExportedProgram = converter_spy.call_args.args[1] exir_program_aten_quant: GraphModule = quantizer_spy.spy_return @@ -477,7 +477,7 @@ def test_quantizer__addmm_w_activation(mocker, 
activation, inplace, use_qat): ) assert any("lowered_module" in node.name for node in edge_program.graph.nodes) - tflite_flatbuffers_model, io_formats = converter_spy.spy_return + tflite_flatbuffers_model, *_ = converter_spy.spy_return exported_program: ExportedProgram = converter_spy.call_args.args[1] exir_program_aten_quant: GraphModule = quantizer_spy.spy_return @@ -522,7 +522,7 @@ def test_quantizer__mm_w_activation(mocker, activation, inplace, use_qat): ) assert any("lowered_module" in node.name for node in edge_program.graph.nodes) - tflite_flatbuffers_model, io_formats = converter_spy.spy_return + tflite_flatbuffers_model, *_ = converter_spy.spy_return exported_program: ExportedProgram = converter_spy.call_args.args[1] exir_program_aten_quant: GraphModule = quantizer_spy.spy_return @@ -567,7 +567,7 @@ def test_quantizer__conv_w_activation(mocker, activation, inplace, use_qat): ) assert any("lowered_module" in node.name for node in edge_program.graph.nodes) - tflite_flatbuffers_model, io_formats = converter_spy.spy_return + tflite_flatbuffers_model, *_ = converter_spy.spy_return exported_program: ExportedProgram = converter_spy.call_args.args[1] exir_program_aten_quant: GraphModule = quantizer_spy.spy_return diff --git a/backends/nxp/tests/ir/converter/node_converter/test_addmm_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_addmm_converter.py index a8cdee41830..668deb28c96 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_addmm_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_addmm_converter.py @@ -51,7 +51,7 @@ def test_addmm_conversion(self, _, use_qat: bool): "lowered_module" in node.name for node in edge_program.graph.nodes ) - tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value + tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value exported_program: ExportedProgram = converter_spy.calls[-1].args[0] input_data = (np.random.random(input_shape).astype(np.float32) 
* 50).astype( np.int8 @@ -84,7 +84,7 @@ def test_linear_conversion__with_bias(self, _, use_qat: bool): "lowered_module" in node.name for node in edge_program.graph.nodes ) - tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value + tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value exported_program: ExportedProgram = converter_spy.calls[-1].args[0] input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype( np.int8 diff --git a/backends/nxp/tests/ir/converter/node_converter/test_bmm_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_bmm_converter.py index dc442a4931c..466f596bf91 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_bmm_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_bmm_converter.py @@ -59,7 +59,7 @@ def test_convert_bmm__supported(mocker, input_shape_x1, input_shape_x2, use_qat) # Verify correct behavior of the converted NeutronIR model. intermediate_ep = converter_spy.call_args.args[1] - neutron_ir_model, _ = converter_spy.spy_return + neutron_ir_model, *_ = converter_spy.spy_return input_data_1 = ( np.random.random(input_shape_x1).astype(np.float32) * 256.0 - 128.0 diff --git a/backends/nxp/tests/ir/converter/node_converter/test_clone_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_clone_converter.py index b4b828cd4e6..5ee3db6752f 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_clone_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_clone_converter.py @@ -182,7 +182,7 @@ def test_conv_dropout_quant( use_neutron_for_format_conversion=False, ).exported_program() - tflite_flatbuffers_model, _ = converter_spy.calls[-1].return_value + tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value exported_program: ExportedProgram = converter_spy.calls[-1].args[0] assert not graph_contains_any( @@ -241,7 +241,7 @@ def test_clone_pool_view_copy_quant( 
use_neutron_for_format_conversion=False, ).exported_program() - tflite_flatbuffers_model, _ = converter_spy.calls[-1].return_value + tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value exported_program: ExportedProgram = converter_spy.calls[-1].args[0] assert not graph_contains_any( @@ -311,7 +311,7 @@ def test_clone__to_contiguous_format(self): ).identify_node_formats() # Convert to the IR. - converted_model, _ = EdgeProgramToIRConverter().convert_program( + converted_model, *_ = EdgeProgramToIRConverter().convert_program( edge_program_manager.exported_program() ) diff --git a/backends/nxp/tests/ir/converter/node_converter/test_conv_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_conv_converter.py index 828647d2113..7105514514a 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_conv_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_conv_converter.py @@ -177,7 +177,7 @@ def test_conv2d_quant_conversion(mocker, model: torch.nn.Module, input_shape, us ) # Capture generated model - tflite_flatbuffers_model, io_formats = converter_spy.spy_return + tflite_flatbuffers_model, *_ = converter_spy.spy_return # Capture converted program exported_program: ExportedProgram = converter_spy.call_args.args[1] @@ -367,7 +367,7 @@ def test_conv_transpose2d_conversion__quantized( assert any("lowered_module" in node.name for node in edge_program.graph.nodes) # Capture generated model - tflite_flatbuffers_model, io_formats = converter_spy.spy_return + tflite_flatbuffers_model, *_ = converter_spy.spy_return # Capture converted program exported_program: ExportedProgram = converter_spy.call_args.args[1] diff --git a/backends/nxp/tests/ir/converter/node_converter/test_mm_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_mm_converter.py index 60dbfd1b215..79fffff3b78 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_mm_converter.py +++ 
b/backends/nxp/tests/ir/converter/node_converter/test_mm_converter.py @@ -51,7 +51,7 @@ def test_mm_conversion(self, _, use_qat: bool): "lowered_module" in node.name for node in edge_program.graph.nodes ) - tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value + tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value exported_program: ExportedProgram = converter_spy.calls[-1].args[0] input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype( np.int8 @@ -85,7 +85,7 @@ def test_linear_conversion__without_bias(self, _, use_qat: bool): "lowered_module" in node.name for node in edge_program.graph.nodes ) - tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value + tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value exported_program: ExportedProgram = converter_spy.calls[-1].args[0] input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype( np.int8 diff --git a/backends/nxp/tests/ir/converter/node_converter/test_neg_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_neg_converter.py index e0fc0d85066..2e7f9035e8a 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_neg_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_neg_converter.py @@ -74,7 +74,7 @@ def test_convert_neg(mocker, input_shape): # Verify correct behavior of the converted NeutronIR model. intermediate_ep = converter_spy.call_args.args[1] - neutron_ir_model, _ = converter_spy.spy_return + neutron_ir_model, *_ = converter_spy.spy_return input_data = ( np.random.random(input_shape).astype(np.float32) * 256.0 - 128.0 @@ -105,7 +105,7 @@ def test_convert_neg__channels_last(mocker): # Verify correct behavior of the converted NeutronIR model. 
intermediate_ep = converter_spy.call_args.args[1] - neutron_ir_model, _ = converter_spy.spy_return + neutron_ir_model, *_ = converter_spy.spy_return input_data = ( np.random.random(input_shape).astype(np.float32) * 256.0 - 128.0 diff --git a/backends/nxp/tests/ir/converter/node_converter/test_prelu_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_prelu_converter.py index fb25f02785a..c5c7aa55b03 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_prelu_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_prelu_converter.py @@ -50,7 +50,7 @@ def test_prelu_with_linear_quant_conversion(mocker, input_shape): ).exported_program() # Capture generated entities - neutron_ir_model, _ = converter_spy.spy_return + neutron_ir_model, *_ = converter_spy.spy_return exported_program: ExportedProgram = converter_spy.call_args.args[1] # Check `prelu` was not decomposed into simpler edge operators diff --git a/backends/nxp/tests/ir/converter/node_converter/test_softmax_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_softmax_converter.py index 2621baf18ee..00c10bd257d 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_softmax_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_softmax_converter.py @@ -85,7 +85,7 @@ def test_softmax_delegation(input_shape, dim: int, mocker): # Verify correct behavior of the converted NeutronIR model. intermediate_ep = converter_spy.call_args.args[1] - neutron_ir_model, _ = converter_spy.spy_return + neutron_ir_model, *_ = converter_spy.spy_return input_data = random_input_data(input_shape) # Make sure the tested program contains the `softmax`, and its input has the expected rank. @@ -121,7 +121,7 @@ def test_softmax_delegation__channel_first(input_shape, dim: int, mocker): # Verify correct behavior of the converted NeutronIR model. 
intermediate_ep = converter_spy.call_args.args[1] - neutron_ir_model, _ = converter_spy.spy_return + neutron_ir_model, *_ = converter_spy.spy_return input_data = random_input_data(input_shape) # Make sure the tested program contains the `softmax`. diff --git a/backends/nxp/tests/ir/converter/node_converter/test_view_copy_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_view_copy_converter.py index cb5f398fa21..276b29da142 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_view_copy_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_view_copy_converter.py @@ -265,7 +265,7 @@ def test_view_copy_w_linear_quant_conversion(mocker, input_shape, new_shape, use ) # Capture generated model - tflite_flatbuffers_model, io_formats = converter_spy.spy_return + tflite_flatbuffers_model, *_ = converter_spy.spy_return # Capture converted program edge_program: ExportedProgram = converter_spy.call_args.args[1] @@ -299,7 +299,7 @@ def test_view_w_conv_linear_quant_conversion( ) # Capture generated model - tflite_flatbuffers_model, io_formats = converter_spy.spy_return + tflite_flatbuffers_model, *_ = converter_spy.spy_return # Capture converted program edge_program: ExportedProgram = converter_spy.call_args.args[1] diff --git a/backends/nxp/tests/ir/edge_passes/test_linear_bn_fusing.py b/backends/nxp/tests/ir/edge_passes/test_linear_bn_fusing.py index 88ea567381f..aadef8c7731 100644 --- a/backends/nxp/tests/ir/edge_passes/test_linear_bn_fusing.py +++ b/backends/nxp/tests/ir/edge_passes/test_linear_bn_fusing.py @@ -251,7 +251,7 @@ def test_linear_bn_full_qat_pipeline_conversion( assert any("lowered_module" in node.name for node in edge_program.graph.nodes) # Capture generated model - tflite_flatbuffers_model, _ = converter_spy.spy_return + tflite_flatbuffers_model, *_ = converter_spy.spy_return # Capture converted program exported_program: ExportedProgram = converter_spy.call_args.args[1] diff --git 
a/backends/nxp/tests/nsys_testing.py b/backends/nxp/tests/nsys_testing.py index d5ff3680f38..ef6fe9c864c 100644 --- a/backends/nxp/tests/nsys_testing.py +++ b/backends/nxp/tests/nsys_testing.py @@ -101,6 +101,8 @@ def _run_delegated_executorch_program( mocker, use_qat: bool = False, train_fn: Callable[[torch.fx.GraphModule], None] | None = None, + use_profiling: bool = False, + use_neutron_for_format_conversion=True, operators_not_to_delegate: list[str] = None, remove_quant_io_ops: bool = False, ) -> tuple[ExportedProgram, str]: @@ -129,6 +131,8 @@ def wrapper(*args, **kwargs): delegate_to_npu=True, use_qat=use_qat, train_fn=train_fn, + use_profiling=use_profiling, + use_neutron_for_format_conversion=use_neutron_for_format_conversion, operators_not_to_delegate=operators_not_to_delegate, remove_quant_io_ops=remove_quant_io_ops, ) @@ -405,6 +409,8 @@ def lower_run_compare( reference_model: ReferenceModel = ReferenceModel.QUANTIZED_EXECUTORCH_CPP, use_qat: bool = False, train_fn: Callable[[torch.fx.GraphModule], None] | None = None, + use_profiling: bool = False, + use_neutron_for_format_conversion=True, operators_not_to_delegate: list[str] = None, remove_quant_io_ops: bool = False, ): @@ -424,6 +430,10 @@ def lower_run_compare( :param reference_model: Version of the model which will be run to obtain reference output data. :param use_qat: If True, applies quantization-aware training before conversion (without the QAT training). :param train_fn: Train/finetune function for QAT training. Is used only when `use_qat=True`. + :param use_profiling: Enable profiling for neutron delegated model. + :param use_neutron_for_format_conversion: If True, the EdgeProgramToIRConverter will insert `Transpose` ops to + ensure that the IO matches the executorch partition, which will be + delegated to Neutron, :param operators_not_to_delegate: list of operators not to delegate. 
:param remove_quant_io_ops: If true, IO q-ops are removed and verification is done on quantized version of dataset (quantized INT8 input samples). @@ -468,6 +478,8 @@ def lower_run_compare( mocker, use_qat=use_qat, train_fn=train_fn, + use_profiling=use_profiling, + use_neutron_for_format_conversion=use_neutron_for_format_conversion, operators_not_to_delegate=operators_not_to_delegate, remove_quant_io_ops=remove_quant_io_ops, ) diff --git a/backends/qualcomm/aot/wrappers/TARGETS b/backends/qualcomm/aot/wrappers/TARGETS deleted file mode 100644 index 0a42614a385..00000000000 --- a/backends/qualcomm/aot/wrappers/TARGETS +++ /dev/null @@ -1,5 +0,0 @@ -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/backends/qualcomm/builders/TARGETS b/backends/qualcomm/builders/TARGETS deleted file mode 100644 index 0a42614a385..00000000000 --- a/backends/qualcomm/builders/TARGETS +++ /dev/null @@ -1,5 +0,0 @@ -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/codegen/test/TARGETS b/codegen/test/TARGETS deleted file mode 100644 index 1e8cc179228..00000000000 --- a/codegen/test/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain xplat-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/configurations/TARGETS b/configurations/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/configurations/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. 
- -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/docs/source/_static/img/nxp/nxp-mcuxpresso-etdump.png b/docs/source/_static/img/nxp/nxp-mcuxpresso-etdump.png new file mode 100644 index 00000000000..50ed49f57ec Binary files /dev/null and b/docs/source/_static/img/nxp/nxp-mcuxpresso-etdump.png differ diff --git a/docs/source/backends/nxp/nxp-overview.md b/docs/source/backends/nxp/nxp-overview.md index 22499aea7ad..b8739046351 100644 --- a/docs/source/backends/nxp/nxp-overview.md +++ b/docs/source/backends/nxp/nxp-overview.md @@ -64,6 +64,8 @@ here https://www.nxp.com/design/design-center/software/eiq-ai-development-enviro **→{doc}`nxp-kernel-selection` — Neutron Firmware Kernel Selection support.** +**→{doc}`nxp-profiling` — Neutron models profiling.** + ```{toctree} :maxdepth: 2 :hidden: @@ -74,4 +76,5 @@ nxp-quantization tutorials/nxp-tutorials nxp-dim-order nxp-kernel-selection +nxp-profiling ``` diff --git a/docs/source/backends/nxp/nxp-profiling.md b/docs/source/backends/nxp/nxp-profiling.md new file mode 100644 index 00000000000..17e352e479d --- /dev/null +++ b/docs/source/backends/nxp/nxp-profiling.md @@ -0,0 +1,205 @@ +# NXP eIQ Profiling Support + + +The eIQ Neutron Backend is integrated with the +[Developer Tools](https://docs.pytorch.org/executorch/stable/delegate-debugging.html) +to provide visibility into delegated operator execution time. + +There are three steps required to obtain profiling results for an NXP‑delegated model: + +* Convert the model with profiling support enabled. +* Generate the artifacts consumed by the Developer Tools (`ETRecord`, `ETDump`). +* Create and run the Inspector class to consume these artifacts and print the results. + +--- + +## Convert a model with the profiling support + +Profiling data is generated only for a **profilable** model. +To convert a model with profiling enabled, the `--use_profiling` flag must be set. 
+ +See the `aot_neutron_compile.py` example and its +[README](https://github.com/pytorch/executorch/blob/main/examples/nxp/README.md) +for additional details. + +The following command creates a profilable `cifar10_nxp_delegate.pte` model and the corresponding `ETRecord` for the +**i.MX RT700** board: + +```bash +python -m examples.nxp.aot_neutron_compile --quantize \ + --delegate -m cifar10 \ + --use_profiling +``` + +For installation details, see {doc}`nxp-overview`. + +--- + +## Generate ETRecord (Optional) + +`ETRecord` is an optional artifact that contains model graphs and metadata used to link runtime profiling results +back to the eager model. + +The recommended approach is to enable `ETRecord` generation by passing `generate_etrecord=True` to export API calls. +After export completes, retrieve the `ETRecord` using the `get_etrecord()` method, and save it using the `save()` method: + +### Example + +```python +from executorch.devtools.etrecord import generate_etrecord + +# 1. Open a model and export the model to ATEN +model = model.eval() +exported_program = torch.export.export(model, example_inputs, strict=True) +module = exported_program.module() + +# 2. Transform and lower +compile_spec = generate_neutron_compile_spec("imxrt700") +partitioners = ( + [ + NeutronPartitioner( + compile_spec, + NeutronTargetSpec(target="imxrt700"), + post_quantization_state_dict=module.state_dict(), + ) + ] +) +edge_program_manager = to_edge_transform_and_lower( + export(module, example_inputs, strict=True), + transform_passes=NeutronEdgePassManager(), + generate_etrecord=True, + partitioner=partitioners, + compile_config=EdgeCompileConfig( + _core_aten_ops_exception_list=core_aten_ops_exception_list, + ), +) + +# 3. 
Export to ExecuTorch program +exec_prog = edge_program_manager.to_executorch( + config=ExecutorchBackendConfig(extract_delegate_segments=False) +) +# Save ETRecord +exec_prog.get_etrecord().save("etrecord.bin") + +``` + +### Complete Example + +A full implementation is available +in [aot_neutron_compile.py](https://github.com/pytorch/executorch/blob/main/examples/nxp/aot_neutron_compile.py). + +The `--use_profiling` flag is used to create a **profilable** model and the corresponding `ETRecord` file +(see [Convert a model with profiling support](#convert-a-model-with-the-profiling-support) for the full command). + + +--- + +## Generate ETDump + + +The next step is to generate an `ETDump`. An `ETDump` contains runtime data collected during model inference execution. + +To generate an `ETDump`, ensure that the ExecuTorch runtime library is integrated with the Developer Tools and built +with the `ET_EVENT_TRACER_ENABLED` flag enabled. + +Only models converted with profiling support will produce an `ETDump` containing execution times for all Neutron +operators. Otherwise, the dump will include only the final delegate execution time. + +Neutron software provides a profiling mechanism that logs individual operator execution times to a dedicated runtime +output. This data is then used to generate post‑time events after the inference has completed. + + +### Example + +```c +#include <executorch/devtools/etdump/etdump_flatcc.h> +``` +```c +// 1. Create ETDumpGen BEFORE inference. +auto etdump_gen_ptr = std::make_unique<executorch::etdump::ETDumpGen>(); +executorch::etdump::ETDumpGen* etdump_gen = etdump_gen_ptr.get(); + +// 2. Load a method from the program by name with ETDump generator for profiling. +Result<Method> method = program->load_method(method_name, &memory_manager, etdump_gen); + +// 3. Input tensor setup. +Tensor::SizesType sizes[] = {1, 1, 32, 32}; +Tensor::DimOrderType dim_order[] = {0, 2, 3, 1}; +TensorImpl impl(ScalarType::Float, 4, sizes, image_data, dim_order); +Tensor tensor(&impl); +Error status = method->set_input(tensor, 0); + +// 4. 
Execute. +status = method->execute(); + +// Get ETDump. +if (etdump_gen != nullptr) { + executorch::etdump::ETDumpResult result = etdump_gen->get_etdump_data(); + if (result.buf != nullptr && result.size > 0) { + PRINTF("Add a breakpoint here and run this command in Debugger Console: " + "dump binary memory trace.etdump result.buf (result.buf + result.size)\r\n"); + } +} +``` + + +To save an `ETDump` file from the board to a PC, use the **Debug Console** in the MCUXpresso IDE: + +- Set a breakpoint at the `PRINTF(...)` line in the example above. +- Enter the following command in the Debug Console and press **Enter**: + + ``` + dump binary memory trace.etdump result.buf (result.buf + result.size) + ``` + + +
+ Save ETDump in MCUXPresso project +
+ Figure 1: Save ETDump in MCUXPresso Project. +
+
+ + +The resulting `ETDump` file is generated in the project folder within the MCUXpresso workspace. + +> **Note:** +> Profilable models print profiling data to the terminal. Generating this dump may take longer than executing the +> Neutron kernels themselves, but this overhead can be ignored as it affects only models with profiling support +> enabled. The dump generation time is included in the `ETDump` as the final kernel entry. + +--- + +## Creating an Inspector + +The [Inspector](https://docs.pytorch.org/executorch/1.0/model-inspector.html) APIs provide a way to analyze the +contents of `ETRecord` and `ETDump`, enabling developers to gain insights into model architecture +and performance statistics. + +`ETRecord` is an optional argument used to obtain a mapping between the original model and the converted Neutron model. + +An `ETDump` generated on the board contains metadata for each Neutron operator, including its unique identifier. +To visualize this metadata in the Inspector results table, set the `include_delegate_debug_data = True` argument. + +### Example + +```python +from executorch.devtools import Inspector + +inspector = Inspector(etdump_path="/path/to/etdump.etdp", etrecord="/path/to/etrecord.bin") +inspector.print_data_tabular(include_delegate_debug_data = True) +``` + +### Complete Example + +A full implementation is available +in [analyzing_with_inspector.py](https://github.com/pytorch/executorch/blob/main/examples/nxp/analyzing_with_inspector.py). @lint-ignore + +--- + +## Summary + +* Build the model with the `--use_profiling` flag enabled. +* Build the ExecuTorch runtime library with the `ET_EVENT_TRACER_ENABLED` flag and the ETDump Developer Tool. +* Use the Debug Console in MCUXpresso to save the `ETDump` file from the board to a PC. +* Visualize the profiling results using the Inspector. 
diff --git a/examples/devtools/example_runner/TARGETS b/examples/devtools/example_runner/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/examples/devtools/example_runner/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/examples/models/gemma4/BUCK b/examples/models/gemma4/BUCK index e587370ece0..19f0ff90c93 100644 --- a/examples/models/gemma4/BUCK +++ b/examples/models/gemma4/BUCK @@ -1,4 +1,5 @@ load("@fbcode_macros//build_defs:build_file_migration.bzl", "fbcode_target", "non_fbcode_target") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") load(":targets.bzl", "define_common_targets") oncall("executorch") @@ -6,3 +7,122 @@ oncall("executorch") non_fbcode_target(_kind = define_common_targets,) fbcode_target(_kind = define_common_targets,) + +# Text decoder module +fbcode_target(_kind = runtime.python_library, + name = "text_decoder", + srcs = [ + "text_decoder/__init__.py", + "text_decoder/convert_weights.py", + "text_decoder/gemma4_attention.py", + "text_decoder/gemma4_config.py", + "text_decoder/gemma4_cross_decoder.py", + "text_decoder/gemma4_decoder_layer.py", + "text_decoder/gemma4_model.py", + "text_decoder/gemma4_self_decoder.py", + "text_decoder/gemma4_transformer.py", + ], + _is_external_target = True, + base_module = "executorch.examples.models.gemma4", + resources = { + "config/e2b_config.json": "config/e2b_config.json", + "config/e4b_config.json": "config/e4b_config.json", + }, + deps = [ + "//caffe2:torch", + "fbsource//third-party/pypi/safetensors:safetensors", + "fbsource//third-party/pypi/transformers:transformers", + ], + visibility = ["PUBLIC"], +) + +# Speech transform module +fbcode_target(_kind = runtime.python_library, + name = "speech_transform", + srcs = [ + 
"speech_transform.py", + ], + _is_external_target = True, + base_module = "executorch.examples.models.gemma4", + deps = [ + "//caffe2:torch", + ], + visibility = ["PUBLIC"], +) + +# Export utilities (shared quantization code) +fbcode_target(_kind = runtime.python_library, + name = "quant_utils", + srcs = ["quant_utils.py"], + _is_external_target = True, + base_module = "executorch.examples.models.gemma4", + deps = [ + "//caffe2:torch", + "//executorch/examples/models/llama:source_transformation", + "//executorch/extension/llm/export:export_lib", + "//pytorch/ao:torchao", + ], + visibility = ["PUBLIC"], +) + +# Single PTE export +fbcode_target(_kind = runtime.python_binary, + name = "export_gemma4", + srcs = ["export_gemma4.py"], + main_function = "executorch.examples.models.gemma4.export_gemma4.main", + preload_deps = [ + "//pytorch/ao/torchao/csrc/cpu/shared_kernels/linear_8bit_act_xbit_weight:op_linear_8bit_act_xbit_weight_aten", + "//pytorch/ao/torchao/csrc/cpu/shared_kernels/embedding_xbit:op_embedding_xbit_aten", + "//executorch/extension/llm/custom_ops:custom_ops_aot_lib", + "//executorch/kernels/quantized:aot_lib", + ], + deps = [ + ":text_decoder", + ":speech_transform", + ":quant_utils", + "//caffe2:torch", + "//executorch/exir:lib", + "//executorch/backends/xnnpack/partition:xnnpack_partitioner", + "//executorch/backends/xnnpack/quantizer:xnnpack_quantizer", + "//executorch/extension/llm/export:export_lib", + "//executorch/extension/llm/custom_ops:custom_ops_aot_lib", + "//executorch/extension/llm/custom_ops:custom_ops_aot_py", + "//executorch/kernels/quantized:aot_lib", + "//pytorch/ao:torchao", + "fbsource//third-party/pypi/safetensors:safetensors", + "fbsource//third-party/pypi/transformers:transformers", + ], +) + +# Image preprocessing utilities +fbcode_target(_kind = runtime.python_library, + name = "image_utils", + srcs = ["image_utils.py"], + _is_external_target = True, + base_module = "executorch.examples.models.gemma4", + deps = [ + 
"//caffe2:torch", + "fbsource//third-party/pypi/pillow:pillow", + ], + visibility = ["PUBLIC"], +) + +# Python runner (single PTE, audio + vision + text-only) +fbcode_target(_kind = runtime.python_binary, + name = "run_gemma4", + srcs = ["run_gemma4.py"], + main_function = "executorch.examples.models.gemma4.run_gemma4.main", + preload_deps = [ + "//executorch/backends/xnnpack:xnnpack_backend", + "//executorch/extension/llm/custom_ops:custom_ops_aot_lib", + "//executorch/kernels/quantized:aot_lib", + "//pytorch/ao/torchao/csrc/cpu/shared_kernels/embedding_xbit:op_embedding_xbit_aten", + "//pytorch/ao/torchao/csrc/cpu/shared_kernels/linear_8bit_act_xbit_weight:op_linear_8bit_act_xbit_weight_aten", + ], + deps = [ + ":image_utils", + "//caffe2:torch", + "//executorch/runtime:runtime", + "fbsource//third-party/pypi/sentencepiece:sentencepiece", + ], +) diff --git a/examples/models/gemma4/text_decoder/gemma4_decoder_layer.py b/examples/models/gemma4/text_decoder/gemma4_decoder_layer.py index e10c1c7e415..fe3e3bb94cb 100644 --- a/examples/models/gemma4/text_decoder/gemma4_decoder_layer.py +++ b/examples/models/gemma4/text_decoder/gemma4_decoder_layer.py @@ -34,14 +34,25 @@ class Gemma4MLP(nn.Module): def __init__(self, hidden_size: int, intermediate_size: int): super().__init__() + self.intermediate_size = intermediate_size self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False) self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False) self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False) def forward(self, x: torch.Tensor) -> torch.Tensor: - return self.down_proj( - F.gelu(self.gate_proj(x), approximate="tanh") * self.up_proj(x) - ) + # If a loader fused gate_proj|up_proj into one gate_up_proj (single + # matmul; e.g. the GGUF loader's coalesced fusion), use it and split the + # [.., 2*intermediate_size] output back into gate/up. Otherwise fall back + # to the separate projections (unfused checkpoints / non-fusing loaders). 
+ gate_up = getattr(self, "gate_up_proj", None) + if gate_up is not None: + fused = gate_up(x) + gate = fused[..., : self.intermediate_size] + up = fused[..., self.intermediate_size :] + else: + gate = self.gate_proj(x) + up = self.up_proj(x) + return self.down_proj(F.gelu(gate, approximate="tanh") * up) class Gemma4DecoderLayer(nn.Module): diff --git a/examples/models/gemma4_31b/gguf_loader.py b/examples/models/gemma4_31b/gguf_loader.py index 90839ea6f6a..6a4a70ced18 100644 --- a/examples/models/gemma4_31b/gguf_loader.py +++ b/examples/models/gemma4_31b/gguf_loader.py @@ -104,6 +104,89 @@ def _convert_weight(model, model_key: str, gtensor, backend: str): return gtensor +# --------------------------------------------------------------------------- +# Single-point gate/up fusion (backend-agnostic, at the raw GGUF level) +# +# gate_proj and up_proj share the same input, so the MLP can issue ONE matmul +# over a [2*intermediate, hidden] weight instead of two. We fuse here -- before +# any backend conversion (_convert_weight) -- by concatenating the two raw GGUF +# block blobs along the output (row) dim. ExportableGGUFTensor.raw is +# (N, row_bytes) row-major with each output row self-contained, so the concat is +# an exact row-stack (no re-quant, no scale recompute). Both CUDA and MLX then +# pack the already-fused weight, so there is no per-backend-type concat. The +# model's Gemma4MLP.forward splits the [.., 2*intermediate] output back into +# gate/up only when a fused gate_up_proj is present (graceful for unfused loads). + + +def _gate_up_layer_kind(model_key: str): + """If ``model_key`` is an MLP gate/up proj weight, return ``(layer_idx, kind)`` + with ``kind`` in ``{"gate", "up"}``; otherwise ``None``.""" + prefix = "layers." 
+ for kind in ("gate", "up"): + suffix = f".mlp.{kind}_proj.weight" + if model_key.startswith(prefix) and model_key.endswith(suffix): + mid = model_key[len(prefix) : len(model_key) - len(suffix)] + if mid.isdigit(): + return int(mid), kind + return None + + +def _gate_up_fuseable(gate, up) -> bool: + """True iff gate/up are the same GGUF quant type and same packed row width + (hence same K + block layout), so a row-concat along output N is valid.""" + return ( + gate.ggml_type == up.ggml_type + and gate.raw.shape[1] == up.raw.shape[1] + and int(gate.shape[1]) == int(up.shape[1]) + ) + + +def _fuse_gate_up_raw(gate, up): + """Row-concat gate|up raw GGUF blocks (gate rows first) into one fused + ExportableGGUFTensor of shape (2*N, K).""" + from executorch.extension.llm.export.gguf import ExportableGGUFTensor + + fused_raw = torch.cat([gate.raw, up.raw], dim=0) + return ExportableGGUFTensor.from_raw(fused_raw, gate.ggml_type, gate.orig_dtype) + + +def _assign_gate_up_unfused(model, layer_idx, kind, gtensor, backend, packers): + """Assign a single gate/up GGUF tensor to its own projection (no fusion).""" + from executorch.examples.models.gemma4_31b.quant import pack_one + + key = f"layers.{layer_idx}.mlp.{kind}_proj.weight" + pack_one(model, key, _convert_weight(model, key, gtensor, backend), packers) + + +def _install_and_pack_fused_gate_up(model, layer_idx, gate, up, backend, packers): + """Fuse gate|up at the raw level, swap the layer's MLP to a single + ``gate_up_proj`` (dropping gate_proj/up_proj), then pack the fused weight.""" + import torch.nn as nn + + from executorch.examples.models.gemma4_31b.quant import pack_one + + fused = _fuse_gate_up_raw(gate, up) + inter, hidden = int(gate.shape[0]), int(gate.shape[1]) + + mlp = model.get_submodule(f"layers.{layer_idx}.mlp") + mlp.gate_up_proj = nn.Linear(hidden, 2 * inter, bias=False, device="meta") + del mlp.gate_proj + del mlp.up_proj + + key = f"layers.{layer_idx}.mlp.gate_up_proj.weight" + pack_one(model, key, 
_convert_weight(model, key, fused, backend), packers) + + +def _process_gate_up_pair(model, layer_idx, gate, up, backend, packers) -> bool: + """Fuse gate|up if compatible (returns True), else assign them unfused.""" + if _gate_up_fuseable(gate, up): + _install_and_pack_fused_gate_up(model, layer_idx, gate, up, backend, packers) + return True + _assign_gate_up_unfused(model, layer_idx, "gate", gate, backend, packers) + _assign_gate_up_unfused(model, layer_idx, "up", up, backend, packers) + return False + + def _resolve_tied_lm_head(model, lm_head_weight, packers): """Assign a tied lm_head (GGUF ties it to the token embedding).""" from executorch.examples.models.gemma4_31b.quant import pack_one @@ -217,11 +300,32 @@ def load_gguf_model( n_processed = 0 print(f"Streaming GGUF from {gguf_path}...") + pending_gate_up: dict = {} # layer_idx -> {"gate": raw, "up": raw} + n_fused = 0 + n_unfused = 0 for gguf_name, value in iter_gguf(gguf_path): model_key = gguf_to_model_key(gguf_name) if model_key is None: continue + # Buffer the RAW gate/up ExportableGGUFTensor (pre-conversion) and fuse + # once both arrive -- the single common point upstream of _convert_weight. + gu = _gate_up_layer_kind(model_key) + if gu is not None and isinstance(value, ExportableGGUFTensor): + layer_idx, kind = gu + slot = pending_gate_up.setdefault(layer_idx, {}) + slot[kind] = value + if "gate" in slot and "up" in slot: + if _process_gate_up_pair( + model, layer_idx, slot["gate"], slot["up"], backend, packers + ): + n_fused += 1 + else: + n_unfused += 1 + pending_gate_up.pop(layer_idx, None) + n_processed += 2 + continue + if isinstance(value, ExportableGGUFTensor): weight = _convert_weight(model, model_key, value, backend) if model_key == "embed_tokens.weight": @@ -238,6 +342,21 @@ def load_gguf_model( if n_processed % 100 == 0: print(f" Processed {n_processed} tensors...") + # Flush any unpaired gate/up (partial/malformed) as separate unfused + # projections so no weight is left on meta. 
+ for layer_idx, slot in pending_gate_up.items(): + for kind in ("gate", "up"): + if kind in slot: + _assign_gate_up_unfused( + model, layer_idx, kind, slot[kind], backend, packers + ) + n_unfused += 1 + + print( + f"[gemma4_31b gguf] Fused gate+up on {n_fused} MLP layers" + + (f" ({n_unfused} left unfused)" if n_unfused else "") + ) + _resolve_tied_lm_head(model, lm_head_weight, packers) # Fill RoPE tables / KV caches / scalar constants (left on meta by the diff --git a/examples/models/qwen3_5_moe/CMakeLists.txt b/examples/models/qwen3_5_moe/CMakeLists.txt index 726657a3779..aeb97f76ab7 100644 --- a/examples/models/qwen3_5_moe/CMakeLists.txt +++ b/examples/models/qwen3_5_moe/CMakeLists.txt @@ -89,6 +89,7 @@ endif() if(TARGET mlxdelegate) executorch_target_copy_mlx_metallib(qwen3_5_moe_runner) + executorch_target_copy_mlx_metallib(qwen3_5_moe_worker) endif() if(EXECUTORCH_BUILD_CUDA) diff --git a/examples/models/qwen3_5_moe/CMakePresets.json b/examples/models/qwen3_5_moe/CMakePresets.json index 276c2116148..6adcb8aa9cb 100644 --- a/examples/models/qwen3_5_moe/CMakePresets.json +++ b/examples/models/qwen3_5_moe/CMakePresets.json @@ -70,9 +70,9 @@ }, { "name": "qwen3-5-moe-mlx", - "displayName": "Build Qwen3.5 MoE runner (MLX)", + "displayName": "Build Qwen3.5 MoE runner and worker (MLX)", "configurePreset": "qwen3-5-moe-mlx", - "targets": ["qwen3_5_moe_runner"] + "targets": ["qwen3_5_moe_runner", "qwen3_5_moe_worker"] } ], "workflowPresets": [ diff --git a/examples/models/qwen3_5_moe/README.md b/examples/models/qwen3_5_moe/README.md index c275641bfd7..77f53aefcc6 100644 --- a/examples/models/qwen3_5_moe/README.md +++ b/examples/models/qwen3_5_moe/README.md @@ -302,6 +302,66 @@ python -m executorch.examples.models.qwen3_5_moe.run \ --max-new-tokens 50 ``` +### Serving (MLX, multi-session) + +The MLX worker hosts multiple isolated sessions on **one** weight load, so an +OpenAI-compatible server can serve concurrent conversations without duplicating +the ~weights. 
`make qwen3_5_moe-mlx` builds both `qwen3_5_moe_runner` and +`qwen3_5_moe_worker` (each with `mlx.metallib` copied alongside). + +Start the server (it auto-locates the worker binary): + +```bash +# tokenizer.json the C++ worker opens (resolve from the HF cache) +TOKENIZER_JSON=$(ls "${HF_HOME:-$HOME/.cache/huggingface}"/hub/models--Qwen--Qwen3.5-35B-A3B/snapshots/*/tokenizer.json | head -n1) + +python -m executorch.examples.models.qwen3_5_moe.serve \ + --model-path ./qwen35_moe_mlx/model.pte \ + --tokenizer-path "$TOKENIZER_JSON" \ + --hf-tokenizer Qwen/Qwen3.5-35B-A3B \ + --max-sessions 4 \ + --host 127.0.0.1 \ + --port 8000 +``` + +- `--tokenizer-path` is the raw `tokenizer.json` **file** the worker loads; + `--hf-tokenizer` (HF id or local dir) supplies the chat template on the Python + side. No `--data-path` (the MLX `.pte` is self-contained). +- `--max-sessions N` caps physical sessions on the single weight load. One slot + is reserved for anonymous requests (requests sent without a session id), so + `N` allows `N-1` concurrently named sessions. + +Query it (OpenAI-compatible) from another terminal. Route each conversation to a +session with the `session_id` header: + +```bash +curl http://127.0.0.1:8000/v1/chat/completions \ + -H "Content-Type: application/json" -H "session_id: alice" \ + -d '{"model":"qwen3.5-moe", + "messages":[{"role":"user","content":"What is the capital of France?"}], + "max_tokens":50,"chat_template_kwargs":{"enable_thinking":false}}' +``` + +Endpoints: `GET /health`, `GET /v1/models`, `POST /v1/chat/completions`, +`DELETE /v1/sessions/{id}` (free a session + its slot), `POST /v1/sessions/{id}/reset`. + +Session/memory semantics on MLX: +- This server uses the standard **stateless** OpenAI contract — send the full + `messages` history each request. `session_id` + warm-resume is a KV-cache reuse + optimization for the shared prefix, not server-side memory. 
+- Each session adds **one** set of mutable buffers (KV + recurrent/conv state) on + top of the shared weights; per-session cost scales with `max_seq_len`. Weights + are never duplicated. +- KV persists across requests for a live session and is **released on close** + (`DELETE`/reset). Named sessions are not auto-closed — close them to free slots. + MLX's Metal allocator pools freed buffers (so RSS may not shrink immediately), + but they are reused by later sessions, keeping memory bounded. +- Requests are processed **one at a time** (a single in-flight request per + worker). A request runs to completion and head-of-line-blocks every other + session until it finishes; there is no token-level interleaving or parallel + execution. This holds on both MLX and CUDA; multi-session provides memory + isolation and warm resume, not added throughput. + ### Tiny Model Test For CI or quick pipeline validation (no model download needed): diff --git a/examples/models/qwen3_5_moe/mlx_source_transformations.py b/examples/models/qwen3_5_moe/mlx_source_transformations.py index 9a49f8a84f6..3c460fc9c54 100644 --- a/examples/models/qwen3_5_moe/mlx_source_transformations.py +++ b/examples/models/qwen3_5_moe/mlx_source_transformations.py @@ -113,12 +113,14 @@ def _full_attention_forward(self, x, input_pos): k, v = self.kv_cache.update(input_pos, k, v) - if self.n_kv_groups > 1: - k = k.repeat_interleave(self.n_kv_groups, dim=1) - v = v.repeat_interleave(self.n_kv_groups, dim=1) - - attn_mask = self.mask[input_pos].unsqueeze(0).unsqueeze(0) - y = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask) + y = torch.ops.mlx.custom_sdpa( + q, + k, + v, + start_pos=pos, + dropout_p=0.0, + is_causal=True, + ) y = y.transpose(1, 2).contiguous().view(B, T, -1) @@ -184,10 +186,8 @@ def _exportable_gated_delta_net_forward(self, x, input_pos): k, (self.head_k_dim,), self._qk_rms_weight, eps=1e-6 ) - # head_repeat for k_heads != v_heads - if self.head_repeat > 1: - q = 
q.repeat_interleave(self.head_repeat, dim=2) - k = k.repeat_interleave(self.head_repeat, dim=2) + # GQA head expansion (k_heads != v_heads) is handled inside + # mlx::gated_delta_rule # Mamba-style gating beta = b.sigmoid() @@ -278,17 +278,13 @@ def _swap_gated_delta_net(model, model_dtype): def _swap_full_attention(model, config): - """FullAttention → mlx::rope custom op + causal mask.""" + """FullAttention → mlx::rope custom op""" rope_theta = config.rope_theta if config else 10000.0 - max_seq_len = config.max_seq_len if config else 4096 count = 0 for _name, module in model.named_modules(): if isinstance(module, FullAttention): module._rope_dims = module.rotary_emb.rotary_dim module._rope_base = rope_theta - mask = torch.full((max_seq_len, max_seq_len), float("-inf")) - mask = torch.triu(mask, diagonal=1) - module.register_buffer("mask", mask) module.forward = types.MethodType(_full_attention_forward, module) count += 1 return count diff --git a/examples/models/qwen3_5_moe/qwen35_moe_engine.cpp b/examples/models/qwen3_5_moe/qwen35_moe_engine.cpp index 713f6211330..fd81f60c23a 100644 --- a/examples/models/qwen3_5_moe/qwen35_moe_engine.cpp +++ b/examples/models/qwen3_5_moe/qwen35_moe_engine.cpp @@ -183,9 +183,9 @@ class Qwen35MoESession : public LLMSession { ::tokenizers::Tokenizer* tokenizer, std::unordered_map metadata, std::unordered_set eos_ids -#ifdef EXECUTORCH_BUILD_CUDA +#ifdef QWEN_HAS_MUTABLE_STATE , - ::executorch::backends::cuda::MutableStateContextOwner* mutable_state, + MutableStateContextOwner* mutable_state, int session_token #endif ) @@ -195,7 +195,7 @@ class Qwen35MoESession : public LLMSession { tokenizer_(tokenizer), metadata_(std::move(metadata)), eos_ids_(std::move(eos_ids)) -#ifdef EXECUTORCH_BUILD_CUDA +#ifdef QWEN_HAS_MUTABLE_STATE , mutable_state_(mutable_state), session_token_(session_token) @@ -212,9 +212,8 @@ class Qwen35MoESession : public LLMSession { } ~Qwen35MoESession() override { -#ifdef EXECUTORCH_BUILD_CUDA - if (mutable_state_ 
!= nullptr && - session_token_ != ::executorch::backends::cuda::kNoMutableSession) { +#ifdef QWEN_HAS_MUTABLE_STATE + if (mutable_state_ != nullptr && session_token_ != kNoMutableSession) { mutable_state_->destroy_session(session_token_); } #endif @@ -425,8 +424,8 @@ class Qwen35MoESession : public LLMSession { float temperature, bool sync_after) { std::lock_guard guard(*exec_mutex_); -#ifdef EXECUTORCH_BUILD_CUDA - Result> res = mutable_state_ != nullptr +#ifdef QWEN_HAS_MUTABLE_STATE + auto res = mutable_state_ != nullptr ? mutable_state_->with_active_session( session_token_, [&]() { return module_->execute(method, inputs); }) @@ -465,10 +464,11 @@ class Qwen35MoESession : public LLMSession { int64_t decode_pos_data_[1] = {0}; TensorPtr decode_tokens_; TensorPtr decode_pos_; +#ifdef QWEN_HAS_MUTABLE_STATE + MutableStateContextOwner* mutable_state_ = nullptr; + int session_token_ = kNoMutableSession; +#endif #ifdef EXECUTORCH_BUILD_CUDA - ::executorch::backends::cuda::MutableStateContextOwner* mutable_state_ = - nullptr; - int session_token_ = ::executorch::backends::cuda::kNoMutableSession; float temp_val_ = 1e-6f; TensorPtr temp_tensor_; #endif @@ -529,17 +529,17 @@ Result> Qwen35MoEEngine::create( "not stop at end of turn"); } +#ifdef QWEN_HAS_MUTABLE_STATE + std::unique_ptr mutable_state; +#endif #ifdef EXECUTORCH_BUILD_CUDA - std::unique_ptr<::executorch::backends::cuda::MutableStateContextOwner> - mutable_state; if (config.enable_cuda_graph) { ET_LOG( Info, "Qwen35MoEEngine: CUDA graph requested; per-session rebinding disabled " "and serving capacity clamped to 1 session."); } else { - auto candidate = std::make_unique< - ::executorch::backends::cuda::MutableStateContextOwner>(); + auto candidate = std::make_unique(); if (Error e = register_mutable_fqns(meta_module.get(), *candidate); e == Error::Ok) { mutable_state = std::move(candidate); @@ -550,9 +550,13 @@ Result> Qwen35MoEEngine::create( "serving capacity clamped to 1 session."); } } +#elif 
defined(EXECUTORCH_BUILD_MLX) + // MLX owns mutable buffers directly and selects per-session storage at + // execute time; no FQN registration or coverage check is required. + mutable_state = std::make_unique(); #endif -#ifdef EXECUTORCH_BUILD_CUDA +#ifdef QWEN_HAS_MUTABLE_STATE auto module_res = mutable_state != nullptr ? mutable_state->with_load_scope( [&]() { return build_qwen_module(config); }) @@ -566,16 +570,14 @@ Result> Qwen35MoEEngine::create( std::unique_ptr shared_module = std::move(module_res.get()); bool rebind_available = false; -#ifdef EXECUTORCH_BUILD_CUDA +#ifdef QWEN_HAS_MUTABLE_STATE rebind_available = mutable_state != nullptr && mutable_state->available(); - if (rebind_available) { - if (mutable_state->validate_coverage() != Error::Ok) { - ET_LOG( - Error, - "Qwen35MoEEngine: mutable-buffer coverage check failed; disabling " - "multi-session (capacity clamped to 1)."); - rebind_available = false; - } + if (rebind_available && mutable_state->validate_coverage() != Error::Ok) { + ET_LOG( + Error, + "Qwen35MoEEngine: mutable-buffer coverage check failed; disabling " + "multi-session (capacity clamped to 1)."); + rebind_available = false; } if (!rebind_available) { ET_LOG( @@ -592,7 +594,7 @@ Result> Qwen35MoEEngine::create( std::move(eos_ids), std::move(shared_module), rebind_available -#ifdef EXECUTORCH_BUILD_CUDA +#ifdef QWEN_HAS_MUTABLE_STATE , std::move(mutable_state) #endif @@ -621,7 +623,7 @@ Result> Qwen35MoEEngine::create_session() { } int token = -1; // kNoMutableSession: single-session / no rebind -#ifdef EXECUTORCH_BUILD_CUDA +#ifdef QWEN_HAS_MUTABLE_STATE if (rebind_available_) { auto t = mutable_state_->create_session(); if (t.error() != Error::Ok) { @@ -638,7 +640,7 @@ Result> Qwen35MoEEngine::create_session() { tokenizer_.get(), metadata_, eos_ids_ -#ifdef EXECUTORCH_BUILD_CUDA +#ifdef QWEN_HAS_MUTABLE_STATE , mutable_state_.get(), token @@ -648,7 +650,7 @@ Result> Qwen35MoEEngine::create_session() { LLMServingCapacity 
Qwen35MoEEngine::serving_capacity() const { LLMServingCapacity cap; // default: 1 session, 0 bytes (unknown) -#ifdef EXECUTORCH_BUILD_CUDA +#ifdef QWEN_HAS_MUTABLE_STATE if (rebind_available_) { cap.max_physical_sessions_without_weight_duplication = config_.max_sessions > 1 ? config_.max_sessions : 1; diff --git a/examples/models/qwen3_5_moe/qwen35_moe_engine.h b/examples/models/qwen3_5_moe/qwen35_moe_engine.h index c7ea53115b8..5a5e286c9c5 100644 --- a/examples/models/qwen3_5_moe/qwen35_moe_engine.h +++ b/examples/models/qwen3_5_moe/qwen35_moe_engine.h @@ -7,8 +7,8 @@ */ // Engine/Session adapter for the Qwen3.5 MoE exported prefill/decode methods. -// CUDA builds can host multiple sessions on one loaded model by rebinding the -// model's mutable buffers before each execute. +// CUDA and MLX builds can host multiple sessions on one loaded model by +// rebinding the model's mutable buffers before each execute. #pragma once @@ -28,10 +28,28 @@ #ifdef EXECUTORCH_BUILD_CUDA #include +#elif defined(EXECUTORCH_BUILD_MLX) +#include +#endif + +#if defined(EXECUTORCH_BUILD_CUDA) || defined(EXECUTORCH_BUILD_MLX) +#define QWEN_HAS_MUTABLE_STATE 1 #endif namespace executorch::extension::llm { +#if defined(EXECUTORCH_BUILD_CUDA) +using MutableStateContextOwner = + ::executorch::backends::cuda::MutableStateContextOwner; +constexpr int kNoMutableSession = + ::executorch::backends::cuda::kNoMutableSession; +#elif defined(EXECUTORCH_BUILD_MLX) +using MutableStateContextOwner = + ::executorch::backends::mlx::MutableStateContextOwner; +constexpr int kNoMutableSession = + ::executorch::backends::mlx::kNoMutableSession; +#endif + /// Immutable configuration for a Qwen3.5 MoE engine. 
struct Qwen35MoEConfig { std::string model_path; // .pte @@ -77,10 +95,9 @@ class ET_EXPERIMENTAL Qwen35MoEEngine : public LLMEngine { std::unordered_set eos_ids, std::unique_ptr shared_module, bool rebind_available -#ifdef EXECUTORCH_BUILD_CUDA +#ifdef QWEN_HAS_MUTABLE_STATE , - std::unique_ptr<::executorch::backends::cuda::MutableStateContextOwner> - mutable_state + std::unique_ptr mutable_state #endif ) : config_(std::move(config)), @@ -89,7 +106,7 @@ class ET_EXPERIMENTAL Qwen35MoEEngine : public LLMEngine { eos_ids_(std::move(eos_ids)), shared_module_(std::move(shared_module)), rebind_available_(rebind_available) -#ifdef EXECUTORCH_BUILD_CUDA +#ifdef QWEN_HAS_MUTABLE_STATE , mutable_state_(std::move(mutable_state)) #endif @@ -104,9 +121,8 @@ class ET_EXPERIMENTAL Qwen35MoEEngine : public LLMEngine { std::unique_ptr shared_module_; std::mutex exec_mutex_; bool rebind_available_ = false; -#ifdef EXECUTORCH_BUILD_CUDA - std::unique_ptr<::executorch::backends::cuda::MutableStateContextOwner> - mutable_state_; +#ifdef QWEN_HAS_MUTABLE_STATE + std::unique_ptr mutable_state_; #endif std::atomic live_sessions_{0}; }; diff --git a/examples/nxp/analyzing_with_inspector.py b/examples/nxp/analyzing_with_inspector.py new file mode 100644 index 00000000000..b339af79d6e --- /dev/null +++ b/examples/nxp/analyzing_with_inspector.py @@ -0,0 +1,58 @@ +# Copyright 2026 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Print profiling table for the NXP Neutron NPU model + +from typing import Any, Union + +from executorch.devtools import Inspector + + +def parse_delegate_metadata( + delegate_metadatas: list[bytes], +) -> Union[list[str], dict[str, Any]]: + """Metadata parser for Neutron Backend metadata. + + The parser is a callable that deserializes the data and returns neutron kernel number. 
+ The deserialized data is then added back to the corresponding event in the event block for user consumption. + """ + + metadata_list = [] + for metadata_bytes in delegate_metadatas: + if len(metadata_bytes) == 1: + function_code = metadata_bytes[0] + if function_code == 0: + metadata_list.append("Profiling dump") + else: + metadata_list.append("Neutron kernel " + str(function_code)) + else: + metadata_list.append("Invalid metadata size") + return metadata_list + + +if __name__ == "__main__": + + try: + etrecord_path = "etrecord/etrecord.bin" + etdump_path = "etdump/trace.etdump" + inspector = Inspector( + etdump_path=etdump_path, + etrecord=etrecord_path, + delegate_metadata_parser=parse_delegate_metadata, + ) + + # Access raw event data and filter quantized_decomposed nodes + for event_block in inspector.event_blocks: + for event in event_block.events: + if hasattr(event, "op_types") and isinstance(event.op_types, list): + # Filter out quantized_decomposed ops from the actual list + filtered = [ + op for op in event.op_types if "quantized_decomposed" not in op + ] + event.op_types = filtered if filtered else event.op_types + + inspector.print_data_tabular(include_delegate_debug_data=True) + except Exception as e: + print(f"Error during inspection: {type(e).__name__}: {e}") diff --git a/examples/nxp/aot_neutron_compile.py b/examples/nxp/aot_neutron_compile.py index f5f92d36541..258b4c87772 100644 --- a/examples/nxp/aot_neutron_compile.py +++ b/examples/nxp/aot_neutron_compile.py @@ -8,6 +8,7 @@ import argparse import io import logging +import os from collections import defaultdict import executorch.extension.pybindings.portable_lib @@ -167,6 +168,13 @@ def get_model_and_inputs_from_name(model_name: str, use_random_dataset: bool): default=False, help="Use QAT mode for quantization (performs two QAT training epochs)", ) + parser.add_argument( + "--use_profiling", + action="store_true", + required=False, + default=False, + help="Enable profiling for eIQ Neutron NPU 
delegated model", + ) parser.add_argument( "-s", "--so_library", @@ -322,6 +330,7 @@ def get_model_and_inputs_from_name(model_name: str, use_random_dataset: bool): operators_not_to_delegate=args.operators_not_to_delegate, fetch_constants_to_sram=args.fetch_constants_to_sram, dump_kernel_selection_code=args.dump_kernel_selection_code, + use_profiling=args.use_profiling, ) partitioners = ( [ @@ -338,6 +347,7 @@ def get_model_and_inputs_from_name(model_name: str, use_random_dataset: bool): edge_program_manager = to_edge_transform_and_lower( export(module, example_inputs, strict=True), transform_passes=NeutronEdgePassManager(), + generate_etrecord=args.use_profiling, partitioner=partitioners, compile_config=EdgeCompileConfig( _core_aten_ops_exception_list=core_aten_ops_exception_list, @@ -360,6 +370,21 @@ def get_model_and_inputs_from_name(model_name: str, use_random_dataset: bool): exec_prog = edge_program_manager.to_executorch( config=ExecutorchBackendConfig(extract_delegate_segments=False) ) + + # Generate ETRecord if profiling flag is set + if args.use_profiling: + etrecord_path = os.path.join("etrecord", f"{args.model_name}_etrecord.bin") + # Create directory if it doesn't exist + os.makedirs(os.path.dirname(etrecord_path), exist_ok=True) + # Save ETRecord + exec_prog.get_etrecord().save(etrecord_path) + # Notify the user about profiling enablement and ETRecord generation. + logging.info( + "The model was converted with profiling enabled. The time spent generating the profiling dump is traced as the " + "final delegate operation and can be ignored, as no dump is produced for non‑profilable models." + ) + logging.info(f"The ETRecord for the model was saved to {etrecord_path}.") + except RuntimeError as e: if "Missing out variants" in str(e.args[0]): raise RuntimeError( @@ -378,8 +403,10 @@ def executorch_program_to_str(ep, verbose=False): logging.debug(f"Executorch program:\n{executorch_program_to_str(exec_prog)}") # 6. 
Serialize to *.pte - model_name = f"{args.model_name}" + ( - "_nxp_delegate" if args.delegate is True else "" + model_name = ( + f"{args.model_name}" + + ("_nxp_delegate" if args.delegate is True else "") + + ("_profile" if args.use_profiling is True else "") ) save_pte_program(exec_prog, model_name) diff --git a/examples/qualcomm/executor_runner/TARGETS b/examples/qualcomm/executor_runner/TARGETS deleted file mode 100644 index 1e8cc179228..00000000000 --- a/examples/qualcomm/executor_runner/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain xplat-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/extension/aten_util/TARGETS b/extension/aten_util/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/extension/aten_util/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/extension/aten_util/test/TARGETS b/extension/aten_util/test/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/extension/aten_util/test/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/extension/cuda/TARGETS b/extension/cuda/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/extension/cuda/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. 
- -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/extension/image/TARGETS b/extension/image/TARGETS deleted file mode 100644 index 0a42614a385..00000000000 --- a/extension/image/TARGETS +++ /dev/null @@ -1,5 +0,0 @@ -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/extension/image/benchmark/TARGETS b/extension/image/benchmark/TARGETS deleted file mode 100644 index 0a42614a385..00000000000 --- a/extension/image/benchmark/TARGETS +++ /dev/null @@ -1,5 +0,0 @@ -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/extension/image/test/TARGETS b/extension/image/test/TARGETS deleted file mode 100644 index 0a42614a385..00000000000 --- a/extension/image/test/TARGETS +++ /dev/null @@ -1,5 +0,0 @@ -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/extension/pytree/aten_util/TARGETS b/extension/pytree/aten_util/TARGETS deleted file mode 100644 index 77b38349334..00000000000 --- a/extension/pytree/aten_util/TARGETS +++ /dev/null @@ -1,7 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/extension/pytree/aten_util/test/TARGETS b/extension/pytree/aten_util/test/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/extension/pytree/aten_util/test/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. 
- -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/extension/runner_util/TARGETS b/extension/runner_util/TARGETS deleted file mode 100644 index 1e8cc179228..00000000000 --- a/extension/runner_util/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain xplat-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/extension/tensor/TARGETS b/extension/tensor/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/extension/tensor/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/extension/tensor/test/TARGETS b/extension/tensor/test/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/extension/tensor/test/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/extension/testing_util/TARGETS b/extension/testing_util/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/extension/testing_util/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. 
- -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/extension/testing_util/test/TARGETS b/extension/testing_util/test/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/extension/testing_util/test/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/extension/threadpool/TARGETS b/extension/threadpool/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/extension/threadpool/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/extension/threadpool/test/TARGETS b/extension/threadpool/test/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/extension/threadpool/test/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/kernels/optimized/cpu/TARGETS b/kernels/optimized/cpu/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/kernels/optimized/cpu/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. 
- -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/kernels/optimized/test/TARGETS b/kernels/optimized/test/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/kernels/optimized/test/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/kernels/portable/cpu/util/TARGETS b/kernels/portable/cpu/util/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/kernels/portable/cpu/util/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/kernels/portable/cpu/util/test/TARGETS b/kernels/portable/cpu/util/test/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/kernels/portable/cpu/util/test/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/kernels/prim_ops/TARGETS b/kernels/prim_ops/TARGETS deleted file mode 100644 index 77b38349334..00000000000 --- a/kernels/prim_ops/TARGETS +++ /dev/null @@ -1,7 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. 
-load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/runtime/backend/TARGETS b/runtime/backend/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/runtime/backend/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/runtime/backend/test/TARGETS b/runtime/backend/test/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/runtime/backend/test/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/runtime/core/TARGETS b/runtime/core/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/runtime/core/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/runtime/core/exec_aten/TARGETS b/runtime/core/exec_aten/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/runtime/core/exec_aten/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. 
- -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/runtime/core/exec_aten/testing_util/TARGETS b/runtime/core/exec_aten/testing_util/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/runtime/core/exec_aten/testing_util/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/runtime/core/exec_aten/testing_util/test/TARGETS b/runtime/core/exec_aten/testing_util/test/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/runtime/core/exec_aten/testing_util/test/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/runtime/core/exec_aten/util/TARGETS b/runtime/core/exec_aten/util/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/runtime/core/exec_aten/util/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/runtime/platform/TARGETS b/runtime/platform/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/runtime/platform/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. 
- -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/runtime/platform/test/TARGETS b/runtime/platform/test/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/runtime/platform/test/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/schema/TARGETS b/schema/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/schema/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/schema/test/TARGETS b/schema/test/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/schema/test/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/test/TARGETS b/test/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/test/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets()