From 4433d1fc1bc1a5bd5c25275c44ca3564cd2b940f Mon Sep 17 00:00:00 2001 From: Devin Lai Date: Wed, 24 Jun 2026 02:39:17 +0800 Subject: [PATCH 1/7] [MLX] Add leaky_relu op handler (#20305) Summary: - Add MLX lowering for aten.leaky_relu.default using existing GreaterEqual, Multiply, and Where nodes. - Add focused MLX op tests for custom negative_slope values, including a slope above 1. Test Plan: - python -m py_compile backends/mlx/ops.py backends/mlx/test/test_ops.py - git diff --check HEAD^..HEAD - PATH="$PWD/.venv-mlx/bin:$PATH" .venv-mlx/bin/lintrunner backends/mlx/ops.py backends/mlx/test/test_ops.py - .venv-mlx/bin/python -m executorch.backends.mlx.test.run_all_tests leaky_relu --timeout 180 cc @metascroy --- backends/mlx/ops.py | 59 +++++++++++++++++++++++++++++++++++ backends/mlx/test/test_ops.py | 54 ++++++++++++++++++++++++++++++++ 2 files changed, 113 insertions(+) diff --git a/backends/mlx/ops.py b/backends/mlx/ops.py index 44536e675da..e3a636466c1 100644 --- a/backends/mlx/ops.py +++ b/backends/mlx/ops.py @@ -163,6 +163,8 @@ from executorch.exir.dialects._ops import ops as exir_ops from torch.fx.node import Node +_LEAKY_RELU_DEFAULT_NEGATIVE_SLOPE = 0.01 + def require_static_int(value: Any, param_name: str, op_name: str) -> None: """ @@ -2786,6 +2788,63 @@ def _relu_handler(P: MLXProgramBuilder, n: Node) -> Slot: return out +@REGISTRY.register(target=[torch.ops.aten.leaky_relu.default]) +def _leaky_relu_handler(P: MLXProgramBuilder, n: Node) -> Slot: + """Handle aten.leaky_relu.default - leaky rectified linear unit. + + leaky_relu(x) = x if x >= 0 + = slope * x otherwise + + Implemented as where(x >= 0, x, slope * x) so it stays correct for any + negative_slope (including values > 1), matching eager PyTorch. 
+ """ + args = P.args(n) + require_args(args, 1, 2, "aten.leaky_relu") + require_kwargs(P.kwargs(n), set(), "aten.leaky_relu") + + x = args[0] + negative_slope = _LEAKY_RELU_DEFAULT_NEGATIVE_SLOPE + if len(args) > 1 and args[1] is not None: + negative_slope = float(args[1]) + + x_meta = n.args[0].meta.get("val") + if x_meta is None: + raise ValueError("Input tensor metadata not found for leaky_relu") + dtype = x_meta.dtype + + zero_slot = emit_lifted_constant(P, 0.0, dtype) + slope_slot = emit_lifted_constant(P, negative_slope, dtype) + + _, cond_slot = P.make_tmp_slot() + P.emit( + GreaterEqualNode( + a=P.slot_to_tid(x), + b=P.slot_to_tid(zero_slot), + out=P.slot_to_tid(cond_slot), + ) + ) + + _, scaled_slot = P.make_tmp_slot() + P.emit( + MultiplyNode( + a=P.slot_to_tid(slope_slot), + b=P.slot_to_tid(x), + out=P.slot_to_tid(scaled_slot), + ) + ) + + out = P.make_or_get_slot(n) + P.emit( + WhereNode( + condition=P.slot_to_tid(cond_slot), + x=P.slot_to_tid(x), + y=P.slot_to_tid(scaled_slot), + out=P.slot_to_tid(out), + ) + ) + return out + + @REGISTRY.register(target=[torch.ops.aten._log_softmax.default]) def _log_softmax_handler(P: MLXProgramBuilder, n: Node) -> Slot: """Handle aten._log_softmax.default - log of softmax. 
diff --git a/backends/mlx/test/test_ops.py b/backends/mlx/test/test_ops.py index 8f52116f6b8..e96c8075903 100644 --- a/backends/mlx/test/test_ops.py +++ b/backends/mlx/test/test_ops.py @@ -405,6 +405,60 @@ def create_inputs(self) -> Tuple[torch.Tensor, ...]: return (x,) +class LeakyReLUModel(nn.Module): + """Model that applies leaky_relu with an optional negative slope.""" + + def __init__(self, negative_slope: Optional[float] = 0.01): + super().__init__() + self.negative_slope = negative_slope + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if self.negative_slope is None: + return torch.nn.functional.leaky_relu(x) + return torch.nn.functional.leaky_relu(x, negative_slope=self.negative_slope) + + +@register_test +class LeakyReLUTest(OpTestCase): + """Test case for leaky_relu activation with various negative slopes.""" + + name = "leaky_relu" + rtol = 1e-5 + atol = 1e-5 + + def __init__( + self, + shape: Tuple[int, ...] = (2, 3, 4), + negative_slope: Optional[float] = 0.01, + ): + self.shape = shape + self.negative_slope = negative_slope + shape_str = "x".join(str(s) for s in shape) + slope_str = "default" if negative_slope is None else f"slope{negative_slope}" + self.name = f"leaky_relu_{slope_str}_{shape_str}" + + @classmethod + def get_test_configs(cls) -> List["LeakyReLUTest"]: + return [ + cls(shape=(2, 3, 4), negative_slope=0.01), + cls(shape=(2, 3, 4), negative_slope=None), + cls(shape=(4, 8), negative_slope=0.1), + cls(shape=(10,), negative_slope=0.2), + cls(shape=(10,), negative_slope=1.5), + cls(shape=(2, 8, 16), negative_slope=0.01), + ] + + def create_model(self) -> nn.Module: + return LeakyReLUModel(self.negative_slope) + + def create_inputs(self) -> Tuple[torch.Tensor, ...]: + numel = 1 + for size in self.shape: + numel *= size + x = torch.linspace(-4.0, 4.0, steps=numel).reshape(self.shape) + return (x,) + + class GELUModel(nn.Module): """Simple model using GELU activation.""" From 58447b268273be4e4b333527656805308fa15e3e Mon Sep 17 00:00:00 
2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Tue, 23 Jun 2026 12:37:26 -0700 Subject: [PATCH 2/7] [MLX] Support multiple KV cache sessions, with shared constant data (#20408) MLX backend already has mutable state in a separate execution context from its constant data. This PR exposes a way to configure that for external callers, and uses this to support serve.py on MLX like CUDA backend. --- .github/workflows/mlx.yml | 6 +- backends/mlx/CMakeLists.txt | 6 +- .../mlx/custom_kernel_ops/gated_delta_rule.py | 37 ++ .../test/test_gated_delta_rule.py | 5 +- backends/mlx/runtime/MLXBackend.cpp | 16 + backends/mlx/runtime/mlx_mutable_state.cpp | 339 ++++++++++++++++++ backends/mlx/runtime/mlx_mutable_state.h | 204 +++++++++++ backends/mlx/test/CMakeLists.txt | 19 + backends/mlx/test/mlx_mutable_state_test.cpp | 132 +++++++ examples/models/qwen3_5_moe/CMakeLists.txt | 1 + examples/models/qwen3_5_moe/CMakePresets.json | 4 +- examples/models/qwen3_5_moe/README.md | 60 ++++ .../qwen3_5_moe/mlx_source_transformations.py | 26 +- .../models/qwen3_5_moe/qwen35_moe_engine.cpp | 60 ++-- .../models/qwen3_5_moe/qwen35_moe_engine.h | 34 +- 15 files changed, 888 insertions(+), 61 deletions(-) create mode 100644 backends/mlx/runtime/mlx_mutable_state.cpp create mode 100644 backends/mlx/runtime/mlx_mutable_state.h create mode 100644 backends/mlx/test/mlx_mutable_state_test.cpp diff --git a/.github/workflows/mlx.yml b/.github/workflows/mlx.yml index 5a4ccbb4952..167ceb7da83 100644 --- a/.github/workflows/mlx.yml +++ b/.github/workflows/mlx.yml @@ -66,7 +66,11 @@ jobs: echo "::endgroup::" echo "::group::Build test runners" - ${CONDA_RUN} cmake --build cmake-out --target op_test_runner multi_thread_test_runner -j$(( $(sysctl -n hw.ncpu) - 1 )) + ${CONDA_RUN} cmake --build cmake-out --target op_test_runner multi_thread_test_runner mlx_mutable_state_test -j$(( $(sysctl -n hw.ncpu) - 1 )) + echo "::endgroup::" + + echo "::group::Run mutable-state (multi-session) 
unit test" + ./cmake-out/backends/mlx/test/mlx_mutable_state_test echo "::endgroup::" echo "::group::Run op unit tests" diff --git a/backends/mlx/CMakeLists.txt b/backends/mlx/CMakeLists.txt index 43968d09b5d..acb96fb1ed9 100644 --- a/backends/mlx/CMakeLists.txt +++ b/backends/mlx/CMakeLists.txt @@ -255,8 +255,10 @@ option(ET_MLX_ALLOW_CUSTOM_KERNEL_EXECUTION ON ) -set(_mlx_backend__srcs ${CMAKE_CURRENT_SOURCE_DIR}/runtime/MLXLoader.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/runtime/MLXBackend.cpp +set(_mlx_backend__srcs + ${CMAKE_CURRENT_SOURCE_DIR}/runtime/MLXLoader.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/runtime/MLXBackend.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/runtime/mlx_mutable_state.cpp ) add_library(mlxdelegate ${_mlx_backend__srcs}) diff --git a/backends/mlx/custom_kernel_ops/gated_delta_rule.py b/backends/mlx/custom_kernel_ops/gated_delta_rule.py index 423ffd0b034..41eb8ce7b98 100644 --- a/backends/mlx/custom_kernel_ops/gated_delta_rule.py +++ b/backends/mlx/custom_kernel_ops/gated_delta_rule.py @@ -53,6 +53,15 @@ def gated_delta_rule( B, T_len, Hk, Dk = q.shape Hv, Dv = v.shape[-2:] + # The Metal kernel maps each v-head to its k-head group + # (hk_idx = hv_idx / (Hv / Hk)); mirror that here so the eager reference also + # supports Hk != Hv (GQA) instead of relying on broadcasting, which requires + # Hk == Hv. repeat_interleave on the head dim reproduces that index mapping. + if Hk != Hv: + q = q.repeat_interleave(Hv // Hk, dim=2) + k = k.repeat_interleave(Hv // Hk, dim=2) + Hk = Hv + s = state.clone() ys = [] @@ -101,6 +110,7 @@ def gated_delta_rule_fake( IntOrVid, MetalKernelNode, MultiplyNode, + RepeatNode, ScanNode, SubtractNode, SumNode, @@ -450,6 +460,33 @@ def _emit_scan(self, P: MLXProgramBuilder, n: Node) -> Slot: ] ) + # GQA: q/k carry Hk heads but the recurrence state/v have Hv heads. Expand + # q/k to Hv (repeat_interleave on the head axis) so the per-step broadcasts + # match, mirroring the Metal kernel's hk_idx = hv_idx / (Hv / Hk). 
+ Hk = int(self.q_node.meta["val"].shape[-2]) + Hv = int(self.v_node.meta["val"].shape[-2]) + if Hk != Hv: + rep = IntOrVid.from_literal(Hv // Hk) + _, q_exp = P.make_tmp_slot() + P.emit( + RepeatNode( + x=P.slot_to_tid(q_slot), + out=P.slot_to_tid(q_exp), + repeats=rep, + axis=2, + ) + ) + _, k_exp = P.make_tmp_slot() + P.emit( + RepeatNode( + x=P.slot_to_tid(k_slot), + out=P.slot_to_tid(k_exp), + repeats=rep, + axis=2, + ) + ) + q_slot, k_slot = q_exp, k_exp + # Carry needs a writable slot. This is node n's persistent output (the # mutated state), so it must be a node-owned slot — not a temp slot, whose # id is reclaimed on tmp_scope exit and would be read as dead by a later diff --git a/backends/mlx/custom_kernel_ops/test/test_gated_delta_rule.py b/backends/mlx/custom_kernel_ops/test/test_gated_delta_rule.py index 0a7e6a687f9..dfee111e74b 100644 --- a/backends/mlx/custom_kernel_ops/test/test_gated_delta_rule.py +++ b/backends/mlx/custom_kernel_ops/test/test_gated_delta_rule.py @@ -96,9 +96,8 @@ def forward( g: torch.Tensor, # [B, T, Hv] beta: torch.Tensor, # [B, T, Hv] ) -> torch.Tensor: - if self.head_repeat > 1: - q = q.repeat_interleave(self.head_repeat, dim=2) - k = k.repeat_interleave(self.head_repeat, dim=2) + # Pass native Hk (no repeat_interleave): the op itself must handle + # GQA head expansion (kernel via hk_idx mapping, scan/eager internally). 
return torch.ops.mlx.gated_delta_rule( q, k, v, g, beta, self.state, use_custom_kernel=self.use_custom_kernel ) diff --git a/backends/mlx/runtime/MLXBackend.cpp b/backends/mlx/runtime/MLXBackend.cpp index 5bd3bf263d1..0dbdec22436 100644 --- a/backends/mlx/runtime/MLXBackend.cpp +++ b/backends/mlx/runtime/MLXBackend.cpp @@ -9,6 +9,7 @@ #include "MLXExecutor.h" #include "MLXInterpreter.h" #include "MLXLoader.h" +#include "mlx_mutable_state.h" #include #include @@ -277,6 +278,12 @@ class MLXBackend final : public ::executorch::runtime::BackendInterface { eval(handle->constants.tensors); } + // Register the handle with the per-session mutable-state manager. This is + // a no-op unless a multi-session owner is active for this load (see + // mlx_mutable_state.h); single-session execution is unaffected. + mutable_state_note_handle( + handle, &handle->program, &handle->mutable_buffers); + } catch (const std::exception& e) { ET_LOG(Error, "Failed to load MLX program: %s", e.what()); handle->~MLXHandle(); @@ -366,6 +373,14 @@ class MLXBackend final : public ::executorch::runtime::BackendInterface { } } + // Select the active session's mutable buffers (KV cache, recurrent/conv + // state) before running. No-op for single-session handles; weights stay + // shared via ExecutionState::constants. 
+ if (Error rebind_err = mutable_state_rebind_for_execute(h, h->state); + rebind_err != Error::Ok) { + return rebind_err; + } + // Run the MLX program (builds lazy computation graph) h->interpreter.run(program, h->state, h->stream); @@ -431,6 +446,7 @@ class MLXBackend final : public ::executorch::runtime::BackendInterface { void destroy(DelegateHandle* handle) const override { std::lock_guard lock(mlx_global_mutex()); if (handle != nullptr) { + mutable_state_forget_handle(handle); auto* mlx_handle = static_cast(handle); mlx_handle->~MLXHandle(); } diff --git a/backends/mlx/runtime/mlx_mutable_state.cpp b/backends/mlx/runtime/mlx_mutable_state.cpp new file mode 100644 index 00000000000..2f00d917136 --- /dev/null +++ b/backends/mlx/runtime/mlx_mutable_state.cpp @@ -0,0 +1,339 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "mlx_mutable_state.h" + +#include "MLXExecutor.h" +#include "MLXLoader.h" + +#include + +#include +#include + +namespace executorch { +namespace backends { +namespace mlx { + +using ::executorch::runtime::Error; +using ::executorch::runtime::Result; + +namespace { + +struct HandleInfo { + const MLXProgram* program{nullptr}; + MutableBufferData* default_buffers{nullptr}; +}; + +struct Context { + // Delegate handles associated with this loaded program (one per loaded + // method). Keyed by opaque MLXHandle pointer. + std::unordered_map handles; + // Per-session mutable buffers: token -> (handle -> buffers). Allocated lazily + // on first execute for a given (session, handle). + std::unordered_map> + sessions; + int next_token{0}; + // Sticky setup failure. Once set (e.g. by nested load scopes), available(), + // validate_coverage(), create_session(), and rebind fail consistently. + Error build_error{Error::Ok}; +}; + +// Process-global registry. 
MLX serializes execution via its own global mutex +// and the engine serializes per session, but the registry itself is guarded +// here so context/session lifecycle calls from other threads are safe. +std::mutex& registry_mutex() { + static std::mutex m; + return m; +} + +std::unordered_map& contexts() { + static std::unordered_map c; + return c; +} + +std::unordered_map& handle_ctx() { + static std::unordered_map m; + return m; +} + +MutableStateContext g_next_ctx = 1; // 0 is reserved as invalid. + +// Thread-local load scope and active (ctx, session) selection. +thread_local MutableStateContext tl_loading_ctx = kInvalidMutableContext; +thread_local MutableStateContext tl_active_ctx = kInvalidMutableContext; +thread_local int tl_active_token = kNoMutableSession; + +} // namespace + +namespace detail { + +MutableStateContext mutable_state_create_context() { + std::lock_guard g(registry_mutex()); + MutableStateContext ctx = g_next_ctx++; + if (ctx == kInvalidMutableContext) { + ctx = g_next_ctx++; + } + contexts()[ctx]; + return ctx; +} + +void mutable_state_destroy_context(MutableStateContext ctx) { + std::lock_guard g(registry_mutex()); + auto it = contexts().find(ctx); + if (it == contexts().end()) { + return; + } + for (const auto& kv : it->second.handles) { + handle_ctx().erase(kv.first); + } + contexts().erase(it); +} + +void mutable_state_begin_load(MutableStateContext ctx) { + if (tl_loading_ctx != kInvalidMutableContext) { + // Nested load scopes would silently overwrite the thread-local association. + // Mark both the already-active and the new context invalid instead. 
+ std::lock_guard g(registry_mutex()); + auto active = contexts().find(tl_loading_ctx); + if (active != contexts().end()) { + active->second.build_error = Error::InvalidState; + } + auto nested = contexts().find(ctx); + if (nested != contexts().end()) { + nested->second.build_error = Error::InvalidState; + } + ET_LOG(Error, "mutable_state: nested load scopes are not supported"); + tl_loading_ctx = kInvalidMutableContext; + return; + } + tl_loading_ctx = ctx; +} + +void mutable_state_end_load() { + tl_loading_ctx = kInvalidMutableContext; +} + +bool mutable_state_available(MutableStateContext ctx) { + if (ctx == kInvalidMutableContext) { + return false; + } + std::lock_guard g(registry_mutex()); + auto it = contexts().find(ctx); + return it != contexts().end() && it->second.build_error == Error::Ok && + !it->second.handles.empty(); +} + +int64_t mutable_state_bytes_per_session(MutableStateContext ctx) { + std::lock_guard g(registry_mutex()); + auto it = contexts().find(ctx); + if (it == contexts().end()) { + return 0; + } + int64_t total = 0; + for (const auto& kv : it->second.handles) { + const MutableBufferData* bufs = kv.second.default_buffers; + if (bufs == nullptr) { + continue; + } + for (const auto& t : bufs->tensors) { + if (t.has_value()) { + total += static_cast(t->nbytes()); + } + } + } + return total; +} + +Error mutable_state_validate_coverage(MutableStateContext ctx) { + std::lock_guard g(registry_mutex()); + auto it = contexts().find(ctx); + if (it == contexts().end()) { + return Error::InvalidArgument; + } + if (it->second.build_error != Error::Ok) { + return it->second.build_error; + } + // MLX clones all mutable buffers by tid; there is no FQN coverage to verify. 
+ return Error::Ok; +} + +Result mutable_state_create_session(MutableStateContext ctx) { + std::lock_guard g(registry_mutex()); + auto it = contexts().find(ctx); + if (it == contexts().end()) { + ET_LOG(Error, "mutable_state_create_session: unknown context %d", ctx); + return Error::InvalidState; + } + Context& c = it->second; + if (c.build_error != Error::Ok) { + return c.build_error; + } + if (c.handles.empty()) { + ET_LOG( + Error, "mutable_state_create_session: no backend handles registered"); + return Error::NotSupported; + } + int token = c.next_token++; + // Per-handle buffers are allocated lazily on first execute. + c.sessions[token]; + return token; +} + +void mutable_state_destroy_session(MutableStateContext ctx, int token) { + std::lock_guard g(registry_mutex()); + auto it = contexts().find(ctx); + if (it == contexts().end()) { + return; + } + it->second.sessions.erase(token); +} + +void mutable_state_set_active(MutableStateContext ctx, int token) { + tl_active_ctx = ctx; + tl_active_token = token; +} + +} // namespace detail + +void mutable_state_note_handle( + const void* handle, + const MLXProgram* program, + MutableBufferData* default_buffers) { + if (tl_loading_ctx == kInvalidMutableContext) { + return; // No multi-session owner active during this load: single-session. 
+ } + std::lock_guard g(registry_mutex()); + auto it = contexts().find(tl_loading_ctx); + if (it == contexts().end()) { + return; + } + it->second.handles[handle] = HandleInfo{program, default_buffers}; + handle_ctx()[handle] = tl_loading_ctx; +} + +void mutable_state_forget_handle(const void* handle) { + std::lock_guard g(registry_mutex()); + auto hit = handle_ctx().find(handle); + if (hit == handle_ctx().end()) { + return; + } + auto cit = contexts().find(hit->second); + if (cit != contexts().end()) { + cit->second.handles.erase(handle); + for (auto& session : cit->second.sessions) { + session.second.erase(handle); + } + } + handle_ctx().erase(hit); +} + +Error mutable_state_rebind_for_execute( + const void* handle, + ExecutionState& state) { + std::lock_guard g(registry_mutex()); + auto hit = handle_ctx().find(handle); + if (hit == handle_ctx().end()) { + if (tl_active_token != kNoMutableSession) { + ET_LOG( + Error, + "mutable_state_rebind_for_execute: active session set but handle has " + "no mutable-state context"); + return Error::Internal; + } + // Handle was not loaded under a multi-session owner: keep default buffers. + return Error::Ok; + } + auto cit = contexts().find(hit->second); + if (cit == contexts().end()) { + return Error::Ok; + } + Context& ctx = cit->second; + if (ctx.build_error != Error::Ok) { + return ctx.build_error; + } + // Invariant: a handle present in handle_ctx() is present in ctx.handles. Look + // it up explicitly (not operator[]) so a broken invariant fails loudly + // instead of inserting a {nullptr, nullptr} entry that later null-derefs in + // load_mutable_buffers(*info.program, ...). 
+ auto info_it = ctx.handles.find(handle); + if (info_it == ctx.handles.end()) { + ET_LOG( + Error, + "mutable_state_rebind_for_execute: handle has a context but no " + "registered HandleInfo (invariant broken)"); + return Error::Internal; + } + HandleInfo& info = info_it->second; + + const bool has_active_session = tl_active_token != kNoMutableSession; + const bool active_for_this_ctx = + has_active_session && tl_active_ctx == hit->second; + + // A session is active, but for a different context than the one this handle + // belongs to. Falling back to default buffers would silently execute with the + // wrong model/session state, so refuse instead. + if (has_active_session && !active_for_this_ctx) { + ET_LOG( + Error, + "mutable_state_rebind_for_execute: active context mismatch (a session " + "is active for a different loaded program than the one executing)"); + return Error::Internal; + } + + if (!active_for_this_ctx) { + // No session selected. Refuse if sessions exist (running against the + // default buffers here would not isolate state from created sessions). + if (!ctx.sessions.empty()) { + ET_LOG( + Error, + "mutable_state_rebind_for_execute: no active session selected but " + "sessions exist for this program"); + return Error::InvalidState; + } + state.mutable_buffers = info.default_buffers; + return Error::Ok; + } + + auto sit = ctx.sessions.find(tl_active_token); + if (sit == ctx.sessions.end()) { + ET_LOG( + Error, + "mutable_state_rebind_for_execute: unknown session token %d", + tl_active_token); + return Error::InvalidState; + } + + auto& per_handle = sit->second; + auto bit = per_handle.find(handle); + if (bit == per_handle.end()) { + // First execute for this (session, handle): allocate fresh zeroed buffers. + // Constants/weights stay shared (ExecutionState::constants is untouched); + // only the mutable buffers are per-session. 
+ MutableBufferData buffers; + try { + load_mutable_buffers(*info.program, buffers); + } catch (const std::exception& e) { + ET_LOG( + Error, + "mutable_state_rebind_for_execute: failed to allocate session " + "buffers: %s", + e.what()); + return Error::MemoryAllocationFailed; + } + bit = per_handle.emplace(handle, std::move(buffers)).first; + } + // unordered_map keeps element pointers stable across rehash, so this remains + // valid for the duration of the execute. + state.mutable_buffers = &bit->second; + return Error::Ok; +} + +} // namespace mlx +} // namespace backends +} // namespace executorch diff --git a/backends/mlx/runtime/mlx_mutable_state.h b/backends/mlx/runtime/mlx_mutable_state.h new file mode 100644 index 00000000000..84420812360 --- /dev/null +++ b/backends/mlx/runtime/mlx_mutable_state.h @@ -0,0 +1,204 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +#include +#include +#include + +// MLX-private support for running one loaded MLX program with multiple isolated +// instances of its mutable buffers (KV cache, conv/recurrent state). Callers +// create sessions and execute with one active session selected. +// +// Unlike the CUDA backend, the MLX runtime owns mutable buffers directly in a +// swappable container (ExecutionState::mutable_buffers is a +// MutableBufferData*), so per-session isolation is a pointer swap to a fresh +// MutableBufferData — no FQN registration / constant-repoint hook is needed. + +namespace executorch { +namespace backends { +namespace mlx { + +// Forward declarations (defined in MLXLoader.h / MLXExecutor.h). +struct MLXProgram; +struct MutableBufferData; +struct ExecutionState; + +// Opaque per-loaded-program context id (0 = invalid). 
+using MutableStateContext = int; +constexpr MutableStateContext kInvalidMutableContext = 0; + +// Sentinel for execution without per-session rebinding. +constexpr int kNoMutableSession = -1; + +// Implementation entry points. Callers should use MutableStateContextOwner. +namespace detail { + +MutableStateContext mutable_state_create_context(); +void mutable_state_destroy_context(MutableStateContext ctx); +void mutable_state_begin_load(MutableStateContext ctx); +void mutable_state_end_load(); +bool mutable_state_available(MutableStateContext ctx); +int64_t mutable_state_bytes_per_session(MutableStateContext ctx); +::executorch::runtime::Error mutable_state_validate_coverage( + MutableStateContext ctx); +::executorch::runtime::Result mutable_state_create_session( + MutableStateContext ctx); +void mutable_state_destroy_session(MutableStateContext ctx, int token); +void mutable_state_set_active(MutableStateContext ctx, int token); + +} // namespace detail + +// Caller-facing owner for one mutable-state context. Mirrors the CUDA backend's +// MutableStateContextOwner so the example engine can use a symmetric API. 
+class ET_EXPERIMENTAL MutableStateContextOwner final { + class LoadScope final { + public: + explicit LoadScope(MutableStateContext ctx) { + detail::mutable_state_begin_load(ctx); + } + + ~LoadScope() { + detail::mutable_state_end_load(); + } + + LoadScope(const LoadScope&) = delete; + LoadScope& operator=(const LoadScope&) = delete; + }; + + class ActiveSessionScope final { + public: + ActiveSessionScope(MutableStateContext ctx, int token) { + detail::mutable_state_set_active(ctx, token); + } + + ~ActiveSessionScope() { + detail::mutable_state_set_active( + kInvalidMutableContext, kNoMutableSession); + } + + ActiveSessionScope(const ActiveSessionScope&) = delete; + ActiveSessionScope& operator=(const ActiveSessionScope&) = delete; + }; + + public: + MutableStateContextOwner() : ctx_(detail::mutable_state_create_context()) {} + + ~MutableStateContextOwner() { + destroy(); + } + + MutableStateContextOwner(const MutableStateContextOwner&) = delete; + MutableStateContextOwner& operator=(const MutableStateContextOwner&) = delete; + + MutableStateContextOwner(MutableStateContextOwner&& other) noexcept + : ctx_(std::exchange(other.ctx_, kInvalidMutableContext)) {} + + MutableStateContextOwner& operator=( + MutableStateContextOwner&& other) noexcept { + if (this != &other) { + destroy(); + ctx_ = std::exchange(other.ctx_, kInvalidMutableContext); + } + return *this; + } + + MutableStateContext get() const { + return ctx_; + } + + explicit operator bool() const { + return ctx_ != kInvalidMutableContext; + } + + // Associates delegate handles created by `fn` with this context. + template + auto with_load_scope(Fn&& fn) const -> decltype(std::forward(fn)()) { + LoadScope scope(ctx_); + return std::forward(fn)(); + } + + // Selects this context/session while `fn` executes. The caller is responsible + // for serializing execution that touches the same loaded program. 
+ // + // Thread-safety contract: destroy_session()/forget_handle() only take the + // registry mutex, while rebind (under with_active_session) hands execute a + // raw pointer into Context::sessions that is dereferenced after the lock is + // released. The caller must therefore guarantee a session is never destroyed + // while it is the active session mid-execute (the engine upholds this: a + // session's buffers are freed only when its owning LLMSession drops, never + // concurrently with its own execute). Destroying *other* sessions + // concurrently is safe — unordered_map keeps element pointers stable across + // rehash. + template + auto with_active_session(int token, Fn&& fn) const + -> decltype(std::forward(fn)()) { + ActiveSessionScope scope(ctx_, token); + return std::forward(fn)(); + } + + // True only after this context has been associated with at least one loaded + // MLX backend handle and can create isolated mutable-buffer sessions. + bool available() const { + return detail::mutable_state_available(ctx_); + } + + int64_t bytes_per_session() const { + return detail::mutable_state_bytes_per_session(ctx_); + } + + ::executorch::runtime::Error validate_coverage() const { + return detail::mutable_state_validate_coverage(ctx_); + } + + // Creates an isolated mutable-buffer session for this context. + // Fails if no loaded MLX backend handle has been associated with the context. + ET_NODISCARD ::executorch::runtime::Result create_session() const { + return detail::mutable_state_create_session(ctx_); + } + + void destroy_session(int token) const { + detail::mutable_state_destroy_session(ctx_, token); + } + + private: + void destroy() { + if (ctx_ != kInvalidMutableContext) { + detail::mutable_state_destroy_context(ctx_); + ctx_ = kInvalidMutableContext; + } + } + + MutableStateContext ctx_ = kInvalidMutableContext; +}; + +// --- MLXBackend hooks -------------------------------------------------------- +// +// Called from MLXBackend init/execute/destroy. 
`handle` is an opaque key (the +// MLXHandle pointer). `program` and `default_buffers` are the handle's own +// program and (init-time) mutable buffers; the manager swaps in per-session +// buffers (or restores the default) by re-pointing `state.mutable_buffers`. + +void mutable_state_note_handle( + const void* handle, + const MLXProgram* program, + MutableBufferData* default_buffers); + +void mutable_state_forget_handle(const void* handle); + +::executorch::runtime::Error mutable_state_rebind_for_execute( + const void* handle, + ExecutionState& state); + +} // namespace mlx +} // namespace backends +} // namespace executorch diff --git a/backends/mlx/test/CMakeLists.txt b/backends/mlx/test/CMakeLists.txt index 39024639d1d..2d494652138 100644 --- a/backends/mlx/test/CMakeLists.txt +++ b/backends/mlx/test/CMakeLists.txt @@ -69,3 +69,22 @@ if(EXECUTORCH_MLX_ENABLE_SANITIZERS) multi_thread_test_runner PRIVATE ${_mlx_sanitizer_link_options} ) endif() + +# Per-session mutable-state manager unit test (no model/tokenizer needed). +add_executable(mlx_mutable_state_test mlx_mutable_state_test.cpp) +target_include_directories( + mlx_mutable_state_test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../runtime +) +target_link_libraries( + mlx_mutable_state_test PRIVATE mlxdelegate mlx_schema mlx executorch_core +) +if(EXECUTORCH_MLX_ENABLE_SANITIZERS) + target_compile_options( + mlx_mutable_state_test PRIVATE -fsanitize=address,undefined + -fno-omit-frame-pointer + ) + target_link_options( + mlx_mutable_state_test PRIVATE ${_mlx_sanitizer_link_options} + ) +endif() +add_test(NAME mlx_mutable_state COMMAND mlx_mutable_state_test) diff --git a/backends/mlx/test/mlx_mutable_state_test.cpp b/backends/mlx/test/mlx_mutable_state_test.cpp new file mode 100644 index 00000000000..99a646701ef --- /dev/null +++ b/backends/mlx/test/mlx_mutable_state_test.cpp @@ -0,0 +1,132 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Unit test for the MLX per-session mutable-state manager +// (backends/mlx/runtime/mlx_mutable_state.{h,cpp}). +// +// Verifies that two sessions created on one loaded program get independent +// mutable buffers: writing into session A's buffer does not leak into session +// B's, and A's value persists across a rebind to B and back. This is the MLX +// analogue of the CUDA "no-bleed" guarantee, exercised directly on the manager +// (no model or tokenizer needed). + +#include "MLXExecutor.h" +#include "MLXLoader.h" +#include "mlx_mutable_state.h" + +#include + +#include + +using namespace ::executorch::backends::mlx; + +namespace { + +int g_failures = 0; + +#define CHECK(cond) \ + do { \ + if (!(cond)) { \ + std::printf("FAIL: %s (line %d)\n", #cond, __LINE__); \ + ++g_failures; \ + } \ + } while (0) + +// Build a minimal program with a single 1-element float mutable buffer at tid +// 0. +MLXProgram make_program() { + MLXProgram program; + program.num_mutable_buffer_tensors = 1; + program.mutable_buffer_map.push_back(SlotVariant{0, SlotType::TensorSlot}); + TensorMeta meta; + meta.shape.push_back(ShapeDim{/*value=*/1}); + meta.scalar_type = ScalarType::Float; + program.tensor_meta.resize(1); + program.tensor_meta[0] = meta; + return program; +} + +float read0(const MutableBufferData& bufs) { + auto arr = bufs.get(Tid{0}); + ::mlx::core::eval(arr); + return arr.item(); +} + +} // namespace + +int main() { + MLXProgram program = make_program(); + + // Handle's default (init-time) mutable buffers. + MutableBufferData default_bufs; + load_mutable_buffers(program, default_bufs); + + int dummy = 0; + const void* handle = &dummy; + + MutableStateContextOwner owner; + CHECK(static_cast(owner)); + + // Associate the handle with the context (as MLXBackend::init would). 
+ owner.with_load_scope( + [&]() { mutable_state_note_handle(handle, &program, &default_bufs); }); + + CHECK(owner.available()); + CHECK(owner.bytes_per_session() == static_cast(sizeof(float))); + + auto tokA = owner.create_session(); + auto tokB = owner.create_session(); + CHECK(tokA.ok()); + CHECK(tokB.ok()); + CHECK(tokA.get() != tokB.get()); + + ExecutionState state; + + // Session A: rebind, then write a marker (7.0) into its buffer. + owner.with_active_session(tokA.get(), [&]() { + auto err = mutable_state_rebind_for_execute(handle, state); + CHECK(err == ::executorch::runtime::Error::Ok); + state.mutable_buffers->set( + Tid{0}, ::mlx::core::full({1}, 7.0f, ::mlx::core::float32)); + return err; + }); + + // Session B: a fresh rebind must see zeros, not A's marker. + owner.with_active_session(tokB.get(), [&]() { + auto err = mutable_state_rebind_for_execute(handle, state); + CHECK(err == ::executorch::runtime::Error::Ok); + CHECK(read0(*state.mutable_buffers) == 0.0f); + return err; + }); + + // Back to session A: the marker must persist (isolation, no bleed). + owner.with_active_session(tokA.get(), [&]() { + auto err = mutable_state_rebind_for_execute(handle, state); + CHECK(err == ::executorch::runtime::Error::Ok); + CHECK(read0(*state.mutable_buffers) == 7.0f); + return err; + }); + + // With sessions present, executing without an active session is refused + // (prevents running against unmanaged/shared state). 
+ { + auto err = mutable_state_rebind_for_execute(handle, state); + CHECK(err == ::executorch::runtime::Error::InvalidState); + } + + owner.destroy_session(tokA.get()); + owner.destroy_session(tokB.get()); + mutable_state_forget_handle(handle); + + if (g_failures == 0) { + std::printf("OK: mlx_mutable_state isolation test passed\n"); + return 0; + } + std::printf("FAILED: %d checks\n", g_failures); + return 1; +} diff --git a/examples/models/qwen3_5_moe/CMakeLists.txt b/examples/models/qwen3_5_moe/CMakeLists.txt index 726657a3779..aeb97f76ab7 100644 --- a/examples/models/qwen3_5_moe/CMakeLists.txt +++ b/examples/models/qwen3_5_moe/CMakeLists.txt @@ -89,6 +89,7 @@ endif() if(TARGET mlxdelegate) executorch_target_copy_mlx_metallib(qwen3_5_moe_runner) + executorch_target_copy_mlx_metallib(qwen3_5_moe_worker) endif() if(EXECUTORCH_BUILD_CUDA) diff --git a/examples/models/qwen3_5_moe/CMakePresets.json b/examples/models/qwen3_5_moe/CMakePresets.json index 276c2116148..6adcb8aa9cb 100644 --- a/examples/models/qwen3_5_moe/CMakePresets.json +++ b/examples/models/qwen3_5_moe/CMakePresets.json @@ -70,9 +70,9 @@ }, { "name": "qwen3-5-moe-mlx", - "displayName": "Build Qwen3.5 MoE runner (MLX)", + "displayName": "Build Qwen3.5 MoE runner and worker (MLX)", "configurePreset": "qwen3-5-moe-mlx", - "targets": ["qwen3_5_moe_runner"] + "targets": ["qwen3_5_moe_runner", "qwen3_5_moe_worker"] } ], "workflowPresets": [ diff --git a/examples/models/qwen3_5_moe/README.md b/examples/models/qwen3_5_moe/README.md index c275641bfd7..77f53aefcc6 100644 --- a/examples/models/qwen3_5_moe/README.md +++ b/examples/models/qwen3_5_moe/README.md @@ -302,6 +302,66 @@ python -m executorch.examples.models.qwen3_5_moe.run \ --max-new-tokens 50 ``` +### Serving (MLX, multi-session) + +The MLX worker hosts multiple isolated sessions on **one** weight load, so an +OpenAI-compatible server can serve concurrent conversations without duplicating +the ~weights. 
`make qwen3_5_moe-mlx` builds both `qwen3_5_moe_runner` and +`qwen3_5_moe_worker` (each with `mlx.metallib` copied alongside). + +Start the server (it auto-locates the worker binary): + +```bash +# tokenizer.json the C++ worker opens (resolve from the HF cache) +TOKENIZER_JSON=$(ls "${HF_HOME:-$HOME/.cache/huggingface}"/hub/models--Qwen--Qwen3.5-35B-A3B/snapshots/*/tokenizer.json | head -n1) + +python -m executorch.examples.models.qwen3_5_moe.serve \ + --model-path ./qwen35_moe_mlx/model.pte \ + --tokenizer-path "$TOKENIZER_JSON" \ + --hf-tokenizer Qwen/Qwen3.5-35B-A3B \ + --max-sessions 4 \ + --host 127.0.0.1 \ + --port 8000 +``` + +- `--tokenizer-path` is the raw `tokenizer.json` **file** the worker loads; + `--hf-tokenizer` (HF id or local dir) supplies the chat template on the Python + side. No `--data-path` (the MLX `.pte` is self-contained). +- `--max-sessions N` caps physical sessions on the single weight load. One slot + is reserved for anonymous requests (requests sent without a session id), so + `N` allows `N-1` concurrently named sessions. + +Query it (OpenAI-compatible) from another terminal. Route each conversation to a +session with the `session_id` header: + +```bash +curl http://127.0.0.1:8000/v1/chat/completions \ + -H "Content-Type: application/json" -H "session_id: alice" \ + -d '{"model":"qwen3.5-moe", + "messages":[{"role":"user","content":"What is the capital of France?"}], + "max_tokens":50,"chat_template_kwargs":{"enable_thinking":false}}' +``` + +Endpoints: `GET /health`, `GET /v1/models`, `POST /v1/chat/completions`, +`DELETE /v1/sessions/{id}` (free a session + its slot), `POST /v1/sessions/{id}/reset`. + +Session/memory semantics on MLX: +- This server uses the standard **stateless** OpenAI contract — send the full + `messages` history each request. `session_id` + warm-resume is a KV-cache reuse + optimization for the shared prefix, not server-side memory. 
+- Each session adds **one** set of mutable buffers (KV + recurrent/conv state) on + top of the shared weights; per-session cost scales with `max_seq_len`. Weights + are never duplicated. +- KV persists across requests for a live session and is **released on close** + (`DELETE`/reset). Named sessions are not auto-closed — close them to free slots. + MLX's Metal allocator pools freed buffers (so RSS may not shrink immediately), + but they are reused by later sessions, keeping memory bounded. +- Requests are processed **one at a time** (a single in-flight request per + worker). A request runs to completion and head-of-line-blocks every other + session until it finishes; there is no token-level interleaving or parallel + execution. This holds on both MLX and CUDA; multi-session provides memory + isolation and warm resume, not added throughput. + ### Tiny Model Test For CI or quick pipeline validation (no model download needed): diff --git a/examples/models/qwen3_5_moe/mlx_source_transformations.py b/examples/models/qwen3_5_moe/mlx_source_transformations.py index 9a49f8a84f6..3c460fc9c54 100644 --- a/examples/models/qwen3_5_moe/mlx_source_transformations.py +++ b/examples/models/qwen3_5_moe/mlx_source_transformations.py @@ -113,12 +113,14 @@ def _full_attention_forward(self, x, input_pos): k, v = self.kv_cache.update(input_pos, k, v) - if self.n_kv_groups > 1: - k = k.repeat_interleave(self.n_kv_groups, dim=1) - v = v.repeat_interleave(self.n_kv_groups, dim=1) - - attn_mask = self.mask[input_pos].unsqueeze(0).unsqueeze(0) - y = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask) + y = torch.ops.mlx.custom_sdpa( + q, + k, + v, + start_pos=pos, + dropout_p=0.0, + is_causal=True, + ) y = y.transpose(1, 2).contiguous().view(B, T, -1) @@ -184,10 +186,8 @@ def _exportable_gated_delta_net_forward(self, x, input_pos): k, (self.head_k_dim,), self._qk_rms_weight, eps=1e-6 ) - # head_repeat for k_heads != v_heads - if self.head_repeat > 1: - q = 
q.repeat_interleave(self.head_repeat, dim=2) - k = k.repeat_interleave(self.head_repeat, dim=2) + # GQA head expansion (k_heads != v_heads) is handled inside + # mlx::gated_delta_rule # Mamba-style gating beta = b.sigmoid() @@ -278,17 +278,13 @@ def _swap_gated_delta_net(model, model_dtype): def _swap_full_attention(model, config): - """FullAttention → mlx::rope custom op + causal mask.""" + """FullAttention → mlx::rope custom op""" rope_theta = config.rope_theta if config else 10000.0 - max_seq_len = config.max_seq_len if config else 4096 count = 0 for _name, module in model.named_modules(): if isinstance(module, FullAttention): module._rope_dims = module.rotary_emb.rotary_dim module._rope_base = rope_theta - mask = torch.full((max_seq_len, max_seq_len), float("-inf")) - mask = torch.triu(mask, diagonal=1) - module.register_buffer("mask", mask) module.forward = types.MethodType(_full_attention_forward, module) count += 1 return count diff --git a/examples/models/qwen3_5_moe/qwen35_moe_engine.cpp b/examples/models/qwen3_5_moe/qwen35_moe_engine.cpp index 713f6211330..fd81f60c23a 100644 --- a/examples/models/qwen3_5_moe/qwen35_moe_engine.cpp +++ b/examples/models/qwen3_5_moe/qwen35_moe_engine.cpp @@ -183,9 +183,9 @@ class Qwen35MoESession : public LLMSession { ::tokenizers::Tokenizer* tokenizer, std::unordered_map metadata, std::unordered_set eos_ids -#ifdef EXECUTORCH_BUILD_CUDA +#ifdef QWEN_HAS_MUTABLE_STATE , - ::executorch::backends::cuda::MutableStateContextOwner* mutable_state, + MutableStateContextOwner* mutable_state, int session_token #endif ) @@ -195,7 +195,7 @@ class Qwen35MoESession : public LLMSession { tokenizer_(tokenizer), metadata_(std::move(metadata)), eos_ids_(std::move(eos_ids)) -#ifdef EXECUTORCH_BUILD_CUDA +#ifdef QWEN_HAS_MUTABLE_STATE , mutable_state_(mutable_state), session_token_(session_token) @@ -212,9 +212,8 @@ class Qwen35MoESession : public LLMSession { } ~Qwen35MoESession() override { -#ifdef EXECUTORCH_BUILD_CUDA - if (mutable_state_ 
!= nullptr && - session_token_ != ::executorch::backends::cuda::kNoMutableSession) { +#ifdef QWEN_HAS_MUTABLE_STATE + if (mutable_state_ != nullptr && session_token_ != kNoMutableSession) { mutable_state_->destroy_session(session_token_); } #endif @@ -425,8 +424,8 @@ class Qwen35MoESession : public LLMSession { float temperature, bool sync_after) { std::lock_guard guard(*exec_mutex_); -#ifdef EXECUTORCH_BUILD_CUDA - Result> res = mutable_state_ != nullptr +#ifdef QWEN_HAS_MUTABLE_STATE + auto res = mutable_state_ != nullptr ? mutable_state_->with_active_session( session_token_, [&]() { return module_->execute(method, inputs); }) @@ -465,10 +464,11 @@ class Qwen35MoESession : public LLMSession { int64_t decode_pos_data_[1] = {0}; TensorPtr decode_tokens_; TensorPtr decode_pos_; +#ifdef QWEN_HAS_MUTABLE_STATE + MutableStateContextOwner* mutable_state_ = nullptr; + int session_token_ = kNoMutableSession; +#endif #ifdef EXECUTORCH_BUILD_CUDA - ::executorch::backends::cuda::MutableStateContextOwner* mutable_state_ = - nullptr; - int session_token_ = ::executorch::backends::cuda::kNoMutableSession; float temp_val_ = 1e-6f; TensorPtr temp_tensor_; #endif @@ -529,17 +529,17 @@ Result> Qwen35MoEEngine::create( "not stop at end of turn"); } +#ifdef QWEN_HAS_MUTABLE_STATE + std::unique_ptr mutable_state; +#endif #ifdef EXECUTORCH_BUILD_CUDA - std::unique_ptr<::executorch::backends::cuda::MutableStateContextOwner> - mutable_state; if (config.enable_cuda_graph) { ET_LOG( Info, "Qwen35MoEEngine: CUDA graph requested; per-session rebinding disabled " "and serving capacity clamped to 1 session."); } else { - auto candidate = std::make_unique< - ::executorch::backends::cuda::MutableStateContextOwner>(); + auto candidate = std::make_unique(); if (Error e = register_mutable_fqns(meta_module.get(), *candidate); e == Error::Ok) { mutable_state = std::move(candidate); @@ -550,9 +550,13 @@ Result> Qwen35MoEEngine::create( "serving capacity clamped to 1 session."); } } +#elif 
defined(EXECUTORCH_BUILD_MLX) + // MLX owns mutable buffers directly and selects per-session storage at + // execute time; no FQN registration or coverage check is required. + mutable_state = std::make_unique(); #endif -#ifdef EXECUTORCH_BUILD_CUDA +#ifdef QWEN_HAS_MUTABLE_STATE auto module_res = mutable_state != nullptr ? mutable_state->with_load_scope( [&]() { return build_qwen_module(config); }) @@ -566,16 +570,14 @@ Result> Qwen35MoEEngine::create( std::unique_ptr shared_module = std::move(module_res.get()); bool rebind_available = false; -#ifdef EXECUTORCH_BUILD_CUDA +#ifdef QWEN_HAS_MUTABLE_STATE rebind_available = mutable_state != nullptr && mutable_state->available(); - if (rebind_available) { - if (mutable_state->validate_coverage() != Error::Ok) { - ET_LOG( - Error, - "Qwen35MoEEngine: mutable-buffer coverage check failed; disabling " - "multi-session (capacity clamped to 1)."); - rebind_available = false; - } + if (rebind_available && mutable_state->validate_coverage() != Error::Ok) { + ET_LOG( + Error, + "Qwen35MoEEngine: mutable-buffer coverage check failed; disabling " + "multi-session (capacity clamped to 1)."); + rebind_available = false; } if (!rebind_available) { ET_LOG( @@ -592,7 +594,7 @@ Result> Qwen35MoEEngine::create( std::move(eos_ids), std::move(shared_module), rebind_available -#ifdef EXECUTORCH_BUILD_CUDA +#ifdef QWEN_HAS_MUTABLE_STATE , std::move(mutable_state) #endif @@ -621,7 +623,7 @@ Result> Qwen35MoEEngine::create_session() { } int token = -1; // kNoMutableSession: single-session / no rebind -#ifdef EXECUTORCH_BUILD_CUDA +#ifdef QWEN_HAS_MUTABLE_STATE if (rebind_available_) { auto t = mutable_state_->create_session(); if (t.error() != Error::Ok) { @@ -638,7 +640,7 @@ Result> Qwen35MoEEngine::create_session() { tokenizer_.get(), metadata_, eos_ids_ -#ifdef EXECUTORCH_BUILD_CUDA +#ifdef QWEN_HAS_MUTABLE_STATE , mutable_state_.get(), token @@ -648,7 +650,7 @@ Result> Qwen35MoEEngine::create_session() { LLMServingCapacity 
Qwen35MoEEngine::serving_capacity() const { LLMServingCapacity cap; // default: 1 session, 0 bytes (unknown) -#ifdef EXECUTORCH_BUILD_CUDA +#ifdef QWEN_HAS_MUTABLE_STATE if (rebind_available_) { cap.max_physical_sessions_without_weight_duplication = config_.max_sessions > 1 ? config_.max_sessions : 1; diff --git a/examples/models/qwen3_5_moe/qwen35_moe_engine.h b/examples/models/qwen3_5_moe/qwen35_moe_engine.h index c7ea53115b8..5a5e286c9c5 100644 --- a/examples/models/qwen3_5_moe/qwen35_moe_engine.h +++ b/examples/models/qwen3_5_moe/qwen35_moe_engine.h @@ -7,8 +7,8 @@ */ // Engine/Session adapter for the Qwen3.5 MoE exported prefill/decode methods. -// CUDA builds can host multiple sessions on one loaded model by rebinding the -// model's mutable buffers before each execute. +// CUDA and MLX builds can host multiple sessions on one loaded model by +// rebinding the model's mutable buffers before each execute. #pragma once @@ -28,10 +28,28 @@ #ifdef EXECUTORCH_BUILD_CUDA #include +#elif defined(EXECUTORCH_BUILD_MLX) +#include +#endif + +#if defined(EXECUTORCH_BUILD_CUDA) || defined(EXECUTORCH_BUILD_MLX) +#define QWEN_HAS_MUTABLE_STATE 1 #endif namespace executorch::extension::llm { +#if defined(EXECUTORCH_BUILD_CUDA) +using MutableStateContextOwner = + ::executorch::backends::cuda::MutableStateContextOwner; +constexpr int kNoMutableSession = + ::executorch::backends::cuda::kNoMutableSession; +#elif defined(EXECUTORCH_BUILD_MLX) +using MutableStateContextOwner = + ::executorch::backends::mlx::MutableStateContextOwner; +constexpr int kNoMutableSession = + ::executorch::backends::mlx::kNoMutableSession; +#endif + /// Immutable configuration for a Qwen3.5 MoE engine. 
struct Qwen35MoEConfig { std::string model_path; // .pte @@ -77,10 +95,9 @@ class ET_EXPERIMENTAL Qwen35MoEEngine : public LLMEngine { std::unordered_set eos_ids, std::unique_ptr shared_module, bool rebind_available -#ifdef EXECUTORCH_BUILD_CUDA +#ifdef QWEN_HAS_MUTABLE_STATE , - std::unique_ptr<::executorch::backends::cuda::MutableStateContextOwner> - mutable_state + std::unique_ptr mutable_state #endif ) : config_(std::move(config)), @@ -89,7 +106,7 @@ class ET_EXPERIMENTAL Qwen35MoEEngine : public LLMEngine { eos_ids_(std::move(eos_ids)), shared_module_(std::move(shared_module)), rebind_available_(rebind_available) -#ifdef EXECUTORCH_BUILD_CUDA +#ifdef QWEN_HAS_MUTABLE_STATE , mutable_state_(std::move(mutable_state)) #endif @@ -104,9 +121,8 @@ class ET_EXPERIMENTAL Qwen35MoEEngine : public LLMEngine { std::unique_ptr shared_module_; std::mutex exec_mutex_; bool rebind_available_ = false; -#ifdef EXECUTORCH_BUILD_CUDA - std::unique_ptr<::executorch::backends::cuda::MutableStateContextOwner> - mutable_state_; +#ifdef QWEN_HAS_MUTABLE_STATE + std::unique_ptr mutable_state_; #endif std::atomic live_sessions_{0}; }; From 3169302acdf08e690296c2b61392de5f388c5f46 Mon Sep 17 00:00:00 2001 From: Irina Korchakova Date: Tue, 23 Jun 2026 21:59:47 +0200 Subject: [PATCH 3/7] NXP backend: Add nxp backend profiling support (#19225) ### Summary Add profiling support for the NXP backend. ### Test plan All CI tests passed including new test for the profiling feature. 
--------- Signed-off-by: Irina Korchakova --- .../nxp/backend/edge_program_converter.py | 46 +- .../ir/converter/builder/model_builder.py | 31 ++ .../ir/tflite_generator/tflite_model.py | 5 + .../nxp/backend/neutron_converter_manager.py | 12 + backends/nxp/backend/neutron_map.py | 457 ++++++++++++++++++ backends/nxp/nxp_backend.py | 72 ++- backends/nxp/runtime/NeutronBackend.cpp | 77 ++- backends/nxp/tests/executorch_pipeline.py | 5 + backends/nxp/tests/executors.py | 2 +- .../tests/generic_tests/test_aot_example.py | 87 +++- ...st_move_activation_before_concatenation.py | 16 +- .../test_neutron_backend_executor.py | 6 +- .../test_neutron_converter_manager.py | 2 +- .../test_per_channel_conversion.py | 2 +- .../nxp/tests/generic_tests/test_profiling.py | 158 ++++++ .../nxp/tests/generic_tests/test_quantizer.py | 8 +- .../node_converter/test_addmm_converter.py | 4 +- .../node_converter/test_bmm_converter.py | 2 +- .../node_converter/test_clone_converter.py | 6 +- .../node_converter/test_conv_converter.py | 4 +- .../node_converter/test_mm_converter.py | 4 +- .../node_converter/test_neg_converter.py | 4 +- .../node_converter/test_prelu_converter.py | 2 +- .../node_converter/test_softmax_converter.py | 4 +- .../test_view_copy_converter.py | 4 +- .../ir/edge_passes/test_linear_bn_fusing.py | 2 +- backends/nxp/tests/nsys_testing.py | 12 + .../_static/img/nxp/nxp-mcuxpresso-etdump.png | Bin 0 -> 44262 bytes docs/source/backends/nxp/nxp-overview.md | 3 + docs/source/backends/nxp/nxp-profiling.md | 205 ++++++++ examples/nxp/analyzing_with_inspector.py | 58 +++ examples/nxp/aot_neutron_compile.py | 31 +- 32 files changed, 1268 insertions(+), 63 deletions(-) create mode 100644 backends/nxp/backend/neutron_map.py create mode 100644 backends/nxp/tests/generic_tests/test_profiling.py create mode 100644 docs/source/_static/img/nxp/nxp-mcuxpresso-etdump.png create mode 100644 docs/source/backends/nxp/nxp-profiling.md create mode 100644 examples/nxp/analyzing_with_inspector.py diff 
--git a/backends/nxp/backend/edge_program_converter.py b/backends/nxp/backend/edge_program_converter.py index ff8cbb660cb..9df8290e85d 100644 --- a/backends/nxp/backend/edge_program_converter.py +++ b/backends/nxp/backend/edge_program_converter.py @@ -74,13 +74,16 @@ class EdgeProgramToIRConverter: _default_target_spec = NeutronTargetSpec("imxrt700") _default_delegation_options = CustomDelegationOptions() + def __init__(self): + self.edge_to_tflite_map = {} + def convert_program( self, edge_program: ExportedProgram, conversion_config: ConversionConfig = _default_conversion_config, neutron_target_spec: NeutronTargetSpec = _default_target_spec, custom_delegation_options: CustomDelegationOptions = _default_delegation_options, - ) -> tuple[bytes, dict[str, dict[str, DataFormat]]]: + ) -> tuple[bytes, dict[str, dict[str, DataFormat]], dict[int, tuple[int, ...]]]: """ Convert ExportedProgram in Edge dialect to IR (TFLite flatbuffers) as bytes. @@ -88,8 +91,11 @@ def convert_program( :param conversion_config: ConversionConfig instance. :param neutron_target_spec: Object for querying the target platform to retrieve its properties. :param custom_delegation_options: Custom user options which affect node delegation. - :return: TFLite flatbuffers as bytes. + :return: TFLite flatbuffers as bytes, I/O formats, and edge-to-tflite mapping. """ + # Reset the edge to tflite map for each conversion + self.edge_to_tflite_map = {} + parameters_mapping = self.map_inputs_to_parameters(edge_program) dim_order_map = self.map_nodes_to_dim_order(edge_program) @@ -113,6 +119,9 @@ def convert_program( # Apply optimizations and finalize the model. internal_tflite_model = cc.tflite_builder.finish() + # Get the final edge to tflite mapping after optimization + self.edge_to_tflite_map = cc.tflite_builder.edge_to_tflite_map + # Extract the formats of the model's inputs and outputs. 
io_formats = cc.tflite_builder.get_io_formats(edge_program.graph_signature) @@ -120,7 +129,7 @@ def convert_program( flatbuffers_builder = flatbuffers.Builder() internal_tflite_model.gen_tflite(flatbuffers_builder) - return bytes(flatbuffers_builder.Output()), io_formats + return bytes(flatbuffers_builder.Output()), io_formats, self.edge_to_tflite_map @staticmethod def append_placeholders_and_tensors(nodes: list[Node], context: ConversionContext): @@ -162,7 +171,6 @@ def _process_nodes(self, nodes: list[Node], conversion_context: ConversionContex exir_ops.edge.quantized_decomposed.dequantize_per_channel.default, exir_ops.edge.quantized_decomposed.quantize_per_tensor.default, ] - for node in nodes: if node.op == "call_function": if node.target in qdq_related_functions and "cluster" in node.meta: @@ -174,7 +182,37 @@ def _process_nodes(self, nodes: list[Node], conversion_context: ConversionContex # The node was already processed alongside the Q/DQ ops. pass elif node.target in functions_converters: + # Get TFLite op count BEFORE conversion + tflite_op_count_before = len( + conversion_context.tflite_builder.get_operators().vector + ) + # Convert the node functions_converters[node.target](conversion_context).convert(node) + # Get TFLite op count AFTER conversion + tflite_op_count_after = len( + conversion_context.tflite_builder.get_operators().vector + ) + + # Track the mapping - store edge debug handle in operators. + # Get the edge debug handle so it can be associated with newly created operators. + edge_debug_handle = node.meta.get("debug_handle", None) + if ( + edge_debug_handle is not None + and tflite_op_count_after > tflite_op_count_before + ): + operators = ( + conversion_context.tflite_builder.get_operators().vector + ) + # Node converters append new operators to the TFLite builder. + # Only operators added during this conversion step (from "before" to "after") + # are tagged with the current edge_debug_handle. 
+ for i in range(tflite_op_count_before, tflite_op_count_after): + # Store edge debug handle in operator's temporary attribute + operators[i].tmp_edge_debug_handle = edge_debug_handle + logger.d( + f"Tagged TFLite ops {list(range(tflite_op_count_before, tflite_op_count_after))} with edge debug_handle={edge_debug_handle} for node '{node.name}'" + ) + else: logger.e( logger.Code.NOT_IMPLEMENTED, diff --git a/backends/nxp/backend/ir/converter/builder/model_builder.py b/backends/nxp/backend/ir/converter/builder/model_builder.py index f97a194ce87..41820c3ab61 100755 --- a/backends/nxp/backend/ir/converter/builder/model_builder.py +++ b/backends/nxp/backend/ir/converter/builder/model_builder.py @@ -85,6 +85,10 @@ class ModelBuilder: conversion_config: ConversionConfig + edge_to_tflite_map: dict[ + int, tuple[int, ...] + ] # Mapping edge debug handles to tuple of TFLite operator indices + _default_conversion_config = ConversionConfig() def __init__( @@ -105,6 +109,7 @@ def __init__( self._nchw_tensor_version = {} self._skipped_output_map = {} self._zeros_tensor_map = {} + self.edge_to_tflite_map = {} def create_zeros_tensor( self, dims: List[int], name: str, dtype: np.dtype, can_reuse: bool = False @@ -503,6 +508,9 @@ def finish(self) -> tflite_model.Model: self.conversion_config.optimization_blacklist, ) + # Create the final edge-to-tflite mapping after model optimization + self._create_edge_to_tflite_mapping() + self._keep_one_empty_buffer() # Remove outputs, which are not produced by any node. Otherwise, there would be errors after inference. @@ -524,6 +532,29 @@ def finish(self) -> tflite_model.Model: return self._tfl_model + def _create_edge_to_tflite_mapping(self): + """Create edge-to-TFLite mapping and save it to the edge_to_tflite_map class variable. + + This function should be called after all model optimizations have been applied to match the output TFLite model. 
+ """ + + edge_to_tflite_dict = {} + for idx, op in enumerate(self.get_operators().vector): + if ( + hasattr(op, "tmp_edge_debug_handle") + and op.tmp_edge_debug_handle is not None + ): + debug_handle = op.tmp_edge_debug_handle + if debug_handle not in edge_to_tflite_dict: + edge_to_tflite_dict[debug_handle] = [] + edge_to_tflite_dict[debug_handle].append(idx) + + # Convert lists to tuples in the dictionary + self.edge_to_tflite_map = {k: tuple(v) for k, v in edge_to_tflite_dict.items()} + logger.i( + f"\nFinal edge_to_tflite_map after optimization: {self.edge_to_tflite_map}" + ) + def _assign_io_tensor_indices(self, inputs, outputs, allow_inputs_stripping: bool): for tensor in outputs.tmp_outputs: try: diff --git a/backends/nxp/backend/ir/tflite_generator/tflite_model.py b/backends/nxp/backend/ir/tflite_generator/tflite_model.py index 6e8e7b6c33b..d8d0bada57d 100755 --- a/backends/nxp/backend/ir/tflite_generator/tflite_model.py +++ b/backends/nxp/backend/ir/tflite_generator/tflite_model.py @@ -514,6 +514,9 @@ class Operator(meta.TFLiteObject): # If `True`, this is an extra operator added during conversion. It was not present in the original input model. 
tmp_added_extra: bool + # Edge program debug handle for mapping edge nodes to TFLite operators + tmp_edge_debug_handle: Optional[int] + def __init__( self, inputs: OperatorInputs = None, @@ -541,6 +544,8 @@ def __init__( self.tmp_version = 1 self.tmp_added_extra = False + self.tmp_edge_debug_handle = None + def uses_per_channel_quantization(self) -> bool: """Determine if this operator uses per-channel quantization.""" for tensor in itertools.chain(self.tmp_inputs, self.tmp_outputs): diff --git a/backends/nxp/backend/neutron_converter_manager.py b/backends/nxp/backend/neutron_converter_manager.py index 0abee0cdc86..92b4e25a5de 100644 --- a/backends/nxp/backend/neutron_converter_manager.py +++ b/backends/nxp/backend/neutron_converter_manager.py @@ -25,6 +25,15 @@ def _build_compilation_context(compilation_opts): cctx.compilationOpts.dumpKernelSelectionCode = compilation_opts[ "dumpKernelSelectionCode" ] + if ( + hasattr(cctx.compilationOpts, "useProfiling") + and compilation_opts["useProfiling"] + ): + cctx.compilationOpts.useProfiling = compilation_opts["useProfiling"] + cctx.compilationOpts.dumpAfterImport = "console" + cctx.compilationOpts.dumpAfterGenerate = "console" + cctx.compilationOpts.verbose = compilation_opts["useProfiling"] + return cctx @@ -81,6 +90,7 @@ def convert( target: str, delegation_tag: str, fetch_constants_to_sram: bool = False, + use_profiling: bool = False, ) -> bytes: """ Call Neutron Converter. @@ -89,6 +99,7 @@ def convert( :param target: The target platform. :param delegation_tag: The delegation tag of model partition. :param fetch_constants_to_sram: Add microcode that fetches weights from external memory. + :param use_profiling: Use profiling for neutron delegated model. This allows running models which do not fit into SRAM. Applies to Neutron-C only (microcontrollers). :return: TFLite model with Neutron microcode as bytes. 
@@ -102,6 +113,7 @@ def convert( "excludeGraphPasses": "HoistSliceAboveTranspose,MergeTranspose", "fetchConstantsToSRAM": fetch_constants_to_sram, "dumpKernelSelectionCode": self.dump_kernel_selection_code, + "useProfiling": use_profiling, } # Try to use multiprocessing for isolation, but fall back to direct execution diff --git a/backends/nxp/backend/neutron_map.py b/backends/nxp/backend/neutron_map.py new file mode 100644 index 00000000000..e2da653daa3 --- /dev/null +++ b/backends/nxp/backend/neutron_map.py @@ -0,0 +1,457 @@ +# Copyright 2026 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +import logging +import re +from dataclasses import dataclass + +# example: Type: CONV_2D +# Inputs: +# [0]: quantized_decomposed_quantize_per_tensor_default_4 +# [1]: quantized_decomposed_dequantize_per_channel_default_2 +# Outputs: +# [0]: quantized_decomposed_quantize_per_tensor_default_5 +# Location: 4 +PATTERN_NODE = ( + r"Type:\s+(?P\w+)\s+" + r"Inputs:(?P[\s\S]*?)" + r"Outputs:(?P[\s\S]*?)" + r"Location:\s+(?P\d+)" +) +# The pattern is very similar to operator pattern +PATTERN_SUBGRAPH = ( + r"^(?P\d+)\s*" + r"Inputs:(?P[\s\S]*?)" + r"Outputs:(?P[\s\S]*?)" + r"Tensors:" +) +# example: [0]: quantized_decomposed_quantize_per_tensor_default_4 +PATTERN_IO_TENSOR_NAME = r"\[\d+\]:\s+(?P[\S]+)" +# example: Statistics for NeutronGraph "subgraph_195": +PATTERN_GRAPH = r"Statistics for NeutronGraph \"subgraph_(?P\d+)\":" +# example: NeutronOperator "subgraph_001": +# Operators: +# PAD +# CONV_2D +# Kernels: +# Pad +# Conv2DStandardV2 +# NeutronOperator "subgraph_002": +PATTERN_VERBOSE_KERNELS = ( + r"\"subgraph_(?P\d+)\"\:\s*" + r"Operators:[\s\S]*?" 
+ r"Kernels:\s*(?P[\s\S]*?)" + r"\s*(NeutronOperator|^$|=)" +) +# example: NeutronGraph "subgraph_074": +PATTERN_VERBOSE_GRAPH = ( + r"NeutronGraph\s*\"subgraph_(?P\d+)\":(?P[\s\S]*?)\s*(^$|=)" +) +# Two graphs are expected in the input log: original and converted. +EXPECTED_GRAPHS = 2 +# List of single-input nodes that shouldn't be mapped on the same TFLite node. +SINGLE_INPUT_NODES = [ + "ABS", + "AVERAGE_POOL_2D", + "CAST", + "EXP", + "HARD_SWISH", + "LEAKY_RELU", + "LOG", + "LOGISTIC", + "MAX_POOL_2D", + "QUANTIZE", + "RSQRT", + "TANH", +] + + +@dataclass +class Node: + name: str # Name of the node. + inputs: list[str] # List of nodes inputs. + outputs: list[str] # List of nodes outputs. + location: int # Location in graph/subgraph. + + +@dataclass +class SubgraphInfo: + num: int # Subgraph number. + location: int # Location in neutron graph + inputs: list[str] # List of subgraphs inputs. + outputs: list[str] # List of subgraphs outputs. + kernels: int # Number of neutron kernels in neutron subgraph. + nodes: list[Node] # List of tflite nodes in neutron subgraph. + + +def get_tensors_name(tensors: str) -> list[str]: + """Split input string with tensor names into list of names""" + return [m.group("name") for m in re.finditer(PATTERN_IO_TENSOR_NAME, tensors)] + + +class NeutronMap: + """Mapping between Neutron, TFLite, and Edge operators based on the Neutron converter log. + + Parses the Neutron converter log to extract information about TFLite nodes and Neutron subgraphs. + Maps TFLite operators to corresponding Neutron operators. + Maps Edge operators to Neutron operators via the Edge-to-TFLite mapping. + + Attributes: + tflite_nodes (list[Node]): TFLite node information extracted from the converter log. + neutron_subgraphs (list[SubgraphInfo]): Neutron subgraph information extracted from the converter log. + neutron_graphs (list[int]): Indices of final Neutron graphs derived from neutron_subgraphs. 
+ edge_to_tflite_map (dict[int, tuple[int, ...]]): Mapping from Edge operators to TFLite operators. + edge_to_neutron_map (dict[int, tuple[int, ...]]): Mapping from Edge operators to Neutron operators. + tflite_to_neutron_map (dict[int, tuple[int, ...]]): Mapping from TFLite operators to Neutron operators. + + Example: + >>> map = NeutronMap(log_output, edge_to_tflite_map) + >>> neutron_to_edge_map = map.get_neutron_to_edge_map() + """ + + tflite_nodes: list[Node] + neutron_subgraphs: list[SubgraphInfo] + neutron_graphs: list[int] + edge_to_tflite_map: dict[int, tuple[int, ...]] + edge_to_neutron_map: dict[int, tuple[int, ...]] + tflite_to_neutron_map: dict[int, tuple[int, ...]] + + def __init__( + self, neutron_converter_log: str, edge_to_tflite_map: dict[int, tuple[int, ...]] + ) -> None: + """Initialize neutron map from neutron converter log. + + :param neutron_converter_log: neutron converter log obtained during model conversion. It should contain + original tflite graph and neutron graph dump. To add these dumps to converter log the dumpAfterImport and + dumpAfterGenerate flags have to be set to "console". + """ + super().__init__() + self.tflite_nodes = [] + self.neutron_subgraphs = [] + self.neutron_graphs = [] + self.edge_to_tflite_map = edge_to_tflite_map + self.tflite_to_neutron_map = {} + self.edge_to_neutron_map = {} + self.neutron_kernels_num = 0 + self._split_profiling_log(neutron_converter_log) + + def _split_profiling_log(self, log: str) -> None: + """Process profiling log to split it into original TFLite and converted Neutron nodes. + + :param log: Neutron converter log obtained during model conversion, containing the original + TFLite graph and Neutron graph dump. + :return: None. Sets class attributes tflite_nodes and neutron_subgraphs with node information. 
+ """ + graphs = log.split("Graphs:") + # Check if there is two graphs in the input dump + if len(graphs) != EXPECTED_GRAPHS + 1: + return + optimization_dump, neutron_graph_dump = graphs[1:] + + # Get tflite model dump + tflite_graph_dump = optimization_dump.partition("= Optimize Graph =")[0] + + # Get verbose Neutron graphs located in the Extract Graphs section. + extracted_graph_dump = optimization_dump.partition("= Extract Graphs =")[ + 2 + ].partition("Generate code for NeutronGraph")[0] + + # Get list of original operators from first dumped graph. + self.tflite_nodes = [ + Node( + matched_operator.group("type"), + get_tensors_name(matched_operator.group("inputs")), + get_tensors_name(matched_operator.group("outputs")), + int(matched_operator.group("location")), + ) + for matched_operator in re.finditer(PATTERN_NODE, tflite_graph_dump) + ] + # Get list of neutron subgraphs. + self.neutron_subgraphs = self._get_neutron_subgraphs(neutron_graph_dump) + if self.neutron_subgraphs: + self._update_neutron_subgraphs_info(extracted_graph_dump) + + def _get_neutron_subgraphs(self, graph_dump: str) -> list[SubgraphInfo]: + """Parse Neutron graph dump and extract subgraph information. + + :param graph_dump: String containing the Neutron graph dump from the converter log. + :return: List of SubgraphInfo objects containing subgraph metadata and operator nodes. + """ + + def get_subgraph_nodes(subrgraph_dump: str) -> list[Node]: + """Parse subgraph dump and extract operator nodes. + + :param subgraph_dump: String containing a single Neutron subgraph definition. + :return: List of Node objects representing operators in the subgraph. 
+ """ + return [ + Node( + matched_operator.group("type"), + get_tensors_name(matched_operator.group("inputs")), + get_tensors_name(matched_operator.group("outputs")), + int(matched_operator.group("location")), + ) + for matched_operator in re.finditer(PATTERN_NODE, subrgraph_dump) + ] + + subgraphs = graph_dump.split(r"Name: subgraph_") + if len(subgraphs) < 3: + return [] + + # Get numbers of final neutron graphs in converted model. + self.neutron_graphs = [ + int(matched_graphs.group("num")) + for matched_graphs in re.finditer(PATTERN_GRAPH, subgraphs[-1]) + ] + if not self.neutron_graphs: + return [] + + # Get subgraphs + neutron_subgraphs: list[SubgraphInfo] = [] + for subgraph in subgraphs[1:]: + subgraph_match = re.search(PATTERN_SUBGRAPH, subgraph) + if not subgraph_match: + continue + neutron_subgraph = SubgraphInfo( + int(subgraph_match.group("num")), + -1, + get_tensors_name(subgraph_match.group("inputs")), + get_tensors_name(subgraph_match.group("outputs")), + 0, + get_subgraph_nodes(subgraph), + ) + neutron_subgraphs.append(neutron_subgraph) + return neutron_subgraphs + + def _update_neutron_subgraphs_info(self, extracted_graph: str) -> None: + """Update Neutron subgraphs with verbose info. + + - Set numbers of Neutron kernels in each Neutron subgraph. 99% of subgraphs contain only one Neutron kernel, + but there are some exceptions and some subgraphs can have more kernels. This number can be taken from + final Neutron graph info. + - Set Neutron subgraphs location in the final Neutron Graph. The function updates the location parameter + for each Neutron subgraph according to its position in the final Neutron graph. Location is calculated + continuously across all Neutron graphs in the model. Non-Neutron operators are skipped. + + :param extracted_graph: verbose Neutron graph dump. + """ + # Neutron graphs. 
+ neutron_graphs = extracted_graph.split("NeutronGraph") + location_shift = 0 + for neutron_graph in neutron_graphs: + + subgraph_nodes = { + int(matched_subgraph.group("subgraph")): { + "location": i + location_shift, + "kernels": [ + kernel.replace(" ", "") + for kernel in matched_subgraph.group("kernels").split("\n") + if kernel.strip() + ], + } + for i, matched_subgraph in enumerate( + re.finditer(PATTERN_VERBOSE_KERNELS, neutron_graph) + ) + } + if not subgraph_nodes: + continue + # Update location offset according to the number of kernels in the subgraph. + location_shift += len(subgraph_nodes) + + # Neutron graphs. + graph_num = -1 + matched_graph = re.search(r"subgraph_(?P\d+)", neutron_graph) + if matched_graph: + graph_num = int(matched_graph.group("subgraph")) + + # Update number of kernels for all subgraphs. + for subgraph in self.neutron_subgraphs: + if subgraph.num in subgraph_nodes: + subgraph.kernels = len(subgraph_nodes[subgraph.num]["kernels"]) + subgraph.location = subgraph_nodes[subgraph.num]["location"] + elif subgraph.num == graph_num: + subgraph.kernels = sum( + len(s["kernels"]) for s in subgraph_nodes.values() + ) + self.neutron_kernels_num += subgraph.kernels + + def _nodes_match_by_io(self, tf_node: Node, neutron_node: Node) -> bool: + """ + Determine whether a TFLite node can be mapped to a Neutron node + based on their input and output compatibility. + + :param tf_node: Source TFLite node. + :param neutron_node: Target Neutron node. + :return: True if the nodes can be considered mapped, False otherwise. + """ + + def get_name_matches(tf_names: list[str], neutron_names: list[str]) -> int: + # Count how many names from tf_names have a corresponding match in + # neutron_names. A match is defined as: + # - exact equality, or + # - one name being a hierarchical variant of the other + # (i.e., sharing a common prefix separated by "/"). 
+ result = 0 + for tf_name in tf_names: + # Determine if the tensor name corresponds to a special operation input. + # Matches names like "perm0", "perm1", etc. used by Transpose ops, + # and names like "padding0", "padding1", etc. used by Pad ops. + special_op = ( + "permutation" + if re.fullmatch(r"perm(\d+)?", tf_name) + else ( + "padding" + if re.fullmatch(r"padding(s)?(\d+)?", tf_name) + else None + ) + ) + for neutron_name in neutron_names: + if ( + neutron_name == tf_name + or neutron_name + "/" in tf_name + or tf_name + "/" in neutron_name + ): + result += 1 + break + + # Check if the neutron input is also the special op (Pad or Transpose) + if special_op and special_op in neutron_name: + result += 1 + break + return result + + name_matches = get_name_matches(tf_node.inputs, neutron_node.inputs) + # Map the node if all TFLite inputs match Neutron inputs. + # Note: the Neutron node may still have additional extra inputs. + if name_matches == len(tf_node.inputs): + return True + elif name_matches == len(tf_node.inputs) - 1: + # If there is only one unmatched input, check matching of outputs. + name_matches = get_name_matches(tf_node.outputs, neutron_node.outputs) + if name_matches == len(tf_node.outputs): + # Map the node if all TFLite outputs match Neutron outputs. + return True + return False + + def get_tflite_to_neutron_map(self) -> dict[int, tuple[int, ...]]: + """Map TFLite nodes from the original model to Neutron nodes in the converted model. + + The mapping is built based on input and output tensor names. Neutron tensors may have + exactly the same names or use the format "tflite_input/additional_name". + + :return: Dictionary mapping TFLite node indices to tuple of Neutron subgraph indices. 
+ """ + tflite_to_neutron_dict = {} + for tf_idx, tf_node in enumerate(self.tflite_nodes): + subgraph_idxs = [] + for subgraph in self.neutron_subgraphs: + if ( + subgraph.num in self.neutron_graphs + or subgraph.location in subgraph_idxs + ): + continue + for neutron_node in subgraph.nodes: + if self._nodes_match_by_io(tf_node, neutron_node): + subgraph_idxs.append(subgraph.location) + break + # Filter subgraph_idxs to avoid mapping multiple parallel single-input nodes that consume the + # same input tensor into the same TFLite node. + subgraph_idxs = self._filter_single_input_nodes(tf_node.name, subgraph_idxs) + if subgraph_idxs: + tflite_to_neutron_dict[tf_idx] = tuple(subgraph_idxs) + + self.tflite_to_neutron_map = tflite_to_neutron_dict + return self.tflite_to_neutron_map + + def _filter_single_input_nodes( + self, node_name: str, subgraph_loc: list[int] + ) -> list[int]: + """ + Filter the Neutron-to-TFLite mapping to avoid mapping multiple parallel single-input nodes + that consume the same input tensor to a single TFLite node. + + The function checks whether the current TFLite node is a supported single-input node + (as defined in SINGLE_INPUT_NODES) and whether it is mapped to multiple Neutron nodes. + In such cases, it is possible that parallel single-input Neutron nodes were incorrectly + mapped to the same TFLite node. + + If more than one single-input Neutron node is mapped, only one is kept in the mapping: + the Neutron node whose operation name matches the operation name of the current TFLite node. + + :param node_name: Operation name of the current TFLite node. + :param subgraph_loc: List of Neutron subgraph indices whose inputs correspond to the + input of the current TFLite node. + :return: Filtered list of Neutron subgraph indices to be mapped to the current TFLite node. + """ + # Check if there can be potential issue in mapping. 
+ if node_name in SINGLE_INPUT_NODES and len(subgraph_loc) > 1: + single_in_nodes = [] + # Find all single-input nodes in subgraph_idxs. + subgraphs = ( + subgraph + for subgraph in self.neutron_subgraphs + if subgraph.location in subgraph_loc + ) + for subgraph in subgraphs: + for neutron_node in subgraph.nodes: + if neutron_node.name in SINGLE_INPUT_NODES: + single_in_nodes.append((subgraph.location, neutron_node.name)) + if len(single_in_nodes) > 0: + # Keep only the node with the matching name when multiple single-input nodes are present in subgraph_idxs. + for subgraph_id, single_in_node_name in single_in_nodes: + if single_in_node_name == node_name: + return [subgraph_id] + return [] + return subgraph_loc + + def get_edge_to_neutron_map(self) -> dict[int, tuple[int, ...]]: + """Map Edge nodes to Neutron nodes. + + :return: Dictionary mapping Edge node handles to tuple of Neutron subgraph indices. + """ + self.get_tflite_to_neutron_map() + edge_to_neutron_dict = {} + + for edge_handle, tflite_indices in self.edge_to_tflite_map.items(): + neutron_nodes = set() + for tf_node in tflite_indices: + if tf_node in self.tflite_to_neutron_map: + neutron_nodes.update(self.tflite_to_neutron_map[tf_node]) + if neutron_nodes: + edge_to_neutron_dict[edge_handle] = tuple(neutron_nodes) + + self.edge_to_neutron_map = edge_to_neutron_dict + return self.edge_to_neutron_map + + def get_neutron_to_edge_map(self) -> dict[int, tuple[int, ...]]: + """ + Transform edge-to-neutron map to neutron-to-edge map. + + :return: Dictionary mapping neutron_index to tuple of edge_handles + """ + if not self.edge_to_neutron_map: + _ = self.get_edge_to_neutron_map() + + neutron_to_edge = {} + + for edge_handle, neutron_indices in self.edge_to_neutron_map.items(): + for neutron_idx in neutron_indices: + if neutron_idx not in neutron_to_edge: + neutron_to_edge[neutron_idx] = [] + neutron_to_edge[neutron_idx].append(edge_handle) + + # Fill gaps with empty tuples and convert lists to tuples. 
+ if neutron_to_edge: + max_neutron_idx = self.neutron_kernels_num + result = {} + # Add one more non-mapped event at the end of list for the Neutron Dump event. + for i in range(max_neutron_idx + 1): + if i in neutron_to_edge: + result[i] = tuple(neutron_to_edge[i]) + else: + result[i] = () + logging.info(f"Neutron to Edge map was created: {result}") + return result + else: + return {} diff --git a/backends/nxp/nxp_backend.py b/backends/nxp/nxp_backend.py index 1a84a418e92..ee711c34369 100644 --- a/backends/nxp/nxp_backend.py +++ b/backends/nxp/nxp_backend.py @@ -11,6 +11,8 @@ import logging import os import struct +import tempfile +from contextlib import contextmanager from typing import final import numpy as np @@ -26,6 +28,8 @@ from executorch.backends.nxp.backend.neutron_converter_manager import ( NeutronConverterManager, ) + +from executorch.backends.nxp.backend.neutron_map import NeutronMap from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec from executorch.backends.nxp.neutron_node_extraction import ( extract_artifacts_from_neutron_node, @@ -54,6 +58,7 @@ def __init__(self): self.use_neutron_for_format_conversion = True self.fetch_constants_to_sram = False self.dump_kernel_selection_code = False + self.use_profiling = False def _replace_colons(self, operator: str) -> str: """ @@ -70,6 +75,7 @@ def neutron_compile_spec( use_neutron_for_format_conversion: bool = True, fetch_constants_to_sram: bool = False, dump_kernel_selection_code: bool = False, + use_profiling: bool = False, ) -> "NeutronCompileSpecBuilder": """Generate compile spec for Neutron NPU @@ -83,6 +89,7 @@ def neutron_compile_spec( :param fetch_constants_to_sram: If True, the Neutron Converter will insert microinstructions to prefetch weights from FLASH to SRAM. This should be used when the whole model does not fit into SRAM. :param dump_kernel_selection_code: Whether Neutron converter dumps kernel selection code. 
+ :param use_profiling: If true Neutron Converter will enable profiling for neutron delegated model :return: self for method chaining """ @@ -106,6 +113,7 @@ def neutron_compile_spec( self.use_neutron_for_format_conversion = use_neutron_for_format_conversion self.fetch_constants_to_sram = fetch_constants_to_sram self.dump_kernel_selection_code = dump_kernel_selection_code + self.use_profiling = use_profiling return self @@ -135,6 +143,10 @@ def build(self): "dump_kernel_selection_code", f"{self.dump_kernel_selection_code}".encode(), ), + CompileSpec( + "use_profiling", + f"{self.use_profiling}".encode(), + ), ] return self.compile_spec @@ -149,6 +161,7 @@ def generate_neutron_compile_spec( use_neutron_for_format_conversion: bool = True, fetch_constants_to_sram: bool = False, dump_kernel_selection_code: bool = False, + use_profiling: bool = False, ) -> list[CompileSpec]: return ( NeutronCompileSpecBuilder() @@ -160,11 +173,36 @@ def generate_neutron_compile_spec( use_neutron_for_format_conversion=use_neutron_for_format_conversion, fetch_constants_to_sram=fetch_constants_to_sram, dump_kernel_selection_code=dump_kernel_selection_code, + use_profiling=use_profiling, ) .build() ) +@contextmanager +def capture_fd_output(): + tmp = tempfile.TemporaryFile() + + # Save original stdout / stderr + original_stdout_fd = os.dup(1) + original_stderr_fd = os.dup(2) + + try: + # Redirect fd=1 and fd=2 to temp file + os.dup2(tmp.fileno(), 1) + os.dup2(tmp.fileno(), 2) + + yield tmp # give access to the temp file + + finally: + # Restore original fds + os.dup2(original_stdout_fd, 1) + os.dup2(original_stderr_fd, 2) + + os.close(original_stdout_fd) + os.close(original_stderr_fd) + + @final class NeutronBackend(BackendDetails): @@ -185,6 +223,7 @@ def preprocess( # noqa C901 use_neutron_for_format_conversion = None fetch_constants_to_sram = False dump_kernel_selection_code = None + use_profiling = False for spec in compile_spec: if spec.key == "output_format": output_format = 
spec.value.decode() @@ -200,6 +239,8 @@ def preprocess( # noqa C901 fetch_constants_to_sram = spec.value.decode() == "True" if spec.key == "dump_kernel_selection_code": dump_kernel_selection_code = spec.value.decode() == "True" + if spec.key == "use_profiling": + use_profiling = spec.value.decode() == "True" # Check that the output format is set in the compile spec if not output_format: @@ -229,19 +270,32 @@ def preprocess( # noqa C901 if use_neutron_for_format_conversion is not None else {} ) - tflite_model, io_formats = EdgeProgramToIRConverter().convert_program( + ( + tflite_model, + io_formats, + edge_to_tflite_map, + ) = EdgeProgramToIRConverter().convert_program( edge_program, neutron_target_spec=NeutronTargetSpec(target), conversion_config=conversion_config, custom_delegation_options=CustomDelegationOptions(), ) - neutron_model = NeutronConverterManager(dump_kernel_selection_code).convert( - tflite_model, - target, - delegation_tag, - fetch_constants_to_sram, - ) + with capture_fd_output() as tmp: + neutron_model = NeutronConverterManager( + dump_kernel_selection_code + ).convert( + tflite_model, + target, + delegation_tag, + fetch_constants_to_sram, + use_profiling, + ) + tmp.seek(0) + log_output = tmp.read().decode() + # Get mapping from tflite to neutron + map = NeutronMap(log_output, edge_to_tflite_map) + neutron_to_edge_map = map.get_neutron_to_edge_map() # Dump the tflite file if intermediates_dir is set if intermediates_dir != "None": @@ -265,7 +319,9 @@ def preprocess( # noqa C901 else: raise RuntimeError(f"Unknown format {output_format}") - return PreprocessResult(processed_bytes=binary) + return PreprocessResult( + processed_bytes=binary, debug_handle_map=neutron_to_edge_map + ) class PayloadComposer: diff --git a/backends/nxp/runtime/NeutronBackend.cpp b/backends/nxp/runtime/NeutronBackend.cpp index 3ea973b7c5b..6fe0482ed89 100644 --- a/backends/nxp/runtime/NeutronBackend.cpp +++ b/backends/nxp/runtime/NeutronBackend.cpp @@ -10,6 +10,7 @@ #include 
#include #include +#include #include #include "NeutronDriver.h" @@ -25,6 +26,8 @@ namespace neutron { #define ALIGN_SIZE(size) \ ((size + BUFFER_ALIGNMENT - 1) & (~(BUFFER_ALIGNMENT - 1))) +#define KOPC_CALLARGS 6 // The operation for TileIR + // clang-format off /* Header schema: +----------------------------+-----------------------------+------------------------+ @@ -84,6 +87,19 @@ typedef struct { const uint8_t* outputMap; } NeutronExecutorchConfig; +typedef struct { + uint8_t eventCode; + uint8_t opCode; + uint8_t functionCode; + uint8_t timestampCode; + uint32_t time; +} NeutronSingleProfilingEvent; + +typedef struct { + NeutronSingleProfilingEvent startEvent; + NeutronSingleProfilingEvent stopEvent; +} NeutronFullProfilingEvent; + #ifdef EXTERNAL_MEM // Neutron compute has no access to FLASH. // Prefetch weights from FLASH to SRAM using memcpy. @@ -508,12 +524,11 @@ class NeutronBackend final : public PyTorchBackendInterface { } } -#ifdef NEUTRON_PROFILE - // TODO: Use trace from BackendExecutionContext. - NeutronTraceConfig trace_config{.traceConfig = 0}; - neutronSetTrace(cfg->nmh, &trace_config); +#ifdef ET_EVENT_TRACER_ENABLED + // Save ticks before neutron compute to measure how much time profiling dump + // takes + et_timestamp_t start_ticks = ::executorch::runtime::pal_current_ticks(); #endif - // Run neutron compute. NeutronError neutronRC = neutronRunBlocking(cfg->nmh, &cfg->dcfg); if (neutronRC != ENONE) { @@ -523,6 +538,11 @@ class NeutronBackend final : public PyTorchBackendInterface { neutronRC); return Error::InvalidProgram; } +#ifdef ET_EVENT_TRACER_ENABLED + // Save ticks after neutron compute to measure how much time profiling dump + // takes + et_timestamp_t stop_ticks = ::executorch::runtime::pal_current_ticks(); +#endif // Transpose outputs. 
for (int i = 0; i < cfg->numOutputs; i++) { @@ -558,6 +578,53 @@ class NeutronBackend final : public PyTorchBackendInterface { } } } +#ifdef ET_EVENT_TRACER_ENABLED + // Add traced evens only if model has profiling info. + auto profile_size = cfg->profileSize; + if (profile_size > 0) { + int events_num = static_cast(profile_size / 16); + auto profiling_index = cfg->numOutputs + 1; + char* profile_info = + static_cast(cfg->dcfg.outputs[profiling_index]); + NeutronFullProfilingEvent* neutron_events = + (NeutronFullProfilingEvent*)profile_info; + executorch::runtime::EventTracer* tracer = context.event_tracer(); + uint32_t start_time = 0; + int index = 0; + // Post log neutron events from profiling output. + for (int i = 0; i < events_num; i++) { + if (start_time == 0) { + start_time = neutron_events[i].startEvent.time; + } + if (neutron_events[i].stopEvent.opCode != KOPC_CALLARGS) { + // Only KOPC_CALLARGS events can be mapped to original .pte model. + continue; + } else { + event_tracer_log_profiling_delegate( + tracer, + nullptr, + index, + start_time, + neutron_events[i].stopEvent.time, + static_cast( + &neutron_events[i].startEvent.functionCode), + sizeof(uint8_t)); + start_time = 0; + index++; + } + } + event_tracer_log_profiling_delegate( + tracer, + nullptr, + index, + neutron_events[events_num - 1].startEvent.time, + neutron_events[events_num - 1].stopEvent.time + stop_ticks - + start_ticks, + static_cast( + &neutron_events[events_num - 1].startEvent.functionCode), + sizeof(uint8_t)); + } +#endif return Error::Ok; } diff --git a/backends/nxp/tests/executorch_pipeline.py b/backends/nxp/tests/executorch_pipeline.py index 44a96010593..1309e019428 100644 --- a/backends/nxp/tests/executorch_pipeline.py +++ b/backends/nxp/tests/executorch_pipeline.py @@ -190,6 +190,7 @@ def to_quantized_edge_program( use_quant_state_dict: bool = True, fetch_constants_to_sram: bool = False, dump_kernel_selection_code: bool = False, + use_profiling: bool = False, 
delegate_to_npu=True, ) -> EdgeProgramManager: _neutron_target_spec = NeutronTargetSpec(target) @@ -223,6 +224,7 @@ def to_quantized_edge_program( use_neutron_for_format_conversion=use_neutron_for_format_conversion, fetch_constants_to_sram=fetch_constants_to_sram, dump_kernel_selection_code=dump_kernel_selection_code, + use_profiling=use_profiling, ) post_quant_state_dict = ( exir_program_aten__module_quant.state_dict() if use_quant_state_dict else None @@ -244,6 +246,7 @@ def to_quantized_edge_program( export(exir_program_aten__module_quant, example_input, strict=True), transform_passes=NeutronEdgePassManager(), partitioner=partitioners, + generate_etrecord=use_profiling, compile_config=EdgeCompileConfig( _check_ir_validity=False, _core_aten_ops_exception_list=core_aten_ops_exception_list, @@ -274,6 +277,7 @@ def to_quantized_executorch_program( use_neutron_for_format_conversion: bool = True, dataset_dir: str | None = None, delegate_to_npu=True, + use_profiling: bool = False, operators_not_to_delegate: list[str] = None, remove_quant_io_ops: bool = False, ) -> ExecutorchProgramManager: @@ -295,6 +299,7 @@ def to_quantized_executorch_program( train_fn=train_fn, use_neutron_for_format_conversion=use_neutron_for_format_conversion, delegate_to_npu=delegate_to_npu, + use_profiling=use_profiling, operators_not_to_delegate=operators_not_to_delegate, remove_quant_io_ops=remove_quant_io_ops, **get_calibration_inputs_fn, diff --git a/backends/nxp/tests/executors.py b/backends/nxp/tests/executors.py index 319f372b5fa..94e91a31b95 100644 --- a/backends/nxp/tests/executors.py +++ b/backends/nxp/tests/executors.py @@ -325,7 +325,7 @@ def convert_run_compare( if tfl_model is None: NodeFormatInference(edge_program).identify_node_formats() - tfl_model, _ = EdgeProgramToIRConverter().convert_program( + tfl_model, *_ = EdgeProgramToIRConverter().convert_program( edge_program, conversion_config ) diff --git a/backends/nxp/tests/generic_tests/test_aot_example.py 
b/backends/nxp/tests/generic_tests/test_aot_example.py index 893041fe372..8a1e5e49555 100644 --- a/backends/nxp/tests/generic_tests/test_aot_example.py +++ b/backends/nxp/tests/generic_tests/test_aot_example.py @@ -2,11 +2,13 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. - +import os import subprocess import sys from pathlib import Path +from executorch.backends.nxp.tests.config_importer import test_config + # noinspection PyProtectedMember from executorch.exir._serialize import _deserialize_pte_binary from executorch.exir.schema import DelegateCall, KernelCall @@ -15,9 +17,8 @@ def test_aot_example__mobilenet_v2(): """Test that mobilenet can be lowered to Neutron backend via `aot_neutron_compile.py` and all ops are delegated.""" - # Find the executorch root directory (5 levels up from this test file) - executorch_root = Path(__file__).parent.parent.parent.parent.parent - assert executorch_root.exists(), f"Executorch root not found at {executorch_root}" + # Set the executorch root directory. + executorch_root = test_config.PROJECT_DIR # Run the compilation script as a module (like run_aot_example.sh does) cmd = [ @@ -34,14 +35,14 @@ def test_aot_example__mobilenet_v2(): ] # Output file will be created in executorch_root - pte_file = executorch_root / "mobilenetv2_nxp_delegate.pte" + pte_file = Path(os.path.join(executorch_root, "mobilenetv2_nxp_delegate.pte")) try: result = subprocess.run( cmd, capture_output=True, text=True, - timeout=300, # 5 minute timeout just in case. On my machine, the test usually runs ~1 minute. + timeout=300, # 5 minute timeout just in case. On 8-core x86 the test usually runs ~1 minute. 
cwd=str( executorch_root ), # Run from executorch root (like run_aot_example.sh) @@ -95,3 +96,77 @@ def test_aot_example__mobilenet_v2(): # Clean up the generated file if pte_file.exists(): pte_file.unlink() + + +def test_aot_example__mobilenet_v2__profiling(): + """Test that mobilenet_v2 can be lowered to Neutron backend via `aot_neutron_compile.py`, all ops are delegated, + the output model is profilable and ETRecord is generated properly.""" + + # Set the executorch root directory. + executorch_root = test_config.PROJECT_DIR + + # Run the compilation script as a module (like run_aot_example.sh does) + cmd = [ + sys.executable, + "-m", + "examples.nxp.aot_neutron_compile", + "--model_name", + "mobilenetv2", + "--delegate", + "--quantize", + "--target", + "imxrt700", + "--remove-quant-io-ops", + "--use_channels_last_dim_order", + "--use_profiling", # Generate profilable model and create ETRecord + "--use_random_dataset", # Avoid downloading the dataset. + ] + + # Output files will be created in executorch_root. + pte_file = Path( + os.path.join(executorch_root, "mobilenetv2_nxp_delegate_profile.pte") + ) + etrecord_file = Path( + os.path.join(executorch_root, "etrecord", "mobilenetv2_etrecord.bin") + ) + + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=300, # 5 minute timeout just in case. On 8-core x86 the test usually runs ~1 minute. + cwd=str( + executorch_root + ), # Run from executorch root (like run_aot_example.sh) + ) + + # Check script ran successfully. + assert result.returncode == 0, ( + f"Script failed with return code {result.returncode}\n" + f"STDOUT:\n{result.stdout}\n" + f"STDERR:\n{result.stderr}" + ) + + # Check if delegated model was created and saved. + assert pte_file.exists(), f"PTE file not created at {pte_file}" + + # Combine stdout and stderr to capture all subprocess output, including logs. + process_output = result.stdout + result.stderr + + # Check if nonempty Neutron to Edge map was created. 
+ assert "Neutron to Edge map was created:" in process_output + + # Check if ETRecord was created and saved. + assert "The ETRecord for the model was saved to" in process_output + assert etrecord_file.exists(), f"ETRecord file not created at {etrecord_file}" + + finally: + # Clean up the generated files. + if pte_file.exists(): + pte_file.unlink() + if etrecord_file.exists(): + etrecord_file.unlink() + parent = etrecord_file.parent + if not any(parent.iterdir()): + parent.rmdir() diff --git a/backends/nxp/tests/generic_tests/test_move_activation_before_concatenation.py b/backends/nxp/tests/generic_tests/test_move_activation_before_concatenation.py index 27bd675a487..6aa07dbba8d 100644 --- a/backends/nxp/tests/generic_tests/test_move_activation_before_concatenation.py +++ b/backends/nxp/tests/generic_tests/test_move_activation_before_concatenation.py @@ -629,7 +629,7 @@ def test_move_activation_before_concat_quantization__conv( "lowered_module" in node.name for node in edge_program.graph.nodes ) - tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value + tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value exported_program: ExportedProgram = converter_spy.calls[-1].args[0] input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype( np.int8 @@ -668,7 +668,7 @@ def test_move_activation_before_concat_quantization__linear( "lowered_module" in node.name for node in edge_program.graph.nodes ) - tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value + tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value exported_program: ExportedProgram = converter_spy.calls[-1].args[0] input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype( np.int8 @@ -706,7 +706,7 @@ def test_move_activation_before_concat_quantization__addmm( "lowered_module" in node.name for node in edge_program.graph.nodes ) - tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value + 
tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value exported_program: ExportedProgram = converter_spy.calls[-1].args[0] input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype( np.int8 @@ -744,7 +744,7 @@ def test_move_activation_before_concat_quantization__mm( "lowered_module" in node.name for node in edge_program.graph.nodes ) - tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value + tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value exported_program: ExportedProgram = converter_spy.calls[-1].args[0] input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype( np.int8 @@ -788,9 +788,7 @@ def test_concat_cluster_quantization__conv( "lowered_module" in node.name for node in edge_program.graph.nodes ) - tflite_flatbuffers_model, io_formats = converter_spy.calls[ - -1 - ].return_value + tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value exported_program: ExportedProgram = converter_spy.calls[-1].args[0] exir_program_aten_quant: GraphModule = quantizer_spy.calls[ -1 @@ -861,9 +859,7 @@ def test_concat_cluster_quantization__linear( "lowered_module" in node.name for node in edge_program.graph.nodes ) - tflite_flatbuffers_model, io_formats = converter_spy.calls[ - -1 - ].return_value + tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value exported_program: ExportedProgram = converter_spy.calls[-1].args[0] exir_program_aten_quant: GraphModule = quantizer_spy.calls[ -1 diff --git a/backends/nxp/tests/generic_tests/test_neutron_backend_executor.py b/backends/nxp/tests/generic_tests/test_neutron_backend_executor.py index 8cf7dfe3dc2..52654a482b9 100644 --- a/backends/nxp/tests/generic_tests/test_neutron_backend_executor.py +++ b/backends/nxp/tests/generic_tests/test_neutron_backend_executor.py @@ -37,7 +37,7 @@ def test_lowered_program_and_tflite_output_match__conv2d__no_bias(mocker): ) # Capture generated model - tflite_flatbuffers_model, io_formats = 
converter_spy.spy_return + tflite_flatbuffers_model, *_ = converter_spy.spy_return tflite_model = Model.GetRootAs(tflite_flatbuffers_model) sub_graph = tflite_model.Subgraphs(0) @@ -84,7 +84,7 @@ def test_conv_fc__lowered_program_and_tflite_output_match(mocker): exported_program: ExportedProgram = converter_spy.call_args.args[1] # Capture generated model - tflite_flatbuffers_model, _ = converter_spy.spy_return + tflite_flatbuffers_model, *_ = converter_spy.spy_return # No Transpose ops in produced TFLite model tflite_subgraph = Model.GetRootAs(tflite_flatbuffers_model).Subgraphs(0) @@ -148,7 +148,7 @@ def test_delegating_format_related_transpose_operators__supported_case(mocker): ) # Capture the converted IR model. - tflite_flatbuffers_model, _ = converter_spy.spy_return + tflite_flatbuffers_model, *_ = converter_spy.spy_return # Make sure the `Transpose` ops ARE in the IR model. tflite_subgraph = Model.GetRootAs(tflite_flatbuffers_model).Subgraphs(0) diff --git a/backends/nxp/tests/generic_tests/test_neutron_converter_manager.py b/backends/nxp/tests/generic_tests/test_neutron_converter_manager.py index 0705203db06..359dfdb67e9 100644 --- a/backends/nxp/tests/generic_tests/test_neutron_converter_manager.py +++ b/backends/nxp/tests/generic_tests/test_neutron_converter_manager.py @@ -28,7 +28,7 @@ def test_conv2d_neutron_conversion(): NodeFormatInference(edge_program_manager.exported_program()).identify_node_formats() edge_program_converter = EdgeProgramToIRConverter() - tflite_model, _ = edge_program_converter.convert_program( + tflite_model, *_ = edge_program_converter.convert_program( edge_program_manager.exported_program() ) diff --git a/backends/nxp/tests/generic_tests/test_per_channel_conversion.py b/backends/nxp/tests/generic_tests/test_per_channel_conversion.py index 706d8ed3e14..af9ef08057b 100644 --- a/backends/nxp/tests/generic_tests/test_per_channel_conversion.py +++ b/backends/nxp/tests/generic_tests/test_per_channel_conversion.py @@ -153,7 +153,7 @@ def 
test_per_channel_convolution(self, _, use_qat: bool): use_neutron_for_format_conversion=False, ) - tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value + tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value exported_program: ExportedProgram = converter_spy.calls[-1].args[0] input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype( diff --git a/backends/nxp/tests/generic_tests/test_profiling.py b/backends/nxp/tests/generic_tests/test_profiling.py new file mode 100644 index 00000000000..c922eb070c3 --- /dev/null +++ b/backends/nxp/tests/generic_tests/test_profiling.py @@ -0,0 +1,158 @@ +# Copyright 2026 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +import ast +import logging +import re + +import numpy as np +import pytest +import torch +from executorch.backends.nxp.tests.graph_verifier import BaseGraphVerifier +from executorch.backends.nxp.tests.model_output_comparator import ( + NumericalStatsOutputComparator, +) + +from executorch.backends.nxp.tests.models import AvgPool2dModule, SoftmaxModule +from executorch.backends.nxp.tests.nsys_testing import lower_run_compare + +from executorch.examples.nxp.experimental.cifar_net.cifar_net import CifarNetModel + + +@pytest.fixture(autouse=True) +def reseed_model_per_test_run(): + torch.manual_seed(23) + np.random.seed(23) + + +PATTERN_NEUTRON_MAP = r"Neutron to Edge map was created: (\{.*\})" + + +def extract_map_from_logs(caplog): + for record in caplog.records: + msg = record.getMessage() + neutron_map_match = re.search(PATTERN_NEUTRON_MAP, msg) + if neutron_map_match: + dict_str = neutron_map_match.group(1) + return ast.literal_eval(dict_str) + return None + + +class ParallelPoolModel(torch.nn.Module): + def __init__(self, channels: int): + super().__init__() + self.conv_in = torch.nn.Conv2d(channels, channels, kernel_size=3, padding=1) + self.max_pool2d = 
torch.nn.MaxPool2d(kernel_size=2, stride=2) + self.avg_pool2d = torch.nn.AvgPool2d(kernel_size=2, stride=2) + self.conv_out = torch.nn.Conv2d(2 * channels, channels, kernel_size=1) + + def forward(self, x): + x = self.conv_in(x) + x = torch.cat((self.max_pool2d(x), self.avg_pool2d(x)), dim=1) + x = self.conv_out(x) + return x + + +class TestProfiling: + @pytest.mark.xfail(reason="SoftMax support PR is not merged so far.", strict=True) + def test__softmax(self, caplog, request): + caplog.set_level(logging.INFO) + model = SoftmaxModule(-1) + lower_run_compare( + model, + (10,), + dlg_model_verifier=BaseGraphVerifier(1, []), + request=request, + use_profiling=True, + output_comparator=NumericalStatsOutputComparator(), + ) + + # Neutron map for 1D Softmax with input size 10 should contain 4 nodes: + # 3 Neutron kernels (pad, softmax, and slice) and 1 unmapped node used for profiling dump + neutron_map = extract_map_from_logs(caplog) + assert neutron_map == { + 0: (2,), # Pad + 1: (2,), # Softmax + 2: (2,), # Slice + 3: (), # Neutron Dump + } + + def test__parallel_pool(self, caplog, request): + caplog.set_level(logging.INFO) + input_shape = (1, 3, 32, 32) + model = ParallelPoolModel(input_shape[1]) + lower_run_compare( + model, + input_shape, + dlg_model_verifier=BaseGraphVerifier(1, []), + request=request, + output_comparator=NumericalStatsOutputComparator(), + use_neutron_for_format_conversion=False, + use_profiling=True, + ) + neutron_map = extract_map_from_logs(caplog) + assert neutron_map == { + 0: (6,), # Conv2DStandardV2 + 1: (), # Conv2DDepthwiseV2 (AvgPool) + 2: (7,), # MaxPool + 3: (), # TransposeCHW + 4: (), # TransposeCHW + 5: (), # TransposeCHW + 6: (), # Slice + 7: (), # Pad + 8: (), # Conv2DPointwise + 9: (), # Slice + 10: (), # Neutron Dump + } + + @pytest.mark.xfail(reason="SoftMax support PR is not merged so far.", strict=True) + def test__cifar(self, caplog, request): + caplog.set_level(logging.INFO) + input_shape = (1, 3, 32, 32) + model =
CifarNetModel() + lower_run_compare( + model, + input_shape, + dlg_model_verifier=BaseGraphVerifier(1, []), + request=request, + output_comparator=NumericalStatsOutputComparator(), + use_neutron_for_format_conversion=False, + use_profiling=True, + ) + neutron_map = extract_map_from_logs(caplog) + assert neutron_map == { + 0: (10,), # Pad + 1: (10, 11), # Conv2DStandardV1 (Pad + Conv2d) + 2: (12,), # MaxPool + 3: (13, 14), # Conv2DStandardV1 (Pad + Conv2d) + 4: (15,), # MaxPool + 5: (16, 17), # Conv2DStandardV1 (Pad + Conv2d) + 6: (18,), # MaxPool + 7: (20,), # FullyConnected + 8: (21,), # Pad + 9: (21,), # Softmax + 10: (21,), # Slice + 11: (), # Neutron Dump + } + + def test__avg_pool(self, caplog, request): + caplog.set_level(logging.INFO) + input_shape = (2, 9, 6, 15) + model = AvgPool2dModule(False, 0) + lower_run_compare( + model, + input_shape, + dlg_model_verifier=BaseGraphVerifier(1, []), + request=request, + output_comparator=NumericalStatsOutputComparator(), + use_neutron_for_format_conversion=False, + use_profiling=True, + ) + neutron_map = extract_map_from_logs(caplog) + assert neutron_map == { + 0: (2,), # Pad + 1: (2,), # Conv2DDepthwiseDense + 2: (2,), # Slice + 3: (), # Neutron Dump + } diff --git a/backends/nxp/tests/generic_tests/test_quantizer.py b/backends/nxp/tests/generic_tests/test_quantizer.py index 3c23241e01e..6180d2fd9ae 100644 --- a/backends/nxp/tests/generic_tests/test_quantizer.py +++ b/backends/nxp/tests/generic_tests/test_quantizer.py @@ -432,7 +432,7 @@ def test_quantizer__linear_w_activation(mocker, activation, inplace, use_qat): ) assert any("lowered_module" in node.name for node in edge_program.graph.nodes) - tflite_flatbuffers_model, io_formats = converter_spy.spy_return + tflite_flatbuffers_model, *_ = converter_spy.spy_return exported_program: ExportedProgram = converter_spy.call_args.args[1] exir_program_aten_quant: GraphModule = quantizer_spy.spy_return @@ -477,7 +477,7 @@ def test_quantizer__addmm_w_activation(mocker, 
activation, inplace, use_qat): ) assert any("lowered_module" in node.name for node in edge_program.graph.nodes) - tflite_flatbuffers_model, io_formats = converter_spy.spy_return + tflite_flatbuffers_model, *_ = converter_spy.spy_return exported_program: ExportedProgram = converter_spy.call_args.args[1] exir_program_aten_quant: GraphModule = quantizer_spy.spy_return @@ -522,7 +522,7 @@ def test_quantizer__mm_w_activation(mocker, activation, inplace, use_qat): ) assert any("lowered_module" in node.name for node in edge_program.graph.nodes) - tflite_flatbuffers_model, io_formats = converter_spy.spy_return + tflite_flatbuffers_model, *_ = converter_spy.spy_return exported_program: ExportedProgram = converter_spy.call_args.args[1] exir_program_aten_quant: GraphModule = quantizer_spy.spy_return @@ -567,7 +567,7 @@ def test_quantizer__conv_w_activation(mocker, activation, inplace, use_qat): ) assert any("lowered_module" in node.name for node in edge_program.graph.nodes) - tflite_flatbuffers_model, io_formats = converter_spy.spy_return + tflite_flatbuffers_model, *_ = converter_spy.spy_return exported_program: ExportedProgram = converter_spy.call_args.args[1] exir_program_aten_quant: GraphModule = quantizer_spy.spy_return diff --git a/backends/nxp/tests/ir/converter/node_converter/test_addmm_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_addmm_converter.py index a8cdee41830..668deb28c96 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_addmm_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_addmm_converter.py @@ -51,7 +51,7 @@ def test_addmm_conversion(self, _, use_qat: bool): "lowered_module" in node.name for node in edge_program.graph.nodes ) - tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value + tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value exported_program: ExportedProgram = converter_spy.calls[-1].args[0] input_data = (np.random.random(input_shape).astype(np.float32) 
* 50).astype( np.int8 @@ -84,7 +84,7 @@ def test_linear_conversion__with_bias(self, _, use_qat: bool): "lowered_module" in node.name for node in edge_program.graph.nodes ) - tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value + tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value exported_program: ExportedProgram = converter_spy.calls[-1].args[0] input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype( np.int8 diff --git a/backends/nxp/tests/ir/converter/node_converter/test_bmm_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_bmm_converter.py index dc442a4931c..466f596bf91 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_bmm_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_bmm_converter.py @@ -59,7 +59,7 @@ def test_convert_bmm__supported(mocker, input_shape_x1, input_shape_x2, use_qat) # Verify correct behavior of the converted NeutronIR model. intermediate_ep = converter_spy.call_args.args[1] - neutron_ir_model, _ = converter_spy.spy_return + neutron_ir_model, *_ = converter_spy.spy_return input_data_1 = ( np.random.random(input_shape_x1).astype(np.float32) * 256.0 - 128.0 diff --git a/backends/nxp/tests/ir/converter/node_converter/test_clone_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_clone_converter.py index b4b828cd4e6..5ee3db6752f 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_clone_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_clone_converter.py @@ -182,7 +182,7 @@ def test_conv_dropout_quant( use_neutron_for_format_conversion=False, ).exported_program() - tflite_flatbuffers_model, _ = converter_spy.calls[-1].return_value + tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value exported_program: ExportedProgram = converter_spy.calls[-1].args[0] assert not graph_contains_any( @@ -241,7 +241,7 @@ def test_clone_pool_view_copy_quant( 
use_neutron_for_format_conversion=False, ).exported_program() - tflite_flatbuffers_model, _ = converter_spy.calls[-1].return_value + tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value exported_program: ExportedProgram = converter_spy.calls[-1].args[0] assert not graph_contains_any( @@ -311,7 +311,7 @@ def test_clone__to_contiguous_format(self): ).identify_node_formats() # Convert to the IR. - converted_model, _ = EdgeProgramToIRConverter().convert_program( + converted_model, *_ = EdgeProgramToIRConverter().convert_program( edge_program_manager.exported_program() ) diff --git a/backends/nxp/tests/ir/converter/node_converter/test_conv_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_conv_converter.py index 828647d2113..7105514514a 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_conv_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_conv_converter.py @@ -177,7 +177,7 @@ def test_conv2d_quant_conversion(mocker, model: torch.nn.Module, input_shape, us ) # Capture generated model - tflite_flatbuffers_model, io_formats = converter_spy.spy_return + tflite_flatbuffers_model, *_ = converter_spy.spy_return # Capture converted program exported_program: ExportedProgram = converter_spy.call_args.args[1] @@ -367,7 +367,7 @@ def test_conv_transpose2d_conversion__quantized( assert any("lowered_module" in node.name for node in edge_program.graph.nodes) # Capture generated model - tflite_flatbuffers_model, io_formats = converter_spy.spy_return + tflite_flatbuffers_model, *_ = converter_spy.spy_return # Capture converted program exported_program: ExportedProgram = converter_spy.call_args.args[1] diff --git a/backends/nxp/tests/ir/converter/node_converter/test_mm_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_mm_converter.py index 60dbfd1b215..79fffff3b78 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_mm_converter.py +++ 
b/backends/nxp/tests/ir/converter/node_converter/test_mm_converter.py @@ -51,7 +51,7 @@ def test_mm_conversion(self, _, use_qat: bool): "lowered_module" in node.name for node in edge_program.graph.nodes ) - tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value + tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value exported_program: ExportedProgram = converter_spy.calls[-1].args[0] input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype( np.int8 @@ -85,7 +85,7 @@ def test_linear_conversion__without_bias(self, _, use_qat: bool): "lowered_module" in node.name for node in edge_program.graph.nodes ) - tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value + tflite_flatbuffers_model, *_ = converter_spy.calls[-1].return_value exported_program: ExportedProgram = converter_spy.calls[-1].args[0] input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype( np.int8 diff --git a/backends/nxp/tests/ir/converter/node_converter/test_neg_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_neg_converter.py index e0fc0d85066..2e7f9035e8a 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_neg_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_neg_converter.py @@ -74,7 +74,7 @@ def test_convert_neg(mocker, input_shape): # Verify correct behavior of the converted NeutronIR model. intermediate_ep = converter_spy.call_args.args[1] - neutron_ir_model, _ = converter_spy.spy_return + neutron_ir_model, *_ = converter_spy.spy_return input_data = ( np.random.random(input_shape).astype(np.float32) * 256.0 - 128.0 @@ -105,7 +105,7 @@ def test_convert_neg__channels_last(mocker): # Verify correct behavior of the converted NeutronIR model. 
intermediate_ep = converter_spy.call_args.args[1] - neutron_ir_model, _ = converter_spy.spy_return + neutron_ir_model, *_ = converter_spy.spy_return input_data = ( np.random.random(input_shape).astype(np.float32) * 256.0 - 128.0 diff --git a/backends/nxp/tests/ir/converter/node_converter/test_prelu_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_prelu_converter.py index fb25f02785a..c5c7aa55b03 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_prelu_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_prelu_converter.py @@ -50,7 +50,7 @@ def test_prelu_with_linear_quant_conversion(mocker, input_shape): ).exported_program() # Capture generated entities - neutron_ir_model, _ = converter_spy.spy_return + neutron_ir_model, *_ = converter_spy.spy_return exported_program: ExportedProgram = converter_spy.call_args.args[1] # Check `prelu` was not decomposed into simpler edge operators diff --git a/backends/nxp/tests/ir/converter/node_converter/test_softmax_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_softmax_converter.py index 2621baf18ee..00c10bd257d 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_softmax_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_softmax_converter.py @@ -85,7 +85,7 @@ def test_softmax_delegation(input_shape, dim: int, mocker): # Verify correct behavior of the converted NeutronIR model. intermediate_ep = converter_spy.call_args.args[1] - neutron_ir_model, _ = converter_spy.spy_return + neutron_ir_model, *_ = converter_spy.spy_return input_data = random_input_data(input_shape) # Make sure the tested program contains the `softmax`, and its input has the expected rank. @@ -121,7 +121,7 @@ def test_softmax_delegation__channel_first(input_shape, dim: int, mocker): # Verify correct behavior of the converted NeutronIR model. 
intermediate_ep = converter_spy.call_args.args[1] - neutron_ir_model, _ = converter_spy.spy_return + neutron_ir_model, *_ = converter_spy.spy_return input_data = random_input_data(input_shape) # Make sure the tested program contains the `softmax`. diff --git a/backends/nxp/tests/ir/converter/node_converter/test_view_copy_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_view_copy_converter.py index cb5f398fa21..276b29da142 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_view_copy_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_view_copy_converter.py @@ -265,7 +265,7 @@ def test_view_copy_w_linear_quant_conversion(mocker, input_shape, new_shape, use ) # Capture generated model - tflite_flatbuffers_model, io_formats = converter_spy.spy_return + tflite_flatbuffers_model, *_ = converter_spy.spy_return # Capture converted program edge_program: ExportedProgram = converter_spy.call_args.args[1] @@ -299,7 +299,7 @@ def test_view_w_conv_linear_quant_conversion( ) # Capture generated model - tflite_flatbuffers_model, io_formats = converter_spy.spy_return + tflite_flatbuffers_model, *_ = converter_spy.spy_return # Capture converted program edge_program: ExportedProgram = converter_spy.call_args.args[1] diff --git a/backends/nxp/tests/ir/edge_passes/test_linear_bn_fusing.py b/backends/nxp/tests/ir/edge_passes/test_linear_bn_fusing.py index 88ea567381f..aadef8c7731 100644 --- a/backends/nxp/tests/ir/edge_passes/test_linear_bn_fusing.py +++ b/backends/nxp/tests/ir/edge_passes/test_linear_bn_fusing.py @@ -251,7 +251,7 @@ def test_linear_bn_full_qat_pipeline_conversion( assert any("lowered_module" in node.name for node in edge_program.graph.nodes) # Capture generated model - tflite_flatbuffers_model, _ = converter_spy.spy_return + tflite_flatbuffers_model, *_ = converter_spy.spy_return # Capture converted program exported_program: ExportedProgram = converter_spy.call_args.args[1] diff --git 
a/backends/nxp/tests/nsys_testing.py b/backends/nxp/tests/nsys_testing.py index d5ff3680f38..ef6fe9c864c 100644 --- a/backends/nxp/tests/nsys_testing.py +++ b/backends/nxp/tests/nsys_testing.py @@ -101,6 +101,8 @@ def _run_delegated_executorch_program( mocker, use_qat: bool = False, train_fn: Callable[[torch.fx.GraphModule], None] | None = None, + use_profiling: bool = False, + use_neutron_for_format_conversion=True, operators_not_to_delegate: list[str] = None, remove_quant_io_ops: bool = False, ) -> tuple[ExportedProgram, str]: @@ -129,6 +131,8 @@ def wrapper(*args, **kwargs): delegate_to_npu=True, use_qat=use_qat, train_fn=train_fn, + use_profiling=use_profiling, + use_neutron_for_format_conversion=use_neutron_for_format_conversion, operators_not_to_delegate=operators_not_to_delegate, remove_quant_io_ops=remove_quant_io_ops, ) @@ -405,6 +409,8 @@ def lower_run_compare( reference_model: ReferenceModel = ReferenceModel.QUANTIZED_EXECUTORCH_CPP, use_qat: bool = False, train_fn: Callable[[torch.fx.GraphModule], None] | None = None, + use_profiling: bool = False, + use_neutron_for_format_conversion=True, operators_not_to_delegate: list[str] = None, remove_quant_io_ops: bool = False, ): @@ -424,6 +430,10 @@ def lower_run_compare( :param reference_model: Version of the model which will be run to obtain reference output data. :param use_qat: If True, applies quantization-aware training before conversion (without the QAT training). :param train_fn: Train/finetune function for QAT training. Is used only when `use_qat=True`. + :param use_profiling: Enable profiling for the Neutron-delegated model. + :param use_neutron_for_format_conversion: If True, the EdgeProgramToIRConverter will insert `Transpose` ops to + ensure that the IO matches the executorch partition, which will be + delegated to Neutron. :param operators_not_to_delegate: list of operators not to delegate.
:param remove_quant_io_ops: If true, IO q-ops are removed and verification is done on quantized version of dataset (quantized INT8 input samples). @@ -468,6 +478,8 @@ def lower_run_compare( mocker, use_qat=use_qat, train_fn=train_fn, + use_profiling=use_profiling, + use_neutron_for_format_conversion=use_neutron_for_format_conversion, operators_not_to_delegate=operators_not_to_delegate, remove_quant_io_ops=remove_quant_io_ops, ) diff --git a/docs/source/_static/img/nxp/nxp-mcuxpresso-etdump.png b/docs/source/_static/img/nxp/nxp-mcuxpresso-etdump.png new file mode 100644 index 0000000000000000000000000000000000000000..50ed49f57ec93f279668df064157b6063ec01627 GIT binary patch literal 44262 zcmagF2Ut^0*EWg;3#bT46BVUOGZg71qM&qCdRKZ2ktQvGqDTkngaAIxg07JvF|y>()2#(%w@{oRah=M$g)z z)wvG%(E%Aa`u)vkNqUFCTPwyx|J{B*)QU$+L=PTD0lx$M{w-tra`y#>|3CZZn0isi z{@wq=^j(JpVlz@y`u^s;lTluNfjs$qK*J9+*KyI780^f7utHX>ru2?j2KP^*@huDmg_S++f7*6Q1-l z4<_HU_wWW`P>sDdHU=g+No)QT7MJY&F1PS!7xm<<>jgEpR(#qEkMSj`f%4Ad7YNe# zDy)|cC~hqk4pKw?{fWOzt7K+*OZPTe?c=L00y6KzF8cnhM)~&Qlsq_~VKpA9kzurv z4svw@+;LhIOOWSh8#YofC0V=Qt+wVT=+vszHfsBvAAakCne^Hc6^U=L$nYYeybIZC zB9aFlQ%X-JnmW3|Y0agNqZAD+l~%2<*EgFT^3~m>Bw*b}^OamzT`v^S(13k?=O--P zU}DYK;gMR}6N2R8=##@9sY-sKmf)p{k{vxzwC`i1g7DaNNs)Im`(h7eL5axmpuol2 zZDdL4Rgm9ahveA^9*}I1y1{|+g2a9KYkc1#O?7^2ZJ#Hc|NBm{a&4g`qt})J*{I~y zUx+@E@RWAJMK18l2Z>?jy%(Kb0VZ-_!R-3Z4}G2JjI)AnR$rRu-SOJAXV!#wUB{D| z<1<1zUlQUq*gfK(ark{g`XMqujkKdDKZlL9UN9e zydnsRP6=UAeAr8FSAkMk>6W66$ZxNSVWFj){*F`OWww;kT0$bDJxZ8GdWdl~`t4b@ zqkbYdsSMtaWwMxwI@YaIl)uCA#XabnJXoq#R3{hu%`O^q8EqST^Zg~3tPnS&_?xCW zy<7&>7aQWR&(W}|WuL`B9J+9LfQAu@(@A4b?IHq_I?g*0jD}qI$1R#tF%g_`4fXy8 zA}_PVCkUX#VCic`u=|JZa+13#==qT=tB_@vim7*xxT8kxfy$@PtYf{H>6!cSh-z1ilzjh=%=V zWL+8*R&mFq%fBfihf)j*kgdO#|wuCm79nFa^pO zq1SeR9)n-qwpB4dCyNnCCojBVlG7@S zGIZa8{`&4lCx7G)~Rdx69JQ0%9)7=4h3U@8GUh%0smMU{TSE z*Rai|a`H-TP&zsOzQ>E9#Um(E=dP2Ai(kF%tGhlp7bQbA=$-G<_fdmX>9IsO%fsUr zJE;Hmoy_XOP+P-2k(92Y2`3NDM^h3Ri%X*33wxAEmSce_RMTVA^|rF%ciHOy8U9SM 
zs{fn&W7y8hZr)W)QIe-=383(=TYMt^>i^wY{trI%e=oLo!Eb+e2{-|oygpvsH>DDQ zmr*h>GIs-|+ZJO6SKn(iWc+Xu7_0>`k0^h6{Bf+ZOv&WJ(0aOwsXab(stY_VEiBjZ z4bQKc>~{b_U0?V>*?50hVIdaxgU$W4Ym=&x^pDh5BP9>z-yIeLj+1kl0ZtEr*V}(Y zVpDpQeb6EJc{_tqTLRJ#djbJRG0DEav8+(W6u!M*(S) zc)ivlveRpNUJeFgmPA)0wnICWPQBRuNW|cLGMI7Kgjz`?Or54#TnN>8zBZQm3L1Ck z`#ke5!(FoTCQ9R{`GKs>$JoKq7;x2;KApg&MT%3hRkRl%Ex+?tc<`WXN=~|T`8I~< zruB~sfd$g8%{%jUwDPpeqxY#+-sjYMSRZ(IqS+TioEqz%P-Pv>_#8V@o;h*#^i&Lb zQR-gUcI^Ps{PN!dl=QbK;trMAIf!AU!-G+&`=pB4!Ih};`Y7&a6=J`4+b@BSvnr>o zwD5KLHJ*mlbrIoM63$_n|aFm!0=c0(j{6^?R_`QtGr>K$xhC=3=8vYe1DwLOc` z%E)DHkq;>4KHwo+Uyv}&=H^0a__zzqYG+WZJS zn3(LaUD?a`9d81foLlJmsIP-kC{QlCTV5jX(9&_<@}Yy@kv+A0y@}^NGyzH2T?iX_ z)AIor-~)-6-Sr$7CVmZgN*>IdYSH^eDO)*%Ogv)Fb1Y0yN^#F54(+l|a)GbC4{@%1 zHw~5x-V$$n#{E>JM?s{zwCskg3K5=1@YCl`#2VFoBYpZ&GfZn%0e1t}ipL?8?zj-2 zW~cQ1r@+0v-mUZU0T8<4$T{cab)A-2$0w%+t%NJG)S=k7W7aJSUBMX+F=N8FnkJC4+tR0X`r3h4dU}lT#0wSDoXq z?3|V!O_Yemjy0ZPr@Abq$Q1CHu5kNRV@P@1wVm(Z^>4u*&z)cgSH+u4EPE_t{~YL| zB+Y%)*Ch;;?mHi7T)lSf!&SAV+x>@qEmvpSTl5dA&30-q83}0v^G8*3x0B=aIN>x` zM~;k?mx&^gwnImj8?+s0d;_KC3O{k(WSX8ru3+Ia@^*_R$I)k?{dgXZxmFp?!dF?a zZBdHIf<(KK9%N}xiYOO9P*-;qYyNrr?vD<3#j3$ZRZ`Svm`eKgfMIz15ia;)L>4#r zv|>%Et9o`*=@F)N0fLG6Gg%m3h%1wtw6U?x*msf!%{fdEe+A%L4$(^skVp~G^zH7Q znD|!%imMUrdo<hlL4Tup3~ z5DbeKTHD5RdzA^f&_YycJ@VyFqv$ieVA&)=mDs%otv7_NnnCp{dWw(8$0(&91J-YiNPU-!#$ic15Z``c6 z)r>X7S`*GFi`*>GtfvHu;nO>2vOHv&kFCZjSsd$AVYeK)B~Ge;bgfpE`@I5&Nbd4U zjOqJRi}zQtH^IfdFpVC#+L19uMf01ayD(Qxa^xkSYMS{;+f%%}s2sFkrv1H1__;O8 zmM$f88A_8(;J-HkRf>-%mnnApt&yF{`|@dOni0l5Sy$nM#hd<+w4%@l-Rtq@?D9jj zm;(E#+iInX+h`7*0MH}W~I_{_WRMxkxD%Z&+SEiOqvWqsm);m1SH`nE=6wQb5F z?x*7YFUGaCyjlAb4oqtw3Jiau%gv?owil0fizK5`YLkHRoC{hugEGZT zFd!_tl8oZd2UMG(#NQWVd!|20=M6bMdN#1oaA+BvwVS!}qPp}@a(KzXR<>Yj^dwZJ zeSA$m{7?{I)E*$`c<-L`?X1e8XiUD#D#?>!$Gd2EODHlQlb7Tu=ZFZcw(^h3)qx~5 zD~K6%!Kwzx4=#s=zsKE=haDc>)ga2VQ9Am0RIMqw4z}Qf`&TC0Ie4TrM`Rm9;$2E- zw)xE9Hnr^+ld%-Q0@p#X6n-*Fs)2hC`){OpohUnnu_y~^2--dHZ$H9rgdzt?V-;n6 
zgn@OzKhPe`UI?UOls2zCh&eyppOxO?a}TA1BfCswD|vv;PGT>tD)6myL^5Aq8$tX?GVgi!SrUSy249+q9SnV8P$q5#9CN%-y z;Pwe7$0(=nIDUUh0;RZ#9V!{V!J_(D4f7fn<2CHsk3o@kn~Gl!vYMr;$K)S+9RoNy z_i~Pg;Jy032)0qg`l~*n z>NHMtF#Eyoz3VfI)`~|CamXTfd_83sC+f;iD5qR)p`7Iue75cR=)2R)X0Kx7ef^VA zSGQTOg*i^;WBPaMqRRZAFs7A4Y&intZ*I!^>4ZGYT(qvVFw}HNA`TfkqgeL(3afuV9@F((IZ8cCT;W_MKqHF>@t z=DR%_31YR6)xbDGOP3evmbUI$u=1^rQZegOQKGNN&zj0rm?^I${Ubh4$X# zR=l4mYMPZ%><~oM7RLn@ae{B|n*wKc@sq3SC5CXteT_ zS_T!+pEMVr8j}@wi+}Edl4j`rTac&m`vJ#%TXs6%`fafZdii!+;I0V z7|^Yjw|BsJCBWekKQ(gg=i66gu_Jk=@sLP~w#^SFX*tPKfXj0>(}nvu9|r2@a>4(F zdyMS~(;d=3L! zb zuV<}60^_z^SCeR)NVfgZ3W_*!eM09bIaxC~mAnr6=62Mr(tho9$ok3s9KYb-NZhtu zdrmK{J8tbMa3fT+yTt=@+rvehUP$Pt;E$R&4V_=#e`7g9YhRm7P#(4FYF}B`tS+v* zx)LC6_X--+h#8`nwy6L&)#;c@TjPrE7P#%r)7?<55E)%XM}u3y2={O#P;ES51!GV7r$> zPgF%;4~oT9U{sMXu~hf)g~IH1G%YS(i5>PUbX$mU=d92zmYCQxs+PCobr+lB%J1^V z4rD2$3_yH&JaJ7S87eM7>J2Z~_SG$A=gic)P7hZ0EV(E-ko(Z+K|x=18Gna+%Cmjr zD1qE{4fc=vwG?F?6exFuXs_QWYBB5)5M_oks`UCE7+bRnw`xkr+y89WJb-1XEC(zK zbod%omYNSnJfbMpgwBGdFw|i#%a}P`hPbKR= zFH^j4Y9ZC+_DMGH=$Z}4Cx1Kqgy*ofv)S(Vg^g5m;-)&1M@zAj49eDb?+kSObyrma z?;n|1RZyA30X8L?<;dzLN%na3L~1BroqHy#sdT~v+j7WO^@F&~EWez+sHp_q&7tTg zVfpLO)DGXQ_WL6{{dGn!u!mJyhxG=KMHH{$-Qt34HIt1GMcBxXn6h z%ww3O3&Ga$nqewGEDYph9DmZjkjC`fWa_eDzpZgcOjxX!#;pbCRncj0^3dbF2(>#f zOMtt8x99#W*1R3bU_Y<@!+zTQ@9d}T8>@YeBENID(=f1%-Hnr`202~Y7K~mox=k;L zM{WYDA(U*c_u29xk3Ml9_y``Kk;)d!YqmqDNTz7#hB_rL=YJ(jA=;_}0LU!nPdfb? 
zbzN|I!DI^I%x^`iC}qa`qY)waz&yc(G{|R~l+wj7R5#mu*pJc&5_(v!us_q6!O{3s zCMwQl6zqhr$o5b;Qf}CN(bW*SgQ@Epf~#U>+moF$+)+}3=hlPx{R-rF?S6MSQ@Sq= z!p{`T3sI(gQT{K1J+CdEZYSfc*q^5+73}-f2)oTm^V>Ljz`I9;AA61nD% z7?pCkRv+Y-OFy^_D>{_&(UxhnQGF4A{~M3DAnCGEiV1wQC=oa1Oi`FNL?>q|m0qpu zTBs1@LwpL656lU5rhIkshbI|4bZShC8`YQZ$>s?Yt%6 z8N;C(n4D8u%(NFbk+}bu(maJR8PP}2w4N;#m;l0UhgPj}V~FE-7+Hz(;RsT4%tY}S z`G8yl2hnP;eWrL<&l^QzTo0C@$i=_~ks_%YOo<`NdU!mq1_PtHQnF3+5~bS5I%xdV z*XPs{UE9%ivJ0*xTEn2v*(@}>vyOI>Ztbwfh`~=@9jj(Wc+k*pzpP!{^>#P(>^|qOOR$cUVw%Y1a)QxP z*N~+Dbn|>u%`_dAF7zqW<7~sV{(Q_8KoWmN{$Idj=w@KW=wGvni`{18?aUX!yHxlf zr0-x|7j5YNEwDfPtZ-(Oz4oT_lgUy}V*dhw~U7WQ7*Wsz&awxyhKjfHNq8~Vp&i|@*m%Gavp8>>#_6Hcq)g7{Kb98uXs`N_oi*|m}g$d6N z%Q~I&S?|-`yC7*izA!V{r13UThA$-D6N9!C#fM;)S}NNsas8ZM9F6NZGQr@?pf^~AQj*)fnfR8CIdOi-@i|KWrE;@W2xyjTi8~_~G;7eW#Z3v)H+K-#6!&%48|s0(KFsOrsX2{7 zn88NtMCBVO3o#-ss|EzqfTuVcPQMR{oUL47!4!Xa@Rbf8XPHR8kB!N$?fnl4 z(#?_EyOLiM%D-3kJ_f9u8JLRwJifW~3_1gRdaA;fA}V+@JG23CNzX}MR+vJu69t&1 z)ipdb#nvpGX_ndNhQW)FTCGz`N^WCsY8o_#mVw=kVT-!S-8)UNg1o20oS`>*Gc)1| zXY>3Vbmhz{9#s`;5=E`+!rDlg)Z_Rm#lD1>mp{09!!na6vcCT~?l~QIYgAFz@wK>? 
z2Qkn=IbD$0n}D4Ec>H-18HuFm9Xxg;GytR@5wIzPmlJPW8j2h?9%h`BlF)ZN(@9-N z{c-kK(6{NMf*oz->b$Ow^~~>0`d@p+AFD3p-t(lVC*VEfz}3~cxoq)U+O;aw9KMfN zW9N(~QDIjVqG<>*oT!dj{O^E$ZlbCi?gL8Pe@ge^T6^v53pbDDpnhnRVr#GtWX*@i zYk$Uw$S;?qHU;i9*t|Q=bm!VXKtfmQok{$m856HZW{4zv#p93q&shUae7CCvXA#(u zqWIybmtE_I)!^?`88{`y<;e6a^IkmRA5@W3^AD_(hERH*RPc2P&cCWwvY1*@!F|pe|6$KtuRX)$xsG?$O|mZ=%2zjoZbQFGejx&T_ z)CTcymA(66uxq)4--h%2<*QjLm1QcMk>{)<#XyaE`=1YET(-i!zpV%Wvl%GqN#t*3 zsv^$iE25{jszoKBl8s9ckK)t}XfW~`DqRTw;TNX1BM(D%*j7s@mF@~jl}xBOb%uBL!XwJJjJ99msH$%{Ow(EO)Qr*p&F zEB1>N>h$de&ipkDRfmym;glUu31L|a?m=Sa_TVw z)Cx_}>UL`_N=x+D@RO@7(XjkDK$p(DdDr*?Tt_T5~3VvHui8|w{CN<#<0U9arOC8_c`=fE&)pGCP z&?9L{b*VKuc{I|5xzVFYR{z_Gan|yBY#ckgi+`j~Lf6R&vj~cx)HiM}Mh-+fvsF>_ zF4|07jC-y*!9;fUU0O5-e)ScOei0dkx9}Ov#F(g5LC$y}sr$$6$Jym@rEJ2D|c;+q}LoALDeRA0@ioL%y)eTbn22UBw?ahpg$5l658@ z8lN%%+0X0nSL9|_`i5$iA7O7VYjx|Pudpg7a*No_T}uQ!n9*$!Lo}o6|A_)Kx`bJW zZr|bdFYEXx1l*4vEb)NMzSEhCT`cu%OSU-MIvunbr>>7Ei4iK40deVgBcqXA^=Psu4MBg$$G zV=j^Eb)rr~Q26uQwAIuDcWflT4|5U0sGD3SmE!4=nYTa9-llnxA@*S+!T!n$M*fu* zB*^*IuljS~L}NmEz!HgB4O@T*4;K}OIzEY!63VX2lU)$6yLiBi47;ykqBwp&A6PS_ zzHzsf#g*W51C7fT5BL-k=X-ez-d8D#ua`CdpflKg(@~&(Xjmu0I+WS}WY(FNRwQo^ z52-;6rC`~$(94d5uA3xONog89qO(9*x$hA7^XonZs8GL2eEbln11XCvKU?KRNG3lU zJ2R#>qgqy2=IjR?i=q8>V*S7Wn|${FhkPcV5e>KGVpw=2=bvy(+n}OrcGH$r1w`W@ zbuMUItx)zV(oIJo_i;$_{m@7F%7%-6h29zdA}Rj;l4)Z2%s-w&iN);#=P#4O%N*sv z*Ej@N(-@&&Lhduu-t4}idLLOv1i*}Kh!0^!8p+MZDINRAiBgSXp-Khf0dq2Znp9DM z%vnd>`mbd_+y1;hT!qg3WZ0Kz6^8j7!g<4@79?k)+-J+}YU^@MFlEWO+z4URzip5x z*>5wdnlaHHV!)gm_pR~_h%!0vxYJw z&F4RN4)XZO_Z_@GUoko;59V-m76gewn^y+(YC#N1!X5hPf04Ouf5}`Z{J4y*$mm8n zpL#O7zVRn>ZA98DIR~2P^bIL_lKSY63sXGVaqj|<@}AV?#>^K6{nol=?nJ{fm#!rT z6;>h3{ZH;V@Rv@a$b%?mbH@V5Hh%Jv(>g{Y)JkPq#Z74)Pv^ztgv3 zpE8gGeD-q?OvejB*jfObo(atj3l@+Vw=T=3jXyCWQ#r}c+J+lfODuZN>aR@IBZo_f zYnUH~~jsP(d5U#yXUJ^hcaeVkL|x$%D3k<7m}D?Z=K##2dqUr;S71fmfjT zjimfp#7$RXP_;L6{@R?cr(yE4%87^31KoVH+r5$gUAB)Ph=SD#o23Vlci%B8RtF0H ztfz8z^L{uDhPh~eF9q1gqx$E3O3*yyh~?xY3K4|6xXdqUVCQI08r2 
ztdPc=WO&b=diC{*jMc<8Obq^EPIK${xp7N`upIaz_&>*L9WJB!D+Q$cP#RQ>d?2-k zu4@8341E7AKxfqHIOU(q8FzsJUWL5c`XB;Do;PJ1t-!XY_%?f2n|;|j@!xmam5Wod z0_JgmJ3^PN_qK#OKGJvg+YRZU!wJvlSn%%;#JP8LN{Y$ISvXCw?e{C-_DMS+)Bjn8 zIAxo@*CUzPcChLFyMVvr0X>U(2CxSGPz9Va_~_I6CM9HNs_@0=%Ks3n#C?q{hR8YO zokrV&R~s$Q4S#>@O)ofFt27AO4bAjW=rje2b!mi^Ee!7wg1ld$uKU;EI&)_L06tOG#f^~Nkp^2&=~Ts6i~eD4^Z8>gdU|@ilVB<_Nn65DeLOXx+qo z^pK7c3k#E0ZIToy&XFFS_>;`<`%a)K)G-J&LwCGIKALM!MC9)Z_Xzr7%9XF2yt1M3-v~}$?5X(1I*#pVNVw?q(&8pV#u)A`5Y=^%pi~$0 z+gWp!SObaEocncG=cFAe(t4zub<^@fcF4Z=*DbDN$n6Aw%Bc!3NYs^~ou0dZdTJ8S z93Wz%sx0@CR6#9YR?Zj2KijA6M%!A%39|;?!ny=MPvC8!*Gq04{~MUPSD(YtxN+uB*o6Zc)a=MU5(=z8Jp+h9eIy|-9K=~{9F?OJ{0 zB)!BJA{VJ(43awn8MnBcGf#5Q+qHyB_nbyLE+e91=h}bJ_DWV&#^u3A++GufRr0am zu-HY*C2hjawBs#e*)6=VbIIOMihV!PlnUCbJ*>c3KD0crOsMJguK`PpGZ0s6q^0T( zLjj@PJ(xHfyr^tm5UgIZHe5~>S_RT?6&(9 z=W24!`OP>IcC*eUBs~>y6hlHL#dB6T6M4qyS$jv{kKjd|ZTt6jmIpgLVR3vdzzY)- zMoske7GU|UdL27xB^EaHM;<)Vk;%2~=*G|*adzlsV-p?srVVf_GqnsOiyX;+AOCd( zbVuhWuWNIw%DJnW@}v+gE<4L+Hfm6BXp|NkYqg*;lw@WKD2QMh;eV*)e`tUWduTtG z69;eOBE`k@Id@M9f}CY5h>zzB5X;{U`}AiUaPo957_A5C!)>oHE~k!sWAq+Z|u=)P89`)*d85aJ1nUuss(y>VCaw*EUd@ma1?w&kY+d~ojz4o!rmVpFZVUw;txSL;OK;XA&bpl&gw_{|)UUn_RyFfALp`-ocOOpDGF;0N>wT1#j z{$aiB0u&wgeA*IY2B`=A*n8|0zYY{?*EVZQs=<;CuLecr)0%!0nhjAC`jxSBMnJh$ z>OmrtX;1eC8YhPTMkOXFcmk-zruDn1rpdVkvBfLwK59C>6su&me~D92a|l>Z?J=0 zOq6kN&S9dufhZPbpT;80?i%V^ofWSnIb99?m(V6@rdsxe$ciU=g-a16*#VEcSzC*% zbI;t7WXuN@yc0WemIo~w6z(_y#Vl$62sEM5;(Zi*oWFyH((imrX9c?8lI=@nX9GgU z_Rx$i=P`C~PX2p7m0s*?KNc4edK17Nfq{cwTH8trh_m})Q$$)F8z4Yz0_)ORY==%m z!!fb^22!`;%g*~1Nr53g=4jhTpM@=TVD;UX{ zf$J`;aB^Ku$K(TwH!Te0Be~7-R6jnrmfOmL)VoS6`J<}01_vEtTVO0BSJ#&?weH|m z5ZCxuCqJHAiiW*oAPSMg9?qH~*c#+5o<3Fa8*oGl3?ufzOXy_rIfK_Q$vl#qSl0Cl z(Uy5;u?A8AlN>nWNhqHrsQO7ODM6|BPr_f8y>w`xbT2wPCiXAN0H;$J1Ahv4I-CEoQOM>%YOX$aaaV|^oG9IQk zL%UZO_UzZ}I_^|zp<)u(n;EUFzc<^GvD}mqI!Z8d9d=rfo1jYIo1^+RrZ8;#7sx_t zz*hkLNPA_-cGoNKL%n~&O0=>BUSb=5Mw`M(%ciuK{Zwg-nN)-VsvFBR z4H5G;TQk9C+C}z5m0Ez@PjD{P%Kl3Wc~_CiVw2mSkWbh8w-_`% 
zYP+E(LGqzz;MUo&eCQc10#r~Wy$_0kPh5j&GJ%zl~VJPq21?|#(U|biau5<rBOqJuAIrcV&FgoL=V#5=I|9er8~#_3GFN`8tTZ6DGWml(JVXl+R$CupDBym0>RS;<8p&VXd)6PC-VGr$^Ew1uX&F8 zwq@AQl%{&`qV_uz%qlOzHo(&h`%2LdAd>X3gFQZR2i^Mw9>>Mw-x^H_0k|Ak99+0G z`1i6H7GChpd6P)iG2-pme2a`^FHilRhijdqkZE^`9!M+2vc;tj6>yuDKtc0_!Fsm&>u7au~XXXjeV-s zB`XQM@@>jk)M7SjqyxG$$!ZV?-;24nArWDYMLk>Ke4wuDJ@<7dvuwvH7+1Un-|yt$ z_7BAgW`I7Z1C7I>aHakOVa+6gIHk!OPWq_BLE8CEc#7RUaNB(M<6olm_0tR>)WW@A z#xz-Eyup5KT<$g9)$;;W8m?EyaV67!6$EO_G%tqST^|wS&LId z?|v$OmO%GUnBv_#qe$6AknPhtA+6>u^yAPZyKmK!4n zyF8e9#g_iYsOoRqPkt>5QW3GDFn*73r|LwFhK8WjH!ap)<#OH`Go9}%QLk@bU<&)v zcyw66GZDWATEc=C=w!b)KdPXwbV&D8tO;%Y9V(Ar{E)uP!*uusDx?kgwLU?`!Z>49 z<+T9|DvzRWB?q_bCQ@Dnt%T?PPKeN+ZEb4*0Wb04e;&xaoMTCayKBUKbpLbE;RC%k z3iKi9__D1VD7ZsFfORJdX!mPa%IJ)k*Aa4O+hNhR{obbKz~I)D?&gZ%VT<2rpZ5CC zb#en1%GpL>8dgjFMc3e@8_xuv&7saM7i@dufTNx>>U4nPS85c;aOAZPyTqvGNlNff zj<79RqgI&76&57%?sq>D2+Bzxw!KiP*$rh7fM#M!arZK zL4Aojy}xT7D|<6!jnUbyfI|^nYs6*h1$NoGTuNbLya(HT}Y+~AHUx!u5j zVRA;pIDGD-LY?A(iB=8tBJr)tkOru#j&HUaH zO5FQFk~qc9seTn4hzpo9NLrVnX1iP#k#t}Bw7oXxUkygc9~W551ve6Z?lllz5kp|R z6CtpxnVRP6cI24!4kK(P!VbF)9EegujpdsjF|QE~xlb!<@(pN|Rz40V={v}X23*RV z_2?H~{u=dTkus?|M63Qu=@Ykh=|KMw=ZE9)P4dddII}HmP|+bPB*NuD@#wrAU}a+O z;Ze)#DaDp+<$2H!mjma;1M{L!ZAsX^Q&za^#DJv`O28B!M2avtR8mDh;eKm=(rwX2 zeIzoQN4Xpb74VG`VP#Q?VxBM|K?hcv%A;}aa&*b2sD*Fc;Da90`oPeELCcJDx^*T+ z520Ug_KVip1u;YU&4E(~Ybch?B6v)G1#hU260ht>f-_w)(XGK7Je%3g?lzNImuf8c zF+E7_+oo&Xk;rlh$+5Zx?GeL*{c7!BV04cDh034M3s95Q-?a^`#0G@3yQEjuTT38o zp1r@qND${``^~LQPEGfPiBr0rv!iNtN${?GJ0QET14Q&Qshj7YDiHFrR}+xJrO8*6 zuvna{W;4COLW**wV9?o+6eSI3(I#0=#tz;xXt9I4x3neE0aJ55)f0X#5ixhKOW=jQ zyPO13n&XbcZBr4(zaO##)62&Zc=yQmmaTS&z`T@jlblq16z-(=Q|E0T%9IkrLt@0 zyAT`^2cg+O94|WNOij4G;|$4A^CNdn=xU3)C$>&EG`f)-P-x-~+0nJH&ftTacFY=@ zdPd+n&b+FP>i(=Lhq!al(&GhjGoH^slQ^?maVKkAhnUfl?d|mF_?z_ypJ_bD{7UU4 zafLk4K1hILeQpjYYKQm6?E$ze#{KeFF~U4(yOnQx$jlF3-<&kqku=BY{gqbryFO@* z%Tb+FM2Xiu!z1&E;`UdhMLm=(REj}k$(71Y6p_Pj0#PwNboqJ3vNh^Xl3Q)FNs|Z9 
zQ&GQGDmHrg8q!fSiW_b~nuxn4`P=2lvAa&woxA+;t(^q5_S`rq7K1YgJ!0eiDw_DK z;YyUC{*XjsDo&R)m{KO>-)(=^cR#1Tm*>}wPI%4N2p_p?>~1^F0YXkOHL1yC3DqZN zYA)B%2z7BG4GVHd!ne2$*5{Qah@XV2Upn0 zHVeDYis|c_p(ObF_dbA)5wQazN}f`ULq7klN$nQ&dHgy@1ydq5YGHBhZTyKByU{$- zd|$ss-RbMdx7EIMDM86L2RjfD6pg#@I@DZf(CvQA95>LSPT^)O!;#)zb;G-;Lnk{G zj}=-;mMwpc=t#zucepgq8iX9pHzZphap=oQycB`f)o;K2e$j=pepZz~;31ads-y8N zF9S+en_PR9vzxvw)Y)~(z}icM6~swaQngno$KKlkB6r(2s{Gazg{aW-xJ4MPo=Y^Po-1xIwLQFe*Lht=SW6Oeu_k z$;pW+u(HKs7BJRn^a5kgpf-eaC^G1x04wwlHbtG1eZ4c8`Nu*v zlRVjpNS70!l)3u*2fq6;)ykMnDSGM*x1oB|KS&O*l6|K|lOwztGy;;G7^&jSCciHP5X03lz2=?*SK$<)x=awP@yK-m*4Tta ziA-27`CAu;EO#OL)G4AAhDJ(GYwvLYsDF&tFn?(>$)ro#wO*rPNgjO4@`VVMU#fcO zHlKu&kch4=Bfw73s*Z;WP~3ZNxu$yt!b45}1m&)SGC~PPg{I*!jHU71#+6y8CpM`QxgW;*AH+QuExuXw%W=rFQ-md&%dA$`R zLJB{x(uAaL^-(1|_2W6@#pSw#|0p!w{l?{5@A^nC6kN53fobj4){I(igFZm@`F*mIkpS-1n4sy*pQ6Q%*nr6IHRS3S zQuC0VKNXs%)1D{i+N+9c>-uMGR()#Hb0GDV@6*TR{MsI=cDH@gVm{cs5H6vAJbofj zSSYqnG*4y%ahauYE58(JWov- z7oVkr+NEYhxSs~HN4N~?yhVRZ`MP(F88hEfKP+_w3vbbV<_$zc_$oxo`5Lkl6UG!L#uLaQCR+ zb@t*Ly-fJVDrpmGFDwVl0aFU+BvR`Hg3~#?U)~XLqxkejxuIxn4ZMaJL71!6<{Wy6i0NWL_df8y9>mAoy(Q_4%Zvl9gK&{vUX6nD{B(h|)|tl-@voVWgF~S&co09)pv9&3TY7SY`P-P>A{g}(3q%}1D{Q9cm>yR zMBC75qW0#oMtp+?QBT8GaIb~CL04p)Do!Br4b!7X=F!c&iQfD8l9a2w;l~gKjg*Uj z>jnHi0saBF^!_;sUfuWq`{x_~7ut!{{N__FYLS}3^E6w4>(%BDk9_?kU?&^$PqV-N zgW$URUkENM!0~9sSV^4sawJ@^q2ju>+ur%TK-Xgxp-$QL#5+g)Gx(| z`hyv&9m7#>9+G{XfNsTdE~^{m&~U#t_tHltoB4zPbk#p2ILUQX=ZzVU?fQ!(EBZvF z3dM5#+C;&xCaK^$A6be2fbLoruF{qDs%2}Y{ucvLbQI@0H7g+{kd>C><`eY6mH0P~ z>38+N@g{#GPG`V*fLuiqg`oJ=@R;(Da34@JfqgABRhl~2lqxWr^3V@P{R_!tUjDg$qLN{K#76RvkRAu{_x!8Q^i%(Sa!ToZFaRrj zUX#al!Ktg@VkJWiMiBp3sYluizjZwFAf%^X{ziHH{&_y;Z@mt2zJHb3r~2@23*jQb z;raU`c;$EvER+-_1^9se1?4!m{N`$;%D=xHKP<2ha3B445&eDn)BaBqAfhmG&&Rn^ z@xKb4wHX8;VVD4_dBbW0HE-@ewH;T!j1g}oe0h0fu^r4-fU~F&V_EZ4|Au|M{BpQ8 zH4E0rVegt1_V9ZD2kde6oM*pRM-*1h2IKTfphe)H&H?@L|KI(&>WO;wU){Tk`*iTX zD*v2X0IQaC{APUU^M7vP{UZZwT+~m!hR6Tz<+#c3yZUzlplRuI|JCBl6VGnq{?o+O 
z@qOuM{ta&VT>%v;5G}3#Kel+ek@mlf|6l7mXK#7nf6V*GBfwd>iN%Ni-JJ4gNhBJ) zF>HF(vhTQu&bd;~angz3H}^^65#Wup7Z{DaYRNhOPge~-13twj}I%I*K{51jmT=hYWx9}v&&fg=8RDdfAkA=9?dw%9kgJRHuLrh*EE z#*`Kt%F_Ck#=>%%X+5Apd>HFq0|`=|2MXKqV1SawhNgE<>aYOAT7rRlEFF8TBzk-l zX>Nqm+B-d6PhRmSj6n<=o7eIEWfp9`N%#mS!JEV6yh9S$%4Wh|N2v8tB=&Y2bZ~;7 zC6n#*b8Nwp0?`%G-shKNwc)FTn+R#59d(8o0HnKb6V@;j*MSl+83VJw=WXw+dpg&+7~BLtNB`| zwC|{}Nc-$UMz9O691LiQ`WGg9GW_Y_@_BmBS7z zClBp*oItxaULOnO{Q5Iw`{$8-ugBInt)1DV2fM2$R0gQp>bey>-~Ef57R6=?r?DK( zr6YlER-SRNjuGr&Fl5IkSBvrMTSnm3bEPWg`nYL!kC^+G1^e3cwO#8FhK0y`Ft{=U z5xQ&hAz&KPUk(V2`M0aY$6{V?5Or|bp&jN=?0tmbs~=bVC&pYB^190NWtw;3OlAPM z0P2m;z!7%bG*fV3;*FpUowtj6q&9K^g@$dXDz|8qa>+hC0_ ziRm58RzC}`dE=D!&T*qvTm)ice5ht^06f&Iv6@=}2{j!a%k?bL_T^!Du^gC0}K zZ0b)x0eKf8ycbZz%JtSId2K|v?u0fdFq@7WH5Zi_qjTHQFwrHhzy(XdTx>INu4@iF zCnuC0{=x3o2C-RB$Aw9yC$S4(6F4`Un$@oR8Y!4AB`P#7b)De4?fG<38*NFh!5kP z-oZlqBC!M!*xR;>^k}BZXnwtH15%>_OPPq@`+PryYR%Vt1T9w^q}*F1R)=VYk=F=5 z->xrrlPmextglvLSDaQuw=ElBV|%iir7#52sN)W@pQA&tjA1UfX$)rO|yJ#=^OYKQD!Bv|=$r$V8aNQjNfNXo&nYZQuJIJOuw^#6so-y62g+0+hBev)CQ>6Zf#qc$C#~- zTrv(@w#>sy771FyAjsXAGHx(@=hR}N6kF#An&~&J?KhV&c@{c0C@qsZbl2CoDU-cZ zbSh+vQ(IK}ZZ6Eop4A7Q151f)#zQBztwUDYY=(CTZ4kXKh#xnL@jRk|6H1;mb!%Nu zoYfY2Vr#So9w(xs&h-5mPnmxUE65SLU5XRnnW-Rrpvak@6woY54&LqHXLGGgBMwe) zj&KaINEUt|2P$+EOQ#UZna2KOBOKk7_DKo8KQjV7x9+H8z8*j{w-7KJ&6f#HK4M-; zeHl!GTT@v5iW=6o?P`rZmO2Q$vwk#{@6KC&TSIFcR>J({_@fQBwE0*u!L=9Gtsc`d zeyxO6&Ppp|fl9oa(Dc)&Gmz8TwZPs*P@=FRYpfG?%#pP_aB1p*?a54-#vW^kfvNNx z$*YJBD$_vQcWjW6QM91D$C5W{ly(e41-xuO)8q?a0v9HG0#UA~G;4~)=@^u

`V4 z>^9%M{y>N9Xspmo<~t;ZT+K*zPfS+hq1 zBcW)$mEcZTtSs=1zHg&^>syh1pMw}=_V{ZUD({p$H2X0UTV8I4#e62M$UsytaYKVd zz<03tz01rUg^D7LlMwwN!L35Wd9U^!d_kG1ir&AUYN#&RhLMv5`ir!ZkIv3b>LcD; z+wI5qbF?-Og?wuS74q9(mAN&)Z3-$#^J9&F^(TaM)b*gBestiHBen(doKT5;y`&R? zOk8*#amtIfS>~12Z;FJyM$i{T7JllnvxR{La`E|$w(b^ZUZ1HIgOx)62%je>4Duvs z-I{6MJma0@mwng@CxlH545<5lfz?( z_OzPH^I$xV1J%m`{vCL&`^I$vLIMYA7>eJW>vF!aP2IS;V>MU48p&erXp;SAiZnLC zx4=-X3$@qeC#r5wwen(tw{w5 zqEq5uPT(m3Rzm8TR(5YiB?GS?#*DABs)_P=@;x8gW~eL@XPH)}rmuy{Nu@W6J37Zt zj6zQ=MP7cZYe=%-<6^s3mQhVb7I*A=1MhBDE5r>b(cWL%&khbUEZ2wVXHiu?Y7bZ1 zfCP(d%pH6OW+fz$dqYD}ET=6y#`iB$IKNya?ZBuri1X;5s#%U&2?ADOzFG5G8fo6k z8Pp|P)TeA@b}W*w<7aKTzffa(mjP-+2d9PMSTJH#aN&Wh3eFK0X>CS-YmJGGh3&4V zt_`f<#|1Xtd+wiC)R0=-lt9`KZ7ylHJ77i2XF8ETGd5R)qkfu)HlGOHne=e$apyPd z#AP#b;}{mU5WdQ0beTFeaJkiAhI5FV7Dd|{A>8jcKsDDonehiVyj_ys>S%7$TQ_Y! z;hmlX!OvaNhWzM?R#_V#wb8Q?_MI>PgNG)`0YoVst?OD_2)75C1KnwDw0ggVw&J#4 zzdD0smzyat`-m~GTpN{68ecPrF>a=%1ZE0b7tP<<9rCgR+blR}xGr7D7_Vd9t;#m- zLv24L5YCm1kIJEoq}Z(UlY4RK!Bz`adM|$O2&2OwXwa%>tzS=m@@7%?mK^gt7-=&y zf!Ux>Nm4@Y+DbtmaNPnSYK3TgO5Orn4RWUn{iO3i6KA%cG;H%}NGYy~xy0}6weB@p zf!F~YLbth|yA=C91SBjW8Njkb**lUH*!FOR>Gg}d6B1S`*V;cqh0>@R#Z+cs>0Mlt zgD00z-^`HZ{0c8}Yf=X{{gf=*1PtfdwZ#zzOg0Ii2vv46PxXMX*c8>)#{mcs?3enb_Kv%7PlVL?d~>HB-F^ry2H zx8SIW5NSGd3CWInLT>$J)H12W@ZgS0BwL0~Q*>**&(9o*(`RQI7P-&I>jFtrb za>tH%J}rF<+r3HWTs{jm5%x&4{1j=#pKpD`-A4@Ao8lF@p9aUQcaz5La7%qr=_X!- z+QVPftlX%ogXGTzBJ~5)s#(o;2F#e+2dUS3qa{fzj6Pfapw%W&rPmNR*A6^G$>#D} z(NnQF4PD*%G$eQC@bhfT2|96$^RqT}okSgFDSa<1cys!X3BR^x zZMZ89a+m9B8awNFHB!J>wF1)=DLS3T{3Fv*@cw*V<4*Nvnn&~f4di4&3*iuCdp|sE zC}<&{f{fArMJsV%&gb0XY(EDLG@O83(*In>xtnC)wo#3=Ivb-R(_iy}RG0Fmzr7)r zVjY!G=3ixiJJGrDvv&~cw^~t{WZ@`msY~hH1`3Q zGT`MTC+^-5DVl4vEIo0Gu6YhD>aaY%tQqyel+4W?Bp4G^R4zjt3?!j<{SGB4aqEPk z-@hn^)0EYAbk(AaHtmg<80a>&=zRK!ZTO>vxD=<#eyZuRw@ip}Ok_x^{ zf+#V}4P4yay?c?gI^B!3x)Fy5OSC#@R8Or~UGg()WE``Uk{+I>mRgnMtUYX5zpSLl z3CkiIw4mbkmhQ5+wa|n12eWLC&?&Jq6|vz2^*n_dQQD@3)SgTnZR=qR!4=dfy_3`I 
z+|$1JYe@SJ2sw6+#me+^V|a0-Sl>d8DQk;k>Sv$?W3KSDqj~L%_xX&v?%1P_Dsw;R zvd3m1zQC6+pN8Ih2LJ5caowTEb3j(6D}nrwsmvt-DdunkKD@p*QYf%I_tO+%O%5qE5aeBAa&2lG$0 zVpA<2OSxM|5liYJ?-thDKX~Pvnr9i>AXqmx`*^U1uckay28~6o&XoLsZNZvhHqFbH zd@xZ5%gfE)YT02u6=sL;y&CJVVYo_&LZn1ZVzR{#8<*+}d6R39_?=}8K46II)>;*C z-M#H>TlFoHk4}o_DX9Zb(kxt<4NUE!vBy(s?B&gXm>R2I_%we8IG%Mio~0dc#~dJe z@nhjaUJvQq!x1Nv&CWnoyhHSJVM*(jjV)QaWsTd%6GmPrcl$cGpzP05kyejlWSs1Kj@VW1dlQ!v8NF02G_S1c%r6%<)eq-Tbxl~ z?}F>Y>=Ki`wSAlt%Md)$lWTyDN{LRYKbB-DD4L zAu@>g$G4)e?K#_GAZ_-I1kKI0>p{P^7>#;4c{9CQH}Rni{f!D{QBLr&rdL!~PM-W3 zC?I8O2*b&auZL&57tOb&3iw#e8XQZJ`&h;vvupQlan8_hTfq>#9R=pe5eql@+=EAR zU`b@pthc&D8JLGRc3xch^)2sb-2s~Ua{D3#5T-hdH!|BDgWBIu(A>GM)p=kQ*kJy} zNtl3&aCqQ55P3&VrJ4g*i_M4+)fe^HG0AZ3HkRqtaz=q~SOv41d4yHzstCnO#z?J= z^FWt1#13LBw`UK@r4><3WX?cA8f+PVMWnGVu8I9BXvl}NeagQDg3q+4Q4=ugsTGWr_ zd3=uNadCwPFsE#!>a{&?wGACDf69Y`(AsW^gXqANT8moX<94%)RN1u(D{L2a*v>pi zpG>cD=uQWG&EXR6f%judEO%`zajm=EZTW4m@!iD>d=M;OTc*o)>4?-?7sHu+Zz;1o z^XP3N~VlzWTVWIROL=q*9TaMV#Up-NAa~$NG)WxCn{_aY=2WkoiYei%vt1tyi0q^ zTK<|u_E&s_$Y>dhf(;@33**|`fn67W+AhGvi*V>sYBV#!NUai^G`OFN4lZqkw5^Au zEH7#+wilQ1uSJF~YJdS}&*)Nkgs^AfQIrYmQjWAfk0iY~|UYK2OP zQa4c9X(_PLdYA#V>fJ+v)5qMHz*Vl=eKZ#=-HHmUJnU6cC<|*5zyYPQ>=zTF{x8e0V0evtV&f7GDDc2dnbN{!6ur2t@`4OiQHyNHcP4m0dkIR+iV z?$qF)JXQo5Z!Ul#xVhcCWvqJ5Idt#+76LgGZ*U5}=*DE7 zGJbyqq!CTo*>8-qqs37dK5K31vcyNJ_1BSq5fpd+7)A5&U7E*jI8uw?hv;W`p$;p3 z=vw@&?oU1Kfb-ct#JQTEQy|{SMKmtf-shGqy{)aXxGYL@Q?3JIV&g8BAR*uH-=s(@ zjOC|a(WpXjd39{Gl8m%8B%QID^xo=6f-`O1L3QJ$Pm@XsL&o52?x&AgzFpCWc&{bM zoV~Y`XAmEw6AMw9N#Is$i^xht^Do@N+3k!Vb7y{St>8;Fqwr~cgE0^~qG6Y)H=^9h zfo<`8f!Kd(O#4BXKg{>2cg&dru3YIHO@er?ygR;lz9UFt@d)5F&Ji^L}Xys-mBoN?^kHDWEIxLBw;pin7jOA&>({{3CD#mCsa9R$WFO=N1k7S`46e8(+= z-}j(%@YuWW_@>d;vC2jO5|nH#MOyg@64C3MLWn+~UT3m;@kJ3qVc&V6Hm-=fjcaw+ zfXVJqde@-ZPw=bi0N{NMqPwmL3}FX4D@)vJ*u0LA!c0eB&n|PN^1L;+lksqZa+0$_ zTZ>O|TEf%PJBEw&V%08Qs12;Nyh1$2{D%xTv>as9hb-3PO1c5>wMi( zDle3oB~XO84?6qTh;A$d`#m8HV{7w)n- zs}NtqJ)yI-&5LPp2L~BD*n~3k7^i+Kl}Eb=t~3AXn^%&_So8PUkzSa8Tg)L;<_ti2 
zzSEr%{ALILnqQv?AIycBHbZzk$qc?n&Ad_Aiu;Hd`v4NZ`&?Ao`P~Dok(un9pkT7V zf<#eRY~V}~*Yw6OklU`%*nr&D-m^d=OYS3bQIk*Zx!Y;lJgb-B;r6xbX;h>y_WqME zg}OZfm9^P^3tb#< zRP_^r^xTf}RuJ(*$2G@~VZtpaKF})eFg%(!QU$dL%$2VFo|;T&$*zJpy2;yh%jvSD z5Q1YVsIiBh9+DlewMx23pND>_0cq>E9~3xSB2H}fBR*iX@7|A(#KUAx#RCN)#+a5qEB&|e?MR{x3PE>;thqPdG zAEb!h%$@xWBmq+fbyp;ksXpx0*BrVS4`D}p&(LK;Sc^7`t!p^(N|n7KWi!dZpc3)W z`W8bV#X@cRv&LmW>~SoDL?WNAHK^D-to7?VdJ(LVfz>#rdFzz+$5sN%7PC__lRpHe z&yl(bnl?ow|K_s0xTqCqty>>xEl*=>@2x#82x$Of;fZ%S8yiIf5Zu}y!&%MIfgGkD zV_Iu$D0mG1qB)q#TjMjIw~Xn~#fZ|(Y~q=h&EWo{cM5s<>shU5K+A2#NmgywVvD`$72*j^bj0mu{(H3;CB1h|2>jf6`2YNwZM&rgV zg?%@OwR(zCOGVy=5CZ>>yljDgT?B$jV}mg>8}%lkbiMNY&{>Fz7$P+<$}Nc#!ahqs70J7{B z+DSb%6a#6@DbHr!9PZnjG`9vT-lCQxZfJG!R8*^)ixY)|XSqY@5m?EYR{S=>cnHbC zgG;nIl(Er8Wa;izIC->UkdafeSS6q`cnkuQg6}n)fJ+khZvhbD z))sr!*negUy;u$b=d#hR{I-hqk4XB&sSI|`^ywyM9(ptTDNhoHP-?Isg+{yyMBx@PEODz$O0^-OVeJN%%I-Ru18G@kys)SoGrC5 ze*lD!4fODM?F32OPW}MZBIXEGu#EwmQ=NZ;lnZ++h5ZTFn zVXL4;3^i-L%RcyfjLNxE3xFB*J0v}R=F|OFf#a#J|L@^rAmaQx1a5QSH<_yTaoo|D z8))f&1Q$Gi$zk?Id{M6**8-~J1E|9>@W=lNxt}{e3jU{^e}&Hf07BaTj>n%nuDoxw z8P-S+Q}R|0d>@zA?#kgUr!#3n9`zZ_-#H!-0uO?v1Q6z zCqYgF+H{o$@)6Rf1jDCp@(U`xZ3~?Z_P0?DvN3*_$Wxx%{+7-=$@son_rvXil^7hg zoe)GmP54Mz!L%>8!-H2dOCU&s#+s(#H2N{HJ6{QA!KFU@r-%o)speneArZ{x-T^>e+u zeewm9(fNy0HXs$Cb@Xc)}a+5=Z?#1IdnALNv2!M_K5rK_k z9kCA1VP|cZy1`R}W%y2s&evQ_3c1q~?6uyD^_Z@)yNpUWN9ig$`qI*>!s}9n%|IQW zV!XE}msce%=DXnhT0HaiG#L5N7m;hE6T^^0e(G?|8H)YhsR0=&fa~CBd6sS2N0qgk zpS7-#WDN(;1#o;zb@eKrkwfV6#QX0&rdo@;1!9tw#(o``X>*~-N146K5f9a+oc8C=j*R;2Uufvozs)`pt%KSQJX2(;d=@%d>$ zps$d%dY=%{wly%IM`;m*3mNGNU3#H@_N<9y-frYWAizTMYd2mOm90MlHb74^&xr{9 z9FB+aJn&~6^WRGpEe*X`<%zW8!j$=&81KDmLs6Gte1t4+{H^QK$z;w2QwcY z_+BQ4HZCZ+Mqy!mIF%Sb|6lQ+*!SStG---?f7LgzR||^UF>pEeZdcaK9Rh}iK2JhB z3e5Syhgz;~im2wyXgg^8;lU}0R^zx{{777G=7~43Bt@}qadM@Lfk|p)1CBml*Kp!1 zDOB4*@GLrb6a<%Z5^Pe8SGzK}-V1uBKOckd3rq}c-_aX*K#7=8;a1cfYvO|rVr2&@ z(z`=B?-e8+UHAWp>iQs80~?yS205y0=m5T1UQ$ysUZ^K|z+m-K(^Dl9HIu&ZLFbJc 
zrR?(4g9JW{@8KJY?QP=Jwv}sn-aia=jGe+#dTQ1(V2*?kQ8|sDnRf$?nCh9Mf=F?p z0HPQB>)l$b+wJrqKmjeuN8{%cdnetsFX?~9WE-rwqT#$kaJH8}VMnio(_V#K$yhu^ z@`|!V#;A0BDAPn*yjgs^DSJbcW}eR#n6^74Df(0*8{btSvY8?X@jKJB&{DD(BuVE_ z=um3sjMipgYU$7uindli#jUVOz2LNXb!J$@NDr=KoJYZ}k2ibF`Q9h}45#V8%}YLS zL_XH}F_RnWQdzHyfi3!@oz6~q#S|a)|%P_})GLh-Fcu4zP+fQLsdZ}*vLc9@n zi0-*7yU4wd8w2uUR^IlVb%U^LG5|(f!_u&T4 zRZ!0pC6Pe+;6%vvBeu9vb}sC^94aZ%)xT4E$f1|n+4r4x>p`{(x{qAxgn9IO119Jp zp8p=Khn2eW)2ZF#Q1Yf&OzY!#Jy$=$f!XMr6)alfX6|JJN5hJjk-{D38GgCOwU)Di z=2_`g{+fldwDW=*?0C&J&1U<@91NguT@Z(Q+*zq1uNr;-Q$|s=7XxPem(@5-%hY}$ zf+{EnXj95@JL?sfyYyi{70}FOd}iV-|$|cSZXCWH-)!t(7t74 z#Q&>ev)-vfA7G7mTb4w>6INVGIOeYeV(d{}dX5%#l2%*`Pmyg%227O|$(_ke9}%Bb zZ?V-Y$!-bECj~lEE}uoq2gbVx(>y+tchsn9dwpg)@xo`(61L+kBV2 zH5NZP2~79zHg|9ZS``QYa0?6Oh@a_ACw)vOBcR1q`*wR*lP8sudLBQZZO8FRd`|}S zUWdt*8N2d922Kt>xk*yQz-;^=zekns0JoEs0c%G1&n7p*Md6O*Z2uEIE6vaDcxau5 z-bvO+=UDo6CO=9-Tw|r6#E9DmET3O;c$ttC{$3pu!+xqTd0(UPdtjg1hSp4Zhp3X) zOf!sYZ#~!yBt1saeYj1!&6V`9Hk^C5H(LCCj_>#_ZO#ev!{slSPNwnMOvzi| zI;N`Yz8_WC6ho%3Z+oXXuHI(9xxZBGxH`0#n-!@Z-@$r$`&E*5->o(^v{L*_-B;UN zRDD5f+YgQsy-u3GrCo!{fPY{+J-qip*a`2oVi@_*vop@Nl87_Uov1WYKkw~Mozn~9 z9wFKrCGlpZqLKb0QL2ABO#2?#<@;h%+qBeW-G3(|-l*N|hbbRXFNYG@mfuZxH<@Sp zCj@AuoFFT5+lby}Q6@c6H*hH~GP<%0%ALTP=%7dhWTX4B>hlT$3THUW7&+}U0IQ$D zcg+{%{3>aqe&e8eW`)hV>@;^R8Ln6^$^4Vrt@|TMfwJ$q>HY}@ zc1kYNAQ>`Skz~Y$&F>}fW(au5dS+n|^K!B_cRWfTC9&(5ZEf0KJZgysZDbKHP|mAJ zC&Z|LPI9|7!g4+Kgi?;ms=4eBX>oN+i*%#)i7IkGT7sQ`UmL@__mM6~fr=}-MlHV- z>ZEF#O8Nj~wiO_)oB=RXD@!at)beh3mH~}ti(MQtv_?$V^xjW^# z737X4Xm6!rT2WN+*;kbq6Qso0*o)4NuhZN-ri#r^0MF#L_N~?aP<$f80#NYLYj{7n zVdn|lal`2o33P1>Rr44#i{mM^Dv~iSjX#_{Y;eW8Dj*#@Q+5H^mFCIMkUY6F=GO=5 z_J#XdsSM2r!u%6eG4iAsfLNo(1|$WNWYW?)rl963^(hhGiv9LIOn z{dil^M*;gj4@;cT#BG`jxV?)Jq$Qa(p1h!Nt0U~B;|R%McbL{$89T$^?rAu0+E*UF zVi%JVWvM{%4ST}<1DkYwC{wQL!||>xM;hhW$5J#S>>{zV%1u_Y=F|0dXRn9BSCuj3 zY}fujyR?v^57%1+L}D(RfF3B2jb23suX^>+WM56oCU9%(<9Y3MzS8bV8XS|RPsC*F z0)95i`v`TUNfzM@9pi2X{ zWXywJ#n(9W4v*!jZcd?UTysBv)gx6pJc*$|(qw3(-?~xLiteq+f*@e;QT?V5lf;k8 
zdU^uH7B8$)6f1b!cV>4zYuxIytja{erQl`|x7%pKuwOh6WHHaIDrk^%(>?7dXh_~K zON^M@Z$REQ6eV`<%?bKt$r#Pwb*?+}yp8mw1R8I~;z{nWi6MJQmA&JsaLJ`vUhIcm z$`&+e6i;$7RCH|weRmm8z8E{K^x`kM`H^5@q$trJY`$F3mk@RLtzvva_^Y10#b#A` zZo3Y;o4QK{!rn^fY#Sg_9dsU7p0e;XWG&RW#1HTym2w;a83Us+ ztV1V{#em_tgr=l!LUjJZ5Vm{33oUmB8vpeA*!$=cuYf&oD&{c{*!*IKuSC^)Z`UoA zD~f~a&A*aVVG2P7o=N&F-LlU?F1X+TKls_dhu=UNsmDWXB0nocw>^cNU}dT2da9&% z=q)eQv$*bKp#Ayw0Rb8xjXSb-GbcESj@ve+O!rk*J*L~KWT^EeD5t>Rt`h=E*_b<( z!$on%lXr3YhaZH|Z&f|n{i---oM>#7F14cbE9z)UxV`1PU=03#3u+z0*vk}CZSvFg zD%Ee=>Be;^?5bKjB+4NG4t;>82v#ewQ5swMdZi?z^AW9yac;)oL1hZf#(z0%o{Dj5 zMJZ%7I?KXXD=A?{`EZ&8I{Z!N!F!tqm|V&8m>S$(hDzhe;ATf*m4m`S3qm5akdII< zWSsP|JQ;;Z0d-_eG|MEL@j|b7xP+%~2lMv5_ilVgGSZ3Len?S^PdO7IZ0uagr*+b( z(<&Xd(c@!5>Vc+t+{5|Uar*+(Wn!}lMyw5R{Jr7z|LSy+=-{}L0rT6v-khWiqu+Zp*;_kMYov+evtDmQ zWXmnNT10mj=-sHW)bLL%q!rv_^}tNJtgWN=dT&Xen$>l8zu61c-wFv)@4cKWg}u@& zbD(N}FQR;>gDE_;qLz0Wl0o)0YjuAer_OKi(-(y!%v+6r+-`fM8y&4sWdE}=to#hJ z`voSPUO+WRnQql}QY@_|)Ak&1Q3#ULfrr{{FzJV`N@^#PrDxY44R~sc8r2qO#IpI~wTZ!o=tg`+JzP zoqMf0;az`z+;iNjrfM+S^M(k@87hbQ4(HrR3dH@I4P;~Vr^hZd3O(?`YIe#{JqZQ8 zevhYz#ZPv4N_c5LcQ=}(J3yjTw9RQJP_Ezid}|+(e1kDpfWKw$w+T$1=>6_o5B}pK zB3>I*T+|mK>X|Wl@}r_(4j{wGLlgCzL-5TcdL_LLAgc!I3zDRE)b{Zy0VK^7K#Zs? zoku}@V|lM7UW{Z^<$We1_U5bOTLI-yOj{uXK5izxUt3C`33Gk`p)|Vc08R2h3_)m4 z`;Rma(KgWl3F_uY`hvL<5O0Ar+0oyfffhD*Yz`-XCnI!EpNBko}I7fp{{tS}x! 
z=KOfOsYWsi6KZobATqwwx+p(dFBPi=Olj-*)54}jw-VPoXwr{oM=O4SCsPmEcx%S{ z5aO>v377#)`*z=P(?a4EcObnYGmmEhm4P+#ZE{1-VQ!FmAhTUxAA){-V_M5fkD#1z z1@U%QHC(E!9Xd!23pi_-z>Gr_QV4~9>_=9dvo{z26uU&4s&M6Q}2O~g3;CqV$ zqPi#aL6i;tol&8Enj{yJdm->Sg)6Nl{!CSvJtiZ*G!Y2;$-b=}q%ihguT&Sdn)1-}C&@mIfIU)A6R^v`q>w$~w+q_o9dDF(SMxAle7=x~fu9!NIP zV(W;{xFZ%ck@KWAF$)AEIEfOs8@DGcvKO2d2G^YwBr~L><{3dm&rqI5(?a-r$-MiB zQ2AJ?Y4v*DV<1-ENiE@E>BuILuFfh_PwM*#ZPr_Exuu6VJzN4&3T*XG5h|DuPt*re zfGwLFOzYLxMMtV|m&??HK_}h+Jx;m&E1LaE52B_KAF%REA-W_vPbohah^lD7A2Fj% z+3onRBH(uazoM`fJ7beaojZbs(KkUX!#cbUR$e z#e*hp8jSCa_o;^vjgFJbsygz(X4Fa1^Is&~U#f26(vRH66(1)h;=U)tR`8n*WXoUp zDT$LDNWb{WjZBUkeB4B#Lt6q`%IMU==_^ZN-d7=woJVIfs$2QqekwO~rj4c_w=ckf zy^aA%9Kyxd{D|7eY%V9I#mj;_aW@rBh{eTnX;OxRwgY^(EiEMbTFNZxahPiF-*QB^ zyP0M_if73emEkOUn2Yj^e*}rLjKyrhBWqk)2bBFgu{!jXY--Z7$C{eqY z-@{_OP_uthBiX&3y0q^%OSGRa2j-#VF&`v$tK>no^Ofheh13|Z1sCs2hN;ak${};f z7dl$hy(amikjfSgmw zIBlO!^|k*&BUQQdMwJu-E97dS8niPk_X`(2By)eV>7KnMYNF* zM8=T^M%|7LcM%@h4Aj`yogAgYq$ULPmx?-^)-g|6%kG^kE5&w7) zQntEl-0ngd!v=1QU2iX%jKt?eVb^yp9w>9ByNyxTJ=8?YP|G$0+v95spX3b6v}@0C zP2)-hX7F(X;(jLNP~CaaGQQ-owvN{zho|Z7QshL_p%dB96PIJX)^bo!lo?^al(Ozw zaBTHaTlG>(l_6#Cd%Gxc-G?-mz;j=Q0(@+?s*L}L|r@;#LBuBpSM}{XzPz( zB$?aC9!8Bd{6#ym!o>t-L3DSvmj$o*T!w3Gger1kQSSpD9B5rMEr(lHpt?~@V<+p1 zNO8qaIllUfdt88Rm9`sxk+gI6Cn*Z+=_ORZCU^bWg7}V`bDn1GNLSs4R$87HHfpu; zd&m`auo*94>untM>_Nsk_;;Sq2aK3L8fVa&_*(TXF(I`X2)Fl!yOcZ)T)La=#}oG& zr1~CriB>PPpsL8K`gG12BI;r-D;TlEG^V*u7saX7A>z277^*q6zDXMal5$m4Z1c7R zX-t5*z2l^d5FW3?`4W)m^I)!;0d+2Ojh}*REp9K@q)eNym^4RsX(f&V*}ld&!8Q-# zr@fMZ^lC@i&#e4r_F5jC4 z;8LZ0h5az{`}iG!q4SqS548T-@R1}Z?^bjAc(~<@GZ1&r;_!*GESi2O6>?B~=I;7H zOa>K}p;eFYipCDoC?<*sm#>ko*gJz#=qyX#NSQ*V%IK~g+xO>=10y$B;?b=FyJ@m%PR(y6zrckiJk${AKQ>+0Hgp4yH64{fu|+FFcPF$F80FNn*cuU1=7o;KI}rVr)aX#r9&LQEh;c6!Ta3STIpp!>0-s8E6v$yPy%6%tf1b~q5@!$PvgLO&u&|i@8~IZjOoEUCcRufRyXKYtglBDtgI&GE7u=?Obn`fOWa+w7D<`3?7|M+%hxQ{8z+HPyB28hvfA zs0c`qqDUwT1|mg}A`t`x0YOn91c~$-B2{{X7c_K{8bAU{ks6vbfhf|e(vhY_5;}%Z 
z6G-+_&-ab7$2j|(z5kt)U+c$A)?71du5~|iKF@stAZif>g(#Jy!|rY3A1RZ>SChz< zKEnrOQn=V+?l}cN@B6Ja;{6(?zw?B;6Mx$*^Qk~3Pu`JtQXZcmEt_1G^N^+3{ML%5it{p|Q^?KWE z9N}XbXEI|ht(+(m`w0j&)@2hVynAFGA61s4sP-qNg473UP%7e~&%zOVbK)b0CJl43 zGxv>gH3brd6D6)(bZe@{jy07~ulm$zbr~McAsxru`Orq_ zHYGP1{i?e+69i~jG%fMUK|;qt?+XV}aUnQXY`i`g+LgR@;cCTG2cEOiQWQr1K@dk% zwH4V_R~Y_wxw<)5b9~2oLj0Escu@g_(SR6%3uHT0s!nZ{$(x!_$~Yf zul#C!h3yyd?f2&c8vs4{bNHVd^HE2f4F&=D&%00|WjCD1?zt-cbDJs%a*coDIb~wq z@YO+Yhw12HUuJ`ZoBKQSdzMA-i35kPz{L|T_@vD2XqrhSxz|DOP>M)%;@j`Xw~p(321#s} zCURZ^uUCBi$}_=hIKdAH~s(e=;RkYlE?il#1$Qb@>T^ejRAYf-g}oHwVKb{ z^XqK_Xm~+ngTbhiqhm*n`KvtM$0((TDZwuCGfVIV2Y}lkmF6~5U{9_wL9gSOHreES zID4sgk?uQlOrP!=E{|>$^mm+h8viHNCSHH&RGz@6K#a83-40I$W&KZ!{m=PhVrXZl z)T_lEy?n3j)BGHx^Ip@sgSE%{$Ie{mm^%S<0Kz8=SmPWT%I`fe$fe#MwGb{^nYAXD z26pVF_fm- zHeY$Y_`KeehkV29e_h;JQ6OVC%~Qjn&C5i*^Fm$J_YwJFmn~$t2rhRa&$us0Nu$lW zHNC}pANzD@rM9QjLU)EquhHuD|9Yq5nkB~_9~p~l6HI6C8H{qZz5jH%TbTA*OS-MQ zSA&1%HMcq+vIUp*_}&va7;50hwb!1b23sYLfwaf_$`_$aph@B$oD3FYF_V`>_|oZB zmL-Ox-p{mU?}N52HW=N^{BdI6o(`@jKTs&oy_1}QNnyWn-bW(d z`Wi>Ywt(SR-hH5Gi4pt}fC0YpCMK_w{UV_rvgI3+GH`qGcsV4B2T;ip?0Wf4ke%UB z8Kw3pc!5o9#l%nBa@@m5UDs@0>&DD17=tRT8dv2MxhgY}re~kTa9#A@1GrFP-*zga zg;(+dnc;5ae17wWmRnG}ct$DLgH?hX0l9sqmtbp;ir6D{UYQ~K>_7Cu=mnq~z~4Rv zw>b1|KdD#iT6smE#iTN94}cRf#H(L`SPuWD&9tZ!tba%lGZJJT!ev2hD>F`h`S0LB zv!g0uKY_mKl&=;?kWnqPPWA+UcHRWYJbQW#+;2{%DI6~-wlKci5V6CL8VIhrQ;T}p zntiV5f;GBIE%z|sIs@EEWj`+UHY~JSl|71UWSWYf@ z^BJy)EW0w0xnDH+Qk`^TuJTx^W|sRoKXNyHWR!57O13Mk#V^jrkW8p^qb3uaF)u1> z$IC0$yUKn#M{AR>h@N&FjENVO!}VD0ObY&*DMyk$sBQO=W!=qk48l)|3dgzdJQ|cy z&KzofSgoc78rQqs@Hdy|_l$QTOHri0$QoxF)JqPQ^ujDadb)5LdA%X{+zZ7TKC?-^ zBUd{c9@G*qKg7RYgGTYbzbP9f4K)__YrZ`T1);%oUx@JnUN^JMGu062X#GJDz(qZMh8R+QHVbN(KVKaBF*?@9bsKFG4b$d z`SF!$z>sFtaPZ#D-_SS9ckfHv`K>fFZnk}(D*bioJ#lJDoRp$p%jije{VlCi&V3aj zi0C3t*I()Y-P#SNEFBo<>L5i$MsQK{x|TYVZwBpuol$^GPj6Ify^`6`vrW${MBnM& z5#x45^2-t@#Oq33C%hOOVR-DPFYc96FLC5`tM>-7hkn0<%sh5xdn2EH7zodV6jshL zXU&K|FGx^Y^B`(dt+`U%oZ`^s$M~W_sOFb;dS!u&5d+eYxMapxVXqqbNjRB$6hV(>SwN 
z0w4nTyrQN2GFlErOfR^u#Z=yOorBYRjltb!>V=WJ?W%+tu8i{t`Vr7)7mtbqq5b;38i2-A9DLHzzb z&41xJ<&&IL^JMjH*N7WcO${mOW-vHV>&ks#E*3r6v>$&L_wJ86S8ZRFb8xF$=TuF| z`?BufFb2c4Nd)#47yq6XS%@Rj|2D9D4*nm>?Eih^)12tR`-Z`WCEA-m zdBca-UW|K`H_V9I!0t84V^DlQzU_nXvXt{Lh32-pp1n2M#ta$+XQ>q1tO+CRF&`R+ zi~WA#!vZ6r*9LsMhH@M^E{}Tn(f`K?dUrv?qAv)kp{1cHi=owrT8lmk17m0N3`$D1WfLYAOO|I)?|dc zMY@eH)hc$z7?UUAFbe6 ze^Um0uv8}KEmkq)@T8+>R&ZIMH4hSLxzP#+9T-h5txi4q;L(AaLf!uDsINHqt!@G1 zh1bYK=VmxmGD$9_{bwrtHl3ZNt*8o>%(H1@HH!P~sec{y?ddKkC*hc<(a-)$kINqJ zZDfZ|pR&*(Se8dSgU9klE(_Oe(esr~8K1flcWFvNg54VJZm!+Fv>lbl?ey>8kO?~m64 z_N>&$@DjH+yFJ74@Y^n-+Rq2aiiamUawMoDKLR@F1u?nI!*WbfrVJm(yi_)YOQc!_nEu=F9R;Atixa2bniNgE7yj$A}x6+6uqs4KMg_NXhbc3<}7 zf(+dr!rzRqUuimLVOH>@si{cR_^q{JZ*LWFCP7o?gT|x~7Vb#QSaDcHcfJ<42PSRD zB(~)Kwxc(7U{>1eh*4{eM@F0TUeL?s7ZP~08p+N6IoT&rQFy6LX@5C~=cLM_yd>#? zP3miHt2!!7PJKyxvw`DeNlv+eai@kQSzA{2a_?R~jdV3Mje&8@? z{d&H!ycWQB@gwk z#=?>KcV3O7Pd*1rVPM*dUn(Q$o%ESLyKIFn9`wgB$2>^A6KClJAdbU7is%vfCtI~DFdeViY1 z7buddC?lVz9x{ehx&Y}&fy$lEKv+4zP((I_lgc{$^4Ijw@flAAI+)+~&ApC2?MZ~b z^ed)5>1e|v0s9JOvc5|wwuJ7rIRbu>_@Y8BG4)5P1ux|GFO;*zW~=vr+CNOVs}9S} z^YV0_pL=P=+t?jv1^yuqUOHHNv0`WRITxPG>kg*(ZiJy}hwWx!s!a1|^)P_Bk;8RK za4^y7N-sO=L4Uy*ij4BTHp9|%-HW-gcj-VE4W^*A7>E81`Vwsj78yESZX5exPj!a6 zV^=CrsGkz~`NgiQCxDg7vJIS!<=vA?IN4|0RJ$s3@hNG?C-to6yiVYE{!(m3 z&%`k#@f;&^U$1cwKz+$QO^!}l>o1(uqPys?=(R(w{X}`t+(jlz8nJf)dKQLCXj!-v z@U|E1r(HdRZ(uvseJj*sZioo8Wmwxt_R4qeB*#=dl9lNxd!b(wcN-FjNtLCF z&567r0Sl+D24~fIotB;%9(l&!CE~f>ED*}RbD`bbWq$Y zS3?bw2te9h$c2M$ujG)k0h?i*&!RyX@k+Z2^Eae|+RIW%V&)c%f)(cU6rAx~J=9a2 z&(p09uU*b+^bn$5_jpP~c3w|9TBGvJMyksmrA9vMygEF4KfC3}p%u@=lZ7(%!O-@O zi=qh^Cy*tEQuWn?mTD%Ob^UWi?F)P&(-lm3v=UPaNEIME;1yw$!9iDGz!oR{)T_WT zXMWy~BV59xSu2K~4S8ptTE;`CeXR;XDDst*8(HI&mll`1+@9t>!wPKEqps-@h}en- z*%Y9lc2$K_`&@)#5R$T)eWe+aFE7+t?KgAuR))5m`<*?zxd-3qPUIM3z%RB}8~evc zz+ZW;|7g2Xm%DbO2P@`_E6eSn*ay+JdWdzeZpcKTew0%>P|Dnem1}F;4y{7#JAj12 zSFBNq^HnSFO|Nz*o4_D4J1=!(O1AtJ4}x7 ztYbOk;8=JUDp^m~45DNE1>K$laKh?p=aiku`m-sFPrPW|havl-G*WPNZBVs(yl+&L 
zh^NJo(S9B@L%gEDMIxami5c5cZBQRtV${$H>K*Ql{_I>7xLc92Blz*2dYG1kOG*N6 z5Fz#M8`s^r>q05iPV4^O=mOw5>tBFjSqHaM(Gih~PDHR8Y$Zyb4V3cM*9C`i8U~fF zLupD;dT?KIN!CX$trkSn{FB4RdM&s6V5bu!6}1QxP>z&N?h0JdlVelWh;h*3`QiBrbiETJ%otQQ;}eX+_8~xn}%k zOPm=NwT-s!J9IdQRl{lY@SfgvTvT%~LaKE14%r||F-a>*Si{k>b>c(`qHVi9R-TrU z+#1(vC<|M4=w`##cunNkei5JFJ?H>Hyfk+n2L^*AH1dJqi(?z_W_3cRC26e&2%{c< zV;iL`m3P@rbFqS1Nv#5%$-0^NhMfavFC3(f+=He(2I$PvOxtusuV1WvS~qQas69q_ z`jk>(l%jg9&9eVWBd*3kGP^x=unLWH z7*+lLKm*vo2y~3kjpG64K+B-xL$r6o&3sIa8~B55IUz7V7hLtDKP4LdNdur;@uKTF zl*FRezF3{FL}WjeW%&zO(=*_SsXDUvo$=4P%vU!D;k5u;s|geE4gf?yu6zCV>KnP3 zEu134Qp=6#)j2i2}&&V6#|%$@ED1-OwkIusIV_@%(trWqCMj}V<1RH?d9RDr4h z^5qz9K7Pq}g}SRdhnIXrL(3*m3E&NrI5dT(rKqV0h<~%QfmHD*;uV(tt7ffFqlkQ}`o~F_Ise_B4m**t}1IiYvY;Ql9@1irN9ElFc&;QfgDLiK-#q zeq(extr5$@TD3k_>+`(ltMUK)Z+7m0VoNvcy(-PJS{i8+A9qY>$#CD|={kX=yPrYLRm4LU>v8CKM{mmqLNQdpgly{&mG21d2{ z0>pG9XwZw=?W=?C5}w_PI1i?3HAyMcW*@$Lej;ZzEmMn2TGT7wlBw1+=_E$dMJ z=4Bf3CB;REaVJ>v5#w5nIm~$iKWN$VC><*L1rA%vDxSn75o6HM{*uYRmP{WbR1|Dn zYHGGMV*W~p8!3?{YHzOYSFtL98hkYZMJ<17ls;1uHbS-;F}BfPl@NfjUjp*W<2Uh5 zWntly$3guAeR5mWXdbJhiBA)F+Z|c5u!(5kZuz>PlzHDBllWl5AAF>?9QD@1B}%7f9*5#p13?BXTlM#TWg}#`elc zm?qBN!-QKX>_VL(-bd5LXYLM?Y4>s(shgwQMMy{Lz+?Gas`7pZRHnYqmO8cEdxHH@ zxP9X*)@a2`l&1G(&(KST)yP@I4VkKc#X7PtKH6Q!vc6iVFym+R)e&Q z!$M(ey06XzgWXQiA4an$rQEICXRJJ&@!ykPP5ne&nD;6;m#|W)`u?68h964)9Q=6lFH|o6ITZ}VRl{Kf^K~x8L&E{_A8HF!)`IoLjwc$1(Qw9;r14) zSV|R|fmDG*rKWA46$|mD1FAzUppQkUv=}%PaNOnzBV$&qIMxvoan~99qf$oad^bKf znM=p1ZZDryNvebwIwgE6P-B%+Hi$kImgWsHMZ2S!8)G?*8%nb~gE-{~D38Qe25ZJw zfYVHw8vqcuT!KRbf}SbOp0hu3Poy_Sx3OXAjZ5(l)(^VQ9=IbXS0#AD^sIwLL&RHy zwUdk)GlYiD?4Y|2LQ$<(IoYHday~`MkTuc5IkqLY{P7juV7c{`(1CFvFX|GoPEv8j zGv$+yaE3I?He>;+Z5QH@g0ApQ%1s+e5X;;84M%#+3^fD=xZztJ56o9{*o^38WczY& z98@ZSmhWzwPsd~b#x#UR8+^-6C2eYd|g^UGc~=fgd`0n zjzSlSJM{T-`c=_cuV;~DLWU#|A6^7ZtX7+l6&wAQ$nn&X-EBRQCK3&)5ZpMACm$}l z4l0Nz*XK;~LMqtY?2?__saXrsH1Q||+z6q0qI1s?Gl2dF#-O%p>euXw5RpPSIUGxZ z+d-N2a&z55Z!f2O-mh|HV7De}&6#vLleH>~u6Qi;1Vje=@W(zywB9N`6BPl8=Uqru 
zegmY?bX4-QbKo2zte%lZ$~z`^D#b~jgnmiSpyl!%4}xJ5b;+L=?>88n|8i;Y-o2s) zEY!e~r4SrssZbN^KVWE0yCq^R-H`0z##6%{0BpWpI5aC|k;=~WAO^ZSEp;8VfbvZ# zz^SKcIK$yB4{4R;ik-vo%tZ`OBCBeMJ)$>3%mTtYd{QDECSho!$Ew_a;DpzP0irALV zCo?GyWKHQYt_Aq50?xZ-wX;(1Q3E{k!z~6-00Y!9LFeVYUvH2){DKuJ>*)p`rtZxo1a2sK%x4*jcikvvrE{JE75MTE* zJY6k?eutH?OJ$^f7y;k}9fxOQ;s*RHc6eD%?|og%{XEL&R_w2LkqMF4B%GmhILCy%%E$&SV2_ZZ{1`H|Y14;rfE(M9xF~u7QWQX`mE$z*|dY zW`xHreMrW3;x|{vvANZ~UP}KK+1>}@H{1Lg0i@otOWT0;-2JYoL9l zi|I|2(G;wEsR_bV#K>O1qZOhljsr{-Eu1V4*#@Aac*1>n-KRt!SqBs1u73F z#SnOBOMo=l=`lN|-NZx7{%p}9TEYlHL5*mMRiW?grksevKi~p<#*~j-XMM#(81Kw< zN$gr0#7M>Xi~a~(OII$xW%?Ms^wNMKzO`*_l-=Fo7NGayf(;t^?!k&Ew+cymE=ybE z)F0c>^A_L9> z4L{uuG++sJu3ULYT*cJK#@xhn4t!k*RVsNSGTiNOonDJERn5`qU9_PEQTp+kZ*3}A zPBKnsG7= +``` +```c +// 1. Create ETDumpGen BEFORE inference. +auto etdump_gen_ptr = std::make_unique(); +executorch::etdump::ETDumpGen* etdump_gen = etdump_gen_ptr.get(); + +// 2. Load a method from the program by name with ETDump generator for profiling. +Result method = program->load_method(method_name, &memory_manager, etdump_gen); + +// 3. Input tensor setup. +Tensor::SizesType sizes[] = {1, 1, 32, 32}; +Tensor::DimOrderType dim_order[] = {0, 2, 3, 1}; +TensorImpl impl(ScalarType::Float, 4, sizes, image_data, dim_order); +Tensor tensor(&impl); +Error status = method->set_input(tensor, 0); + +// 4. Execute. +status = method->execute(); + +// Get ETDump. +if (etdump_gen != nullptr) { + executorch::etdump::ETDumpResult result = etdump_gen->get_etdump_data(); + if (result.buf != nullptr && result.size > 0) { + PRINTF("Add a brakepoint here and run this command in Debugger Console: " + "dump binary memory trace.etdump result.buf (result.buf + result.size)\r\n"); + } +} +``` + + +To save an `ETDump` file from the board to a PC, use the **Debug Console** in the MCUXpresso IDE: + +- Set a breakpoint at the `PRINTF(...)` line in the example above. 
+- Enter the following command in the Debug Console and press **Enter**: + + ``` + dump binary memory trace.etdump result.buf (result.buf + result.size) + ``` + + +

+ Save ETDump in MCUXpresso project +
+ Figure 1: Save ETDump in MCUXpresso Project. +
+
+ + +The resulting `ETDump` file is generated in the project folder within the MCUXpresso workspace. + +> **Note:** +> Profilable models print profiling data to the terminal. Generating this dump may take longer than executing the +> Neutron kernels themselves, but this overhead can be ignored as it affects only models with profiling support +> enabled. The dump generation time is included in the `ETDump` as the final kernel entry. + +--- + +## Creating an Inspector + +The [Inspector](https://docs.pytorch.org/executorch/1.0/model-inspector.html) APIs provide a way to analyze the +contents of `ETRecord` and `ETDump`, enabling developers to gain insights into model architecture +and performance statistics. + +`ETRecord` is an optional argument used to obtain a mapping between the original model and the converted Neutron model. + +An `ETDump` generated on the board contains metadata for each Neutron operator, including its unique identifier. +To visualize this metadata in the Inspector results table, set the `include_delegate_debug_data = True` argument. + +### Example + +```python +from executorch.devtools import Inspector + +inspector = Inspector(etdump_path="/path/to/etdump.etdp", etrecord="/path/to/etrecord.bin") +inspector.print_data_tabular(include_delegate_debug_data = True) +``` + +### Complete Example + +A full implementation is available +in [analyzing_with_inspector.py](https://github.com/pytorch/executorch/blob/main/examples/nxp/analyzing_with_inspector.py). @lint-ignore + +--- + +## Summary + +* Build the model with the `--use_profiling` flag enabled. +* Build the ExecuTorch runtime library with the `ET_EVENT_TRACER_ENABLED` flag and the ETDump Developer Tool. +* Use the Debug Console in MCUXpresso to save the `ETDump` file from the board to a PC. +* Visualize the profiling results using the Inspector. 
diff --git a/examples/nxp/analyzing_with_inspector.py b/examples/nxp/analyzing_with_inspector.py new file mode 100644 index 00000000000..b339af79d6e --- /dev/null +++ b/examples/nxp/analyzing_with_inspector.py @@ -0,0 +1,58 @@ +# Copyright 2026 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Print profiling table for the NXP Neutron NPU model + +from typing import Any, Union + +from executorch.devtools import Inspector + + +def parse_delegate_metadata( + delegate_metadatas: list[bytes], +) -> Union[list[str], dict[str, Any]]: + """Metadata parser for Neutron Backend metadata. + + The parser is a callable that deserializes the data and returns neutron kernel number. + The deserialized data is then added back to the corresponding event in the event block for user consumption. + """ + + metadata_list = [] + for metadata_bytes in delegate_metadatas: + if len(metadata_bytes) == 1: + function_code = metadata_bytes[0] + if function_code == 0: + metadata_list.append("Profiling dump") + else: + metadata_list.append("Neutron kernel " + str(function_code)) + else: + metadata_list.append("Invalid metadata size") + return metadata_list + + +if __name__ == "__main__": + + try: + etrecord_path = "etrecord/etrecord.bin" + etdump_path = "etdump/trace.etdump" + inspector = Inspector( + etdump_path=etdump_path, + etrecord=etrecord_path, + delegate_metadata_parser=parse_delegate_metadata, + ) + + # Access raw event data and filter quantized_decomposed nodes + for event_block in inspector.event_blocks: + for event in event_block.events: + if hasattr(event, "op_types") and isinstance(event.op_types, list): + # Filter out quantized_decomposed ops from the actual list + filtered = [ + op for op in event.op_types if "quantized_decomposed" not in op + ] + event.op_types = filtered if filtered else event.op_types + + inspector.print_data_tabular(include_delegate_debug_data=True) + except Exception 
as e: + print(f"Error during inspection: {type(e).__name__}: {e}") diff --git a/examples/nxp/aot_neutron_compile.py b/examples/nxp/aot_neutron_compile.py index f5f92d36541..258b4c87772 100644 --- a/examples/nxp/aot_neutron_compile.py +++ b/examples/nxp/aot_neutron_compile.py @@ -8,6 +8,7 @@ import argparse import io import logging +import os from collections import defaultdict import executorch.extension.pybindings.portable_lib @@ -167,6 +168,13 @@ def get_model_and_inputs_from_name(model_name: str, use_random_dataset: bool): default=False, help="Use QAT mode for quantization (performs two QAT training epochs)", ) + parser.add_argument( + "--use_profiling", + action="store_true", + required=False, + default=False, + help="Enable profiling for eIQ Neutron NPU delegated model", + ) parser.add_argument( "-s", "--so_library", @@ -322,6 +330,7 @@ def get_model_and_inputs_from_name(model_name: str, use_random_dataset: bool): operators_not_to_delegate=args.operators_not_to_delegate, fetch_constants_to_sram=args.fetch_constants_to_sram, dump_kernel_selection_code=args.dump_kernel_selection_code, + use_profiling=args.use_profiling, ) partitioners = ( [ @@ -338,6 +347,7 @@ def get_model_and_inputs_from_name(model_name: str, use_random_dataset: bool): edge_program_manager = to_edge_transform_and_lower( export(module, example_inputs, strict=True), transform_passes=NeutronEdgePassManager(), + generate_etrecord=args.use_profiling, partitioner=partitioners, compile_config=EdgeCompileConfig( _core_aten_ops_exception_list=core_aten_ops_exception_list, @@ -360,6 +370,21 @@ def get_model_and_inputs_from_name(model_name: str, use_random_dataset: bool): exec_prog = edge_program_manager.to_executorch( config=ExecutorchBackendConfig(extract_delegate_segments=False) ) + + # Generate ETRecord if profiling flag is set + if args.use_profiling: + etrecord_path = os.path.join("etrecord", f"{args.model_name}_etrecord.bin") + # Create directory if it doesn't exist + 
os.makedirs(os.path.dirname(etrecord_path), exist_ok=True) + # Save ETRecord + exec_prog.get_etrecord().save(etrecord_path) + # Notify the user about profiling enablement and ETRecord generation. + logging.info( + "The model was converted with profiling enabled. The time spent generating the profiling dump is traced as the " + "final delegate operation and can be ignored, as no dump is produced for non‑profilable models." + ) + logging.info(f"The ETRecord for the model was saved to {etrecord_path}.") + except RuntimeError as e: if "Missing out variants" in str(e.args[0]): raise RuntimeError( @@ -378,8 +403,10 @@ def executorch_program_to_str(ep, verbose=False): logging.debug(f"Executorch program:\n{executorch_program_to_str(exec_prog)}") # 6. Serialize to *.pte - model_name = f"{args.model_name}" + ( - "_nxp_delegate" if args.delegate is True else "" + model_name = ( + f"{args.model_name}" + + ("_nxp_delegate" if args.delegate is True else "") + + ("_profile" if args.use_profiling is True else "") ) save_pte_program(exec_prog, model_name) From 3447d08964881e3d2ef34123e8c985bbc800a01b Mon Sep 17 00:00:00 2001 From: Andrew Date: Tue, 23 Jun 2026 13:53:13 -0700 Subject: [PATCH 4/7] Quantize moveaxis/movedim so they delegate to Ethos-U (#20314) Differential Revision: D108478011 Pull Request resolved: https://github.com/pytorch/executorch/pull/20453 --- .../arm/quantizer/quantization_annotator.py | 10 ++++++ backends/arm/test/ops/test_permute.py | 17 +++++++++ .../test/quantizer/test_generic_annotater.py | 35 +++++++++++++++++++ 3 files changed, 62 insertions(+) diff --git a/backends/arm/quantizer/quantization_annotator.py b/backends/arm/quantizer/quantization_annotator.py index 3b713659e84..13693bd235d 100644 --- a/backends/arm/quantizer/quantization_annotator.py +++ b/backends/arm/quantizer/quantization_annotator.py @@ -631,6 +631,16 @@ def _get_fixed_qparams_qspec( if _transpose_dimname is not None: _one_to_one_shared_input_qspec.add(_transpose_dimname) +for _op in ( 
+ getattr(torch.ops.aten.moveaxis, "int", None), + getattr(torch.ops.aten.moveaxis, "intlist", None), + getattr(torch.ops.aten.movedim, "int", None), + getattr(torch.ops.aten.movedim, "intlist", None), +): + if _op is not None: + _one_to_one_shared_input_qspec.add(_op) + + _one_to_one_shared_input_or_input_act_qspec: set[OpOverload] = { torch.ops.aten.alias.default, torch.ops.aten.clone.default, diff --git a/backends/arm/test/ops/test_permute.py b/backends/arm/test/ops/test_permute.py index 8864324dbd5..6819929104e 100644 --- a/backends/arm/test/ops/test_permute.py +++ b/backends/arm/test/ops/test_permute.py @@ -78,6 +78,12 @@ def forward(self, x): return torch.permute(x, self.dims) +class SimpleMoveAxis(torch.nn.Module): + + def forward(self, x): + return torch.moveaxis(x, 1, -1) + + @common.parametrize( "test_data", test_data_suite | test_data_suite_fp16 | test_data_suite_bf16 ) @@ -118,6 +124,17 @@ def test_permute_u55_INT(test_data): pipeline.run() +def test_moveaxis_u55_INT(): + pipeline = EthosU55PipelineINT[input_t1]( + SimpleMoveAxis(), + (torch.rand(1, 4, 5, 6),), + "torch.ops.aten.moveaxis.int", + exir_ops="executorch_exir_dialects_edge__ops_aten_permute_copy_default", + run_on_fvp=False, + ) + pipeline.run() + + @common.parametrize("test_data", test_data_suite_u55_reject) def test_permute_u55_INT_not_delegated(test_data: torch.Tensor): test_data, dims = test_data() diff --git a/backends/arm/test/quantizer/test_generic_annotater.py b/backends/arm/test/quantizer/test_generic_annotater.py index dd883e72b1f..b5cfd1efdc6 100644 --- a/backends/arm/test/quantizer/test_generic_annotater.py +++ b/backends/arm/test/quantizer/test_generic_annotater.py @@ -89,6 +89,41 @@ def test_transpose_tosa_INT(): ) +def test_moveaxis_movedim_tosa_INT(): + check_annotation( + SingleOpModel( + torch.moveaxis, + (torch.randn(2, 3, 4),), + source=1, + destination=-1, + ), + ) + check_annotation( + SingleOpModel( + torch.moveaxis, + (torch.randn(2, 3, 4),), + source=(0, 1), + 
destination=(-1, -2), + ), + ) + check_annotation( + SingleOpModel( + torch.movedim, + (torch.randn(2, 3, 4),), + source=1, + destination=-1, + ), + ) + check_annotation( + SingleOpModel( + torch.movedim, + (torch.randn(2, 3, 4),), + source=(0, 1), + destination=(-1, -2), + ), + ) + + def test_tile_tosa_INT(): check_annotation( SingleOpModel(torch.tile, (torch.randn(4, 4),), dims=(2,)), From 65bc0cafe96c5e3c63e57f976b9e05a25027a53a Mon Sep 17 00:00:00 2001 From: Jon Janzen Date: Tue, 23 Jun 2026 13:53:27 -0700 Subject: [PATCH 5/7] drop redundant TARGETS files that duplicate sister BUCK files (#20403) Differential Revision: D109082060 Pull Request resolved: https://github.com/pytorch/executorch/pull/20403 --- backends/qualcomm/aot/wrappers/TARGETS | 5 - backends/qualcomm/builders/TARGETS | 5 - codegen/test/TARGETS | 8 -- configurations/TARGETS | 8 -- examples/devtools/example_runner/TARGETS | 8 -- examples/models/gemma4/BUCK | 120 ++++++++++++++++++ examples/qualcomm/executor_runner/TARGETS | 8 -- extension/aten_util/TARGETS | 8 -- extension/aten_util/test/TARGETS | 8 -- extension/cuda/TARGETS | 8 -- extension/image/TARGETS | 5 - extension/image/benchmark/TARGETS | 5 - extension/image/test/TARGETS | 5 - extension/pytree/aten_util/TARGETS | 7 - extension/pytree/aten_util/test/TARGETS | 8 -- extension/runner_util/TARGETS | 8 -- extension/tensor/TARGETS | 8 -- extension/tensor/test/TARGETS | 8 -- extension/testing_util/TARGETS | 8 -- extension/testing_util/test/TARGETS | 8 -- extension/threadpool/TARGETS | 8 -- extension/threadpool/test/TARGETS | 8 -- kernels/optimized/cpu/TARGETS | 8 -- kernels/optimized/test/TARGETS | 8 -- kernels/portable/cpu/util/TARGETS | 8 -- kernels/portable/cpu/util/test/TARGETS | 8 -- kernels/prim_ops/TARGETS | 7 - runtime/backend/TARGETS | 8 -- runtime/backend/test/TARGETS | 8 -- runtime/core/TARGETS | 8 -- runtime/core/exec_aten/TARGETS | 8 -- runtime/core/exec_aten/testing_util/TARGETS | 8 -- .../core/exec_aten/testing_util/test/TARGETS | 8 
-- runtime/core/exec_aten/util/TARGETS | 8 -- runtime/platform/TARGETS | 8 -- runtime/platform/test/TARGETS | 8 -- schema/TARGETS | 8 -- schema/test/TARGETS | 8 -- test/TARGETS | 8 -- 39 files changed, 120 insertions(+), 287 deletions(-) delete mode 100644 backends/qualcomm/aot/wrappers/TARGETS delete mode 100644 backends/qualcomm/builders/TARGETS delete mode 100644 codegen/test/TARGETS delete mode 100644 configurations/TARGETS delete mode 100644 examples/devtools/example_runner/TARGETS delete mode 100644 examples/qualcomm/executor_runner/TARGETS delete mode 100644 extension/aten_util/TARGETS delete mode 100644 extension/aten_util/test/TARGETS delete mode 100644 extension/cuda/TARGETS delete mode 100644 extension/image/TARGETS delete mode 100644 extension/image/benchmark/TARGETS delete mode 100644 extension/image/test/TARGETS delete mode 100644 extension/pytree/aten_util/TARGETS delete mode 100644 extension/pytree/aten_util/test/TARGETS delete mode 100644 extension/runner_util/TARGETS delete mode 100644 extension/tensor/TARGETS delete mode 100644 extension/tensor/test/TARGETS delete mode 100644 extension/testing_util/TARGETS delete mode 100644 extension/testing_util/test/TARGETS delete mode 100644 extension/threadpool/TARGETS delete mode 100644 extension/threadpool/test/TARGETS delete mode 100644 kernels/optimized/cpu/TARGETS delete mode 100644 kernels/optimized/test/TARGETS delete mode 100644 kernels/portable/cpu/util/TARGETS delete mode 100644 kernels/portable/cpu/util/test/TARGETS delete mode 100644 kernels/prim_ops/TARGETS delete mode 100644 runtime/backend/TARGETS delete mode 100644 runtime/backend/test/TARGETS delete mode 100644 runtime/core/TARGETS delete mode 100644 runtime/core/exec_aten/TARGETS delete mode 100644 runtime/core/exec_aten/testing_util/TARGETS delete mode 100644 runtime/core/exec_aten/testing_util/test/TARGETS delete mode 100644 runtime/core/exec_aten/util/TARGETS delete mode 100644 runtime/platform/TARGETS delete mode 100644 
runtime/platform/test/TARGETS delete mode 100644 schema/TARGETS delete mode 100644 schema/test/TARGETS delete mode 100644 test/TARGETS diff --git a/backends/qualcomm/aot/wrappers/TARGETS b/backends/qualcomm/aot/wrappers/TARGETS deleted file mode 100644 index 0a42614a385..00000000000 --- a/backends/qualcomm/aot/wrappers/TARGETS +++ /dev/null @@ -1,5 +0,0 @@ -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/backends/qualcomm/builders/TARGETS b/backends/qualcomm/builders/TARGETS deleted file mode 100644 index 0a42614a385..00000000000 --- a/backends/qualcomm/builders/TARGETS +++ /dev/null @@ -1,5 +0,0 @@ -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/codegen/test/TARGETS b/codegen/test/TARGETS deleted file mode 100644 index 1e8cc179228..00000000000 --- a/codegen/test/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain xplat-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/configurations/TARGETS b/configurations/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/configurations/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/examples/devtools/example_runner/TARGETS b/examples/devtools/example_runner/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/examples/devtools/example_runner/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. 
- -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/examples/models/gemma4/BUCK b/examples/models/gemma4/BUCK index e587370ece0..19f0ff90c93 100644 --- a/examples/models/gemma4/BUCK +++ b/examples/models/gemma4/BUCK @@ -1,4 +1,5 @@ load("@fbcode_macros//build_defs:build_file_migration.bzl", "fbcode_target", "non_fbcode_target") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") load(":targets.bzl", "define_common_targets") oncall("executorch") @@ -6,3 +7,122 @@ oncall("executorch") non_fbcode_target(_kind = define_common_targets,) fbcode_target(_kind = define_common_targets,) + +# Text decoder module +fbcode_target(_kind = runtime.python_library, + name = "text_decoder", + srcs = [ + "text_decoder/__init__.py", + "text_decoder/convert_weights.py", + "text_decoder/gemma4_attention.py", + "text_decoder/gemma4_config.py", + "text_decoder/gemma4_cross_decoder.py", + "text_decoder/gemma4_decoder_layer.py", + "text_decoder/gemma4_model.py", + "text_decoder/gemma4_self_decoder.py", + "text_decoder/gemma4_transformer.py", + ], + _is_external_target = True, + base_module = "executorch.examples.models.gemma4", + resources = { + "config/e2b_config.json": "config/e2b_config.json", + "config/e4b_config.json": "config/e4b_config.json", + }, + deps = [ + "//caffe2:torch", + "fbsource//third-party/pypi/safetensors:safetensors", + "fbsource//third-party/pypi/transformers:transformers", + ], + visibility = ["PUBLIC"], +) + +# Speech transform module +fbcode_target(_kind = runtime.python_library, + name = "speech_transform", + srcs = [ + "speech_transform.py", + ], + _is_external_target = True, + base_module = "executorch.examples.models.gemma4", + deps = [ + "//caffe2:torch", + ], + visibility = ["PUBLIC"], +) + +# Export utilities (shared quantization code) +fbcode_target(_kind = runtime.python_library, + name = "quant_utils", + srcs = ["quant_utils.py"], + _is_external_target = True, + base_module 
= "executorch.examples.models.gemma4", + deps = [ + "//caffe2:torch", + "//executorch/examples/models/llama:source_transformation", + "//executorch/extension/llm/export:export_lib", + "//pytorch/ao:torchao", + ], + visibility = ["PUBLIC"], +) + +# Single PTE export +fbcode_target(_kind = runtime.python_binary, + name = "export_gemma4", + srcs = ["export_gemma4.py"], + main_function = "executorch.examples.models.gemma4.export_gemma4.main", + preload_deps = [ + "//pytorch/ao/torchao/csrc/cpu/shared_kernels/linear_8bit_act_xbit_weight:op_linear_8bit_act_xbit_weight_aten", + "//pytorch/ao/torchao/csrc/cpu/shared_kernels/embedding_xbit:op_embedding_xbit_aten", + "//executorch/extension/llm/custom_ops:custom_ops_aot_lib", + "//executorch/kernels/quantized:aot_lib", + ], + deps = [ + ":text_decoder", + ":speech_transform", + ":quant_utils", + "//caffe2:torch", + "//executorch/exir:lib", + "//executorch/backends/xnnpack/partition:xnnpack_partitioner", + "//executorch/backends/xnnpack/quantizer:xnnpack_quantizer", + "//executorch/extension/llm/export:export_lib", + "//executorch/extension/llm/custom_ops:custom_ops_aot_lib", + "//executorch/extension/llm/custom_ops:custom_ops_aot_py", + "//executorch/kernels/quantized:aot_lib", + "//pytorch/ao:torchao", + "fbsource//third-party/pypi/safetensors:safetensors", + "fbsource//third-party/pypi/transformers:transformers", + ], +) + +# Image preprocessing utilities +fbcode_target(_kind = runtime.python_library, + name = "image_utils", + srcs = ["image_utils.py"], + _is_external_target = True, + base_module = "executorch.examples.models.gemma4", + deps = [ + "//caffe2:torch", + "fbsource//third-party/pypi/pillow:pillow", + ], + visibility = ["PUBLIC"], +) + +# Python runner (single PTE, audio + vision + text-only) +fbcode_target(_kind = runtime.python_binary, + name = "run_gemma4", + srcs = ["run_gemma4.py"], + main_function = "executorch.examples.models.gemma4.run_gemma4.main", + preload_deps = [ + 
"//executorch/backends/xnnpack:xnnpack_backend", + "//executorch/extension/llm/custom_ops:custom_ops_aot_lib", + "//executorch/kernels/quantized:aot_lib", + "//pytorch/ao/torchao/csrc/cpu/shared_kernels/embedding_xbit:op_embedding_xbit_aten", + "//pytorch/ao/torchao/csrc/cpu/shared_kernels/linear_8bit_act_xbit_weight:op_linear_8bit_act_xbit_weight_aten", + ], + deps = [ + ":image_utils", + "//caffe2:torch", + "//executorch/runtime:runtime", + "fbsource//third-party/pypi/sentencepiece:sentencepiece", + ], +) diff --git a/examples/qualcomm/executor_runner/TARGETS b/examples/qualcomm/executor_runner/TARGETS deleted file mode 100644 index 1e8cc179228..00000000000 --- a/examples/qualcomm/executor_runner/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain xplat-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/extension/aten_util/TARGETS b/extension/aten_util/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/extension/aten_util/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/extension/aten_util/test/TARGETS b/extension/aten_util/test/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/extension/aten_util/test/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. 
- -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/extension/cuda/TARGETS b/extension/cuda/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/extension/cuda/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/extension/image/TARGETS b/extension/image/TARGETS deleted file mode 100644 index 0a42614a385..00000000000 --- a/extension/image/TARGETS +++ /dev/null @@ -1,5 +0,0 @@ -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/extension/image/benchmark/TARGETS b/extension/image/benchmark/TARGETS deleted file mode 100644 index 0a42614a385..00000000000 --- a/extension/image/benchmark/TARGETS +++ /dev/null @@ -1,5 +0,0 @@ -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/extension/image/test/TARGETS b/extension/image/test/TARGETS deleted file mode 100644 index 0a42614a385..00000000000 --- a/extension/image/test/TARGETS +++ /dev/null @@ -1,5 +0,0 @@ -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/extension/pytree/aten_util/TARGETS b/extension/pytree/aten_util/TARGETS deleted file mode 100644 index 77b38349334..00000000000 --- a/extension/pytree/aten_util/TARGETS +++ /dev/null @@ -1,7 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. 
-load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/extension/pytree/aten_util/test/TARGETS b/extension/pytree/aten_util/test/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/extension/pytree/aten_util/test/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/extension/runner_util/TARGETS b/extension/runner_util/TARGETS deleted file mode 100644 index 1e8cc179228..00000000000 --- a/extension/runner_util/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain xplat-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/extension/tensor/TARGETS b/extension/tensor/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/extension/tensor/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/extension/tensor/test/TARGETS b/extension/tensor/test/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/extension/tensor/test/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. 
- -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/extension/testing_util/TARGETS b/extension/testing_util/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/extension/testing_util/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/extension/testing_util/test/TARGETS b/extension/testing_util/test/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/extension/testing_util/test/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/extension/threadpool/TARGETS b/extension/threadpool/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/extension/threadpool/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/extension/threadpool/test/TARGETS b/extension/threadpool/test/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/extension/threadpool/test/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. 
- -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/kernels/optimized/cpu/TARGETS b/kernels/optimized/cpu/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/kernels/optimized/cpu/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/kernels/optimized/test/TARGETS b/kernels/optimized/test/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/kernels/optimized/test/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/kernels/portable/cpu/util/TARGETS b/kernels/portable/cpu/util/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/kernels/portable/cpu/util/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/kernels/portable/cpu/util/test/TARGETS b/kernels/portable/cpu/util/test/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/kernels/portable/cpu/util/test/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. 
- -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/kernels/prim_ops/TARGETS b/kernels/prim_ops/TARGETS deleted file mode 100644 index 77b38349334..00000000000 --- a/kernels/prim_ops/TARGETS +++ /dev/null @@ -1,7 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/runtime/backend/TARGETS b/runtime/backend/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/runtime/backend/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/runtime/backend/test/TARGETS b/runtime/backend/test/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/runtime/backend/test/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/runtime/core/TARGETS b/runtime/core/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/runtime/core/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. 
- -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/runtime/core/exec_aten/TARGETS b/runtime/core/exec_aten/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/runtime/core/exec_aten/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/runtime/core/exec_aten/testing_util/TARGETS b/runtime/core/exec_aten/testing_util/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/runtime/core/exec_aten/testing_util/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/runtime/core/exec_aten/testing_util/test/TARGETS b/runtime/core/exec_aten/testing_util/test/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/runtime/core/exec_aten/testing_util/test/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/runtime/core/exec_aten/util/TARGETS b/runtime/core/exec_aten/util/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/runtime/core/exec_aten/util/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. 
- -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/runtime/platform/TARGETS b/runtime/platform/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/runtime/platform/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/runtime/platform/test/TARGETS b/runtime/platform/test/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/runtime/platform/test/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/schema/TARGETS b/schema/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/schema/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/schema/test/TARGETS b/schema/test/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/schema/test/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. 
- -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/test/TARGETS b/test/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/test/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() From 8b145b5fc6d3d18b3d5702ea0823402a7699f401 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Tue, 23 Jun 2026 15:21:25 -0700 Subject: [PATCH 6/7] [executorch][cuda] gemma4_31b: fuse gate/up MLP projections (default-on) Summary: Fuse each gemma4_31b MLP's gate_proj|up_proj into a single [2*intermediate, hidden] coalesced-int4 matmul, applied by default in the CUDA export. This issues one activation-quant + one W4A8 matvec per layer instead of two, cutting per-token launch + activation-quant overhead in the launch-bound decode path. Only Q4_K (CudaCoalescedInt4Tensor) gate/up pairs are fused; any other quant type (e.g. Q6_K) is left as two matmuls (guarded, still correct). Builds on the already-landed kv_len-bounded tq4_sdpa kernel + gemma4_31b call-site (kv_len + mask_is_causal), which recovered 128k decode from ~2.8 to ~43 tok/s. With both, ET gemma4_31b 128k+TurboQuant decode beats llama.cpp at every measured context (cuda_graph ON): ctx ET llama 512 44.80 42.77 2K 43.20 41.97 8K 42.23 41.23 32K 41.64 40.27 127K 38.41 35.97 TurboQuant KV compression kept; prefill restored (6-8x) with no regression; output quality preserved. Test Plan: - Fusion numerics: fused vs unfused MLP through the real W4A8 int4_plain_mm kernel = bit-exact (max_abs_diff 0.0, cos 1.000000) for decode (T=1) and prefill (T=4). - Export + run: fused module exported via CudaPartitioner and executed through executor_runner (RC=0, cos 0.999915 vs eager). Full 31B export logs "Fused gate+up on 60 MLP layers". 
- Decode A/B (gemma4_31b 128k+TQ, cuda_graph ON, 5x median): table above; beats llama.cpp at 512 -> 127K. nsys: tq4_sdpa 91.7% -> 2.9% of decode. --- .../gemma4_31b/cuda_source_transformations.py | 107 ++++++++++++++++++ examples/models/gemma4_31b/export.py | 9 +- 2 files changed, 111 insertions(+), 5 deletions(-) diff --git a/examples/models/gemma4_31b/cuda_source_transformations.py b/examples/models/gemma4_31b/cuda_source_transformations.py index 666d0c44e9d..6609178e084 100644 --- a/examples/models/gemma4_31b/cuda_source_transformations.py +++ b/examples/models/gemma4_31b/cuda_source_transformations.py @@ -30,6 +30,7 @@ import torch import torch.nn as nn +import torch.nn.functional as F from executorch.examples.models.gemma4.text_decoder import apply_rotary_emb from executorch.extension.llm.modules.turboquant import TurboQuantKVCache @@ -110,6 +111,105 @@ def _turboquant_attention_forward( return self.o_proj(y) +def _fused_mlp_forward(self, x: torch.Tensor) -> torch.Tensor: + """Drop-in ``Gemma4MLP.forward`` over a fused gate|up projection. + + Identical math to ``down(gelu(gate(x)) * up(x))``: the single + ``gate_up_proj`` emits ``[gate | up]`` concatenated on the last dim, + which is then split. One W4A8 matmul (and one activation-quant of ``x``) + instead of two. + """ + h = self.gate_up_proj(x) + gate = h[..., : self.intermediate_size] + up = h[..., self.intermediate_size :] + return self.down_proj(F.gelu(gate, approximate="tanh") * up) + + +def _concat_coalesced_int4_along_n(a, b): + """Concatenate two ``CudaCoalescedInt4Tensor`` along the output (N) dim. + + qdata is ``[N, K/2]`` and scale/zero_point are ``[N, n_groups]`` in the + coalesced layout, so a per-output-row concat on dim 0 is exact: the W4A8 + dp4a matvec reads each output row's qdata/scale/zero independently, so + out[:N_a] reproduces ``a`` and out[N_a:] reproduces ``b`` bit-for-bit. 
+ """ + from executorch.backends.cuda.coalesced_int4_tensor import CudaCoalescedInt4Tensor + + return CudaCoalescedInt4Tensor( + torch.cat([a.qdata, b.qdata], dim=0), + torch.cat([a.scale, b.scale], dim=0), + torch.cat([a.zero_point, b.zero_point], dim=0), + a.block_size, + torch.Size([a.shape[0] + b.shape[0], a.shape[1]]), + None, + a.activation_dtype, + ) + + +def _is_fuseable_int4_pair(gate_w, up_w) -> bool: + """True iff gate/up are both coalesced-int4 with matching K + block_size. + + Q4_K MLP weights become ``CudaCoalescedInt4Tensor`` (fuseable); a Q6_K + weight becomes ``CudaDp4aPlanarInt6Tensor`` (left alone). ``act_pre_scale`` + is unused on this path but we require it absent so the concat stays exact. + """ + from executorch.backends.cuda.coalesced_int4_tensor import CudaCoalescedInt4Tensor + + return ( + isinstance(gate_w, CudaCoalescedInt4Tensor) + and isinstance(up_w, CudaCoalescedInt4Tensor) + and list(gate_w.block_size) == list(up_w.block_size) + and gate_w.shape[1] == up_w.shape[1] + and gate_w.act_pre_scale is None + and up_w.act_pre_scale is None + ) + + +def _fuse_gate_up_proj(model: nn.Module) -> None: + """Fuse each MLP's ``gate_proj | up_proj`` into one ``gate_up_proj``. + + gate and up share the same input, so the unfused path quantizes ``x`` to + int8 twice and launches two W4A8 matvecs per layer. Fusing the weights + into one ``[2*inter, hidden]`` tensor halves both. Weight bytes read are + unchanged, so the win is launch + activation-quant overhead (decode is + launch-bound). Only Q4_K (coalesced-int4) layers are fused; any layer + with a non-int4 weight is left as two matmuls (still correct). + + Must run AFTER weights are packed to ``CudaCoalescedInt4Tensor`` (i.e. + inside ``_export_cuda``), and is independent of TurboQuant. 
+ """ + n_fused = 0 + n_skipped = 0 + for layer in model.layers: + mlp = getattr(layer, "mlp", None) + if mlp is None or not (hasattr(mlp, "gate_proj") and hasattr(mlp, "up_proj")): + continue + gate_w = mlp.gate_proj.weight + up_w = mlp.up_proj.weight + if not _is_fuseable_int4_pair(gate_w, up_w): + n_skipped += 1 + continue + inter = up_w.shape[0] + hidden = up_w.shape[1] + fused_w = _concat_coalesced_int4_along_n(gate_w, up_w) + + # Container built on meta to avoid materializing a dense + # [2*inter, hidden] weight before we overwrite it with fused_w. + gate_up = nn.Linear(hidden, 2 * inter, bias=False, device="meta") + gate_up.weight = nn.Parameter(fused_w, requires_grad=False) + mlp.gate_up_proj = gate_up + mlp.intermediate_size = inter + del mlp.gate_proj + del mlp.up_proj + mlp.forward = types.MethodType(_fused_mlp_forward, mlp) + n_fused += 1 + + msg = f"[gemma4_31b cuda] Fused gate+up on {n_fused} MLP layers" + if n_skipped: + msg += f" ({n_skipped} skipped: non-int4 weights)" + print(msg) + + def cuda_source_transformations( model: nn.Module, *, @@ -117,6 +217,11 @@ def cuda_source_transformations( ) -> None: """Apply CUDA source transformations to a Gemma 4 31B model in place. + Always fuses each MLP's ``gate_proj|up_proj`` into a single matmul (one + activation-quant + one W4A8 matvec per layer instead of two; Q4_K + coalesced-int4 layers only — other quant types are left untouched). + Optionally also swaps full-attention KV caches for TurboQuant TQ4. + Args: model: ``Gemma4_31B`` instance to transform. use_turboquant: When True, swap full-attention layers' KV caches @@ -125,6 +230,8 @@ def cuda_source_transformations( ``torch.ops.triton.tq4_sdpa``. Sliding-window layers are unaffected. 
""" + _fuse_gate_up_proj(model) + if not use_turboquant: return diff --git a/examples/models/gemma4_31b/export.py b/examples/models/gemma4_31b/export.py index d9e16bc34df..b2b2264178a 100644 --- a/examples/models/gemma4_31b/export.py +++ b/examples/models/gemma4_31b/export.py @@ -182,12 +182,11 @@ def _export_cuda( materialize_runtime_buffers(model, dtype=torch.bfloat16) - if use_turboquant: - from executorch.examples.models.gemma4_31b.cuda_source_transformations import ( - cuda_source_transformations, - ) + from executorch.examples.models.gemma4_31b.cuda_source_transformations import ( + cuda_source_transformations, + ) - cuda_source_transformations(model, use_turboquant=True) + cuda_source_transformations(model, use_turboquant=use_turboquant) # Int4Tensor weights are used directly — no format conversion. # F.linear dispatches to executorch_cuda::int4_plain_mm (CUDA shim). From 638f07ae1d3aad4f00122217dde062d5a0a4b3a8 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Tue, 23 Jun 2026 17:08:39 -0700 Subject: [PATCH 7/7] [executorch][gemma4] fuse MLP gate/up at GGUF load (single point, cuda+mlx) Summary: Move the gemma4 MLP gate_proj|up_proj fusion to a single backend-agnostic point in the GGUF loader, and make the model forward consume it. Supersedes the earlier CUDA-only export-time fusion (reverted here). - gguf_loader.py: before any backend conversion (_convert_weight), buffer each layer's raw gate/up ExportableGGUFTensor and, once both arrive, row-concat their raw GGUF blocks along the output dim into one fused gate_up ExportableGGUFTensor (gate rows then up rows). Both backends then pack the already-fused weight with NO per-type concat: CUDA (Q4_K -> CudaCoalescedInt4Tensor, Q6_K -> CudaDp4aPlanarInt6Tensor) and MLX (ExportableGGUFTensor). Guards: same ggml_type + K; non-fuseable pairs and unpaired leftovers fall through unfused. 
- Gemma4MLP: when a fused gate_up_proj is present, run one matmul and split the [.., 2*intermediate_size] output back into gate/up; otherwise use the separate projections. The shared MLP stays safe for unfused checkpoints and the prequant/HF load paths (no gate_up_proj -> original path, no crash). - Revert the previous CUDA-localized fusion (cuda_source_transformations.py and export.py back to their original form). The kv_len-bounded tq4_sdpa kernel + call-site (already on main) are unchanged. Single fusion point widens applicability (CUDA + MLX, incl. Q6_K) and keeps the model def backend-agnostic. Decode win is unchanged (same fused matmul, produced at load instead of at export). Test Plan: - Raw concat (real GGUF blk.0 ffn, q4_k): fused.dequantize() == [gate; up] stacked, bit-exact; fused CudaCoalescedInt4Tensor rows [:N]/[N:] qdata+scale+zero bit-identical to gate/up. - Model-def fused vs unfused forward through real W4A8 int4_plain_mm: decode (T=1) bit-exact (cos 1.000000); prefill (T=4) cos 0.999988 -- the only delta is cuBLAS GEMM shape-dependent fp ordering (N=43008 vs 21504, identical weights), benign and inherent to any gate/up fusion. - Full CUDA GGUF export (gemma4_31b, --turboquant, max-seq-len 131072): loader logs "Fused gate+up on 60 MLP layers", TurboQuant swaps 10 layers, AOTI build clean (model.pte + 26.18GB aoti_cuda_blob.ptd, "Done."). - Decode via gemma4_31b_runner on the new build: coherent output, no NaN; prefill 1375 tok/s, decode 38.3 tok/s (sanity run without cuda_graph).
--- .../text_decoder/gemma4_decoder_layer.py | 17 ++- .../gemma4_31b/cuda_source_transformations.py | 107 ---------------- examples/models/gemma4_31b/export.py | 9 +- examples/models/gemma4_31b/gguf_loader.py | 119 ++++++++++++++++++ 4 files changed, 138 insertions(+), 114 deletions(-) diff --git a/examples/models/gemma4/text_decoder/gemma4_decoder_layer.py b/examples/models/gemma4/text_decoder/gemma4_decoder_layer.py index e10c1c7e415..fe3e3bb94cb 100644 --- a/examples/models/gemma4/text_decoder/gemma4_decoder_layer.py +++ b/examples/models/gemma4/text_decoder/gemma4_decoder_layer.py @@ -34,14 +34,25 @@ class Gemma4MLP(nn.Module): def __init__(self, hidden_size: int, intermediate_size: int): super().__init__() + self.intermediate_size = intermediate_size self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False) self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False) self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False) def forward(self, x: torch.Tensor) -> torch.Tensor: - return self.down_proj( - F.gelu(self.gate_proj(x), approximate="tanh") * self.up_proj(x) - ) + # If a loader fused gate_proj|up_proj into one gate_up_proj (single + # matmul; e.g. the GGUF loader's coalesced fusion), use it and split the + # [.., 2*intermediate_size] output back into gate/up. Otherwise fall back + # to the separate projections (unfused checkpoints / non-fusing loaders). 
+ gate_up = getattr(self, "gate_up_proj", None) + if gate_up is not None: + fused = gate_up(x) + gate = fused[..., : self.intermediate_size] + up = fused[..., self.intermediate_size :] + else: + gate = self.gate_proj(x) + up = self.up_proj(x) + return self.down_proj(F.gelu(gate, approximate="tanh") * up) class Gemma4DecoderLayer(nn.Module): diff --git a/examples/models/gemma4_31b/cuda_source_transformations.py b/examples/models/gemma4_31b/cuda_source_transformations.py index 6609178e084..666d0c44e9d 100644 --- a/examples/models/gemma4_31b/cuda_source_transformations.py +++ b/examples/models/gemma4_31b/cuda_source_transformations.py @@ -30,7 +30,6 @@ import torch import torch.nn as nn -import torch.nn.functional as F from executorch.examples.models.gemma4.text_decoder import apply_rotary_emb from executorch.extension.llm.modules.turboquant import TurboQuantKVCache @@ -111,105 +110,6 @@ def _turboquant_attention_forward( return self.o_proj(y) -def _fused_mlp_forward(self, x: torch.Tensor) -> torch.Tensor: - """Drop-in ``Gemma4MLP.forward`` over a fused gate|up projection. - - Identical math to ``down(gelu(gate(x)) * up(x))``: the single - ``gate_up_proj`` emits ``[gate | up]`` concatenated on the last dim, - which is then split. One W4A8 matmul (and one activation-quant of ``x``) - instead of two. - """ - h = self.gate_up_proj(x) - gate = h[..., : self.intermediate_size] - up = h[..., self.intermediate_size :] - return self.down_proj(F.gelu(gate, approximate="tanh") * up) - - -def _concat_coalesced_int4_along_n(a, b): - """Concatenate two ``CudaCoalescedInt4Tensor`` along the output (N) dim. - - qdata is ``[N, K/2]`` and scale/zero_point are ``[N, n_groups]`` in the - coalesced layout, so a per-output-row concat on dim 0 is exact: the W4A8 - dp4a matvec reads each output row's qdata/scale/zero independently, so - out[:N_a] reproduces ``a`` and out[N_a:] reproduces ``b`` bit-for-bit. 
- """ - from executorch.backends.cuda.coalesced_int4_tensor import CudaCoalescedInt4Tensor - - return CudaCoalescedInt4Tensor( - torch.cat([a.qdata, b.qdata], dim=0), - torch.cat([a.scale, b.scale], dim=0), - torch.cat([a.zero_point, b.zero_point], dim=0), - a.block_size, - torch.Size([a.shape[0] + b.shape[0], a.shape[1]]), - None, - a.activation_dtype, - ) - - -def _is_fuseable_int4_pair(gate_w, up_w) -> bool: - """True iff gate/up are both coalesced-int4 with matching K + block_size. - - Q4_K MLP weights become ``CudaCoalescedInt4Tensor`` (fuseable); a Q6_K - weight becomes ``CudaDp4aPlanarInt6Tensor`` (left alone). ``act_pre_scale`` - is unused on this path but we require it absent so the concat stays exact. - """ - from executorch.backends.cuda.coalesced_int4_tensor import CudaCoalescedInt4Tensor - - return ( - isinstance(gate_w, CudaCoalescedInt4Tensor) - and isinstance(up_w, CudaCoalescedInt4Tensor) - and list(gate_w.block_size) == list(up_w.block_size) - and gate_w.shape[1] == up_w.shape[1] - and gate_w.act_pre_scale is None - and up_w.act_pre_scale is None - ) - - -def _fuse_gate_up_proj(model: nn.Module) -> None: - """Fuse each MLP's ``gate_proj | up_proj`` into one ``gate_up_proj``. - - gate and up share the same input, so the unfused path quantizes ``x`` to - int8 twice and launches two W4A8 matvecs per layer. Fusing the weights - into one ``[2*inter, hidden]`` tensor halves both. Weight bytes read are - unchanged, so the win is launch + activation-quant overhead (decode is - launch-bound). Only Q4_K (coalesced-int4) layers are fused; any layer - with a non-int4 weight is left as two matmuls (still correct). - - Must run AFTER weights are packed to ``CudaCoalescedInt4Tensor`` (i.e. - inside ``_export_cuda``), and is independent of TurboQuant. 
- """ - n_fused = 0 - n_skipped = 0 - for layer in model.layers: - mlp = getattr(layer, "mlp", None) - if mlp is None or not (hasattr(mlp, "gate_proj") and hasattr(mlp, "up_proj")): - continue - gate_w = mlp.gate_proj.weight - up_w = mlp.up_proj.weight - if not _is_fuseable_int4_pair(gate_w, up_w): - n_skipped += 1 - continue - inter = up_w.shape[0] - hidden = up_w.shape[1] - fused_w = _concat_coalesced_int4_along_n(gate_w, up_w) - - # Container built on meta to avoid materializing a dense - # [2*inter, hidden] weight before we overwrite it with fused_w. - gate_up = nn.Linear(hidden, 2 * inter, bias=False, device="meta") - gate_up.weight = nn.Parameter(fused_w, requires_grad=False) - mlp.gate_up_proj = gate_up - mlp.intermediate_size = inter - del mlp.gate_proj - del mlp.up_proj - mlp.forward = types.MethodType(_fused_mlp_forward, mlp) - n_fused += 1 - - msg = f"[gemma4_31b cuda] Fused gate+up on {n_fused} MLP layers" - if n_skipped: - msg += f" ({n_skipped} skipped: non-int4 weights)" - print(msg) - - def cuda_source_transformations( model: nn.Module, *, @@ -217,11 +117,6 @@ def cuda_source_transformations( ) -> None: """Apply CUDA source transformations to a Gemma 4 31B model in place. - Always fuses each MLP's ``gate_proj|up_proj`` into a single matmul (one - activation-quant + one W4A8 matvec per layer instead of two; Q4_K - coalesced-int4 layers only — other quant types are left untouched). - Optionally also swaps full-attention KV caches for TurboQuant TQ4. - Args: model: ``Gemma4_31B`` instance to transform. use_turboquant: When True, swap full-attention layers' KV caches @@ -230,8 +125,6 @@ def cuda_source_transformations( ``torch.ops.triton.tq4_sdpa``. Sliding-window layers are unaffected. 
""" - _fuse_gate_up_proj(model) - if not use_turboquant: return diff --git a/examples/models/gemma4_31b/export.py b/examples/models/gemma4_31b/export.py index b2b2264178a..d9e16bc34df 100644 --- a/examples/models/gemma4_31b/export.py +++ b/examples/models/gemma4_31b/export.py @@ -182,11 +182,12 @@ def _export_cuda( materialize_runtime_buffers(model, dtype=torch.bfloat16) - from executorch.examples.models.gemma4_31b.cuda_source_transformations import ( - cuda_source_transformations, - ) + if use_turboquant: + from executorch.examples.models.gemma4_31b.cuda_source_transformations import ( + cuda_source_transformations, + ) - cuda_source_transformations(model, use_turboquant=use_turboquant) + cuda_source_transformations(model, use_turboquant=True) # Int4Tensor weights are used directly — no format conversion. # F.linear dispatches to executorch_cuda::int4_plain_mm (CUDA shim). diff --git a/examples/models/gemma4_31b/gguf_loader.py b/examples/models/gemma4_31b/gguf_loader.py index 90839ea6f6a..6a4a70ced18 100644 --- a/examples/models/gemma4_31b/gguf_loader.py +++ b/examples/models/gemma4_31b/gguf_loader.py @@ -104,6 +104,89 @@ def _convert_weight(model, model_key: str, gtensor, backend: str): return gtensor +# --------------------------------------------------------------------------- +# Single-point gate/up fusion (backend-agnostic, at the raw GGUF level) +# +# gate_proj and up_proj share the same input, so the MLP can issue ONE matmul +# over a [2*intermediate, hidden] weight instead of two. We fuse here -- before +# any backend conversion (_convert_weight) -- by concatenating the two raw GGUF +# block blobs along the output (row) dim. ExportableGGUFTensor.raw is +# (N, row_bytes) row-major with each output row self-contained, so the concat is +# an exact row-stack (no re-quant, no scale recompute). Both CUDA and MLX then +# pack the already-fused weight, so there is no per-backend-type concat. 
The +# model's Gemma4MLP.forward splits the [.., 2*intermediate] output back into +# gate/up only when a fused gate_up_proj is present (graceful for unfused loads). + + +def _gate_up_layer_kind(model_key: str): + """If ``model_key`` is an MLP gate/up proj weight, return ``(layer_idx, kind)`` + with ``kind`` in ``{"gate", "up"}``; otherwise ``None``.""" + prefix = "layers." + for kind in ("gate", "up"): + suffix = f".mlp.{kind}_proj.weight" + if model_key.startswith(prefix) and model_key.endswith(suffix): + mid = model_key[len(prefix) : len(model_key) - len(suffix)] + if mid.isdigit(): + return int(mid), kind + return None + + +def _gate_up_fuseable(gate, up) -> bool: + """True iff gate/up are the same GGUF quant type and same packed row width + (hence same K + block layout), so a row-concat along output N is valid.""" + return ( + gate.ggml_type == up.ggml_type + and gate.raw.shape[1] == up.raw.shape[1] + and int(gate.shape[1]) == int(up.shape[1]) + ) + + +def _fuse_gate_up_raw(gate, up): + """Row-concat gate|up raw GGUF blocks (gate rows first) into one fused + ExportableGGUFTensor of shape (2*N, K).""" + from executorch.extension.llm.export.gguf import ExportableGGUFTensor + + fused_raw = torch.cat([gate.raw, up.raw], dim=0) + return ExportableGGUFTensor.from_raw(fused_raw, gate.ggml_type, gate.orig_dtype) + + +def _assign_gate_up_unfused(model, layer_idx, kind, gtensor, backend, packers): + """Assign a single gate/up GGUF tensor to its own projection (no fusion).""" + from executorch.examples.models.gemma4_31b.quant import pack_one + + key = f"layers.{layer_idx}.mlp.{kind}_proj.weight" + pack_one(model, key, _convert_weight(model, key, gtensor, backend), packers) + + +def _install_and_pack_fused_gate_up(model, layer_idx, gate, up, backend, packers): + """Fuse gate|up at the raw level, swap the layer's MLP to a single + ``gate_up_proj`` (dropping gate_proj/up_proj), then pack the fused weight.""" + import torch.nn as nn + + from 
executorch.examples.models.gemma4_31b.quant import pack_one + + fused = _fuse_gate_up_raw(gate, up) + inter, hidden = int(gate.shape[0]), int(gate.shape[1]) + + mlp = model.get_submodule(f"layers.{layer_idx}.mlp") + mlp.gate_up_proj = nn.Linear(hidden, 2 * inter, bias=False, device="meta") + del mlp.gate_proj + del mlp.up_proj + + key = f"layers.{layer_idx}.mlp.gate_up_proj.weight" + pack_one(model, key, _convert_weight(model, key, fused, backend), packers) + + +def _process_gate_up_pair(model, layer_idx, gate, up, backend, packers) -> bool: + """Fuse gate|up if compatible (returns True), else assign them unfused.""" + if _gate_up_fuseable(gate, up): + _install_and_pack_fused_gate_up(model, layer_idx, gate, up, backend, packers) + return True + _assign_gate_up_unfused(model, layer_idx, "gate", gate, backend, packers) + _assign_gate_up_unfused(model, layer_idx, "up", up, backend, packers) + return False + + def _resolve_tied_lm_head(model, lm_head_weight, packers): """Assign a tied lm_head (GGUF ties it to the token embedding).""" from executorch.examples.models.gemma4_31b.quant import pack_one @@ -217,11 +300,32 @@ def load_gguf_model( n_processed = 0 print(f"Streaming GGUF from {gguf_path}...") + pending_gate_up: dict = {} # layer_idx -> {"gate": raw, "up": raw} + n_fused = 0 + n_unfused = 0 for gguf_name, value in iter_gguf(gguf_path): model_key = gguf_to_model_key(gguf_name) if model_key is None: continue + # Buffer the RAW gate/up ExportableGGUFTensor (pre-conversion) and fuse + # once both arrive -- the single common point upstream of _convert_weight. 
+ gu = _gate_up_layer_kind(model_key) + if gu is not None and isinstance(value, ExportableGGUFTensor): + layer_idx, kind = gu + slot = pending_gate_up.setdefault(layer_idx, {}) + slot[kind] = value + if "gate" in slot and "up" in slot: + if _process_gate_up_pair( + model, layer_idx, slot["gate"], slot["up"], backend, packers + ): + n_fused += 1 + else: + n_unfused += 1 + pending_gate_up.pop(layer_idx, None) + n_processed += 2 + continue + if isinstance(value, ExportableGGUFTensor): weight = _convert_weight(model, model_key, value, backend) if model_key == "embed_tokens.weight": @@ -238,6 +342,21 @@ def load_gguf_model( if n_processed % 100 == 0: print(f" Processed {n_processed} tensors...") + # Flush any unpaired gate/up (partial/malformed) as separate unfused + # projections so no weight is left on meta. + for layer_idx, slot in pending_gate_up.items(): + for kind in ("gate", "up"): + if kind in slot: + _assign_gate_up_unfused( + model, layer_idx, kind, slot[kind], backend, packers + ) + n_unfused += 1 + + print( + f"[gemma4_31b gguf] Fused gate+up on {n_fused} MLP layers" + + (f" ({n_unfused} left unfused)" if n_unfused else "") + ) + _resolve_tied_lm_head(model, lm_head_weight, packers) # Fill RoPE tables / KV caches / scalar constants (left on meta by the