From 2e0ac267b3c32bfe7d5f605d8ff3469c41a9db7b Mon Sep 17 00:00:00 2001
From: Mergen Nachin <mnachin@meta.com>
Date: Tue, 23 Jun 2026 13:42:34 -0700
Subject: [PATCH] gemma4_31b: add OpenAI serving entrypoint

Add Gemma4 31B model-specific serving support on top of the shared examples/llm_server harness.

This extracts the existing runner flow into a small Gemma4_31BEngine/LLMSession adapter, keeps main.cpp as a thin runner wrapper, and adds a C++ JSONL worker plus Python OpenAI-compatible launcher. The generic server remains model-agnostic; Gemma-specific behavior stays in examples/models/gemma4_31b, including chat-template options, BOS handling, channel cleanup, and Gemma tool-call parsing.

Also wire the worker into the existing Gemma CUDA/MLX CMake presets and Makefile targets, document the serving harness usage, and add validation coverage: hermetic launcher tests, an opt-in on-device BOS/template regression test, and a CUDA no-bleed integration proof for interleaved multi-session execution.
---
 Makefile                                      |  15 +-
 examples/models/gemma4_31b/CMakeLists.txt     |  32 +-
 examples/models/gemma4_31b/CMakePresets.json  |  20 +-
 examples/models/gemma4_31b/README.md          |  65 +-
 examples/models/gemma4_31b/export.py          |   9 +
 .../models/gemma4_31b/gemma4_31b_engine.cpp   | 737 ++++++++++++++++++
 .../models/gemma4_31b/gemma4_31b_engine.h     | 128 +++
 .../models/gemma4_31b/gemma4_31b_worker.cpp   |  63 ++
 examples/models/gemma4_31b/main.cpp           | 448 ++---------
 examples/models/gemma4_31b/serve.py           | 214 +++++
 .../gemma4_31b/test_gemma4_31b_nobleed.cpp    | 261 +++++++
 .../gemma4_31b/test_ondevice_serving.py       |  62 ++
 examples/models/gemma4_31b/test_serve.py      | 184 +++++
 13 files changed, 1846 insertions(+), 392 deletions(-)
 create mode 100644 examples/models/gemma4_31b/gemma4_31b_engine.cpp
 create mode 100644 examples/models/gemma4_31b/gemma4_31b_engine.h
 create mode 100644 examples/models/gemma4_31b/gemma4_31b_worker.cpp
 create mode 100644 examples/models/gemma4_31b/serve.py
 create mode 100644 examples/models/gemma4_31b/test_gemma4_31b_nobleed.cpp
 create mode 100644 examples/models/gemma4_31b/test_ondevice_serving.py
 create mode 100644 examples/models/gemma4_31b/test_serve.py

diff --git a/Makefile b/Makefile
index 552bbf89bd7..969b53644cd 100644
--- a/Makefile
+++ b/Makefile
@@ -127,8 +127,8 @@ help:
 	@echo "  llava-cpu           - Build Llava runner with CPU backend"
 	@echo "  gemma3-cuda         - Build Gemma3 runner with CUDA backend"
 	@echo "  gemma3-cpu          - Build Gemma3 runner with CPU backend"
-	@echo "  gemma4_31b-cuda     - Build Gemma 4 31B runner with CUDA backend"
-	@echo "  gemma4_31b-mlx      - Build Gemma 4 31B runner with MLX backend"
+	@echo "  gemma4_31b-cuda     - Build Gemma 4 31B runner and worker with CUDA backend"
+	@echo "  gemma4_31b-mlx      - Build Gemma 4 31B runner and worker with MLX backend"
 	@echo "  qwen3_5_moe-cuda    - Build Qwen3.5 MoE runner with CUDA backend"
 	@echo "  qwen3_5_moe-metal   - Build Qwen3.5 MoE runner with Metal backend"
 	@echo "  qwen3_5_moe-mlx     - Build Qwen3.5 MoE runner with MLX backend"
@@ -444,20 +444,23 @@ qwen3_5_moe-cuda:
 gemma4_31b-cuda:
 	@echo "==> Building and installing ExecuTorch with CUDA..."
 	cmake --workflow --preset llm-release-cuda
-	@echo "==> Building Gemma 4 31B runner with CUDA..."
+	@echo "==> Building Gemma 4 31B runner, worker, and no-bleed test with CUDA..."
 	cd examples/models/gemma4_31b && cmake --workflow --preset gemma4-31b-cuda
 	@echo ""
 	@echo "✓ Build complete!"
-	@echo "  Binary: cmake-out/examples/models/gemma4_31b/gemma4_31b_runner"
+	@echo "  Runner: cmake-out/examples/models/gemma4_31b/gemma4_31b_runner"
+	@echo "  Worker: cmake-out/examples/models/gemma4_31b/gemma4_31b_worker"
+	@echo "  Test:   cmake-out/examples/models/gemma4_31b/test_gemma4_31b_nobleed"
 
 gemma4_31b-mlx:
 	@echo "==> Building and installing ExecuTorch with MLX..."
 	cmake --workflow --preset mlx-release
-	@echo "==> Building Gemma 4 31B runner with MLX..."
+	@echo "==> Building Gemma 4 31B runner and worker with MLX..."
 	cd examples/models/gemma4_31b && cmake --workflow --preset gemma4-31b-mlx
 	@echo ""
 	@echo "✓ Build complete!"
-	@echo "  Binary: cmake-out/examples/models/gemma4_31b/gemma4_31b_runner"
+	@echo "  Runner: cmake-out/examples/models/gemma4_31b/gemma4_31b_runner"
+	@echo "  Worker: cmake-out/examples/models/gemma4_31b/gemma4_31b_worker"
 
 qwen3_5_moe-metal:
 	@echo "==> Building and installing ExecuTorch with Metal..."
diff --git a/examples/models/gemma4_31b/CMakeLists.txt b/examples/models/gemma4_31b/CMakeLists.txt
index 52419eb95bc..c740de13c36 100644
--- a/examples/models/gemma4_31b/CMakeLists.txt
+++ b/examples/models/gemma4_31b/CMakeLists.txt
@@ -15,6 +15,9 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
 include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
 
 set(_common_include_directories ${EXECUTORCH_ROOT}/..)
+set(_json_include
+    ${EXECUTORCH_ROOT}/extension/llm/tokenizers/third-party/json/single_include
+)
 
 # gflags
 set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/gflags)
@@ -51,6 +54,7 @@ if(EXECUTORCH_BUILD_CUDA)
 elseif(TARGET mlxdelegate)
   list(APPEND link_libraries mlxdelegate mlx)
   executorch_target_link_options_shared_lib(mlxdelegate)
+  add_compile_definitions(EXECUTORCH_BUILD_MLX)
 else()
   message(FATAL_ERROR "Set EXECUTORCH_BUILD_CUDA=ON or EXECUTORCH_BUILD_MLX=ON")
 endif()
@@ -58,19 +62,43 @@ endif()
 # Tokenizer (HuggingFace tokenizer.json)
 list(APPEND link_libraries tokenizers::tokenizers)
 
-add_executable(gemma4_31b_runner main.cpp)
+add_executable(gemma4_31b_runner main.cpp gemma4_31b_engine.cpp)
 target_include_directories(
-  gemma4_31b_runner PUBLIC ${_common_include_directories}
+  gemma4_31b_runner PUBLIC ${_common_include_directories} ${_json_include}
 )
 target_link_libraries(gemma4_31b_runner PUBLIC ${link_libraries})
 
+add_executable(gemma4_31b_worker gemma4_31b_worker.cpp gemma4_31b_engine.cpp)
+target_include_directories(
+  gemma4_31b_worker PUBLIC ${_common_include_directories} ${_json_include}
+)
+target_link_libraries(gemma4_31b_worker PUBLIC ${link_libraries})
+
 if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
   target_link_options_gc_sections(gemma4_31b_runner)
   if(NOT APPLE AND NOT MSVC)
     target_link_options(gemma4_31b_runner PRIVATE "LINKER:-s")
   endif()
+  target_link_options_gc_sections(gemma4_31b_worker)
+  if(NOT APPLE AND NOT MSVC)
+    target_link_options(gemma4_31b_worker PRIVATE "LINKER:-s")
+  endif()
 endif()
 
 if(TARGET mlxdelegate)
   executorch_target_copy_mlx_metallib(gemma4_31b_runner)
+  executorch_target_copy_mlx_metallib(gemma4_31b_worker)
+endif()
+
+if(EXECUTORCH_BUILD_CUDA)
+  enable_testing()
+  add_executable(
+    test_gemma4_31b_nobleed test_gemma4_31b_nobleed.cpp gemma4_31b_engine.cpp
+  )
+  target_include_directories(
+    test_gemma4_31b_nobleed PUBLIC ${_common_include_directories}
+                                   ${_json_include}
+  )
+  target_link_libraries(test_gemma4_31b_nobleed PUBLIC ${link_libraries})
+  add_test(NAME gemma4_31b_nobleed COMMAND test_gemma4_31b_nobleed)
 endif()
diff --git a/examples/models/gemma4_31b/CMakePresets.json b/examples/models/gemma4_31b/CMakePresets.json
index 23a7d42e035..7bbdc82c81a 100644
--- a/examples/models/gemma4_31b/CMakePresets.json
+++ b/examples/models/gemma4_31b/CMakePresets.json
@@ -13,7 +13,7 @@
         },
         {
             "name": "gemma4-31b-cuda",
-            "displayName": "Gemma 4 31B runner (CUDA)",
+            "displayName": "Gemma 4 31B runner and worker (CUDA)",
             "inherits": ["gemma4-31b-base"],
             "cacheVariables": {
                 "EXECUTORCH_BUILD_CUDA": "ON"
@@ -26,7 +26,7 @@
         },
         {
             "name": "gemma4-31b-mlx",
-            "displayName": "Gemma 4 31B runner (MLX)",
+            "displayName": "Gemma 4 31B runner and worker (MLX)",
             "inherits": ["gemma4-31b-base"],
             "cacheVariables": {},
             "condition": {
@@ -39,21 +39,25 @@
     "buildPresets": [
         {
             "name": "gemma4-31b-cuda",
-            "displayName": "Build Gemma 4 31B runner (CUDA)",
+            "displayName": "Build Gemma 4 31B runner, worker, and no-bleed test (CUDA)",
             "configurePreset": "gemma4-31b-cuda",
-            "targets": ["gemma4_31b_runner"]
+            "targets": [
+                "gemma4_31b_runner",
+                "gemma4_31b_worker",
+                "test_gemma4_31b_nobleed"
+            ]
         },
         {
             "name": "gemma4-31b-mlx",
-            "displayName": "Build Gemma 4 31B runner (MLX)",
+            "displayName": "Build Gemma 4 31B runner and worker (MLX)",
             "configurePreset": "gemma4-31b-mlx",
-            "targets": ["gemma4_31b_runner"]
+            "targets": ["gemma4_31b_runner", "gemma4_31b_worker"]
         }
     ],
     "workflowPresets": [
         {
             "name": "gemma4-31b-cuda",
-            "displayName": "Configure and build Gemma 4 31B runner (CUDA)",
+            "displayName": "Configure and build Gemma 4 31B runner and worker (CUDA)",
             "steps": [
                 {
                     "type": "configure",
@@ -67,7 +71,7 @@
         },
         {
             "name": "gemma4-31b-mlx",
-            "displayName": "Configure and build Gemma 4 31B runner (MLX)",
+            "displayName": "Configure and build Gemma 4 31B runner and worker (MLX)",
             "steps": [
                 {
                     "type": "configure",
diff --git a/examples/models/gemma4_31b/README.md b/examples/models/gemma4_31b/README.md
index 482f64083a0..eeb4fa65545 100644
--- a/examples/models/gemma4_31b/README.md
+++ b/examples/models/gemma4_31b/README.md
@@ -153,14 +153,17 @@ python examples/models/gemma4_31b/inference.py \
 Useful before spending the export+lowering time to confirm the quantized
 model produces sensible text.
 
-## Build the runner
+## Build the runner and worker
 
 ```bash
 make gemma4_31b-cuda   # Linux — CUDA backend
 make gemma4_31b-mlx    # macOS — MLX backend (Apple Silicon)
 ```
 
-The binary lands at `cmake-out/examples/models/gemma4_31b/gemma4_31b_runner`.
+The binaries land at:
+
+- `cmake-out/examples/models/gemma4_31b/gemma4_31b_runner`
+- `cmake-out/examples/models/gemma4_31b/gemma4_31b_worker`
 
 ## Run the .pte
 
@@ -179,3 +182,61 @@ Pass `--raw_prompt` to skip template wrapping for pre-formatted input.
 
 For benchmarking, add `--cuda_graph` to capture the decode method in a CUDA
 graph (decode is fully static — `T=1`).
+
+## OpenAI-compatible serving harness
+
+The serving path is a test harness for local-agent workflows. Python owns HTTP,
+chat templating, request validation, and tool parsing; the C++ worker owns model
+loading, prefill/decode, and session state. Use the runner or engine/session API
+directly for production integrations.
+
+### CUDA
+
+```bash
+python -m executorch.examples.models.gemma4_31b.serve \
+    --model-path ./gemma4_31b_exports/model.pte \
+    --data-path ./gemma4_31b_exports/aoti_cuda_blob.ptd \
+    --tokenizer-path ./gemma4_31b_int4/tokenizer.json \
+    --hf-tokenizer ./gemma4_31b_int4 \
+    --model-id gemma4_31b \
+    --max-context 4096 \
+    --max-sessions 4 \
+    --host 127.0.0.1 \
+    --port 8000
+```
+
+### MLX
+
+```bash
+python -m executorch.examples.models.gemma4_31b.serve \
+    --model-path ./gemma4_31b_exports_mlx/model.pte \
+    --tokenizer-path ./gemma4_31b_int4/tokenizer.json \
+    --hf-tokenizer ./gemma4_31b_int4 \
+    --model-id gemma4_31b \
+    --max-context 4096 \
+    --max-sessions 4 \
+    --host 127.0.0.1 \
+    --port 8000
+```
+
+Named sessions use one loaded model with isolated mutable state when the backend
+supports it. Set `--max-sessions >= 2` and send a stable `session_id` (or one of
+the supported affinity headers) to enable separate conversations and warm
+append-only resume. One capacity slot is reserved for anonymous requests.
+
+The default parser is Gemma's tool-call format. Use `--tool-parser hermes`,
+`--tool-parser qwen`, or `--tool-parser none` if a different prompt/template
+emits another format.
+
+### CUDA no-bleed test
+
+The CUDA build also produces `test_gemma4_31b_nobleed`, which validates that
+two sessions can interleave prefill/decode on one loaded model without sharing
+mutable state:
+
+```bash
+GEMMA_MODEL_PATH=gemma4_31b_exports/model.pte \
+GEMMA_DATA_PATH=gemma4_31b_exports/aoti_cuda_blob.ptd \
+GEMMA_TOKENIZER_PATH=gemma4_31b_int4/tokenizer.json \
+  cmake-out/examples/models/gemma4_31b/test_gemma4_31b_nobleed
+```
diff --git a/examples/models/gemma4_31b/export.py b/examples/models/gemma4_31b/export.py
index d9e16bc34df..2a3ae9c9364 100644
--- a/examples/models/gemma4_31b/export.py
+++ b/examples/models/gemma4_31b/export.py
@@ -24,6 +24,7 @@
 """
 
 import argparse
+import json
 import os
 
 import torch
@@ -135,6 +136,11 @@ def _pack_for_backend(model: nn.Module, path: str, backend: str) -> None:
 # Export + lower
 
 
+def _mutable_buffer_metadata(model: nn.Module) -> str:
+    mutable = [name for name, _ in model.named_buffers() if ".kv_cache." in name]
+    return json.dumps({"version": 1, "mutable_buffers": mutable})
+
+
 def export_and_lower(
     model: Gemma4_31B,
     config: Gemma4_31BConfig,
@@ -181,6 +187,7 @@ def _export_cuda(
     import executorch.backends.cuda.quantize_op_dispatch  # noqa: F401
 
     materialize_runtime_buffers(model, dtype=torch.bfloat16)
+    mutable_buffer_metadata = _mutable_buffer_metadata(model)
 
     if use_turboquant:
         from executorch.examples.models.gemma4_31b.cuda_source_transformations import (
@@ -255,6 +262,8 @@ def _export_cuda(
             "get_vocab_size": config.vocab_size,
             "get_n_layers": config.num_hidden_layers,
             "get_max_prefill_chunk": max_prefill,
+            "get_min_prefill_chunk": 5,
+            "get_mutable_buffer_metadata": mutable_buffer_metadata,
             "use_kv_cache": True,
             "use_sdpa_with_kv_cache": False,
             "enable_dynamic_shape": True,
diff --git a/examples/models/gemma4_31b/gemma4_31b_engine.cpp b/examples/models/gemma4_31b/gemma4_31b_engine.cpp
new file mode 100644
index 00000000000..6b2a11d3be3
--- /dev/null
+++ b/examples/models/gemma4_31b/gemma4_31b_engine.cpp
@@ -0,0 +1,737 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/examples/models/gemma4_31b/gemma4_31b_engine.h>
+
+#include <executorch/extension/llm/runner/llm_runner_helper.h>
+#include <executorch/extension/tensor/tensor.h>
+#include <executorch/extension/tensor/tensor_ptr.h>
+#include <executorch/runtime/backend/interface.h>
+#include <executorch/runtime/backend/options.h>
+#include <executorch/runtime/core/portable_type/device.h>
+#include <executorch/runtime/platform/log.h>
+#include <pytorch/tokenizers/hf_tokenizer.h>
+
+#include <algorithm>
+#include <cinttypes>
+#include <cmath>
+#include <cstring>
+#include <optional>
+#include <vector>
+
+#ifdef EXECUTORCH_BUILD_CUDA
+#include <cuda_runtime.h>
+#include <nlohmann/json.hpp>
+#else
+#include <executorch/extension/llm/sampler/util.h>
+#endif
+
+namespace executorch::extension::llm {
+
+using ::executorch::extension::clone_tensor_ptr_to;
+using ::executorch::extension::from_blob;
+using ::executorch::extension::Module;
+using ::executorch::extension::TensorPtr;
+using ::executorch::runtime::Error;
+using ::executorch::runtime::EValue;
+using ::executorch::runtime::Result;
+using SizesType = executorch::aten::SizesType;
+
+namespace {
+
+#ifdef EXECUTORCH_BUILD_MLX
+constexpr const char* kPrefillMethod = "forward";
+constexpr const char* kDecodeMethod = "forward";
+#else
+constexpr const char* kPrefillMethod = "prefill";
+constexpr const char* kDecodeMethod = "decode";
+#endif
+
+constexpr const char* kMaxPrefillChunk = "get_max_prefill_chunk";
+constexpr const char* kMinPrefillChunk = "get_min_prefill_chunk";
+
+Result<uint64_t> read_sampled_token(
+    const executorch::aten::Tensor& output,
+    float temperature) {
+#ifdef EXECUTORCH_BUILD_CUDA
+  (void)temperature;
+  const void* ptr = output.const_data_ptr();
+  cudaPointerAttributes attrs{};
+  const bool on_device = cudaPointerGetAttributes(&attrs, ptr) == cudaSuccess &&
+      attrs.type == cudaMemoryTypeDevice;
+
+  auto copy_scalar = [&](void* dst, size_t nbytes) -> Error {
+    if (on_device) {
+      if (cudaMemcpy(dst, ptr, nbytes, cudaMemcpyDeviceToHost) != cudaSuccess) {
+        return Error::Internal;
+      }
+    } else {
+      std::memcpy(dst, ptr, nbytes);
+    }
+    return Error::Ok;
+  };
+
+  if (output.scalar_type() == executorch::aten::ScalarType::Long) {
+    int64_t val = 0;
+    if (copy_scalar(&val, sizeof(val)) != Error::Ok) {
+      ET_LOG(Error, "read_sampled_token: cudaMemcpy D2H failed");
+      return Error::Internal;
+    }
+    return static_cast<uint64_t>(val);
+  }
+  if (output.scalar_type() == executorch::aten::ScalarType::Float) {
+    float val = 0.0f;
+    if (copy_scalar(&val, sizeof(val)) != Error::Ok) {
+      ET_LOG(Error, "read_sampled_token: cudaMemcpy D2H failed");
+      return Error::Internal;
+    }
+    return static_cast<uint64_t>(llrintf(val));
+  }
+  ET_LOG(
+      Error,
+      "read_sampled_token: expected Long or Float scalar output, got %d",
+      static_cast<int>(output.scalar_type()));
+  return Error::InvalidArgument;
+#else
+  return static_cast<uint64_t>(
+      logits_to_token(output, temperature < 0.0f ? 0.0f : temperature));
+#endif
+}
+
+Result<std::unique_ptr<Module>> build_gemma_module(
+    const Gemma4_31BConfig& config) {
+  std::vector<std::string> data_files;
+  if (!config.data_path.empty()) {
+    data_files.push_back(config.data_path);
+  }
+  auto module = std::make_unique<Module>(
+      config.model_path,
+      data_files,
+      Module::LoadMode::MmapUseMlockIgnoreErrors,
+      /*event_tracer=*/nullptr,
+      /*memory_allocator=*/nullptr,
+      /*temp_allocator=*/nullptr);
+
+#ifdef EXECUTORCH_BUILD_CUDA
+  if (config.enable_cuda_graph) {
+    executorch::runtime::BackendOptions<2> cuda_opts;
+    ET_CHECK_OK_OR_RETURN_ERROR(
+        cuda_opts.set_option("enable_cuda_graph_for_method", "decode"));
+    ET_CHECK_OK_OR_RETURN_ERROR(
+        executorch::runtime::set_option("CudaBackend", cuda_opts.view()));
+    ET_LOG(Info, "Gemma4_31BEngine: CUDA graph enabled for decode method");
+  }
+  {
+    executorch::runtime::BackendOptions<1> backend_options;
+    ET_CHECK_OK_OR_RETURN_ERROR(
+        backend_options.set_option("weight_sharing_across_methods", true));
+    ET_CHECK_OK_OR_RETURN_ERROR(
+        executorch::runtime::set_option("CudaBackend", backend_options.view()));
+  }
+#endif
+
+  ET_CHECK_OK_OR_RETURN_ERROR(module->load_method(kPrefillMethod));
+  if (std::string(kDecodeMethod) != std::string(kPrefillMethod)) {
+    ET_CHECK_OK_OR_RETURN_ERROR(module->load_method(kDecodeMethod));
+  }
+  return module;
+}
+
+void add_token_piece(
+    ::tokenizers::Tokenizer* tokenizer,
+    std::unordered_set<uint64_t>& ids,
+    const char* piece) {
+  if (auto id = tokenizer->piece_to_id(piece); id.ok()) {
+    ids.insert(*id);
+  }
+}
+
+#ifdef EXECUTORCH_BUILD_CUDA
+Error register_mutable_fqns(
+    Module* module,
+    ::executorch::backends::cuda::MutableStateContextOwner& mutable_state) {
+  auto res = module->execute("get_mutable_buffer_metadata");
+  if (res.error() != Error::Ok) {
+    ET_LOG(
+        Info,
+        "Gemma4_31BEngine: model has no get_mutable_buffer_metadata; "
+        "multi-session disabled");
+    return res.error();
+  }
+  const auto& outs = res.get();
+  if (outs.empty() || !outs[0].isString()) {
+    ET_LOG(Error, "get_mutable_buffer_metadata did not return a string");
+    return Error::InvalidProgram;
+  }
+  std::string json_str(outs[0].toString());
+  auto j = nlohmann::json::parse(json_str, nullptr, /*allow_exceptions=*/false);
+  if (j.is_discarded() || !j.is_object()) {
+    ET_LOG(Error, "get_mutable_buffer_metadata is not a valid JSON object");
+    return Error::InvalidProgram;
+  }
+  if (!j.contains("version") || !j["version"].is_number_integer() ||
+      j["version"].get<int>() != 1) {
+    ET_LOG(Error, "get_mutable_buffer_metadata: unsupported/missing version");
+    return Error::InvalidProgram;
+  }
+  if (!j.contains("mutable_buffers") || !j["mutable_buffers"].is_array() ||
+      j["mutable_buffers"].empty()) {
+    ET_LOG(
+        Error,
+        "get_mutable_buffer_metadata: mutable_buffers must be a non-empty array");
+    return Error::InvalidProgram;
+  }
+  std::vector<std::string> fqns;
+  for (const auto& f : j["mutable_buffers"]) {
+    if (!f.is_string() || f.get<std::string>().empty()) {
+      ET_LOG(
+          Error,
+          "get_mutable_buffer_metadata: every mutable_buffers entry must be a "
+          "non-empty string");
+      return Error::InvalidProgram;
+    }
+    fqns.push_back(f.get<std::string>());
+  }
+  mutable_state.register_fqns(fqns);
+  return Error::Ok;
+}
+#endif
+
+class Gemma4_31BSession : public LLMSession {
+ public:
+  Gemma4_31BSession(
+      Module* module,
+      std::mutex* exec_mutex,
+      std::atomic<int>* live_sessions,
+      ::tokenizers::Tokenizer* tokenizer,
+      std::unordered_map<std::string, int64_t> metadata,
+      std::unordered_set<uint64_t> eos_ids,
+      int64_t max_prefill_chunk,
+      int64_t min_prefill_chunk
+#ifdef GEMMA_HAS_MUTABLE_STATE
+      ,
+      GemmaMutableStateContextOwner* mutable_state,
+      int session_token
+#endif
+      )
+      : module_(module),
+        exec_mutex_(exec_mutex),
+        live_sessions_(live_sessions),
+        tokenizer_(tokenizer),
+        metadata_(std::move(metadata)),
+        eos_ids_(std::move(eos_ids)),
+        max_prefill_chunk_(max_prefill_chunk),
+        min_prefill_chunk_(min_prefill_chunk)
+#ifdef GEMMA_HAS_MUTABLE_STATE
+        ,
+        mutable_state_(mutable_state),
+        session_token_(session_token)
+#endif
+  {
+    decode_tokens_ = from_blob(
+        decode_token_data_, {1, 1}, executorch::aten::ScalarType::Long);
+    decode_pos_ =
+        from_blob(decode_pos_data_, {1}, executorch::aten::ScalarType::Long);
+#ifdef EXECUTORCH_BUILD_CUDA
+    decode_tokens_dev_ = clone_tensor_ptr_to(decode_tokens_, cuda_device_);
+    decode_pos_dev_ = clone_tensor_ptr_to(decode_pos_, cuda_device_);
+    auto temp_host =
+        from_blob(&temp_val_, {1}, executorch::aten::ScalarType::Float);
+    temp_tensor_dev_ = clone_tensor_ptr_to(temp_host, cuda_device_);
+#endif
+  }
+
+  ~Gemma4_31BSession() override {
+#ifdef GEMMA_HAS_MUTABLE_STATE
+    if (mutable_state_ != nullptr && session_token_ != kGemmaNoMutableSession) {
+      mutable_state_->destroy_session(session_token_);
+    }
+#endif
+    if (live_sessions_ != nullptr) {
+      live_sessions_->fetch_sub(1);
+    }
+  }
+
+  Error prefill_tokens(
+      const std::vector<uint64_t>& tokens,
+      const SamplingConfig* initial_sampling) override {
+    if (tokens.empty()) {
+      ET_LOG(Error, "prefill_tokens: empty token list");
+      return Error::InvalidArgument;
+    }
+    float first_token_temp = temperature_;
+    if (initial_sampling != nullptr) {
+      if (initial_sampling->top_p != 1.0f || initial_sampling->top_k != 0 ||
+          initial_sampling->seed != 0) {
+        ET_LOG(
+            Error,
+            "Gemma4_31BSession: only temperature is supported; top_p/top_k/seed "
+            "are not implemented");
+        return Error::NotSupported;
+      }
+      first_token_temp = initial_sampling->temperature;
+    }
+    if (!valid_temperature(first_token_temp)) {
+      ET_LOG(Error, "prefill_tokens: temperature must be -1 or in [0, 1]");
+      return Error::InvalidArgument;
+    }
+
+    const int64_t T = static_cast<int64_t>(tokens.size());
+    const auto ctx_it = metadata_.find(kMaxContextLen);
+    if (ctx_it != metadata_.end() && pos_ + T >= ctx_it->second) {
+      ET_LOG(
+          Error,
+          "prefill_tokens would leave no room to generate (pos %" PRId64
+          " + %" PRId64 " >= max_context %" PRId64 ")",
+          pos_,
+          T,
+          ctx_it->second);
+      return Error::InvalidArgument;
+    }
+
+    stop_.store(false, std::memory_order_relaxed);
+    int64_t offset = 0;
+    while (offset < T) {
+      int64_t chunk = T - offset;
+      if (max_prefill_chunk_ > 0) {
+        chunk = std::min(chunk, max_prefill_chunk_);
+      }
+#ifdef EXECUTORCH_BUILD_CUDA
+      if (chunk > 1 && chunk < min_prefill_chunk_) {
+        chunk = 1;
+      }
+#endif
+      auto sampled =
+          run_prefill_chunk(tokens.data() + offset, chunk, first_token_temp);
+      ET_CHECK_OK_OR_RETURN_ERROR(sampled.error());
+      pending_ = sampled.get();
+      pos_ += chunk;
+      offset += chunk;
+    }
+    prev_decode_token_ = tokens.back();
+    return Error::Ok;
+  }
+
+  Result<DecodeResult> decode_one(const SamplingConfig& sampling) override {
+    if (sampling.top_p != 1.0f || sampling.top_k != 0 || sampling.seed != 0) {
+      ET_LOG(
+          Error,
+          "Gemma4_31BSession: only temperature is supported; top_p/top_k/seed "
+          "are not implemented");
+      return Error::NotSupported;
+    }
+    if (!valid_temperature(sampling.temperature)) {
+      ET_LOG(Error, "decode_one: temperature must be -1 or in [0, 1]");
+      return Error::InvalidArgument;
+    }
+    ET_CHECK_OR_RETURN_ERROR(
+        pending_.has_value(),
+        InvalidState,
+        "decode_one requires a pending token; call prefill_tokens() first");
+    temperature_ = sampling.temperature;
+
+    if (stop_.load(std::memory_order_relaxed)) {
+      return DecodeResult{0, "", /*is_eos=*/false, /*is_terminal=*/true};
+    }
+
+    const uint64_t token = pending_.value();
+    const bool is_eos = eos_ids_.find(token) != eos_ids_.end();
+    const uint64_t prev = prev_decode_token_.value_or(token);
+    auto dec = tokenizer_->decode(prev, token);
+    if (!dec.ok()) {
+      ET_LOG(
+          Error,
+          "Tokenizers error code %d",
+          static_cast<uint32_t>(dec.error()));
+      return Error::InvalidArgument;
+    }
+    std::string text_piece = std::move(*dec);
+
+    if (is_eos) {
+      pending_.reset();
+      return DecodeResult{
+          token, std::move(text_piece), is_eos, /*is_terminal=*/true};
+    }
+
+    const auto ctx_it = metadata_.find(kMaxContextLen);
+    if (ctx_it != metadata_.end()) {
+      ET_CHECK_OR_RETURN_ERROR(
+          pos_ < ctx_it->second,
+          InvalidArgument,
+          "decode_one would exceed context capacity: pos_ %" PRId64
+          " >= max_context %" PRId64,
+          pos_,
+          ctx_it->second);
+    }
+
+    decode_token_data_[0] = static_cast<int64_t>(token);
+    decode_pos_data_[0] = pos_;
+    std::vector<EValue> inputs;
+#ifdef EXECUTORCH_BUILD_CUDA
+    ET_CHECK_OK_OR_RETURN_ERROR(copy_decode_inputs_to_cuda());
+    ET_CHECK_OK_OR_RETURN_ERROR(set_temperature(temperature_));
+    inputs.push_back(EValue(decode_tokens_dev_));
+    inputs.push_back(EValue(decode_pos_dev_));
+    inputs.push_back(EValue(temp_tensor_dev_));
+#else
+    inputs.push_back(EValue(decode_tokens_));
+    inputs.push_back(EValue(decode_pos_));
+#endif
+    auto sampled =
+        run_locked(kDecodeMethod, inputs, temperature_, /*sync_after=*/false);
+    ET_CHECK_OK_OR_RETURN_ERROR(sampled.error());
+    pending_ = sampled.get();
+    prev_decode_token_ = token;
+    pos_ += 1;
+    return DecodeResult{
+        token, std::move(text_piece), /*is_eos=*/false, /*is_terminal=*/false};
+  }
+
+  int64_t position() const override {
+    return pos_;
+  }
+
+  Error reset() override {
+    pos_ = 0;
+    pending_.reset();
+    prev_decode_token_.reset();
+    stop_.store(false, std::memory_order_relaxed);
+    return Error::Ok;
+  }
+
+  void stop() override {
+    stop_.store(true, std::memory_order_relaxed);
+  }
+
+ private:
+  static bool valid_temperature(float temperature) {
+    return temperature == -1.0f || (temperature >= 0.0f && temperature <= 1.0f);
+  }
+
+  Result<uint64_t>
+  run_prefill_chunk(const uint64_t* tokens, int64_t T, float temperature) {
+    std::vector<int64_t> token_data(tokens, tokens + T);
+    std::vector<int64_t> pos_data(T);
+    for (int64_t i = 0; i < T; ++i) {
+      pos_data[i] = pos_ + i;
+    }
+    auto tokens_tensor = from_blob(
+        token_data.data(),
+        {1, static_cast<SizesType>(T)},
+        executorch::aten::ScalarType::Long);
+    auto pos_tensor = from_blob(
+        pos_data.data(),
+        {static_cast<SizesType>(T)},
+        executorch::aten::ScalarType::Long);
+    std::vector<EValue> inputs;
+    TensorPtr token_input = tokens_tensor;
+    TensorPtr pos_input = pos_tensor;
+#ifdef EXECUTORCH_BUILD_CUDA
+    std::vector<TensorPtr> device_inputs;
+    token_input = to_cuda(token_input, device_inputs);
+    pos_input = to_cuda(pos_input, device_inputs);
+    ET_CHECK_OK_OR_RETURN_ERROR(set_temperature(temperature));
+#endif
+    inputs.push_back(EValue(token_input));
+    inputs.push_back(EValue(pos_input));
+#ifdef EXECUTORCH_BUILD_CUDA
+    inputs.push_back(EValue(temp_tensor_dev_));
+    const char* method =
+        (T >= min_prefill_chunk_) ? kPrefillMethod : kDecodeMethod;
+#else
+    const char* method = kPrefillMethod;
+#endif
+    return run_locked(method, inputs, temperature, /*sync_after=*/true);
+  }
+
+#ifdef EXECUTORCH_BUILD_CUDA
+  TensorPtr to_cuda(TensorPtr tensor, std::vector<TensorPtr>& keep_alive) {
+    keep_alive.push_back(clone_tensor_ptr_to(tensor, cuda_device_));
+    return keep_alive.back();
+  }
+
+  Error set_temperature(float temperature) {
+    temp_val_ = (temperature <= 0.0f) ? 1e-6f : temperature;
+    if (cudaMemcpy(
+            temp_tensor_dev_->mutable_data_ptr(),
+            &temp_val_,
+            sizeof(float),
+            cudaMemcpyHostToDevice) != cudaSuccess) {
+      ET_LOG(Error, "set_temperature: cudaMemcpy H2D failed");
+      return Error::Internal;
+    }
+    return Error::Ok;
+  }
+
+  Error copy_decode_inputs_to_cuda() {
+    if (cudaMemcpy(
+            decode_tokens_dev_->mutable_data_ptr(),
+            decode_token_data_,
+            sizeof(int64_t),
+            cudaMemcpyHostToDevice) != cudaSuccess) {
+      ET_LOG(Error, "copy_decode_inputs_to_cuda: token H2D failed");
+      return Error::Internal;
+    }
+    if (cudaMemcpy(
+            decode_pos_dev_->mutable_data_ptr(),
+            decode_pos_data_,
+            sizeof(int64_t),
+            cudaMemcpyHostToDevice) != cudaSuccess) {
+      ET_LOG(Error, "copy_decode_inputs_to_cuda: pos H2D failed");
+      return Error::Internal;
+    }
+    return Error::Ok;
+  }
+#endif
+
+  Result<uint64_t> run_locked(
+      const char* method,
+      std::vector<EValue>& inputs,
+      float temperature,
+      bool sync_after) {
+    std::lock_guard<std::mutex> guard(*exec_mutex_);
+#ifdef GEMMA_HAS_MUTABLE_STATE
+    auto res = mutable_state_ != nullptr
+        ? mutable_state_->with_active_session(
+              session_token_,
+              [&]() { return module_->execute(method, inputs); })
+        : module_->execute(method, inputs);
+#else
+    auto res = module_->execute(method, inputs);
+#endif
+    ET_CHECK_OK_OR_RETURN_ERROR(res.error());
+    auto sampled = read_sampled_token(res.get()[0].toTensor(), temperature);
+    ET_CHECK_OK_OR_RETURN_ERROR(sampled.error());
+#ifdef EXECUTORCH_BUILD_CUDA
+    if (sync_after && cudaDeviceSynchronize() != cudaSuccess) {
+      ET_LOG(Error, "run_locked: cudaDeviceSynchronize failed");
+      return Error::Internal;
+    }
+#else
+    (void)sync_after;
+#endif
+    return sampled.get();
+  }
+
+  Module* module_;
+  std::mutex* exec_mutex_;
+  std::atomic<int>* live_sessions_;
+  ::tokenizers::Tokenizer* tokenizer_;
+  std::unordered_map<std::string, int64_t> metadata_;
+  std::unordered_set<uint64_t> eos_ids_;
+  int64_t max_prefill_chunk_;
+  int64_t min_prefill_chunk_;
+
+  int64_t pos_ = 0;
+  std::optional<uint64_t> pending_;
+  std::optional<uint64_t> prev_decode_token_;
+  float temperature_ = -1.0f;
+  std::atomic<bool> stop_{false};
+
+  int64_t decode_token_data_[1] = {0};
+  int64_t decode_pos_data_[1] = {0};
+  TensorPtr decode_tokens_;
+  TensorPtr decode_pos_;
+#ifdef GEMMA_HAS_MUTABLE_STATE
+  GemmaMutableStateContextOwner* mutable_state_ = nullptr;
+  int session_token_ = kGemmaNoMutableSession;
+#endif
+#ifdef EXECUTORCH_BUILD_CUDA
+  float temp_val_ = 1e-6f;
+  executorch::aten::Device cuda_device_ =
+      executorch::aten::Device(executorch::aten::DeviceType::CUDA, 0);
+  TensorPtr decode_tokens_dev_;
+  TensorPtr decode_pos_dev_;
+  TensorPtr temp_tensor_dev_;
+#endif
+};
+
+} // namespace
+
+Result<std::unique_ptr<Gemma4_31BEngine>> Gemma4_31BEngine::create(
+    const Gemma4_31BConfig& config) {
+  if (config.model_path.empty() || config.tokenizer_path.empty()) {
+    ET_LOG(
+        Error, "Gemma4_31BEngine: model_path and tokenizer_path are required");
+    return Error::InvalidArgument;
+  }
+
+  auto tokenizer = std::make_unique<::tokenizers::HFTokenizer>();
+  if (tokenizer->load(config.tokenizer_path) != ::tokenizers::Error::Ok) {
+    ET_LOG(Error, "Gemma4_31BEngine: failed to load tokenizer");
+    return Error::InvalidArgument;
+  }
+
+  std::vector<std::string> data_files;
+  if (!config.data_path.empty()) {
+    data_files.push_back(config.data_path);
+  }
+  auto meta_module = std::make_unique<Module>(
+      config.model_path, data_files, Module::LoadMode::File);
+  auto metadata_result = get_llm_metadata(tokenizer.get(), meta_module.get());
+  if (metadata_result.error() != Error::Ok) {
+    ET_LOG(Error, "Gemma4_31BEngine: failed to read metadata");
+    return metadata_result.error();
+  }
+
+  auto eos_ids = get_eos_ids(tokenizer.get(), meta_module.get());
+  eos_ids.insert(static_cast<uint64_t>(config.eos_id));
+  add_token_piece(tokenizer.get(), eos_ids, "<turn|>");
+  add_token_piece(tokenizer.get(), eos_ids, "<end_of_turn>");
+
+  auto metadata = metadata_result.get();
+  int64_t max_prefill_chunk = 1;
+  auto max_ctx_it = metadata.find(kMaxContextLen);
+  if (max_ctx_it != metadata.end() && max_ctx_it->second > 1) {
+    max_prefill_chunk = max_ctx_it->second - 1;
+  }
+  if (auto get_result = meta_module->get(kMaxPrefillChunk); get_result.ok()) {
+    max_prefill_chunk = get_result->toScalar().to<int64_t>();
+    metadata[kMaxPrefillChunk] = max_prefill_chunk;
+  }
+
+  int64_t min_prefill_chunk = 1;
+#ifdef EXECUTORCH_BUILD_CUDA
+  min_prefill_chunk = 5;
+  if (auto get_result = meta_module->get(kMinPrefillChunk); get_result.ok()) {
+    min_prefill_chunk = get_result->toScalar().to<int64_t>();
+  }
+  metadata[kMinPrefillChunk] = min_prefill_chunk;
+#endif
+
+#ifdef GEMMA_HAS_MUTABLE_STATE
+  std::unique_ptr<GemmaMutableStateContextOwner> mutable_state;
+#endif
+#ifdef EXECUTORCH_BUILD_CUDA
+  if (config.enable_cuda_graph) {
+    ET_LOG(
+        Info,
+        "Gemma4_31BEngine: CUDA graph requested; per-session rebinding "
+        "disabled and serving capacity clamped to 1 session.");
+  } else {
+    auto candidate = std::make_unique<GemmaMutableStateContextOwner>();
+    if (Error e = register_mutable_fqns(meta_module.get(), *candidate);
+        e == Error::Ok) {
+      mutable_state = std::move(candidate);
+    } else {
+      ET_LOG(
+          Info,
+          "Gemma4_31BEngine: mutable-buffer metadata unavailable or invalid; "
+          "serving capacity clamped to 1 session.");
+    }
+  }
+#elif defined(EXECUTORCH_BUILD_MLX)
+  mutable_state = std::make_unique<GemmaMutableStateContextOwner>();
+#endif
+
+#ifdef GEMMA_HAS_MUTABLE_STATE
+  auto module_res = mutable_state != nullptr
+      ? mutable_state->with_load_scope(
+            [&]() { return build_gemma_module(config); })
+      : build_gemma_module(config);
+#else
+  auto module_res = build_gemma_module(config);
+#endif
+  if (module_res.error() != Error::Ok) {
+    return module_res.error();
+  }
+  std::unique_ptr<Module> shared_module = std::move(module_res.get());
+
+  bool rebind_available = false;
+#ifdef GEMMA_HAS_MUTABLE_STATE
+  rebind_available = mutable_state != nullptr && mutable_state->available();
+  if (rebind_available && mutable_state->validate_coverage() != Error::Ok) {
+    ET_LOG(
+        Error,
+        "Gemma4_31BEngine: mutable-buffer coverage check failed; disabling "
+        "multi-session (capacity clamped to 1).");
+    rebind_available = false;
+  }
+  if (!rebind_available) {
+    ET_LOG(
+        Info,
+        "Gemma4_31BEngine: per-session rebinding unavailable; serving capacity "
+        "clamped to 1 session.");
+  }
+#endif
+
+  return std::unique_ptr<Gemma4_31BEngine>(new Gemma4_31BEngine(
+      config,
+      std::move(tokenizer),
+      std::move(metadata),
+      std::move(eos_ids),
+      std::move(shared_module),
+      max_prefill_chunk,
+      min_prefill_chunk,
+      rebind_available
+#ifdef GEMMA_HAS_MUTABLE_STATE
+      ,
+      std::move(mutable_state)
+#endif
+          ));
+}
+
+Gemma4_31BEngine::~Gemma4_31BEngine() = default;
+
+Result<std::unique_ptr<LLMSession>> Gemma4_31BEngine::create_session() {
+  const int cap =
+      serving_capacity().max_physical_sessions_without_weight_duplication;
+  {
+    std::lock_guard<std::mutex> g(exec_mutex_);
+    if (live_sessions_.load() >= cap) {
+      ET_LOG(
+          Error,
+          "Gemma4_31BEngine: at session capacity (%d); refusing create_session",
+          cap);
+      return Error::InvalidState;
+    }
+    live_sessions_.fetch_add(1);
+  }
+
+  int token = -1;
+#ifdef GEMMA_HAS_MUTABLE_STATE
+  if (rebind_available_) {
+    auto t = mutable_state_->create_session();
+    if (t.error() != Error::Ok) {
+      live_sessions_.fetch_sub(1);
+      return t.error();
+    }
+    token = t.get();
+  }
+#endif
+
+  return std::unique_ptr<LLMSession>(new Gemma4_31BSession(
+      shared_module_.get(),
+      &exec_mutex_,
+      &live_sessions_,
+      tokenizer_.get(),
+      metadata_,
+      eos_ids_,
+      max_prefill_chunk_,
+      min_prefill_chunk_
+#ifdef GEMMA_HAS_MUTABLE_STATE
+      ,
+      mutable_state_.get(),
+      token
+#endif
+      ));
+}
+
+LLMServingCapacity Gemma4_31BEngine::serving_capacity() const {
+  LLMServingCapacity cap;
+#ifdef GEMMA_HAS_MUTABLE_STATE
+  if (rebind_available_) {
+    cap.max_physical_sessions_without_weight_duplication =
+        config_.max_sessions > 1 ? config_.max_sessions : 1;
+    cap.estimated_bytes_per_session = mutable_state_->bytes_per_session();
+  }
+#endif
+  return cap;
+}
+
+} // namespace executorch::extension::llm
diff --git a/examples/models/gemma4_31b/gemma4_31b_engine.h b/examples/models/gemma4_31b/gemma4_31b_engine.h
new file mode 100644
index 00000000000..aa67ae5380c
--- /dev/null
+++ b/examples/models/gemma4_31b/gemma4_31b_engine.h
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// Engine/Session adapter for Gemma 4 31B exported methods. CUDA builds use
+// separate prefill/decode methods; MLX builds use one dynamic forward method.
+
+#pragma once
+
+#include <atomic>
+#include <memory>
+#include <mutex>
+#include <optional>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+
+#include <executorch/extension/llm/runner/llm_session.h>
+#include <executorch/extension/module/module.h>
+#include <executorch/runtime/core/result.h>
+#include <pytorch/tokenizers/tokenizer.h>
+
+#ifdef EXECUTORCH_BUILD_CUDA
+#include <executorch/backends/cuda/runtime/cuda_mutable_state.h>
+#elif defined(EXECUTORCH_BUILD_MLX)
+#include <executorch/backends/mlx/runtime/mlx_mutable_state.h>
+#endif
+
+#if defined(EXECUTORCH_BUILD_CUDA) || defined(EXECUTORCH_BUILD_MLX)
+#define GEMMA_HAS_MUTABLE_STATE 1
+#endif
+
+namespace executorch::extension::llm {
+
+#if defined(EXECUTORCH_BUILD_CUDA)
+using GemmaMutableStateContextOwner =
+    ::executorch::backends::cuda::MutableStateContextOwner;
+constexpr int kGemmaNoMutableSession =
+    ::executorch::backends::cuda::kNoMutableSession;
+#elif defined(EXECUTORCH_BUILD_MLX)
+using GemmaMutableStateContextOwner =
+    ::executorch::backends::mlx::MutableStateContextOwner;
+constexpr int kGemmaNoMutableSession =
+    ::executorch::backends::mlx::kNoMutableSession;
+#endif
+
+struct Gemma4_31BConfig {
+  std::string model_path;
+  std::string data_path;
+  std::string tokenizer_path;
+  int32_t max_sessions = 1;
+  int64_t eos_id = 1;
+  bool enable_cuda_graph = false;
+};
+
+class ET_EXPERIMENTAL Gemma4_31BEngine : public LLMEngine {
+ public:
+  static ::executorch::runtime::Result<std::unique_ptr<Gemma4_31BEngine>>
+  create(const Gemma4_31BConfig& config);
+
+  ~Gemma4_31BEngine() override;
+
+  ::executorch::runtime::Result<std::unique_ptr<LLMSession>> create_session()
+      override;
+
+  LLMServingCapacity serving_capacity() const override;
+
+  const std::unordered_map<std::string, int64_t>& metadata() const override {
+    return metadata_;
+  }
+
+  ::tokenizers::Tokenizer* tokenizer() const {
+    return tokenizer_.get();
+  }
+
+  Gemma4_31BEngine(const Gemma4_31BEngine&) = delete;
+  Gemma4_31BEngine& operator=(const Gemma4_31BEngine&) = delete;
+
+ private:
+  Gemma4_31BEngine(
+      Gemma4_31BConfig config,
+      std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
+      std::unordered_map<std::string, int64_t> metadata,
+      std::unordered_set<uint64_t> eos_ids,
+      std::unique_ptr<Module> shared_module,
+      int64_t max_prefill_chunk,
+      int64_t min_prefill_chunk,
+      bool rebind_available
+#ifdef GEMMA_HAS_MUTABLE_STATE
+      ,
+      std::unique_ptr<GemmaMutableStateContextOwner> mutable_state
+#endif
+      )
+      : config_(std::move(config)),
+        tokenizer_(std::move(tokenizer)),
+        metadata_(std::move(metadata)),
+        eos_ids_(std::move(eos_ids)),
+        shared_module_(std::move(shared_module)),
+        max_prefill_chunk_(max_prefill_chunk),
+        min_prefill_chunk_(min_prefill_chunk),
+        rebind_available_(rebind_available)
+#ifdef GEMMA_HAS_MUTABLE_STATE
+        ,
+        mutable_state_(std::move(mutable_state))
+#endif
+  {
+  }
+
+  Gemma4_31BConfig config_;
+  std::unique_ptr<::tokenizers::Tokenizer> tokenizer_;
+  std::unordered_map<std::string, int64_t> metadata_;
+  std::unordered_set<uint64_t> eos_ids_;
+  std::unique_ptr<Module> shared_module_;
+  std::mutex exec_mutex_;
+  int64_t max_prefill_chunk_ = 0;
+  int64_t min_prefill_chunk_ = 1;
+  bool rebind_available_ = false;
+#ifdef GEMMA_HAS_MUTABLE_STATE
+  std::unique_ptr<GemmaMutableStateContextOwner> mutable_state_;
+#endif
+  std::atomic<int> live_sessions_{0};
+};
+
+} // namespace executorch::extension::llm
diff --git a/examples/models/gemma4_31b/gemma4_31b_worker.cpp b/examples/models/gemma4_31b/gemma4_31b_worker.cpp
new file mode 100644
index 00000000000..f10a588e668
--- /dev/null
+++ b/examples/models/gemma4_31b/gemma4_31b_worker.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <gflags/gflags.h>
+
+#include <executorch/examples/llm_server/cpp/worker_loop.h>
+#include <executorch/examples/models/gemma4_31b/gemma4_31b_engine.h>
+#include <executorch/runtime/platform/log.h>
+
+DEFINE_string(model_path, "", "Model .pte file path.");
+DEFINE_string(tokenizer_path, "", "HuggingFace tokenizer.json path.");
+DEFINE_string(data_path, "", "Data file (.ptd) for delegated weights.");
+DEFINE_int32(
+    max_sessions,
+    1,
+    "Max physical sessions to host on one weight allocation. Clamped to 1 if "
+    "the backend cannot isolate per-session mutable state.");
+DEFINE_bool(
+    warm_resume,
+    true,
+    "Warm append-only resume for named sessions when the engine supports them.");
+DEFINE_int32(bos_id, 2, "BOS token id to prepend to server-rendered prompts.");
+DEFINE_int32(eos_id, 1, "EOS token id (Gemma convention: 1).");
+
+namespace {
+namespace llm = ::executorch::extension::llm;
+using ::executorch::runtime::Error;
+} // namespace
+
+int main(int argc, char** argv) {
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
+  if (FLAGS_model_path.empty() || FLAGS_tokenizer_path.empty()) {
+    ET_LOG(
+        Error, "gemma4_31b_worker: --model_path and --tokenizer_path required");
+    return 1;
+  }
+
+  llm::Gemma4_31BConfig config;
+  config.model_path = FLAGS_model_path;
+  config.data_path = FLAGS_data_path;
+  config.tokenizer_path = FLAGS_tokenizer_path;
+  config.max_sessions = FLAGS_max_sessions;
+  config.eos_id = FLAGS_eos_id;
+
+  auto engine_result = llm::Gemma4_31BEngine::create(config);
+  if (engine_result.error() != Error::Ok) {
+    ET_LOG(Error, "gemma4_31b_worker: failed to create engine");
+    return 1;
+  }
+  auto engine = std::move(engine_result.get());
+
+  return llm::run_worker_stdio_loop(
+      *engine,
+      *engine->tokenizer(),
+      engine->metadata(),
+      FLAGS_warm_resume,
+      {static_cast<uint64_t>(FLAGS_bos_id)});
+}
diff --git a/examples/models/gemma4_31b/main.cpp b/examples/models/gemma4_31b/main.cpp
index 3d9970b1610..15c8719636c 100644
--- a/examples/models/gemma4_31b/main.cpp
+++ b/examples/models/gemma4_31b/main.cpp
@@ -6,35 +6,15 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-// Gemma 4 31B-IT runner for ExecuTorch. Supports two backends:
-//   CUDA  — exports ``prefill`` (T>=2, dynamic) + ``decode`` (T=1, static)
-//           methods sharing KV-cache buffers; on-device Gumbel-max sampling
-//           with temperature passed as a third input; returns a scalar
-//           float token id.
-//   MLX   — exports a single ``forward`` method with dynamic seq_len;
-//           returns last-token logits; the runner samples on the host via
-//           ``llm::logits_to_token`` with the same temperature semantics.
-
 #include <gflags/gflags.h>
 
-#include <executorch/extension/llm/runner/llm_runner_helper.h>
+#include <executorch/examples/models/gemma4_31b/gemma4_31b_engine.h>
 #include <executorch/extension/llm/runner/stats.h>
 #include <executorch/extension/llm/runner/util.h>
-#include <executorch/extension/llm/sampler/util.h>
-#include <executorch/extension/module/module.h>
-#include <executorch/extension/tensor/tensor.h>
-#include <executorch/extension/tensor/tensor_ptr.h>
-#include <executorch/runtime/backend/interface.h>
-#include <executorch/runtime/backend/options.h>
-#include <executorch/runtime/core/portable_type/device.h>
-#include <executorch/runtime/platform/assert.h>
 #include <executorch/runtime/platform/log.h>
-#include <pytorch/tokenizers/hf_tokenizer.h>
 
 #include <cinttypes>
-#include <cmath>
 #include <cstdio>
-#include <cstring>
 #include <fstream>
 #include <string>
 #include <vector>
@@ -81,61 +61,38 @@ DEFINE_bool(
     "Enable CUDA graph capture for the decode method. CUDA only.");
 
 namespace llm = ::executorch::extension::llm;
-using ::executorch::extension::from_blob;
-using ::executorch::extension::make_tensor_ptr;
-using ::executorch::extension::Module;
-using ::executorch::extension::TensorPtr;
 using ::executorch::runtime::Error;
-using ::executorch::runtime::EValue;
-#ifdef EXECUTORCH_BUILD_CUDA
-using ::executorch::extension::clone_tensor_ptr_to;
-#endif
-
-using SizesType = executorch::aten::SizesType;
 
-// Read a sampled token ID from a scalar int64 output (CUDA path).
-//
-// The model now emits the sampled token as int64 (see sampler.py), matching
-// the decode method's int64 token input so the on-device output buffer can be
-// aliased directly as the next step's input. We still copy the 8-byte scalar
-// back to the host here for EOS detection and detokenization.
-static uint64_t read_token(const executorch::aten::Tensor& output) {
-  const void* ptr = output.const_data_ptr();
-  int64_t val = 0;
+namespace {
 
-#ifdef EXECUTORCH_BUILD_CUDA
-  cudaPointerAttributes attrs{};
-  bool on_device = cudaPointerGetAttributes(&attrs, ptr) == cudaSuccess &&
-      attrs.type == cudaMemoryTypeDevice;
-  if (on_device) {
-    cudaError_t err =
-        cudaMemcpy(&val, ptr, sizeof(int64_t), cudaMemcpyDeviceToHost);
-    if (err != cudaSuccess) {
-      ET_LOG(
-          Error,
-          "read_token: cudaMemcpy D2H failed: %s",
-          cudaGetErrorString(err));
-      return 0;
-    }
-  } else {
-    memcpy(&val, ptr, sizeof(int64_t));
+std::string read_prompt() {
+  if (FLAGS_prompt_file.empty()) {
+    return FLAGS_prompt;
   }
-#else
-  memcpy(&val, ptr, sizeof(int64_t));
-#endif
+  std::ifstream f(FLAGS_prompt_file);
+  if (!f.is_open()) {
+    ET_LOG(Error, "Failed to open prompt file: %s", FLAGS_prompt_file.c_str());
+    return "";
+  }
+  return std::string(
+      (std::istreambuf_iterator<char>(f)), std::istreambuf_iterator<char>());
+}
 
-  return static_cast<uint64_t>(val);
+std::string format_prompt(std::string prompt) {
+  if (FLAGS_raw_prompt) {
+    return prompt;
+  }
+  return "<|turn>user\n" + prompt +
+      "<turn|>\n<|turn>model\n<|channel>thought\n<channel|>";
 }
 
+} // namespace
+
 int main(int argc, char** argv) {
   gflags::ParseCommandLineFlags(&argc, &argv, true);
 
-  if (FLAGS_model_path.empty()) {
-    ET_LOG(Error, "Must specify --model_path");
-    return 1;
-  }
-  if (FLAGS_tokenizer_path.empty()) {
-    ET_LOG(Error, "Must specify --tokenizer_path");
+  if (FLAGS_model_path.empty() || FLAGS_tokenizer_path.empty()) {
+    ET_LOG(Error, "--model_path and --tokenizer_path are required");
     return 1;
   }
 
@@ -146,106 +103,29 @@ int main(int argc, char** argv) {
   cudaMemGetInfo(&gpu_free_bytes, &gpu_total_bytes);
   stats.gpu_total_bytes = gpu_total_bytes;
   stats.gpu_free_before_load_bytes = gpu_free_bytes;
-#endif
-
-  stats.model_load_start_ms = llm::time_in_ms();
-
-  // Tokenizer
-  auto tokenizer = std::make_unique<tokenizers::HFTokenizer>();
-  if (tokenizer->load(FLAGS_tokenizer_path) != tokenizers::Error::Ok) {
-    ET_LOG(
-        Error,
-        "Failed to load tokenizer from %s",
-        FLAGS_tokenizer_path.c_str());
-    return 1;
-  }
-
-  // Module
-  std::vector<std::string> data_files;
-  if (!FLAGS_data_path.empty()) {
-    data_files.push_back(FLAGS_data_path);
-  }
-  auto module = std::make_unique<Module>(
-      FLAGS_model_path,
-      data_files,
-      Module::LoadMode::MmapUseMlockIgnoreErrors,
-      /*event_tracer=*/nullptr,
-      /*memory_allocator=*/nullptr,
-      /*temp_allocator=*/nullptr);
-
-  // Get metadata
-  auto metadata_result = llm::get_llm_metadata(tokenizer.get(), module.get());
-  if (metadata_result.error() != Error::Ok) {
-    ET_LOG(Error, "Failed to read model metadata");
-    return 1;
-  }
-
-  int64_t max_prefill_chunk = (*metadata_result)[llm::kMaxSeqLen] - 1;
-  {
-    auto get_result = module->get("get_max_prefill_chunk");
-    if (get_result.ok()) {
-      max_prefill_chunk = get_result->toScalar().to<int64_t>();
-    }
-  }
-
-  auto S = [](int64_t v) -> SizesType { return static_cast<SizesType>(v); };
-
-  float temp_val =
-      FLAGS_temperature <= 0.0 ? 1e-6f : static_cast<float>(FLAGS_temperature);
-
-#ifdef EXECUTORCH_BUILD_CUDA
-  const auto cuda_device =
-      executorch::aten::Device(executorch::aten::DeviceType::CUDA, 0);
-  if (FLAGS_cuda_graph) {
-    executorch::runtime::BackendOptions<2> cuda_opts;
-    cuda_opts.set_option("enable_cuda_graph_for_method", "decode");
-    executorch::runtime::set_option("CudaBackend", cuda_opts.view());
-    printf("CUDA graph enabled for decode method\n");
-  }
-  {
-    executorch::runtime::BackendOptions<1> backend_options;
-    auto set_err =
-        backend_options.set_option("weight_sharing_across_methods", true);
-    if (set_err != Error::Ok) {
-      ET_LOG(
-          Error,
-          "Failed to set weight_sharing_across_methods: %d",
-          static_cast<int>(set_err));
-      return 1;
-    }
-    auto opt_err =
-        executorch::runtime::set_option("CudaBackend", backend_options.view());
-    if (opt_err != Error::Ok) {
-      ET_LOG(
-          Error,
-          "Failed to enable weight_sharing_across_methods: %d",
-          static_cast<int>(opt_err));
-      return 1;
-    }
-  }
-  printf("Loading methods...\n");
-  if (module->load_method("prefill") != Error::Ok) {
-    ET_LOG(Error, "Failed to load prefill method");
-    return 1;
-  }
-  if (module->load_method("decode") != Error::Ok) {
-    ET_LOG(Error, "Failed to load decode method");
-    return 1;
-  }
-  auto temp_tensor = clone_tensor_ptr_to(
-      from_blob(&temp_val, {1}, executorch::aten::ScalarType::Float),
-      cuda_device);
 #else
   if (FLAGS_cuda_graph) {
     ET_LOG(Info, "--cuda_graph ignored on non-CUDA build");
   }
-  printf("Loading model...\n");
-  if (module->load_method("forward") != Error::Ok) {
-    ET_LOG(Error, "Failed to load forward method");
-    return 1;
-  }
 #endif
 
+  llm::Gemma4_31BConfig config;
+  config.model_path = FLAGS_model_path;
+  config.data_path = FLAGS_data_path;
+  config.tokenizer_path = FLAGS_tokenizer_path;
+  config.max_sessions = 1;
+  config.eos_id = FLAGS_eos_id;
+#ifdef EXECUTORCH_BUILD_CUDA
+  config.enable_cuda_graph = FLAGS_cuda_graph;
+#endif
+
+  stats.model_load_start_ms = llm::time_in_ms();
+  auto engine_result = llm::Gemma4_31BEngine::create(config);
+  if (engine_result.error() != Error::Ok) {
+    ET_LOG(Error, "Failed to create Gemma4_31BEngine");
+    return 1;
+  }
+  auto engine = std::move(engine_result.get());
   stats.model_load_end_ms = llm::time_in_ms();
 
 #ifdef EXECUTORCH_BUILD_CUDA
@@ -253,240 +133,60 @@ int main(int argc, char** argv) {
   stats.gpu_free_after_load_bytes = gpu_free_bytes;
 #endif
 
-  auto eos_ids = llm::get_eos_ids(tokenizer.get(), module.get());
-  eos_ids.insert(static_cast<uint64_t>(FLAGS_eos_id));
-  auto turn_ids = tokenizer->encode("<turn|>", /*bos=*/0, /*eos=*/0);
-  if (turn_ids.ok() && turn_ids->size() == 1) {
-    eos_ids.insert(turn_ids.get()[0]);
-  }
-
-  // Read prompt
-  std::string prompt_text = FLAGS_prompt;
-  if (!FLAGS_prompt_file.empty()) {
-    std::ifstream f(FLAGS_prompt_file);
-    if (!f.is_open()) {
-      ET_LOG(
-          Error, "Failed to open prompt file: %s", FLAGS_prompt_file.c_str());
-      return 1;
-    }
-    prompt_text = std::string(
-        (std::istreambuf_iterator<char>(f)), std::istreambuf_iterator<char>());
+  auto session_result = engine->create_session();
+  if (session_result.error() != Error::Ok) {
+    ET_LOG(Error, "Failed to create session");
+    return 1;
   }
+  auto session = std::move(session_result.get());
 
-  // Wrap with Gemma 4 IT chat template unless --raw_prompt is set.
-  // BOS is prepended separately below; this adds the turn structure and the
-  // empty thought block required by the instruction-tuned model.
-  if (!FLAGS_raw_prompt) {
-    prompt_text = "<|turn>user\n" + prompt_text +
-        "<turn|>\n<|turn>model\n<|channel>thought\n<channel|>";
+  std::string prompt_text = read_prompt();
+  if (!FLAGS_prompt_file.empty() && prompt_text.empty()) {
+    return 1;
   }
+  prompt_text = format_prompt(std::move(prompt_text));
 
-  // Encode prompt
-  auto encode_result = tokenizer->encode(prompt_text);
-  if (!encode_result.ok()) {
+  auto encoded = engine->tokenizer()->encode(prompt_text, /*bos=*/0, /*eos=*/0);
+  if (!encoded.ok()) {
     ET_LOG(Error, "Failed to encode prompt");
     return 1;
   }
-  auto prompt_tokens = std::move(*encode_result);
-  // Gemma models require BOS at the start of the sequence.
-  prompt_tokens.insert(
-      prompt_tokens.begin(), static_cast<uint64_t>(FLAGS_bos_id));
-  int64_t num_prompt_tokens = static_cast<int64_t>(prompt_tokens.size());
-  printf("Prompt tokens: %" PRId64 "\n", num_prompt_tokens);
-  stats.num_prompt_tokens = num_prompt_tokens;
-
-  stats.inference_start_ms = llm::time_in_ms();
-
-  // ---------------------------------------------------------------
-  // Prefill (chunked to respect ring-buffer KV cache limit)
-  // ---------------------------------------------------------------
-  uint64_t cur_token = 0;
-  int64_t prefill_pos = 0;
-#ifdef EXECUTORCH_BUILD_CUDA
-  // Alias of the most recent forward's on-device int64 output token. The last
-  // prefill chunk's output seeds the first decode step (no token H2D); each
-  // decode step then re-aliases its own output for the next step.
-  TensorPtr device_out_token;
-#endif
-  while (prefill_pos < num_prompt_tokens) {
-    int64_t chunk_len =
-        std::min(num_prompt_tokens - prefill_pos, max_prefill_chunk);
-
-    std::vector<int64_t> token_data(
-        prompt_tokens.begin() + prefill_pos,
-        prompt_tokens.begin() + prefill_pos + chunk_len);
-    std::vector<int64_t> pos_data(chunk_len);
-    for (int64_t i = 0; i < chunk_len; i++) {
-      pos_data[i] = prefill_pos + i;
-    }
-    auto tokens_tensor = from_blob(
-        token_data.data(),
-        {1, S(chunk_len)},
-        executorch::aten::ScalarType::Long);
-    auto pos_tensor = from_blob(
-        pos_data.data(), {S(chunk_len)}, executorch::aten::ScalarType::Long);
-
-#ifdef EXECUTORCH_BUILD_CUDA
-    // skip_h2d: prefill/decode method inputs must already live in CUDA memory.
-    tokens_tensor = clone_tensor_ptr_to(tokens_tensor, cuda_device);
-    pos_tensor = clone_tensor_ptr_to(pos_tensor, cuda_device);
-#endif
-
-    std::vector<EValue> inputs;
-    inputs.push_back(EValue(tokens_tensor));
-    inputs.push_back(EValue(pos_tensor));
-
-#ifdef EXECUTORCH_BUILD_CUDA
-    inputs.push_back(EValue(temp_tensor));
-    std::string method = (chunk_len == 1) ? "decode" : "prefill";
-#else
-    std::string method = "forward";
-#endif
+  std::vector<uint64_t> prompt_tokens;
+  prompt_tokens.reserve(encoded->size() + 1);
+  prompt_tokens.push_back(static_cast<uint64_t>(FLAGS_bos_id));
+  prompt_tokens.insert(prompt_tokens.end(), encoded->begin(), encoded->end());
 
-    auto result = module->execute(method, inputs);
-    if (result.error() != Error::Ok) {
-      ET_LOG(Error, "%s failed at pos %" PRId64, method.c_str(), prefill_pos);
-      return 1;
-    }
-
-#ifdef EXECUTORCH_BUILD_CUDA
-    const auto& out_tensor = result.get()[0].toTensor();
-    cur_token = read_token(out_tensor);
-    // Keep the sampled token on device: alias the output buffer so it feeds
-    // straight into the next forward as the int64 token input (zero copy).
-    device_out_token = make_tensor_ptr(out_tensor);
-#else
-    cur_token = static_cast<uint64_t>(
-        llm::logits_to_token(result.get()[0].toTensor(), temp_val));
-#endif
+  stats.num_prompt_tokens = static_cast<int64_t>(prompt_tokens.size());
+  printf("Prompt tokens: %" PRId64 "\n", stats.num_prompt_tokens);
 
-    prefill_pos += chunk_len;
+  llm::SamplingConfig sampling;
+  sampling.temperature = static_cast<float>(FLAGS_temperature);
+  stats.inference_start_ms = llm::time_in_ms();
+  if (session->prefill_tokens(prompt_tokens, &sampling) != Error::Ok) {
+    ET_LOG(Error, "Prefill failed");
+    return 1;
   }
-
   stats.prompt_eval_end_ms = llm::time_in_ms();
-  // First generated token came from the last prefill chunk; TTFT is prefill.
   stats.first_token_ms = stats.prompt_eval_end_ms;
 
-#ifdef EXECUTORCH_BUILD_CUDA
-  cudaDeviceSynchronize();
-#endif
-
-  // Print the first generated token (from the last prefill chunk).
-  // Use the last prompt token as the streaming-decode prefix so any BPE
-  // partial-character handling stays correct.
-  {
-    auto first_str = tokenizer->decode(prompt_tokens.back(), cur_token);
-    if (first_str.ok()) {
-      printf("%s", first_str->c_str());
-      fflush(stdout);
-    }
-  }
-
-  // ---------------------------------------------------------------
-  // Decode loop
-  // ---------------------------------------------------------------
-  int64_t pos = num_prompt_tokens;
-  std::vector<int64_t> decode_pos_data = {pos};
-  auto decode_pos_cpu = from_blob(
-      decode_pos_data.data(), {1}, executorch::aten::ScalarType::Long);
-#ifdef EXECUTORCH_BUILD_CUDA
-  // Fixed device-resident position input slot: the decode method always reads
-  // the position from this same address every step (cuda-graph-safe). Seeded
-  // once here with a one-time H2D; refreshed each step by an on-device D2D.
-  auto decode_pos = clone_tensor_ptr_to(decode_pos_cpu, cuda_device);
-  // Upload the FULL decode position array to device ONCE (a single H2D - the
-  // one-time copy we keep). Each step copies its position from here into the
-  // fixed slot with a device-to-device copy, so there is NO per-round pos H2D.
-  std::vector<int64_t> pos_seq_data(FLAGS_max_new_tokens);
-  for (int32_t i = 0; i < FLAGS_max_new_tokens; i++) {
-    pos_seq_data[i] = num_prompt_tokens + i;
-  }
-  auto pos_seq_dev = clone_tensor_ptr_to(
-      from_blob(
-          pos_seq_data.data(),
-          {S(FLAGS_max_new_tokens)},
-          executorch::aten::ScalarType::Long),
-      cuda_device);
-  auto* pos_seq_dev_ptr =
-      static_cast<int64_t*>(pos_seq_dev->mutable_data_ptr());
-  auto* decode_pos_slot_ptr =
-      static_cast<int64_t*>(decode_pos->mutable_data_ptr());
-#else
-  // Non-CUDA (MLX) path: keep host token/pos buffers; the backend stages them
-  // and the host samples from the returned logits.
-  std::vector<int64_t> decode_token_data = {static_cast<int64_t>(cur_token)};
-  auto decode_tokens = from_blob(
-      decode_token_data.data(), {1, 1}, executorch::aten::ScalarType::Long);
-  auto decode_pos = decode_pos_cpu;
-#endif
-
-  uint64_t prev_token = cur_token;
-  bool hit_eos = eos_ids.find(cur_token) != eos_ids.end();
-  for (int32_t step = 0; step < FLAGS_max_new_tokens && !hit_eos; step++) {
-#ifdef EXECUTORCH_BUILD_CUDA
-    // No per-round H2D: copy this step's position from the pre-uploaded device
-    // position array into the fixed position slot with an on-device D2D. With
-    // the token aliased on device (Option A) and the position staged via D2D,
-    // the per-round HtoD count is zero (independent of decode length).
-    // cudaMemcpy D2D is host-synchronous, so the slot is updated before the
-    // decode kernels read it; with cuda graph enabled this becomes a captured
-    // cudaMemcpyAsync on the decode stream into this same fixed slot.
-    ET_CHECK_MSG(
-        cudaMemcpy(
-            decode_pos_slot_ptr,
-            pos_seq_dev_ptr + step,
-            sizeof(int64_t),
-            cudaMemcpyDeviceToDevice) == cudaSuccess,
-        "Failed to copy decode position D2D");
-#else
-    decode_pos_data[0] = pos;
-    decode_token_data[0] = static_cast<int64_t>(cur_token);
-#endif
-
-    std::vector<EValue> inputs;
-#ifdef EXECUTORCH_BUILD_CUDA
-    inputs.push_back(EValue(device_out_token));
-#else
-    inputs.push_back(EValue(decode_tokens));
-#endif
-    inputs.push_back(EValue(decode_pos));
-
-#ifdef EXECUTORCH_BUILD_CUDA
-    inputs.push_back(EValue(temp_tensor));
-    auto result = module->execute("decode", inputs);
-#else
-    auto result = module->execute("forward", inputs);
-#endif
-
-    if (result.error() != Error::Ok) {
-      ET_LOG(Error, "Decode step %d failed", step);
+  int64_t generated = 0;
+  for (; generated < FLAGS_max_new_tokens; ++generated) {
+    auto step = session->decode_one(sampling);
+    if (step.error() != Error::Ok) {
+      ET_LOG(Error, "Decode failed");
       return 1;
     }
-
-    prev_token = cur_token;
-#ifdef EXECUTORCH_BUILD_CUDA
-    const auto& out_tensor = result.get()[0].toTensor();
-    cur_token = read_token(out_tensor);
-    // Alias this step's on-device output token as the next step's token input.
-    device_out_token = make_tensor_ptr(out_tensor);
-#else
-    cur_token = static_cast<uint64_t>(
-        llm::logits_to_token(result.get()[0].toTensor(), temp_val));
-#endif
-    pos++;
-
-    auto decode_str = tokenizer->decode(prev_token, cur_token);
-    if (decode_str.ok()) {
-      printf("%s", decode_str->c_str());
-      fflush(stdout);
+    const auto& d = step.get();
+    if (d.is_terminal) {
+      break;
     }
-
-    hit_eos = eos_ids.find(cur_token) != eos_ids.end();
+    printf("%s", d.text_piece.c_str());
+    fflush(stdout);
   }
   printf("\n");
 
   stats.inference_end_ms = llm::time_in_ms();
-  stats.num_generated_tokens = pos - num_prompt_tokens;
+  stats.num_generated_tokens = generated;
 
 #ifdef EXECUTORCH_BUILD_CUDA
   cudaMemGetInfo(&gpu_free_bytes, &gpu_total_bytes);
diff --git a/examples/models/gemma4_31b/serve.py b/examples/models/gemma4_31b/serve.py
new file mode 100644
index 00000000000..b99fae08c9c
--- /dev/null
+++ b/examples/models/gemma4_31b/serve.py
@@ -0,0 +1,214 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""OpenAI-compatible HTTP server for Gemma 4 31B.
+
+The server process is Python control plane only. It handles HTTP, Gemma chat
+templating, tool parsing, and request validation. Model execution lives in the
+C++ `gemma4_31b_worker` process and is driven over the generic
+`examples/llm_server` JSONL protocol.
+"""
+
+import argparse
+import logging
+import os
+import re
+from pathlib import Path
+
+from executorch.examples.llm_server.python.chat_template import ChatTemplate
+from executorch.examples.llm_server.python.server import build_app
+from executorch.examples.llm_server.python.serving_chat import ServingChat
+from executorch.examples.llm_server.python.session_runtime import SessionRuntime
+from executorch.examples.llm_server.python.tool_parsers import (
+    GemmaToolCallDetector,
+    HermesDetector,
+    QwenFunctionCallDetector,
+)
+from executorch.examples.llm_server.python.worker_client import spawn_worker
+
+logger = logging.getLogger(__name__)
+
+_GEMMA_CHANNEL_SPECIALS = {"<|channel>", "<channel|>", "<|think|>"}
+_GEMMA_CHANNEL_BLOCK = re.compile(r"<\|channel>.*?<channel\|>", re.DOTALL)
+
+
+def _strip_gemma_channels(text: str) -> str:
+    text = _GEMMA_CHANNEL_BLOCK.sub("", text)
+    open_idx = text.find("<|channel>")
+    if open_idx != -1:
+        text = text[:open_idx]
+    return text.replace("<channel|>", "").replace("<|think|>", "").strip()
+
+
+def _repo_root() -> Path:
+    starts = [Path(__file__).resolve(), Path(__file__).absolute(), Path.cwd().resolve()]
+    seen: set[Path] = set()
+    for start in starts:
+        cur = start if start.is_dir() else start.parent
+        for path in (cur, *cur.parents):
+            if path in seen:
+                continue
+            seen.add(path)
+            if (path / "CMakeLists.txt").exists() and (
+                path / "examples" / "models" / "gemma4_31b"
+            ).is_dir():
+                return path
+    raise RuntimeError(
+        "Could not locate the ExecuTorch source checkout; pass --worker-bin "
+        "explicitly."
+    )
+
+
+def _default_worker_bin() -> str:
+    return str(
+        _repo_root()
+        / "cmake-out"
+        / "examples"
+        / "models"
+        / "gemma4_31b"
+        / "gemma4_31b_worker"
+    )
+
+
+def _spawn(args):
+    env = dict(os.environ)
+    conda = os.environ.get("CONDA_PREFIX")
+    if conda:
+        existing = env.get("LD_LIBRARY_PATH", "")
+        env["LD_LIBRARY_PATH"] = (
+            f"{conda}/lib:{existing}" if existing else f"{conda}/lib"
+        )
+    worker_bin = args.worker_bin or _default_worker_bin()
+    cmd = [
+        worker_bin,
+        "--model_path",
+        args.model_path,
+        "--tokenizer_path",
+        args.tokenizer_path,
+        "--max_sessions",
+        str(args.max_sessions),
+        f"--warm_resume={'true' if args.warm_resume else 'false'}",
+        "--bos_id",
+        str(args.bos_id),
+        "--eos_id",
+        str(args.eos_id),
+    ]
+    if args.data_path:
+        cmd += ["--data_path", args.data_path]
+    logger.info("Starting Gemma4 31B worker subprocess (loads the model once).")
+    return spawn_worker(cmd, env=env)
+
+
+def _tool_detector(name: str):
+    if name == "gemma":
+        return GemmaToolCallDetector
+    if name == "hermes":
+        return HermesDetector
+    if name == "qwen":
+        return QwenFunctionCallDetector
+    if name == "none":
+        return None
+    raise ValueError(f"unknown tool parser: {name}")
+
+
+def build_app_from_args(args):
+    template = ChatTemplate(
+        args.hf_tokenizer,
+        assistant_header="<|turn>model\n",
+        strip_rendered_bos=True,
+        append_generation_prompt_after_tool_response=True,
+    )
+    worker = _spawn(args)
+    runtime = SessionRuntime(worker)
+    serving = ServingChat(
+        runtime,
+        template,
+        args.model_id,
+        max_context=args.max_context,
+        tool_detector_cls=_tool_detector(args.tool_parser),
+        prompt_token_offset=1,
+        content_filter=_strip_gemma_channels,
+        content_filter_specials=_GEMMA_CHANNEL_SPECIALS,
+    )
+    app = build_app(serving, args.model_id)
+
+    @app.on_event("shutdown")
+    def _stop_worker():
+        runtime.close_worker()
+
+    return app, args.model_id
+
+
+def main() -> None:
+    p = argparse.ArgumentParser(
+        description="OpenAI-compatible LLM server for Gemma 4 31B"
+    )
+    p.add_argument("--model-path", required=True, help="Path to the .pte model")
+    p.add_argument("--data-path", default=None, help="Path to the delegate blob")
+    p.add_argument("--tokenizer-path", required=True, help="Path to tokenizer.json")
+    p.add_argument(
+        "--hf-tokenizer",
+        required=True,
+        help="HF tokenizer id/dir for the model's chat template",
+    )
+    p.add_argument("--model-id", default="gemma4_31b")
+    p.add_argument("--host", default="127.0.0.1")
+    p.add_argument("--port", type=int, default=8000)
+    p.add_argument("--max-context", type=int, default=None)
+    p.add_argument(
+        "--num-runners",
+        type=int,
+        default=1,
+        help="Worker processes. 1 only; more would duplicate the weights.",
+    )
+    p.add_argument(
+        "--max-sessions",
+        type=int,
+        default=1,
+        help="Physical sessions the worker can host on one weight load. One "
+        "slot is reserved for anonymous requests, so addressable named "
+        "sessions are max-sessions - 1.",
+    )
+    p.add_argument(
+        "--warm-resume",
+        action=argparse.BooleanOptionalAction,
+        default=True,
+        help="Warm append-only resume for named sessions when available.",
+    )
+    p.add_argument(
+        "--tool-parser",
+        choices=("gemma", "hermes", "qwen", "none"),
+        default="gemma",
+        help="Tool-call format parser to apply to model output.",
+    )
+    p.add_argument(
+        "--bos-id",
+        type=int,
+        default=2,
+        help="BOS token id to prepend in the worker. The launcher strips the "
+        "HF template's literal BOS before C++ tokenization.",
+    )
+    p.add_argument("--eos-id", type=int, default=1)
+    p.add_argument(
+        "--worker-bin",
+        default=None,
+        help="Path to gemma4_31b_worker. Defaults to the CMake output path.",
+    )
+    args = p.parse_args()
+    logging.basicConfig(level=logging.INFO)
+
+    if args.num_runners != 1:
+        p.error("Only 1 worker process is supported; more would duplicate weights.")
+
+    app, _ = build_app_from_args(args)
+
+    import uvicorn
+
+    uvicorn.run(app, host=args.host, port=args.port)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/models/gemma4_31b/test_gemma4_31b_nobleed.cpp b/examples/models/gemma4_31b/test_gemma4_31b_nobleed.cpp
new file mode 100644
index 00000000000..3eb48009807
--- /dev/null
+++ b/examples/models/gemma4_31b/test_gemma4_31b_nobleed.cpp
@@ -0,0 +1,261 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// CUDA integration proof for one loaded Gemma4 31B model with isolated
+// sessions. Set GEMMA_MODEL_PATH, GEMMA_DATA_PATH, and GEMMA_TOKENIZER_PATH to
+// run it.
+
+#include <executorch/examples/models/gemma4_31b/gemma4_31b_engine.h>
+
+#include <cuda_runtime.h>
+
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace llm = ::executorch::extension::llm;
+using ::executorch::runtime::Error;
+
+namespace {
+
+int g_failures = 0;
+
+void check(const std::string& name, bool ok) {
+  printf("  [%s] %s\n", ok ? "PASS" : "FAIL", name.c_str());
+  if (!ok) {
+    ++g_failures;
+  }
+}
+
+const char* env(const char* k) {
+  const char* v = std::getenv(k);
+  return (v && *v) ? v : nullptr;
+}
+
+std::vector<uint64_t> encode_prompt(
+    llm::Gemma4_31BEngine& engine,
+    const std::string& prompt) {
+  const std::string rendered = "<|turn>user\n" + prompt +
+      "<turn|>\n<|turn>model\n"
+      "<|channel>thought\n<channel|>";
+  auto encoded = engine.tokenizer()->encode(rendered, /*bos=*/0, /*eos=*/0);
+  if (!encoded.ok()) {
+    return {};
+  }
+  std::vector<uint64_t> ids;
+  ids.reserve(encoded->size() + 1);
+  ids.push_back(2);
+  ids.insert(ids.end(), encoded->begin(), encoded->end());
+  return ids;
+}
+
+std::vector<uint64_t> encode_prompt_at_least(
+    llm::Gemma4_31BEngine& engine,
+    const std::string& seed,
+    const std::string& filler,
+    size_t min_tokens) {
+  std::string prompt = seed;
+  std::vector<uint64_t> ids = encode_prompt(engine, prompt);
+  while (!ids.empty() && ids.size() < min_tokens) {
+    prompt.append(" ");
+    prompt.append(filler);
+    ids = encode_prompt(engine, prompt);
+  }
+  return ids;
+}
+
+std::vector<uint64_t> decode_tokens(
+    llm::LLMSession& session,
+    const std::vector<uint64_t>& prompt,
+    int n) {
+  llm::SamplingConfig sampling;
+  std::vector<uint64_t> out;
+  if (session.prefill_tokens(prompt, &sampling) != Error::Ok) {
+    return out;
+  }
+  for (int i = 0; i < n; ++i) {
+    auto step = session.decode_one(sampling);
+    if (step.error() != Error::Ok || step.get().is_terminal) {
+      break;
+    }
+    out.push_back(step.get().token_id);
+  }
+  return out;
+}
+
+void run_no_bleed_case(
+    llm::Gemma4_31BEngine& engine,
+    const std::string& label,
+    const std::vector<uint64_t>& prompt_a,
+    const std::vector<uint64_t>& prompt_b,
+    int decode_steps) {
+  printf(
+      "\n%s (A=%zu tokens, B=%zu tokens)\n",
+      label.c_str(),
+      prompt_a.size(),
+      prompt_b.size());
+  check(label + ": prompts encoded", !prompt_a.empty() && !prompt_b.empty());
+
+  std::vector<uint64_t> solo;
+  auto solo_session = engine.create_session();
+  check(label + ": create solo session", solo_session.error() == Error::Ok);
+  if (solo_session.error() == Error::Ok) {
+    solo = decode_tokens(*solo_session.get(), prompt_a, decode_steps);
+  }
+  check(label + ": solo produced tokens", !solo.empty());
+
+  std::vector<uint64_t> interleaved_a;
+  std::vector<uint64_t> interleaved_b;
+  auto a_result = engine.create_session();
+  auto b_result = engine.create_session();
+  check(
+      label + ": create interleaved sessions",
+      a_result.error() == Error::Ok && b_result.error() == Error::Ok);
+  if (a_result.error() == Error::Ok && b_result.error() == Error::Ok) {
+    auto a = std::move(a_result.get());
+    auto b = std::move(b_result.get());
+    llm::SamplingConfig sampling;
+    bool ok = a->prefill_tokens(prompt_a, &sampling) == Error::Ok &&
+        b->prefill_tokens(prompt_b, &sampling) == Error::Ok;
+    check(label + ": interleaved prefills", ok);
+    bool a_done = false;
+    bool b_done = false;
+    for (int i = 0; i < decode_steps && ok; ++i) {
+      if (!a_done) {
+        auto step = a->decode_one(sampling);
+        if (step.error() != Error::Ok || step.get().is_terminal) {
+          a_done = true;
+        } else {
+          interleaved_a.push_back(step.get().token_id);
+        }
+      }
+      if (!b_done) {
+        auto step = b->decode_one(sampling);
+        if (step.error() != Error::Ok || step.get().is_terminal) {
+          b_done = true;
+        } else {
+          interleaved_b.push_back(step.get().token_id);
+        }
+      }
+    }
+  }
+
+  check(
+      label + ": A interleaved == A solo (bit-identical)",
+      interleaved_a == solo);
+  check(
+      label + ": B ran a distinct conversation",
+      !interleaved_b.empty() && interleaved_b != solo);
+}
+
+int64_t gpu_free() {
+  size_t free = 0;
+  size_t total = 0;
+  return cudaMemGetInfo(&free, &total) == cudaSuccess
+      ? static_cast<int64_t>(free)
+      : -1;
+}
+
+} // namespace
+
+int main() {
+  const char* model = env("GEMMA_MODEL_PATH");
+  const char* tokenizer = env("GEMMA_TOKENIZER_PATH");
+  if (!model || !tokenizer) {
+    printf(
+        "SKIP: integration proof needs GEMMA_MODEL_PATH / "
+        "GEMMA_TOKENIZER_PATH (+ GEMMA_DATA_PATH) on a CUDA box.\n");
+    return 0;
+  }
+
+  llm::Gemma4_31BConfig config;
+  config.model_path = model;
+  config.data_path = env("GEMMA_DATA_PATH") ? env("GEMMA_DATA_PATH") : "";
+  config.tokenizer_path = tokenizer;
+  config.max_sessions = 4;
+
+  auto engine_result = llm::Gemma4_31BEngine::create(config);
+  if (engine_result.error() != Error::Ok) {
+    printf("SKIP: engine create failed (no CUDA device / bad paths).\n");
+    return 0;
+  }
+  auto engine = std::move(engine_result.get());
+
+  printf("Gemma4 31B no-bleed integration proof:\n");
+  auto prompt_a = encode_prompt(*engine, "List three colors.");
+  auto prompt_b = encode_prompt(*engine, "Name two countries in Europe.");
+  run_no_bleed_case(*engine, "short-context", prompt_a, prompt_b, 24);
+
+  auto long_prompt_a = encode_prompt_at_least(
+      *engine,
+      "After reading these notes, answer with one concise sentence.",
+      "alpha beta gamma delta epsilon zeta eta theta iota kappa",
+      1152);
+  auto long_prompt_b = encode_prompt_at_least(
+      *engine,
+      "After reading this inventory, answer with one concise sentence.",
+      "red orange yellow green blue indigo violet black white silver",
+      1152);
+  run_no_bleed_case(
+      *engine,
+      "long-context-crosses-sliding-window",
+      long_prompt_a,
+      long_prompt_b,
+      8);
+
+  const int64_t est = engine->serving_capacity().estimated_bytes_per_session;
+  int64_t free_before = gpu_free();
+  {
+    auto extra_result = engine->create_session();
+    if (extra_result.error() == Error::Ok) {
+      auto extra = std::move(extra_result.get());
+      llm::SamplingConfig sampling;
+      extra->prefill_tokens(prompt_a, &sampling);
+      int64_t free_after = gpu_free();
+      if (free_before > 0 && free_after > 0) {
+        const int64_t delta = free_before - free_after;
+        printf(
+            "    extra-session GPU delta=%lld bytes (est/session=%lld)\n",
+            static_cast<long long>(delta),
+            static_cast<long long>(est));
+        check(
+            "extra session is state-sized, not another model load",
+            delta > 0 && delta < (16LL << 30));
+        if (est > 0) {
+          check(
+              "memory delta within 2x of estimated_bytes_per_session",
+              delta <= est * 2 + (512LL << 20));
+        }
+      }
+    }
+  }
+
+  std::vector<std::unique_ptr<llm::LLMSession>> held;
+  while (true) {
+    auto session = engine->create_session();
+    if (session.error() != Error::Ok) {
+      break;
+    }
+    held.push_back(std::move(session.get()));
+    if (held.size() > static_cast<size_t>(config.max_sessions) + 2) {
+      break;
+    }
+  }
+  check(
+      "capacity enforced: create_session fails past max_sessions",
+      held.size() <= static_cast<size_t>(config.max_sessions));
+
+  printf(
+      "\n%s (%d failure(s))\n",
+      g_failures ? "FAILURES" : "ALL PASS",
+      g_failures);
+  return g_failures ? 1 : 0;
+}
diff --git a/examples/models/gemma4_31b/test_ondevice_serving.py b/examples/models/gemma4_31b/test_ondevice_serving.py
new file mode 100644
index 00000000000..865a5c30045
--- /dev/null
+++ b/examples/models/gemma4_31b/test_ondevice_serving.py
@@ -0,0 +1,62 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import json
+import os
+import urllib.request
+
+import pytest
+
+from executorch.examples.llm_server.python.chat_template import ChatTemplate
+from executorch.examples.llm_server.python.protocol import ChatMessage
+
+_SERVER = os.environ.get("GEMMA_SERVER_URL")
+_HF_DIR = os.environ.get("GEMMA_HF_DIR")
+
+pytestmark = pytest.mark.skipif(
+    not _SERVER or not _HF_DIR or not os.path.isdir(_HF_DIR),
+    reason="set GEMMA_SERVER_URL and GEMMA_HF_DIR to run Gemma on-device tests",
+)
+
+
+def _post(path: str, payload: dict) -> dict:
+    data = json.dumps(payload).encode("utf-8")
+    req = urllib.request.Request(
+        _SERVER.rstrip("/") + path,
+        data=data,
+        headers={"Content-Type": "application/json"},
+        method="POST",
+    )
+    with urllib.request.urlopen(req, timeout=120) as resp:
+        return json.loads(resp.read().decode("utf-8"))
+
+
+def test_prompt_tokens_match_real_template_with_numeric_bos_prefix():
+    pytest.importorskip("transformers")
+    from transformers import AutoTokenizer
+
+    template = ChatTemplate(
+        _HF_DIR,
+        assistant_header="<|turn>model\n",
+        strip_rendered_bos=True,
+        append_generation_prompt_after_tool_response=True,
+    )
+    tok = AutoTokenizer.from_pretrained(_HF_DIR)
+    messages = [ChatMessage(role="user", content="Say ok.")]
+    rendered = template.render(messages)
+    expected_ids = [tok.bos_token_id] + tok.encode(rendered, add_special_tokens=False)
+
+    body = _post(
+        "/v1/chat/completions",
+        {
+            "model": "gemma4_31b",
+            "messages": [{"role": "user", "content": "Say ok."}],
+            "max_tokens": 1,
+            "temperature": 0,
+            "session_id": "gemma-bos-regression",
+        },
+    )
+    assert body["usage"]["prompt_tokens"] == len(expected_ids)
diff --git a/examples/models/gemma4_31b/test_serve.py b/examples/models/gemma4_31b/test_serve.py
new file mode 100644
index 00000000000..98f9cde0520
--- /dev/null
+++ b/examples/models/gemma4_31b/test_serve.py
@@ -0,0 +1,184 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Hermetic tests for the Gemma 4 31B OpenAI serving launcher."""
+
+import pathlib
+from types import SimpleNamespace
+
+import pytest
+
+from executorch.examples.models.gemma4_31b import serve
+
+_HERE = pathlib.Path(serve.__file__).resolve().parent
+_REPO_ROOT = serve._repo_root()
+
+
+def test_generic_server_does_not_reference_gemma4_31b():
+    server_dir = _REPO_ROOT / "examples/llm_server"
+    offenders = []
+    for p in server_dir.rglob("*.py"):
+        text = p.read_text()
+        if "gemma4_31b" in text or "Gemma4_31B" in text:
+            offenders.append(p)
+    assert offenders == [], f"generic server must not reference Gemma: {offenders}"
+
+
+def test_control_plane_runs_no_model_code():
+    serve_src = (_HERE / "serve.py").read_text()
+    assert "Gemma4_31BEngine" not in serve_src
+    worker_src = (_HERE / "gemma4_31b_worker.cpp").read_text()
+    assert "Gemma4_31BEngine" in worker_src
+
+
+def test_python_worker_and_pybind_are_absent():
+    assert not (_HERE / "worker.py").exists()
+    assert not (_HERE / "gemma4_31b_pybindings.cpp").exists()
+
+
+def test_spawn_builds_worker_command(monkeypatch):
+    captured = {}
+
+    def fake_spawn(cmd, env=None):
+        captured["cmd"] = cmd
+        captured["env"] = env
+        return object()
+
+    monkeypatch.setattr(serve, "spawn_worker", fake_spawn)
+    serve._spawn(
+        SimpleNamespace(
+            worker_bin="/bin/gemma_worker",
+            model_path="m.pte",
+            tokenizer_path="t.json",
+            data_path="d.ptd",
+            max_sessions=4,
+            warm_resume=True,
+            bos_id=2,
+            eos_id=1,
+        )
+    )
+    assert captured["cmd"] == [
+        "/bin/gemma_worker",
+        "--model_path",
+        "m.pte",
+        "--tokenizer_path",
+        "t.json",
+        "--max_sessions",
+        "4",
+        "--warm_resume=true",
+        "--bos_id",
+        "2",
+        "--eos_id",
+        "1",
+        "--data_path",
+        "d.ptd",
+    ]
+
+
+def test_spawn_defaults_worker_bin_and_omits_empty_data_path(monkeypatch):
+    captured = {}
+    monkeypatch.setattr(
+        serve, "spawn_worker", lambda cmd, env=None: captured.update(cmd=cmd)
+    )
+    serve._spawn(
+        SimpleNamespace(
+            worker_bin=None,
+            model_path="m.pte",
+            tokenizer_path="t.json",
+            data_path=None,
+            max_sessions=1,
+            warm_resume=False,
+            bos_id=2,
+            eos_id=1,
+        )
+    )
+    cmd = captured["cmd"]
+    assert cmd[0].endswith("gemma4_31b_worker")
+    assert "--data_path" not in cmd
+    assert "--warm_resume=false" in cmd
+
+
+def test_build_app_uses_gemma_options(monkeypatch):
+    captured = {}
+
+    class _FakeTemplate:
+        def __init__(self, *args, **kwargs):
+            captured["template_kwargs"] = kwargs
+
+    class _FakeRuntime:
+        def close_worker(self):
+            pass
+
+    class _FakeApp:
+        def on_event(self, event):
+            captured["event"] = event
+            return lambda fn: fn
+
+    monkeypatch.setattr(serve, "ChatTemplate", _FakeTemplate)
+    monkeypatch.setattr(serve, "_spawn", lambda args: object())
+    monkeypatch.setattr(serve, "SessionRuntime", lambda worker: _FakeRuntime())
+
+    def fake_serving(*args, **kwargs):
+        captured["serving_kwargs"] = kwargs
+        return object()
+
+    monkeypatch.setattr(serve, "ServingChat", fake_serving)
+    monkeypatch.setattr(serve, "build_app", lambda serving, model_id: _FakeApp())
+    serve.build_app_from_args(
+        SimpleNamespace(
+            hf_tokenizer="hf",
+            model_id="gemma4_31b",
+            max_context=10,
+            tool_parser="gemma",
+        )
+    )
+
+    assert captured["template_kwargs"] == {
+        "assistant_header": "<|turn>model\n",
+        "strip_rendered_bos": True,
+        "append_generation_prompt_after_tool_response": True,
+    }
+    assert (
+        captured["serving_kwargs"]["tool_detector_cls"] is serve.GemmaToolCallDetector
+    )
+    assert captured["serving_kwargs"]["prompt_token_offset"] == 1
+    assert captured["serving_kwargs"]["content_filter"] is serve._strip_gemma_channels
+    assert captured["event"] == "shutdown"
+
+
+def test_strip_gemma_channels_returns_visible_answer():
+    text = "<|channel>thought\nscratch work\n<channel|>The answer."
+    assert serve._strip_gemma_channels(text) == "The answer."
+
+
+def test_strip_gemma_channels_cuts_unclosed_channel():
+    assert serve._strip_gemma_channels("Lead <|channel>thought") == "Lead"
+
+
+def test_strip_gemma_channels_removes_stray_close():
+    assert serve._strip_gemma_channels("Visible<channel|>") == "Visible"
+
+
+def test_rejects_multiple_runners(monkeypatch):
+    import sys
+
+    monkeypatch.setattr(
+        sys,
+        "argv",
+        [
+            "serve.py",
+            "--model-path",
+            "m.pte",
+            "--tokenizer-path",
+            "t.json",
+            "--hf-tokenizer",
+            "hf",
+            "--num-runners",
+            "2",
+        ],
+    )
+    with pytest.raises(SystemExit):
+        serve.main()