Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 9 additions & 6 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -127,8 +127,8 @@ help:
@echo " llava-cpu - Build Llava runner with CPU backend"
@echo " gemma3-cuda - Build Gemma3 runner with CUDA backend"
@echo " gemma3-cpu - Build Gemma3 runner with CPU backend"
@echo " gemma4_31b-cuda - Build Gemma 4 31B runner with CUDA backend"
@echo " gemma4_31b-mlx - Build Gemma 4 31B runner with MLX backend"
@echo " gemma4_31b-cuda - Build Gemma 4 31B runner and worker with CUDA backend"
@echo " gemma4_31b-mlx - Build Gemma 4 31B runner and worker with MLX backend"
@echo " qwen3_5_moe-cuda - Build Qwen3.5 MoE runner with CUDA backend"
@echo " qwen3_5_moe-metal - Build Qwen3.5 MoE runner with Metal backend"
@echo " qwen3_5_moe-mlx - Build Qwen3.5 MoE runner with MLX backend"
Expand Down Expand Up @@ -444,20 +444,23 @@ qwen3_5_moe-cuda:
gemma4_31b-cuda:
@echo "==> Building and installing ExecuTorch with CUDA..."
cmake --workflow --preset llm-release-cuda
@echo "==> Building Gemma 4 31B runner with CUDA..."
@echo "==> Building Gemma 4 31B runner, worker, and no-bleed test with CUDA..."
cd examples/models/gemma4_31b && cmake --workflow --preset gemma4-31b-cuda
@echo ""
@echo "✓ Build complete!"
@echo " Binary: cmake-out/examples/models/gemma4_31b/gemma4_31b_runner"
@echo " Runner: cmake-out/examples/models/gemma4_31b/gemma4_31b_runner"
@echo " Worker: cmake-out/examples/models/gemma4_31b/gemma4_31b_worker"
@echo " Test: cmake-out/examples/models/gemma4_31b/test_gemma4_31b_nobleed"

gemma4_31b-mlx:
@echo "==> Building and installing ExecuTorch with MLX..."
cmake --workflow --preset mlx-release
@echo "==> Building Gemma 4 31B runner with MLX..."
@echo "==> Building Gemma 4 31B runner and worker with MLX..."
cd examples/models/gemma4_31b && cmake --workflow --preset gemma4-31b-mlx
@echo ""
@echo "✓ Build complete!"
@echo " Binary: cmake-out/examples/models/gemma4_31b/gemma4_31b_runner"
@echo " Runner: cmake-out/examples/models/gemma4_31b/gemma4_31b_runner"
@echo " Worker: cmake-out/examples/models/gemma4_31b/gemma4_31b_worker"

qwen3_5_moe-metal:
@echo "==> Building and installing ExecuTorch with Metal..."
Expand Down
32 changes: 30 additions & 2 deletions examples/models/gemma4_31b/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)

set(_common_include_directories ${EXECUTORCH_ROOT}/..)
set(_json_include
${EXECUTORCH_ROOT}/extension/llm/tokenizers/third-party/json/single_include
)

# gflags
set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/gflags)
Expand Down Expand Up @@ -51,26 +54,51 @@ if(EXECUTORCH_BUILD_CUDA)
elseif(TARGET mlxdelegate)
list(APPEND link_libraries mlxdelegate mlx)
executorch_target_link_options_shared_lib(mlxdelegate)
add_compile_definitions(EXECUTORCH_BUILD_MLX)
else()
message(FATAL_ERROR "Set EXECUTORCH_BUILD_CUDA=ON or EXECUTORCH_BUILD_MLX=ON")
endif()

# Tokenizer (HuggingFace tokenizer.json)
list(APPEND link_libraries tokenizers::tokenizers)

add_executable(gemma4_31b_runner main.cpp)
add_executable(gemma4_31b_runner main.cpp gemma4_31b_engine.cpp)
target_include_directories(
gemma4_31b_runner PUBLIC ${_common_include_directories}
gemma4_31b_runner PUBLIC ${_common_include_directories} ${_json_include}
)
target_link_libraries(gemma4_31b_runner PUBLIC ${link_libraries})

# Worker executable: compiled from its own entry point plus the engine source
# that the runner target also lists, using the same include paths and link set.
add_executable(
  gemma4_31b_worker
  gemma4_31b_worker.cpp
  gemma4_31b_engine.cpp
)
target_include_directories(
  gemma4_31b_worker
  PUBLIC
    ${_common_include_directories}
    ${_json_include}
)
target_link_libraries(
  gemma4_31b_worker
  PUBLIC
    ${link_libraries}
)

# For every build type other than Debug, apply the same link-time treatment to
# both executables: target_link_options_gc_sections() (a helper, presumably
# from the included Utils.cmake, that enables section garbage collection --
# confirm there), then, outside Apple and MSVC toolchains, pass -s to the
# linker to strip symbols.
if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
  foreach(_gemma4_31b_bin IN ITEMS gemma4_31b_runner gemma4_31b_worker)
    target_link_options_gc_sections(${_gemma4_31b_bin})
    if(NOT (APPLE OR MSVC))
      target_link_options(${_gemma4_31b_bin} PRIVATE "LINKER:-s")
    endif()
  endforeach()
endif()

if(TARGET mlxdelegate)
executorch_target_copy_mlx_metallib(gemma4_31b_runner)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this repeated twice?

executorch_target_copy_mlx_metallib(gemma4_31b_worker)
endif()

# CUDA builds only: compile the no-bleed test executable (its own source plus
# the shared engine source) and register it with CTest as gemma4_31b_nobleed.
if(EXECUTORCH_BUILD_CUDA)
  enable_testing()

  add_executable(
    test_gemma4_31b_nobleed
    test_gemma4_31b_nobleed.cpp
    gemma4_31b_engine.cpp
  )
  target_include_directories(
    test_gemma4_31b_nobleed
    PUBLIC
      ${_common_include_directories}
      ${_json_include}
  )
  target_link_libraries(
    test_gemma4_31b_nobleed
    PUBLIC
      ${link_libraries}
  )

  # COMMAND names the executable target, so CMake substitutes the path of the
  # built binary when the test runs.
  add_test(
    NAME gemma4_31b_nobleed
    COMMAND test_gemma4_31b_nobleed
  )
endif()
20 changes: 12 additions & 8 deletions examples/models/gemma4_31b/CMakePresets.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
},
{
"name": "gemma4-31b-cuda",
"displayName": "Gemma 4 31B runner (CUDA)",
"displayName": "Gemma 4 31B runner and worker (CUDA)",
"inherits": ["gemma4-31b-base"],
"cacheVariables": {
"EXECUTORCH_BUILD_CUDA": "ON"
Expand All @@ -26,7 +26,7 @@
},
{
"name": "gemma4-31b-mlx",
"displayName": "Gemma 4 31B runner (MLX)",
"displayName": "Gemma 4 31B runner and worker (MLX)",
"inherits": ["gemma4-31b-base"],
"cacheVariables": {},
"condition": {
Expand All @@ -39,21 +39,25 @@
"buildPresets": [
{
"name": "gemma4-31b-cuda",
"displayName": "Build Gemma 4 31B runner (CUDA)",
"displayName": "Build Gemma 4 31B runner, worker, and no-bleed test (CUDA)",
"configurePreset": "gemma4-31b-cuda",
"targets": ["gemma4_31b_runner"]
"targets": [
"gemma4_31b_runner",
"gemma4_31b_worker",
"test_gemma4_31b_nobleed"
]
},
{
"name": "gemma4-31b-mlx",
"displayName": "Build Gemma 4 31B runner (MLX)",
"displayName": "Build Gemma 4 31B runner and worker (MLX)",
"configurePreset": "gemma4-31b-mlx",
"targets": ["gemma4_31b_runner"]
"targets": ["gemma4_31b_runner", "gemma4_31b_worker"]
}
],
"workflowPresets": [
{
"name": "gemma4-31b-cuda",
"displayName": "Configure and build Gemma 4 31B runner (CUDA)",
"displayName": "Configure and build Gemma 4 31B runner and worker (CUDA)",
"steps": [
{
"type": "configure",
Expand All @@ -67,7 +71,7 @@
},
{
"name": "gemma4-31b-mlx",
"displayName": "Configure and build Gemma 4 31B runner (MLX)",
"displayName": "Configure and build Gemma 4 31B runner and worker (MLX)",
"steps": [
{
"type": "configure",
Expand Down
65 changes: 63 additions & 2 deletions examples/models/gemma4_31b/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -153,14 +153,17 @@ python examples/models/gemma4_31b/inference.py \
Useful before spending the export+lowering time to confirm the quantized
model produces sensible text.

## Build the runner
## Build the runner and worker

```bash
make gemma4_31b-cuda # Linux — CUDA backend
make gemma4_31b-mlx # macOS — MLX backend (Apple Silicon)
```

The binary lands at `cmake-out/examples/models/gemma4_31b/gemma4_31b_runner`.
The binaries land at:

- `cmake-out/examples/models/gemma4_31b/gemma4_31b_runner`
- `cmake-out/examples/models/gemma4_31b/gemma4_31b_worker`

## Run the .pte

Expand All @@ -179,3 +182,61 @@ Pass `--raw_prompt` to skip template wrapping for pre-formatted input.

For benchmarking, add `--cuda_graph` to capture the decode method in a CUDA
graph (decode is fully static — `T=1`).

## OpenAI-compatible serving harness

The serving path is a test harness for local-agent workflows. Python owns HTTP,
chat templating, request validation, and tool parsing; the C++ worker owns model
loading, prefill/decode, and session state. Use the runner or engine/session API
directly for production integrations.

### CUDA

```bash
python -m executorch.examples.models.gemma4_31b.serve \
--model-path ./gemma4_31b_exports/model.pte \
--data-path ./gemma4_31b_exports/aoti_cuda_blob.ptd \
--tokenizer-path ./gemma4_31b_int4/tokenizer.json \
--hf-tokenizer ./gemma4_31b_int4 \
--model-id gemma4_31b \
--max-context 4096 \
--max-sessions 4 \
--host 127.0.0.1 \
--port 8000
```

### MLX

```bash
python -m executorch.examples.models.gemma4_31b.serve \
--model-path ./gemma4_31b_exports_mlx/model.pte \
--tokenizer-path ./gemma4_31b_int4/tokenizer.json \
--hf-tokenizer ./gemma4_31b_int4 \
--model-id gemma4_31b \
--max-context 4096 \
--max-sessions 4 \
--host 127.0.0.1 \
--port 8000
```

Named sessions share one loaded model, each with its own isolated mutable state,
when the backend supports it. Set `--max-sessions` to 2 or more and send a stable
`session_id` (or one of the supported affinity headers) to enable separate
conversations and warm append-only resume. One capacity slot is reserved for
anonymous requests.

The default parser is Gemma's tool-call format. Use `--tool-parser hermes`,
`--tool-parser qwen`, or `--tool-parser none` if a different prompt/template
emits another format.

### CUDA no-bleed test

The CUDA build also produces `test_gemma4_31b_nobleed`, which validates that
two sessions can interleave prefill/decode on one loaded model without sharing
mutable state:

```bash
GEMMA_MODEL_PATH=gemma4_31b_exports/model.pte \
GEMMA_DATA_PATH=gemma4_31b_exports/aoti_cuda_blob.ptd \
GEMMA_TOKENIZER_PATH=gemma4_31b_int4/tokenizer.json \
cmake-out/examples/models/gemma4_31b/test_gemma4_31b_nobleed
```
9 changes: 9 additions & 0 deletions examples/models/gemma4_31b/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
"""

import argparse
import json
import os

import torch
Expand Down Expand Up @@ -135,6 +136,11 @@ def _pack_for_backend(model: nn.Module, path: str, backend: str) -> None:
# Export + lower


def _mutable_buffer_metadata(model: nn.Module) -> str:
mutable = [name for name, _ in model.named_buffers() if ".kv_cache." in name]
return json.dumps({"version": 1, "mutable_buffers": mutable})


def export_and_lower(
model: Gemma4_31B,
config: Gemma4_31BConfig,
Expand Down Expand Up @@ -181,6 +187,7 @@ def _export_cuda(
import executorch.backends.cuda.quantize_op_dispatch # noqa: F401

materialize_runtime_buffers(model, dtype=torch.bfloat16)
mutable_buffer_metadata = _mutable_buffer_metadata(model)

if use_turboquant:
from executorch.examples.models.gemma4_31b.cuda_source_transformations import (
Expand Down Expand Up @@ -255,6 +262,8 @@ def _export_cuda(
"get_vocab_size": config.vocab_size,
"get_n_layers": config.num_hidden_layers,
"get_max_prefill_chunk": max_prefill,
"get_min_prefill_chunk": 5,

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you pad up to this chunk size? Should this be backend specific? MLX can handle seqlen=1.

"get_mutable_buffer_metadata": mutable_buffer_metadata,
"use_kv_cache": True,
"use_sdpa_with_kv_cache": False,
"enable_dynamic_shape": True,
Expand Down
Loading
Loading