From 2e0ac267b3c32bfe7d5f605d8ff3469c41a9db7b Mon Sep 17 00:00:00 2001 From: Mergen Nachin Date: Tue, 23 Jun 2026 13:42:34 -0700 Subject: [PATCH] gemma4_31b: add OpenAI serving entrypoint Add Gemma4 31B model-specific serving support on top of the shared examples/llm_server harness. This extracts the existing runner flow into a small Gemma4_31BEngine/LLMSession adapter, keeps main.cpp as a thin runner wrapper, and adds a C++ JSONL worker plus Python OpenAI-compatible launcher. The generic server remains model-agnostic; Gemma-specific behavior stays in examples/models/gemma4_31b, including chat-template options, BOS handling, channel cleanup, and Gemma tool-call parsing. Also wire the worker into the existing Gemma CUDA/MLX CMake presets and Makefile targets, document the serving harness usage, and add validation coverage: hermetic launcher tests, an opt-in on-device BOS/template regression test, and a CUDA no-bleed integration proof for interleaved multi-session execution. --- Makefile | 15 +- examples/models/gemma4_31b/CMakeLists.txt | 32 +- examples/models/gemma4_31b/CMakePresets.json | 20 +- examples/models/gemma4_31b/README.md | 65 +- examples/models/gemma4_31b/export.py | 9 + .../models/gemma4_31b/gemma4_31b_engine.cpp | 737 ++++++++++++++++++ .../models/gemma4_31b/gemma4_31b_engine.h | 128 +++ .../models/gemma4_31b/gemma4_31b_worker.cpp | 63 ++ examples/models/gemma4_31b/main.cpp | 448 ++--------- examples/models/gemma4_31b/serve.py | 214 +++++ .../gemma4_31b/test_gemma4_31b_nobleed.cpp | 261 +++++++ .../gemma4_31b/test_ondevice_serving.py | 62 ++ examples/models/gemma4_31b/test_serve.py | 184 +++++ 13 files changed, 1846 insertions(+), 392 deletions(-) create mode 100644 examples/models/gemma4_31b/gemma4_31b_engine.cpp create mode 100644 examples/models/gemma4_31b/gemma4_31b_engine.h create mode 100644 examples/models/gemma4_31b/gemma4_31b_worker.cpp create mode 100644 examples/models/gemma4_31b/serve.py create mode 100644 examples/models/gemma4_31b/test_gemma4_31b_nobleed.cpp create mode 100644 examples/models/gemma4_31b/test_ondevice_serving.py create mode 100644 examples/models/gemma4_31b/test_serve.py diff --git a/Makefile b/Makefile index 552bbf89bd7..969b53644cd 100644 --- a/Makefile +++ b/Makefile @@ -127,8 +127,8 @@ help: @echo " llava-cpu - Build Llava runner with CPU backend" @echo " gemma3-cuda - Build Gemma3 runner with CUDA backend" @echo " gemma3-cpu - Build Gemma3 runner with CPU backend" - @echo " gemma4_31b-cuda - Build Gemma 4 31B runner with CUDA backend" - @echo " gemma4_31b-mlx - Build Gemma 4 31B runner with MLX backend" + @echo " gemma4_31b-cuda - Build Gemma 4 31B runner and worker with CUDA backend" + @echo " gemma4_31b-mlx - Build Gemma 4 31B runner and worker with MLX backend" @echo " qwen3_5_moe-cuda - Build Qwen3.5 MoE runner with CUDA backend" @echo " qwen3_5_moe-metal - Build Qwen3.5 MoE runner with Metal backend" @echo " qwen3_5_moe-mlx - Build Qwen3.5 MoE runner with MLX backend" @@ -444,20 +444,23 @@ qwen3_5_moe-cuda: gemma4_31b-cuda: @echo "==> Building and installing ExecuTorch with CUDA..." cmake --workflow --preset llm-release-cuda - @echo "==> Building Gemma 4 31B runner with CUDA..." + @echo "==> Building Gemma 4 31B runner, worker, and no-bleed test with CUDA..." cd examples/models/gemma4_31b && cmake --workflow --preset gemma4-31b-cuda @echo "" @echo "✓ Build complete!" - @echo " Binary: cmake-out/examples/models/gemma4_31b/gemma4_31b_runner" + @echo " Runner: cmake-out/examples/models/gemma4_31b/gemma4_31b_runner" + @echo " Worker: cmake-out/examples/models/gemma4_31b/gemma4_31b_worker" + @echo " Test: cmake-out/examples/models/gemma4_31b/test_gemma4_31b_nobleed" gemma4_31b-mlx: @echo "==> Building and installing ExecuTorch with MLX..." cmake --workflow --preset mlx-release - @echo "==> Building Gemma 4 31B runner with MLX..." + @echo "==> Building Gemma 4 31B runner and worker with MLX..." cd examples/models/gemma4_31b && cmake --workflow --preset gemma4-31b-mlx @echo "" @echo "✓ Build complete!" - @echo " Binary: cmake-out/examples/models/gemma4_31b/gemma4_31b_runner" + @echo " Runner: cmake-out/examples/models/gemma4_31b/gemma4_31b_runner" + @echo " Worker: cmake-out/examples/models/gemma4_31b/gemma4_31b_worker" qwen3_5_moe-metal: @echo "==> Building and installing ExecuTorch with Metal..." diff --git a/examples/models/gemma4_31b/CMakeLists.txt b/examples/models/gemma4_31b/CMakeLists.txt index 52419eb95bc..c740de13c36 100644 --- a/examples/models/gemma4_31b/CMakeLists.txt +++ b/examples/models/gemma4_31b/CMakeLists.txt @@ -15,6 +15,9 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) set(_common_include_directories ${EXECUTORCH_ROOT}/..) +set(_json_include + ${EXECUTORCH_ROOT}/extension/llm/tokenizers/third-party/json/single_include +) # gflags set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/gflags) @@ -51,6 +54,7 @@ if(EXECUTORCH_BUILD_CUDA) elseif(TARGET mlxdelegate) list(APPEND link_libraries mlxdelegate mlx) executorch_target_link_options_shared_lib(mlxdelegate) + add_compile_definitions(EXECUTORCH_BUILD_MLX) else() message(FATAL_ERROR "Set EXECUTORCH_BUILD_CUDA=ON or EXECUTORCH_BUILD_MLX=ON") endif() @@ -58,19 +62,43 @@ endif() # Tokenizer (HuggingFace tokenizer.json) list(APPEND link_libraries tokenizers::tokenizers) -add_executable(gemma4_31b_runner main.cpp) +add_executable(gemma4_31b_runner main.cpp gemma4_31b_engine.cpp) target_include_directories( - gemma4_31b_runner PUBLIC ${_common_include_directories} + gemma4_31b_runner PUBLIC ${_common_include_directories} ${_json_include} ) target_link_libraries(gemma4_31b_runner PUBLIC ${link_libraries}) +add_executable(gemma4_31b_worker gemma4_31b_worker.cpp gemma4_31b_engine.cpp) +target_include_directories( + gemma4_31b_worker PUBLIC ${_common_include_directories} ${_json_include} +) +target_link_libraries(gemma4_31b_worker PUBLIC ${link_libraries}) + if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug") target_link_options_gc_sections(gemma4_31b_runner) if(NOT APPLE AND NOT MSVC) target_link_options(gemma4_31b_runner PRIVATE "LINKER:-s") endif() + target_link_options_gc_sections(gemma4_31b_worker) + if(NOT APPLE AND NOT MSVC) + target_link_options(gemma4_31b_worker PRIVATE "LINKER:-s") + endif() endif() if(TARGET mlxdelegate) executorch_target_copy_mlx_metallib(gemma4_31b_runner) + executorch_target_copy_mlx_metallib(gemma4_31b_worker) +endif() + +if(EXECUTORCH_BUILD_CUDA) + enable_testing() + add_executable( + test_gemma4_31b_nobleed test_gemma4_31b_nobleed.cpp gemma4_31b_engine.cpp + ) + target_include_directories( + test_gemma4_31b_nobleed PUBLIC ${_common_include_directories} + ${_json_include} + ) + target_link_libraries(test_gemma4_31b_nobleed PUBLIC ${link_libraries}) + add_test(NAME gemma4_31b_nobleed COMMAND test_gemma4_31b_nobleed) endif() diff --git a/examples/models/gemma4_31b/CMakePresets.json b/examples/models/gemma4_31b/CMakePresets.json index 23a7d42e035..7bbdc82c81a 100644 --- a/examples/models/gemma4_31b/CMakePresets.json +++ b/examples/models/gemma4_31b/CMakePresets.json @@ -13,7 +13,7 @@ }, { "name": "gemma4-31b-cuda", - "displayName": "Gemma 4 31B runner (CUDA)", + "displayName": "Gemma 4 31B runner and worker (CUDA)", "inherits": ["gemma4-31b-base"], "cacheVariables": { "EXECUTORCH_BUILD_CUDA": "ON" @@ -26,7 +26,7 @@ }, { "name": "gemma4-31b-mlx", - "displayName": "Gemma 4 31B runner (MLX)", + "displayName": "Gemma 4 31B runner and worker (MLX)", "inherits": ["gemma4-31b-base"], "cacheVariables": {}, "condition": { @@ -39,21 +39,25 @@ "buildPresets": [ { "name": "gemma4-31b-cuda", - "displayName": "Build Gemma 4 31B runner (CUDA)", + "displayName": "Build Gemma 4 31B runner, worker, and no-bleed test (CUDA)", "configurePreset": "gemma4-31b-cuda", - "targets": ["gemma4_31b_runner"] + "targets": [ + "gemma4_31b_runner", + "gemma4_31b_worker", + "test_gemma4_31b_nobleed" + ] }, { "name": "gemma4-31b-mlx", - "displayName": "Build Gemma 4 31B runner (MLX)", + "displayName": "Build Gemma 4 31B runner and worker (MLX)", "configurePreset": "gemma4-31b-mlx", - "targets": ["gemma4_31b_runner"] + "targets": ["gemma4_31b_runner", "gemma4_31b_worker"] } ], "workflowPresets": [ { "name": "gemma4-31b-cuda", - "displayName": "Configure and build Gemma 4 31B runner (CUDA)", + "displayName": "Configure and build Gemma 4 31B runner and worker (CUDA)", "steps": [ { "type": "configure", @@ -67,7 +71,7 @@ }, { "name": "gemma4-31b-mlx", - "displayName": "Configure and build Gemma 4 31B runner (MLX)", + "displayName": "Configure and build Gemma 4 31B runner and worker (MLX)", "steps": [ { "type": "configure", diff --git a/examples/models/gemma4_31b/README.md b/examples/models/gemma4_31b/README.md index 482f64083a0..eeb4fa65545 100644 --- a/examples/models/gemma4_31b/README.md +++ b/examples/models/gemma4_31b/README.md @@ -153,14 +153,17 @@ python examples/models/gemma4_31b/inference.py \ Useful before spending the export+lowering time to confirm the quantized model produces sensible text. -## Build the runner +## Build the runner and worker ```bash make gemma4_31b-cuda # Linux — CUDA backend make gemma4_31b-mlx # macOS — MLX backend (Apple Silicon) ``` -The binary lands at `cmake-out/examples/models/gemma4_31b/gemma4_31b_runner`. +The binaries land at: + +- `cmake-out/examples/models/gemma4_31b/gemma4_31b_runner` +- `cmake-out/examples/models/gemma4_31b/gemma4_31b_worker` ## Run the .pte @@ -179,3 +182,61 @@ Pass `--raw_prompt` to skip template wrapping for pre-formatted input. For benchmarking, add `--cuda_graph` to capture the decode method in a CUDA graph (decode is fully static — `T=1`). + +## OpenAI-compatible serving harness + +The serving path is a test harness for local-agent workflows. Python owns HTTP, +chat templating, request validation, and tool parsing; the C++ worker owns model +loading, prefill/decode, and session state. Use the runner or engine/session API +directly for production integrations. + +### CUDA + +```bash +python -m executorch.examples.models.gemma4_31b.serve \ + --model-path ./gemma4_31b_exports/model.pte \ + --data-path ./gemma4_31b_exports/aoti_cuda_blob.ptd \ + --tokenizer-path ./gemma4_31b_int4/tokenizer.json \ + --hf-tokenizer ./gemma4_31b_int4 \ + --model-id gemma4_31b \ + --max-context 4096 \ + --max-sessions 4 \ + --host 127.0.0.1 \ + --port 8000 +``` + +### MLX + +```bash +python -m executorch.examples.models.gemma4_31b.serve \ + --model-path ./gemma4_31b_exports_mlx/model.pte \ + --tokenizer-path ./gemma4_31b_int4/tokenizer.json \ + --hf-tokenizer ./gemma4_31b_int4 \ + --model-id gemma4_31b \ + --max-context 4096 \ + --max-sessions 4 \ + --host 127.0.0.1 \ + --port 8000 +``` + +Named sessions use one loaded model with isolated mutable state when the backend +supports it. Set `--max-sessions >= 2` and send a stable `session_id` (or one of +the supported affinity headers) to enable separate conversations and warm +append-only resume. One capacity slot is reserved for anonymous requests. + +The default parser is Gemma's tool-call format. Use `--tool-parser hermes`, +`--tool-parser qwen`, or `--tool-parser none` if a different prompt/template +emits another format. + +### CUDA no-bleed test + +The CUDA build also produces `test_gemma4_31b_nobleed`, which validates that +two sessions can interleave prefill/decode on one loaded model without sharing +mutable state: + +```bash +GEMMA_MODEL_PATH=gemma4_31b_exports/model.pte \ +GEMMA_DATA_PATH=gemma4_31b_exports/aoti_cuda_blob.ptd \ +GEMMA_TOKENIZER_PATH=gemma4_31b_int4/tokenizer.json \ + cmake-out/examples/models/gemma4_31b/test_gemma4_31b_nobleed +``` diff --git a/examples/models/gemma4_31b/export.py b/examples/models/gemma4_31b/export.py index d9e16bc34df..2a3ae9c9364 100644 --- a/examples/models/gemma4_31b/export.py +++ b/examples/models/gemma4_31b/export.py @@ -24,6 +24,7 @@ """ import argparse +import json import os import torch @@ -135,6 +136,11 @@ def _pack_for_backend(model: nn.Module, path: str, backend: str) -> None: # Export + lower +def _mutable_buffer_metadata(model: nn.Module) -> str: + mutable = [name for name, _ in model.named_buffers() if ".kv_cache." in name] + return json.dumps({"version": 1, "mutable_buffers": mutable}) + + def export_and_lower( model: Gemma4_31B, config: Gemma4_31BConfig, @@ -181,6 +187,7 @@ def _export_cuda( import executorch.backends.cuda.quantize_op_dispatch # noqa: F401 materialize_runtime_buffers(model, dtype=torch.bfloat16) + mutable_buffer_metadata = _mutable_buffer_metadata(model) if use_turboquant: from executorch.examples.models.gemma4_31b.cuda_source_transformations import ( @@ -255,6 +262,8 @@ def _export_cuda( "get_vocab_size": config.vocab_size, "get_n_layers": config.num_hidden_layers, "get_max_prefill_chunk": max_prefill, + "get_min_prefill_chunk": 5, + "get_mutable_buffer_metadata": mutable_buffer_metadata, "use_kv_cache": True, "use_sdpa_with_kv_cache": False, "enable_dynamic_shape": True, diff --git a/examples/models/gemma4_31b/gemma4_31b_engine.cpp b/examples/models/gemma4_31b/gemma4_31b_engine.cpp new file mode 100644 index 00000000000..6b2a11d3be3 --- /dev/null +++ b/examples/models/gemma4_31b/gemma4_31b_engine.cpp @@ -0,0 +1,737 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#ifdef EXECUTORCH_BUILD_CUDA +#include +#include +#else +#include +#endif + +namespace executorch::extension::llm { + +using ::executorch::extension::clone_tensor_ptr_to; +using ::executorch::extension::from_blob; +using ::executorch::extension::Module; +using ::executorch::extension::TensorPtr; +using ::executorch::runtime::Error; +using ::executorch::runtime::EValue; +using ::executorch::runtime::Result; +using SizesType = executorch::aten::SizesType; + +namespace { + +#ifdef EXECUTORCH_BUILD_MLX +constexpr const char* kPrefillMethod = "forward"; +constexpr const char* kDecodeMethod = "forward"; +#else +constexpr const char* kPrefillMethod = "prefill"; +constexpr const char* kDecodeMethod = "decode"; +#endif + +constexpr const char* kMaxPrefillChunk = "get_max_prefill_chunk"; +constexpr const char* kMinPrefillChunk = "get_min_prefill_chunk"; + +Result read_sampled_token( + const executorch::aten::Tensor& output, + float temperature) { +#ifdef EXECUTORCH_BUILD_CUDA + (void)temperature; + const void* ptr = output.const_data_ptr(); + cudaPointerAttributes attrs{}; + const bool on_device = cudaPointerGetAttributes(&attrs, ptr) == cudaSuccess && + attrs.type == cudaMemoryTypeDevice; + + auto copy_scalar = [&](void* dst, size_t nbytes) -> Error { + if (on_device) { + if (cudaMemcpy(dst, ptr, nbytes, cudaMemcpyDeviceToHost) != cudaSuccess) { + return Error::Internal; + } + } else { + std::memcpy(dst, ptr, nbytes); + } + return Error::Ok; + }; + + if (output.scalar_type() == executorch::aten::ScalarType::Long) { + int64_t val = 0; + if (copy_scalar(&val, sizeof(val)) != Error::Ok) { + ET_LOG(Error, "read_sampled_token: cudaMemcpy D2H failed"); + return Error::Internal; + } + return static_cast(val); + } + if (output.scalar_type() == executorch::aten::ScalarType::Float) { + float val = 0.0f; + if (copy_scalar(&val, sizeof(val)) != Error::Ok) { + ET_LOG(Error, "read_sampled_token: cudaMemcpy D2H failed"); + return Error::Internal; + } + return static_cast(llrintf(val)); + } + ET_LOG( + Error, + "read_sampled_token: expected Long or Float scalar output, got %d", + static_cast(output.scalar_type())); + return Error::InvalidArgument; +#else + return static_cast( + logits_to_token(output, temperature < 0.0f ? 0.0f : temperature)); +#endif +} + +Result> build_gemma_module( + const Gemma4_31BConfig& config) { + std::vector data_files; + if (!config.data_path.empty()) { + data_files.push_back(config.data_path); + } + auto module = std::make_unique( + config.model_path, + data_files, + Module::LoadMode::MmapUseMlockIgnoreErrors, + /*event_tracer=*/nullptr, + /*memory_allocator=*/nullptr, + /*temp_allocator=*/nullptr); + +#ifdef EXECUTORCH_BUILD_CUDA + if (config.enable_cuda_graph) { + executorch::runtime::BackendOptions<2> cuda_opts; + ET_CHECK_OK_OR_RETURN_ERROR( + cuda_opts.set_option("enable_cuda_graph_for_method", "decode")); + ET_CHECK_OK_OR_RETURN_ERROR( + executorch::runtime::set_option("CudaBackend", cuda_opts.view())); + ET_LOG(Info, "Gemma4_31BEngine: CUDA graph enabled for decode method"); + } + { + executorch::runtime::BackendOptions<1> backend_options; + ET_CHECK_OK_OR_RETURN_ERROR( + backend_options.set_option("weight_sharing_across_methods", true)); + ET_CHECK_OK_OR_RETURN_ERROR( + executorch::runtime::set_option("CudaBackend", backend_options.view())); + } +#endif + + ET_CHECK_OK_OR_RETURN_ERROR(module->load_method(kPrefillMethod)); + if (std::string(kDecodeMethod) != std::string(kPrefillMethod)) { + ET_CHECK_OK_OR_RETURN_ERROR(module->load_method(kDecodeMethod)); + } + return module; +} + +void add_token_piece( + ::tokenizers::Tokenizer* tokenizer, + std::unordered_set& ids, + const char* piece) { + if (auto id = tokenizer->piece_to_id(piece); id.ok()) { + ids.insert(*id); + } +} + +#ifdef EXECUTORCH_BUILD_CUDA +Error register_mutable_fqns( + Module* module, + ::executorch::backends::cuda::MutableStateContextOwner& mutable_state) { + auto res = module->execute("get_mutable_buffer_metadata"); + if (res.error() != Error::Ok) { + ET_LOG( + Info, + "Gemma4_31BEngine: model has no get_mutable_buffer_metadata; " + "multi-session disabled"); + return res.error(); + } + const auto& outs = res.get(); + if (outs.empty() || !outs[0].isString()) { + ET_LOG(Error, "get_mutable_buffer_metadata did not return a string"); + return Error::InvalidProgram; + } + std::string json_str(outs[0].toString()); + auto j = nlohmann::json::parse(json_str, nullptr, /*allow_exceptions=*/false); + if (j.is_discarded() || !j.is_object()) { + ET_LOG(Error, "get_mutable_buffer_metadata is not a valid JSON object"); + return Error::InvalidProgram; + } + if (!j.contains("version") || !j["version"].is_number_integer() || + j["version"].get() != 1) { + ET_LOG(Error, "get_mutable_buffer_metadata: unsupported/missing version"); + return Error::InvalidProgram; + } + if (!j.contains("mutable_buffers") || !j["mutable_buffers"].is_array() || + j["mutable_buffers"].empty()) { + ET_LOG( + Error, + "get_mutable_buffer_metadata: mutable_buffers must be a non-empty array"); + return Error::InvalidProgram; + } + std::vector fqns; + for (const auto& f : j["mutable_buffers"]) { + if (!f.is_string() || f.get().empty()) { + ET_LOG( + Error, + "get_mutable_buffer_metadata: every mutable_buffers entry must be a " + "non-empty string"); + return Error::InvalidProgram; + } + fqns.push_back(f.get()); + } + mutable_state.register_fqns(fqns); + return Error::Ok; +} +#endif + +class Gemma4_31BSession : public LLMSession { + public: + Gemma4_31BSession( + Module* module, + std::mutex* exec_mutex, + std::atomic* live_sessions, + ::tokenizers::Tokenizer* tokenizer, + std::unordered_map metadata, + std::unordered_set eos_ids, + int64_t max_prefill_chunk, + int64_t min_prefill_chunk +#ifdef GEMMA_HAS_MUTABLE_STATE + , + GemmaMutableStateContextOwner* mutable_state, + int session_token +#endif + ) + : module_(module), + exec_mutex_(exec_mutex), + live_sessions_(live_sessions), + tokenizer_(tokenizer), + metadata_(std::move(metadata)), + eos_ids_(std::move(eos_ids)), + max_prefill_chunk_(max_prefill_chunk), + min_prefill_chunk_(min_prefill_chunk) +#ifdef GEMMA_HAS_MUTABLE_STATE + , + mutable_state_(mutable_state), + session_token_(session_token) +#endif + { + decode_tokens_ = from_blob( + decode_token_data_, {1, 1}, executorch::aten::ScalarType::Long); + decode_pos_ = + from_blob(decode_pos_data_, {1}, executorch::aten::ScalarType::Long); +#ifdef EXECUTORCH_BUILD_CUDA + decode_tokens_dev_ = clone_tensor_ptr_to(decode_tokens_, cuda_device_); + decode_pos_dev_ = clone_tensor_ptr_to(decode_pos_, cuda_device_); + auto temp_host = + from_blob(&temp_val_, {1}, executorch::aten::ScalarType::Float); + temp_tensor_dev_ = clone_tensor_ptr_to(temp_host, cuda_device_); +#endif + } + + ~Gemma4_31BSession() override { +#ifdef GEMMA_HAS_MUTABLE_STATE + if (mutable_state_ != nullptr && session_token_ != kGemmaNoMutableSession) { + mutable_state_->destroy_session(session_token_); + } +#endif + if (live_sessions_ != nullptr) { + live_sessions_->fetch_sub(1); + } + } + + Error prefill_tokens( + const std::vector& tokens, + const SamplingConfig* initial_sampling) override { + if (tokens.empty()) { + ET_LOG(Error, "prefill_tokens: empty token list"); + return Error::InvalidArgument; + } + float first_token_temp = temperature_; + if (initial_sampling != nullptr) { + if (initial_sampling->top_p != 1.0f || initial_sampling->top_k != 0 || + initial_sampling->seed != 0) { + ET_LOG( + Error, + "Gemma4_31BSession: only temperature is supported; top_p/top_k/seed " + "are not implemented"); + return Error::NotSupported; + } + first_token_temp = initial_sampling->temperature; + } + if (!valid_temperature(first_token_temp)) { + ET_LOG(Error, "prefill_tokens: temperature must be -1 or in [0, 1]"); + return Error::InvalidArgument; + } + + const int64_t T = static_cast(tokens.size()); + const auto ctx_it = metadata_.find(kMaxContextLen); + if (ctx_it != metadata_.end() && pos_ + T >= ctx_it->second) { + ET_LOG( + Error, + "prefill_tokens would leave no room to generate (pos %" PRId64 + " + %" PRId64 " >= max_context %" PRId64 ")", + pos_, + T, + ctx_it->second); + return Error::InvalidArgument; + } + + stop_.store(false, std::memory_order_relaxed); + int64_t offset = 0; + while (offset < T) { + int64_t chunk = T - offset; + if (max_prefill_chunk_ > 0) { + chunk = std::min(chunk, max_prefill_chunk_); + } +#ifdef EXECUTORCH_BUILD_CUDA + if (chunk > 1 && chunk < min_prefill_chunk_) { + chunk = 1; + } +#endif + auto sampled = + run_prefill_chunk(tokens.data() + offset, chunk, first_token_temp); + ET_CHECK_OK_OR_RETURN_ERROR(sampled.error()); + pending_ = sampled.get(); + pos_ += chunk; + offset += chunk; + } + prev_decode_token_ = tokens.back(); + return Error::Ok; + } + + Result decode_one(const SamplingConfig& sampling) override { + if (sampling.top_p != 1.0f || sampling.top_k != 0 || sampling.seed != 0) { + ET_LOG( + Error, + "Gemma4_31BSession: only temperature is supported; top_p/top_k/seed " + "are not implemented"); + return Error::NotSupported; + } + if (!valid_temperature(sampling.temperature)) { + ET_LOG(Error, "decode_one: temperature must be -1 or in [0, 1]"); + return Error::InvalidArgument; + } + ET_CHECK_OR_RETURN_ERROR( + pending_.has_value(), + InvalidState, + "decode_one requires a pending token; call prefill_tokens() first"); + temperature_ = sampling.temperature; + + if (stop_.load(std::memory_order_relaxed)) { + return DecodeResult{0, "", /*is_eos=*/false, /*is_terminal=*/true}; + } + + const uint64_t token = pending_.value(); + const bool is_eos = eos_ids_.find(token) != eos_ids_.end(); + const uint64_t prev = prev_decode_token_.value_or(token); + auto dec = tokenizer_->decode(prev, token); + if (!dec.ok()) { + ET_LOG( + Error, + "Tokenizers error code %d", + static_cast(dec.error())); + return Error::InvalidArgument; + } + std::string text_piece = std::move(*dec); + + if (is_eos) { + pending_.reset(); + return DecodeResult{ + token, std::move(text_piece), is_eos, /*is_terminal=*/true}; + } + + const auto ctx_it = metadata_.find(kMaxContextLen); + if (ctx_it != metadata_.end()) { + ET_CHECK_OR_RETURN_ERROR( + pos_ < ctx_it->second, + InvalidArgument, + "decode_one would exceed context capacity: pos_ %" PRId64 + " >= max_context %" PRId64, + pos_, + ctx_it->second); + } + + decode_token_data_[0] = static_cast(token); + decode_pos_data_[0] = pos_; + std::vector inputs; +#ifdef EXECUTORCH_BUILD_CUDA + ET_CHECK_OK_OR_RETURN_ERROR(copy_decode_inputs_to_cuda()); + ET_CHECK_OK_OR_RETURN_ERROR(set_temperature(temperature_)); + inputs.push_back(EValue(decode_tokens_dev_)); + inputs.push_back(EValue(decode_pos_dev_)); + inputs.push_back(EValue(temp_tensor_dev_)); +#else + inputs.push_back(EValue(decode_tokens_)); + inputs.push_back(EValue(decode_pos_)); +#endif + auto sampled = + run_locked(kDecodeMethod, inputs, temperature_, /*sync_after=*/false); + ET_CHECK_OK_OR_RETURN_ERROR(sampled.error()); + pending_ = sampled.get(); + prev_decode_token_ = token; + pos_ += 1; + return DecodeResult{ + token, std::move(text_piece), /*is_eos=*/false, /*is_terminal=*/false}; + } + + int64_t position() const override { + return pos_; + } + + Error reset() override { + pos_ = 0; + pending_.reset(); + prev_decode_token_.reset(); + stop_.store(false, std::memory_order_relaxed); + return Error::Ok; + } + + void stop() override { + stop_.store(true, std::memory_order_relaxed); + } + + private: + static bool valid_temperature(float temperature) { + return temperature == -1.0f || (temperature >= 0.0f && temperature <= 1.0f); + } + + Result + run_prefill_chunk(const uint64_t* tokens, int64_t T, float temperature) { + std::vector token_data(tokens, tokens + T); + std::vector pos_data(T); + for (int64_t i = 0; i < T; ++i) { + pos_data[i] = pos_ + i; + } + auto tokens_tensor = from_blob( + token_data.data(), + {1, static_cast(T)}, + executorch::aten::ScalarType::Long); + auto pos_tensor = from_blob( + pos_data.data(), + {static_cast(T)}, + executorch::aten::ScalarType::Long); + std::vector inputs; + TensorPtr token_input = tokens_tensor; + TensorPtr pos_input = pos_tensor; +#ifdef EXECUTORCH_BUILD_CUDA + std::vector device_inputs; + token_input = to_cuda(token_input, device_inputs); + pos_input = to_cuda(pos_input, device_inputs); + ET_CHECK_OK_OR_RETURN_ERROR(set_temperature(temperature)); +#endif + inputs.push_back(EValue(token_input)); + inputs.push_back(EValue(pos_input)); +#ifdef EXECUTORCH_BUILD_CUDA + inputs.push_back(EValue(temp_tensor_dev_)); + const char* method = + (T >= min_prefill_chunk_) ? kPrefillMethod : kDecodeMethod; +#else + const char* method = kPrefillMethod; +#endif + return run_locked(method, inputs, temperature, /*sync_after=*/true); + } + +#ifdef EXECUTORCH_BUILD_CUDA + TensorPtr to_cuda(TensorPtr tensor, std::vector& keep_alive) { + keep_alive.push_back(clone_tensor_ptr_to(tensor, cuda_device_)); + return keep_alive.back(); + } + + Error set_temperature(float temperature) { + temp_val_ = (temperature <= 0.0f) ? 1e-6f : temperature; + if (cudaMemcpy( + temp_tensor_dev_->mutable_data_ptr(), + &temp_val_, + sizeof(float), + cudaMemcpyHostToDevice) != cudaSuccess) { + ET_LOG(Error, "set_temperature: cudaMemcpy H2D failed"); + return Error::Internal; + } + return Error::Ok; + } + + Error copy_decode_inputs_to_cuda() { + if (cudaMemcpy( + decode_tokens_dev_->mutable_data_ptr(), + decode_token_data_, + sizeof(int64_t), + cudaMemcpyHostToDevice) != cudaSuccess) { + ET_LOG(Error, "copy_decode_inputs_to_cuda: token H2D failed"); + return Error::Internal; + } + if (cudaMemcpy( + decode_pos_dev_->mutable_data_ptr(), + decode_pos_data_, + sizeof(int64_t), + cudaMemcpyHostToDevice) != cudaSuccess) { + ET_LOG(Error, "copy_decode_inputs_to_cuda: pos H2D failed"); + return Error::Internal; + } + return Error::Ok; + } +#endif + + Result run_locked( + const char* method, + std::vector& inputs, + float temperature, + bool sync_after) { + std::lock_guard guard(*exec_mutex_); +#ifdef GEMMA_HAS_MUTABLE_STATE + auto res = mutable_state_ != nullptr + ? mutable_state_->with_active_session( + session_token_, + [&]() { return module_->execute(method, inputs); }) + : module_->execute(method, inputs); +#else + auto res = module_->execute(method, inputs); +#endif + ET_CHECK_OK_OR_RETURN_ERROR(res.error()); + auto sampled = read_sampled_token(res.get()[0].toTensor(), temperature); + ET_CHECK_OK_OR_RETURN_ERROR(sampled.error()); +#ifdef EXECUTORCH_BUILD_CUDA + if (sync_after && cudaDeviceSynchronize() != cudaSuccess) { + ET_LOG(Error, "run_locked: cudaDeviceSynchronize failed"); + return Error::Internal; + } +#else + (void)sync_after; +#endif + return sampled.get(); + } + + Module* module_; + std::mutex* exec_mutex_; + std::atomic* live_sessions_; + ::tokenizers::Tokenizer* tokenizer_; + std::unordered_map metadata_; + std::unordered_set eos_ids_; + int64_t max_prefill_chunk_; + int64_t min_prefill_chunk_; + + int64_t pos_ = 0; + std::optional pending_; + std::optional prev_decode_token_; + float temperature_ = -1.0f; + std::atomic stop_{false}; + + int64_t decode_token_data_[1] = {0}; + int64_t decode_pos_data_[1] = {0}; + TensorPtr decode_tokens_; + TensorPtr decode_pos_; +#ifdef GEMMA_HAS_MUTABLE_STATE + GemmaMutableStateContextOwner* mutable_state_ = nullptr; + int session_token_ = kGemmaNoMutableSession; +#endif +#ifdef EXECUTORCH_BUILD_CUDA + float temp_val_ = 1e-6f; + executorch::aten::Device cuda_device_ = + executorch::aten::Device(executorch::aten::DeviceType::CUDA, 0); + TensorPtr decode_tokens_dev_; + TensorPtr decode_pos_dev_; + TensorPtr temp_tensor_dev_; +#endif +}; + +} // namespace + +Result> Gemma4_31BEngine::create( + const Gemma4_31BConfig& config) { + if (config.model_path.empty() || config.tokenizer_path.empty()) { + ET_LOG( + Error, "Gemma4_31BEngine: model_path and tokenizer_path are required"); + return Error::InvalidArgument; + } + + auto tokenizer = std::make_unique<::tokenizers::HFTokenizer>(); + if (tokenizer->load(config.tokenizer_path) != ::tokenizers::Error::Ok) { + ET_LOG(Error, "Gemma4_31BEngine: failed to load tokenizer"); + return Error::InvalidArgument; + } + + std::vector data_files; + if (!config.data_path.empty()) { + data_files.push_back(config.data_path); + } + auto meta_module = std::make_unique( + config.model_path, data_files, Module::LoadMode::File); + auto metadata_result = get_llm_metadata(tokenizer.get(), meta_module.get()); + if (metadata_result.error() != Error::Ok) { + ET_LOG(Error, "Gemma4_31BEngine: failed to read metadata"); + return metadata_result.error(); + } + + auto eos_ids = get_eos_ids(tokenizer.get(), meta_module.get()); + eos_ids.insert(static_cast(config.eos_id)); + add_token_piece(tokenizer.get(), eos_ids, ""); + add_token_piece(tokenizer.get(), eos_ids, ""); + + auto metadata = metadata_result.get(); + int64_t max_prefill_chunk = 1; + auto max_ctx_it = metadata.find(kMaxContextLen); + if (max_ctx_it != metadata.end() && max_ctx_it->second > 1) { + max_prefill_chunk = max_ctx_it->second - 1; + } + if (auto get_result = meta_module->get(kMaxPrefillChunk); get_result.ok()) { + max_prefill_chunk = get_result->toScalar().to(); + metadata[kMaxPrefillChunk] = max_prefill_chunk; + } + + int64_t min_prefill_chunk = 1; +#ifdef EXECUTORCH_BUILD_CUDA + min_prefill_chunk = 5; + if (auto get_result = meta_module->get(kMinPrefillChunk); get_result.ok()) { + min_prefill_chunk = get_result->toScalar().to(); + } + metadata[kMinPrefillChunk] = min_prefill_chunk; +#endif + +#ifdef GEMMA_HAS_MUTABLE_STATE + std::unique_ptr mutable_state; +#endif +#ifdef EXECUTORCH_BUILD_CUDA + if (config.enable_cuda_graph) { + ET_LOG( + Info, + "Gemma4_31BEngine: CUDA graph requested; per-session rebinding " + "disabled and serving capacity clamped to 1 session."); + } else { + auto candidate = std::make_unique(); + if (Error e = register_mutable_fqns(meta_module.get(), *candidate); + e == Error::Ok) { + mutable_state = std::move(candidate); + } else { + ET_LOG( + Info, + "Gemma4_31BEngine: mutable-buffer metadata unavailable or invalid; " + "serving capacity clamped to 1 session."); + } + } +#elif defined(EXECUTORCH_BUILD_MLX) + mutable_state = std::make_unique(); +#endif + +#ifdef GEMMA_HAS_MUTABLE_STATE + auto module_res = mutable_state != nullptr + ? mutable_state->with_load_scope( + [&]() { return build_gemma_module(config); }) + : build_gemma_module(config); +#else + auto module_res = build_gemma_module(config); +#endif + if (module_res.error() != Error::Ok) { + return module_res.error(); + } + std::unique_ptr shared_module = std::move(module_res.get()); + + bool rebind_available = false; +#ifdef GEMMA_HAS_MUTABLE_STATE + rebind_available = mutable_state != nullptr && mutable_state->available(); + if (rebind_available && mutable_state->validate_coverage() != Error::Ok) { + ET_LOG( + Error, + "Gemma4_31BEngine: mutable-buffer coverage check failed; disabling " + "multi-session (capacity clamped to 1)."); + rebind_available = false; + } + if (!rebind_available) { + ET_LOG( + Info, + "Gemma4_31BEngine: per-session rebinding unavailable; serving capacity " + "clamped to 1 session."); + } +#endif + + return std::unique_ptr(new Gemma4_31BEngine( + config, + std::move(tokenizer), + std::move(metadata), + std::move(eos_ids), + std::move(shared_module), + max_prefill_chunk, + min_prefill_chunk, + rebind_available +#ifdef GEMMA_HAS_MUTABLE_STATE + , + std::move(mutable_state) +#endif + )); +} + +Gemma4_31BEngine::~Gemma4_31BEngine() = default; + +Result> Gemma4_31BEngine::create_session() { + const int cap = + serving_capacity().max_physical_sessions_without_weight_duplication; + { + std::lock_guard g(exec_mutex_); + if (live_sessions_.load() >= cap) { + ET_LOG( + Error, + "Gemma4_31BEngine: at session capacity (%d); refusing create_session", + cap); + return Error::InvalidState; + } + live_sessions_.fetch_add(1); + } + + int token = -1; +#ifdef GEMMA_HAS_MUTABLE_STATE + if (rebind_available_) { + auto t = mutable_state_->create_session(); + if (t.error() != Error::Ok) { + live_sessions_.fetch_sub(1); + return t.error(); + } + token = t.get(); + } +#endif + + return std::unique_ptr(new Gemma4_31BSession( + shared_module_.get(), + &exec_mutex_, + &live_sessions_, + tokenizer_.get(), + metadata_, + eos_ids_, + max_prefill_chunk_, + min_prefill_chunk_ +#ifdef GEMMA_HAS_MUTABLE_STATE + , + mutable_state_.get(), + token +#endif + )); +} + +LLMServingCapacity Gemma4_31BEngine::serving_capacity() const { + LLMServingCapacity cap; +#ifdef GEMMA_HAS_MUTABLE_STATE + if (rebind_available_) { + cap.max_physical_sessions_without_weight_duplication = + config_.max_sessions > 1 ? config_.max_sessions : 1; + cap.estimated_bytes_per_session = mutable_state_->bytes_per_session(); + } +#endif + return cap; +} + +} // namespace executorch::extension::llm diff --git a/examples/models/gemma4_31b/gemma4_31b_engine.h b/examples/models/gemma4_31b/gemma4_31b_engine.h new file mode 100644 index 00000000000..aa67ae5380c --- /dev/null +++ b/examples/models/gemma4_31b/gemma4_31b_engine.h @@ -0,0 +1,128 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Engine/Session adapter for Gemma 4 31B exported methods. CUDA builds use +// separate prefill/decode methods; MLX builds use one dynamic forward method. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#ifdef EXECUTORCH_BUILD_CUDA +#include +#elif defined(EXECUTORCH_BUILD_MLX) +#include +#endif + +#if defined(EXECUTORCH_BUILD_CUDA) || defined(EXECUTORCH_BUILD_MLX) +#define GEMMA_HAS_MUTABLE_STATE 1 +#endif + +namespace executorch::extension::llm { + +#if defined(EXECUTORCH_BUILD_CUDA) +using GemmaMutableStateContextOwner = + ::executorch::backends::cuda::MutableStateContextOwner; +constexpr int kGemmaNoMutableSession = + ::executorch::backends::cuda::kNoMutableSession; +#elif defined(EXECUTORCH_BUILD_MLX) +using GemmaMutableStateContextOwner = + ::executorch::backends::mlx::MutableStateContextOwner; +constexpr int kGemmaNoMutableSession = + ::executorch::backends::mlx::kNoMutableSession; +#endif + +struct Gemma4_31BConfig { + std::string model_path; + std::string data_path; + std::string tokenizer_path; + int32_t max_sessions = 1; + int64_t eos_id = 1; + bool enable_cuda_graph = false; +}; + +class ET_EXPERIMENTAL Gemma4_31BEngine : public LLMEngine { + public: + static ::executorch::runtime::Result> + create(const Gemma4_31BConfig& config); + + ~Gemma4_31BEngine() override; + + ::executorch::runtime::Result> create_session() + override; + + LLMServingCapacity serving_capacity() const override; + + const std::unordered_map& metadata() const override { + return metadata_; + } + + ::tokenizers::Tokenizer* tokenizer() const { + return tokenizer_.get(); + } + + Gemma4_31BEngine(const Gemma4_31BEngine&) = delete; + Gemma4_31BEngine& operator=(const Gemma4_31BEngine&) = delete; + + private: + Gemma4_31BEngine( + Gemma4_31BConfig config, + std::unique_ptr<::tokenizers::Tokenizer> tokenizer, + std::unordered_map metadata, + std::unordered_set eos_ids, + std::unique_ptr shared_module, + int64_t max_prefill_chunk, + int64_t min_prefill_chunk, + bool rebind_available +#ifdef GEMMA_HAS_MUTABLE_STATE + , + std::unique_ptr mutable_state +#endif + ) + : config_(std::move(config)), + tokenizer_(std::move(tokenizer)), + metadata_(std::move(metadata)), + eos_ids_(std::move(eos_ids)), + shared_module_(std::move(shared_module)), + max_prefill_chunk_(max_prefill_chunk), + min_prefill_chunk_(min_prefill_chunk), + rebind_available_(rebind_available) +#ifdef GEMMA_HAS_MUTABLE_STATE + , + mutable_state_(std::move(mutable_state)) +#endif + { + } + + Gemma4_31BConfig config_; + std::unique_ptr<::tokenizers::Tokenizer> tokenizer_; + std::unordered_map metadata_; + std::unordered_set eos_ids_; + std::unique_ptr shared_module_; + std::mutex exec_mutex_; + int64_t max_prefill_chunk_ = 0; + int64_t min_prefill_chunk_ = 1; + bool rebind_available_ = false; +#ifdef GEMMA_HAS_MUTABLE_STATE + std::unique_ptr mutable_state_; +#endif + std::atomic live_sessions_{0}; +}; + +} // namespace executorch::extension::llm diff --git a/examples/models/gemma4_31b/gemma4_31b_worker.cpp b/examples/models/gemma4_31b/gemma4_31b_worker.cpp new file mode 100644 index 00000000000..f10a588e668 --- /dev/null +++ b/examples/models/gemma4_31b/gemma4_31b_worker.cpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include +#include + +DEFINE_string(model_path, "", "Model .pte file path."); +DEFINE_string(tokenizer_path, "", "HuggingFace tokenizer.json path."); +DEFINE_string(data_path, "", "Data file (.ptd) for delegated weights."); +DEFINE_int32( + max_sessions, + 1, + "Max physical sessions to host on one weight allocation. Clamped to 1 if " + "the backend cannot isolate per-session mutable state."); +DEFINE_bool( + warm_resume, + true, + "Warm append-only resume for named sessions when the engine supports them."); +DEFINE_int32(bos_id, 2, "BOS token id to prepend to server-rendered prompts."); +DEFINE_int32(eos_id, 1, "EOS token id (Gemma convention: 1)."); + +namespace { +namespace llm = ::executorch::extension::llm; +using ::executorch::runtime::Error; +} // namespace + +int main(int argc, char** argv) { + gflags::ParseCommandLineFlags(&argc, &argv, true); + if (FLAGS_model_path.empty() || FLAGS_tokenizer_path.empty()) { + ET_LOG( + Error, "gemma4_31b_worker: --model_path and --tokenizer_path required"); + return 1; + } + + llm::Gemma4_31BConfig config; + config.model_path = FLAGS_model_path; + config.data_path = FLAGS_data_path; + config.tokenizer_path = FLAGS_tokenizer_path; + config.max_sessions = FLAGS_max_sessions; + config.eos_id = FLAGS_eos_id; + + auto engine_result = llm::Gemma4_31BEngine::create(config); + if (engine_result.error() != Error::Ok) { + ET_LOG(Error, "gemma4_31b_worker: failed to create engine"); + return 1; + } + auto engine = std::move(engine_result.get()); + + return llm::run_worker_stdio_loop( + *engine, + *engine->tokenizer(), + engine->metadata(), + FLAGS_warm_resume, + {static_cast(FLAGS_bos_id)}); +} diff --git a/examples/models/gemma4_31b/main.cpp b/examples/models/gemma4_31b/main.cpp index 3d9970b1610..15c8719636c 100644 --- a/examples/models/gemma4_31b/main.cpp +++ b/examples/models/gemma4_31b/main.cpp @@ -6,35 +6,15 @@ * LICENSE file in the root directory of this source tree. */ -// Gemma 4 31B-IT runner for ExecuTorch. Supports two backends: -// CUDA — exports ``prefill`` (T>=2, dynamic) + ``decode`` (T=1, static) -// methods sharing KV-cache buffers; on-device Gumbel-max sampling -// with temperature passed as a third input; returns a scalar -// float token id. -// MLX — exports a single ``forward`` method with dynamic seq_len; -// returns last-token logits; the runner samples on the host via -// ``llm::logits_to_token`` with the same temperature semantics. - #include -#include +#include #include #include -#include -#include -#include -#include -#include -#include -#include -#include #include -#include #include -#include #include -#include #include #include #include @@ -81,61 +61,38 @@ DEFINE_bool( "Enable CUDA graph capture for the decode method. CUDA only."); namespace llm = ::executorch::extension::llm; -using ::executorch::extension::from_blob; -using ::executorch::extension::make_tensor_ptr; -using ::executorch::extension::Module; -using ::executorch::extension::TensorPtr; using ::executorch::runtime::Error; -using ::executorch::runtime::EValue; -#ifdef EXECUTORCH_BUILD_CUDA -using ::executorch::extension::clone_tensor_ptr_to; -#endif - -using SizesType = executorch::aten::SizesType; -// Read a sampled token ID from a scalar int64 output (CUDA path). -// -// The model now emits the sampled token as int64 (see sampler.py), matching -// the decode method's int64 token input so the on-device output buffer can be -// aliased directly as the next step's input. We still copy the 8-byte scalar -// back to the host here for EOS detection and detokenization. -static uint64_t read_token(const executorch::aten::Tensor& output) { - const void* ptr = output.const_data_ptr(); - int64_t val = 0; +namespace { -#ifdef EXECUTORCH_BUILD_CUDA - cudaPointerAttributes attrs{}; - bool on_device = cudaPointerGetAttributes(&attrs, ptr) == cudaSuccess && - attrs.type == cudaMemoryTypeDevice; - if (on_device) { - cudaError_t err = - cudaMemcpy(&val, ptr, sizeof(int64_t), cudaMemcpyDeviceToHost); - if (err != cudaSuccess) { - ET_LOG( - Error, - "read_token: cudaMemcpy D2H failed: %s", - cudaGetErrorString(err)); - return 0; - } - } else { - memcpy(&val, ptr, sizeof(int64_t)); +std::string read_prompt() { + if (FLAGS_prompt_file.empty()) { + return FLAGS_prompt; } -#else - memcpy(&val, ptr, sizeof(int64_t)); -#endif + std::ifstream f(FLAGS_prompt_file); + if (!f.is_open()) { + ET_LOG(Error, "Failed to open prompt file: %s", FLAGS_prompt_file.c_str()); + return ""; + } + return std::string( + (std::istreambuf_iterator(f)), std::istreambuf_iterator()); +} - return static_cast(val); +std::string format_prompt(std::string prompt) { + if (FLAGS_raw_prompt) { + return prompt; + } + return "<|turn>user\n" + prompt + + "\n<|turn>model\n<|channel>thought\n"; } +} // namespace + int main(int argc, char** argv) { gflags::ParseCommandLineFlags(&argc, &argv, true); - if (FLAGS_model_path.empty()) { - ET_LOG(Error, "Must specify --model_path"); - return 1; - } - if (FLAGS_tokenizer_path.empty()) { - ET_LOG(Error, "Must specify --tokenizer_path"); + if (FLAGS_model_path.empty() || FLAGS_tokenizer_path.empty()) { + ET_LOG(Error, "--model_path and --tokenizer_path are required"); return 1; } @@ -146,106 +103,29 @@ int main(int argc, char** argv) { cudaMemGetInfo(&gpu_free_bytes, &gpu_total_bytes); stats.gpu_total_bytes = gpu_total_bytes; stats.gpu_free_before_load_bytes = gpu_free_bytes; -#endif - - stats.model_load_start_ms = llm::time_in_ms(); - - // Tokenizer - auto tokenizer = std::make_unique(); - if (tokenizer->load(FLAGS_tokenizer_path) != tokenizers::Error::Ok) { - ET_LOG( - Error, - "Failed to load tokenizer from %s", - FLAGS_tokenizer_path.c_str()); - return 1; - } - - // Module - std::vector data_files; - if (!FLAGS_data_path.empty()) { - data_files.push_back(FLAGS_data_path); - } - auto module = std::make_unique( - FLAGS_model_path, - data_files, - Module::LoadMode::MmapUseMlockIgnoreErrors, - /*event_tracer=*/nullptr, - /*memory_allocator=*/nullptr, - /*temp_allocator=*/nullptr); - - // Get metadata - auto metadata_result = llm::get_llm_metadata(tokenizer.get(), module.get()); - if (metadata_result.error() != Error::Ok) { - ET_LOG(Error, "Failed to read model metadata"); - return 1; - } - - int64_t max_prefill_chunk = (*metadata_result)[llm::kMaxSeqLen] - 1; - { - auto get_result = module->get("get_max_prefill_chunk"); - if (get_result.ok()) { - max_prefill_chunk = get_result->toScalar().to(); - } - } - - auto S = [](int64_t v) -> SizesType { return static_cast(v); }; - - float temp_val = - FLAGS_temperature <= 0.0 ? 1e-6f : static_cast(FLAGS_temperature); - -#ifdef EXECUTORCH_BUILD_CUDA - const auto cuda_device = - executorch::aten::Device(executorch::aten::DeviceType::CUDA, 0); - if (FLAGS_cuda_graph) { - executorch::runtime::BackendOptions<2> cuda_opts; - cuda_opts.set_option("enable_cuda_graph_for_method", "decode"); - executorch::runtime::set_option("CudaBackend", cuda_opts.view()); - printf("CUDA graph enabled for decode method\n"); - } - { - executorch::runtime::BackendOptions<1> backend_options; - auto set_err = - backend_options.set_option("weight_sharing_across_methods", true); - if (set_err != Error::Ok) { - ET_LOG( - Error, - "Failed to set weight_sharing_across_methods: %d", - static_cast(set_err)); - return 1; - } - auto opt_err = - executorch::runtime::set_option("CudaBackend", backend_options.view()); - if (opt_err != Error::Ok) { - ET_LOG( - Error, - "Failed to enable weight_sharing_across_methods: %d", - static_cast(opt_err)); - return 1; - } - } - printf("Loading methods...\n"); - if (module->load_method("prefill") != Error::Ok) { - ET_LOG(Error, "Failed to load prefill method"); - return 1; - } - if (module->load_method("decode") != Error::Ok) { - ET_LOG(Error, "Failed to load decode method"); - return 1; - } - auto temp_tensor = clone_tensor_ptr_to( - from_blob(&temp_val, {1}, executorch::aten::ScalarType::Float), - cuda_device); #else if (FLAGS_cuda_graph) { ET_LOG(Info, "--cuda_graph ignored on non-CUDA build"); } - printf("Loading model...\n"); - if (module->load_method("forward") != Error::Ok) { - ET_LOG(Error, "Failed to load forward method"); - return 1; - } #endif + llm::Gemma4_31BConfig config; + config.model_path = FLAGS_model_path; + config.data_path = FLAGS_data_path; + config.tokenizer_path = FLAGS_tokenizer_path; + config.max_sessions = 1; + config.eos_id = FLAGS_eos_id; +#ifdef EXECUTORCH_BUILD_CUDA + config.enable_cuda_graph = FLAGS_cuda_graph; +#endif + + stats.model_load_start_ms = llm::time_in_ms(); + auto engine_result = llm::Gemma4_31BEngine::create(config); + if (engine_result.error() != Error::Ok) { + ET_LOG(Error, "Failed to create Gemma4_31BEngine"); + return 1; + } + auto engine = std::move(engine_result.get()); stats.model_load_end_ms = llm::time_in_ms(); #ifdef EXECUTORCH_BUILD_CUDA @@ -253,240 +133,60 @@ int main(int argc, char** argv) { stats.gpu_free_after_load_bytes = gpu_free_bytes; #endif - auto eos_ids = llm::get_eos_ids(tokenizer.get(), module.get()); - eos_ids.insert(static_cast(FLAGS_eos_id)); - auto turn_ids = tokenizer->encode("", /*bos=*/0, /*eos=*/0); - if (turn_ids.ok() && turn_ids->size() == 1) { - eos_ids.insert(turn_ids.get()[0]); - } - - // Read prompt - std::string prompt_text = FLAGS_prompt; - if (!FLAGS_prompt_file.empty()) { - std::ifstream f(FLAGS_prompt_file); - if (!f.is_open()) { - ET_LOG( - Error, "Failed to open prompt file: %s", FLAGS_prompt_file.c_str()); - return 1; - } - prompt_text = std::string( - (std::istreambuf_iterator(f)), std::istreambuf_iterator()); + auto session_result = engine->create_session(); + if (session_result.error() != Error::Ok) { + ET_LOG(Error, "Failed to create session"); + return 1; } + auto session = std::move(session_result.get()); - // Wrap with Gemma 4 IT chat template unless --raw_prompt is set. - // BOS is prepended separately below; this adds the turn structure and the - // empty thought block required by the instruction-tuned model. - if (!FLAGS_raw_prompt) { - prompt_text = "<|turn>user\n" + prompt_text + - "\n<|turn>model\n<|channel>thought\n"; + std::string prompt_text = read_prompt(); + if (!FLAGS_prompt_file.empty() && prompt_text.empty()) { + return 1; } + prompt_text = format_prompt(std::move(prompt_text)); - // Encode prompt - auto encode_result = tokenizer->encode(prompt_text); - if (!encode_result.ok()) { + auto encoded = engine->tokenizer()->encode(prompt_text, /*bos=*/0, /*eos=*/0); + if (!encoded.ok()) { ET_LOG(Error, "Failed to encode prompt"); return 1; } - auto prompt_tokens = std::move(*encode_result); - // Gemma models require BOS at the start of the sequence. - prompt_tokens.insert( - prompt_tokens.begin(), static_cast(FLAGS_bos_id)); - int64_t num_prompt_tokens = static_cast(prompt_tokens.size()); - printf("Prompt tokens: %" PRId64 "\n", num_prompt_tokens); - stats.num_prompt_tokens = num_prompt_tokens; - - stats.inference_start_ms = llm::time_in_ms(); - - // --------------------------------------------------------------- - // Prefill (chunked to respect ring-buffer KV cache limit) - // --------------------------------------------------------------- - uint64_t cur_token = 0; - int64_t prefill_pos = 0; -#ifdef EXECUTORCH_BUILD_CUDA - // Alias of the most recent forward's on-device int64 output token. The last - // prefill chunk's output seeds the first decode step (no token H2D); each - // decode step then re-aliases its own output for the next step. - TensorPtr device_out_token; -#endif - while (prefill_pos < num_prompt_tokens) { - int64_t chunk_len = - std::min(num_prompt_tokens - prefill_pos, max_prefill_chunk); - - std::vector token_data( - prompt_tokens.begin() + prefill_pos, - prompt_tokens.begin() + prefill_pos + chunk_len); - std::vector pos_data(chunk_len); - for (int64_t i = 0; i < chunk_len; i++) { - pos_data[i] = prefill_pos + i; - } - auto tokens_tensor = from_blob( - token_data.data(), - {1, S(chunk_len)}, - executorch::aten::ScalarType::Long); - auto pos_tensor = from_blob( - pos_data.data(), {S(chunk_len)}, executorch::aten::ScalarType::Long); - -#ifdef EXECUTORCH_BUILD_CUDA - // skip_h2d: prefill/decode method inputs must already live in CUDA memory. - tokens_tensor = clone_tensor_ptr_to(tokens_tensor, cuda_device); - pos_tensor = clone_tensor_ptr_to(pos_tensor, cuda_device); -#endif - - std::vector inputs; - inputs.push_back(EValue(tokens_tensor)); - inputs.push_back(EValue(pos_tensor)); - -#ifdef EXECUTORCH_BUILD_CUDA - inputs.push_back(EValue(temp_tensor)); - std::string method = (chunk_len == 1) ? "decode" : "prefill"; -#else - std::string method = "forward"; -#endif + std::vector prompt_tokens; + prompt_tokens.reserve(encoded->size() + 1); + prompt_tokens.push_back(static_cast(FLAGS_bos_id)); + prompt_tokens.insert(prompt_tokens.end(), encoded->begin(), encoded->end()); - auto result = module->execute(method, inputs); - if (result.error() != Error::Ok) { - ET_LOG(Error, "%s failed at pos %" PRId64, method.c_str(), prefill_pos); - return 1; - } - -#ifdef EXECUTORCH_BUILD_CUDA - const auto& out_tensor = result.get()[0].toTensor(); - cur_token = read_token(out_tensor); - // Keep the sampled token on device: alias the output buffer so it feeds - // straight into the next forward as the int64 token input (zero copy). - device_out_token = make_tensor_ptr(out_tensor); -#else - cur_token = static_cast( - llm::logits_to_token(result.get()[0].toTensor(), temp_val)); -#endif + stats.num_prompt_tokens = static_cast(prompt_tokens.size()); + printf("Prompt tokens: %" PRId64 "\n", stats.num_prompt_tokens); - prefill_pos += chunk_len; + llm::SamplingConfig sampling; + sampling.temperature = static_cast(FLAGS_temperature); + stats.inference_start_ms = llm::time_in_ms(); + if (session->prefill_tokens(prompt_tokens, &sampling) != Error::Ok) { + ET_LOG(Error, "Prefill failed"); + return 1; } - stats.prompt_eval_end_ms = llm::time_in_ms(); - // First generated token came from the last prefill chunk; TTFT is prefill. stats.first_token_ms = stats.prompt_eval_end_ms; -#ifdef EXECUTORCH_BUILD_CUDA - cudaDeviceSynchronize(); -#endif - - // Print the first generated token (from the last prefill chunk). - // Use the last prompt token as the streaming-decode prefix so any BPE - // partial-character handling stays correct. - { - auto first_str = tokenizer->decode(prompt_tokens.back(), cur_token); - if (first_str.ok()) { - printf("%s", first_str->c_str()); - fflush(stdout); - } - } - - // --------------------------------------------------------------- - // Decode loop - // --------------------------------------------------------------- - int64_t pos = num_prompt_tokens; - std::vector decode_pos_data = {pos}; - auto decode_pos_cpu = from_blob( - decode_pos_data.data(), {1}, executorch::aten::ScalarType::Long); -#ifdef EXECUTORCH_BUILD_CUDA - // Fixed device-resident position input slot: the decode method always reads - // the position from this same address every step (cuda-graph-safe). Seeded - // once here with a one-time H2D; refreshed each step by an on-device D2D. - auto decode_pos = clone_tensor_ptr_to(decode_pos_cpu, cuda_device); - // Upload the FULL decode position array to device ONCE (a single H2D - the - // one-time copy we keep). Each step copies its position from here into the - // fixed slot with a device-to-device copy, so there is NO per-round pos H2D. - std::vector pos_seq_data(FLAGS_max_new_tokens); - for (int32_t i = 0; i < FLAGS_max_new_tokens; i++) { - pos_seq_data[i] = num_prompt_tokens + i; - } - auto pos_seq_dev = clone_tensor_ptr_to( - from_blob( - pos_seq_data.data(), - {S(FLAGS_max_new_tokens)}, - executorch::aten::ScalarType::Long), - cuda_device); - auto* pos_seq_dev_ptr = - static_cast(pos_seq_dev->mutable_data_ptr()); - auto* decode_pos_slot_ptr = - static_cast(decode_pos->mutable_data_ptr()); -#else - // Non-CUDA (MLX) path: keep host token/pos buffers; the backend stages them - // and the host samples from the returned logits. - std::vector decode_token_data = {static_cast(cur_token)}; - auto decode_tokens = from_blob( - decode_token_data.data(), {1, 1}, executorch::aten::ScalarType::Long); - auto decode_pos = decode_pos_cpu; -#endif - - uint64_t prev_token = cur_token; - bool hit_eos = eos_ids.find(cur_token) != eos_ids.end(); - for (int32_t step = 0; step < FLAGS_max_new_tokens && !hit_eos; step++) { -#ifdef EXECUTORCH_BUILD_CUDA - // No per-round H2D: copy this step's position from the pre-uploaded device - // position array into the fixed position slot with an on-device D2D. With - // the token aliased on device (Option A) and the position staged via D2D, - // the per-round HtoD count is zero (independent of decode length). - // cudaMemcpy D2D is host-synchronous, so the slot is updated before the - // decode kernels read it; with cuda graph enabled this becomes a captured - // cudaMemcpyAsync on the decode stream into this same fixed slot. - ET_CHECK_MSG( - cudaMemcpy( - decode_pos_slot_ptr, - pos_seq_dev_ptr + step, - sizeof(int64_t), - cudaMemcpyDeviceToDevice) == cudaSuccess, - "Failed to copy decode position D2D"); -#else - decode_pos_data[0] = pos; - decode_token_data[0] = static_cast(cur_token); -#endif - - std::vector inputs; -#ifdef EXECUTORCH_BUILD_CUDA - inputs.push_back(EValue(device_out_token)); -#else - inputs.push_back(EValue(decode_tokens)); -#endif - inputs.push_back(EValue(decode_pos)); - -#ifdef EXECUTORCH_BUILD_CUDA - inputs.push_back(EValue(temp_tensor)); - auto result = module->execute("decode", inputs); -#else - auto result = module->execute("forward", inputs); -#endif - - if (result.error() != Error::Ok) { - ET_LOG(Error, "Decode step %d failed", step); + int64_t generated = 0; + for (; generated < FLAGS_max_new_tokens; ++generated) { + auto step = session->decode_one(sampling); + if (step.error() != Error::Ok) { + ET_LOG(Error, "Decode failed"); return 1; } - - prev_token = cur_token; -#ifdef EXECUTORCH_BUILD_CUDA - const auto& out_tensor = result.get()[0].toTensor(); - cur_token = read_token(out_tensor); - // Alias this step's on-device output token as the next step's token input. - device_out_token = make_tensor_ptr(out_tensor); -#else - cur_token = static_cast( - llm::logits_to_token(result.get()[0].toTensor(), temp_val)); -#endif - pos++; - - auto decode_str = tokenizer->decode(prev_token, cur_token); - if (decode_str.ok()) { - printf("%s", decode_str->c_str()); - fflush(stdout); + const auto& d = step.get(); + if (d.is_terminal) { + break; } - - hit_eos = eos_ids.find(cur_token) != eos_ids.end(); + printf("%s", d.text_piece.c_str()); + fflush(stdout); } printf("\n"); stats.inference_end_ms = llm::time_in_ms(); - stats.num_generated_tokens = pos - num_prompt_tokens; + stats.num_generated_tokens = generated; #ifdef EXECUTORCH_BUILD_CUDA cudaMemGetInfo(&gpu_free_bytes, &gpu_total_bytes); diff --git a/examples/models/gemma4_31b/serve.py b/examples/models/gemma4_31b/serve.py new file mode 100644 index 00000000000..b99fae08c9c --- /dev/null +++ b/examples/models/gemma4_31b/serve.py @@ -0,0 +1,214 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""OpenAI-compatible HTTP server for Gemma 4 31B. + +The server process is Python control plane only. It handles HTTP, Gemma chat +templating, tool parsing, and request validation. Model execution lives in the +C++ `gemma4_31b_worker` process and is driven over the generic +`examples/llm_server` JSONL protocol. +""" + +import argparse +import logging +import os +import re +from pathlib import Path + +from executorch.examples.llm_server.python.chat_template import ChatTemplate +from executorch.examples.llm_server.python.server import build_app +from executorch.examples.llm_server.python.serving_chat import ServingChat +from executorch.examples.llm_server.python.session_runtime import SessionRuntime +from executorch.examples.llm_server.python.tool_parsers import ( + GemmaToolCallDetector, + HermesDetector, + QwenFunctionCallDetector, +) +from executorch.examples.llm_server.python.worker_client import spawn_worker + +logger = logging.getLogger(__name__) + +_GEMMA_CHANNEL_SPECIALS = {"<|channel>", "", "<|think|>"} +_GEMMA_CHANNEL_BLOCK = re.compile(r"<\|channel>.*?", re.DOTALL) + + +def _strip_gemma_channels(text: str) -> str: + text = _GEMMA_CHANNEL_BLOCK.sub("", text) + open_idx = text.find("<|channel>") + if open_idx != -1: + text = text[:open_idx] + return text.replace("", "").replace("<|think|>", "").strip() + + +def _repo_root() -> Path: + starts = [Path(__file__).resolve(), Path(__file__).absolute(), Path.cwd().resolve()] + seen: set[Path] = set() + for start in starts: + cur = start if start.is_dir() else start.parent + for path in (cur, *cur.parents): + if path in seen: + continue + seen.add(path) + if (path / "CMakeLists.txt").exists() and ( + path / "examples" / "models" / "gemma4_31b" + ).is_dir(): + return path + raise RuntimeError( + "Could not locate the ExecuTorch source checkout; pass --worker-bin " + "explicitly." + ) + + +def _default_worker_bin() -> str: + return str( + _repo_root() + / "cmake-out" + / "examples" + / "models" + / "gemma4_31b" + / "gemma4_31b_worker" + ) + + +def _spawn(args): + env = dict(os.environ) + conda = os.environ.get("CONDA_PREFIX") + if conda: + existing = env.get("LD_LIBRARY_PATH", "") + env["LD_LIBRARY_PATH"] = ( + f"{conda}/lib:{existing}" if existing else f"{conda}/lib" + ) + worker_bin = args.worker_bin or _default_worker_bin() + cmd = [ + worker_bin, + "--model_path", + args.model_path, + "--tokenizer_path", + args.tokenizer_path, + "--max_sessions", + str(args.max_sessions), + f"--warm_resume={'true' if args.warm_resume else 'false'}", + "--bos_id", + str(args.bos_id), + "--eos_id", + str(args.eos_id), + ] + if args.data_path: + cmd += ["--data_path", args.data_path] + logger.info("Starting Gemma4 31B worker subprocess (loads the model once).") + return spawn_worker(cmd, env=env) + + +def _tool_detector(name: str): + if name == "gemma": + return GemmaToolCallDetector + if name == "hermes": + return HermesDetector + if name == "qwen": + return QwenFunctionCallDetector + if name == "none": + return None + raise ValueError(f"unknown tool parser: {name}") + + +def build_app_from_args(args): + template = ChatTemplate( + args.hf_tokenizer, + assistant_header="<|turn>model\n", + strip_rendered_bos=True, + append_generation_prompt_after_tool_response=True, + ) + worker = _spawn(args) + runtime = SessionRuntime(worker) + serving = ServingChat( + runtime, + template, + args.model_id, + max_context=args.max_context, + tool_detector_cls=_tool_detector(args.tool_parser), + prompt_token_offset=1, + content_filter=_strip_gemma_channels, + content_filter_specials=_GEMMA_CHANNEL_SPECIALS, + ) + app = build_app(serving, args.model_id) + + @app.on_event("shutdown") + def _stop_worker(): + runtime.close_worker() + + return app, args.model_id + + +def main() -> None: + p = argparse.ArgumentParser( + description="OpenAI-compatible LLM server for Gemma 4 31B" + ) + p.add_argument("--model-path", required=True, help="Path to the .pte model") + p.add_argument("--data-path", default=None, help="Path to the delegate blob") + p.add_argument("--tokenizer-path", required=True, help="Path to tokenizer.json") + p.add_argument( + "--hf-tokenizer", + required=True, + help="HF tokenizer id/dir for the model's chat template", + ) + p.add_argument("--model-id", default="gemma4_31b") + p.add_argument("--host", default="127.0.0.1") + p.add_argument("--port", type=int, default=8000) + p.add_argument("--max-context", type=int, default=None) + p.add_argument( + "--num-runners", + type=int, + default=1, + help="Worker processes. 1 only; more would duplicate the weights.", + ) + p.add_argument( + "--max-sessions", + type=int, + default=1, + help="Physical sessions the worker can host on one weight load. One " + "slot is reserved for anonymous requests, so addressable named " + "sessions are max-sessions - 1.", + ) + p.add_argument( + "--warm-resume", + action=argparse.BooleanOptionalAction, + default=True, + help="Warm append-only resume for named sessions when available.", + ) + p.add_argument( + "--tool-parser", + choices=("gemma", "hermes", "qwen", "none"), + default="gemma", + help="Tool-call format parser to apply to model output.", + ) + p.add_argument( + "--bos-id", + type=int, + default=2, + help="BOS token id to prepend in the worker. The launcher strips the " + "HF template's literal BOS before C++ tokenization.", + ) + p.add_argument("--eos-id", type=int, default=1) + p.add_argument( + "--worker-bin", + default=None, + help="Path to gemma4_31b_worker. Defaults to the CMake output path.", + ) + args = p.parse_args() + logging.basicConfig(level=logging.INFO) + + if args.num_runners != 1: + p.error("Only 1 worker process is supported; more would duplicate weights.") + + app, _ = build_app_from_args(args) + + import uvicorn + + uvicorn.run(app, host=args.host, port=args.port) + + +if __name__ == "__main__": + main() diff --git a/examples/models/gemma4_31b/test_gemma4_31b_nobleed.cpp b/examples/models/gemma4_31b/test_gemma4_31b_nobleed.cpp new file mode 100644 index 00000000000..3eb48009807 --- /dev/null +++ b/examples/models/gemma4_31b/test_gemma4_31b_nobleed.cpp @@ -0,0 +1,261 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// CUDA integration proof for one loaded Gemma4 31B model with isolated +// sessions. Set GEMMA_MODEL_PATH, GEMMA_DATA_PATH, and GEMMA_TOKENIZER_PATH to +// run it. + +#include + +#include + +#include +#include +#include +#include +#include +#include + +namespace llm = ::executorch::extension::llm; +using ::executorch::runtime::Error; + +namespace { + +int g_failures = 0; + +void check(const std::string& name, bool ok) { + printf(" [%s] %s\n", ok ? "PASS" : "FAIL", name.c_str()); + if (!ok) { + ++g_failures; + } +} + +const char* env(const char* k) { + const char* v = std::getenv(k); + return (v && *v) ? v : nullptr; +} + +std::vector encode_prompt( + llm::Gemma4_31BEngine& engine, + const std::string& prompt) { + const std::string rendered = "<|turn>user\n" + prompt + + "\n<|turn>model\n" + "<|channel>thought\n"; + auto encoded = engine.tokenizer()->encode(rendered, /*bos=*/0, /*eos=*/0); + if (!encoded.ok()) { + return {}; + } + std::vector ids; + ids.reserve(encoded->size() + 1); + ids.push_back(2); + ids.insert(ids.end(), encoded->begin(), encoded->end()); + return ids; +} + +std::vector encode_prompt_at_least( + llm::Gemma4_31BEngine& engine, + const std::string& seed, + const std::string& filler, + size_t min_tokens) { + std::string prompt = seed; + std::vector ids = encode_prompt(engine, prompt); + while (!ids.empty() && ids.size() < min_tokens) { + prompt.append(" "); + prompt.append(filler); + ids = encode_prompt(engine, prompt); + } + return ids; +} + +std::vector decode_tokens( + llm::LLMSession& session, + const std::vector& prompt, + int n) { + llm::SamplingConfig sampling; + std::vector out; + if (session.prefill_tokens(prompt, &sampling) != Error::Ok) { + return out; + } + for (int i = 0; i < n; ++i) { + auto step = session.decode_one(sampling); + if (step.error() != Error::Ok || step.get().is_terminal) { + break; + } + out.push_back(step.get().token_id); + } + return out; +} + +void run_no_bleed_case( + llm::Gemma4_31BEngine& engine, + const std::string& label, + const std::vector& prompt_a, + const std::vector& prompt_b, + int decode_steps) { + printf( + "\n%s (A=%zu tokens, B=%zu tokens)\n", + label.c_str(), + prompt_a.size(), + prompt_b.size()); + check(label + ": prompts encoded", !prompt_a.empty() && !prompt_b.empty()); + + std::vector solo; + auto solo_session = engine.create_session(); + check(label + ": create solo session", solo_session.error() == Error::Ok); + if (solo_session.error() == Error::Ok) { + solo = decode_tokens(*solo_session.get(), prompt_a, decode_steps); + } + check(label + ": solo produced tokens", !solo.empty()); + + std::vector interleaved_a; + std::vector interleaved_b; + auto a_result = engine.create_session(); + auto b_result = engine.create_session(); + check( + label + ": create interleaved sessions", + a_result.error() == Error::Ok && b_result.error() == Error::Ok); + if (a_result.error() == Error::Ok && b_result.error() == Error::Ok) { + auto a = std::move(a_result.get()); + auto b = std::move(b_result.get()); + llm::SamplingConfig sampling; + bool ok = a->prefill_tokens(prompt_a, &sampling) == Error::Ok && + b->prefill_tokens(prompt_b, &sampling) == Error::Ok; + check(label + ": interleaved prefills", ok); + bool a_done = false; + bool b_done = false; + for (int i = 0; i < decode_steps && ok; ++i) { + if (!a_done) { + auto step = a->decode_one(sampling); + if (step.error() != Error::Ok || step.get().is_terminal) { + a_done = true; + } else { + interleaved_a.push_back(step.get().token_id); + } + } + if (!b_done) { + auto step = b->decode_one(sampling); + if (step.error() != Error::Ok || step.get().is_terminal) { + b_done = true; + } else { + interleaved_b.push_back(step.get().token_id); + } + } + } + } + + check( + label + ": A interleaved == A solo (bit-identical)", + interleaved_a == solo); + check( + label + ": B ran a distinct conversation", + !interleaved_b.empty() && interleaved_b != solo); +} + +int64_t gpu_free() { + size_t free = 0; + size_t total = 0; + return cudaMemGetInfo(&free, &total) == cudaSuccess + ? static_cast(free) + : -1; +} + +} // namespace + +int main() { + const char* model = env("GEMMA_MODEL_PATH"); + const char* tokenizer = env("GEMMA_TOKENIZER_PATH"); + if (!model || !tokenizer) { + printf( + "SKIP: integration proof needs GEMMA_MODEL_PATH / " + "GEMMA_TOKENIZER_PATH (+ GEMMA_DATA_PATH) on a CUDA box.\n"); + return 0; + } + + llm::Gemma4_31BConfig config; + config.model_path = model; + config.data_path = env("GEMMA_DATA_PATH") ? env("GEMMA_DATA_PATH") : ""; + config.tokenizer_path = tokenizer; + config.max_sessions = 4; + + auto engine_result = llm::Gemma4_31BEngine::create(config); + if (engine_result.error() != Error::Ok) { + printf("SKIP: engine create failed (no CUDA device / bad paths).\n"); + return 0; + } + auto engine = std::move(engine_result.get()); + + printf("Gemma4 31B no-bleed integration proof:\n"); + auto prompt_a = encode_prompt(*engine, "List three colors."); + auto prompt_b = encode_prompt(*engine, "Name two countries in Europe."); + run_no_bleed_case(*engine, "short-context", prompt_a, prompt_b, 24); + + auto long_prompt_a = encode_prompt_at_least( + *engine, + "After reading these notes, answer with one concise sentence.", + "alpha beta gamma delta epsilon zeta eta theta iota kappa", + 1152); + auto long_prompt_b = encode_prompt_at_least( + *engine, + "After reading this inventory, answer with one concise sentence.", + "red orange yellow green blue indigo violet black white silver", + 1152); + run_no_bleed_case( + *engine, + "long-context-crosses-sliding-window", + long_prompt_a, + long_prompt_b, + 8); + + const int64_t est = engine->serving_capacity().estimated_bytes_per_session; + int64_t free_before = gpu_free(); + { + auto extra_result = engine->create_session(); + if (extra_result.error() == Error::Ok) { + auto extra = std::move(extra_result.get()); + llm::SamplingConfig sampling; + extra->prefill_tokens(prompt_a, &sampling); + int64_t free_after = gpu_free(); + if (free_before > 0 && free_after > 0) { + const int64_t delta = free_before - free_after; + printf( + " extra-session GPU delta=%lld bytes (est/session=%lld)\n", + static_cast(delta), + static_cast(est)); + check( + "extra session is state-sized, not another model load", + delta > 0 && delta < (16LL << 30)); + if (est > 0) { + check( + "memory delta within 2x of estimated_bytes_per_session", + delta <= est * 2 + (512LL << 20)); + } + } + } + } + + std::vector> held; + while (true) { + auto session = engine->create_session(); + if (session.error() != Error::Ok) { + break; + } + held.push_back(std::move(session.get())); + if (held.size() > static_cast(config.max_sessions) + 2) { + break; + } + } + check( + "capacity enforced: create_session fails past max_sessions", + held.size() <= static_cast(config.max_sessions)); + + printf( + "\n%s (%d failure(s))\n", + g_failures ? "FAILURES" : "ALL PASS", + g_failures); + return g_failures ? 1 : 0; +} diff --git a/examples/models/gemma4_31b/test_ondevice_serving.py b/examples/models/gemma4_31b/test_ondevice_serving.py new file mode 100644 index 00000000000..865a5c30045 --- /dev/null +++ b/examples/models/gemma4_31b/test_ondevice_serving.py @@ -0,0 +1,62 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import json +import os +import urllib.request + +import pytest + +from executorch.examples.llm_server.python.chat_template import ChatTemplate +from executorch.examples.llm_server.python.protocol import ChatMessage + +_SERVER = os.environ.get("GEMMA_SERVER_URL") +_HF_DIR = os.environ.get("GEMMA_HF_DIR") + +pytestmark = pytest.mark.skipif( + not _SERVER or not _HF_DIR or not os.path.isdir(_HF_DIR), + reason="set GEMMA_SERVER_URL and GEMMA_HF_DIR to run Gemma on-device tests", +) + + +def _post(path: str, payload: dict) -> dict: + data = json.dumps(payload).encode("utf-8") + req = urllib.request.Request( + _SERVER.rstrip("/") + path, + data=data, + headers={"Content-Type": "application/json"}, + method="POST", + ) + with urllib.request.urlopen(req, timeout=120) as resp: + return json.loads(resp.read().decode("utf-8")) + + +def test_prompt_tokens_match_real_template_with_numeric_bos_prefix(): + pytest.importorskip("transformers") + from transformers import AutoTokenizer + + template = ChatTemplate( + _HF_DIR, + assistant_header="<|turn>model\n", + strip_rendered_bos=True, + append_generation_prompt_after_tool_response=True, + ) + tok = AutoTokenizer.from_pretrained(_HF_DIR) + messages = [ChatMessage(role="user", content="Say ok.")] + rendered = template.render(messages) + expected_ids = [tok.bos_token_id] + tok.encode(rendered, add_special_tokens=False) + + body = _post( + "/v1/chat/completions", + { + "model": "gemma4_31b", + "messages": [{"role": "user", "content": "Say ok."}], + "max_tokens": 1, + "temperature": 0, + "session_id": "gemma-bos-regression", + }, + ) + assert body["usage"]["prompt_tokens"] == len(expected_ids) diff --git a/examples/models/gemma4_31b/test_serve.py b/examples/models/gemma4_31b/test_serve.py new file mode 100644 index 00000000000..98f9cde0520 --- /dev/null +++ b/examples/models/gemma4_31b/test_serve.py @@ -0,0 +1,184 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""Hermetic tests for the Gemma 4 31B OpenAI serving launcher.""" + +import pathlib +from types import SimpleNamespace + +import pytest + +from executorch.examples.models.gemma4_31b import serve + +_HERE = pathlib.Path(serve.__file__).resolve().parent +_REPO_ROOT = serve._repo_root() + + +def test_generic_server_does_not_reference_gemma4_31b(): + server_dir = _REPO_ROOT / "examples/llm_server" + offenders = [] + for p in server_dir.rglob("*.py"): + text = p.read_text() + if "gemma4_31b" in text or "Gemma4_31B" in text: + offenders.append(p) + assert offenders == [], f"generic server must not reference Gemma: {offenders}" + + +def test_control_plane_runs_no_model_code(): + serve_src = (_HERE / "serve.py").read_text() + assert "Gemma4_31BEngine" not in serve_src + worker_src = (_HERE / "gemma4_31b_worker.cpp").read_text() + assert "Gemma4_31BEngine" in worker_src + + +def test_python_worker_and_pybind_are_absent(): + assert not (_HERE / "worker.py").exists() + assert not (_HERE / "gemma4_31b_pybindings.cpp").exists() + + +def test_spawn_builds_worker_command(monkeypatch): + captured = {} + + def fake_spawn(cmd, env=None): + captured["cmd"] = cmd + captured["env"] = env + return object() + + monkeypatch.setattr(serve, "spawn_worker", fake_spawn) + serve._spawn( + SimpleNamespace( + worker_bin="/bin/gemma_worker", + model_path="m.pte", + tokenizer_path="t.json", + data_path="d.ptd", + max_sessions=4, + warm_resume=True, + bos_id=2, + eos_id=1, + ) + ) + assert captured["cmd"] == [ + "/bin/gemma_worker", + "--model_path", + "m.pte", + "--tokenizer_path", + "t.json", + "--max_sessions", + "4", + "--warm_resume=true", + "--bos_id", + "2", + "--eos_id", + "1", + "--data_path", + "d.ptd", + ] + + +def test_spawn_defaults_worker_bin_and_omits_empty_data_path(monkeypatch): + captured = {} + monkeypatch.setattr( + serve, "spawn_worker", lambda cmd, env=None: captured.update(cmd=cmd) + ) + serve._spawn( + SimpleNamespace( + worker_bin=None, + model_path="m.pte", + tokenizer_path="t.json", + data_path=None, + max_sessions=1, + warm_resume=False, + bos_id=2, + eos_id=1, + ) + ) + cmd = captured["cmd"] + assert cmd[0].endswith("gemma4_31b_worker") + assert "--data_path" not in cmd + assert "--warm_resume=false" in cmd + + +def test_build_app_uses_gemma_options(monkeypatch): + captured = {} + + class _FakeTemplate: + def __init__(self, *args, **kwargs): + captured["template_kwargs"] = kwargs + + class _FakeRuntime: + def close_worker(self): + pass + + class _FakeApp: + def on_event(self, event): + captured["event"] = event + return lambda fn: fn + + monkeypatch.setattr(serve, "ChatTemplate", _FakeTemplate) + monkeypatch.setattr(serve, "_spawn", lambda args: object()) + monkeypatch.setattr(serve, "SessionRuntime", lambda worker: _FakeRuntime()) + + def fake_serving(*args, **kwargs): + captured["serving_kwargs"] = kwargs + return object() + + monkeypatch.setattr(serve, "ServingChat", fake_serving) + monkeypatch.setattr(serve, "build_app", lambda serving, model_id: _FakeApp()) + serve.build_app_from_args( + SimpleNamespace( + hf_tokenizer="hf", + model_id="gemma4_31b", + max_context=10, + tool_parser="gemma", + ) + ) + + assert captured["template_kwargs"] == { + "assistant_header": "<|turn>model\n", + "strip_rendered_bos": True, + "append_generation_prompt_after_tool_response": True, + } + assert ( + captured["serving_kwargs"]["tool_detector_cls"] is serve.GemmaToolCallDetector + ) + assert captured["serving_kwargs"]["prompt_token_offset"] == 1 + assert captured["serving_kwargs"]["content_filter"] is serve._strip_gemma_channels + assert captured["event"] == "shutdown" + + +def test_strip_gemma_channels_returns_visible_answer(): + text = "<|channel>thought\nscratch work\nThe answer." + assert serve._strip_gemma_channels(text) == "The answer." + + +def test_strip_gemma_channels_cuts_unclosed_channel(): + assert serve._strip_gemma_channels("Lead <|channel>thought") == "Lead" + + +def test_strip_gemma_channels_removes_stray_close(): + assert serve._strip_gemma_channels("Visible") == "Visible" + + +def test_rejects_multiple_runners(monkeypatch): + import sys + + monkeypatch.setattr( + sys, + "argv", + [ + "serve.py", + "--model-path", + "m.pte", + "--tokenizer-path", + "t.json", + "--hf-tokenizer", + "hf", + "--num-runners", + "2", + ], + ) + with pytest.raises(SystemExit): + serve.main()