diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index ca7297c998d..42ad12e1689 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -6529,70 +6529,55 @@ def test_qwen2_5(self): class TestExampleMultimodalityScript(TestQNN): - def test_smolvlm_500m_instruct(self): - if not self.required_envs(): - self.skipTest("missing required envs") - prompt = "Can you describe this image?" - cmds = [ - "python", - f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py", - "--artifact", - self.artifact_dir, - "--build_folder", - self.build_folder, - "--model", - self.model, - "--ip", - self.ip, - "--port", - str(self.port), - "--prompt", - prompt, - "--temperature", - "0", - "--decoder_model", - "smolvlm_500m_instruct", - "--model_mode", - "kv", - "--max_seq_len", - "128", - ] - if self.compile_only: - cmds.extend(["--compile_only"]) - elif self.device: - cmds.extend(["--device", self.device]) - if self.host: - cmds.extend(["--host", self.host]) - elif self.enable_x86_64: - cmds.extend(["--enable_x86_64"]) - if self.pre_gen_pte: - cmds.extend(["--pre_gen_pte", self.pre_gen_pte]) + @dataclass(frozen=True) + class MLLMSpecs: + max_seq_len: int + sm8650_token_rate: float + sm8750_token_rate: float + encoder_pte_size: float + text_embedding_pte_size: float + decoder_pte_size: float - p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) - with Listener((self.ip, self.port)) as listener: - conn = listener.accept() - p.communicate() - msg = json.loads(conn.recv()) - if "Error" in msg: - self.fail(msg["Error"]) - else: - if not self.enable_x86_64: - encoder_pte_size = msg["encoder_pte_size"] - text_embedding_pte_size = msg["text_embedding_pte_size"] - decoder_pte_size = msg["pte_size"] - self.assertLessEqual(encoder_pte_size, 110_000_000) # 110MB - self.assertLessEqual(text_embedding_pte_size, 100_000_000) # 100MB - self.assertLessEqual(decoder_pte_size, 400_000_000) # 400MB - print(f"Encoder PTE Size: {encoder_pte_size} bytes") - print(f"Text Embedding PTE Size: {text_embedding_pte_size} bytes") - print(f"Decoder PTE Size: {decoder_pte_size} bytes") + @dataclass(frozen=True) + class VLMSpecs(MLLMSpecs): + image_path: str + golden_image_feature: str - def test_internvl3_1b(self): - if not self.required_envs(): + # TODO: refactor to support different backends + def setUp(self): + self.vlm_specs = { + "smolvlm_500m_instruct": TestExampleMultimodalityScript.VLMSpecs( + max_seq_len=128, + sm8650_token_rate=50, + sm8750_token_rate=55, + encoder_pte_size=110_000_000, # 110MB + text_embedding_pte_size=100_000_000, # 100MB + decoder_pte_size=400_000_000, # 400MB + image_path="https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg", # New York Bay + golden_image_feature="city", + ), + "internvl3_1b": TestExampleMultimodalityScript.VLMSpecs( + max_seq_len=320, + sm8650_token_rate=11, + sm8750_token_rate=13, + encoder_pte_size=425_000_000, # 425MB + text_embedding_pte_size=300_000_000, # 300MB + decoder_pte_size=550_000_000, # 550 MB + image_path="http://images.cocodataset.org/val2017/000000039769.jpg", # Two cats lying on a blanket + golden_image_feature="cats", + ), + } + + def test_static_vlm(self): + if not self.required_envs([self.model_name]): self.skipTest("missing required envs") + vlm_specs: TestExampleMultimodalityScript.VLMSpecs = self.vlm_specs[ + self.model_name + ] prompt = "Can you describe this image?" 
+ image_path = vlm_specs.image_path cmds = [ "python", f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py", @@ -6608,14 +6593,16 @@ def test_internvl3_1b(self): str(self.port), "--prompt", prompt, + "--image_path", + image_path, "--temperature", "0", "--decoder_model", - "internvl3_1b", + f"{self.model_name}", "--model_mode", "kv", "--max_seq_len", - "320", + f"{vlm_specs.max_seq_len}", ] if self.compile_only: cmds.extend(["--compile_only"]) @@ -6636,17 +6623,41 @@ def test_internvl3_1b(self): if "Error" in msg: self.fail(msg["Error"]) else: + if not self.compile_only: + model_out = msg["result"][0] + self.assertTrue( + vlm_specs.golden_image_feature in model_out, + f"Expected Output contains feature: '{vlm_specs.golden_image_feature}' Actual Output: '{model_out}'", + ) + print(f"Image Path: {image_path}") + print(f"Query: {prompt}") + print(f"Answer: {model_out}") if not self.enable_x86_64: encoder_pte_size = msg["encoder_pte_size"] text_embedding_pte_size = msg["text_embedding_pte_size"] decoder_pte_size = msg["pte_size"] - self.assertLessEqual(encoder_pte_size, 425_000_000) # 425MB - self.assertLessEqual(text_embedding_pte_size, 300_000_000) # 300MB - self.assertLessEqual(decoder_pte_size, 550_000_000) # 550MB + self.assertLessEqual(encoder_pte_size, vlm_specs.encoder_pte_size) + self.assertLessEqual( + text_embedding_pte_size, vlm_specs.text_embedding_pte_size + ) + self.assertLessEqual(decoder_pte_size, vlm_specs.decoder_pte_size) print(f"Encoder PTE Size: {encoder_pte_size} bytes") print(f"Text Embedding PTE Size: {text_embedding_pte_size} bytes") print(f"Decoder PTE Size: {decoder_pte_size} bytes") + attr_name = f"{self.model.lower()}_token_rate" + if ( + not self.compile_only + and not self.enable_x86_64 + and hasattr(vlm_specs, attr_name) + ): + device_inference_speed = msg["inference_speed"] + expected_inference_speed = getattr(vlm_specs, attr_name) + print(f"Prompt Evaluation: {device_inference_speed} tokens/second") + self.assertGreaterEqual( + device_inference_speed, expected_inference_speed + ) + class TestExampleOssScript(TestQNN): def test_albert(self): diff --git a/examples/qualcomm/oss_scripts/llama/CMakeLists.txt b/examples/qualcomm/oss_scripts/llama/CMakeLists.txt index 612f898028c..a971ff1aacb 100644 --- a/examples/qualcomm/oss_scripts/llama/CMakeLists.txt +++ b/examples/qualcomm/oss_scripts/llama/CMakeLists.txt @@ -81,3 +81,60 @@ target_compile_options(qnn_llama_runner PUBLIC ${_common_compile_options}) set_target_properties( qnn_llama_runner PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'" ) + +# build qnn multimodal runner preprocess qnn runner src files for multimodal +set(_multimodal_runner__srcs ${_llama_runner__srcs}) +list(FILTER _multimodal_runner__srcs EXCLUDE REGEX ".*qnn_llama_runner.*") +list(FILTER _multimodal_runner__srcs EXCLUDE REGEX ".*runner/runner\.(cpp|h)") +list( + PREPEND + _multimodal_runner__srcs + ${CMAKE_CURRENT_LIST_DIR}/qnn_multimodal_runner.cpp + ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_runner.cpp + ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_runner.h + ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/encoder.cpp + ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/encoder.h + ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/embedding_runner.cpp + ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/embedding_runner.h + ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/embedding_processor.cpp + ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/embedding_processor.h + 
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_prompt_processor.cpp + ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_prompt_processor.h + ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_token_generator.cpp + ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_token_generator.h + ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_lhd_token_generator.cpp + ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_lhd_token_generator.h +) + +list(APPEND _multimodal_runner__srcs) + +# build qnn multimodal runner +add_executable(qnn_multimodal_runner ${_multimodal_runner__srcs}) +target_include_directories( + qnn_multimodal_runner PUBLIC ${_common_include_directories} +) +target_include_directories( + qnn_multimodal_runner + PUBLIC ${EXECUTORCH_ROOT}/extension/llm/tokenizers/include +) +target_compile_options(qnn_multimodal_runner PUBLIC ${_common_compile_options}) + +target_link_libraries( + qnn_multimodal_runner + qnn_executorch_backend + executorch_core + extension_data_loader + extension_flat_tensor + extension_llm_runner + extension_module + extension_tensor + gflags + custom_ops + quantized_ops_lib + quantized_kernels + tokenizers::tokenizers +) + +set_target_properties( + qnn_multimodal_runner PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'" +) diff --git a/examples/qualcomm/oss_scripts/llama/README.md b/examples/qualcomm/oss_scripts/llama/README.md index 80748642db2..896ffa6a7c1 100644 --- a/examples/qualcomm/oss_scripts/llama/README.md +++ b/examples/qualcomm/oss_scripts/llama/README.md @@ -4,7 +4,8 @@ **Video Tutorial:** [Build Along: Run LLMs Locally on Qualcomm Hardware Using ExecuTorch](https://www.youtube.com/watch?v=41PKDlGM3oU) -This file provides you the instructions to run LLM Decoder model with different parameters via Qualcomm HTP backend. We currently support the following models: +This file provides you the instructions to run LLM Decoder model and VLM model with different parameters via Qualcomm HTP backend. We currently support the following models: +- LLM 1. LLAMA2 Stories 110M 1. LLAMA3.2 1B @@ -20,7 +21,10 @@ This file provides you the instructions to run LLM Decoder model with different 1. QWEN3 0.6B / 1.7B 1. SmolLM2 135M 1. SmolLM3 3B - +- VLM + + 1. SmolVLM 500M + 1. InternVL3 1B We offer the following modes to execute the model: @@ -162,7 +166,7 @@ python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL ``` #### Phi4-mini-instruct -Default example using kv mode. +Default example using hybrid mode. ```bash python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --decoder_model phi_4_mini --model_mode hybrid --prefill_ar_len 128 --max_seq_len 1024 --prompt "I would like to learn python, could you teach me with a simple example?" --tasks wikitext --limit 1 ``` @@ -174,9 +178,9 @@ python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL ``` #### QWEN2.5 1.5B -Default example using kv mode +Default example using hybrid mode ```bash -python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --temperature 0 --model_mode hybrid --prefill_ar_len 128 --max_seq_len 1024 --prompt "I would like to learn python, could you teach me with a simple example?" 
--tasks wikitext --limit 1 +python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --temperature 0 --model_mode hybrid --prefill_ar_len 128 --max_seq_len 1024 --decoder_model qwen2_5-1_5b --prompt "I would like to learn python, could you teach me with a simple example?" --tasks wikitext --limit 1 ``` #### QWEN3 0.6B @@ -198,11 +202,149 @@ python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL ``` #### SmolLM3 -Default example using kv mode. +Default example using hybrid mode. ```bash python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --decoder_model smollm3-3b --model_mode hybrid --prefill_ar_len 128 --max_seq_len 1024 --prompt "I would like to learn python, could you teach me with a simple example?" --tasks wikitext --limit 1 ``` +## Multimodal Support + +### Overview + +Multimodal models extend LLMs by processing multiple input modalities (vision, audio, text) simultaneously. This framework provides a unified architecture for multimodal inference via the Qualcomm HTP backend. + +**Current Support Status:** +- **Vision-Language Models (VLM)**: Fully supported +- **Audio-Language Models (ALM)**: Coming soon + +### Multimodal Architecture + +For the general multimodal processing pipeline, please refer to [Multimodal Architecture](../../../../extension/llm/runner/README.md#multimodalrunner-architecture) + + +### Processing Pipeline + +Multimodal inference follows these key stages: + +1. **Modality-Specific Encoding** + - **Vision**: Images are processed through a vision encoder to generate visual embeddings + - **Audio**: Audio waveforms are processed through an audio encoder *(future support)* + - **Text**: Text prompts are tokenized and embedded + +2. **Embedding Fusion** + - All modality embeddings are projected to a common embedding dimension + - Embeddings are concatenated or fused according to the model's template + - Special tokens are inserted to mark modality boundaries + +3. **Unified Language Generation** + - The fused embeddings are fed into the language model decoder + - The decoder generates text autoregressively using the same execution modes as LLM models (KV Cache, Hybrid, Lookahead) + +--- + +## Vision-Language Model (VLM) Support + +Vision-Language Models (VLMs) combine computer vision and natural language processing to understand and generate text based on visual inputs. VLMs in this framework consist of: + +- **[Vision Encoder](model/vision_encoder.py)**: Processes images into visual embeddings (e.g., SigLIP for SmolVLM) + - **Projection Layer** (included in vision encoder): Aligns visual embeddings with the language model's embedding space +- **[Language Decoder](model/static_llama.py)**: Reuses the static llama model to generate text based on fused visual and text embeddings + +### Instructions + +#### SmolVLM 500M +Default example using hybrid mode. +```bash +python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --decoder_model smolvlm_500m_instruct --model_mode hybrid --prefill_ar_len 16 --max_seq_len 1024 --prompt "Can you describe this image?" --image_path "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" +``` + +#### InternVL3 1B +Default example using hybrid mode.
+```bash +python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --decoder_model internvl3_1b --model_mode hybrid --prefill_ar_len 32 --max_seq_len 1024 --prompt "Can you describe this image?" --image_path "http://images.cocodataset.org/val2017/000000039769.jpg" +``` + +### Specifying a Custom Image + +You can specify a custom image for VLM models using the `--image_path` flag: + +Take an image of the Statue of Liberty in New York Bay as an example: +- **HTTP/HTTPS URLs**: Direct links to images on the web + - Example: `https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg` +- **Local file paths**: Absolute or relative paths to image files on your system + - Example: [`./examples/qualcomm/oss_scripts/llama/assets/samples/images/Statue-of-Liberty-Island-New-York-Bay.png`](assets/samples/images/Statue-of-Liberty-Island-New-York-Bay.png) + +**Default behavior:** +If `--image_path` is not specified, the system will automatically use the default image URL defined in the model's configuration file (`encoder/encoder_config.py`). + +#### Image Preprocessing + +Each VLM model has specific preprocessing requirements defined in its configuration: + +```python +# In encoder/encoder_config.py +@dataclass(init=False, frozen=True) +class SmolVLMEncoder(VisionModalityConfig): + encoder_class = Idefics3VisionEncoder + img_seq_len = 64 + img_resized_h = 512 + img_resized_w = 512 + img_url = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" # Default image + quant_recipe = SmolVLM_Encoder_QuantRecipe +``` + +- **img_resized_h / img_resized_w**: Target resolution for the vision encoder +- **img_seq_len**: Number of visual tokens generated by the encoder + +The image is automatically: +1. Loaded from the specified URL or file path +2. Resized to the model's expected resolution and preprocessed by HuggingFace [processors](https://huggingface.co/docs/transformers/main/processors) + +### Using Pre-Generated PTE Files + +If you have already compiled a VLM model, you can run inference with pre-generated PTE files: + +```bash +python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --decoder_model smolvlm_500m_instruct --model_mode kv --max_seq_len 1024 --prompt "Can you describe this image?" --image_path "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" --pre_gen_pte ${FOLDER_TO_PRE_GEN_PTE} +``` + +### VLM Processing Details + +The VLM inference pipeline consists of: + +1. **Vision Encoding Phase** + - Input image is preprocessed (resize, normalize) + - Vision encoder generates visual embeddings: `[batch, img_seq_len, hidden_dim]` + - Visual embeddings are projected to match the language model dimension by the modality projector + +2. **Text Tokenization Phase** + - User prompt is tokenized into text tokens + - Text tokens are embedded: `[batch, text_seq_len, hidden_dim]` + +3. **Embedding Fusion Phase** + - Visual and text embeddings are concatenated according to the model's template + - Special tokens (e.g., ``, `<|fake_token_around_image|>`, ``) mark modality boundaries (see [tokenizer.py](tokenizer.py)) + + ```python + # Special tokens for Vision-Language Model + VLM_SPECIAL_TOKENS = { + "smolvlm_500m_instruct": { + "image_token": "", + "global_img": "", + "fake_wrap_start": "", + "fake_wrap_end": "", + }, + ... + } + ``` + - Final fused sequence: `[batch, img_seq_len + text_seq_len, hidden_dim]` + +4.
**Language Generation Phase** + - Fused embeddings are fed into the language decoder + - Autoregressive generation produces output tokens + - KV cache is updated for efficient subsequent token generation + + ### KV Cache update mechanism We use Smart Mask mechanisms for updating the key-value (KV) cache. diff --git a/examples/qualcomm/oss_scripts/llama/__init__.py b/examples/qualcomm/oss_scripts/llama/__init__.py index 4a8a017758b..963db6e2d0a 100644 --- a/examples/qualcomm/oss_scripts/llama/__init__.py +++ b/examples/qualcomm/oss_scripts/llama/__init__.py @@ -49,7 +49,7 @@ from executorch.examples.qualcomm.oss_scripts.llama.encoder.encoder_config import ( InternVL3Encoder, - LateFusionModalityConfig, + MultiModalityConfig, SmolVLMEncoder, ) from executorch.examples.qualcomm.oss_scripts.llama.model.static_llama import ( @@ -183,12 +183,12 @@ def format_value(v): def register_llm_model( name: str, - vision_encoder: Optional[LateFusionModalityConfig] = None, + vision_encoder: Optional[MultiModalityConfig] = None, ): def decorator(cls: Type[LLMModelConfig]): cls.decoder_model_version = DECODER_MODEL_VERSION[name] if vision_encoder is not None and issubclass( - vision_encoder, LateFusionModalityConfig + vision_encoder, MultiModalityConfig ): setattr(cls, VISION_ENCODER, vision_encoder) SUPPORTED_LLM_MODELS[name.lower()] = cls() diff --git a/examples/qualcomm/oss_scripts/llama/assets/samples/images/Statue-of-Liberty-Island-New-York-Bay.png b/examples/qualcomm/oss_scripts/llama/assets/samples/images/Statue-of-Liberty-Island-New-York-Bay.png new file mode 100644 index 00000000000..07beceeeb63 Binary files /dev/null and b/examples/qualcomm/oss_scripts/llama/assets/samples/images/Statue-of-Liberty-Island-New-York-Bay.png differ diff --git a/examples/qualcomm/oss_scripts/llama/dataset.py b/examples/qualcomm/oss_scripts/llama/dataset.py index 69af3fdf8d3..72f9e5d766a 100644 --- a/examples/qualcomm/oss_scripts/llama/dataset.py +++ b/examples/qualcomm/oss_scripts/llama/dataset.py @@ -5,6 +5,7 @@ # LICENSE file in the root directory of this source tree. import argparse +import warnings from typing import Callable, List, Optional from executorch.examples.qualcomm.oss_scripts.llama import LLMModelConfig @@ -18,7 +19,7 @@ ) from executorch.examples.qualcomm.oss_scripts.llama.encoder.encoder_config import ( - LateFusionModalityConfig, + MultiModalityConfig, VisionModalityConfig, ) from executorch.examples.qualcomm.oss_scripts.llama.tokenizer import TokenizerWrapper @@ -54,10 +55,16 @@ def _build_vision_dataset(self, config: VisionModalityConfig, prompt: str): Returns: tuple of pixel values tensors """ - # Load image from the specified URL - # Currently only supports loading image from URL - # TODO: allow user-specified image path - image = load_image(config.img_url) + # Load image from user-specified path (URL or local file) + # fall back to the default image URL if no image is provided. + image_path = self.control_args.image_path or config.img_url + if not self.control_args.image_path: + warnings.warn( + f"No image path/URL provided, using default image URL: {config.img_url}", + UserWarning, + stacklevel=1, + ) + image = load_image(image_path) # Process image with text prompt using HuggingFace processor # Some HF processors (e.g. 
InternVL3) need to pass text arg or it will cause error and process failed @@ -81,7 +88,7 @@ def _build_vision_dataset(self, config: VisionModalityConfig, prompt: str): def _build_dataset_for_encoder( self, - config: LateFusionModalityConfig, + config: MultiModalityConfig, prompt: str, ) -> Optional[tuple]: if issubclass(config, VisionModalityConfig): diff --git a/examples/qualcomm/oss_scripts/llama/decoder_utils.py b/examples/qualcomm/oss_scripts/llama/decoder_utils.py index 3d28ca2186b..6edcc408a96 100644 --- a/examples/qualcomm/oss_scripts/llama/decoder_utils.py +++ b/examples/qualcomm/oss_scripts/llama/decoder_utils.py @@ -4,14 +4,13 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import copy import getpass import logging import os import subprocess from collections import defaultdict, OrderedDict from dataclasses import dataclass -from typing import List, Optional, Tuple, Union +from typing import Callable, List, Optional, Tuple, Union import numpy as np import torch @@ -101,7 +100,7 @@ def __init__( # noqa: C901 max_seq_length: int, ar_len: int, use_kv_cache: bool, - example_input: Tuple[List[torch.Tensor]], + get_example_inputs: Callable, use_i64_token: bool, seq_mse_candidates: int, ): @@ -113,7 +112,7 @@ def __init__( # noqa: C901 self._model = model.to(self.device) self.ar_len = ar_len self._use_kv_cache = use_kv_cache - self.example_input = example_input + self.get_example_inputs = get_example_inputs self.max_seq_length = max_seq_length self.use_i64_token = use_i64_token self.seq_mse_candidates = seq_mse_candidates @@ -126,9 +125,7 @@ def _model_call(self, inps): kwargs["seq_mse_candidates"] = self.seq_mse_candidates all_logits = INFERENCE_REGISTRY[self._use_kv_cache]( - copy.deepcopy( - self.example_input - ), # Copy the example input to avoid KV cache pollution when testing PPL + self.get_example_inputs, inps, self._model, self._tokenizer, @@ -779,7 +776,7 @@ def _generate( @register_inference(use_kv_cache=True) def kv_inference( # noqa: C901 - example_input, + get_example_inputs: Callable, prompt: Union[str, list], module: torch.fx.GraphModule, tokenizer, @@ -801,7 +798,7 @@ def kv_inference( # noqa: C901 ] ) - _, atten_mask, _, k_caches, v_caches = example_input + _, atten_mask, _, k_caches, v_caches = get_example_inputs() # TODO: change criteria & support batch inputs if necessary all_pos = torch.arange(0, max_seq_len, 1, dtype=torch.int32).unsqueeze(0) @@ -920,7 +917,7 @@ def kv_inference( # noqa: C901 @register_inference(use_kv_cache=False) def prefill_inference( - example_input, + get_example_inputs: Callable, prompt: Union[str, list], module: torch.fx.GraphModule, tokenizer, @@ -939,7 +936,7 @@ def prefill_inference( ] ) - _, atten_mask = example_input + _, atten_mask = get_example_inputs() # TODO: change criteria & support batch inputs if necessary @@ -1001,7 +998,7 @@ def prefill_inference( def graph_module_inference( use_kv_cache: bool, - example_input, + get_example_inputs: Callable, module: torch.fx.GraphModule, tokenizer, ar_len=1, @@ -1034,7 +1031,7 @@ def graph_module_inference( kwargs["lookahead_config"] = lookahead_config INFERENCE_REGISTRY[use_kv_cache]( - example_input, + get_example_inputs, prompt, module, tokenizer, @@ -1054,7 +1051,7 @@ def graph_module_inference( max_seq_length=max_seq_len, ar_len=ar_len, use_kv_cache=use_kv_cache, - example_input=example_input, + get_example_inputs=get_example_inputs, use_i64_token=use_i64_token, seq_mse_candidates=seq_mse_candidates, 
) diff --git a/examples/qualcomm/oss_scripts/llama/encoder/__init__.py b/examples/qualcomm/oss_scripts/llama/encoder/__init__.py index 56798748023..f3d821bcd46 100644 --- a/examples/qualcomm/oss_scripts/llama/encoder/__init__.py +++ b/examples/qualcomm/oss_scripts/llama/encoder/__init__.py @@ -6,7 +6,7 @@ from executorch.examples.qualcomm.oss_scripts.llama.encoder.encoder_config import ( InternVL3Encoder, - LateFusionModalityConfig, + MultiModalityConfig, SmolVLMEncoder, VisionModalityConfig, ) @@ -20,7 +20,7 @@ "EncoderQuantRecipe", "InternVL3Encoder", "InternVL3_Encoder_QuantRecipe", - "LateFusionModalityConfig", + "MultiModalityConfig", "SmolVLMEncoder", "SmolVLM_Encoder_QuantRecipe", "VisionModalityConfig", diff --git a/examples/qualcomm/oss_scripts/llama/encoder/encoder_config.py b/examples/qualcomm/oss_scripts/llama/encoder/encoder_config.py index 331894473af..b8e32904bbf 100644 --- a/examples/qualcomm/oss_scripts/llama/encoder/encoder_config.py +++ b/examples/qualcomm/oss_scripts/llama/encoder/encoder_config.py @@ -20,13 +20,10 @@ @dataclass(init=False, frozen=True) -class LateFusionModalityConfig(ABC): +class MultiModalityConfig(ABC): """ Base configuration class for late fusion modality encoders. - Late fusion refers to the approach where different modalities (vision, audio, text) - are processed separately by their respective encoders and then fused at a later stage. - Attributes: encoder_class: The encoder class that implements the modality processing. quant_recipe: Quantization recipe for optimizing the encoder. @@ -41,7 +38,7 @@ def create_encoder(self, config): @dataclass(init=False, frozen=True) -class VisionModalityConfig(LateFusionModalityConfig): +class VisionModalityConfig(MultiModalityConfig): """ Base config for vision modality encoders. 
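The `MultiModalityConfig` rename above feeds into the `register_llm_model` decorator shown earlier in `__init__.py`. The following is a minimal, self-contained sketch of that registration pattern under simplified assumptions: the registry name, decorator signature, and config field names mirror the diff, while `MyVLMConfig`, the default values, and the placeholder URL are hypothetical stand-ins rather than project code.

```python
# Sketch of the modality-config registration pattern touched by the rename
# (LateFusionModalityConfig -> MultiModalityConfig). Stand-ins, not project code.
from dataclasses import dataclass
from typing import Dict, Optional, Type


@dataclass(init=False, frozen=True)
class MultiModalityConfig:
    """Base config for modality encoders (vision today, audio planned)."""

    encoder_class: type = object  # stand-in for the real encoder module class


@dataclass(init=False, frozen=True)
class VisionModalityConfig(MultiModalityConfig):
    img_seq_len: int = 64
    img_resized_h: int = 512
    img_resized_w: int = 512
    img_url: str = "https://example.com/default.jpg"  # hypothetical default image


class LLMModelConfig:
    """Stand-in for the decoder model config base class."""


SUPPORTED_LLM_MODELS: Dict[str, LLMModelConfig] = {}


def register_llm_model(
    name: str, vision_encoder: Optional[Type[MultiModalityConfig]] = None
):
    """Register a decoder config and optionally attach a vision-encoder config."""

    def decorator(cls: Type[LLMModelConfig]) -> Type[LLMModelConfig]:
        if vision_encoder is not None and issubclass(
            vision_encoder, MultiModalityConfig
        ):
            cls.vision_encoder = vision_encoder  # attached only for multimodal models
        SUPPORTED_LLM_MODELS[name.lower()] = cls()
        return cls

    return decorator


@register_llm_model("my_vlm", vision_encoder=VisionModalityConfig)
class MyVLMConfig(LLMModelConfig):
    pass


print(SUPPORTED_LLM_MODELS["my_vlm"].vision_encoder.img_seq_len)  # 64
```

A usage note: because the encoder config is attached via `setattr` only when provided, text-only decoders simply lack the attribute, which is what checks like `hasattr(decoder_model_config, VISION_ENCODER)` in `llama.py` rely on.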
diff --git a/examples/qualcomm/oss_scripts/llama/eval_llama_qnn.py b/examples/qualcomm/oss_scripts/llama/eval_llama_qnn.py index 53f359d2268..46a7e26d58a 100644 --- a/examples/qualcomm/oss_scripts/llama/eval_llama_qnn.py +++ b/examples/qualcomm/oss_scripts/llama/eval_llama_qnn.py @@ -337,7 +337,7 @@ def eval_llm(args): logging.info("Observers added, starting calibration...") graph_module_inference( use_kv_cache=False, - get_example_inputs=lambda use_kv_cache=False: inputs, + get_example_inputs=lambda: inputs, module=model, tokenizer=tokenizer, ar_len=args.max_seq_len, @@ -358,7 +358,7 @@ def eval_llm(args): # graph_module_inference( # use_kv_cache=False, - # get_example_inputs=lambda use_kv_cache=False: inputs, + # get_example_inputs=lambda: inputs, # module=model, # tokenizer=tokenizer, # ar_len=args.max_seq_len, diff --git a/examples/qualcomm/oss_scripts/llama/llama.py b/examples/qualcomm/oss_scripts/llama/llama.py index 1267b1ac78b..bddebdc6387 100755 --- a/examples/qualcomm/oss_scripts/llama/llama.py +++ b/examples/qualcomm/oss_scripts/llama/llama.py @@ -36,6 +36,7 @@ TEXT_EMBEDDING_GRAPH_NAMES, TEXT_ENCODER, VISION_ENCODER, + VISION_ENCODER_INPUT_FILENAME, ) from executorch.examples.qualcomm.oss_scripts.llama.decoder_utils import ( QnnRunnerEvalWrapper, @@ -45,14 +46,12 @@ MultiModalManager, next_power_of_two, ) - from executorch.examples.qualcomm.utils import ( make_output_dir, setup_common_args_and_variables, SimpleADB, ) - try: from lm_eval.evaluator import simple_evaluate except ImportError: @@ -163,38 +162,9 @@ def inference( else f"{args.artifact}/{pte_filenames[TEXT_DECODER]}.pte" ) - # TODO: support multimodal runtime, we only check pte size for now - if is_modality: - if args.ip and args.port != -1: - encoder_pte_path = ( - f"{args.pre_gen_pte}/{pte_filenames[VISION_ENCODER]}.pte" - if args.pre_gen_pte - else f"{args.artifact}/{pte_filenames[VISION_ENCODER]}.pte" - ) - text_embedding_pte_path = ( - f"{args.pre_gen_pte}/{pte_filenames[TEXT_EMBEDDING]}.pte" - if args.pre_gen_pte - else f"{args.artifact}/{pte_filenames[TEXT_EMBEDDING]}.pte" - ) - # Prepare validation results for CI system - validation_results = { - "pte_size": os.path.getsize(pte_path), - "encoder_pte_size": os.path.getsize(encoder_pte_path), - "text_embedding_pte_size": os.path.getsize(text_embedding_pte_path), - } - with Client((args.ip, args.port)) as conn: - conn.send(json.dumps(validation_results)) - else: - logging.info("Multimodal runtime support is currently under development.") - logging.info( - "Detected vision/audio encoder in model config. Exiting process safely." 
- ) - exit(0) - return None - # For decoder-only models, enable accuracy evaluation using perplexity # TODO: Add support for multimodal accuracy evaluation (e.g., VLM) - if args.run_lm_eval: + if not is_modality and args.run_lm_eval: # Generate the eval wrapper eval_wrapper = QnnRunnerEvalWrapper( args=args, @@ -291,19 +261,51 @@ def post_process(): qnn_sdk = os.getenv("QNN_SDK_ROOT") target = "x86_64-linux-clang" - runner_cmd = " ".join( - [ - f"export LD_LIBRARY_PATH={qnn_sdk}/lib/{target}/:{args.build_folder}/lib &&", - f"./{args.build_folder}/examples/qualcomm/oss_scripts/llama/qnn_llama_runner", - f"--decoder_model_version {decoder_model_config.decoder_model_version}", - f"--tokenizer_path {runtime_tokenizer_path}", - f"--model_path {pte_path}", - f"--seq_len {seq_len}", - f"--output_path {args.artifact}/outputs/outputs.txt", - f"--performance_output_path {args.artifact}/{performance_output_path}", - runner_args, - ] - ) + if not is_modality: + runner_cmd = " ".join( + [ + f"export LD_LIBRARY_PATH={qnn_sdk}/lib/{target}/:{args.build_folder}/lib &&", + f"./{args.build_folder}/examples/qualcomm/oss_scripts/llama/qnn_llama_runner", + f"--decoder_model_version {decoder_model_config.decoder_model_version}", + f"--tokenizer_path {runtime_tokenizer_path}", + f"--model_path {pte_path}", + f"--seq_len {seq_len}", + f"--output_path {args.artifact}/outputs/outputs.txt", + f"--performance_output_path {args.artifact}/{performance_output_path}", + runner_args, + ] + ) + else: + # x86 emulator is intended for CI and not performance. Check only the first few tokens. + # For multimodal models, use 128 tokens (vs 16 for text-only) due to longer sequence length required for modality embeddings. + seq_len = min(seq_len, 128) + encoder_pte_path = ( + f"{args.pre_gen_pte}/{pte_filenames[VISION_ENCODER]}.pte" + if args.pre_gen_pte + else f"{args.artifact}/{pte_filenames[VISION_ENCODER]}.pte" + ) + text_embedding_pte_path = ( + f"{args.pre_gen_pte}/{pte_filenames[TEXT_EMBEDDING]}.pte" + if args.pre_gen_pte + else f"{args.artifact}/{pte_filenames[TEXT_EMBEDDING]}.pte" + ) + runner_cmd = " ".join( + [ + f"export LD_LIBRARY_PATH={qnn_sdk}/lib/{target}/:{args.build_folder}/lib &&", + f"./{args.build_folder}/examples/qualcomm/oss_scripts/llama/qnn_multimodal_runner", + f"--decoder_model_version {decoder_model_config.decoder_model_version}", + f"--tokenizer_path {runtime_tokenizer_path}", + f"--decoder_path {pte_path}", + f"--encoder_path {encoder_pte_path}", + f"--embedding_path {text_embedding_pte_path}", + f"--image_path {args.artifact}/{VISION_ENCODER_INPUT_FILENAME}.raw", + f"--seq_len {seq_len}", + f"--output_path {args.artifact}/outputs/outputs.txt", + f"--performance_output_path {args.artifact}/{performance_output_path}", + runner_args, + ] + ) + subprocess.run( runner_cmd, shell=True, @@ -312,36 +314,80 @@ def post_process(): ) post_process() else: - runner_cmd = " ".join( - [ - f"cd {workspace} &&", - f"./qnn_llama_runner", - f"--decoder_model_version {decoder_model_config.decoder_model_version}", - f"--tokenizer_path {os.path.basename(runtime_tokenizer_path)}", - f"--model_path {pte_filenames[TEXT_DECODER]}.pte", - f"--seq_len {seq_len}", - "--output_path outputs/outputs.txt", - f"--performance_output_path {performance_output_path}", - "--shared_buffer", - runner_args, - ] - ) + if not is_modality: + runner_cmd = " ".join( + [ + f"cd {workspace} &&", + f"./qnn_llama_runner", + f"--decoder_model_version {decoder_model_config.decoder_model_version}", + f"--tokenizer_path 
{os.path.basename(runtime_tokenizer_path)}", + f"--model_path {pte_filenames[TEXT_DECODER]}.pte", + f"--seq_len {seq_len}", + "--output_path outputs/outputs.txt", + f"--performance_output_path {performance_output_path}", + "--shared_buffer", + runner_args, + ] + ) + else: + encoder_pte_path = ( + f"{args.pre_gen_pte}/{pte_filenames[VISION_ENCODER]}.pte" + if args.pre_gen_pte + else f"{args.artifact}/{pte_filenames[VISION_ENCODER]}.pte" + ) + text_embedding_pte_path = ( + f"{args.pre_gen_pte}/{pte_filenames[TEXT_EMBEDDING]}.pte" + if args.pre_gen_pte + else f"{args.artifact}/{pte_filenames[TEXT_EMBEDDING]}.pte" + ) + runner_cmd = " ".join( + [ + f"cd {workspace} &&", + f"./qnn_multimodal_runner", + f"--decoder_model_version {decoder_model_config.decoder_model_version}", + f"--tokenizer_path {os.path.basename(runtime_tokenizer_path)}", + f"--decoder_path {pte_filenames[TEXT_DECODER]}.pte", + f"--encoder_path {pte_filenames[VISION_ENCODER]}.pte", + f"--embedding_path {pte_filenames[TEXT_EMBEDDING]}.pte", + f"--image_path {VISION_ENCODER_INPUT_FILENAME}.raw", + f"--seq_len {seq_len}", + "--output_path outputs/outputs.txt", + f"--performance_output_path {performance_output_path}", + "--shared_buffer", + runner_args, + ] + ) + adb = SimpleADB( qnn_sdk=os.getenv("QNN_SDK_ROOT"), build_path=f"{args.build_folder}", - pte_path=pte_path, + pte_path=( + pte_path + if not is_modality + else [pte_path, encoder_pte_path, text_embedding_pte_path] + ), workspace=workspace, device_id=args.device, host_id=args.host, soc_model=args.model, shared_buffer=True, target=args.target, - runner=f"examples/qualcomm/oss_scripts/llama/qnn_llama_runner", + runner=( + f"examples/qualcomm/oss_scripts/llama/qnn_llama_runner" + if not is_modality + else f"examples/qualcomm/oss_scripts/llama/qnn_multimodal_runner" + ), ) # No pregen inputs, input_list is not required if not args.skip_push: - adb.push(inputs=[], files=[runtime_tokenizer_path]) + # Always use image from artifact folder since that's where it's saved during preprocessing + # regardless of whether pre_gen_pte is used (pre_gen_pte only applies to .pte model files) + image_path = f"{args.artifact}/{VISION_ENCODER_INPUT_FILENAME}.raw" + adb.push( + inputs=[], + files=[runtime_tokenizer_path] + ([image_path] if is_modality else []), + ) adb.execute(custom_runner_cmd=runner_cmd) adb.pull(output_path=args.artifact, callback=post_process) @@ -358,6 +404,16 @@ def post_process(): "inference_speed": inference_speed, "pte_size": os.path.getsize(pte_path), } + + # Add multimodal-specific metrics if applicable + if is_modality: + validation_results.update( + { + "encoder_pte_size": os.path.getsize(encoder_pte_path), + "text_embedding_pte_size": os.path.getsize(text_embedding_pte_path), + } + ) + with Client((args.ip, args.port)) as conn: conn.send(json.dumps(validation_results)) else: @@ -526,6 +582,13 @@ def _build_parser(): type=int, ) + parser.add_argument( + "--image_path", + help="Path to the image file for multimodal language models (MLLM). If not specified, the default image from encoder/encoder_config.py will be used. The image should be preprocessed and saved in raw binary format.", + default=None, + type=str, + ) + parser.add_argument("-v", "--verbose", action="store_true") return parser @@ -588,6 +651,17 @@ def export_llama(args) -> None: args.prompt, chat_template ) + # TODO: Implement multi-turn conversation support for multimodal models (vision/audio). 
+ assert ( + not ( + hasattr(decoder_model_config, VISION_ENCODER) + or hasattr(decoder_model_config, AUDIO_ENCODER) + ) + ) or (len(args.prompt) <= 1), ( + "Multimodal models currently do not support multi-turn. " + "Please set `--prompt` to 1 or switch to a unimodal (text-only) decoder." + ) + if args.pre_gen_pte: inference( args, decoder_model_config, pte_filenames, runtime_tokenizer_path, tokenizer diff --git a/examples/qualcomm/oss_scripts/llama/model/embedding.py b/examples/qualcomm/oss_scripts/llama/model/embedding.py index b9c470b470d..4956012baf0 100644 --- a/examples/qualcomm/oss_scripts/llama/model/embedding.py +++ b/examples/qualcomm/oss_scripts/llama/model/embedding.py @@ -35,5 +35,5 @@ def get_example_input(self): ), ) - def forward(self, input_ids): - return self.input_embedding_module(input_ids) + def forward(self, tokens): + return self.input_embedding_module(tokens) diff --git a/examples/qualcomm/oss_scripts/llama/model/vision_encoder.py b/examples/qualcomm/oss_scripts/llama/model/vision_encoder.py index 7ea3696799c..431e28a20d6 100644 --- a/examples/qualcomm/oss_scripts/llama/model/vision_encoder.py +++ b/examples/qualcomm/oss_scripts/llama/model/vision_encoder.py @@ -4,7 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -from typing import Optional, Tuple +from typing import Tuple import torch @@ -27,7 +27,7 @@ ) -# Custom implementation based on `transformers/models/idefics3/modeling_idefics3/Idefics3VisionEmbeddings.py` (Transformers v4.56.1) +# Custom implementation based on `transformers/models/idefics3/modeling_idefics3/Idefics3VisionEmbeddings.py` (Transformers v5.0.0rc1) # # Qualcomm optimization: # Precompute and register positional IDs as a buffer to avoid computation during forward passes. 
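The optimization noted in the comment above, precomputing position IDs once and registering them as a buffer, can be illustrated with a small standalone sketch. This is not the project's `Idefics3VisionEmbeddings` code; the module name and dimensions below are illustrative assumptions.

```python
# Illustrative sketch: with static image shapes, position ids are computed once
# in __init__ and registered as a (non-persistent) buffer, so forward() only
# performs an embedding lookup instead of recomputing indices per call.
import torch
import torch.nn as nn


class PatchPositionEmbedding(nn.Module):
    def __init__(self, image_size: int = 512, patch_size: int = 16, embed_dim: int = 768):
        super().__init__()
        num_patches = (image_size // patch_size) ** 2
        self.position_embedding = nn.Embedding(num_patches, embed_dim)
        position_ids = torch.arange(num_patches).unsqueeze(0)  # [1, num_patches]
        self.register_buffer("position_ids", position_ids, persistent=False)

    def forward(self, patch_embeds: torch.Tensor) -> torch.Tensor:
        # patch_embeds: [batch, num_patches, embed_dim]
        return patch_embeds + self.position_embedding(self.position_ids)


if __name__ == "__main__":
    module = PatchPositionEmbedding()
    out = module(torch.randn(1, (512 // 16) ** 2, 768))
    print(out.shape)  # torch.Size([1, 1024, 768])
```

Because the exported graph uses static image shapes, the ids never change between calls, so moving them out of `forward` keeps that index computation out of the compiled graph.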
@@ -119,62 +119,30 @@ def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: return embeddings -# Custom implementation based on `transformers/models/idefics3/modeling_idefics3/Idefics3VisionTransformer.py` (Transformers v4.56.1) +# Custom implementation based on `transformers/models/idefics3/modeling_idefics3/Idefics3VisionTransformer.py` (Transformers v5.0.0rc1) # # Qualcomm changes: # Assume the image is non-empty and skip attention mask propagation to the encoder class CustomIdefics3VisionTransformer(Idefics3PreTrainedModel): config: Idefics3VisionConfig - _supports_sdpa = True - _supports_flash_attn = True - _supports_flex_attn = True def __init__(self, config: Idefics3VisionConfig): super().__init__(config) - embed_dim = config.hidden_size - self.embeddings = Idefics3VisionEmbeddings(config) self.encoder = Idefics3Encoder(config) - self.patch_size = config.patch_size - self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) - self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" - - def forward( - self, - pixel_values, - patch_attention_mask: Optional[torch.BoolTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ): - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict + self.post_layernorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps ) + def forward(self, pixel_values): hidden_states = self.embeddings(pixel_values=pixel_values) encoder_outputs = self.encoder( inputs_embeds=hidden_states, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, ) last_hidden_state = encoder_outputs[0] last_hidden_state = self.post_layernorm(last_hidden_state) - if not return_dict: - return (last_hidden_state,) + encoder_outputs[1:] - return BaseModelOutput( last_hidden_state=last_hidden_state, hidden_states=encoder_outputs.hidden_states, @@ -182,7 +150,7 @@ def forward( ) -# Custom implementation based on `transformers/models/idefics3/modeling_idefics3.py` (Transformers v4.56.1). +# Custom implementation based on `transformers/models/idefics3/modeling_idefics3.py` (Transformers v5.0.0rc1). # # Qualcomm optimization: # - Dynamic shape support is removed; computations are now static for efficiency. @@ -277,7 +245,7 @@ def forward( return image_hidden_states -# Copy from transformers/models/internvl/modeling_internvl.py (Transformers v4.56.1). +# Copy from transformers/models/internvl/modeling_internvl.py (Transformers v5.0.0rc1). class InternVL3VisionEncoder(torch.nn.Module): def __init__( self, config: InternVLConfig, img_resized_h: int = 448, img_resized_w: int = 448 diff --git a/examples/qualcomm/oss_scripts/llama/qnn_multimodal_runner.cpp b/examples/qualcomm/oss_scripts/llama/qnn_multimodal_runner.cpp new file mode 100644 index 00000000000..0b3f2ee4ad1 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/qnn_multimodal_runner.cpp @@ -0,0 +1,396 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +/** + * @file + * + * This tool can run SmolVLM 500M, InternVL3 1B + * with Qualcomm AI Engine Direct. + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +// Model paths +DEFINE_string( + embedding_path, + "embedding.pte", + "Path to embedding model serialized in flatbuffer format."); +DEFINE_string( + encoder_path, + "encoder.pte", + "Path to vision encoder model serialized in flatbuffer format."); +DEFINE_string( + decoder_path, + "decoder.pte", + "Path to decoder model serialized in flatbuffer format."); + +// Tokenizer and output paths +DEFINE_string(tokenizer_path, "tokenizer.bin", "Tokenizer path."); +DEFINE_string( + output_path, + "outputs.txt", + "Executorch inference data output path."); +DEFINE_string( + performance_output_path, + "inference_speed.txt", + "Records inference speed. For CI purpose."); +DEFINE_string( + dump_logits_path, + "", + "If path is provided, program will dump all logits generated."); + +// Model configuration +DEFINE_string(decoder_model_version, "llama3", "The decoder model version."); +DEFINE_string( + prompt, + "Describe this image:", + "Text prompt for the multimodal model."); +DEFINE_string( + tokenized_prompt, + "", + "This is an alternative of passing prompts. Users could provide this in a raw file, with tokens saved in uint64 format."); +DEFINE_string( + image_path, + "", + "Path to input image file. If empty, text-only mode is used."); +DEFINE_string(system_prompt, "", "System prompt for the model."); + +// Generation parameters +DEFINE_double( + temperature, + 0.0f, + "Temperature; Default is 0.0f. 0 = greedy argmax sampling (deterministic). Lower temperature = more deterministic"); +DEFINE_int32( + seq_len, + 128, + "Total number of tokens to generate (prompt + output)."); +DEFINE_int32( + eval_mode, + 1, + "0: TokenGenerator(kv) / 1: HybridMode (prefill+kv) / 2: Lookahead Decoding"); + +DEFINE_bool( + shared_buffer, + false, + "Specifies to use shared buffers for zero-copy use case between the application and device/co-processor associated with the backend."); + +// Lookahead decoding parameters +DEFINE_int32( + ngram, + 0, + "[Lookahead Decoding] Size of n-grams used in lookahead process."); +DEFINE_int32( + window, + 0, + "[Lookahead Decoding] Number of future tokens to predict in each step."); +DEFINE_int32( + gcap, + 0, + "[Lookahead Decoding] Maximum number of speculations or candidate n-grams."); + +// Execution parameters +DEFINE_int32(num_iters, 1, "Total number of iterations to run."); + +std::vector CollectPrompts(int argc, char** argv) { + // Collect all prompts from command line, example usage: + // --prompt "prompt1" --prompt "prompt2" --prompt "prompt3" + std::vector prompts; + for (int i = 1; i < argc; i++) { + if (std::string(argv[i]) == "--prompt" && i + 1 < argc) { + prompts.push_back(argv[i + 1]); + i++; // Skip the next argument + } + } + return prompts; +} + +/** + * Special tokens structure for different models + */ +struct SpecialTokens { + std::string image_token; + std::string global_img; + std::string fake_wrap_start; + std::string fake_wrap_end; +}; + +/** + * Get special tokens based on decoder model version + */ +SpecialTokens get_special_tokens( + example::MultimodalDecoderModelVersion decoder_model_version) { + SpecialTokens tokens; + + switch (decoder_model_version) { + case example::MultimodalDecoderModelVersion:: + kSmolvlm: // smolvlm_500m_instruct + tokens.image_token = ""; + tokens.global_img = ""; + tokens.fake_wrap_start = ""; + 
tokens.fake_wrap_end = ""; + break; + case example::MultimodalDecoderModelVersion::kInternvl3: // internvl3_1b + tokens.image_token = ""; + tokens.global_img = ""; + tokens.fake_wrap_start = ""; + tokens.fake_wrap_end = ""; + break; + default: + break; + } + + return tokens; +} + +/** + * Prepare multimodal token IDs by expanding image tokens + * This implements the logic from prepare_multimodal_token_ids in Python + */ +std::string prepare_multimodal_prompt( + const std::string& prompt, + int image_seq_len, + const SpecialTokens& specials) { + // Create image prompt with repeated image tokens + std::string image_prompt = specials.fake_wrap_start; + image_prompt += specials.global_img; + for (int i = 0; i < image_seq_len; ++i) { + image_prompt += specials.image_token; + } + image_prompt += specials.fake_wrap_end; + + // Replace single image token with expanded version + size_t pos = 0; + std::string expanded = prompt; + while ((pos = expanded.find(specials.image_token, pos)) != + std::string::npos) { + expanded.replace(pos, specials.image_token.size(), image_prompt); + pos += image_prompt.size(); + } + ET_LOG(Info, "Prompt after expanding image token: %s", expanded.c_str()); + + return expanded; +} + +/** + * Format prompt based on model version with multimodal token expansion + */ +std::string get_formatted_prompt( + const std::string& prompt, + const std::string& system_prompt, + example::MultimodalDecoderModelVersion decoder_model_version, + int32_t img_seq_len = 0) { + std::string formatted_prompt; + + // Get special tokens for this model + SpecialTokens specials = get_special_tokens(decoder_model_version); + + switch (decoder_model_version) { + case example::MultimodalDecoderModelVersion::kSmolvlm: + if (!system_prompt.empty()) { + formatted_prompt.append( + "<|start_header_id|>system<|end_header_id|>\n\n"); + formatted_prompt.append(system_prompt); + formatted_prompt.append("<|eot_id|>"); + } + formatted_prompt.append("<|im_start|>User:"); + formatted_prompt.append(specials.image_token); + formatted_prompt.append(prompt); + formatted_prompt.append("\nAssistant:"); + break; + case example::MultimodalDecoderModelVersion::kInternvl3: + if (!system_prompt.empty()) { + formatted_prompt.append("<|im_start|>system<|im_end|>\n\n"); + formatted_prompt.append(system_prompt); + formatted_prompt.append("<|im_end|>"); + } + formatted_prompt.append("<|im_start|>user:\n"); + formatted_prompt.append(specials.image_token); + formatted_prompt.append("\n"); + formatted_prompt.append(prompt); + formatted_prompt.append("<|im_end|>assistant\n"); + break; + default: + ET_CHECK_MSG(false, "unsupported VLM version"); + break; + } + + // Expand image tokens + formatted_prompt = + prepare_multimodal_prompt(formatted_prompt, img_seq_len, specials); + + return formatted_prompt; +} + +template +void start_multimodal_runner( + std::unique_ptr encoder_runner, + std::unique_ptr module, + std::unique_ptr embedding, + std::vector& prompts) { + ET_LOG(Info, "Starting multimodal runner"); + + bool use_tokenized_prompt = + gflags::GetCommandLineFlagInfoOrDie("tokenized_prompt").is_default ? 
false + : true; + + // Load image, run encoder forward pass, and set image hidden states if + // provided + bool has_image = !FLAGS_image_path.empty(); + + // Load encoder + if (encoder_runner->load() != executorch::runtime::Error::Ok) { + ET_LOG(Error, "Failed to load encoder"); + return; + } + + // Encode image from file + auto encode_result = + encoder_runner->encode_from_file(FLAGS_image_path.c_str()); + if (!encode_result.ok()) { + ET_LOG(Error, "Failed to encode image"); + return; + } + + auto image_hidden_states = encode_result.get(); + + // Create multimodal runner + example::MultimodalRunner runner( + std::move(module), + std::move(embedding), + FLAGS_decoder_model_version.c_str(), + FLAGS_decoder_path.c_str(), + FLAGS_tokenizer_path.c_str(), + FLAGS_dump_logits_path.c_str(), + FLAGS_performance_output_path.c_str(), + FLAGS_temperature, + FLAGS_eval_mode, + FLAGS_shared_buffer, + FLAGS_ngram, + FLAGS_window, + FLAGS_gcap, + std::make_unique(image_hidden_states)); + + auto decoder_model_version = runner.get_decoder_model_version(); + + // Prepare output buffer (similar to qnn_llama_runner.cpp) + std::vector buf; + buf.reserve(5 * FLAGS_seq_len); // assume each token is around 5 char + std::ofstream fout(FLAGS_output_path.c_str()); + + auto callback = [&](const std::string& piece) { + for (const char c : piece) { + buf.push_back(c); + } + }; + + // Configure generation + executorch::extension::llm::GenerationConfig config{ + true, + -1, + false, + FLAGS_seq_len, + static_cast(FLAGS_temperature), + 0, + 0}; + + // Get image sequence length from encoder + int32_t img_seq_len = encoder_runner->get_image_seq_len(); + if (use_tokenized_prompt) { + runner.generate_from_prompt_or_file( + FLAGS_tokenizer_path.c_str(), use_tokenized_prompt, config, callback); + } else { + // generate tokens & store inference output + for (int i = 0; i < FLAGS_num_iters; i++) { + for (size_t j = 0; j < prompts.size(); ++j) { + const auto& prompt = prompts[j]; + std::string formatted_prompt; + formatted_prompt = get_formatted_prompt( + prompt, + FLAGS_system_prompt, + decoder_model_version.get(), + img_seq_len); + runner.generate_from_prompt_or_file( + formatted_prompt.c_str(), use_tokenized_prompt, config, callback); + } + } + } + fout.write(buf.data(), buf.size()); + fout.close(); +} + +int main(int argc, char** argv) { + std::vector prompts = CollectPrompts(argc, argv); + gflags::ParseCommandLineFlags(&argc, &argv, true); + if (!gflags::GetCommandLineFlagInfoOrDie("prompt").is_default && + !gflags::GetCommandLineFlagInfoOrDie("tokenized_prompt").is_default) { + ET_CHECK_MSG(false, "Only provide prompt or tokenized_input but not both."); + } + if (!gflags::GetCommandLineFlagInfoOrDie("dump_logits_path").is_default && + FLAGS_eval_mode != 0) { + ET_CHECK_MSG( + false, "Only TokenGenerator(kv) mode is supported to dump all logits."); + } + ET_LOG(Info, "Embedding: %s", FLAGS_embedding_path.c_str()); + ET_LOG(Info, "Encoder: %s", FLAGS_encoder_path.c_str()); + ET_LOG(Info, "Decoder: %s", FLAGS_decoder_path.c_str()); + + // Create encoder runner + std::unique_ptr encoder_runner = + std::make_unique(FLAGS_encoder_path.c_str()); + + // load embedding + std::unique_ptr embedding = + std::make_unique( + FLAGS_embedding_path.c_str(), + executorch::extension::Module::LoadMode::MmapUseMlockIgnoreErrors); + + // load decoder + std::unique_ptr module = + std::make_unique( + FLAGS_decoder_path.c_str(), + executorch::extension::Module::LoadMode::MmapUseMlockIgnoreErrors); + + // Using 8bit as default since this meta is 
introduced with 16bit kv io + // support and older models only have 8bit kv io. + example::KvBitWidth kv_bitwidth = example::KvBitWidth::kWidth8; + if (module->method_names()->count("get_kv_io_bit_width") > 0) { + kv_bitwidth = static_cast( + module->get("get_kv_io_bit_width").get().toScalar().to()); + } + // Start runner with appropriate KV bitwidth + if (kv_bitwidth == example::KvBitWidth::kWidth8) { + start_multimodal_runner( + std::move(encoder_runner), + std::move(module), + std::move(embedding), + prompts); + } else if (kv_bitwidth == example::KvBitWidth::kWidth16) { + start_multimodal_runner( + std::move(encoder_runner), + std::move(module), + std::move(embedding), + prompts); + } else { + ET_CHECK_MSG( + false, + "Unsupported kv bitwidth: %ld", + static_cast(kv_bitwidth)); + } + + return 0; +} diff --git a/examples/qualcomm/oss_scripts/llama/runner/kv_manager.h b/examples/qualcomm/oss_scripts/llama/runner/kv_manager.h index aa355335b68..06fe88517a7 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/kv_manager.h +++ b/examples/qualcomm/oss_scripts/llama/runner/kv_manager.h @@ -163,6 +163,10 @@ class KVManager { return total_cache_size_; } + int64_t get_head_dim() const { + return metadata_.head_dim; + } + private: // Helper functions to rearrange and update key and value caches void rearrange_key(KVCache& k_cache, int32_t ar_len_dst); diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/embedding_processor.cpp b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/embedding_processor.cpp new file mode 100644 index 00000000000..1278a1df7d9 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/embedding_processor.cpp @@ -0,0 +1,150 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include + +using executorch::aten::Tensor; +using executorch::aten::TensorImpl; +using executorch::runtime::Error; +using executorch::runtime::MethodMeta; +using executorch::runtime::Result; +using executorch::runtime::TensorInfo; + +namespace example { + +EmbeddingProcessor::EmbeddingProcessor( + EmbeddingRunner* embedding_runner, + const std::string& method_name, + Metadata metadata) + : embedding_runner_(embedding_runner), + method_name_(method_name), + metadata_(metadata) { + input_toks_.size = metadata_.ar_len * sizeof(int64_t); + embeddings_.size = metadata_.ar_len * metadata_.embedding_dim * sizeof(float); + prompt_embeddings_.size = 0; // Will be set in prefill() +} + +void EmbeddingProcessor::init_io( + IMemAlloc* buffer_manager, + Result method_meta) { + input_tensors_.reserve(method_meta->num_inputs()); + output_tensors_.reserve(method_meta->num_outputs()); + + // [I]: input_tokens + Result input_toks = method_meta->input_tensor_meta(0); + input_toks_.data = + reinterpret_cast(buffer_manager->allocate(input_toks_.size)); + + input_toks_.tensor = std::make_unique( + input_toks->scalar_type(), + input_toks->sizes().size(), + const_cast(input_toks->sizes().data()), + input_toks_.data, + const_cast(input_toks->dim_order().data())); + input_tensors_.emplace_back(input_toks_.tensor.get()); + buffer_manager->add_memory_info( + input_toks_.data, input_toks_.size, input_toks.get()); + + // [O]: embeddings + Result embeddings = method_meta->output_tensor_meta(0); + embeddings_.data = + reinterpret_cast(buffer_manager->allocate(embeddings_.size)); + + embeddings_.tensor = std::make_unique( + embeddings->scalar_type(), + embeddings->sizes().size(), + const_cast(embeddings->sizes().data()), + embeddings_.data, + const_cast(embeddings->dim_order().data())); + output_tensors_.emplace_back(embeddings_.tensor.get()); + buffer_manager->add_memory_info( + embeddings_.data, embeddings_.size, embeddings.get()); + + inputs_.reserve(input_tensors_.size()); + for (auto& input_tensor : input_tensors_) { + inputs_.emplace_back(std::move(input_tensor)); + } +} + +void EmbeddingProcessor::update_prompt_embedding( + int32_t num_prompt_tokens, + int64_t prompt_pos) { + for (int i = 0; i < metadata_.ar_len; i++) { + if (prompt_pos + i < num_prompt_tokens) { + std::memcpy( + prompt_embeddings_.data + (prompt_pos + i) * metadata_.embedding_dim, + embeddings_.data + i * metadata_.embedding_dim, + metadata_.embedding_dim * sizeof(float)); + } + } +} + +void EmbeddingProcessor::prefill(const std::vector& prompt_tokens) { + int64_t prompt_pos = 0; + int32_t num_prompt_tokens = prompt_tokens.size(); + prompt_embeddings_.size = + num_prompt_tokens * metadata_.embedding_dim * sizeof(float); + + // Allocate memory using std::vector for smart pointer management + prompt_embeddings_buffer_.resize(num_prompt_tokens * metadata_.embedding_dim); + prompt_embeddings_.data = prompt_embeddings_buffer_.data(); + + // Create TensorImpl for prompt_embeddings_ with shape [1, num_prompt_tokens, + // dim] Store sizes and dim_order as member variables to keep them + // alive + prompt_embeddings_sizes_ = {1, num_prompt_tokens, metadata_.embedding_dim}; + prompt_embeddings_dim_order_ = {0, 1, 2}; + prompt_embeddings_.tensor = std::make_unique( + executorch::aten::ScalarType::Float, + prompt_embeddings_sizes_.size(), + prompt_embeddings_sizes_.data(), + prompt_embeddings_.data, + prompt_embeddings_dim_order_.data()); + + int num_iters = 1 + ((num_prompt_tokens - 1) / metadata_.ar_len); + + ET_CHECK_MSG( + 
embedding_runner_->set_outputs(method_name_, output_tensors_) == + executorch::runtime::Error::Ok, + "Failed to set output tensor for module %s", + method_name_.c_str()); + + for (int32_t i = 0; i < num_iters; ++i) { + prepare_io(prompt_tokens, prompt_pos); + + embedding_runner_->step(method_name_, inputs_); + + // Update prompt_embedding + update_prompt_embedding(num_prompt_tokens, prompt_pos); + + prompt_pos += metadata_.ar_len; + } +} + +void EmbeddingProcessor::prepare_io( + const std::vector& prompt_tokens, + int64_t prompt_pos) { + for (int i = 0; i < metadata_.ar_len; i++) { + // Prepare input token data + if (prompt_pos + i < prompt_tokens.size()) { + // Support CPU 4-bit embedding, which requires int64 input. + // However, for QNN embedding, only int32 input is needed. + // Therefore, we need to cast to the correct type to write the data. + if (metadata_.use_int64_token) { + input_toks_.data[i] = prompt_tokens[prompt_pos + i]; + } else { + int32_t* input_toks_ptr = reinterpret_cast(input_toks_.data); + input_toks_ptr[i] = static_cast(prompt_tokens[prompt_pos + i]); + } + } + } +} + +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/embedding_processor.h b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/embedding_processor.h new file mode 100644 index 00000000000..0ece8bf2d03 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/embedding_processor.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once +#include +#include +#include +#include +#include +#include + +namespace example { +/** + * @class EmbeddingProcessor + * @brief Class for processing prompts to generate embeddings using embedding + * runner. + */ +class EmbeddingProcessor { + public: + struct Metadata { + int32_t context_len; + int32_t ar_len; + int32_t vocab_size; + bool use_int64_token; + int32_t embedding_dim; + }; + + EmbeddingProcessor( + EmbeddingRunner* embedding_runner, + const std::string& method_name, + Metadata metadata); + + /** + * @brief Initialize I/O tensor and allocate I/O data buffer. + * @param buffer_manager Pointer to IMemAlloc instance. + * @param method_meta Method metadata. + */ + void init_io( + IMemAlloc* buffer_manager, + executorch::runtime::Result method_meta); + + void update_prompt_embedding(int32_t num_prompt_tokens, int64_t prompt_pos); + + /** + * Process prompt tokens to generate embeddings. + * @param prompt_tokens The text prompt tokens. Encoded by tokenizer. + * @param ar_len AR length for chunking. + * @return The embedding tensor result. + */ + void prefill(const std::vector& prompt_tokens); + + /** + * @brief Get total I/O size in bytes. + * @return Total I/O size in bytes. + */ + inline const size_t total_embedding_processor_io_size_in_bytes() const { + return input_toks_.size + embeddings_.size; + } + + inline const TensorStruct& get_prompt_embeddings() const { + return prompt_embeddings_; + } + + private: + /** + * @brief Fill in I/O buffers with prompt tokens. + * @param prompt_tokens Vector of prompt tokens. 
+ */ + void prepare_io( + const std::vector& prompt_tokens, + int64_t prompt_pos); + + EmbeddingRunner* embedding_runner_; + std::string method_name_; + + // metadata + Metadata metadata_; + + // inputs and outputs + TensorStruct input_toks_; + TensorStruct embeddings_; + TensorStruct prompt_embeddings_; + std::vector prompt_embeddings_buffer_; + std::vector prompt_embeddings_sizes_; + std::vector + prompt_embeddings_dim_order_; + + std::vector inputs_; + std::vector input_tensors_; + std::vector output_tensors_; +}; +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/embedding_runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/embedding_runner.cpp new file mode 100644 index 00000000000..bf1008e34b1 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/embedding_runner.cpp @@ -0,0 +1,71 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +using executorch::aten::Tensor; +using executorch::extension::Module; +using executorch::runtime::Error; +using executorch::runtime::EValue; +using executorch::runtime::Result; + +namespace example { + +EmbeddingRunner::EmbeddingRunner(Module* module) : module_(module) {} + +Result EmbeddingRunner::step( + const std::string& method_name, + std::vector& inputs) { + // Execute embedding module + Result> outputs_res = + module_->execute(method_name, inputs); + + ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error()); + ET_CHECK_MSG( + outputs_res.get()[0].isTensor(), + "Non Tensor Output returned from executing Token Embedding"); + + // Get the embedding tensor from result + return outputs_res.get()[0].toTensor(); +} + +Error EmbeddingRunner::set_outputs( + const std::string& method_name, + std::vector output_values) { + for (size_t i = 0; i < output_values.size(); ++i) { + ET_CHECK_OK_OR_RETURN_ERROR( + module_->set_output(method_name, output_values[i], i)); + } + return Error::Ok; +} + +Error EmbeddingRunner::load(const std::vector& method_names) { + if (is_method_loaded(method_names)) { + return Error::Ok; + } + for (const std::string& method_name : method_names) { + ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(method_name)); + } + return Error::Ok; +} + +bool EmbeddingRunner::is_method_loaded( + const std::vector& method_names) { + bool method_loaded = true; + for (const std::string& method_name : method_names) { + method_loaded &= module_->is_method_loaded(method_name); + } + return method_loaded; +} + +bool EmbeddingRunner::is_loaded() const { + return module_ != nullptr && module_->is_loaded(); +} + +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/embedding_runner.h b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/embedding_runner.h new file mode 100644 index 00000000000..d5155a45252 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/embedding_runner.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
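The EmbeddingRunner shown above mirrors DecoderRunner: outputs are bound once with set_outputs() and each step() call executes a named method on preallocated I/O. A hedged usage sketch follows, assuming the runner wraps an already constructed executorch::extension::Module and that the inputs and output tensors were prepared by the caller (as EmbeddingProcessor::prefill does); run_token_embedding is an illustrative helper and the element types are inferred from the using-declarations above, not spelled out in this patch.

#include <string>
#include <vector>
// plus the module and embedding_runner headers introduced by this patch

executorch::runtime::Error run_token_embedding(
    example::EmbeddingRunner& runner,
    const std::string& method_name,
    std::vector<executorch::runtime::EValue>& inputs,
    std::vector<executorch::aten::Tensor>& output_tensors) {
  // Load the method once, bind caller-owned output buffers, then execute.
  auto err = runner.load({method_name});
  if (err != executorch::runtime::Error::Ok) {
    return err;
  }
  err = runner.set_outputs(method_name, output_tensors);
  if (err != executorch::runtime::Error::Ok) {
    return err;
  }
  // step() returns the first output of the method as a Tensor.
  auto embeddings = runner.step(method_name, inputs);
  return embeddings.error();
}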
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace example { + +/** + * @class EmbeddingRunner + * @brief Class for running embedding module, similar to DecoderRunner + */ +class EmbeddingRunner { + public: + EmbeddingRunner(executorch::extension::Module* module); + + /** + * Run embedding module with inputs to generate embeddings. + * @param method_name The method name to execute. + * @param inputs The inputs to the embedding module. + * @return The output embeddings tensor. + */ + executorch::runtime::Result step( + const std::string& method_name, + std::vector& inputs); + + executorch::runtime::Error set_outputs( + const std::string& method_name, + std::vector output_values); + + /** + * Load the Module for token embedding. + * @return The error code. + */ + executorch::runtime::Error load(const std::vector& method_names); + + /** + * Check if the required methods in the Module are loaded. + * @return True if the Module is loaded, false otherwise. + */ + bool is_method_loaded(const std::vector& method_names); + + /** + * @brief Check if embedding module is loaded + * @return true if module is loaded, false otherwise + */ + bool is_loaded() const; + executorch::extension::Module* module_; +}; + +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/encoder.cpp b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/encoder.cpp new file mode 100644 index 00000000000..91789f07a90 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/encoder.cpp @@ -0,0 +1,130 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include + +using executorch::aten::Tensor; +using executorch::extension::Module; +using executorch::extension::TensorPtr; +using executorch::runtime::Error; +using executorch::runtime::MethodMeta; +using executorch::runtime::Result; + +namespace example { + +EncoderRunner::EncoderRunner(const std::string& model_path) + : image_seq_len_(0) { + module_ = std::make_unique( + model_path, Module::LoadMode::MmapUseMlockIgnoreErrors); + ET_LOG(Info, "Creating encoder module: model_path=%s", model_path.c_str()); +} + +bool EncoderRunner::is_method_loaded() const { + return module_->is_method_loaded(kEncoderForwardName); +} + +Error EncoderRunner::load() { + if (is_method_loaded()) { + return Error::Ok; + } + + auto load_result = module_->load_method(kEncoderForwardName); + if (load_result != Error::Ok) { + ET_LOG(Error, "Failed to load encoder method"); + return load_result; + } + + // Get image sequence length from output metadata + Result method_meta = module_->method_meta(kEncoderForwardName); + if (!method_meta.ok()) { + ET_LOG(Error, "Failed to get encoder method metadata"); + return method_meta.error(); + } + + // vision embedding output shape: [1, seq_len, dim] + image_seq_len_ = method_meta->output_tensor_meta(0)->sizes()[1]; + ET_LOG(Info, "Encoder loaded successfully, image_seq_len=%d", image_seq_len_); + + return Error::Ok; +} + +int32_t EncoderRunner::get_image_seq_len() const { + return image_seq_len_; +} + +Result EncoderRunner::encode(TensorPtr& image_tensor) { + ET_CHECK_MSG(is_method_loaded(), "Encoder method not loaded"); + + auto tensor_ptr = image_tensor.get(); + ET_LOG(Info, "Encoding image tensor with numel: %zu", tensor_ptr->numel()); + + std::vector encoder_inputs; + encoder_inputs.emplace_back(*tensor_ptr); + + auto encoder_result = module_->forward(encoder_inputs); + ET_CHECK_MSG(encoder_result.ok(), "Encoder execution failed"); + + auto encoder_output = encoder_result.get(); + auto image_hidden_states = encoder_output[0].toTensor(); + ET_LOG(Info, "Encoder execution completed, got image hidden states"); + + return image_hidden_states; +} + +Result EncoderRunner::encode_from_file( + const std::string& image_file_path) { + ET_CHECK_MSG(is_method_loaded(), "Encoder method not loaded"); + + // Get input tensor metadata + Result method_meta = module_->method_meta(kEncoderForwardName); + auto sizes_span = method_meta->input_tensor_meta(0)->sizes(); + + // Calculate total number of elements + int64_t num_elem = 1; + for (const auto& size : sizes_span) { + num_elem *= size; + } + + // Read image data from file + ET_LOG( + Info, + "Reading image from file: %s, num_elements=%ld", + image_file_path.c_str(), + num_elem); + std::ifstream file(image_file_path, std::ios::binary | std::ios::ate); + ET_CHECK_MSG( + file.is_open(), "Failed to open image file: %s", image_file_path.c_str()); + + // To prevent users from passing images that have not been + // resized to match the encoder input size. 
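encode_from_file expects a raw float32 blob whose byte size exactly matches the encoder's input tensor (numel * sizeof(float)); the check right below rejects images that were not pre-resized. A standalone sketch of that size computation and of writing such a blob; expected_raw_bytes and write_raw_image are illustrative helpers, and the real input shape comes from the method metadata rather than being hard-coded.

#include <cstdint>
#include <fstream>
#include <functional>
#include <numeric>
#include <string>
#include <vector>

// Expected raw-file size for an encoder input with the given sizes.
std::streamsize expected_raw_bytes(const std::vector<int32_t>& sizes) {
  int64_t numel = std::accumulate(
      sizes.begin(), sizes.end(), int64_t{1}, std::multiplies<int64_t>());
  return static_cast<std::streamsize>(numel * sizeof(float));
}

// Dump preprocessed (resized + normalized) pixel data so the size check
// in encode_from_file passes. `pixels` must already hold numel elements.
bool write_raw_image(const std::string& path, const std::vector<float>& pixels) {
  std::ofstream out(path, std::ios::binary);
  if (!out.is_open()) {
    return false;
  }
  out.write(
      reinterpret_cast<const char*>(pixels.data()),
      static_cast<std::streamsize>(pixels.size() * sizeof(float)));
  return out.good();
}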
+ std::streamsize file_size = file.tellg(); + std::streamsize expected_size = num_elem * sizeof(float); + ET_CHECK_MSG( + file_size == expected_size, + "Image file size mismatch: expected %ld bytes but got %ld bytes (file: %s)", + expected_size, + file_size, + image_file_path.c_str()); + + file.seekg(0, std::ios::beg); + std::vector buffer(num_elem); + file.read(reinterpret_cast(buffer.data()), expected_size); + file.close(); + + // Create tensor from buffer + TensorPtr tensor = executorch::extension::from_blob( + buffer.data(), + std::vector(sizes_span.begin(), sizes_span.end()), + executorch::aten::ScalarType::Float); + + // Encode the tensor + return encode(tensor); +} + +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/encoder.h b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/encoder.h new file mode 100644 index 00000000000..0e1becc05b6 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/encoder.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include + +namespace example { + +/** + * @class EncoderRunner + * @brief Class for running vision encoder to generate image hidden states. + */ +class EncoderRunner { + public: + /** + * @brief Constructor for EncoderRunner + * @param model_path Path to the encoder model PTE file + */ + explicit EncoderRunner(const std::string& model_path); + + /** + * @brief Check if the encoder method is loaded + * @return true if method is loaded, false otherwise + */ + bool is_method_loaded() const; + + /** + * @brief Load the encoder method + * @return Error status + */ + executorch::runtime::Error load(); + + /** + * @brief Get the image sequence length from encoder output metadata + * @return Image sequence length + */ + int32_t get_image_seq_len() const; + + /** + * @brief Encode image tensor to hidden states + * @param image_tensor Input image tensor (B, C, H, W) + * @return Result containing the image hidden states tensor + */ + executorch::runtime::Result encode( + executorch::extension::TensorPtr& image_tensor); + + /** + * @brief Encode image from raw file + * @param image_file_path Path to raw image file + * @return Result containing the image hidden states tensor + */ + executorch::runtime::Result encode_from_file( + const std::string& image_file_path); + + private: + std::unique_ptr module_; + inline static const std::string kEncoderForwardName = "forward"; + int32_t image_seq_len_; +}; + +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.cpp b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.cpp new file mode 100644 index 00000000000..5f0375d1c50 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.cpp @@ -0,0 +1,418 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
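Putting the encoder pieces above together, the intended call sequence is construct, load(), then encode_from_file(); image_seq_len is only meaningful after load() because it is read from the output metadata. A hedged sketch with placeholder paths; run_encoder_once is an illustrative helper, not part of this patch.

// Sketch only: paths are placeholders, includes and error handling abbreviated.
executorch::runtime::Error run_encoder_once() {
  example::EncoderRunner encoder("/data/local/tmp/vision_encoder.pte");
  auto err = encoder.load();
  if (err != executorch::runtime::Error::Ok) {
    return err;
  }
  // Only valid after load(), since it comes from the output tensor metadata.
  int32_t image_seq_len = encoder.get_image_seq_len();
  (void)image_seq_len;

  // The raw file must already match the encoder input (see the size check above).
  auto hidden_states = encoder.encode_from_file("/data/local/tmp/image.raw");
  return hidden_states.error();
}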
+ */ + +#include +#include +#include +using executorch::runtime::Result; + +namespace example { + +template +void MultimodalLhdTokenGenerator::prepare_io( + std::vector input_tokens, + std::vector input_pos) { + for (int i = 0; i < metadata_.ar_len; i++) { + if (i < input_tokens.size()) { + // Prepare pos data + this->input_pos_.data[i] = input_pos[i]; + } + } + + // Generate embeddings for the first metadata_.ar_len tokens + int num_tokens_to_process = + std::min(static_cast(input_tokens.size()), metadata_.ar_len); + std::vector tokens_to_process( + input_tokens.begin(), input_tokens.begin() + num_tokens_to_process); + + embedding_runner_->prefill(tokens_to_process); + const TensorStruct& text_embeddings = + embedding_runner_->get_prompt_embeddings(); + int64_t embedding_dim = text_embeddings.tensor->size(2); + + // Copy embedding to input buffer from the left + std::memcpy( + this->input_embedding_.data, + text_embeddings.data, + num_tokens_to_process * embedding_dim * sizeof(float)); + + // If metadata_.ar_len > input_tokens.size(), initialize remaining part to 0 + if (metadata_.ar_len > num_tokens_to_process) { + int remaining_tokens = metadata_.ar_len - num_tokens_to_process; + std::memset( + this->input_embedding_.data + num_tokens_to_process * embedding_dim, + 0, + remaining_tokens * embedding_dim * sizeof(float)); + } +} + +template +void MultimodalLhdTokenGenerator::init_attention_mask(int32_t n_past) { + std::vector attention_map; + attention_map.reserve(metadata_.ar_len); + // Initialize attention mask with current position + for (int i = 0; i < metadata_.window; ++i) { + attention_map.push_back(i - 1); + } + for (int i = 1; i < metadata_.ngram - 1; ++i) { + for (int j = 0; j < metadata_.window; ++j) { + attention_map.push_back((i - 1) * metadata_.window + j); + } + } + for (int g = 0; g < metadata_.gcap; g++) { + for (int j = 0; j < metadata_.ngram - 1; j++) { + if (j == 0) + attention_map.push_back(0); + else + attention_map.push_back( + (metadata_.window + g) * (metadata_.ngram - 1) + j - 1); + } + } + + this->kv_manager_->init_attention_mask( + this->attention_mask_.data, attention_map, metadata_.ar_len, n_past); + // Initialize window attention mask with current position + if (metadata_.cache_mode == CacheMode::HybridCache) { + this->kv_manager_->init_attention_mask( + this->window_attention_mask_.data, + attention_map, + metadata_.ar_len, + n_past, + metadata_.sliding_window, + position_offset_); + } +} + +template +void MultimodalLhdTokenGenerator::init_lookahead_branch( + const std::vector& tokens) { + for (int i = 0; i < metadata_.ngram - 1; ++i) { + for (int j = 0; j < metadata_.window; ++j) { + // there are different ways to init these tokens + if (0) { + // initialize with a sequence of increasing numbers + lhd_branch_[i][j] = 1000 + j; + } else { + // initialize with the random token from prompt + lhd_branch_[i][j] = tokens[1 + rand() % (tokens.size() - 1)]; + } + } + } + is_lhd_branch_initialized_ = true; +} + +template +void MultimodalLhdTokenGenerator::init_verification_branch( + uint64_t cur_token) { + const int g_cur = ngrams_pool_.cnt[cur_token]; + + v_branch_.resize(g_cur); + for (int g = 0; g < g_cur; g++) { + v_branch_[g].active = true; + v_branch_[g].tokens.resize(metadata_.ngram); + v_branch_[g].i_batch.resize(metadata_.ngram); + v_branch_[g].seq_id = metadata_.window + 1 + g; + v_branch_[g].i_batch[0] = 0; + v_branch_[g].tokens[0] = cur_token; + } + + for (int j = 0; j < metadata_.ngram - 1; j++) { + for (int g = 0; g < g_cur; g++) { + const int idx = 
cur_token * (metadata_.ngram - 1) * metadata_.gcap + + g * (metadata_.ngram - 1); + const int32_t t = ngrams_pool_.tokens[idx + j]; + v_branch_[g].tokens[j + 1] = t; + v_branch_[g].i_batch[j + 1] = j + 1; + } + } +} + +template +void MultimodalLhdTokenGenerator::update_ngrams_pool() { + std::vector ngram(metadata_.ngram - 1); + // n-gram pool generation + for (int f = 0; f < metadata_.window; ++f) { + const int ft = lhd_branch_prev_[f]; // first token of the n-gram + + for (int j = 0; j < metadata_.ngram - 1; ++j) { + ngram[j] = lhd_branch_[j][f]; + } + + // filter-out repeating n-grams + { + bool is_unique = true; + for (int k = 0; k < ngrams_pool_.cnt[ft]; ++k) { + // calculate the related idx by the first n-gram token + const int idx = ft * (metadata_.ngram - 1) * metadata_.gcap + + k * (metadata_.ngram - 1); + + bool is_match = true; + for (int j = 0; j < metadata_.ngram - 1; ++j) { + if (ngrams_pool_.tokens[idx + j] != ngram[j]) { + is_match = false; + break; + } + } + + // if n-gram match all, discard one of them + if (is_match) { + is_unique = false; + break; + } + } + if (!is_unique) { + continue; + } + } + + const int head = ngrams_pool_.head[ft]; + const int idx = ft * (metadata_.ngram - 1) * metadata_.gcap + + head * (metadata_.ngram - 1); + + for (int i = 0; i < metadata_.ngram - 1; i++) { + // update the n-gram pool with new n-gram + ngrams_pool_.tokens[idx + i] = ngram[i]; + } + + ngrams_pool_.cnt[ft] = + std::min(metadata_.gcap, (int32_t)ngrams_pool_.cnt[ft] + 1); + ngrams_pool_.head[ft] = (head + 1) % metadata_.gcap; + ngrams_pool_.n_total++; + } +} + +template +void MultimodalLhdTokenGenerator::update_lookahead_branch( + const executorch::aten::Tensor& logits_tensor) { + for (int i = 0; i < metadata_.window; i++) { + lhd_branch_prev_[i] = lhd_branch_[0][i]; + } + + for (int j = 0; j < metadata_.ngram - 2; j++) { + lhd_branch_[j] = lhd_branch_[j + 1]; + } + + // sample from the last level + for (int i = 0; i < metadata_.window; i++) { + size_t sample_idx = (metadata_.ngram - 2) * metadata_.window + i; + lhd_branch_[metadata_.ngram - 2][i] = + this->decoder_runner_->logits_to_token(logits_tensor, sample_idx); + } +} + +template +Result MultimodalLhdTokenGenerator::generate( + std::vector tokens, + int64_t start_pos, + int32_t seq_len, + std::function token_callback, + bool dump_logits) { + ET_CHECK_MSG( + !tokens.empty(), "Token generation loop shouldn't take empty tokens"); + // position in the sequence + int64_t pos = start_pos; + int64_t prev_pos; + // number of match tokens + int32_t n_accept{0}; + std::vector result_tokens; + uint64_t cur_token = tokens.back(); + uint64_t prev_token; + result_tokens.push_back(cur_token); + + // Manage the inputs of lookahead decoding + std::vector input_pos; + std::vector input_tokens; + input_tokens.reserve(metadata_.ar_len); + input_pos.reserve(metadata_.ar_len); + + // Rearrange KV cache first and initialize the input and output of KV cache + this->kv_manager_->rearrange_cache(metadata_.ar_len); + + // Initialize attention mask with pos + init_attention_mask(pos); + + // Initialize Lookahead branch at first generation + if (!is_lhd_branch_initialized_) { + ET_LOG(Info, "Initialize Lookahead branch"); + init_lookahead_branch(tokens); + } + + // Initialize the output of the module + ET_CHECK_MSG( + this->decoder_runner_->set_outputs( + this->method_name_, this->output_tensors_) == + executorch::runtime::Error::Ok, + "Failed to set output tensor for module %s", + this->method_name_.c_str()); + + // Generate tokens + while (pos < seq_len 
- 1) { + std::vector selected(metadata_.ar_len, false); + + input_tokens.clear(); + input_pos.clear(); + + // fill the first token of the first level + input_tokens.push_back(cur_token); + input_pos.push_back(pos); + + // fill the remaining WINDOW - 1 tokens for the first level + for (int i = 1; i < metadata_.window; ++i) { + input_tokens.push_back(lhd_branch_[0][i]); + input_pos.push_back(pos + i); + } + + // fill the rest of the levels + for (int i = 1; i < metadata_.ngram - 1; ++i) { + for (int j = 0; j < metadata_.window; ++j) { + input_tokens.push_back(lhd_branch_[i][j]); + input_pos.push_back(pos + i + j); + } + } + // Verification Branch Init + init_verification_branch(cur_token); + + for (int g = 0; g < v_branch_.size(); g++) { + for (int j = 0; j < metadata_.ngram - 1; j++) { + input_tokens.push_back(v_branch_[g].tokens[j + 1]); + input_pos.push_back(pos + j + 1); + } + } + + prepare_io(input_tokens, input_pos); + + // Run inference + auto logits_res = + this->decoder_runner_->step(this->method_name_, this->inputs_); + ET_CHECK_OK_OR_RETURN_ERROR(logits_res.error()); + executorch::aten::Tensor& logits_tensor = logits_res.get(); + prev_pos = pos; + + // verification branch seq-id + size_t seq_id_best = 0; + // max hit pos + size_t i_batch_best = 0; + + // Lookahead decoding and verification + for (int v = 0; v < metadata_.ngram; ++v) { + // Verification + int i_batch = 0; + if (v > 0) { + for (int g = 0; g < v_branch_.size(); g++) { + // record the best matched seq and pos + if (v_branch_[g].active) { + i_batch = v_branch_[g].i_batch[v]; + i_batch_best = i_batch; + seq_id_best = v_branch_[g].seq_id; + ++n_accept; + break; + } + } + if (i_batch == 0) { + break; + } + } + + size_t sample_idx; + if (seq_id_best == 0) + sample_idx = 0; + else + sample_idx = metadata_.window * (metadata_.ngram - 1) + + (seq_id_best - (metadata_.window + 1)) * (metadata_.ngram - 1) + + i_batch - 1; + + // vector selected set + selected[sample_idx] = true; + + prev_token = cur_token; + // sampler from logits all + this->stats_->on_sampling_begin(); + cur_token = + this->decoder_runner_->logits_to_token(logits_tensor, sample_idx); + this->stats_->on_sampling_end(); + result_tokens.push_back(cur_token); + pos++; + + // print the token as string, decode it with the Tokenizer object + token_callback( + ET_UNWRAP_TOKENIZER(this->tokenizer_->decode(prev_token, cur_token))); + + // data-dependent terminating condition: we have n_eos_ number of EOS + if (this->eos_ids_->count(cur_token) > 0) { + printf("\n"); + ET_LOG(Info, "\nReached to the end of generation"); + break; + } + + // if verify pass, check the next sample token until verifying failed + for (int g = 0; g < v_branch_.size(); g++) { + // update the n-gram active status + if (v_branch_[g].active) { + if (v == metadata_.ngram - 1) { + v_branch_[g].active = false; + } else { + if (cur_token != v_branch_[g].tokens[v + 1]) { + v_branch_[g].active = false; + } + } + } + } + + // only update n-grams pools and lookahead branch when v=0 + if (v == 0) { + // update lookahead branch + update_lookahead_branch(logits_tensor); + // update n-grams pool + update_ngrams_pool(); + } + } // end of verify loop + + if (pos > metadata_.context_len - metadata_.ar_len) { + printf("\n"); + ET_LOG(Info, "\nReached to the maximum sequence length"); + break; + } + // Update KV Cache with the output results + int32_t n_update = pos - prev_pos; + this->kv_manager_->update_cache( + metadata_.ar_len, prev_pos, n_update, selected); + + // Update attention mask with current position + 
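Each step of the loop above packs one AR-length batch: window slots for the first lookahead level (slot 0 holds the current token), (ngram - 2) * window slots for the remaining levels, then gcap verification branches of (ngram - 1) slots each, so ar_len works out to (window + gcap) * (ngram - 1), consistent with the position_offset_ initialization in the header below. A standalone sketch of the index arithmetic used when sampling from a verification branch; verification_sample_idx is illustrative only.

#include <cassert>
#include <cstdint>

// Flat position inside the AR-N batch of the v-th token (v >= 1) of
// verification branch g, matching the layout built in the loop above:
//   [window * (ngram - 1) lookahead slots][gcap branches of (ngram - 1) slots]
int64_t verification_sample_idx(
    int32_t window, int32_t ngram, int32_t g, int32_t v) {
  return window * (ngram - 1) // skip all lookahead levels
      + g * (ngram - 1)       // skip earlier verification branches
      + (v - 1);              // v-th token inside this branch (v starts at 1)
}

int main() {
  const int32_t window = 4, ngram = 3, gcap = 2;
  const int32_t ar_len = (window + gcap) * (ngram - 1); // 12 slots per step
  // Branch g uses seq_id = window + 1 + g; with i_batch = v the sample_idx
  // expression in generate() reduces to the helper above.
  assert(verification_sample_idx(window, ngram, /*g=*/0, /*v=*/1) == 8);
  assert(verification_sample_idx(window, ngram, gcap - 1, ngram - 1) == ar_len - 1);
  return 0;
}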
this->kv_manager_->update_attention_mask( + this->attention_mask_.data, metadata_.ar_len, prev_pos, n_update); + if (metadata_.cache_mode == CacheMode::HybridCache) { + this->kv_manager_->update_attention_mask( + this->window_attention_mask_.data, + metadata_.ar_len, + prev_pos, + n_update, + metadata_.sliding_window, + position_offset_); + } + + // data-dependent terminating condition: we have n_eos_ number of EOS + if (this->eos_ids_->count(cur_token) > 0) { + printf("\n"); + ET_LOG(Info, "\nReached to the end of generation"); + break; + } + } + ET_LOG( + Info, + "Lookahead Decoding: n_generated = %ld / n_accept = %d", + pos - start_pos, + n_accept); + + return pos - start_pos; +} + +// Explicit instantiations +template class MultimodalLhdTokenGenerator; +template class MultimodalLhdTokenGenerator; + +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.h b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.h new file mode 100644 index 00000000000..266542f1a72 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.h @@ -0,0 +1,173 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once +#include + +namespace example { + +/** + * @class MultimodalLhdTokenGenerator + * @brief Extended LhdTokenGenerator with multimodal embedding support + */ +template +class MultimodalLhdTokenGenerator + : public example::MultimodalTokenGenerator { + public: + struct Metadata { + int32_t context_len; + int64_t num_heads; + int64_t num_layers; + int32_t ar_len; + int32_t vocab_size; + bool use_int64_token; + int32_t ngram; + int32_t window; + int32_t gcap; + int sliding_window; + CacheMode cache_mode; + int32_t embedding_dim = 0; + }; + MultimodalLhdTokenGenerator( + tokenizers::Tokenizer* tokenizer, + EmbeddingProcessor* embedding_runner, + DecoderRunner* decoder_runner, + KVManager* kv_manager, + const std::string& forward_name, + std::unique_ptr>&& eos_ids, + Metadata metadata, + executorch::llm::Stats* stats) + : MultimodalTokenGenerator( + tokenizer, + embedding_runner, + decoder_runner, + kv_manager, + forward_name, + std::move(eos_ids), + typename MultimodalTokenGenerator::Metadata{ + metadata.context_len, + metadata.num_heads, + metadata.num_layers, + metadata.ar_len, + metadata.vocab_size, + metadata.use_int64_token, + metadata.sliding_window, + metadata.cache_mode, + metadata.embedding_dim}, + stats), + embedding_runner_(embedding_runner), + metadata_(metadata), + lhd_branch_(metadata.ngram - 1, std::vector(metadata.window)), + lhd_branch_prev_(metadata.window), + ngrams_pool_(metadata.vocab_size, metadata.ngram, metadata.gcap) { + ET_LOG( + Info, + "Use Lookahead decoding: ngram=%d, window=%d, gcap=%d", + metadata.ngram, + metadata.window, + metadata.gcap); + + // initialize position offset + position_offset_ = std::vector(metadata.ar_len); + int idx = 0; + // lookahead branches + for (int i = 0; i < metadata.ngram - 1; ++i) { + for (int j = 0; j < metadata.window; ++j) { + position_offset_[idx++] = i + j; + } + } + // verification branches + for (int i = 0; i < metadata.gcap; ++i) { + for (int j = 1; j < metadata.ngram; ++j) { + position_offset_[idx++] = j; + } + } + } + + ~MultimodalLhdTokenGenerator() = default; + + /** +    * @brief Generate 
tokens with lookahead decoding. +    * @param tokens Vector of input tokens. +    * @param start_pos Starting position for generation. +    * @param seq_len Length of the sequence to generate. +    * @param token_callback Callback function for generated tokens. +    * @return The number of tokens generated. +    */ + executorch::runtime::Result generate( + std::vector tokens, + int64_t start_pos, + int32_t seq_len, + std::function token_callback, + bool dump_logits) override; + + private: + /** + * @brief Fill in I/O buffers with prompt token and position. + * @param cur_token Current token. + * @param start_pos Starting position. + */ + void prepare_io( + std::vector input_tokens, + std::vector input_pos); + void init_attention_mask(int32_t n_past); + void init_lookahead_branch(const std::vector& tokens); + void init_verification_branch(uint64_t cur_token); + void update_lookahead_branch(const executorch::aten::Tensor& logits_tensor); + void update_ngrams_pool(); + + // Additional members specific to multimodal + EmbeddingProcessor* embedding_runner_; + + struct NgramData { + bool active = false; + int32_t seq_id = -1; + + // match pos + std::vector i_batch; + std::vector tokens; + }; + + // n-gram pool + struct NgramContainer { + NgramContainer(int n_vocab, int n, int g) { + cnt.resize(n_vocab); + head.resize(n_vocab); + tokens.resize(n_vocab * g * (n - 1)); + } + + int n_total = 0; + + std::vector cnt; + std::vector head; + + // [n_vocab][G][N - 1] + // for each token of the vocab, keep a ring-buffer of capacity G of n-grams + // of size N - 1 + std::vector tokens; + }; + + Metadata metadata_; + + // lookahead branch + bool is_lhd_branch_initialized_{false}; + // [N - 1][W] + std::vector> lhd_branch_; + // [W] + std::vector lhd_branch_prev_; + + // verification branch + std::vector v_branch_; + + // position offset in attention mask + std::vector position_offset_; + + // n-gram pools + NgramContainer ngrams_pool_; +}; +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_prompt_processor.cpp b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_prompt_processor.cpp new file mode 100644 index 00000000000..07728bfe257 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_prompt_processor.cpp @@ -0,0 +1,316 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
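The NgramContainer above stores, for every vocabulary token, a ring buffer of up to gcap continuations of length ngram - 1 in one flat vector; update_ngrams_pool advances a per-token head pointer and init_verification_branch reads the stored continuations back. A standalone sketch of that flat [n_vocab][gcap][ngram - 1] addressing; ngram_slot_offset is an illustrative helper and the values are made up.

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Flat offset of slot `k` (0 <= k < gcap) for first-token `tok` in a pool
// laid out as [n_vocab][gcap][ngram - 1], as in NgramContainer::tokens.
inline size_t ngram_slot_offset(
    uint64_t tok, int32_t ngram, int32_t gcap, int32_t k) {
  return tok * gcap * (ngram - 1) + k * (ngram - 1);
}

int main() {
  const int32_t n_vocab = 16, ngram = 4, gcap = 2;
  std::vector<int32_t> tokens(n_vocab * gcap * (ngram - 1), -1);
  std::vector<int32_t> head(n_vocab, 0), cnt(n_vocab, 0);

  // Insert the continuation {7, 8, 9} after first-token 3 (ring-buffer write).
  uint64_t ft = 3;
  size_t idx = ngram_slot_offset(ft, ngram, gcap, head[ft]);
  int32_t continuation[] = {7, 8, 9};
  for (int j = 0; j < ngram - 1; ++j) {
    tokens[idx + j] = continuation[j];
  }
  cnt[ft] = std::min<int32_t>(gcap, cnt[ft] + 1);
  head[ft] = (head[ft] + 1) % gcap; // next insert overwrites the oldest slot

  // Read it back the way init_verification_branch does.
  assert(tokens[ngram_slot_offset(ft, ngram, gcap, 0) + 2] == 9);
  return 0;
}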
+ */ + +#include +#include + +using executorch::aten::TensorImpl; +using executorch::runtime::MethodMeta; +using executorch::runtime::Result; +using executorch::runtime::TensorInfo; + +namespace example { + +template +MultimodalPromptProcessor::MultimodalPromptProcessor( + DecoderRunner* decoder_runner, + KVManager* kv_manager, + const std::string& method_name, + Metadata metadata) + : PromptProcessor( + decoder_runner, + kv_manager, + method_name, + {metadata.context_len, + metadata.num_heads, + metadata.num_layers, + metadata.ar_len, + metadata.vocab_size, + metadata.use_int64_token, + metadata.sliding_window, + metadata.cache_mode}), + metadata_(metadata) { + // Set input_toks_.size to 0 since we use embeddings instead + input_toks_.size = 0; + input_embedding_.size = + metadata_.ar_len * metadata_.embedding_dim * sizeof(float); +}; + +template +void MultimodalPromptProcessor::init_io( + IMemAlloc* buffer_manager, + Result method_meta) { + size_t idx = 0; + input_tensors_.reserve(method_meta->num_inputs()); + output_tensors_.reserve(method_meta->num_outputs()); + + // [I]: input embedding + Result input_embedding = method_meta->input_tensor_meta(idx++); + input_embedding_.data = + reinterpret_cast(buffer_manager->allocate(input_embedding_.size)); + input_embedding_.tensor = std::make_unique( + input_embedding->scalar_type(), + input_embedding->sizes().size(), + const_cast(input_embedding->sizes().data()), + input_embedding_.data, + const_cast( + input_embedding->dim_order().data())); + input_tensors_.emplace_back(input_embedding_.tensor.get()); + buffer_manager->add_memory_info( + input_embedding_.data, input_embedding_.size, input_embedding.get()); + + // [I]: attention_mask + Result attention_mask = method_meta->input_tensor_meta(idx++); + attention_mask_.data = reinterpret_cast( + buffer_manager->allocate(attention_mask_.size)); + attention_mask_.tensor = std::make_unique( + attention_mask->scalar_type(), + attention_mask->sizes().size(), + const_cast(attention_mask->sizes().data()), + attention_mask_.data, + const_cast( + attention_mask->dim_order().data())); + input_tensors_.emplace_back(attention_mask_.tensor.get()); + buffer_manager->add_memory_info( + attention_mask_.data, attention_mask_.size, attention_mask.get()); + + // [I]: sliding window attention_mask + if (metadata_.cache_mode == CacheMode::HybridCache) { + Result window_attention_mask = + method_meta->input_tensor_meta(idx++); + window_attention_mask_.data = reinterpret_cast( + buffer_manager->allocate(window_attention_mask_.size)); + window_attention_mask_.tensor = std::make_unique( + window_attention_mask->scalar_type(), + window_attention_mask->sizes().size(), + const_cast( + window_attention_mask->sizes().data()), + window_attention_mask_.data, + const_cast( + window_attention_mask->dim_order().data())); + input_tensors_.emplace_back(window_attention_mask_.tensor.get()); + buffer_manager->add_memory_info( + window_attention_mask_.data, + window_attention_mask_.size, + window_attention_mask.get()); + } + + if (!is_bert()) { + // [I]: input_pos + Result input_pos = method_meta->input_tensor_meta(idx++); + input_pos_.data = + reinterpret_cast(buffer_manager->allocate(input_pos_.size)); + input_pos_.tensor = std::make_unique( + input_pos->scalar_type(), + input_pos->sizes().size(), + const_cast(input_pos->sizes().data()), + input_pos_.data, + const_cast(input_pos->dim_order().data())); + input_tensors_.emplace_back(input_pos_.tensor.get()); + buffer_manager->add_memory_info( + input_pos_.data, input_pos_.size, 
input_pos.get()); + + // [I] kv_cache + size_t index = idx; // bypass input_tokens, atten_mask, input_pos + for (int cache_group = 0; cache_group < 2; ++cache_group) { + std::vector>& cache = + (cache_group == 0 ? k_cache_in_ : v_cache_in_); + std::vector> cache_ptrs = (cache_group == 0) + ? kv_manager_->get_k_cache_() + : kv_manager_->get_v_cache_(); + for (int layer = 0; layer < metadata_.num_layers; ++layer, ++index) { + Result kv_cache = method_meta->input_tensor_meta(index); + + T* cache_ptr = cache_ptrs[layer].buffer; + + cache[layer] = std::make_unique( + kv_cache->scalar_type(), + kv_cache->sizes().size(), + const_cast(kv_cache->sizes().data()), + cache_ptr, + const_cast( + kv_cache->dim_order().data())); + input_tensors_.emplace_back(cache[layer].get()); + buffer_manager->add_memory_info( + cache_ptr, cache[layer]->nbytes(), kv_cache.get()); + } + } + } + + // [O]: logits + Result logits = method_meta->output_tensor_meta(0); + logits_.data = + reinterpret_cast(buffer_manager->allocate(logits_.size)); + logits_.tensor = std::make_unique( + logits->scalar_type(), + logits->sizes().size(), + const_cast(logits->sizes().data()), + logits_.data, + const_cast(logits->dim_order().data())); + output_tensors_.emplace_back(logits_.tensor.get()); + buffer_manager->add_memory_info(logits_.data, logits_.size, logits.get()); + + // [O] kv_cache + size_t index = 1; + for (int cache_group = 0; cache_group < 2; ++cache_group) { + std::vector>& cache = + (cache_group == 0 ? k_cache_out_ : v_cache_out_); + std::vector> cache_ptrs = (cache_group == 0) + ? kv_manager_->get_k_cache_() + : kv_manager_->get_v_cache_(); + for (int layer = 0; layer < metadata_.num_layers; ++layer, ++index) { + Result kv_cache = method_meta->output_tensor_meta(index); + T* cache_ptr = cache_ptrs[layer].output_buffer; + cache[layer] = std::make_unique( + kv_cache->scalar_type(), + kv_cache->sizes().size(), + const_cast(kv_cache->sizes().data()), + cache_ptr, + const_cast(kv_cache->dim_order().data())); + output_tensors_.emplace_back(cache[layer].get()); + buffer_manager->add_memory_info( + cache_ptr, cache[layer]->nbytes(), kv_cache.get()); + } + } + + // Prepare the vector of EValue to run inference + inputs_.reserve(input_tensors_.size()); + for (auto& input_tensor : input_tensors_) { + inputs_.emplace_back(std::move(input_tensor)); + } +} + +// prepare embedding +template +void MultimodalPromptProcessor::prepare_io( + const TensorStruct& prompt_embedding, + int32_t num_prompt_tokens, + int64_t prompt_pos, + int64_t start_pos) { + for (int i = 0; i < metadata_.ar_len; i++) { + if (!is_bert()) { + // Prepare pos data + input_pos_.data[i] = start_pos + i; + } + + // Prepare input token data + if (prompt_pos + i < num_prompt_tokens) { + std::memcpy( + input_embedding_.data + i * metadata_.embedding_dim, + prompt_embedding.data + (prompt_pos + i) * metadata_.embedding_dim, + metadata_.embedding_dim * sizeof(float)); + } + } +} + +template +Result MultimodalPromptProcessor::prefill( + const TensorStruct& prompt_embedding, + int64_t start_pos, + bool dump_logits) { + int32_t num_prompt_tokens = prompt_embedding.tensor->size(1); + if (!is_bert()) { + ET_CHECK_MSG( + (start_pos + num_prompt_tokens) <= + (metadata_.context_len - metadata_.ar_len), + "The sequence length exceeds the maximum limit that the prompt processor can handle."); + } else { + ET_CHECK_MSG( + start_pos == 0, "Bert model doesn't support multi-turn conversation."); + } + + // store the token + int64_t cur_token; + int64_t prompt_pos = 0; + int64_t pos = 
start_pos; + int32_t n_update = metadata_.ar_len; + int num_iters = 1 + ((num_prompt_tokens - 1) / metadata_.ar_len); + ET_LOG( + Info, + "Prompt Processor: total %d prompt tokens (AR-%d * %d iters)", + num_prompt_tokens, + metadata_.ar_len, + num_iters); + + // Rearrange KV cache first + kv_manager_->rearrange_cache(metadata_.ar_len); + std::vector attention_map(metadata_.ar_len); + std::iota(attention_map.begin(), attention_map.end(), -1); + // Initialize attention mask with current position + kv_manager_->init_attention_mask( + attention_mask_.data, attention_map, metadata_.ar_len, pos); + // Initialize window attention mask with current position + if (metadata_.cache_mode == CacheMode::HybridCache) { + kv_manager_->init_attention_mask( + window_attention_mask_.data, + attention_map, + metadata_.ar_len, + pos, + metadata_.sliding_window); + } + + // Initialize the output of the module + ET_CHECK_MSG( + decoder_runner_->set_outputs(method_name_, output_tensors_) == + executorch::runtime::Error::Ok, + "Failed to set output tensor for module %s", + method_name_.c_str()); + for (int i = 0; i < num_iters; ++i) { + // Fill in the embedding and position data + prepare_io(prompt_embedding, num_prompt_tokens, prompt_pos, pos); + + // Run inference + for (int layer = 0; layer < metadata_.num_layers; ++layer) { + std::vector> k_cache_ptrs = kv_manager_->get_k_cache_(); + T* k_cache_data = k_cache_ptrs[layer].buffer; + } + for (int layer = 0; layer < metadata_.num_layers; ++layer) { + std::vector> v_cache_ptrs = kv_manager_->get_v_cache_(); + T* v_cache_data = v_cache_ptrs[layer].buffer; + } + + decoder_runner_->step(method_name_, inputs_); + if (dump_logits) { + prompt_all_logits_.insert( + prompt_all_logits_.end(), + logits_.data, + logits_.data + metadata_.ar_len * metadata_.vocab_size); + } + // In the last run, offset to the meaningful logits. + if (i == num_iters - 1) { + n_update = 1 + ((num_prompt_tokens - 1) % metadata_.ar_len); + } + // Update KV Cache with the output results + kv_manager_->update_cache(metadata_.ar_len, pos, n_update, {}); + + // Update attention mask with current position + kv_manager_->update_attention_mask( + attention_mask_.data, metadata_.ar_len, pos, n_update); + if (metadata_.cache_mode == CacheMode::HybridCache) { + kv_manager_->update_attention_mask( + window_attention_mask_.data, + metadata_.ar_len, + pos, + n_update, + metadata_.sliding_window); + } + prompt_pos += metadata_.ar_len; + pos += metadata_.ar_len; + } + + cur_token = decoder_runner_->logits_to_token( + output_tensors_[0], + (num_prompt_tokens + metadata_.ar_len - 1) % metadata_.ar_len); + return cur_token; +} + +// Explicit instantiations +template class MultimodalPromptProcessor; +template class MultimodalPromptProcessor; + +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_prompt_processor.h b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_prompt_processor.h new file mode 100644 index 00000000000..540b147def3 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_prompt_processor.h @@ -0,0 +1,123 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
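The prefill loop above walks the prompt in AR-length chunks: num_iters = 1 + (N - 1) / ar_len is a ceiling division, the final chunk only carries n_update = 1 + (N - 1) % ar_len meaningful positions, and the sampled logit row (N + ar_len - 1) % ar_len is exactly n_update - 1. A small standalone check of that arithmetic; illustrative only.

#include <cassert>
#include <cstdint>

int main() {
  for (int32_t num_prompt_tokens = 1; num_prompt_tokens <= 64; ++num_prompt_tokens) {
    for (int32_t ar_len = 1; ar_len <= 16; ++ar_len) {
      // Ceiling division: number of AR-N passes needed to cover the prompt.
      int32_t num_iters = 1 + ((num_prompt_tokens - 1) / ar_len);
      assert(num_iters * ar_len >= num_prompt_tokens);
      assert((num_iters - 1) * ar_len < num_prompt_tokens);

      // Valid tokens in the last pass, and the logit row sampled afterwards.
      int32_t n_update = 1 + ((num_prompt_tokens - 1) % ar_len);
      int32_t logit_idx = (num_prompt_tokens + ar_len - 1) % ar_len;
      assert(logit_idx == n_update - 1); // last meaningful row of the chunk
    }
  }
  return 0;
}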
+ */ + +#pragma once +#include +#include + +namespace example { + +/** + * @class MultimodalPromptProcessor + * @brief Extended PromptProcessor with multimodal embedding support + */ +template +class MultimodalPromptProcessor : public example::PromptProcessor { + public: + struct Metadata { + int32_t context_len; + int64_t num_heads; + int64_t num_layers; + int32_t ar_len; + int32_t vocab_size; + bool use_int64_token; + int sliding_window; + CacheMode cache_mode; + int32_t embedding_dim = 0; + }; + + MultimodalPromptProcessor( + DecoderRunner* decoder_runner, + KVManager* kv_manager, + const std::string& method_name, + Metadata metadata); + + int64_t get_num_heads() const { + return metadata_.num_heads; + } + int64_t get_num_layers() const { + return metadata_.num_layers; + } + + /** + * @brief Initialize I/O tensor and allocate I/O data buffer. + * @param buffer_manager Pointer to IMemAlloc instance which depends on + * kv_updater. + * @param method_meta Method metadata. + */ + void init_io( + IMemAlloc* buffer_manager, + executorch::runtime::Result method_meta); + + /** + * Prefill an Decoder Module with the given embedding input. + * @param prompt_embedding The embedding tensor from embedding module. + * @param start_pos The starting position in KV cache of the input in the LLM + * Module. + * @param dump_logits Used to save all logits. Only enable when analyzing + * accuracy. + * @return The next token of the LLM Module after prefill. + */ + executorch::runtime::Result prefill( + const TensorStruct& prompt_embedding, + int64_t start_pos, + bool dump_logits); + + /** + * @brief Get total I/O size in bytes (excluding the KV cache size) + * @return Total I/O size in bytes. + */ + inline const size_t total_prompt_processor_io_size_in_bytes() const { + if (metadata_.cache_mode == CacheMode::HybridCache) { + return input_toks_.size + input_pos_.size + attention_mask_.size + + window_attention_mask_.size + logits_.size + input_embedding_.size; + } else { + return input_toks_.size + input_pos_.size + attention_mask_.size + + logits_.size + input_embedding_.size; + } + } + + private: + // Reuse members from token_generator + using PromptProcessor::decoder_runner_; + using PromptProcessor::kv_manager_; + using PromptProcessor::method_name_; + using PromptProcessor::k_cache_in_; + using PromptProcessor::v_cache_in_; + using PromptProcessor::k_cache_out_; + using PromptProcessor::v_cache_out_; + using PromptProcessor::input_toks_; + using PromptProcessor::input_pos_; + using PromptProcessor::attention_mask_; + using PromptProcessor::window_attention_mask_; + using PromptProcessor::logits_; + using PromptProcessor::inputs_; + using PromptProcessor::input_tensors_; + using PromptProcessor::output_tensors_; + using PromptProcessor::prompt_all_logits_; + using PromptProcessor::is_bert; + + /** + * @brief Fill in I/O buffers with embedding data and position. + * @param prompt_embedding The embedding tensor. + * @param prompt_pos Position of the prompt. + * @param start_pos Starting position. 
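prepare_io, defined earlier in multimodal_prompt_processor.cpp, slides an AR-length window over the merged prompt embeddings, copying embedding_dim floats per position into the decoder input buffer. A standalone sketch of that copy, assuming row-major [token][dim] float layout on both sides; copy_embedding_window is an illustrative helper, not part of this patch.

#include <cstddef>
#include <cstring>

// Copy one AR-length window of prompt embeddings into the decoder input
// buffer. Both buffers are row-major float arrays of shape [tokens][dim].
void copy_embedding_window(
    float* input_embedding,        // ar_len * dim floats
    const float* prompt_embedding, // num_prompt_tokens * dim floats
    size_t num_prompt_tokens,
    size_t prompt_pos, // first prompt position covered by this window
    size_t ar_len,
    size_t dim) {
  for (size_t i = 0; i < ar_len; ++i) {
    if (prompt_pos + i >= num_prompt_tokens) {
      break; // trailing slots of the last window are left untouched
    }
    std::memcpy(
        input_embedding + i * dim,
        prompt_embedding + (prompt_pos + i) * dim,
        dim * sizeof(float));
  }
}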
+ */ + void prepare_io( + const TensorStruct& prompt_embedding, + int32_t num_prompt_tokens, + int64_t prompt_pos, + int64_t start_pos); + + // metadata specific to multimodal + Metadata metadata_; + + // Additional input for multimodal + TensorStruct input_embedding_; +}; +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_runner.cpp new file mode 100644 index 00000000000..274920ec00f --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_runner.cpp @@ -0,0 +1,752 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// A llama 3.2 runner that includes preprocessing and post processing +// logic. The module takes in a string as input and emits a string as output. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(__aarch64__) +#include "arm_neon.h" +#endif + +using executorch::aten::Tensor; +using executorch::extension::Module; +using executorch::extension::llm::get_rss_bytes; +using executorch::extension::llm::print_report; +using executorch::extension::llm::Stats; +using executorch::extension::llm::time_in_ms; +using executorch::runtime::Error; +using executorch::runtime::MethodMeta; +using executorch::runtime::Result; +namespace llm = ::executorch::extension::llm; + +namespace example { +namespace { +void print_performance_report( + const Stats& stats, + const std::string& performance_output_path) { + // For now, we just print the total inference time for CI, can save more info + // in future if needed. 
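print_performance_report reduces the Stats counters to a single tokens-per-second figure: generated tokens over the decode wall-clock span, or prompt tokens over the prompt-eval span when nothing was generated. A standalone sketch of that computation, assuming millisecond timestamps (as the _ms field names suggest) and a scaling factor of 1000; tokens_per_second is an illustrative helper.

#include <cstdint>

// Tokens per second, mirroring the report written below. Timestamps are in
// milliseconds, so the scaling factor converts ms to seconds.
double tokens_per_second(
    int64_t num_prompt_tokens,
    int64_t num_generated_tokens,
    int64_t prompt_eval_end_ms,
    int64_t inference_start_ms,
    int64_t inference_end_ms,
    double scaling_factor = 1000.0) {
  if (num_generated_tokens == 0) {
    // e.g. perplexity evaluation, where only the prompt pass ran.
    return num_prompt_tokens /
        static_cast<double>(prompt_eval_end_ms - inference_start_ms) *
        scaling_factor;
  }
  return num_generated_tokens /
      static_cast<double>(inference_end_ms - inference_start_ms) *
      scaling_factor;
}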
+ std::ofstream outfile(performance_output_path.c_str()); + if (outfile.is_open()) { + double num_tok = 0; + if (stats.num_generated_tokens == 0) { + // For cases like evaluate perplexity where prompt_len == cache_len + num_tok = ((stats.num_prompt_tokens)) / + (double)(stats.prompt_eval_end_ms - stats.inference_start_ms) * + stats.SCALING_FACTOR_UNITS_PER_SECOND; + } else { + num_tok = (stats.num_generated_tokens) / + (double)(stats.inference_end_ms - stats.inference_start_ms) * + stats.SCALING_FACTOR_UNITS_PER_SECOND; + } + + outfile << num_tok; + outfile.close(); + } else { + ET_LOG(Error, "Error saving the inference speed file"); + } +} + +void save_logits( + const std::string& dump_logits_path, + const std::vector& prefill_logits, + const std::vector& decode_logits) { + std::ofstream outFile(dump_logits_path.c_str(), std::ios::binary); + if (outFile.is_open()) { + outFile.write( + reinterpret_cast(prefill_logits.data()), + prefill_logits.size() * sizeof(uint16_t)); + + outFile.write( + reinterpret_cast(decode_logits.data()), + decode_logits.size() * sizeof(uint16_t)); + outFile.close(); + } else { + ET_CHECK_MSG(false, "Error saving the dump logits file"); + } +} + +} // namespace + +template +MultimodalRunner::MultimodalRunner( + std::unique_ptr module, + std::unique_ptr embedding_module, + const std::string& decoder_model_version, + const std::string& model_path, + const std::string& tokenizer_path, + const std::string& dump_logits_path, + const std::string& performance_output_path, + const float temperature, + const int eval_mode, + const bool shared_buffer, + const int ngram, + const int window, + const int gcap, + std::unique_ptr image_hidden_states) + : module_(std::move(module)), + embedding_module_(std::move(embedding_module)), + image_hidden_states_(std::move(image_hidden_states)), + ngram_(ngram), + window_(window), + gcap_(gcap), + tokenizer_path_(tokenizer_path), + performance_output_path_(performance_output_path), + dump_logits_path_(dump_logits_path), + temperature_(temperature), + eval_mode_(static_cast(eval_mode)), + shared_buffer_(shared_buffer) { + stats_.reset(); + + if (decoder_model_version == "smolvlm") { + decoder_model_version_ = MultimodalDecoderModelVersion::kSmolvlm; + } else if (decoder_model_version == "internvl3") { + decoder_model_version_ = MultimodalDecoderModelVersion::kInternvl3; + } else { + ET_CHECK_MSG(false, "Unsupported Decoder Model"); + } + + ET_LOG(Info, "creating module: model_path=%s", model_path.c_str()); + ET_LOG(Info, "creating runner: tokenizer_path=%s", tokenizer_path_.c_str()); + ET_LOG(Info, "eval mode=%d", eval_mode_); +} + +template +bool MultimodalRunner::is_loaded() const { + return module_->is_loaded() && embedding_module_->is_loaded() && tokenizer_ && + decoder_runner_ && prompt_processor_ && token_generator_ && kv_manager_ && + buffer_manager_; +} + +template +Error MultimodalRunner::load() { + if (is_loaded()) { + return Error::Ok; + } + + std::string prompt_embedding_method_name, token_embedding_method_name; + std::string token_generator_method_name, prompt_processor_method_name; + std::vector method_names; + switch (eval_mode_) { + case EvalMode::kKVCached: + prompt_embedding_method_name = "tok_embedding_kv_forward"; + token_embedding_method_name = "tok_embedding_kv_forward"; + prompt_processor_method_name = "forward"; + token_generator_method_name = "forward"; + method_names.emplace_back(prompt_processor_method_name); + method_names.emplace_back(token_generator_method_name); + break; + case EvalMode::kHybrid: + case 
EvalMode::kLookaheadDecoding: + prompt_embedding_method_name = "tok_embedding_prefill_forward"; + token_embedding_method_name = "tok_embedding_kv_forward"; + prompt_processor_method_name = "prefill_forward"; + token_generator_method_name = "kv_forward"; + method_names.emplace_back(prompt_processor_method_name); + method_names.emplace_back(token_generator_method_name); + break; + case EvalMode::kUnsupported: + ET_CHECK_MSG(false, "Unsupported llama evaluation mode"); + break; + } + auto eos_ids = std::make_unique>(); + if (tokenizer_ != nullptr) { + eos_ids->insert(tokenizer_->encode("<|eot_id|>", 0, 0).get()[0]); + eos_ids->insert(tokenizer_->encode("<|eot|>", 0, 0).get()[0]); + eos_ids->insert(tokenizer_->encode("<|end_of_text|>", 0, 0).get()[0]); + } else { + tokenizer_ = llm::load_tokenizer(tokenizer_path_); + if (tokenizer_ == nullptr) { + ET_LOG( + Error, "Failed to load tokenizer with %s", tokenizer_path_.c_str()); + return Error::Internal; + } + eos_ids->insert(tokenizer_->eos_tok()); + } + if (decoder_model_version_ == MultimodalDecoderModelVersion::kSmolvlm) { + eos_ids->insert(tokenizer_->encode("", 0, 0).get()[0]); + } else if ( + decoder_model_version_ == MultimodalDecoderModelVersion::kInternvl3) { + eos_ids->insert(tokenizer_->encode("<|im_end|>", 0, 0).get()[0]); + } + + // Try avoid getMetadataHelper as it is time consuming. + Result method_meta = + module_->method_meta(token_generator_method_name); + + // For some tokenizer.json, runtime vocab_size might be different, use output + // shape to get vocab size. + int32_t vocab_size = method_meta->output_tensor_meta(0)->sizes()[2]; + decoder_runner_ = + std::make_unique(module_.get(), vocab_size, temperature_); + + ET_CHECK_OK_OR_RETURN_ERROR(decoder_runner_->load(method_names)); + + ET_LOG(Info, "Reading metadata from model"); + // retrieve any method meta, can be either prefill or kv + int64_t num_layers = + ET_UNWRAP(module_->get("get_n_layers")).toScalar().to(); + + ET_CHECK_MSG(num_layers != -1, "Could not retrieve num layers"); + // k_cache: [1, n_heads, head_dim, seq_len] + auto k_cache_shape = method_meta->output_tensor_meta(1)->sizes(); + int64_t num_heads = k_cache_shape[1]; + int64_t head_dim = k_cache_shape[2]; + + // TODO: filter shape hidden_state: [1, ar_len, dim] + int64_t dim = embedding_module_->method_meta(token_embedding_method_name) + ->output_tensor_meta(0) + ->sizes()[2]; + bool use_int64_token = + embedding_module_->method_meta(token_embedding_method_name) + ->input_tensor_meta(0) + ->scalar_type() == executorch::aten::ScalarType::Long; + + // Use attention mask length to retrieve AR length and context length + // Cache len equals to context_len - ar_len + int32_t prompt_processor_ar_len = 0; + int32_t token_generator_ar_len = 0; + int32_t max_cache_len = 0; + int32_t max_ar_len = 0; + // atten mask: [1, AR-N, CL] + auto atten_mask_meta_token = method_meta->input_tensor_meta(1); + token_generator_ar_len = atten_mask_meta_token->sizes()[1]; + context_len_ = atten_mask_meta_token->sizes()[2]; + if (eval_mode_ == EvalMode::kKVCached) { + prompt_processor_ar_len = token_generator_ar_len; + } else if ( + eval_mode_ == EvalMode::kHybrid || + eval_mode_ == EvalMode::kLookaheadDecoding) { + auto atten_mask_meta_prompt = + module_->method_meta(prompt_processor_method_name) + ->input_tensor_meta(1); + prompt_processor_ar_len = atten_mask_meta_prompt->sizes()[1]; + } + if (prompt_processor_ar_len == context_len_) + max_cache_len = context_len_; + else + max_cache_len = context_len_ - + 
std::min(token_generator_ar_len, prompt_processor_ar_len); + max_ar_len = std::max(token_generator_ar_len, prompt_processor_ar_len); + + embedding_runner_ = + std::make_unique(embedding_module_.get()); + ET_CHECK_OK_OR_RETURN_ERROR(embedding_runner_->load( + {prompt_embedding_method_name, token_embedding_method_name})); + // Initialize EmbeddingProcessor + embedding_processor_ = std::make_unique( + embedding_runner_.get(), + prompt_embedding_method_name, + EmbeddingProcessor::Metadata{ + context_len_, + prompt_processor_ar_len, + vocab_size, + use_int64_token, + static_cast(dim)}); + + // Load the sliding window size if the model supports it. + // This is used to configure the attention mask for models with window + // attention + int32_t sliding_window = context_len_; + if (module_->method_names()->count("get_sliding_window") > 0) { + sliding_window = ET_UNWRAP(module_->get("get_sliding_window")).toInt(); + } + kv_manager_ = std::make_unique>(typename KVManager::Metadata{ + context_len_, + head_dim, + max_ar_len, + max_cache_len, + num_heads, + num_layers}); + + prompt_processor_ = std::make_unique>( + decoder_runner_.get(), + kv_manager_.get(), + prompt_processor_method_name, + typename MultimodalPromptProcessor::Metadata{ + context_len_, + num_heads, + num_layers, + prompt_processor_ar_len, + vocab_size, + use_int64_token, + sliding_window, + cache_mode_, + static_cast(dim)}); + + if (eval_mode_ == EvalMode::kLookaheadDecoding || + eval_mode_ == EvalMode::kHybrid) { + output_k_cache_scales_.resize(num_layers); + output_k_cache_zero_points_.resize(num_layers); + output_v_cache_scales_.resize(num_layers); + output_v_cache_zero_points_.resize(num_layers); + for (int i = 0; i < num_layers; i++) { + std::string get_k_scale_output_name = + "get_k_scale_output_" + std::to_string(i); + std::string get_k_zero_point_output_name = + "get_k_zero_point_output_" + std::to_string(i); + std::string get_v_scale_output_name = + "get_v_scale_output_" + std::to_string(i); + std::string get_v_zero_point_output_name = + "get_v_zero_point_output_" + std::to_string(i); + + if (module_->method_names()->count(get_k_scale_output_name) > 0) { + output_k_cache_scales_[i] = static_cast( + ET_UNWRAP(module_->get(get_k_scale_output_name)).toDouble()); + } else { + ET_LOG(Error, "Cannot find method %s", get_k_scale_output_name.c_str()); + return Error::Internal; + } + if (module_->method_names()->count(get_k_zero_point_output_name) > 0) { + output_k_cache_zero_points_[i] = static_cast( + ET_UNWRAP(module_->get(get_k_zero_point_output_name)).toInt()); + } else { + ET_LOG( + Error, + "Cannot find method %s", + get_k_zero_point_output_name.c_str()); + return Error::Internal; + } + if (module_->method_names()->count(get_v_scale_output_name) > 0) { + output_v_cache_scales_[i] = static_cast( + ET_UNWRAP(module_->get(get_v_scale_output_name)).toDouble()); + } else { + ET_LOG(Error, "Cannot find method %s", get_v_scale_output_name.c_str()); + return Error::Internal; + } + if (module_->method_names()->count(get_v_zero_point_output_name) > 0) { + output_v_cache_zero_points_[i] = static_cast( + ET_UNWRAP(module_->get(get_v_zero_point_output_name)).toInt()); + } else { + ET_LOG( + Error, + "Cannot find method %s", + get_v_zero_point_output_name.c_str()); + return Error::Internal; + } + } + // Load scale and zero point for quantized input KV cache + input_k_cache_scales_.resize(num_layers); + input_k_cache_zero_points_.resize(num_layers); + input_v_cache_scales_.resize(num_layers); + input_v_cache_zero_points_.resize(num_layers); + 
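The per-layer scales and zero-points loaded above exist because the prefill and decode graphs quantize the KV cache with different (scale, zero_point) pairs; the requantization applied later in generate() follows the affine-quantization identity r = scale_out * (q - zp_out), so q' = (q - zp_out) * (scale_out / scale_in) + zp_in. A standalone sketch of that conversion for an 8-bit cache; requantize_cache is illustrative and the clamp is added here for safety, whereas the runner's loop casts directly.

#include <algorithm>
#include <cstdint>
#include <vector>

// Re-express values quantized as (scale_out, zp_out) on the (scale_in, zp_in)
// grid expected by the decode graph:
//   q' = (q - zp_out) * scale_out / scale_in + zp_in
void requantize_cache(
    std::vector<uint8_t>& cache,
    float scale_out, int32_t zp_out,
    float scale_in, int32_t zp_in) {
  const float ratio = scale_out / scale_in;
  for (auto& q : cache) {
    float r = (static_cast<int32_t>(q) - zp_out) * ratio + zp_in;
    q = static_cast<uint8_t>(std::clamp(r, 0.0f, 255.0f));
  }
}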
for (int i = 0; i < num_layers; i++) { + std::string get_k_scale_input_name = + "get_k_scale_input_" + std::to_string(i); + std::string get_k_zero_point_input_name = + "get_k_zero_point_input_" + std::to_string(i); + std::string get_v_scale_input_name = + "get_v_scale_input_" + std::to_string(i); + std::string get_v_zero_point_input_name = + "get_v_zero_point_input_" + std::to_string(i); + if (module_->method_names()->count(get_k_scale_input_name) > 0) { + input_k_cache_scales_[i] = static_cast( + ET_UNWRAP(module_->get(get_k_scale_input_name)).toDouble()); + } else { + ET_LOG(Error, "Cannot find method %s", get_k_scale_input_name.c_str()); + return Error::Internal; + } + if (module_->method_names()->count(get_k_zero_point_input_name) > 0) { + input_k_cache_zero_points_[i] = static_cast( + ET_UNWRAP(module_->get(get_k_zero_point_input_name)).toInt()); + } else { + ET_LOG( + Error, + "Cannot find method %s", + get_k_zero_point_input_name.c_str()); + return Error::Internal; + } + if (module_->method_names()->count(get_v_scale_input_name) > 0) { + input_v_cache_scales_[i] = static_cast( + ET_UNWRAP(module_->get(get_v_scale_input_name)).toDouble()); + } else { + ET_LOG(Error, "Cannot find method %s", get_v_scale_input_name.c_str()); + return Error::Internal; + } + if (module_->method_names()->count(get_v_zero_point_input_name) > 0) { + input_v_cache_zero_points_[i] = static_cast( + ET_UNWRAP(module_->get(get_v_zero_point_input_name)).toInt()); + } else { + ET_LOG( + Error, + "Cannot find method %s", + get_v_zero_point_input_name.c_str()); + return Error::Internal; + } + } + } + + // Initialize EmbeddingGenerator + embedding_generator_ = std::make_unique( + embedding_runner_.get(), + token_embedding_method_name, + EmbeddingProcessor::Metadata{ + context_len_, + token_generator_ar_len, + vocab_size, + use_int64_token, + static_cast(dim)}); + if (eval_mode_ == EvalMode::kLookaheadDecoding) { + // Initialize TokenGenerator + token_generator_ = std::make_unique>( + tokenizer_.get(), + embedding_generator_.get(), + decoder_runner_.get(), + kv_manager_.get(), + token_generator_method_name, + std::move(eos_ids), + typename MultimodalLhdTokenGenerator::Metadata{ + context_len_, + num_heads, + num_layers, + token_generator_ar_len, + vocab_size, + use_int64_token, + ngram_, + window_, + gcap_, + sliding_window, + cache_mode_, + static_cast(dim)}, + &stats_); + } else { + token_generator_ = std::make_unique>( + tokenizer_.get(), + embedding_generator_.get(), + decoder_runner_.get(), + kv_manager_.get(), + token_generator_method_name, + std::move(eos_ids), + typename MultimodalTokenGenerator::Metadata{ + context_len_, + num_heads, + num_layers, + token_generator_ar_len, + vocab_size, + use_int64_token, + sliding_window, + cache_mode_, + static_cast(dim)}, + &stats_); + } + + buffer_manager_ = std::make_unique(); + if (shared_buffer_) { + buffer_manager_ = std::make_unique( + kv_manager_->total_cache_size_in_bytes(), + prompt_processor_->total_prompt_processor_io_size_in_bytes(), + token_generator_->total_token_generator_io_size_in_bytes(), + embedding_processor_->total_embedding_processor_io_size_in_bytes(), + embedding_generator_->total_embedding_processor_io_size_in_bytes()); + } + + ET_LOG(Info, "creating io_memory"); + // prepare io + kv_manager_->init_cache(buffer_manager_.get(), prompt_processor_ar_len); + prompt_processor_->init_io( + buffer_manager_.get(), + module_->method_meta(prompt_processor_method_name)); + token_generator_->init_io( + buffer_manager_.get(), 
module_->method_meta(token_generator_method_name)); + // Prepare io for embedding + embedding_processor_->init_io( + buffer_manager_.get(), + embedding_module_->method_meta(prompt_embedding_method_name)); + embedding_generator_->init_io( + buffer_manager_.get(), + embedding_module_->method_meta(token_embedding_method_name)); + return Error::Ok; +} + +template +Error MultimodalRunner::generate( + const std::string& prompt, + const llm::GenerationConfig& config, + std::function token_callback, + std::function stats_callback) { + return generate_from_prompt_or_file( + prompt, false, config, token_callback, stats_callback); +} + +template +Error MultimodalRunner::generate_from_prompt_or_file( + const std::string& prompt, + bool tokenized_prompt, + const llm::GenerationConfig& config, + std::function token_callback, + std::function stats_callback) { + ET_CHECK_MSG(!prompt.empty(), "prompt cannot be null"); + if (!is_loaded()) { + stats_.model_load_start_ms = time_in_ms(); + ET_CHECK_OK_OR_RETURN_ERROR(load()); + stats_.model_load_end_ms = time_in_ms(); + } + stats_.inference_start_ms = time_in_ms(); + + int32_t seq_len = config.seq_len; + if (seq_len > context_len_) { + ET_LOG( + Info, + "Warning: Requested seq_len (%d) exceeds compiled max_seq_len (%d). Clamping to %d.", + seq_len, + context_len_, + context_len_); + seq_len = context_len_; + } else if (seq_len <= 0) { + ET_LOG( + Info, + "Warning: Invalid seq_len (%d). Using compiled max_seq_len (%d).", + seq_len, + context_len_); + seq_len = context_len_; + } + // For multimodal, we will disable n_bos + int32_t n_bos = 0; + + // encode the (string) prompt into tokens sequence + std::vector prompt_tokens; + if (tokenized_prompt) { + std::ifstream inFile(prompt, std::ios::binary); + if (inFile.is_open()) { + // Get file size + inFile.seekg(0, std::ios::end); + size_t fileSize = inFile.tellg(); + inFile.seekg(0, std::ios::beg); + + // Resize vector and read raw data + prompt_tokens.resize(fileSize / sizeof(uint64_t)); + + inFile.read(reinterpret_cast(prompt_tokens.data()), fileSize); + inFile.close(); + } else { + ET_CHECK_MSG( + false, + "Unable to read tokenized prompt from file: %s", + prompt.c_str()); + } + } else { + tokenizers::Result> encode_res = + tokenizer_->encode(prompt, n_bos, 0); + ET_CHECK_TK_OK_OR_RETURN_ERROR( + encode_res.error(), "failed to encode prompt %s", prompt.c_str()); + prompt_tokens = encode_res.get(); + } + int num_prompt_tokens = prompt_tokens.size(); + ET_CHECK_MSG(num_prompt_tokens >= 1, "Expected at least 1 prompt token"); + ET_CHECK_MSG( + cur_pos_ + num_prompt_tokens < seq_len, + "sequence length exceeded - please increase the seq_len value"); + + // Prompt Processor first + if (token_callback && config.echo) { + token_callback(prompt); + } + bool dump_logits = dump_logits_path_.empty() ? 
false : true; + embedding_processor_->prefill(prompt_tokens); + const TensorStruct& text_embeddings = + embedding_processor_->get_prompt_embeddings(); + int64_t embedding_dim = text_embeddings.tensor->size(2); + + uint64_t placeholder_token_id = 0; + if (module_->method_names()->count("modality_placeholder_token_id") > 0) { + placeholder_token_id = + module_->get("modality_placeholder_token_id")->toInt(); + } + + ET_LOG(Info, "Merging text embeddings with image hidden states"); + merge_multimodal_embeddings( + prompt_tokens, text_embeddings, placeholder_token_id); + + auto prefill_res = + prompt_processor_->prefill(merged_embeddings_, cur_pos_, dump_logits); + ET_CHECK_OK_OR_RETURN_ERROR(prefill_res.error()); + uint64_t cur_token = prefill_res.get(); + cur_pos_ += num_prompt_tokens; + stats_.first_token_ms = time_in_ms(); + stats_.prompt_eval_end_ms = time_in_ms(); + + // print the first token from prefill. No prev_token so use cur_token for + // it. + if (token_callback) { + token_callback( + ET_UNWRAP_TOKENIZER(tokenizer_->decode(cur_token, cur_token))); + } + ET_LOG( + Info, + "RSS after prompt prefill: %f MiB (0 if unsupported)", + get_rss_bytes() / 1024.0 / 1024.0); + + // start the main loop + prompt_tokens.push_back(cur_token); + + // Requant kv cache for prefill decode I/O + if (eval_mode_ == EvalMode::kLookaheadDecoding || + eval_mode_ == EvalMode::kHybrid) { + int64_t num_heads = prompt_processor_->get_num_heads(); + int64_t num_layers = prompt_processor_->get_num_layers(); + int64_t head_dim = kv_manager_->get_head_dim(); + std::vector> k_cache_ptrs = kv_manager_->get_k_cache_(); + std::vector> v_cache_ptrs = kv_manager_->get_v_cache_(); + + const int64_t num_elems_per_layer = + (context_len_ - 1) * num_heads * head_dim; + // Requant kv cache from prefill output scale/zero_point to decode input + // scale/zero_point + for (int layer_idx = 0; layer_idx < num_layers; layer_idx++) { + T* k_cache_data = k_cache_ptrs[layer_idx].buffer; + T* v_cache_data = v_cache_ptrs[layer_idx].buffer; + + const float scale_ratio_k = + output_k_cache_scales_[layer_idx] / input_k_cache_scales_[layer_idx]; + const float scale_ratio_v = + output_v_cache_scales_[layer_idx] / input_v_cache_scales_[layer_idx]; + + for (int64_t i = 0; i < num_elems_per_layer; i++) { + // Requant k_cache_data from prefill output scale/zero_point to decode + // input scale/zero_point + k_cache_data[i] = static_cast( + (k_cache_data[i] - output_k_cache_zero_points_[layer_idx]) * + scale_ratio_k + + input_k_cache_zero_points_[layer_idx]); + + // Requant v_cache_data from prefill output scale/zero_point to decode + // input scale/zero_point + v_cache_data[i] = static_cast( + (v_cache_data[i] - output_v_cache_zero_points_[layer_idx]) * + scale_ratio_v + + input_v_cache_zero_points_[layer_idx]); + } + } + } + + int64_t num_generated_tokens = ET_UNWRAP(token_generator_->generate( + prompt_tokens, cur_pos_, seq_len, token_callback, dump_logits)); + stats_.inference_end_ms = time_in_ms(); + ET_LOG( + Info, + "RSS after finishing text generation: %f MiB (0 if unsupported)", + get_rss_bytes() / 1024.0 / 1024.0); + cur_pos_ += num_generated_tokens; + if (cur_pos_ == seq_len) { + ET_LOG(Info, "Sequence length (%i tokens) reached!", seq_len); + } + + stats_.num_prompt_tokens = num_prompt_tokens; + stats_.num_generated_tokens = num_generated_tokens; + print_report(stats_); + print_performance_report(stats_, performance_output_path_); + if (dump_logits) { + save_logits( + dump_logits_path_, + prompt_processor_->get_all_logits(), + 
token_generator_->get_all_logits()); + } + if (stats_callback) { + stats_callback(stats_); + } + return Error::Ok; +} + +template +void MultimodalRunner::merge_multimodal_embeddings( + const std::vector& input_ids, + const TensorStruct& text_embeddings, + uint64_t placeholder_token_id) { + // This implements the modality_inputs_merger logic from decoder_utils.py + // Find positions where placeholder tokens appear + std::vector placeholder_positions; + for (size_t i = 0; i < input_ids.size(); ++i) { + if (input_ids[i] == placeholder_token_id) { + placeholder_positions.push_back(i); + } + } + + int64_t embedding_dim; + int64_t num_tokens = input_ids.size(); + if (text_embeddings.tensor) { + embedding_dim = text_embeddings.tensor->size(2); + num_tokens = text_embeddings.tensor->size(1); + } else { + ET_CHECK_MSG( + false, + "text_embeddings.tensor is null; cannot determine embedding dim during multimodal embedding merge"); + } + + // Allocate new buffer for merged embeddings + size_t total_elements = num_tokens * embedding_dim; + multimodal_embeddings_buffer_.resize(total_elements); + + // First, copy all text embeddings to the new buffer + std::memcpy( + multimodal_embeddings_buffer_.data(), + text_embeddings.data, + total_elements * sizeof(float)); + + // Then replace placeholder positions with image hidden states + auto* image_data = image_hidden_states_->const_data_ptr(); + auto* merged_data = multimodal_embeddings_buffer_.data(); + + int64_t image_seq_len = image_hidden_states_->size(1); + + // Copy image hidden states to placeholder positions + for (int32_t i = 0; i < placeholder_positions.size(); ++i) { + int32_t pos = placeholder_positions[i]; + std::memcpy( + merged_data + pos * embedding_dim, + image_data + i * embedding_dim, + embedding_dim * sizeof(float)); + } + + merged_embeddings_.data = multimodal_embeddings_buffer_.data(); + merged_embeddings_.size = total_elements * sizeof(float); + + // Create TensorImpl with proper shape [1, num_tokens, embedding_dim] + multimodal_embeddings_sizes_ = { + 1, static_cast(num_tokens), static_cast(embedding_dim)}; + multimodal_embeddings_dim_order_ = {0, 1, 2}; + merged_embeddings_.tensor = std::make_unique( + executorch::aten::ScalarType::Float, + multimodal_embeddings_sizes_.size(), + multimodal_embeddings_sizes_.data(), + merged_embeddings_.data, + multimodal_embeddings_dim_order_.data()); + + ET_LOG(Info, "Multimodal embeddings merged successfully"); +} + +template +Result +MultimodalRunner::get_decoder_model_version() { + if (!is_loaded()) { + stats_.model_load_start_ms = time_in_ms(); + ET_CHECK_OK_OR_RETURN_ERROR(load()); + stats_.model_load_end_ms = time_in_ms(); + } + return decoder_model_version_; +} + +// Explicit instantiations +template class MultimodalRunner; +template class MultimodalRunner; + +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_runner.h b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_runner.h new file mode 100644 index 00000000000..b7967964de2 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_runner.h @@ -0,0 +1,157 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +// Multimodal runner that extends the base llama runner with vision capabilities + +#pragma once + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace example { + +// Extend DecoderModelVersion enum with multimodal models +enum MultimodalDecoderModelVersion { + kSmolvlm = 0, + kInternvl3, +}; + +enum KvBitWidth { + kWidth8 = 8, + kWidth16 = 16, +}; + +template +class MultimodalRunner : public executorch::extension::llm::IRunner { + public: + explicit MultimodalRunner( + std::unique_ptr module, + std::unique_ptr embedding_module, + const std::string& decoder_model, + const std::string& model_path, + const std::string& tokenizer_path, + const std::string& performance_output_path, + const std::string& dump_logits_path, + const float temperature = 0.8f, + const int eval_mode = EvalMode::kHybrid, + const bool shared_buffer = false, + const int ngram = 0, + const int window = 0, + const int gcap = 0, + std::unique_ptr image_hidden_states = nullptr); + + bool is_loaded() const override; + executorch::runtime::Error load() override; + + // Override generate to support multimodal inputs + executorch::runtime::Error generate( + const std::string& prompt, + const executorch::extension::llm::GenerationConfig& config, + std::function token_callback = {}, + std::function stats_callback = {}) + override; + + // Multimodal-specific generation with image embeddings + executorch::runtime::Error generate_from_prompt_or_file( + const std::string& prompt, + bool tokenized_prompt, + const executorch::extension::llm::GenerationConfig& config, + std::function token_callback = {}, + std::function stats_callback = {}); + void stop() override {}; + void reset() override {}; + executorch::runtime::Result + get_decoder_model_version(); + + // Multimodal-specific method for merging embeddings + void merge_multimodal_embeddings( + const std::vector& input_ids, + const TensorStruct& text_embeddings, + uint64_t placeholder_token_id); + + private: + enum EvalMode { + kKVCached = 0, + kHybrid, + kLookaheadDecoding, + kUnsupported, + }; + + // Modules + std::unique_ptr module_; + std::unique_ptr embedding_module_; + + int32_t context_len_{0}; + + int ngram_{0}; + int window_{0}; + int gcap_{0}; + + // Defaults to StaticCahce, indicating that the model does not use a + // global/local architecture. 
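+  // When the exported graph uses a global/local (sliding-window) attention
+  // layout, this is expected to be CacheMode::HybridCache instead; in that
+  // case the token generator allocates an extra window attention mask input
+  // and load() reads the window size from the optional get_sliding_window
+  // metadata method.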
+ CacheMode cache_mode_{CacheMode::StaticCahce}; + int64_t cur_pos_{0}; + + std::string tokenizer_path_; + std::string performance_output_path_; + std::string dump_logits_path_; + float temperature_; + EvalMode eval_mode_; + bool shared_buffer_; + + MultimodalDecoderModelVersion decoder_model_version_; + std::unique_ptr buffer_manager_; + std::unique_ptr> kv_manager_; + std::unique_ptr tokenizer_; + std::unique_ptr decoder_runner_; + std::unique_ptr> prompt_processor_; + std::unique_ptr> token_generator_; + std::unique_ptr embedding_runner_; + std::unique_ptr embedding_processor_; + std::unique_ptr embedding_generator_; + + // Image hidden states storage + std::unique_ptr image_hidden_states_; + + // Multimodal embeddings storage + std::vector multimodal_embeddings_buffer_; + std::vector + multimodal_embeddings_sizes_; + std::vector + multimodal_embeddings_dim_order_; + TensorStruct merged_embeddings_; + + // scale and zero point for quantized KV cache + std::vector input_k_cache_scales_; + std::vector input_k_cache_zero_points_; + std::vector input_v_cache_scales_; + std::vector input_v_cache_zero_points_; + std::vector output_k_cache_scales_; + std::vector output_k_cache_zero_points_; + std::vector output_v_cache_scales_; + std::vector output_v_cache_zero_points_; + + // stats + executorch::llm::Stats stats_; +}; +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_token_generator.cpp b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_token_generator.cpp new file mode 100644 index 00000000000..89b8614d407 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_token_generator.cpp @@ -0,0 +1,216 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +using executorch::aten::TensorImpl; +using executorch::runtime::MethodMeta; +using executorch::runtime::Result; +using executorch::runtime::TensorInfo; + +namespace example { +// Constructor with embedding runner support +template +MultimodalTokenGenerator::MultimodalTokenGenerator( + tokenizers::Tokenizer* tokenizer, + EmbeddingProcessor* embedding_runner, + DecoderRunner* decoder_runner, + KVManager* kv_manager, + const std::string& method_name, + std::unique_ptr>&& eos_ids, + Metadata metadata, + executorch::llm::Stats* stats) + : TokenGenerator( + tokenizer, + decoder_runner, + kv_manager, + method_name, + std::move(eos_ids), + {metadata.context_len, + metadata.num_heads, + metadata.num_layers, + metadata.ar_len, + metadata.vocab_size, + metadata.use_int64_token, + metadata.sliding_window, + metadata.cache_mode}, + stats), + embedding_runner_(embedding_runner), + metadata_(metadata) { + // Set input_toks_.size to 0 since we use embeddings instead + input_toks_.size = 0; + input_embedding_.size = + metadata_.ar_len * metadata_.embedding_dim * sizeof(float); +} + +template +void MultimodalTokenGenerator::init_io( + IMemAlloc* buffer_manager, + Result method_meta) { + size_t idx = 0; + input_tensors_.reserve(method_meta->num_inputs()); + output_tensors_.reserve(method_meta->num_outputs()); + + // [I]: input embedding + Result input_embedding = method_meta->input_tensor_meta(idx++); + input_embedding_.data = + reinterpret_cast(buffer_manager->allocate(input_embedding_.size)); + input_embedding_.tensor = std::make_unique( + input_embedding->scalar_type(), + input_embedding->sizes().size(), + const_cast(input_embedding->sizes().data()), + input_embedding_.data, + const_cast( + input_embedding->dim_order().data())); + input_tensors_.emplace_back(input_embedding_.tensor.get()); + buffer_manager->add_memory_info( + input_embedding_.data, input_embedding_.size, input_embedding.get()); + + // [I]: attention_mask + Result attention_mask = method_meta->input_tensor_meta(idx++); + attention_mask_.data = reinterpret_cast( + buffer_manager->allocate(attention_mask_.size)); + attention_mask_.tensor = std::make_unique( + attention_mask->scalar_type(), + attention_mask->sizes().size(), + const_cast(attention_mask->sizes().data()), + attention_mask_.data, + const_cast( + attention_mask->dim_order().data())); + input_tensors_.emplace_back(attention_mask_.tensor.get()); + buffer_manager->add_memory_info( + attention_mask_.data, attention_mask_.size, attention_mask.get()); + + // [I]: sliding window attention_mask + if (metadata_.cache_mode == CacheMode::HybridCache) { + Result window_attention_mask = + method_meta->input_tensor_meta(idx++); + window_attention_mask_.data = reinterpret_cast( + buffer_manager->allocate(window_attention_mask_.size)); + window_attention_mask_.tensor = std::make_unique( + window_attention_mask->scalar_type(), + window_attention_mask->sizes().size(), + const_cast( + window_attention_mask->sizes().data()), + window_attention_mask_.data, + const_cast( + window_attention_mask->dim_order().data())); + input_tensors_.emplace_back(window_attention_mask_.tensor.get()); + buffer_manager->add_memory_info( + window_attention_mask_.data, + window_attention_mask_.size, + window_attention_mask.get()); + } + + // [I]: input_pos + Result input_pos = method_meta->input_tensor_meta(idx++); + input_pos_.data = + reinterpret_cast(buffer_manager->allocate(input_pos_.size)); + input_pos_.tensor = std::make_unique( + input_pos->scalar_type(), + input_pos->sizes().size(), + 
const_cast(input_pos->sizes().data()), + input_pos_.data, + const_cast(input_pos->dim_order().data())); + input_tensors_.emplace_back(input_pos_.tensor.get()); + buffer_manager->add_memory_info( + input_pos_.data, input_pos_.size, input_pos.get()); + + // [I] kv_cache + size_t index = idx; // bypass input_tokens, atten_mask, input_pos + for (int cache_group = 0; cache_group < 2; ++cache_group) { + std::vector>& cache = + (cache_group == 0 ? k_cache_in_ : v_cache_in_); + std::vector> cache_ptrs = (cache_group == 0) + ? kv_manager_->get_k_cache_() + : kv_manager_->get_v_cache_(); + for (int layer = 0; layer < metadata_.num_layers; ++layer, ++index) { + Result kv_cache = method_meta->input_tensor_meta(index); + + T* cache_ptr = cache_ptrs[layer].buffer; + + cache[layer] = std::make_unique( + kv_cache->scalar_type(), + kv_cache->sizes().size(), + const_cast(kv_cache->sizes().data()), + cache_ptr, + const_cast(kv_cache->dim_order().data())); + input_tensors_.emplace_back(cache[layer].get()); + buffer_manager->add_memory_info( + cache_ptr, cache[layer]->nbytes(), kv_cache.get()); + } + } + + // [O]: logits + Result logits = method_meta->output_tensor_meta(0); + logits_.data = + reinterpret_cast(buffer_manager->allocate(logits_.size)); + logits_.tensor = std::make_unique( + logits->scalar_type(), + logits->sizes().size(), + const_cast(logits->sizes().data()), + logits_.data, + const_cast(logits->dim_order().data())); + output_tensors_.emplace_back(logits_.tensor.get()); + buffer_manager->add_memory_info(logits_.data, logits_.size, logits.get()); + + // [O] kv_cache + index = 1; + for (int cache_group = 0; cache_group < 2; ++cache_group) { + std::vector>& cache = + (cache_group == 0 ? k_cache_out_ : v_cache_out_); + std::vector> cache_ptrs = (cache_group == 0) + ? kv_manager_->get_k_cache_() + : kv_manager_->get_v_cache_(); + for (int layer = 0; layer < metadata_.num_layers; ++layer, ++index) { + Result kv_cache = method_meta->output_tensor_meta(index); + T* cache_ptr = cache_ptrs[layer].output_buffer; + cache[layer] = std::make_unique( + kv_cache->scalar_type(), + kv_cache->sizes().size(), + const_cast(kv_cache->sizes().data()), + cache_ptr, + const_cast(kv_cache->dim_order().data())); + output_tensors_.emplace_back(cache[layer].get()); + buffer_manager->add_memory_info( + cache_ptr, cache[layer]->nbytes(), kv_cache.get()); + } + } + + // Prepare the vector of EValue to run inference + inputs_.reserve(input_tensors_.size()); + for (auto& input_tensor : input_tensors_) { + inputs_.emplace_back(std::move(input_tensor)); + } +} + +// This function only considers the case where token_generator_ar_len equals 1. 
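+// Unlike the text-only TokenGenerator, which feeds raw token ids, this
+// override has to turn the freshly sampled token into an embedding before
+// every decode step: the token is pushed through the token-embedding graph
+// via the embedding runner, and the resulting ar_len x embedding_dim block of
+// floats is copied into the preallocated input buffer. Only the position id
+// then needs refreshing, since the KV-cache tensors were already bound in
+// init_io().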
+template +void MultimodalTokenGenerator::prepare_io( + uint64_t cur_token, + int64_t start_pos) { + // Generate embedding for current token using embedding runner + embedding_runner_->prefill({cur_token}); + const TensorStruct& text_embeddings = + embedding_runner_->get_prompt_embeddings(); + int64_t embedding_dim = text_embeddings.tensor->size(2); + // Copy embedding to input buffer + std::memcpy( + input_embedding_.data, + text_embeddings.data, + metadata_.ar_len * embedding_dim * sizeof(float)); + + // update position_ids + *input_pos_.data = static_cast(start_pos); +} + +// Explicit instantiations +template class MultimodalTokenGenerator; +template class MultimodalTokenGenerator; + +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_token_generator.h b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_token_generator.h new file mode 100644 index 00000000000..b010bf3748e --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_token_generator.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once +#include +#include + +namespace example { + +/** + * @class MultimodalTokenGenerator + * @brief Extended TokenGenerator with multimodal embedding support + */ +template +class MultimodalTokenGenerator : public example::TokenGenerator { + public: + struct Metadata { + int32_t context_len; + int64_t num_heads; + int64_t num_layers; + int32_t ar_len; + int32_t vocab_size; + bool use_int64_token; + int sliding_window; + CacheMode cache_mode; + int32_t embedding_dim = 0; + }; + + // Constructor with embedding generator support + MultimodalTokenGenerator( + tokenizers::Tokenizer* tokenizer, + EmbeddingProcessor* embedding_runner, + DecoderRunner* decoder_runner, + KVManager* kv_manager, + const std::string& method_name, + std::unique_ptr>&& eos_ids, + Metadata metadata, + executorch::llm::Stats* stats); + + virtual ~MultimodalTokenGenerator() = default; + + /** + * @brief Initialize I/O tensor and allocate I/O data buffer with embedding + * support. 
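+   *
+   * The expected I/O layout, as wired up by this method, is: inputs =
+   * [input embedding, attention mask, (window attention mask when
+   * cache_mode is HybridCache), input position, per-layer K caches,
+   * per-layer V caches]; outputs = [logits, per-layer K caches,
+   * per-layer V caches].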
+   */
+  void init_io(
+      IMemAlloc* buffer_manager,
+      executorch::runtime::Result<executorch::runtime::MethodMeta> method_meta)
+      override;
+
+  inline const size_t total_token_generator_io_size_in_bytes() const {
+    if (metadata_.cache_mode == CacheMode::HybridCache) {
+      return input_toks_.size + input_pos_.size + attention_mask_.size +
+          window_attention_mask_.size + logits_.size + input_embedding_.size;
+    } else {
+      return input_toks_.size + input_pos_.size + attention_mask_.size +
+          logits_.size + input_embedding_.size;
+    }
+  }
+
+ protected:
+  // Reuse members from token_generator
+  using TokenGenerator<T>::kv_manager_;
+  using TokenGenerator<T>::input_pos_;
+  using TokenGenerator<T>::attention_mask_;
+  using TokenGenerator<T>::window_attention_mask_;
+  using TokenGenerator<T>::inputs_;
+  using TokenGenerator<T>::input_tensors_;
+  using TokenGenerator<T>::output_tensors_;
+
+  // Additional members specific to multimodal
+  TensorStruct<float> input_embedding_;
+
+ private:
+  // Reuse members from token_generator
+  using TokenGenerator<T>::input_toks_;
+  using TokenGenerator<T>::logits_;
+  using TokenGenerator<T>::k_cache_in_;
+  using TokenGenerator<T>::v_cache_in_;
+  using TokenGenerator<T>::k_cache_out_;
+  using TokenGenerator<T>::v_cache_out_;
+
+  // Additional members specific to multimodal
+  EmbeddingProcessor* embedding_runner_;
+
+  /**
+   * @brief Fill in I/O buffers with the embedding of the current token and
+   * its position.
+   * @param cur_token Current token.
+   * @param start_pos Starting position.
+   */
+  void prepare_io(uint64_t cur_token, int64_t start_pos) override;
+
+  // metadata specific to multimodal
+  Metadata metadata_;
+};
+} // namespace example
diff --git a/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.h b/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.h
index 5c97e510987..bd2e1cfc8d8 100644
--- a/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.h
+++ b/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.h
@@ -84,7 +84,7 @@ class PromptProcessor {
     }
   }
 
- private:
+ protected:
   // If the cache length is zero, it indicates a BERT model, which does not use
   // position ids or KV cache inputs.
   bool is_bert() const {
diff --git a/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.cpp b/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.cpp
index 67d7ec80aab..f89ee7bde66 100644
--- a/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.cpp
+++ b/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.cpp
@@ -23,6 +23,19 @@ RpcMem::RpcMem(
   shared_buffer_base_ptr_ = QnnExecuTorchAllocCustomMem(
       total_bytes, MemoryAllocator::kDefaultAlignment);
 }
+RpcMem::RpcMem(
+    const size_t total_cache_size,
+    const size_t total_prompt_processor_io_size,
+    const size_t total_token_generator_io_size,
+    const size_t total_embedding_processor_io_size,
+    const size_t total_embedding_generator_io_size)
+    : calculated_offsets_(0) {
+  size_t total_bytes = total_cache_size + total_prompt_processor_io_size +
+      total_token_generator_io_size + total_embedding_processor_io_size +
+      total_embedding_generator_io_size;
+  shared_buffer_base_ptr_ = QnnExecuTorchAllocCustomMem(
+      total_bytes, MemoryAllocator::kDefaultAlignment);
+}
 RpcMem::~RpcMem() {
   QnnExecuTorchFreeCustomMem(shared_buffer_base_ptr_);
 }
diff --git a/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.h b/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.h
index 99e9cb1dec1..c3dd54d00c7 100644
--- a/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.h
+++ b/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.h
@@ -19,16 +19,27 @@ namespace example {
  */
 class RpcMem final : public IMemAlloc {
  public:
+  RpcMem(
+      const size_t total_cache_size,
+      const size_t total_prompt_processor_io_size,
+      const size_t total_token_generator_io_size);
   /**
    * @brief Constructor to allocate RpcMem with total sizes.
    * @param total_cache_size Total size of the cache.
    * @param total_prompt_processor_io_size Total size for prompt processor I/O.
    * @param total_token_generator_io_size Total size for token generator I/O.
-   */
+   * @param total_embedding_processor_io_size Total size for embedding prompt
+   *     processor I/O.
+   * @param total_embedding_generator_io_size Total size for embedding
+   *     generator I/O.
+   */
   RpcMem(
       const size_t total_cache_size,
       const size_t total_prompt_processor_io_size,
-      const size_t total_token_generator_io_size);
+      const size_t total_token_generator_io_size,
+      const size_t total_embedding_processor_io_size,
+      const size_t total_embedding_generator_io_size);
+
   // Disable copy constructors, r-value referencing, etc
   RpcMem(const RpcMem&) = delete;
   RpcMem& operator=(const RpcMem&) = delete;
diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp
index 54a5c0b1d5c..aae61cec245 100644
--- a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp
+++ b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp
@@ -214,7 +214,6 @@ Error Runner::load() {
     eos_ids->insert(tokenizer_->encode("<|user|>", 0, 0).get()[0]);
   }
 
-  // Try avoid getMetadataHelper as it is time consuming.
   Result<MethodMeta> method_meta =
       module_->method_meta(token_generator_method_name);
 
diff --git a/examples/qualcomm/oss_scripts/llama/runner/token_generator.h b/examples/qualcomm/oss_scripts/llama/runner/token_generator.h
index 329a4d49cc6..b74e0ac65aa 100644
--- a/examples/qualcomm/oss_scripts/llama/runner/token_generator.h
+++ b/examples/qualcomm/oss_scripts/llama/runner/token_generator.h
@@ -112,14 +112,14 @@ class TokenGenerator {
   // stats
   executorch::llm::Stats* stats_;
 
- private:
   /**
    * @brief Fill in I/O buffers with prompt token and position.
    * @param cur_token Current token.
    * @param start_pos Starting position.
*/ - void prepare_io(uint64_t cur_token, int64_t start_pos); + virtual void prepare_io(uint64_t cur_token, int64_t start_pos); + private: // metadata Metadata metadata_; diff --git a/examples/qualcomm/oss_scripts/llama/wrappers.py b/examples/qualcomm/oss_scripts/llama/wrappers.py index 61190c5d7e9..6be5b702f80 100644 --- a/examples/qualcomm/oss_scripts/llama/wrappers.py +++ b/examples/qualcomm/oss_scripts/llama/wrappers.py @@ -85,6 +85,37 @@ from transformers import AutoConfig, AutoModel +def is_node_src_start_with_name(node: torch.fx.Node, prefix: str) -> bool: + """ + Return True if any NodeSource in node.meta['from_node'] + has a `name` starting with `prefix`. + """ + + def has_source_name_prefix( + node_src: torch.fx.traceback.NodeSource, prefix: str + ) -> bool: + + name = getattr(node_src, "name", None) + if isinstance(name, str) and name.startswith(prefix): + return True + + children = getattr(node_src, "from_node", None) + if not children: + return False + + for src in children: + if has_source_name_prefix(src, prefix): + return True + + return False + + node_srcs = node.meta.get("from_node", None) + if not node_srcs: + return False + + return any(has_source_name_prefix(node_src, prefix) for node_src in node_srcs) + + def log_info(func): class TimeIt: def __init__(self, event): @@ -173,7 +204,7 @@ def __init__( self.mode = mode self.passes_job = get_capture_program_passes() self.dep_table = get_passes_dependency_for_capture_program() - self.meta = None + self.meta = {} self.quant_recipe: StaticLLMQuantRecipe = ( self.config.quant_recipe(True) if self.config.quant_recipe else None ) @@ -203,7 +234,7 @@ def __init__( self.meta = self.decoder.get_metadata() # check if sharding required - if instance and self.config.num_sharding > 1: + if self.decoder and self.config.num_sharding > 1: SplitGraph, setting = model_sharding.get_split_graph_pass( self.meta["get_n_layers"], shares=self.config.num_sharding, @@ -237,7 +268,6 @@ def _prepare_model(self): # noqa: C901 if (instance := self._get_model_instance()) is None: return None tok_embedding, decoder = instance - # load parameters for HF models if self.control_args.checkpoint is None: checkpoint = download_and_convert_hf_checkpoint( @@ -343,7 +373,6 @@ def permute(w, heads, partial_rotary_dim): self.passes_job[I64toI32][QCOM_PASS_ARGS_KWARGS_DEFAULTS_KEY][ "skip_node" ] = {"tokens"} - if self.apply_embedding: tok_embedding = get_quant_embedding_transform( embedding_quantize=self.control_args.embedding_quantize @@ -360,13 +389,11 @@ def permute(w, heads, partial_rotary_dim): def _get_model_specific_kwargs(self): """ Retrieve model-specific config required for Static LLaMA. - This method handles architecture-specific requirements for both Vision-Language Models (VLMs) and Language-only Models (LLMs), extracting necessary config from HuggingFace configs. """ kwargs = {} - # Vision-Language Model (VLM) # For multimodal models, we need the special token ID that represents image placeholders # in the input sequence. This token is used to mark positions where image embeddings @@ -374,14 +401,12 @@ def _get_model_specific_kwargs(self): if hasattr(self.config, VISION_ENCODER): hf_config = AutoConfig.from_pretrained(self.config.repo_id) kwargs["modality_placeholder_token_id"] = hf_config.image_token_id - # TODO: Support Audio modality elif hasattr(self.config, AUDIO_ENCODER): raise NotImplementedError( "Audio encoder modality is not currently supported. " "Please provide a valid modality_placeholder_token_id in kwargs." 
) - # Language-only Model (LLM) configuration # Handle architecture-specific parameters for models that require special configurations # beyond the general Static LLaMA architecture @@ -440,14 +465,13 @@ def _get_model_instance(self) -> LlamaModel: self.config.repo_id, _attn_implementation="eager" ) tok_embedding = TextEmbedding( - auto_model.get_input_embeddings(), + auto_model.get_input_embeddings().to(torch.float32), self.model_args.max_batch_size, ar_len, self.model_args.vocab_size, self.model_args.dim, use_i64_token, ) - # get decoder model self.model_args.max_batch_size = 1 self.model_args.max_seq_len = self.control_args.max_seq_len @@ -471,6 +495,7 @@ def _get_model_instance(self) -> LlamaModel: # get example input self.meta = decoder.get_metadata() self.example_input = decoder.get_example_inputs() + self.get_example_inputs = decoder.get_example_inputs self.export_input = ( self.example_input[0], # tokens or hidden_states *self.example_input[1], # attn_mask @@ -508,6 +533,62 @@ def _save_logits_quant_attrs(self): self.meta["get_logits_zero_point"] = output_node.args[2] break + def _save_input_kv_cache_quant_attrs(self): + input_kv_cache_shape = { + # single head, k input + ( + self.meta["get_head_dim"], + self.meta["get_max_seq_len"] - self.meta["get_ar_len"], + ), + # single head, v input + ( + self.meta["get_max_seq_len"] - self.meta["get_ar_len"], + self.meta["get_head_dim"], + ), + } + + idx = 0 + for node in self.decoder.graph.nodes: + if ( + node.op == "placeholder" + and len(users := list(node.users)) == 1 + and "val" in node.meta + ): + if node.meta["val"].size()[-2:] in input_kv_cache_shape: + scale_cache_name = f"get_k_scale_input_{idx}" + zero_point_cache_name = f"get_k_zero_point_input_{idx}" + if idx >= self.meta["get_n_layers"]: + scale_cache_name = ( + f"get_v_scale_input_{idx % self.meta['get_n_layers']}" + ) + zero_point_cache_name = ( + f"get_v_zero_point_input_{idx % self.meta['get_n_layers']}" + ) + self.meta[scale_cache_name] = users[0].args[1] + self.meta[zero_point_cache_name] = users[0].args[2] + idx += 1 + + def _save_output_kv_cache_quant_attrs(self): + output_kv_cache_shape = { + (self.meta["get_head_dim"], self.meta["get_ar_len"]), + (self.meta["get_ar_len"], self.meta["get_head_dim"]), + } + k_idx = 0 + v_idx = 0 + for node in self.decoder.graph.nodes: + if not is_graph_output(node): + continue + cache_output_node = node.args[0].args[0] + if cache_output_node.meta["val"].size()[-2:] in output_kv_cache_shape: + if is_node_src_start_with_name(cache_output_node, "k_"): + self.meta[f"get_k_scale_output_{k_idx}"] = node.args[1] + self.meta[f"get_k_zero_point_output_{k_idx}"] = node.args[2] + k_idx += 1 + elif is_node_src_start_with_name(cache_output_node, "v_"): + self.meta[f"get_v_scale_output_{v_idx}"] = node.args[1] + self.meta[f"get_v_zero_point_output_{v_idx}"] = node.args[2] + v_idx += 1 + def _tag_ios(self, node, fixed_point_type): # shape of k caches and v caches kv_cache_shape = { @@ -601,7 +682,7 @@ def _calibrate( if has_task_calibration and not is_multimodal: graph_module_inference( use_kv_cache=self.meta["get_use_kv_cache"], - example_input=self.example_input, + get_example_inputs=self.get_example_inputs, module=model, tokenizer=tokenizer, ar_len=self.meta["get_ar_len"], @@ -627,7 +708,7 @@ def _calibrate( for prompt in user_calibration_data: graph_module_inference( use_kv_cache=self.meta["get_use_kv_cache"], - example_input=self.example_input, + get_example_inputs=self.get_example_inputs, hidden_states=intermediate_outputs, # hidden_states for 
multimodal module=model, tok_embedding=tok_embedding, @@ -739,26 +820,36 @@ def quantize(self, request: Request): # noqa: C901 intermediate_outputs=image_embedding, ) - # propagate kv cache quantization attributes for prefill model - if self.mode == self.Mode.DECODE: - kv_quant_attrs, output_indices = {}, 0 - for node in self.decoder.graph.nodes: - if node.op == "output": - for output in node.args[0]: - kv_quant_attrs[output_indices] = output.args[1:] - output_indices += 1 - break - - data.custom_annotation += ( - partial( - annotate_prefill_kv_output, - kv_quant_attrs=kv_quant_attrs, - ), - ) - # save logit's quantization attributes to meta self._save_logits_quant_attrs() + # LLM: propagate kv cache quantization attributes for prefill model + if not self.apply_embedding: + if self.mode == self.Mode.DECODE: + kv_quant_attrs, output_indices = {}, 0 + for node in self.decoder.graph.nodes: + if node.op == "output": + for output in node.args[0]: + kv_quant_attrs[output_indices] = output.args[1:] + output_indices += 1 + break + + data.custom_annotation += ( + partial( + annotate_prefill_kv_output, + kv_quant_attrs=kv_quant_attrs, + ), + ) + # MultiModal: save kv cache IO quantization attributes to requant kv cache from prefill output scale/zero_point to decode input scale/zero_point + else: + # save input kv cache's quantization attributes to meta + if self.mode == self.Mode.DECODE: + self._save_input_kv_cache_quant_attrs() + + # save output kv cache's quantization attributes to meta + if self.mode == self.Mode.PREFILL: + self._save_output_kv_cache_quant_attrs() + # setup quantized IO self.passes_job[TagQuantIO][QCOM_PASS_ACTIVATE_KEY] = True self.passes_job[TagQuantIO][QCOM_PASS_ARGS_KWARGS_DEFAULTS_KEY][ @@ -909,7 +1000,7 @@ def compile(self, request: Request): # noqa: C901 module=dict(zip(graph_names, [model.decoder for model in models])), inputs=dict(zip(graph_names, example_inputs)), compiler_specs=dict(zip(graph_names, data.compile_spec)), - constant_methods=self.decode.meta, + constant_methods={**self.prefill.meta, **self.decode.meta}, dep_table=dict(zip(graph_names, [model.dep_table for model in models])), passes_job=dict(zip(graph_names, [model.passes_job for model in models])), skip_node_op_set={"llama.fallback.default"},