diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index ca7297c998d..42ad12e1689 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -6529,70 +6529,55 @@ def test_qwen2_5(self): class TestExampleMultimodalityScript(TestQNN): - def test_smolvlm_500m_instruct(self): - if not self.required_envs(): - self.skipTest("missing required envs") - prompt = "Can you describe this image?" - cmds = [ - "python", - f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py", - "--artifact", - self.artifact_dir, - "--build_folder", - self.build_folder, - "--model", - self.model, - "--ip", - self.ip, - "--port", - str(self.port), - "--prompt", - prompt, - "--temperature", - "0", - "--decoder_model", - "smolvlm_500m_instruct", - "--model_mode", - "kv", - "--max_seq_len", - "128", - ] - if self.compile_only: - cmds.extend(["--compile_only"]) - elif self.device: - cmds.extend(["--device", self.device]) - if self.host: - cmds.extend(["--host", self.host]) - elif self.enable_x86_64: - cmds.extend(["--enable_x86_64"]) - if self.pre_gen_pte: - cmds.extend(["--pre_gen_pte", self.pre_gen_pte]) + @dataclass(frozen=True) + class MLLMSpecs: + max_seq_len: int + sm8650_token_rate: float + sm8750_token_rate: float + encoder_pte_size: float + text_embedding_pte_size: float + decoder_pte_size: float - p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) - with Listener((self.ip, self.port)) as listener: - conn = listener.accept() - p.communicate() - msg = json.loads(conn.recv()) - if "Error" in msg: - self.fail(msg["Error"]) - else: - if not self.enable_x86_64: - encoder_pte_size = msg["encoder_pte_size"] - text_embedding_pte_size = msg["text_embedding_pte_size"] - decoder_pte_size = msg["pte_size"] - self.assertLessEqual(encoder_pte_size, 110_000_000) # 110MB - self.assertLessEqual(text_embedding_pte_size, 100_000_000) # 100MB - self.assertLessEqual(decoder_pte_size, 400_000_000) # 400MB - print(f"Encoder PTE Size: {encoder_pte_size} bytes") - print(f"Text Embedding PTE Size: {text_embedding_pte_size} bytes") - print(f"Decoder PTE Size: {decoder_pte_size} bytes") + @dataclass(frozen=True) + class VLMSpecs(MLLMSpecs): + image_path: str + golden_image_feature: str - def test_internvl3_1b(self): - if not self.required_envs(): + # TODO: refactor to support different backends + def setUp(self): + self.vlm_specs = { + "smolvlm_500m_instruct": TestExampleMultimodalityScript.VLMSpecs( + max_seq_len=128, + sm8650_token_rate=50, + sm8750_token_rate=55, + encoder_pte_size=110_000_000, # 110MB + text_embedding_pte_size=100_000_000, # 100MB + decoder_pte_size=400_000_000, # 400MB + image_path="https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg", # New York Bay + golden_image_feature="city", + ), + "internvl3_1b": TestExampleMultimodalityScript.VLMSpecs( + max_seq_len=320, + sm8650_token_rate=11, + sm8750_token_rate=13, + encoder_pte_size=425_000_000, # 425MB + text_embedding_pte_size=300_000_000, # 300MB + decoder_pte_size=550_000_000, # 550 MB + image_path="http://images.cocodataset.org/val2017/000000039769.jpg", # Two cats lying on a blanket + golden_image_feature="cats", + ), + } + + def test_static_vlm(self): + if not self.required_envs([self.model_name]): self.skipTest("missing required envs") + vlm_specs: TestExampleMultimodalityScript.VLMSpecs = self.vlm_specs[ + self.model_name + ] prompt = "Can you describe this image?" 
+ image_path = vlm_specs.image_path cmds = [ "python", f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py", @@ -6608,14 +6593,16 @@ def test_internvl3_1b(self): str(self.port), "--prompt", prompt, + "--image_path", + image_path, "--temperature", "0", "--decoder_model", - "internvl3_1b", + f"{self.model_name}", "--model_mode", "kv", "--max_seq_len", - "320", + f"{vlm_specs.max_seq_len}", ] if self.compile_only: cmds.extend(["--compile_only"]) @@ -6636,17 +6623,41 @@ def test_internvl3_1b(self): if "Error" in msg: self.fail(msg["Error"]) else: + if not self.compile_only: + model_out = msg["result"][0] + self.assertTrue( + vlm_specs.golden_image_feature in model_out, + f"Expected Output contains feature: '{vlm_specs.golden_image_feature}' Actual Output: '{model_out}'", + ) + print(f"Image Path: {image_path}") + print(f"Query: {prompt}") + print(f"Answer: {model_out}") if not self.enable_x86_64: encoder_pte_size = msg["encoder_pte_size"] text_embedding_pte_size = msg["text_embedding_pte_size"] decoder_pte_size = msg["pte_size"] - self.assertLessEqual(encoder_pte_size, 425_000_000) # 425MB - self.assertLessEqual(text_embedding_pte_size, 300_000_000) # 300MB - self.assertLessEqual(decoder_pte_size, 550_000_000) # 550MB + self.assertLessEqual(encoder_pte_size, vlm_specs.encoder_pte_size) + self.assertLessEqual( + text_embedding_pte_size, vlm_specs.text_embedding_pte_size + ) + self.assertLessEqual(decoder_pte_size, vlm_specs.decoder_pte_size) print(f"Encoder PTE Size: {encoder_pte_size} bytes") print(f"Text Embedding PTE Size: {text_embedding_pte_size} bytes") print(f"Decoder PTE Size: {decoder_pte_size} bytes") + attr_name = f"{self.model.lower()}_token_rate" + if ( + not self.compile_only + and not self.enable_x86_64 + and hasattr(vlm_specs, attr_name) + ): + device_inference_speed = msg["inference_speed"] + expected_inference_speed = getattr(vlm_specs, attr_name) + print(f"Prompt Evaluation: {device_inference_speed} tokens/second") + self.assertGreaterEqual( + device_inference_speed, expected_inference_speed + ) + class TestExampleOssScript(TestQNN): def test_albert(self): diff --git a/examples/qualcomm/oss_scripts/llama/CMakeLists.txt b/examples/qualcomm/oss_scripts/llama/CMakeLists.txt index 612f898028c..a971ff1aacb 100644 --- a/examples/qualcomm/oss_scripts/llama/CMakeLists.txt +++ b/examples/qualcomm/oss_scripts/llama/CMakeLists.txt @@ -81,3 +81,60 @@ target_compile_options(qnn_llama_runner PUBLIC ${_common_compile_options}) set_target_properties( qnn_llama_runner PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'" ) + +# build qnn multimodal runner preprocess qnn runner src files for multimodal +set(_multimodal_runner__srcs ${_llama_runner__srcs}) +list(FILTER _multimodal_runner__srcs EXCLUDE REGEX ".*qnn_llama_runner.*") +list(FILTER _multimodal_runner__srcs EXCLUDE REGEX ".*runner/runner\.(cpp|h)") +list( + PREPEND + _multimodal_runner__srcs + ${CMAKE_CURRENT_LIST_DIR}/qnn_multimodal_runner.cpp + ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_runner.cpp + ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_runner.h + ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/encoder.cpp + ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/encoder.h + ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/embedding_runner.cpp + ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/embedding_runner.h + ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/embedding_processor.cpp + ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/embedding_processor.h + 
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_prompt_processor.cpp + ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_prompt_processor.h + ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_token_generator.cpp + ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_token_generator.h + ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_lhd_token_generator.cpp + ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_lhd_token_generator.h +) + +list(APPEND _multimodal_runner__srcs) + +# build qnn multimodal runner +add_executable(qnn_multimodal_runner ${_multimodal_runner__srcs}) +target_include_directories( + qnn_multimodal_runner PUBLIC ${_common_include_directories} +) +target_include_directories( + qnn_multimodal_runner + PUBLIC ${EXECUTORCH_ROOT}/extension/llm/tokenizers/include +) +target_compile_options(qnn_multimodal_runner PUBLIC ${_common_compile_options}) + +target_link_libraries( + qnn_multimodal_runner + qnn_executorch_backend + executorch_core + extension_data_loader + extension_flat_tensor + extension_llm_runner + extension_module + extension_tensor + gflags + custom_ops + quantized_ops_lib + quantized_kernels + tokenizers::tokenizers +) + +set_target_properties( + qnn_multimodal_runner PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'" +) diff --git a/examples/qualcomm/oss_scripts/llama/README.md b/examples/qualcomm/oss_scripts/llama/README.md index 80748642db2..896ffa6a7c1 100644 --- a/examples/qualcomm/oss_scripts/llama/README.md +++ b/examples/qualcomm/oss_scripts/llama/README.md @@ -4,7 +4,8 @@ **Video Tutorial:** [Build Along: Run LLMs Locally on Qualcomm Hardware Using ExecuTorch](https://www.youtube.com/watch?v=41PKDlGM3oU) -This file provides you the instructions to run LLM Decoder model with different parameters via Qualcomm HTP backend. We currently support the following models: +This file provides you the instructions to run LLM Decoder model and VLM model with different parameters via Qualcomm HTP backend. We currently support the following models: +- LLM 1. LLAMA2 Stories 110M 1. LLAMA3.2 1B @@ -20,7 +21,10 @@ This file provides you the instructions to run LLM Decoder model with different 1. QWEN3 0.6B / 1.7B 1. SmolLM2 135M 1. SmolLM3 3B - +- VLM + + 1. SmolVLM 500M + 1. InternVL3 1B We offer the following modes to execute the model: @@ -162,7 +166,7 @@ python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL ``` #### Phi4-mini-instruct -Default example using kv mode. +Default example using hybrid mode. ```bash python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --decoder_model phi_4_mini --model_mode hybrid --prefill_ar_len 128 --max_seq_len 1024 --prompt "I would like to learn python, could you teach me with a simple example?" --tasks wikitext --limit 1 ``` @@ -174,9 +178,9 @@ python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL ``` #### QWEN2.5 1.5B -Default example using kv mode +Default example using hybrid mode ```bash -python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --temperature 0 --model_mode hybrid --prefill_ar_len 128 --max_seq_len 1024 --prompt "I would like to learn python, could you teach me with a simple example?" 
--tasks wikitext --limit 1 +python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --temperature 0 --model_mode hybrid --prefill_ar_len 128 --max_seq_len 1024 --decoder_model qwen2_5-1_5b --prompt "I would like to learn python, could you teach me with a simple example?" --tasks wikitext --limit 1 ``` #### QWEN3 0.6B @@ -198,11 +202,149 @@ python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL ``` #### SmolLM3 -Default example using kv mode. +Default example using hybrid mode. ```bash python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --decoder_model smollm3-3b --model_mode hybrid --prefill_ar_len 128 --max_seq_len 1024 --prompt "I would like to learn python, could you teach me with a simple example?" --tasks wikitext --limit 1 ``` +## Multimodal Support + +### Overview + +Multimodal models extend LLMs by processing multiple input modalities (vision, audio, text) simultaneously. This framework provides a unified architecture for multimodal inference via the Qualcomm HTP backend. + +**Current Support Status:** +- **Vision-Language Models (VLM)**: Fully supported +- **Audio-Language Models (ALM)**: Coming soon + +### Multimodal Architecture + +For the general multimodal processing pipeline, please refer to [Multimodal Architecture](../../../../extension/llm/runner/README.md#multimodalrunner-architecture) + + +### Processing Pipeline + +Multimodal inference follows these key stages: + +1. **Modality-Specific Encoding** + - **Vision**: Images are processed through a vision encoder to generate visual embeddings + - **Audio**: Audio waveforms are processed through an audio encoder *(future support)* + - **Text**: Text prompts are tokenized and embedded + +2. **Embedding Fusion** + - All modality embeddings are projected to a common embedding dimension + - Embeddings are concatenated or fused according to the model's template + - Special tokens are inserted to mark modality boundaries + +3. **Unified Language Generation** + - The fused embeddings are fed into the language model decoder + - The decoder generates text autoregressively using the same execution modes as LLM models (KV Cache, Hybrid, Lookahead) + +--- + +## Vision-Language Model (VLM) Support + +Vision-Language Models (VLMs) combine computer vision and natural language processing to understand and generate text based on visual inputs. VLMs in this framework consist of: + +- **[Vision Encoder](model/vision_encoder.py)**: Processes images into visual embeddings (e.g., SigLIP for SmolVLM) + - **Projection Layer** (included in vision encoder): Aligns visual embeddings with the language model's embedding space +- **[Language Decoder](model/static_llama.py)**: Reuses the static llama model to generate text based on fused visual and text embeddings + +### Instructions + +#### SmolVLM 500M +Default example using hybrid mode. +```bash +python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --decoder_model smolvlm_500m_instruct --model_mode hybrid --prefill_ar_len 16 --max_seq_len 1024 --prompt "Can you describe this image?" --image_path "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" +``` + +#### InternVL3 1B +Default example using hybrid mode.
+```bash +python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --decoder_model internvl3_1b --model_mode hybrid --prefill_ar_len 32 --max_seq_len 1024 --prompt "Can you describe this image?" --image_path "http://images.cocodataset.org/val2017/000000039769.jpg" +``` + +### Specifying a Custom Image + +You can specify a custom image for VLM models using the `--image_path` flag: + +Take an image of the Statue of Liberty in New York Bay as an example: +- **HTTP/HTTPS URLs**: Direct links to images on the web + - Example: `https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg` +- **Local file paths**: Absolute or relative paths to image files on your system + - Example: [`./examples/qualcomm/oss_scripts/llama/assets/samples/images/Statue-of-Liberty-Island-New-York-Bay.png`](assets/samples/images/Statue-of-Liberty-Island-New-York-Bay.png) + +**Default behavior:** +If `--image_path` is not specified, the system will automatically use the default image URL defined in the model's configuration file (`encoder/encoder_config.py`). + +#### Image Preprocessing + +Each VLM model has specific preprocessing requirements defined in its configuration: + +```python +# In encoder/encoder_config.py +@dataclass(init=False, frozen=True) +class SmolVLMEncoder(VisionModalityConfig): + encoder_class = Idefics3VisionEncoder + img_seq_len = 64 + img_resized_h = 512 + img_resized_w = 512 + img_url = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" # Default image + quant_recipe = SmolVLM_Encoder_QuantRecipe +``` + +- **img_resized_h / img_resized_w**: Target resolution for the vision encoder +- **img_seq_len**: Number of visual tokens generated by the encoder + +The image is automatically: +1. Loaded from the specified URL or file path +2. Resized to the model's expected resolution and preprocessed by HuggingFace [processors](https://huggingface.co/docs/transformers/main/processors) + +### Using Pre-Generated PTE Files + +If you have already compiled a VLM model, you can run inference with pre-generated PTE files: + +```bash +python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --decoder_model smolvlm_500m_instruct --model_mode kv --max_seq_len 1024 --prompt "Can you describe this image?" --image_path "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" --pre_gen_pte ${FOLDER_TO_PRE_GEN_PTE} +``` + +### VLM Processing Details + +The VLM inference pipeline consists of: + +1. **Vision Encoding Phase** + - Input image is preprocessed (resize, normalize) + - Vision encoder generates visual embeddings: `[batch, img_seq_len, hidden_dim]` + - Visual embeddings are projected to match the language model dimension by the modality projector + +2. **Text Tokenization Phase** + - User prompt is tokenized into text tokens + - Text tokens are embedded: `[batch, text_seq_len, hidden_dim]` + +3. **Embedding Fusion Phase** + - Visual and text embeddings are concatenated according to the model's template + - Special tokens (e.g., ``, `<|fake_token_around_image|>`, ``) mark modality boundaries (see [tokenizer.py](tokenizer.py)) + + ```python + # Special tokens for Vision-Language Model + VLM_SPECIAL_TOKENS = { + "smolvlm_500m_instruct": { + "image_token": "", + "global_img": "", + "fake_wrap_start": "", + "fake_wrap_end": "", + }, + ... + } + ``` + - Final fused sequence: `[batch, img_seq_len + text_seq_len, hidden_dim]` + +4.
**Language Generation Phase** + - Fused embeddings are fed into the language decoder + - Autoregressive generation produces output tokens + - KV cache is updated for efficient subsequent token generation + + ### KV Cache update mechanism We use Smart Mask mechanisms for updating the key-value (KV) cache. diff --git a/examples/qualcomm/oss_scripts/llama/__init__.py b/examples/qualcomm/oss_scripts/llama/__init__.py index 4a8a017758b..963db6e2d0a 100644 --- a/examples/qualcomm/oss_scripts/llama/__init__.py +++ b/examples/qualcomm/oss_scripts/llama/__init__.py @@ -49,7 +49,7 @@ from executorch.examples.qualcomm.oss_scripts.llama.encoder.encoder_config import ( InternVL3Encoder, - LateFusionModalityConfig, + MultiModalityConfig, SmolVLMEncoder, ) from executorch.examples.qualcomm.oss_scripts.llama.model.static_llama import ( @@ -183,12 +183,12 @@ def format_value(v): def register_llm_model( name: str, - vision_encoder: Optional[LateFusionModalityConfig] = None, + vision_encoder: Optional[MultiModalityConfig] = None, ): def decorator(cls: Type[LLMModelConfig]): cls.decoder_model_version = DECODER_MODEL_VERSION[name] if vision_encoder is not None and issubclass( - vision_encoder, LateFusionModalityConfig + vision_encoder, MultiModalityConfig ): setattr(cls, VISION_ENCODER, vision_encoder) SUPPORTED_LLM_MODELS[name.lower()] = cls() diff --git a/examples/qualcomm/oss_scripts/llama/assets/samples/images/Statue-of-Liberty-Island-New-York-Bay.png b/examples/qualcomm/oss_scripts/llama/assets/samples/images/Statue-of-Liberty-Island-New-York-Bay.png new file mode 100644 index 00000000000..07beceeeb63 Binary files /dev/null and b/examples/qualcomm/oss_scripts/llama/assets/samples/images/Statue-of-Liberty-Island-New-York-Bay.png differ diff --git a/examples/qualcomm/oss_scripts/llama/dataset.py b/examples/qualcomm/oss_scripts/llama/dataset.py index 69af3fdf8d3..72f9e5d766a 100644 --- a/examples/qualcomm/oss_scripts/llama/dataset.py +++ b/examples/qualcomm/oss_scripts/llama/dataset.py @@ -5,6 +5,7 @@ # LICENSE file in the root directory of this source tree. import argparse +import warnings from typing import Callable, List, Optional from executorch.examples.qualcomm.oss_scripts.llama import LLMModelConfig @@ -18,7 +19,7 @@ ) from executorch.examples.qualcomm.oss_scripts.llama.encoder.encoder_config import ( - LateFusionModalityConfig, + MultiModalityConfig, VisionModalityConfig, ) from executorch.examples.qualcomm.oss_scripts.llama.tokenizer import TokenizerWrapper @@ -54,10 +55,16 @@ def _build_vision_dataset(self, config: VisionModalityConfig, prompt: str): Returns: tuple of pixel values tensors """ - # Load image from the specified URL - # Currently only supports loading image from URL - # TODO: allow user-specified image path - image = load_image(config.img_url) + # Load image from user-specified path (URL or local file) + # fall back to the default image URL if no image is provided. + image_path = self.control_args.image_path or config.img_url + if not self.control_args.image_path: + warnings.warn( + f"No image path/URL provided, using default image URL: {config.img_url}", + UserWarning, + stacklevel=1, + ) + image = load_image(image_path) # Process image with text prompt using HuggingFace processor # Some HF processors (e.g. 
InternVL3) need to pass text arg or it will cause error and process failed @@ -81,7 +88,7 @@ def _build_vision_dataset(self, config: VisionModalityConfig, prompt: str): def _build_dataset_for_encoder( self, - config: LateFusionModalityConfig, + config: MultiModalityConfig, prompt: str, ) -> Optional[tuple]: if issubclass(config, VisionModalityConfig): diff --git a/examples/qualcomm/oss_scripts/llama/decoder_utils.py b/examples/qualcomm/oss_scripts/llama/decoder_utils.py index 3d28ca2186b..6edcc408a96 100644 --- a/examples/qualcomm/oss_scripts/llama/decoder_utils.py +++ b/examples/qualcomm/oss_scripts/llama/decoder_utils.py @@ -4,14 +4,13 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import copy import getpass import logging import os import subprocess from collections import defaultdict, OrderedDict from dataclasses import dataclass -from typing import List, Optional, Tuple, Union +from typing import Callable, List, Optional, Tuple, Union import numpy as np import torch @@ -101,7 +100,7 @@ def __init__( # noqa: C901 max_seq_length: int, ar_len: int, use_kv_cache: bool, - example_input: Tuple[List[torch.Tensor]], + get_example_inputs: Callable, use_i64_token: bool, seq_mse_candidates: int, ): @@ -113,7 +112,7 @@ def __init__( # noqa: C901 self._model = model.to(self.device) self.ar_len = ar_len self._use_kv_cache = use_kv_cache - self.example_input = example_input + self.get_example_inputs = get_example_inputs self.max_seq_length = max_seq_length self.use_i64_token = use_i64_token self.seq_mse_candidates = seq_mse_candidates @@ -126,9 +125,7 @@ def _model_call(self, inps): kwargs["seq_mse_candidates"] = self.seq_mse_candidates all_logits = INFERENCE_REGISTRY[self._use_kv_cache]( - copy.deepcopy( - self.example_input - ), # Copy the example input to avoid KV cache pollution when testing PPL + self.get_example_inputs, inps, self._model, self._tokenizer, @@ -779,7 +776,7 @@ def _generate( @register_inference(use_kv_cache=True) def kv_inference( # noqa: C901 - example_input, + get_example_inputs: Callable, prompt: Union[str, list], module: torch.fx.GraphModule, tokenizer, @@ -801,7 +798,7 @@ def kv_inference( # noqa: C901 ] ) - _, atten_mask, _, k_caches, v_caches = example_input + _, atten_mask, _, k_caches, v_caches = get_example_inputs() # TODO: change criteria & support batch inputs if necessary all_pos = torch.arange(0, max_seq_len, 1, dtype=torch.int32).unsqueeze(0) @@ -920,7 +917,7 @@ def kv_inference( # noqa: C901 @register_inference(use_kv_cache=False) def prefill_inference( - example_input, + get_example_inputs: Callable, prompt: Union[str, list], module: torch.fx.GraphModule, tokenizer, @@ -939,7 +936,7 @@ def prefill_inference( ] ) - _, atten_mask = example_input + _, atten_mask = get_example_inputs() # TODO: change criteria & support batch inputs if necessary @@ -1001,7 +998,7 @@ def prefill_inference( def graph_module_inference( use_kv_cache: bool, - example_input, + get_example_inputs: Callable, module: torch.fx.GraphModule, tokenizer, ar_len=1, @@ -1034,7 +1031,7 @@ def graph_module_inference( kwargs["lookahead_config"] = lookahead_config INFERENCE_REGISTRY[use_kv_cache]( - example_input, + get_example_inputs, prompt, module, tokenizer, @@ -1054,7 +1051,7 @@ def graph_module_inference( max_seq_length=max_seq_len, ar_len=ar_len, use_kv_cache=use_kv_cache, - example_input=example_input, + get_example_inputs=get_example_inputs, use_i64_token=use_i64_token, seq_mse_candidates=seq_mse_candidates, 
) diff --git a/examples/qualcomm/oss_scripts/llama/encoder/__init__.py b/examples/qualcomm/oss_scripts/llama/encoder/__init__.py index 56798748023..f3d821bcd46 100644 --- a/examples/qualcomm/oss_scripts/llama/encoder/__init__.py +++ b/examples/qualcomm/oss_scripts/llama/encoder/__init__.py @@ -6,7 +6,7 @@ from executorch.examples.qualcomm.oss_scripts.llama.encoder.encoder_config import ( InternVL3Encoder, - LateFusionModalityConfig, + MultiModalityConfig, SmolVLMEncoder, VisionModalityConfig, ) @@ -20,7 +20,7 @@ "EncoderQuantRecipe", "InternVL3Encoder", "InternVL3_Encoder_QuantRecipe", - "LateFusionModalityConfig", + "MultiModalityConfig", "SmolVLMEncoder", "SmolVLM_Encoder_QuantRecipe", "VisionModalityConfig", diff --git a/examples/qualcomm/oss_scripts/llama/encoder/encoder_config.py b/examples/qualcomm/oss_scripts/llama/encoder/encoder_config.py index 331894473af..b8e32904bbf 100644 --- a/examples/qualcomm/oss_scripts/llama/encoder/encoder_config.py +++ b/examples/qualcomm/oss_scripts/llama/encoder/encoder_config.py @@ -20,13 +20,10 @@ @dataclass(init=False, frozen=True) -class LateFusionModalityConfig(ABC): +class MultiModalityConfig(ABC): """ Base configuration class for late fusion modality encoders. - Late fusion refers to the approach where different modalities (vision, audio, text) - are processed separately by their respective encoders and then fused at a later stage. - Attributes: encoder_class: The encoder class that implements the modality processing. quant_recipe: Quantization recipe for optimizing the encoder. @@ -41,7 +38,7 @@ def create_encoder(self, config): @dataclass(init=False, frozen=True) -class VisionModalityConfig(LateFusionModalityConfig): +class VisionModalityConfig(MultiModalityConfig): """ Base config for vision modality encoders. 
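The `MultiModalityConfig` rename above feeds into the `register_llm_model` decorator shown earlier in `__init__.py`. The following is a minimal, self-contained sketch of that registration pattern under simplified assumptions: the registry name, decorator signature, and config field names mirror the diff, while `MyVLMConfig`, the default values, and the placeholder URL are hypothetical stand-ins rather than project code.

```python
# Sketch of the modality-config registration pattern touched by the rename
# (LateFusionModalityConfig -> MultiModalityConfig). Stand-ins, not project code.
from dataclasses import dataclass
from typing import Dict, Optional, Type


@dataclass(init=False, frozen=True)
class MultiModalityConfig:
    """Base config for modality encoders (vision today, audio planned)."""

    encoder_class: type = object  # stand-in for the real encoder module class


@dataclass(init=False, frozen=True)
class VisionModalityConfig(MultiModalityConfig):
    img_seq_len: int = 64
    img_resized_h: int = 512
    img_resized_w: int = 512
    img_url: str = "https://example.com/default.jpg"  # hypothetical default image


class LLMModelConfig:
    """Stand-in for the decoder model config base class."""


SUPPORTED_LLM_MODELS: Dict[str, LLMModelConfig] = {}


def register_llm_model(
    name: str, vision_encoder: Optional[Type[MultiModalityConfig]] = None
):
    """Register a decoder config and optionally attach a vision-encoder config."""

    def decorator(cls: Type[LLMModelConfig]) -> Type[LLMModelConfig]:
        if vision_encoder is not None and issubclass(
            vision_encoder, MultiModalityConfig
        ):
            cls.vision_encoder = vision_encoder  # attached only for multimodal models
        SUPPORTED_LLM_MODELS[name.lower()] = cls()
        return cls

    return decorator


@register_llm_model("my_vlm", vision_encoder=VisionModalityConfig)
class MyVLMConfig(LLMModelConfig):
    pass


print(SUPPORTED_LLM_MODELS["my_vlm"].vision_encoder.img_seq_len)  # 64
```

A usage note: because the encoder config is attached via `setattr` only when provided, text-only decoders simply lack the attribute, which is what checks like `hasattr(decoder_model_config, VISION_ENCODER)` in `llama.py` rely on.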
diff --git a/examples/qualcomm/oss_scripts/llama/eval_llama_qnn.py b/examples/qualcomm/oss_scripts/llama/eval_llama_qnn.py index 53f359d2268..46a7e26d58a 100644 --- a/examples/qualcomm/oss_scripts/llama/eval_llama_qnn.py +++ b/examples/qualcomm/oss_scripts/llama/eval_llama_qnn.py @@ -337,7 +337,7 @@ def eval_llm(args): logging.info("Observers added, starting calibration...") graph_module_inference( use_kv_cache=False, - get_example_inputs=lambda use_kv_cache=False: inputs, + get_example_inputs=lambda: inputs, module=model, tokenizer=tokenizer, ar_len=args.max_seq_len, @@ -358,7 +358,7 @@ def eval_llm(args): # graph_module_inference( # use_kv_cache=False, - # get_example_inputs=lambda use_kv_cache=False: inputs, + # get_example_inputs=lambda: inputs, # module=model, # tokenizer=tokenizer, # ar_len=args.max_seq_len, diff --git a/examples/qualcomm/oss_scripts/llama/llama.py b/examples/qualcomm/oss_scripts/llama/llama.py index 1267b1ac78b..bddebdc6387 100755 --- a/examples/qualcomm/oss_scripts/llama/llama.py +++ b/examples/qualcomm/oss_scripts/llama/llama.py @@ -36,6 +36,7 @@ TEXT_EMBEDDING_GRAPH_NAMES, TEXT_ENCODER, VISION_ENCODER, + VISION_ENCODER_INPUT_FILENAME, ) from executorch.examples.qualcomm.oss_scripts.llama.decoder_utils import ( QnnRunnerEvalWrapper, @@ -45,14 +46,12 @@ MultiModalManager, next_power_of_two, ) - from executorch.examples.qualcomm.utils import ( make_output_dir, setup_common_args_and_variables, SimpleADB, ) - try: from lm_eval.evaluator import simple_evaluate except ImportError: @@ -163,38 +162,9 @@ def inference( else f"{args.artifact}/{pte_filenames[TEXT_DECODER]}.pte" ) - # TODO: support multimodal runtime, we only check pte size for now - if is_modality: - if args.ip and args.port != -1: - encoder_pte_path = ( - f"{args.pre_gen_pte}/{pte_filenames[VISION_ENCODER]}.pte" - if args.pre_gen_pte - else f"{args.artifact}/{pte_filenames[VISION_ENCODER]}.pte" - ) - text_embedding_pte_path = ( - f"{args.pre_gen_pte}/{pte_filenames[TEXT_EMBEDDING]}.pte" - if args.pre_gen_pte - else f"{args.artifact}/{pte_filenames[TEXT_EMBEDDING]}.pte" - ) - # Prepare validation results for CI system - validation_results = { - "pte_size": os.path.getsize(pte_path), - "encoder_pte_size": os.path.getsize(encoder_pte_path), - "text_embedding_pte_size": os.path.getsize(text_embedding_pte_path), - } - with Client((args.ip, args.port)) as conn: - conn.send(json.dumps(validation_results)) - else: - logging.info("Multimodal runtime support is currently under development.") - logging.info( - "Detected vision/audio encoder in model config. Exiting process safely." 
- ) - exit(0) - return None - # For decoder-only models, enable accuracy evaluation using perplexity # TODO: Add support for multimodal accuracy evaluation (e.g., VLM) - if args.run_lm_eval: + if not is_modality and args.run_lm_eval: # Generate the eval wrapper eval_wrapper = QnnRunnerEvalWrapper( args=args, @@ -291,19 +261,51 @@ def post_process(): qnn_sdk = os.getenv("QNN_SDK_ROOT") target = "x86_64-linux-clang" - runner_cmd = " ".join( - [ - f"export LD_LIBRARY_PATH={qnn_sdk}/lib/{target}/:{args.build_folder}/lib &&", - f"./{args.build_folder}/examples/qualcomm/oss_scripts/llama/qnn_llama_runner", - f"--decoder_model_version {decoder_model_config.decoder_model_version}", - f"--tokenizer_path {runtime_tokenizer_path}", - f"--model_path {pte_path}", - f"--seq_len {seq_len}", - f"--output_path {args.artifact}/outputs/outputs.txt", - f"--performance_output_path {args.artifact}/{performance_output_path}", - runner_args, - ] - ) + if not is_modality: + runner_cmd = " ".join( + [ + f"export LD_LIBRARY_PATH={qnn_sdk}/lib/{target}/:{args.build_folder}/lib &&", + f"./{args.build_folder}/examples/qualcomm/oss_scripts/llama/qnn_llama_runner", + f"--decoder_model_version {decoder_model_config.decoder_model_version}", + f"--tokenizer_path {runtime_tokenizer_path}", + f"--model_path {pte_path}", + f"--seq_len {seq_len}", + f"--output_path {args.artifact}/outputs/outputs.txt", + f"--performance_output_path {args.artifact}/{performance_output_path}", + runner_args, + ] + ) + else: + # x86 emulator is intended for CI and not performance. Check only the first few tokens. + # For multimodal models, use 128 tokens (vs 16 for text-only) due to longer sequence length required for modality embeddings. + seq_len = min(seq_len, 128) + encoder_pte_path = ( + f"{args.pre_gen_pte}/{pte_filenames[VISION_ENCODER]}.pte" + if args.pre_gen_pte + else f"{args.artifact}/{pte_filenames[VISION_ENCODER]}.pte" + ) + text_embedding_pte_path = ( + f"{args.pre_gen_pte}/{pte_filenames[TEXT_EMBEDDING]}.pte" + if args.pre_gen_pte + else f"{args.artifact}/{pte_filenames[TEXT_EMBEDDING]}.pte" + ) + runner_cmd = " ".join( + [ + f"export LD_LIBRARY_PATH={qnn_sdk}/lib/{target}/:{args.build_folder}/lib &&", + f"./{args.build_folder}/examples/qualcomm/oss_scripts/llama/qnn_multimodal_runner", + f"--decoder_model_version {decoder_model_config.decoder_model_version}", + f"--tokenizer_path {runtime_tokenizer_path}", + f"--decoder_path {pte_path}", + f"--encoder_path {encoder_pte_path}", + f"--embedding_path {text_embedding_pte_path}", + f"--image_path {args.artifact}/{VISION_ENCODER_INPUT_FILENAME}.raw", + f"--seq_len {seq_len}", + f"--output_path {args.artifact}/outputs/outputs.txt", + f"--performance_output_path {args.artifact}/{performance_output_path}", + runner_args, + ] + ) + subprocess.run( runner_cmd, shell=True, @@ -312,36 +314,80 @@ def post_process(): ) post_process() else: - runner_cmd = " ".join( - [ - f"cd {workspace} &&", - f"./qnn_llama_runner", - f"--decoder_model_version {decoder_model_config.decoder_model_version}", - f"--tokenizer_path {os.path.basename(runtime_tokenizer_path)}", - f"--model_path {pte_filenames[TEXT_DECODER]}.pte", - f"--seq_len {seq_len}", - "--output_path outputs/outputs.txt", - f"--performance_output_path {performance_output_path}", - "--shared_buffer", - runner_args, - ] - ) + if not is_modality: + runner_cmd = " ".join( + [ + f"cd {workspace} &&", + f"./qnn_llama_runner", + f"--decoder_model_version {decoder_model_config.decoder_model_version}", + f"--tokenizer_path 
{os.path.basename(runtime_tokenizer_path)}", + f"--model_path {pte_filenames[TEXT_DECODER]}.pte", + f"--seq_len {seq_len}", + "--output_path outputs/outputs.txt", + f"--performance_output_path {performance_output_path}", + "--shared_buffer", + runner_args, + ] + ) + else: + encoder_pte_path = ( + f"{args.pre_gen_pte}/{pte_filenames[VISION_ENCODER]}.pte" + if args.pre_gen_pte + else f"{args.artifact}/{pte_filenames[VISION_ENCODER]}.pte" + ) + text_embedding_pte_path = ( + f"{args.pre_gen_pte}/{pte_filenames[TEXT_EMBEDDING]}.pte" + if args.pre_gen_pte + else f"{args.artifact}/{pte_filenames[TEXT_EMBEDDING]}.pte" + ) + runner_cmd = " ".join( + [ + f"cd {workspace} &&", + f"./qnn_multimodal_runner", + f"--decoder_model_version {decoder_model_config.decoder_model_version}", + f"--tokenizer_path {os.path.basename(runtime_tokenizer_path)}", + f"--decoder_path {pte_filenames[TEXT_DECODER]}.pte", + f"--encoder_path {pte_filenames[VISION_ENCODER]}.pte", + f"--embedding_path {pte_filenames[TEXT_EMBEDDING]}.pte", + f"--image_path {VISION_ENCODER_INPUT_FILENAME}.raw", + f"--seq_len {seq_len}", + "--output_path outputs/outputs.txt", + f"--performance_output_path {performance_output_path}", + "--shared_buffer", + runner_args, + ] + ) + adb = SimpleADB( qnn_sdk=os.getenv("QNN_SDK_ROOT"), build_path=f"{args.build_folder}", - pte_path=pte_path, + pte_path=( + pte_path + if not is_modality + else [pte_path, encoder_pte_path, text_embedding_pte_path] + ), workspace=workspace, device_id=args.device, host_id=args.host, soc_model=args.model, shared_buffer=True, target=args.target, - runner=f"examples/qualcomm/oss_scripts/llama/qnn_llama_runner", + runner=( + f"examples/qualcomm/oss_scripts/llama/qnn_llama_runner" + if not is_modality + else f"examples/qualcomm/oss_scripts/llama/qnn_multimodal_runner" + ), ) # No pregen inputs, input_list is not required if not args.skip_push: - adb.push(inputs=[], files=[runtime_tokenizer_path]) + # Always use image from artifact folder since that's where it's saved during preprocessing + # regardless of whether pre_gen_pte is used (pre_gen_pte only applies to .pte model files) + image_path = f"{args.artifact}/{VISION_ENCODER_INPUT_FILENAME}.raw" + adb.push( + inputs=[], + files=[runtime_tokenizer_path] + ([image_path] if is_modality else []), + ) adb.execute(custom_runner_cmd=runner_cmd) adb.pull(output_path=args.artifact, callback=post_process) @@ -358,6 +404,16 @@ def post_process(): "inference_speed": inference_speed, "pte_size": os.path.getsize(pte_path), } + + # Add multimodal-specific metrics if applicable + if is_modality: + validation_results.update( + { + "encoder_pte_size": os.path.getsize(encoder_pte_path), + "text_embedding_pte_size": os.path.getsize(text_embedding_pte_path), + } + ) + with Client((args.ip, args.port)) as conn: conn.send(json.dumps(validation_results)) else: @@ -526,6 +582,13 @@ def _build_parser(): type=int, ) + parser.add_argument( + "--image_path", + help="Path to the image file for multimodal language models (MLLM). If not specified, the default image from encoder/encoder_config.py will be used. The image should be preprocessed and saved in raw binary format.", + default=None, + type=str, + ) + parser.add_argument("-v", "--verbose", action="store_true") return parser @@ -588,6 +651,17 @@ def export_llama(args) -> None: args.prompt, chat_template ) + # TODO: Implement multi-turn conversation support for multimodal models (vision/audio). 
+ assert ( + not ( + hasattr(decoder_model_config, VISION_ENCODER) + or hasattr(decoder_model_config, AUDIO_ENCODER) + ) + ) or (len(args.prompt) <= 1), ( + "Multimodal models currently do not support multi-turn. " + "Please set `--prompt` to 1 or switch to a unimodal (text-only) decoder." + ) + if args.pre_gen_pte: inference( args, decoder_model_config, pte_filenames, runtime_tokenizer_path, tokenizer diff --git a/examples/qualcomm/oss_scripts/llama/model/embedding.py b/examples/qualcomm/oss_scripts/llama/model/embedding.py index b9c470b470d..4956012baf0 100644 --- a/examples/qualcomm/oss_scripts/llama/model/embedding.py +++ b/examples/qualcomm/oss_scripts/llama/model/embedding.py @@ -35,5 +35,5 @@ def get_example_input(self): ), ) - def forward(self, input_ids): - return self.input_embedding_module(input_ids) + def forward(self, tokens): + return self.input_embedding_module(tokens) diff --git a/examples/qualcomm/oss_scripts/llama/model/vision_encoder.py b/examples/qualcomm/oss_scripts/llama/model/vision_encoder.py index 7ea3696799c..431e28a20d6 100644 --- a/examples/qualcomm/oss_scripts/llama/model/vision_encoder.py +++ b/examples/qualcomm/oss_scripts/llama/model/vision_encoder.py @@ -4,7 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -from typing import Optional, Tuple +from typing import Tuple import torch @@ -27,7 +27,7 @@ ) -# Custom implementation based on `transformers/models/idefics3/modeling_idefics3/Idefics3VisionEmbeddings.py` (Transformers v4.56.1) +# Custom implementation based on `transformers/models/idefics3/modeling_idefics3/Idefics3VisionEmbeddings.py` (Transformers v5.0.0rc1) # # Qualcomm optimization: # Precompute and register positional IDs as a buffer to avoid computation during forward passes. 
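The optimization noted in the comment above, precomputing position IDs once and registering them as a buffer, can be illustrated with a small standalone sketch. This is not the project's `Idefics3VisionEmbeddings` code; the module name and dimensions below are illustrative assumptions.

```python
# Illustrative sketch: with static image shapes, position ids are computed once
# in __init__ and registered as a (non-persistent) buffer, so forward() only
# performs an embedding lookup instead of recomputing indices per call.
import torch
import torch.nn as nn


class PatchPositionEmbedding(nn.Module):
    def __init__(self, image_size: int = 512, patch_size: int = 16, embed_dim: int = 768):
        super().__init__()
        num_patches = (image_size // patch_size) ** 2
        self.position_embedding = nn.Embedding(num_patches, embed_dim)
        position_ids = torch.arange(num_patches).unsqueeze(0)  # [1, num_patches]
        self.register_buffer("position_ids", position_ids, persistent=False)

    def forward(self, patch_embeds: torch.Tensor) -> torch.Tensor:
        # patch_embeds: [batch, num_patches, embed_dim]
        return patch_embeds + self.position_embedding(self.position_ids)


if __name__ == "__main__":
    module = PatchPositionEmbedding()
    out = module(torch.randn(1, (512 // 16) ** 2, 768))
    print(out.shape)  # torch.Size([1, 1024, 768])
```

Because the exported graph uses static image shapes, the ids never change between calls, so moving them out of `forward` keeps that index computation out of the compiled graph.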
@@ -119,62 +119,30 @@ def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: return embeddings -# Custom implementation based on `transformers/models/idefics3/modeling_idefics3/Idefics3VisionTransformer.py` (Transformers v4.56.1) +# Custom implementation based on `transformers/models/idefics3/modeling_idefics3/Idefics3VisionTransformer.py` (Transformers v5.0.0rc1) # # Qualcomm changes: # Assume the image is non-empty and skip attention mask propagation to the encoder class CustomIdefics3VisionTransformer(Idefics3PreTrainedModel): config: Idefics3VisionConfig - _supports_sdpa = True - _supports_flash_attn = True - _supports_flex_attn = True def __init__(self, config: Idefics3VisionConfig): super().__init__(config) - embed_dim = config.hidden_size - self.embeddings = Idefics3VisionEmbeddings(config) self.encoder = Idefics3Encoder(config) - self.patch_size = config.patch_size - self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) - self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" - - def forward( - self, - pixel_values, - patch_attention_mask: Optional[torch.BoolTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ): - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict + self.post_layernorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps ) + def forward(self, pixel_values): hidden_states = self.embeddings(pixel_values=pixel_values) encoder_outputs = self.encoder( inputs_embeds=hidden_states, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, ) last_hidden_state = encoder_outputs[0] last_hidden_state = self.post_layernorm(last_hidden_state) - if not return_dict: - return (last_hidden_state,) + encoder_outputs[1:] - return BaseModelOutput( last_hidden_state=last_hidden_state, hidden_states=encoder_outputs.hidden_states, @@ -182,7 +150,7 @@ def forward( ) -# Custom implementation based on `transformers/models/idefics3/modeling_idefics3.py` (Transformers v4.56.1). +# Custom implementation based on `transformers/models/idefics3/modeling_idefics3.py` (Transformers v5.0.0rc1). # # Qualcomm optimization: # - Dynamic shape support is removed; computations are now static for efficiency. @@ -277,7 +245,7 @@ def forward( return image_hidden_states -# Copy from transformers/models/internvl/modeling_internvl.py (Transformers v4.56.1). +# Copy from transformers/models/internvl/modeling_internvl.py (Transformers v5.0.0rc1). class InternVL3VisionEncoder(torch.nn.Module): def __init__( self, config: InternVLConfig, img_resized_h: int = 448, img_resized_w: int = 448 diff --git a/examples/qualcomm/oss_scripts/llama/qnn_multimodal_runner.cpp b/examples/qualcomm/oss_scripts/llama/qnn_multimodal_runner.cpp new file mode 100644 index 00000000000..0b3f2ee4ad1 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/qnn_multimodal_runner.cpp @@ -0,0 +1,396 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +/** + * @file + * + * This tool can run SmolVLM 500M, InternVL3 1B + * with Qualcomm AI Engine Direct. + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +// Model paths +DEFINE_string( + embedding_path, + "embedding.pte", + "Path to embedding model serialized in flatbuffer format."); +DEFINE_string( + encoder_path, + "encoder.pte", + "Path to vision encoder model serialized in flatbuffer format."); +DEFINE_string( + decoder_path, + "decoder.pte", + "Path to decoder model serialized in flatbuffer format."); + +// Tokenizer and output paths +DEFINE_string(tokenizer_path, "tokenizer.bin", "Tokenizer path."); +DEFINE_string( + output_path, + "outputs.txt", + "Executorch inference data output path."); +DEFINE_string( + performance_output_path, + "inference_speed.txt", + "Records inference speed. For CI purpose."); +DEFINE_string( + dump_logits_path, + "", + "If path is provided, program will dump all logits generated."); + +// Model configuration +DEFINE_string(decoder_model_version, "llama3", "The decoder model version."); +DEFINE_string( + prompt, + "Describe this image:", + "Text prompt for the multimodal model."); +DEFINE_string( + tokenized_prompt, + "", + "This is an alternative of passing prompts. Users could provide this in a raw file, with tokens saved in uint64 format."); +DEFINE_string( + image_path, + "", + "Path to input image file. If empty, text-only mode is used."); +DEFINE_string(system_prompt, "", "System prompt for the model."); + +// Generation parameters +DEFINE_double( + temperature, + 0.0f, + "Temperature; Default is 0.0f. 0 = greedy argmax sampling (deterministic). Lower temperature = more deterministic"); +DEFINE_int32( + seq_len, + 128, + "Total number of tokens to generate (prompt + output)."); +DEFINE_int32( + eval_mode, + 1, + "0: TokenGenerator(kv) / 1: HybridMode (prefill+kv) / 2: Lookahead Decoding"); + +DEFINE_bool( + shared_buffer, + false, + "Specifies to use shared buffers for zero-copy use case between the application and device/co-processor associated with the backend."); + +// Lookahead decoding parameters +DEFINE_int32( + ngram, + 0, + "[Lookahead Decoding] Size of n-grams used in lookahead process."); +DEFINE_int32( + window, + 0, + "[Lookahead Decoding] Number of future tokens to predict in each step."); +DEFINE_int32( + gcap, + 0, + "[Lookahead Decoding] Maximum number of speculations or candidate n-grams."); + +// Execution parameters +DEFINE_int32(num_iters, 1, "Total number of iterations to run."); + +std::vector CollectPrompts(int argc, char** argv) { + // Collect all prompts from command line, example usage: + // --prompt "prompt1" --prompt "prompt2" --prompt "prompt3" + std::vector prompts; + for (int i = 1; i < argc; i++) { + if (std::string(argv[i]) == "--prompt" && i + 1 < argc) { + prompts.push_back(argv[i + 1]); + i++; // Skip the next argument + } + } + return prompts; +} + +/** + * Special tokens structure for different models + */ +struct SpecialTokens { + std::string image_token; + std::string global_img; + std::string fake_wrap_start; + std::string fake_wrap_end; +}; + +/** + * Get special tokens based on decoder model version + */ +SpecialTokens get_special_tokens( + example::MultimodalDecoderModelVersion decoder_model_version) { + SpecialTokens tokens; + + switch (decoder_model_version) { + case example::MultimodalDecoderModelVersion:: + kSmolvlm: // smolvlm_500m_instruct + tokens.image_token = ""; + tokens.global_img = ""; + tokens.fake_wrap_start = ""; + 
tokens.fake_wrap_end = ""; + break; + case example::MultimodalDecoderModelVersion::kInternvl3: // internvl3_1b + tokens.image_token = ""; + tokens.global_img = ""; + tokens.fake_wrap_start = ""; + tokens.fake_wrap_end = ""; + break; + default: + break; + } + + return tokens; +} + +/** + * Prepare multimodal token IDs by expanding image tokens + * This implements the logic from prepare_multimodal_token_ids in Python + */ +std::string prepare_multimodal_prompt( + const std::string& prompt, + int image_seq_len, + const SpecialTokens& specials) { + // Create image prompt with repeated image tokens + std::string image_prompt = specials.fake_wrap_start; + image_prompt += specials.global_img; + for (int i = 0; i < image_seq_len; ++i) { + image_prompt += specials.image_token; + } + image_prompt += specials.fake_wrap_end; + + // Replace single image token with expanded version + size_t pos = 0; + std::string expanded = prompt; + while ((pos = expanded.find(specials.image_token, pos)) != + std::string::npos) { + expanded.replace(pos, specials.image_token.size(), image_prompt); + pos += image_prompt.size(); + } + ET_LOG(Info, "Prompt after expanding image token: %s", expanded.c_str()); + + return expanded; +} + +/** + * Format prompt based on model version with multimodal token expansion + */ +std::string get_formatted_prompt( + const std::string& prompt, + const std::string& system_prompt, + example::MultimodalDecoderModelVersion decoder_model_version, + int32_t img_seq_len = 0) { + std::string formatted_prompt; + + // Get special tokens for this model + SpecialTokens specials = get_special_tokens(decoder_model_version); + + switch (decoder_model_version) { + case example::MultimodalDecoderModelVersion::kSmolvlm: + if (!system_prompt.empty()) { + formatted_prompt.append( + "<|start_header_id|>system<|end_header_id|>\n\n"); + formatted_prompt.append(system_prompt); + formatted_prompt.append("<|eot_id|>"); + } + formatted_prompt.append("<|im_start|>User:"); + formatted_prompt.append(specials.image_token); + formatted_prompt.append(prompt); + formatted_prompt.append("\nAssistant:"); + break; + case example::MultimodalDecoderModelVersion::kInternvl3: + if (!system_prompt.empty()) { + formatted_prompt.append("<|im_start|>system<|im_end|>\n\n"); + formatted_prompt.append(system_prompt); + formatted_prompt.append("<|im_end|>"); + } + formatted_prompt.append("<|im_start|>user:\n"); + formatted_prompt.append(specials.image_token); + formatted_prompt.append("\n"); + formatted_prompt.append(prompt); + formatted_prompt.append("<|im_end|>assistant\n"); + break; + default: + ET_CHECK_MSG(false, "unsupported VLM version"); + break; + } + + // Expand image tokens + formatted_prompt = + prepare_multimodal_prompt(formatted_prompt, img_seq_len, specials); + + return formatted_prompt; +} + +template +void start_multimodal_runner( + std::unique_ptr encoder_runner, + std::unique_ptr module, + std::unique_ptr embedding, + std::vector& prompts) { + ET_LOG(Info, "Starting multimodal runner"); + + bool use_tokenized_prompt = + gflags::GetCommandLineFlagInfoOrDie("tokenized_prompt").is_default ? 
false + : true; + + // Load image, run encoder forward pass, and set image hidden states if + // provided + bool has_image = !FLAGS_image_path.empty(); + + // Load encoder + if (encoder_runner->load() != executorch::runtime::Error::Ok) { + ET_LOG(Error, "Failed to load encoder"); + return; + } + + // Encode image from file + auto encode_result = + encoder_runner->encode_from_file(FLAGS_image_path.c_str()); + if (!encode_result.ok()) { + ET_LOG(Error, "Failed to encode image"); + return; + } + + auto image_hidden_states = encode_result.get(); + + // Create multimodal runner + example::MultimodalRunner runner( + std::move(module), + std::move(embedding), + FLAGS_decoder_model_version.c_str(), + FLAGS_decoder_path.c_str(), + FLAGS_tokenizer_path.c_str(), + FLAGS_dump_logits_path.c_str(), + FLAGS_performance_output_path.c_str(), + FLAGS_temperature, + FLAGS_eval_mode, + FLAGS_shared_buffer, + FLAGS_ngram, + FLAGS_window, + FLAGS_gcap, + std::make_unique(image_hidden_states)); + + auto decoder_model_version = runner.get_decoder_model_version(); + + // Prepare output buffer (similar to qnn_llama_runner.cpp) + std::vector buf; + buf.reserve(5 * FLAGS_seq_len); // assume each token is around 5 char + std::ofstream fout(FLAGS_output_path.c_str()); + + auto callback = [&](const std::string& piece) { + for (const char c : piece) { + buf.push_back(c); + } + }; + + // Configure generation + executorch::extension::llm::GenerationConfig config{ + true, + -1, + false, + FLAGS_seq_len, + static_cast(FLAGS_temperature), + 0, + 0}; + + // Get image sequence length from encoder + int32_t img_seq_len = encoder_runner->get_image_seq_len(); + if (use_tokenized_prompt) { + runner.generate_from_prompt_or_file( + FLAGS_tokenizer_path.c_str(), use_tokenized_prompt, config, callback); + } else { + // generate tokens & store inference output + for (int i = 0; i < FLAGS_num_iters; i++) { + for (size_t j = 0; j < prompts.size(); ++j) { + const auto& prompt = prompts[j]; + std::string formatted_prompt; + formatted_prompt = get_formatted_prompt( + prompt, + FLAGS_system_prompt, + decoder_model_version.get(), + img_seq_len); + runner.generate_from_prompt_or_file( + formatted_prompt.c_str(), use_tokenized_prompt, config, callback); + } + } + } + fout.write(buf.data(), buf.size()); + fout.close(); +} + +int main(int argc, char** argv) { + std::vector prompts = CollectPrompts(argc, argv); + gflags::ParseCommandLineFlags(&argc, &argv, true); + if (!gflags::GetCommandLineFlagInfoOrDie("prompt").is_default && + !gflags::GetCommandLineFlagInfoOrDie("tokenized_prompt").is_default) { + ET_CHECK_MSG(false, "Only provide prompt or tokenized_input but not both."); + } + if (!gflags::GetCommandLineFlagInfoOrDie("dump_logits_path").is_default && + FLAGS_eval_mode != 0) { + ET_CHECK_MSG( + false, "Only TokenGenerator(kv) mode is supported to dump all logits."); + } + ET_LOG(Info, "Embedding: %s", FLAGS_embedding_path.c_str()); + ET_LOG(Info, "Encoder: %s", FLAGS_encoder_path.c_str()); + ET_LOG(Info, "Decoder: %s", FLAGS_decoder_path.c_str()); + + // Create encoder runner + std::unique_ptr encoder_runner = + std::make_unique(FLAGS_encoder_path.c_str()); + + // load embedding + std::unique_ptr embedding = + std::make_unique( + FLAGS_embedding_path.c_str(), + executorch::extension::Module::LoadMode::MmapUseMlockIgnoreErrors); + + // load decoder + std::unique_ptr module = + std::make_unique( + FLAGS_decoder_path.c_str(), + executorch::extension::Module::LoadMode::MmapUseMlockIgnoreErrors); + + // Using 8bit as default since this meta is 
introduced with 16bit kv io + // support and older models only have 8bit kv io. + example::KvBitWidth kv_bitwidth = example::KvBitWidth::kWidth8; + if (module->method_names()->count("get_kv_io_bit_width") > 0) { + kv_bitwidth = static_cast( + module->get("get_kv_io_bit_width").get().toScalar().to()); + } + // Start runner with appropriate KV bitwidth + if (kv_bitwidth == example::KvBitWidth::kWidth8) { + start_multimodal_runner( + std::move(encoder_runner), + std::move(module), + std::move(embedding), + prompts); + } else if (kv_bitwidth == example::KvBitWidth::kWidth16) { + start_multimodal_runner( + std::move(encoder_runner), + std::move(module), + std::move(embedding), + prompts); + } else { + ET_CHECK_MSG( + false, + "Unsupported kv bitwidth: %ld", + static_cast(kv_bitwidth)); + } + + return 0; +} diff --git a/examples/qualcomm/oss_scripts/llama/runner/kv_manager.h b/examples/qualcomm/oss_scripts/llama/runner/kv_manager.h index aa355335b68..06fe88517a7 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/kv_manager.h +++ b/examples/qualcomm/oss_scripts/llama/runner/kv_manager.h @@ -163,6 +163,10 @@ class KVManager { return total_cache_size_; } + int64_t get_head_dim() const { + return metadata_.head_dim; + } + private: // Helper functions to rearrange and update key and value caches void rearrange_key(KVCache& k_cache, int32_t ar_len_dst); diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/embedding_processor.cpp b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/embedding_processor.cpp new file mode 100644 index 00000000000..1278a1df7d9 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/embedding_processor.cpp @@ -0,0 +1,150 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include + +using executorch::aten::Tensor; +using executorch::aten::TensorImpl; +using executorch::runtime::Error; +using executorch::runtime::MethodMeta; +using executorch::runtime::Result; +using executorch::runtime::TensorInfo; + +namespace example { + +EmbeddingProcessor::EmbeddingProcessor( + EmbeddingRunner* embedding_runner, + const std::string& method_name, + Metadata metadata) + : embedding_runner_(embedding_runner), + method_name_(method_name), + metadata_(metadata) { + input_toks_.size = metadata_.ar_len * sizeof(int64_t); + embeddings_.size = metadata_.ar_len * metadata_.embedding_dim * sizeof(float); + prompt_embeddings_.size = 0; // Will be set in prefill() +} + +void EmbeddingProcessor::init_io( + IMemAlloc* buffer_manager, + Result method_meta) { + input_tensors_.reserve(method_meta->num_inputs()); + output_tensors_.reserve(method_meta->num_outputs()); + + // [I]: input_tokens + Result input_toks = method_meta->input_tensor_meta(0); + input_toks_.data = + reinterpret_cast(buffer_manager->allocate(input_toks_.size)); + + input_toks_.tensor = std::make_unique( + input_toks->scalar_type(), + input_toks->sizes().size(), + const_cast(input_toks->sizes().data()), + input_toks_.data, + const_cast(input_toks->dim_order().data())); + input_tensors_.emplace_back(input_toks_.tensor.get()); + buffer_manager->add_memory_info( + input_toks_.data, input_toks_.size, input_toks.get()); + + // [O]: embeddings + Result embeddings = method_meta->output_tensor_meta(0); + embeddings_.data = + reinterpret_cast(buffer_manager->allocate(embeddings_.size)); + + embeddings_.tensor = std::make_unique( + embeddings->scalar_type(), + embeddings->sizes().size(), + const_cast(embeddings->sizes().data()), + embeddings_.data, + const_cast(embeddings->dim_order().data())); + output_tensors_.emplace_back(embeddings_.tensor.get()); + buffer_manager->add_memory_info( + embeddings_.data, embeddings_.size, embeddings.get()); + + inputs_.reserve(input_tensors_.size()); + for (auto& input_tensor : input_tensors_) { + inputs_.emplace_back(std::move(input_tensor)); + } +} + +void EmbeddingProcessor::update_prompt_embedding( + int32_t num_prompt_tokens, + int64_t prompt_pos) { + for (int i = 0; i < metadata_.ar_len; i++) { + if (prompt_pos + i < num_prompt_tokens) { + std::memcpy( + prompt_embeddings_.data + (prompt_pos + i) * metadata_.embedding_dim, + embeddings_.data + i * metadata_.embedding_dim, + metadata_.embedding_dim * sizeof(float)); + } + } +} + +void EmbeddingProcessor::prefill(const std::vector& prompt_tokens) { + int64_t prompt_pos = 0; + int32_t num_prompt_tokens = prompt_tokens.size(); + prompt_embeddings_.size = + num_prompt_tokens * metadata_.embedding_dim * sizeof(float); + + // Allocate memory using std::vector for smart pointer management + prompt_embeddings_buffer_.resize(num_prompt_tokens * metadata_.embedding_dim); + prompt_embeddings_.data = prompt_embeddings_buffer_.data(); + + // Create TensorImpl for prompt_embeddings_ with shape [1, num_prompt_tokens, + // dim] Store sizes and dim_order as member variables to keep them + // alive + prompt_embeddings_sizes_ = {1, num_prompt_tokens, metadata_.embedding_dim}; + prompt_embeddings_dim_order_ = {0, 1, 2}; + prompt_embeddings_.tensor = std::make_unique( + executorch::aten::ScalarType::Float, + prompt_embeddings_sizes_.size(), + prompt_embeddings_sizes_.data(), + prompt_embeddings_.data, + prompt_embeddings_dim_order_.data()); + + int num_iters = 1 + ((num_prompt_tokens - 1) / metadata_.ar_len); + + ET_CHECK_MSG( + 
embedding_runner_->set_outputs(method_name_, output_tensors_) == + executorch::runtime::Error::Ok, + "Failed to set output tensor for module %s", + method_name_.c_str()); + + for (int32_t i = 0; i < num_iters; ++i) { + prepare_io(prompt_tokens, prompt_pos); + + embedding_runner_->step(method_name_, inputs_); + + // Update prompt_embedding + update_prompt_embedding(num_prompt_tokens, prompt_pos); + + prompt_pos += metadata_.ar_len; + } +} + +void EmbeddingProcessor::prepare_io( + const std::vector& prompt_tokens, + int64_t prompt_pos) { + for (int i = 0; i < metadata_.ar_len; i++) { + // Prepare input token data + if (prompt_pos + i < prompt_tokens.size()) { + // Support CPU 4-bit embedding, which requires int64 input. + // However, for QNN embedding, only int32 input is needed. + // Therefore, we need to cast to the correct type to write the data. + if (metadata_.use_int64_token) { + input_toks_.data[i] = prompt_tokens[prompt_pos + i]; + } else { + int32_t* input_toks_ptr = reinterpret_cast(input_toks_.data); + input_toks_ptr[i] = static_cast(prompt_tokens[prompt_pos + i]); + } + } + } +} + +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/embedding_processor.h b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/embedding_processor.h new file mode 100644 index 00000000000..0ece8bf2d03 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/embedding_processor.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once +#include +#include +#include +#include +#include +#include + +namespace example { +/** + * @class EmbeddingProcessor + * @brief Class for processing prompts to generate embeddings using embedding + * runner. + */ +class EmbeddingProcessor { + public: + struct Metadata { + int32_t context_len; + int32_t ar_len; + int32_t vocab_size; + bool use_int64_token; + int32_t embedding_dim; + }; + + EmbeddingProcessor( + EmbeddingRunner* embedding_runner, + const std::string& method_name, + Metadata metadata); + + /** + * @brief Initialize I/O tensor and allocate I/O data buffer. + * @param buffer_manager Pointer to IMemAlloc instance. + * @param method_meta Method metadata. + */ + void init_io( + IMemAlloc* buffer_manager, + executorch::runtime::Result method_meta); + + void update_prompt_embedding(int32_t num_prompt_tokens, int64_t prompt_pos); + + /** + * Process prompt tokens to generate embeddings. + * @param prompt_tokens The text prompt tokens. Encoded by tokenizer. + * @param ar_len AR length for chunking. + * @return The embedding tensor result. + */ + void prefill(const std::vector& prompt_tokens); + + /** + * @brief Get total I/O size in bytes. + * @return Total I/O size in bytes. + */ + inline const size_t total_embedding_processor_io_size_in_bytes() const { + return input_toks_.size + embeddings_.size; + } + + inline const TensorStruct& get_prompt_embeddings() const { + return prompt_embeddings_; + } + + private: + /** + * @brief Fill in I/O buffers with prompt tokens. + * @param prompt_tokens Vector of prompt tokens. 
+ */ + void prepare_io( + const std::vector& prompt_tokens, + int64_t prompt_pos); + + EmbeddingRunner* embedding_runner_; + std::string method_name_; + + // metadata + Metadata metadata_; + + // inputs and outputs + TensorStruct input_toks_; + TensorStruct embeddings_; + TensorStruct prompt_embeddings_; + std::vector prompt_embeddings_buffer_; + std::vector prompt_embeddings_sizes_; + std::vector + prompt_embeddings_dim_order_; + + std::vector inputs_; + std::vector input_tensors_; + std::vector output_tensors_; +}; +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/embedding_runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/embedding_runner.cpp new file mode 100644 index 00000000000..bf1008e34b1 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/embedding_runner.cpp @@ -0,0 +1,71 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +using executorch::aten::Tensor; +using executorch::extension::Module; +using executorch::runtime::Error; +using executorch::runtime::EValue; +using executorch::runtime::Result; + +namespace example { + +EmbeddingRunner::EmbeddingRunner(Module* module) : module_(module) {} + +Result EmbeddingRunner::step( + const std::string& method_name, + std::vector& inputs) { + // Execute embedding module + Result> outputs_res = + module_->execute(method_name, inputs); + + ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error()); + ET_CHECK_MSG( + outputs_res.get()[0].isTensor(), + "Non Tensor Output returned from executing Token Embedding"); + + // Get the embedding tensor from result + return outputs_res.get()[0].toTensor(); +} + +Error EmbeddingRunner::set_outputs( + const std::string& method_name, + std::vector output_values) { + for (size_t i = 0; i < output_values.size(); ++i) { + ET_CHECK_OK_OR_RETURN_ERROR( + module_->set_output(method_name, output_values[i], i)); + } + return Error::Ok; +} + +Error EmbeddingRunner::load(const std::vector& method_names) { + if (is_method_loaded(method_names)) { + return Error::Ok; + } + for (const std::string& method_name : method_names) { + ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(method_name)); + } + return Error::Ok; +} + +bool EmbeddingRunner::is_method_loaded( + const std::vector& method_names) { + bool method_loaded = true; + for (const std::string& method_name : method_names) { + method_loaded &= module_->is_method_loaded(method_name); + } + return method_loaded; +} + +bool EmbeddingRunner::is_loaded() const { + return module_ != nullptr && module_->is_loaded(); +} + +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/embedding_runner.h b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/embedding_runner.h new file mode 100644 index 00000000000..d5155a45252 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/embedding_runner.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
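The EmbeddingRunner shown above mirrors DecoderRunner: outputs are bound once with set_outputs() and each step() call executes a named method on preallocated I/O. A hedged usage sketch follows, assuming the runner wraps an already constructed executorch::extension::Module and that the inputs and output tensors were prepared by the caller (as EmbeddingProcessor::prefill does); run_token_embedding is an illustrative helper and the element types are inferred from the using-declarations above, not spelled out in this patch.

#include <string>
#include <vector>
// plus the module and embedding_runner headers introduced by this patch

executorch::runtime::Error run_token_embedding(
    example::EmbeddingRunner& runner,
    const std::string& method_name,
    std::vector<executorch::runtime::EValue>& inputs,
    std::vector<executorch::aten::Tensor>& output_tensors) {
  // Load the method once, bind caller-owned output buffers, then execute.
  auto err = runner.load({method_name});
  if (err != executorch::runtime::Error::Ok) {
    return err;
  }
  err = runner.set_outputs(method_name, output_tensors);
  if (err != executorch::runtime::Error::Ok) {
    return err;
  }
  // step() returns the first output of the method as a Tensor.
  auto embeddings = runner.step(method_name, inputs);
  return embeddings.error();
}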
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace example { + +/** + * @class EmbeddingRunner + * @brief Class for running embedding module, similar to DecoderRunner + */ +class EmbeddingRunner { + public: + EmbeddingRunner(executorch::extension::Module* module); + + /** + * Run embedding module with inputs to generate embeddings. + * @param method_name The method name to execute. + * @param inputs The inputs to the embedding module. + * @return The output embeddings tensor. + */ + executorch::runtime::Result step( + const std::string& method_name, + std::vector& inputs); + + executorch::runtime::Error set_outputs( + const std::string& method_name, + std::vector output_values); + + /** + * Load the Module for token embedding. + * @return The error code. + */ + executorch::runtime::Error load(const std::vector& method_names); + + /** + * Check if the required methods in the Module are loaded. + * @return True if the Module is loaded, false otherwise. + */ + bool is_method_loaded(const std::vector& method_names); + + /** + * @brief Check if embedding module is loaded + * @return true if module is loaded, false otherwise + */ + bool is_loaded() const; + executorch::extension::Module* module_; +}; + +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/encoder.cpp b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/encoder.cpp new file mode 100644 index 00000000000..91789f07a90 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/encoder.cpp @@ -0,0 +1,130 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include + +using executorch::aten::Tensor; +using executorch::extension::Module; +using executorch::extension::TensorPtr; +using executorch::runtime::Error; +using executorch::runtime::MethodMeta; +using executorch::runtime::Result; + +namespace example { + +EncoderRunner::EncoderRunner(const std::string& model_path) + : image_seq_len_(0) { + module_ = std::make_unique( + model_path, Module::LoadMode::MmapUseMlockIgnoreErrors); + ET_LOG(Info, "Creating encoder module: model_path=%s", model_path.c_str()); +} + +bool EncoderRunner::is_method_loaded() const { + return module_->is_method_loaded(kEncoderForwardName); +} + +Error EncoderRunner::load() { + if (is_method_loaded()) { + return Error::Ok; + } + + auto load_result = module_->load_method(kEncoderForwardName); + if (load_result != Error::Ok) { + ET_LOG(Error, "Failed to load encoder method"); + return load_result; + } + + // Get image sequence length from output metadata + Result method_meta = module_->method_meta(kEncoderForwardName); + if (!method_meta.ok()) { + ET_LOG(Error, "Failed to get encoder method metadata"); + return method_meta.error(); + } + + // vision embedding output shape: [1, seq_len, dim] + image_seq_len_ = method_meta->output_tensor_meta(0)->sizes()[1]; + ET_LOG(Info, "Encoder loaded successfully, image_seq_len=%d", image_seq_len_); + + return Error::Ok; +} + +int32_t EncoderRunner::get_image_seq_len() const { + return image_seq_len_; +} + +Result EncoderRunner::encode(TensorPtr& image_tensor) { + ET_CHECK_MSG(is_method_loaded(), "Encoder method not loaded"); + + auto tensor_ptr = image_tensor.get(); + ET_LOG(Info, "Encoding image tensor with numel: %zu", tensor_ptr->numel()); + + std::vector encoder_inputs; + encoder_inputs.emplace_back(*tensor_ptr); + + auto encoder_result = module_->forward(encoder_inputs); + ET_CHECK_MSG(encoder_result.ok(), "Encoder execution failed"); + + auto encoder_output = encoder_result.get(); + auto image_hidden_states = encoder_output[0].toTensor(); + ET_LOG(Info, "Encoder execution completed, got image hidden states"); + + return image_hidden_states; +} + +Result EncoderRunner::encode_from_file( + const std::string& image_file_path) { + ET_CHECK_MSG(is_method_loaded(), "Encoder method not loaded"); + + // Get input tensor metadata + Result method_meta = module_->method_meta(kEncoderForwardName); + auto sizes_span = method_meta->input_tensor_meta(0)->sizes(); + + // Calculate total number of elements + int64_t num_elem = 1; + for (const auto& size : sizes_span) { + num_elem *= size; + } + + // Read image data from file + ET_LOG( + Info, + "Reading image from file: %s, num_elements=%ld", + image_file_path.c_str(), + num_elem); + std::ifstream file(image_file_path, std::ios::binary | std::ios::ate); + ET_CHECK_MSG( + file.is_open(), "Failed to open image file: %s", image_file_path.c_str()); + + // To prevent users from passing images that have not been + // resized to match the encoder input size. 
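encode_from_file expects a raw float32 blob whose byte size exactly matches the encoder's input tensor (numel * sizeof(float)); the check right below rejects images that were not pre-resized. A standalone sketch of that size computation and of writing such a blob; expected_raw_bytes and write_raw_image are illustrative helpers, and the real input shape comes from the method metadata rather than being hard-coded.

#include <cstdint>
#include <fstream>
#include <functional>
#include <numeric>
#include <string>
#include <vector>

// Expected raw-file size for an encoder input with the given sizes.
std::streamsize expected_raw_bytes(const std::vector<int32_t>& sizes) {
  int64_t numel = std::accumulate(
      sizes.begin(), sizes.end(), int64_t{1}, std::multiplies<int64_t>());
  return static_cast<std::streamsize>(numel * sizeof(float));
}

// Dump preprocessed (resized + normalized) pixel data so the size check
// in encode_from_file passes. `pixels` must already hold numel elements.
bool write_raw_image(const std::string& path, const std::vector<float>& pixels) {
  std::ofstream out(path, std::ios::binary);
  if (!out.is_open()) {
    return false;
  }
  out.write(
      reinterpret_cast<const char*>(pixels.data()),
      static_cast<std::streamsize>(pixels.size() * sizeof(float)));
  return out.good();
}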
+ std::streamsize file_size = file.tellg(); + std::streamsize expected_size = num_elem * sizeof(float); + ET_CHECK_MSG( + file_size == expected_size, + "Image file size mismatch: expected %ld bytes but got %ld bytes (file: %s)", + expected_size, + file_size, + image_file_path.c_str()); + + file.seekg(0, std::ios::beg); + std::vector buffer(num_elem); + file.read(reinterpret_cast(buffer.data()), expected_size); + file.close(); + + // Create tensor from buffer + TensorPtr tensor = executorch::extension::from_blob( + buffer.data(), + std::vector(sizes_span.begin(), sizes_span.end()), + executorch::aten::ScalarType::Float); + + // Encode the tensor + return encode(tensor); +} + +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/encoder.h b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/encoder.h new file mode 100644 index 00000000000..0e1becc05b6 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/encoder.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include + +namespace example { + +/** + * @class EncoderRunner + * @brief Class for running vision encoder to generate image hidden states. + */ +class EncoderRunner { + public: + /** + * @brief Constructor for EncoderRunner + * @param model_path Path to the encoder model PTE file + */ + explicit EncoderRunner(const std::string& model_path); + + /** + * @brief Check if the encoder method is loaded + * @return true if method is loaded, false otherwise + */ + bool is_method_loaded() const; + + /** + * @brief Load the encoder method + * @return Error status + */ + executorch::runtime::Error load(); + + /** + * @brief Get the image sequence length from encoder output metadata + * @return Image sequence length + */ + int32_t get_image_seq_len() const; + + /** + * @brief Encode image tensor to hidden states + * @param image_tensor Input image tensor (B, C, H, W) + * @return Result containing the image hidden states tensor + */ + executorch::runtime::Result encode( + executorch::extension::TensorPtr& image_tensor); + + /** + * @brief Encode image from raw file + * @param image_file_path Path to raw image file + * @return Result containing the image hidden states tensor + */ + executorch::runtime::Result encode_from_file( + const std::string& image_file_path); + + private: + std::unique_ptr module_; + inline static const std::string kEncoderForwardName = "forward"; + int32_t image_seq_len_; +}; + +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.cpp b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.cpp new file mode 100644 index 00000000000..5f0375d1c50 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.cpp @@ -0,0 +1,418 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
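Putting the encoder pieces above together, the intended call sequence is construct, load(), then encode_from_file(); image_seq_len is only meaningful after load() because it is read from the output metadata. A hedged sketch with placeholder paths; run_encoder_once is an illustrative helper, not part of this patch.

// Sketch only: paths are placeholders, includes and error handling abbreviated.
executorch::runtime::Error run_encoder_once() {
  example::EncoderRunner encoder("/data/local/tmp/vision_encoder.pte");
  auto err = encoder.load();
  if (err != executorch::runtime::Error::Ok) {
    return err;
  }
  // Only valid after load(), since it comes from the output tensor metadata.
  int32_t image_seq_len = encoder.get_image_seq_len();
  (void)image_seq_len;

  // The raw file must already match the encoder input (see the size check above).
  auto hidden_states = encoder.encode_from_file("/data/local/tmp/image.raw");
  return hidden_states.error();
}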
+ */ + +#include +#include +#include +using executorch::runtime::Result; + +namespace example { + +template +void MultimodalLhdTokenGenerator::prepare_io( + std::vector input_tokens, + std::vector input_pos) { + for (int i = 0; i < metadata_.ar_len; i++) { + if (i < input_tokens.size()) { + // Prepare pos data + this->input_pos_.data[i] = input_pos[i]; + } + } + + // Generate embeddings for the first metadata_.ar_len tokens + int num_tokens_to_process = + std::min(static_cast(input_tokens.size()), metadata_.ar_len); + std::vector tokens_to_process( + input_tokens.begin(), input_tokens.begin() + num_tokens_to_process); + + embedding_runner_->prefill(tokens_to_process); + const TensorStruct& text_embeddings = + embedding_runner_->get_prompt_embeddings(); + int64_t embedding_dim = text_embeddings.tensor->size(2); + + // Copy embedding to input buffer from the left + std::memcpy( + this->input_embedding_.data, + text_embeddings.data, + num_tokens_to_process * embedding_dim * sizeof(float)); + + // If metadata_.ar_len > input_tokens.size(), initialize remaining part to 0 + if (metadata_.ar_len > num_tokens_to_process) { + int remaining_tokens = metadata_.ar_len - num_tokens_to_process; + std::memset( + this->input_embedding_.data + num_tokens_to_process * embedding_dim, + 0, + remaining_tokens * embedding_dim * sizeof(float)); + } +} + +template +void MultimodalLhdTokenGenerator::init_attention_mask(int32_t n_past) { + std::vector attention_map; + attention_map.reserve(metadata_.ar_len); + // Initialize attention mask with current position + for (int i = 0; i < metadata_.window; ++i) { + attention_map.push_back(i - 1); + } + for (int i = 1; i < metadata_.ngram - 1; ++i) { + for (int j = 0; j < metadata_.window; ++j) { + attention_map.push_back((i - 1) * metadata_.window + j); + } + } + for (int g = 0; g < metadata_.gcap; g++) { + for (int j = 0; j < metadata_.ngram - 1; j++) { + if (j == 0) + attention_map.push_back(0); + else + attention_map.push_back( + (metadata_.window + g) * (metadata_.ngram - 1) + j - 1); + } + } + + this->kv_manager_->init_attention_mask( + this->attention_mask_.data, attention_map, metadata_.ar_len, n_past); + // Initialize window attention mask with current position + if (metadata_.cache_mode == CacheMode::HybridCache) { + this->kv_manager_->init_attention_mask( + this->window_attention_mask_.data, + attention_map, + metadata_.ar_len, + n_past, + metadata_.sliding_window, + position_offset_); + } +} + +template +void MultimodalLhdTokenGenerator::init_lookahead_branch( + const std::vector& tokens) { + for (int i = 0; i < metadata_.ngram - 1; ++i) { + for (int j = 0; j < metadata_.window; ++j) { + // there are different ways to init these tokens + if (0) { + // initialize with a sequence of increasing numbers + lhd_branch_[i][j] = 1000 + j; + } else { + // initialize with the random token from prompt + lhd_branch_[i][j] = tokens[1 + rand() % (tokens.size() - 1)]; + } + } + } + is_lhd_branch_initialized_ = true; +} + +template +void MultimodalLhdTokenGenerator::init_verification_branch( + uint64_t cur_token) { + const int g_cur = ngrams_pool_.cnt[cur_token]; + + v_branch_.resize(g_cur); + for (int g = 0; g < g_cur; g++) { + v_branch_[g].active = true; + v_branch_[g].tokens.resize(metadata_.ngram); + v_branch_[g].i_batch.resize(metadata_.ngram); + v_branch_[g].seq_id = metadata_.window + 1 + g; + v_branch_[g].i_batch[0] = 0; + v_branch_[g].tokens[0] = cur_token; + } + + for (int j = 0; j < metadata_.ngram - 1; j++) { + for (int g = 0; g < g_cur; g++) { + const int idx = 
cur_token * (metadata_.ngram - 1) * metadata_.gcap + + g * (metadata_.ngram - 1); + const int32_t t = ngrams_pool_.tokens[idx + j]; + v_branch_[g].tokens[j + 1] = t; + v_branch_[g].i_batch[j + 1] = j + 1; + } + } +} + +template +void MultimodalLhdTokenGenerator::update_ngrams_pool() { + std::vector ngram(metadata_.ngram - 1); + // n-gram pool generation + for (int f = 0; f < metadata_.window; ++f) { + const int ft = lhd_branch_prev_[f]; // first token of the n-gram + + for (int j = 0; j < metadata_.ngram - 1; ++j) { + ngram[j] = lhd_branch_[j][f]; + } + + // filter-out repeating n-grams + { + bool is_unique = true; + for (int k = 0; k < ngrams_pool_.cnt[ft]; ++k) { + // calculate the related idx by the first n-gram token + const int idx = ft * (metadata_.ngram - 1) * metadata_.gcap + + k * (metadata_.ngram - 1); + + bool is_match = true; + for (int j = 0; j < metadata_.ngram - 1; ++j) { + if (ngrams_pool_.tokens[idx + j] != ngram[j]) { + is_match = false; + break; + } + } + + // if n-gram match all, discard one of them + if (is_match) { + is_unique = false; + break; + } + } + if (!is_unique) { + continue; + } + } + + const int head = ngrams_pool_.head[ft]; + const int idx = ft * (metadata_.ngram - 1) * metadata_.gcap + + head * (metadata_.ngram - 1); + + for (int i = 0; i < metadata_.ngram - 1; i++) { + // update the n-gram pool with new n-gram + ngrams_pool_.tokens[idx + i] = ngram[i]; + } + + ngrams_pool_.cnt[ft] = + std::min(metadata_.gcap, (int32_t)ngrams_pool_.cnt[ft] + 1); + ngrams_pool_.head[ft] = (head + 1) % metadata_.gcap; + ngrams_pool_.n_total++; + } +} + +template +void MultimodalLhdTokenGenerator::update_lookahead_branch( + const executorch::aten::Tensor& logits_tensor) { + for (int i = 0; i < metadata_.window; i++) { + lhd_branch_prev_[i] = lhd_branch_[0][i]; + } + + for (int j = 0; j < metadata_.ngram - 2; j++) { + lhd_branch_[j] = lhd_branch_[j + 1]; + } + + // sample from the last level + for (int i = 0; i < metadata_.window; i++) { + size_t sample_idx = (metadata_.ngram - 2) * metadata_.window + i; + lhd_branch_[metadata_.ngram - 2][i] = + this->decoder_runner_->logits_to_token(logits_tensor, sample_idx); + } +} + +template +Result MultimodalLhdTokenGenerator::generate( + std::vector tokens, + int64_t start_pos, + int32_t seq_len, + std::function token_callback, + bool dump_logits) { + ET_CHECK_MSG( + !tokens.empty(), "Token generation loop shouldn't take empty tokens"); + // position in the sequence + int64_t pos = start_pos; + int64_t prev_pos; + // number of match tokens + int32_t n_accept{0}; + std::vector result_tokens; + uint64_t cur_token = tokens.back(); + uint64_t prev_token; + result_tokens.push_back(cur_token); + + // Manage the inputs of lookahead decoding + std::vector input_pos; + std::vector input_tokens; + input_tokens.reserve(metadata_.ar_len); + input_pos.reserve(metadata_.ar_len); + + // Rearrange KV cache first and initialize the input and output of KV cache + this->kv_manager_->rearrange_cache(metadata_.ar_len); + + // Initialize attention mask with pos + init_attention_mask(pos); + + // Initialize Lookahead branch at first generation + if (!is_lhd_branch_initialized_) { + ET_LOG(Info, "Initialize Lookahead branch"); + init_lookahead_branch(tokens); + } + + // Initialize the output of the module + ET_CHECK_MSG( + this->decoder_runner_->set_outputs( + this->method_name_, this->output_tensors_) == + executorch::runtime::Error::Ok, + "Failed to set output tensor for module %s", + this->method_name_.c_str()); + + // Generate tokens + while (pos < seq_len 
- 1) { + std::vector selected(metadata_.ar_len, false); + + input_tokens.clear(); + input_pos.clear(); + + // fill the first token of the first level + input_tokens.push_back(cur_token); + input_pos.push_back(pos); + + // fill the remaining WINDOW - 1 tokens for the first level + for (int i = 1; i < metadata_.window; ++i) { + input_tokens.push_back(lhd_branch_[0][i]); + input_pos.push_back(pos + i); + } + + // fill the rest of the levels + for (int i = 1; i < metadata_.ngram - 1; ++i) { + for (int j = 0; j < metadata_.window; ++j) { + input_tokens.push_back(lhd_branch_[i][j]); + input_pos.push_back(pos + i + j); + } + } + // Verification Branch Init + init_verification_branch(cur_token); + + for (int g = 0; g < v_branch_.size(); g++) { + for (int j = 0; j < metadata_.ngram - 1; j++) { + input_tokens.push_back(v_branch_[g].tokens[j + 1]); + input_pos.push_back(pos + j + 1); + } + } + + prepare_io(input_tokens, input_pos); + + // Run inference + auto logits_res = + this->decoder_runner_->step(this->method_name_, this->inputs_); + ET_CHECK_OK_OR_RETURN_ERROR(logits_res.error()); + executorch::aten::Tensor& logits_tensor = logits_res.get(); + prev_pos = pos; + + // verification branch seq-id + size_t seq_id_best = 0; + // max hit pos + size_t i_batch_best = 0; + + // Lookahead decoding and verification + for (int v = 0; v < metadata_.ngram; ++v) { + // Verification + int i_batch = 0; + if (v > 0) { + for (int g = 0; g < v_branch_.size(); g++) { + // record the best matched seq and pos + if (v_branch_[g].active) { + i_batch = v_branch_[g].i_batch[v]; + i_batch_best = i_batch; + seq_id_best = v_branch_[g].seq_id; + ++n_accept; + break; + } + } + if (i_batch == 0) { + break; + } + } + + size_t sample_idx; + if (seq_id_best == 0) + sample_idx = 0; + else + sample_idx = metadata_.window * (metadata_.ngram - 1) + + (seq_id_best - (metadata_.window + 1)) * (metadata_.ngram - 1) + + i_batch - 1; + + // vector selected set + selected[sample_idx] = true; + + prev_token = cur_token; + // sampler from logits all + this->stats_->on_sampling_begin(); + cur_token = + this->decoder_runner_->logits_to_token(logits_tensor, sample_idx); + this->stats_->on_sampling_end(); + result_tokens.push_back(cur_token); + pos++; + + // print the token as string, decode it with the Tokenizer object + token_callback( + ET_UNWRAP_TOKENIZER(this->tokenizer_->decode(prev_token, cur_token))); + + // data-dependent terminating condition: we have n_eos_ number of EOS + if (this->eos_ids_->count(cur_token) > 0) { + printf("\n"); + ET_LOG(Info, "\nReached to the end of generation"); + break; + } + + // if verify pass, check the next sample token until verifying failed + for (int g = 0; g < v_branch_.size(); g++) { + // update the n-gram active status + if (v_branch_[g].active) { + if (v == metadata_.ngram - 1) { + v_branch_[g].active = false; + } else { + if (cur_token != v_branch_[g].tokens[v + 1]) { + v_branch_[g].active = false; + } + } + } + } + + // only update n-grams pools and lookahead branch when v=0 + if (v == 0) { + // update lookahead branch + update_lookahead_branch(logits_tensor); + // update n-grams pool + update_ngrams_pool(); + } + } // end of verify loop + + if (pos > metadata_.context_len - metadata_.ar_len) { + printf("\n"); + ET_LOG(Info, "\nReached to the maximum sequence length"); + break; + } + // Update KV Cache with the output results + int32_t n_update = pos - prev_pos; + this->kv_manager_->update_cache( + metadata_.ar_len, prev_pos, n_update, selected); + + // Update attention mask with current position + 
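Each step of the loop above packs one AR-length batch: window slots for the first lookahead level (slot 0 holds the current token), (ngram - 2) * window slots for the remaining levels, then gcap verification branches of (ngram - 1) slots each, so ar_len works out to (window + gcap) * (ngram - 1), consistent with the position_offset_ initialization in the header below. A standalone sketch of the index arithmetic used when sampling from a verification branch; verification_sample_idx is illustrative only.

#include <cassert>
#include <cstdint>

// Flat position inside the AR-N batch of the v-th token (v >= 1) of
// verification branch g, matching the layout built in the loop above:
//   [window * (ngram - 1) lookahead slots][gcap branches of (ngram - 1) slots]
int64_t verification_sample_idx(
    int32_t window, int32_t ngram, int32_t g, int32_t v) {
  return window * (ngram - 1) // skip all lookahead levels
      + g * (ngram - 1)       // skip earlier verification branches
      + (v - 1);              // v-th token inside this branch (v starts at 1)
}

int main() {
  const int32_t window = 4, ngram = 3, gcap = 2;
  const int32_t ar_len = (window + gcap) * (ngram - 1); // 12 slots per step
  // Branch g uses seq_id = window + 1 + g; with i_batch = v the sample_idx
  // expression in generate() reduces to the helper above.
  assert(verification_sample_idx(window, ngram, /*g=*/0, /*v=*/1) == 8);
  assert(verification_sample_idx(window, ngram, gcap - 1, ngram - 1) == ar_len - 1);
  return 0;
}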
this->kv_manager_->update_attention_mask( + this->attention_mask_.data, metadata_.ar_len, prev_pos, n_update); + if (metadata_.cache_mode == CacheMode::HybridCache) { + this->kv_manager_->update_attention_mask( + this->window_attention_mask_.data, + metadata_.ar_len, + prev_pos, + n_update, + metadata_.sliding_window, + position_offset_); + } + + // data-dependent terminating condition: we have n_eos_ number of EOS + if (this->eos_ids_->count(cur_token) > 0) { + printf("\n"); + ET_LOG(Info, "\nReached to the end of generation"); + break; + } + } + ET_LOG( + Info, + "Lookahead Decoding: n_generated = %ld / n_accept = %d", + pos - start_pos, + n_accept); + + return pos - start_pos; +} + +// Explicit instantiations +template class MultimodalLhdTokenGenerator; +template class MultimodalLhdTokenGenerator; + +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.h b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.h new file mode 100644 index 00000000000..266542f1a72 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.h @@ -0,0 +1,173 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once +#include + +namespace example { + +/** + * @class MultimodalLhdTokenGenerator + * @brief Extended LhdTokenGenerator with multimodal embedding support + */ +template +class MultimodalLhdTokenGenerator + : public example::MultimodalTokenGenerator { + public: + struct Metadata { + int32_t context_len; + int64_t num_heads; + int64_t num_layers; + int32_t ar_len; + int32_t vocab_size; + bool use_int64_token; + int32_t ngram; + int32_t window; + int32_t gcap; + int sliding_window; + CacheMode cache_mode; + int32_t embedding_dim = 0; + }; + MultimodalLhdTokenGenerator( + tokenizers::Tokenizer* tokenizer, + EmbeddingProcessor* embedding_runner, + DecoderRunner* decoder_runner, + KVManager* kv_manager, + const std::string& forward_name, + std::unique_ptr>&& eos_ids, + Metadata metadata, + executorch::llm::Stats* stats) + : MultimodalTokenGenerator( + tokenizer, + embedding_runner, + decoder_runner, + kv_manager, + forward_name, + std::move(eos_ids), + typename MultimodalTokenGenerator::Metadata{ + metadata.context_len, + metadata.num_heads, + metadata.num_layers, + metadata.ar_len, + metadata.vocab_size, + metadata.use_int64_token, + metadata.sliding_window, + metadata.cache_mode, + metadata.embedding_dim}, + stats), + embedding_runner_(embedding_runner), + metadata_(metadata), + lhd_branch_(metadata.ngram - 1, std::vector(metadata.window)), + lhd_branch_prev_(metadata.window), + ngrams_pool_(metadata.vocab_size, metadata.ngram, metadata.gcap) { + ET_LOG( + Info, + "Use Lookahead decoding: ngram=%d, window=%d, gcap=%d", + metadata.ngram, + metadata.window, + metadata.gcap); + + // initialize position offset + position_offset_ = std::vector(metadata.ar_len); + int idx = 0; + // lookahead branches + for (int i = 0; i < metadata.ngram - 1; ++i) { + for (int j = 0; j < metadata.window; ++j) { + position_offset_[idx++] = i + j; + } + } + // verification branches + for (int i = 0; i < metadata.gcap; ++i) { + for (int j = 1; j < metadata.ngram; ++j) { + position_offset_[idx++] = j; + } + } + } + + ~MultimodalLhdTokenGenerator() = default; + + /** +    * @brief Generate 
tokens with lookahead decoding. +    * @param tokens Vector of input tokens. +    * @param start_pos Starting position for generation. +    * @param seq_len Length of the sequence to generate. +    * @param token_callback Callback function for generated tokens. +    * @return The number of tokens generated. +    */ + executorch::runtime::Result generate( + std::vector tokens, + int64_t start_pos, + int32_t seq_len, + std::function token_callback, + bool dump_logits) override; + + private: + /** + * @brief Fill in I/O buffers with prompt token and position. + * @param cur_token Current token. + * @param start_pos Starting position. + */ + void prepare_io( + std::vector input_tokens, + std::vector input_pos); + void init_attention_mask(int32_t n_past); + void init_lookahead_branch(const std::vector& tokens); + void init_verification_branch(uint64_t cur_token); + void update_lookahead_branch(const executorch::aten::Tensor& logits_tensor); + void update_ngrams_pool(); + + // Additional members specific to multimodal + EmbeddingProcessor* embedding_runner_; + + struct NgramData { + bool active = false; + int32_t seq_id = -1; + + // match pos + std::vector i_batch; + std::vector tokens; + }; + + // n-gram pool + struct NgramContainer { + NgramContainer(int n_vocab, int n, int g) { + cnt.resize(n_vocab); + head.resize(n_vocab); + tokens.resize(n_vocab * g * (n - 1)); + } + + int n_total = 0; + + std::vector cnt; + std::vector head; + + // [n_vocab][G][N - 1] + // for each token of the vocab, keep a ring-buffer of capacity G of n-grams + // of size N - 1 + std::vector tokens; + }; + + Metadata metadata_; + + // lookahead branch + bool is_lhd_branch_initialized_{false}; + // [N - 1][W] + std::vector> lhd_branch_; + // [W] + std::vector lhd_branch_prev_; + + // verification branch + std::vector v_branch_; + + // position offset in attention mask + std::vector position_offset_; + + // n-gram pools + NgramContainer ngrams_pool_; +}; +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_prompt_processor.cpp b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_prompt_processor.cpp new file mode 100644 index 00000000000..07728bfe257 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_prompt_processor.cpp @@ -0,0 +1,316 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
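The NgramContainer above stores, for every vocabulary token, a ring buffer of up to gcap continuations of length ngram - 1 in one flat vector; update_ngrams_pool advances a per-token head pointer and init_verification_branch reads the stored continuations back. A standalone sketch of that flat [n_vocab][gcap][ngram - 1] addressing; ngram_slot_offset is an illustrative helper and the values are made up.

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Flat offset of slot `k` (0 <= k < gcap) for first-token `tok` in a pool
// laid out as [n_vocab][gcap][ngram - 1], as in NgramContainer::tokens.
inline size_t ngram_slot_offset(
    uint64_t tok, int32_t ngram, int32_t gcap, int32_t k) {
  return tok * gcap * (ngram - 1) + k * (ngram - 1);
}

int main() {
  const int32_t n_vocab = 16, ngram = 4, gcap = 2;
  std::vector<int32_t> tokens(n_vocab * gcap * (ngram - 1), -1);
  std::vector<int32_t> head(n_vocab, 0), cnt(n_vocab, 0);

  // Insert the continuation {7, 8, 9} after first-token 3 (ring-buffer write).
  uint64_t ft = 3;
  size_t idx = ngram_slot_offset(ft, ngram, gcap, head[ft]);
  int32_t continuation[] = {7, 8, 9};
  for (int j = 0; j < ngram - 1; ++j) {
    tokens[idx + j] = continuation[j];
  }
  cnt[ft] = std::min<int32_t>(gcap, cnt[ft] + 1);
  head[ft] = (head[ft] + 1) % gcap; // next insert overwrites the oldest slot

  // Read it back the way init_verification_branch does.
  assert(tokens[ngram_slot_offset(ft, ngram, gcap, 0) + 2] == 9);
  return 0;
}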
+ */ + +#include +#include + +using executorch::aten::TensorImpl; +using executorch::runtime::MethodMeta; +using executorch::runtime::Result; +using executorch::runtime::TensorInfo; + +namespace example { + +template +MultimodalPromptProcessor::MultimodalPromptProcessor( + DecoderRunner* decoder_runner, + KVManager* kv_manager, + const std::string& method_name, + Metadata metadata) + : PromptProcessor( + decoder_runner, + kv_manager, + method_name, + {metadata.context_len, + metadata.num_heads, + metadata.num_layers, + metadata.ar_len, + metadata.vocab_size, + metadata.use_int64_token, + metadata.sliding_window, + metadata.cache_mode}), + metadata_(metadata) { + // Set input_toks_.size to 0 since we use embeddings instead + input_toks_.size = 0; + input_embedding_.size = + metadata_.ar_len * metadata_.embedding_dim * sizeof(float); +}; + +template +void MultimodalPromptProcessor::init_io( + IMemAlloc* buffer_manager, + Result method_meta) { + size_t idx = 0; + input_tensors_.reserve(method_meta->num_inputs()); + output_tensors_.reserve(method_meta->num_outputs()); + + // [I]: input embedding + Result input_embedding = method_meta->input_tensor_meta(idx++); + input_embedding_.data = + reinterpret_cast(buffer_manager->allocate(input_embedding_.size)); + input_embedding_.tensor = std::make_unique( + input_embedding->scalar_type(), + input_embedding->sizes().size(), + const_cast(input_embedding->sizes().data()), + input_embedding_.data, + const_cast( + input_embedding->dim_order().data())); + input_tensors_.emplace_back(input_embedding_.tensor.get()); + buffer_manager->add_memory_info( + input_embedding_.data, input_embedding_.size, input_embedding.get()); + + // [I]: attention_mask + Result attention_mask = method_meta->input_tensor_meta(idx++); + attention_mask_.data = reinterpret_cast( + buffer_manager->allocate(attention_mask_.size)); + attention_mask_.tensor = std::make_unique( + attention_mask->scalar_type(), + attention_mask->sizes().size(), + const_cast(attention_mask->sizes().data()), + attention_mask_.data, + const_cast( + attention_mask->dim_order().data())); + input_tensors_.emplace_back(attention_mask_.tensor.get()); + buffer_manager->add_memory_info( + attention_mask_.data, attention_mask_.size, attention_mask.get()); + + // [I]: sliding window attention_mask + if (metadata_.cache_mode == CacheMode::HybridCache) { + Result window_attention_mask = + method_meta->input_tensor_meta(idx++); + window_attention_mask_.data = reinterpret_cast( + buffer_manager->allocate(window_attention_mask_.size)); + window_attention_mask_.tensor = std::make_unique( + window_attention_mask->scalar_type(), + window_attention_mask->sizes().size(), + const_cast( + window_attention_mask->sizes().data()), + window_attention_mask_.data, + const_cast( + window_attention_mask->dim_order().data())); + input_tensors_.emplace_back(window_attention_mask_.tensor.get()); + buffer_manager->add_memory_info( + window_attention_mask_.data, + window_attention_mask_.size, + window_attention_mask.get()); + } + + if (!is_bert()) { + // [I]: input_pos + Result input_pos = method_meta->input_tensor_meta(idx++); + input_pos_.data = + reinterpret_cast(buffer_manager->allocate(input_pos_.size)); + input_pos_.tensor = std::make_unique( + input_pos->scalar_type(), + input_pos->sizes().size(), + const_cast(input_pos->sizes().data()), + input_pos_.data, + const_cast(input_pos->dim_order().data())); + input_tensors_.emplace_back(input_pos_.tensor.get()); + buffer_manager->add_memory_info( + input_pos_.data, input_pos_.size, 
input_pos.get()); + + // [I] kv_cache + size_t index = idx; // bypass input_tokens, atten_mask, input_pos + for (int cache_group = 0; cache_group < 2; ++cache_group) { + std::vector>& cache = + (cache_group == 0 ? k_cache_in_ : v_cache_in_); + std::vector> cache_ptrs = (cache_group == 0) + ? kv_manager_->get_k_cache_() + : kv_manager_->get_v_cache_(); + for (int layer = 0; layer < metadata_.num_layers; ++layer, ++index) { + Result kv_cache = method_meta->input_tensor_meta(index); + + T* cache_ptr = cache_ptrs[layer].buffer; + + cache[layer] = std::make_unique( + kv_cache->scalar_type(), + kv_cache->sizes().size(), + const_cast(kv_cache->sizes().data()), + cache_ptr, + const_cast( + kv_cache->dim_order().data())); + input_tensors_.emplace_back(cache[layer].get()); + buffer_manager->add_memory_info( + cache_ptr, cache[layer]->nbytes(), kv_cache.get()); + } + } + } + + // [O]: logits + Result logits = method_meta->output_tensor_meta(0); + logits_.data = + reinterpret_cast(buffer_manager->allocate(logits_.size)); + logits_.tensor = std::make_unique( + logits->scalar_type(), + logits->sizes().size(), + const_cast(logits->sizes().data()), + logits_.data, + const_cast(logits->dim_order().data())); + output_tensors_.emplace_back(logits_.tensor.get()); + buffer_manager->add_memory_info(logits_.data, logits_.size, logits.get()); + + // [O] kv_cache + size_t index = 1; + for (int cache_group = 0; cache_group < 2; ++cache_group) { + std::vector>& cache = + (cache_group == 0 ? k_cache_out_ : v_cache_out_); + std::vector> cache_ptrs = (cache_group == 0) + ? kv_manager_->get_k_cache_() + : kv_manager_->get_v_cache_(); + for (int layer = 0; layer < metadata_.num_layers; ++layer, ++index) { + Result kv_cache = method_meta->output_tensor_meta(index); + T* cache_ptr = cache_ptrs[layer].output_buffer; + cache[layer] = std::make_unique( + kv_cache->scalar_type(), + kv_cache->sizes().size(), + const_cast(kv_cache->sizes().data()), + cache_ptr, + const_cast(kv_cache->dim_order().data())); + output_tensors_.emplace_back(cache[layer].get()); + buffer_manager->add_memory_info( + cache_ptr, cache[layer]->nbytes(), kv_cache.get()); + } + } + + // Prepare the vector of EValue to run inference + inputs_.reserve(input_tensors_.size()); + for (auto& input_tensor : input_tensors_) { + inputs_.emplace_back(std::move(input_tensor)); + } +} + +// prepare embedding +template +void MultimodalPromptProcessor::prepare_io( + const TensorStruct& prompt_embedding, + int32_t num_prompt_tokens, + int64_t prompt_pos, + int64_t start_pos) { + for (int i = 0; i < metadata_.ar_len; i++) { + if (!is_bert()) { + // Prepare pos data + input_pos_.data[i] = start_pos + i; + } + + // Prepare input token data + if (prompt_pos + i < num_prompt_tokens) { + std::memcpy( + input_embedding_.data + i * metadata_.embedding_dim, + prompt_embedding.data + (prompt_pos + i) * metadata_.embedding_dim, + metadata_.embedding_dim * sizeof(float)); + } + } +} + +template +Result MultimodalPromptProcessor::prefill( + const TensorStruct& prompt_embedding, + int64_t start_pos, + bool dump_logits) { + int32_t num_prompt_tokens = prompt_embedding.tensor->size(1); + if (!is_bert()) { + ET_CHECK_MSG( + (start_pos + num_prompt_tokens) <= + (metadata_.context_len - metadata_.ar_len), + "The sequence length exceeds the maximum limit that the prompt processor can handle."); + } else { + ET_CHECK_MSG( + start_pos == 0, "Bert model doesn't support multi-turn conversation."); + } + + // store the token + int64_t cur_token; + int64_t prompt_pos = 0; + int64_t pos = 
start_pos; + int32_t n_update = metadata_.ar_len; + int num_iters = 1 + ((num_prompt_tokens - 1) / metadata_.ar_len); + ET_LOG( + Info, + "Prompt Processor: total %d prompt tokens (AR-%d * %d iters)", + num_prompt_tokens, + metadata_.ar_len, + num_iters); + + // Rearrange KV cache first + kv_manager_->rearrange_cache(metadata_.ar_len); + std::vector attention_map(metadata_.ar_len); + std::iota(attention_map.begin(), attention_map.end(), -1); + // Initialize attention mask with current position + kv_manager_->init_attention_mask( + attention_mask_.data, attention_map, metadata_.ar_len, pos); + // Initialize window attention mask with current position + if (metadata_.cache_mode == CacheMode::HybridCache) { + kv_manager_->init_attention_mask( + window_attention_mask_.data, + attention_map, + metadata_.ar_len, + pos, + metadata_.sliding_window); + } + + // Initialize the output of the module + ET_CHECK_MSG( + decoder_runner_->set_outputs(method_name_, output_tensors_) == + executorch::runtime::Error::Ok, + "Failed to set output tensor for module %s", + method_name_.c_str()); + for (int i = 0; i < num_iters; ++i) { + // Fill in the embedding and position data + prepare_io(prompt_embedding, num_prompt_tokens, prompt_pos, pos); + + // Run inference + for (int layer = 0; layer < metadata_.num_layers; ++layer) { + std::vector> k_cache_ptrs = kv_manager_->get_k_cache_(); + T* k_cache_data = k_cache_ptrs[layer].buffer; + } + for (int layer = 0; layer < metadata_.num_layers; ++layer) { + std::vector> v_cache_ptrs = kv_manager_->get_v_cache_(); + T* v_cache_data = v_cache_ptrs[layer].buffer; + } + + decoder_runner_->step(method_name_, inputs_); + if (dump_logits) { + prompt_all_logits_.insert( + prompt_all_logits_.end(), + logits_.data, + logits_.data + metadata_.ar_len * metadata_.vocab_size); + } + // In the last run, offset to the meaningful logits. + if (i == num_iters - 1) { + n_update = 1 + ((num_prompt_tokens - 1) % metadata_.ar_len); + } + // Update KV Cache with the output results + kv_manager_->update_cache(metadata_.ar_len, pos, n_update, {}); + + // Update attention mask with current position + kv_manager_->update_attention_mask( + attention_mask_.data, metadata_.ar_len, pos, n_update); + if (metadata_.cache_mode == CacheMode::HybridCache) { + kv_manager_->update_attention_mask( + window_attention_mask_.data, + metadata_.ar_len, + pos, + n_update, + metadata_.sliding_window); + } + prompt_pos += metadata_.ar_len; + pos += metadata_.ar_len; + } + + cur_token = decoder_runner_->logits_to_token( + output_tensors_[0], + (num_prompt_tokens + metadata_.ar_len - 1) % metadata_.ar_len); + return cur_token; +} + +// Explicit instantiations +template class MultimodalPromptProcessor; +template class MultimodalPromptProcessor; + +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_prompt_processor.h b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_prompt_processor.h new file mode 100644 index 00000000000..540b147def3 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_prompt_processor.h @@ -0,0 +1,123 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
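The prefill loop above walks the prompt in AR-length chunks: num_iters = 1 + (N - 1) / ar_len is a ceiling division, the final chunk only carries n_update = 1 + (N - 1) % ar_len meaningful positions, and the sampled logit row (N + ar_len - 1) % ar_len is exactly n_update - 1. A small standalone check of that arithmetic; illustrative only.

#include <cassert>
#include <cstdint>

int main() {
  for (int32_t num_prompt_tokens = 1; num_prompt_tokens <= 64; ++num_prompt_tokens) {
    for (int32_t ar_len = 1; ar_len <= 16; ++ar_len) {
      // Ceiling division: number of AR-N passes needed to cover the prompt.
      int32_t num_iters = 1 + ((num_prompt_tokens - 1) / ar_len);
      assert(num_iters * ar_len >= num_prompt_tokens);
      assert((num_iters - 1) * ar_len < num_prompt_tokens);

      // Valid tokens in the last pass, and the logit row sampled afterwards.
      int32_t n_update = 1 + ((num_prompt_tokens - 1) % ar_len);
      int32_t logit_idx = (num_prompt_tokens + ar_len - 1) % ar_len;
      assert(logit_idx == n_update - 1); // last meaningful row of the chunk
    }
  }
  return 0;
}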
+ */ + +#pragma once +#include +#include + +namespace example { + +/** + * @class MultimodalPromptProcessor + * @brief Extended PromptProcessor with multimodal embedding support + */ +template +class MultimodalPromptProcessor : public example::PromptProcessor { + public: + struct Metadata { + int32_t context_len; + int64_t num_heads; + int64_t num_layers; + int32_t ar_len; + int32_t vocab_size; + bool use_int64_token; + int sliding_window; + CacheMode cache_mode; + int32_t embedding_dim = 0; + }; + + MultimodalPromptProcessor( + DecoderRunner* decoder_runner, + KVManager* kv_manager, + const std::string& method_name, + Metadata metadata); + + int64_t get_num_heads() const { + return metadata_.num_heads; + } + int64_t get_num_layers() const { + return metadata_.num_layers; + } + + /** + * @brief Initialize I/O tensor and allocate I/O data buffer. + * @param buffer_manager Pointer to IMemAlloc instance which depends on + * kv_updater. + * @param method_meta Method metadata. + */ + void init_io( + IMemAlloc* buffer_manager, + executorch::runtime::Result method_meta); + + /** + * Prefill an Decoder Module with the given embedding input. + * @param prompt_embedding The embedding tensor from embedding module. + * @param start_pos The starting position in KV cache of the input in the LLM + * Module. + * @param dump_logits Used to save all logits. Only enable when analyzing + * accuracy. + * @return The next token of the LLM Module after prefill. + */ + executorch::runtime::Result prefill( + const TensorStruct& prompt_embedding, + int64_t start_pos, + bool dump_logits); + + /** + * @brief Get total I/O size in bytes (excluding the KV cache size) + * @return Total I/O size in bytes. + */ + inline const size_t total_prompt_processor_io_size_in_bytes() const { + if (metadata_.cache_mode == CacheMode::HybridCache) { + return input_toks_.size + input_pos_.size + attention_mask_.size + + window_attention_mask_.size + logits_.size + input_embedding_.size; + } else { + return input_toks_.size + input_pos_.size + attention_mask_.size + + logits_.size + input_embedding_.size; + } + } + + private: + // Reuse members from token_generator + using PromptProcessor::decoder_runner_; + using PromptProcessor::kv_manager_; + using PromptProcessor::method_name_; + using PromptProcessor::k_cache_in_; + using PromptProcessor::v_cache_in_; + using PromptProcessor::k_cache_out_; + using PromptProcessor::v_cache_out_; + using PromptProcessor::input_toks_; + using PromptProcessor::input_pos_; + using PromptProcessor::attention_mask_; + using PromptProcessor::window_attention_mask_; + using PromptProcessor::logits_; + using PromptProcessor::inputs_; + using PromptProcessor::input_tensors_; + using PromptProcessor::output_tensors_; + using PromptProcessor::prompt_all_logits_; + using PromptProcessor::is_bert; + + /** + * @brief Fill in I/O buffers with embedding data and position. + * @param prompt_embedding The embedding tensor. + * @param prompt_pos Position of the prompt. + * @param start_pos Starting position. 
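prepare_io, defined earlier in multimodal_prompt_processor.cpp, slides an AR-length window over the merged prompt embeddings, copying embedding_dim floats per position into the decoder input buffer. A standalone sketch of that copy, assuming row-major [token][dim] float layout on both sides; copy_embedding_window is an illustrative helper, not part of this patch.

#include <cstddef>
#include <cstring>

// Copy one AR-length window of prompt embeddings into the decoder input
// buffer. Both buffers are row-major float arrays of shape [tokens][dim].
void copy_embedding_window(
    float* input_embedding,        // ar_len * dim floats
    const float* prompt_embedding, // num_prompt_tokens * dim floats
    size_t num_prompt_tokens,
    size_t prompt_pos, // first prompt position covered by this window
    size_t ar_len,
    size_t dim) {
  for (size_t i = 0; i < ar_len; ++i) {
    if (prompt_pos + i >= num_prompt_tokens) {
      break; // trailing slots of the last window are left untouched
    }
    std::memcpy(
        input_embedding + i * dim,
        prompt_embedding + (prompt_pos + i) * dim,
        dim * sizeof(float));
  }
}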
+ */ + void prepare_io( + const TensorStruct& prompt_embedding, + int32_t num_prompt_tokens, + int64_t prompt_pos, + int64_t start_pos); + + // metadata specific to multimodal + Metadata metadata_; + + // Additional input for multimodal + TensorStruct input_embedding_; +}; +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_runner.cpp new file mode 100644 index 00000000000..274920ec00f --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_runner.cpp @@ -0,0 +1,752 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// A llama 3.2 runner that includes preprocessing and post processing +// logic. The module takes in a string as input and emits a string as output. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(__aarch64__) +#include "arm_neon.h" +#endif + +using executorch::aten::Tensor; +using executorch::extension::Module; +using executorch::extension::llm::get_rss_bytes; +using executorch::extension::llm::print_report; +using executorch::extension::llm::Stats; +using executorch::extension::llm::time_in_ms; +using executorch::runtime::Error; +using executorch::runtime::MethodMeta; +using executorch::runtime::Result; +namespace llm = ::executorch::extension::llm; + +namespace example { +namespace { +void print_performance_report( + const Stats& stats, + const std::string& performance_output_path) { + // For now, we just print the total inference time for CI, can save more info + // in future if needed. 
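print_performance_report reduces the Stats counters to a single tokens-per-second figure: generated tokens over the decode wall-clock span, or prompt tokens over the prompt-eval span when nothing was generated. A standalone sketch of that computation, assuming millisecond timestamps (as the _ms field names suggest) and a scaling factor of 1000; tokens_per_second is an illustrative helper.

#include <cstdint>

// Tokens per second, mirroring the report written below. Timestamps are in
// milliseconds, so the scaling factor converts ms to seconds.
double tokens_per_second(
    int64_t num_prompt_tokens,
    int64_t num_generated_tokens,
    int64_t prompt_eval_end_ms,
    int64_t inference_start_ms,
    int64_t inference_end_ms,
    double scaling_factor = 1000.0) {
  if (num_generated_tokens == 0) {
    // e.g. perplexity evaluation, where only the prompt pass ran.
    return num_prompt_tokens /
        static_cast<double>(prompt_eval_end_ms - inference_start_ms) *
        scaling_factor;
  }
  return num_generated_tokens /
      static_cast<double>(inference_end_ms - inference_start_ms) *
      scaling_factor;
}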
+ std::ofstream outfile(performance_output_path.c_str()); + if (outfile.is_open()) { + double num_tok = 0; + if (stats.num_generated_tokens == 0) { + // For cases like evaluate perplexity where prompt_len == cache_len + num_tok = ((stats.num_prompt_tokens)) / + (double)(stats.prompt_eval_end_ms - stats.inference_start_ms) * + stats.SCALING_FACTOR_UNITS_PER_SECOND; + } else { + num_tok = (stats.num_generated_tokens) / + (double)(stats.inference_end_ms - stats.inference_start_ms) * + stats.SCALING_FACTOR_UNITS_PER_SECOND; + } + + outfile << num_tok; + outfile.close(); + } else { + ET_LOG(Error, "Error saving the inference speed file"); + } +} + +void save_logits( + const std::string& dump_logits_path, + const std::vector& prefill_logits, + const std::vector& decode_logits) { + std::ofstream outFile(dump_logits_path.c_str(), std::ios::binary); + if (outFile.is_open()) { + outFile.write( + reinterpret_cast(prefill_logits.data()), + prefill_logits.size() * sizeof(uint16_t)); + + outFile.write( + reinterpret_cast(decode_logits.data()), + decode_logits.size() * sizeof(uint16_t)); + outFile.close(); + } else { + ET_CHECK_MSG(false, "Error saving the dump logits file"); + } +} + +} // namespace + +template +MultimodalRunner::MultimodalRunner( + std::unique_ptr module, + std::unique_ptr embedding_module, + const std::string& decoder_model_version, + const std::string& model_path, + const std::string& tokenizer_path, + const std::string& dump_logits_path, + const std::string& performance_output_path, + const float temperature, + const int eval_mode, + const bool shared_buffer, + const int ngram, + const int window, + const int gcap, + std::unique_ptr image_hidden_states) + : module_(std::move(module)), + embedding_module_(std::move(embedding_module)), + image_hidden_states_(std::move(image_hidden_states)), + ngram_(ngram), + window_(window), + gcap_(gcap), + tokenizer_path_(tokenizer_path), + performance_output_path_(performance_output_path), + dump_logits_path_(dump_logits_path), + temperature_(temperature), + eval_mode_(static_cast(eval_mode)), + shared_buffer_(shared_buffer) { + stats_.reset(); + + if (decoder_model_version == "smolvlm") { + decoder_model_version_ = MultimodalDecoderModelVersion::kSmolvlm; + } else if (decoder_model_version == "internvl3") { + decoder_model_version_ = MultimodalDecoderModelVersion::kInternvl3; + } else { + ET_CHECK_MSG(false, "Unsupported Decoder Model"); + } + + ET_LOG(Info, "creating module: model_path=%s", model_path.c_str()); + ET_LOG(Info, "creating runner: tokenizer_path=%s", tokenizer_path_.c_str()); + ET_LOG(Info, "eval mode=%d", eval_mode_); +} + +template +bool MultimodalRunner::is_loaded() const { + return module_->is_loaded() && embedding_module_->is_loaded() && tokenizer_ && + decoder_runner_ && prompt_processor_ && token_generator_ && kv_manager_ && + buffer_manager_; +} + +template +Error MultimodalRunner::load() { + if (is_loaded()) { + return Error::Ok; + } + + std::string prompt_embedding_method_name, token_embedding_method_name; + std::string token_generator_method_name, prompt_processor_method_name; + std::vector method_names; + switch (eval_mode_) { + case EvalMode::kKVCached: + prompt_embedding_method_name = "tok_embedding_kv_forward"; + token_embedding_method_name = "tok_embedding_kv_forward"; + prompt_processor_method_name = "forward"; + token_generator_method_name = "forward"; + method_names.emplace_back(prompt_processor_method_name); + method_names.emplace_back(token_generator_method_name); + break; + case EvalMode::kHybrid: + case 
EvalMode::kLookaheadDecoding: + prompt_embedding_method_name = "tok_embedding_prefill_forward"; + token_embedding_method_name = "tok_embedding_kv_forward"; + prompt_processor_method_name = "prefill_forward"; + token_generator_method_name = "kv_forward"; + method_names.emplace_back(prompt_processor_method_name); + method_names.emplace_back(token_generator_method_name); + break; + case EvalMode::kUnsupported: + ET_CHECK_MSG(false, "Unsupported llama evaluation mode"); + break; + } + auto eos_ids = std::make_unique>(); + if (tokenizer_ != nullptr) { + eos_ids->insert(tokenizer_->encode("<|eot_id|>", 0, 0).get()[0]); + eos_ids->insert(tokenizer_->encode("<|eot|>", 0, 0).get()[0]); + eos_ids->insert(tokenizer_->encode("<|end_of_text|>", 0, 0).get()[0]); + } else { + tokenizer_ = llm::load_tokenizer(tokenizer_path_); + if (tokenizer_ == nullptr) { + ET_LOG( + Error, "Failed to load tokenizer with %s", tokenizer_path_.c_str()); + return Error::Internal; + } + eos_ids->insert(tokenizer_->eos_tok()); + } + if (decoder_model_version_ == MultimodalDecoderModelVersion::kSmolvlm) { + eos_ids->insert(tokenizer_->encode("", 0, 0).get()[0]); + } else if ( + decoder_model_version_ == MultimodalDecoderModelVersion::kInternvl3) { + eos_ids->insert(tokenizer_->encode("<|im_end|>", 0, 0).get()[0]); + } + + // Try avoid getMetadataHelper as it is time consuming. + Result method_meta = + module_->method_meta(token_generator_method_name); + + // For some tokenizer.json, runtime vocab_size might be different, use output + // shape to get vocab size. + int32_t vocab_size = method_meta->output_tensor_meta(0)->sizes()[2]; + decoder_runner_ = + std::make_unique(module_.get(), vocab_size, temperature_); + + ET_CHECK_OK_OR_RETURN_ERROR(decoder_runner_->load(method_names)); + + ET_LOG(Info, "Reading metadata from model"); + // retrieve any method meta, can be either prefill or kv + int64_t num_layers = + ET_UNWRAP(module_->get("get_n_layers")).toScalar().to(); + + ET_CHECK_MSG(num_layers != -1, "Could not retrieve num layers"); + // k_cache: [1, n_heads, head_dim, seq_len] + auto k_cache_shape = method_meta->output_tensor_meta(1)->sizes(); + int64_t num_heads = k_cache_shape[1]; + int64_t head_dim = k_cache_shape[2]; + + // TODO: filter shape hidden_state: [1, ar_len, dim] + int64_t dim = embedding_module_->method_meta(token_embedding_method_name) + ->output_tensor_meta(0) + ->sizes()[2]; + bool use_int64_token = + embedding_module_->method_meta(token_embedding_method_name) + ->input_tensor_meta(0) + ->scalar_type() == executorch::aten::ScalarType::Long; + + // Use attention mask length to retrieve AR length and context length + // Cache len equals to context_len - ar_len + int32_t prompt_processor_ar_len = 0; + int32_t token_generator_ar_len = 0; + int32_t max_cache_len = 0; + int32_t max_ar_len = 0; + // atten mask: [1, AR-N, CL] + auto atten_mask_meta_token = method_meta->input_tensor_meta(1); + token_generator_ar_len = atten_mask_meta_token->sizes()[1]; + context_len_ = atten_mask_meta_token->sizes()[2]; + if (eval_mode_ == EvalMode::kKVCached) { + prompt_processor_ar_len = token_generator_ar_len; + } else if ( + eval_mode_ == EvalMode::kHybrid || + eval_mode_ == EvalMode::kLookaheadDecoding) { + auto atten_mask_meta_prompt = + module_->method_meta(prompt_processor_method_name) + ->input_tensor_meta(1); + prompt_processor_ar_len = atten_mask_meta_prompt->sizes()[1]; + } + if (prompt_processor_ar_len == context_len_) + max_cache_len = context_len_; + else + max_cache_len = context_len_ - + 
std::min(token_generator_ar_len, prompt_processor_ar_len); + max_ar_len = std::max(token_generator_ar_len, prompt_processor_ar_len); + + embedding_runner_ = + std::make_unique(embedding_module_.get()); + ET_CHECK_OK_OR_RETURN_ERROR(embedding_runner_->load( + {prompt_embedding_method_name, token_embedding_method_name})); + // Initialize EmbeddingProcessor + embedding_processor_ = std::make_unique( + embedding_runner_.get(), + prompt_embedding_method_name, + EmbeddingProcessor::Metadata{ + context_len_, + prompt_processor_ar_len, + vocab_size, + use_int64_token, + static_cast(dim)}); + + // Load the sliding window size if the model supports it. + // This is used to configure the attention mask for models with window + // attention + int32_t sliding_window = context_len_; + if (module_->method_names()->count("get_sliding_window") > 0) { + sliding_window = ET_UNWRAP(module_->get("get_sliding_window")).toInt(); + } + kv_manager_ = std::make_unique>(typename KVManager::Metadata{ + context_len_, + head_dim, + max_ar_len, + max_cache_len, + num_heads, + num_layers}); + + prompt_processor_ = std::make_unique>( + decoder_runner_.get(), + kv_manager_.get(), + prompt_processor_method_name, + typename MultimodalPromptProcessor::Metadata{ + context_len_, + num_heads, + num_layers, + prompt_processor_ar_len, + vocab_size, + use_int64_token, + sliding_window, + cache_mode_, + static_cast(dim)}); + + if (eval_mode_ == EvalMode::kLookaheadDecoding || + eval_mode_ == EvalMode::kHybrid) { + output_k_cache_scales_.resize(num_layers); + output_k_cache_zero_points_.resize(num_layers); + output_v_cache_scales_.resize(num_layers); + output_v_cache_zero_points_.resize(num_layers); + for (int i = 0; i < num_layers; i++) { + std::string get_k_scale_output_name = + "get_k_scale_output_" + std::to_string(i); + std::string get_k_zero_point_output_name = + "get_k_zero_point_output_" + std::to_string(i); + std::string get_v_scale_output_name = + "get_v_scale_output_" + std::to_string(i); + std::string get_v_zero_point_output_name = + "get_v_zero_point_output_" + std::to_string(i); + + if (module_->method_names()->count(get_k_scale_output_name) > 0) { + output_k_cache_scales_[i] = static_cast( + ET_UNWRAP(module_->get(get_k_scale_output_name)).toDouble()); + } else { + ET_LOG(Error, "Cannot find method %s", get_k_scale_output_name.c_str()); + return Error::Internal; + } + if (module_->method_names()->count(get_k_zero_point_output_name) > 0) { + output_k_cache_zero_points_[i] = static_cast( + ET_UNWRAP(module_->get(get_k_zero_point_output_name)).toInt()); + } else { + ET_LOG( + Error, + "Cannot find method %s", + get_k_zero_point_output_name.c_str()); + return Error::Internal; + } + if (module_->method_names()->count(get_v_scale_output_name) > 0) { + output_v_cache_scales_[i] = static_cast( + ET_UNWRAP(module_->get(get_v_scale_output_name)).toDouble()); + } else { + ET_LOG(Error, "Cannot find method %s", get_v_scale_output_name.c_str()); + return Error::Internal; + } + if (module_->method_names()->count(get_v_zero_point_output_name) > 0) { + output_v_cache_zero_points_[i] = static_cast( + ET_UNWRAP(module_->get(get_v_zero_point_output_name)).toInt()); + } else { + ET_LOG( + Error, + "Cannot find method %s", + get_v_zero_point_output_name.c_str()); + return Error::Internal; + } + } + // Load scale and zero point for quantized input KV cache + input_k_cache_scales_.resize(num_layers); + input_k_cache_zero_points_.resize(num_layers); + input_v_cache_scales_.resize(num_layers); + input_v_cache_zero_points_.resize(num_layers); + 
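The per-layer scales and zero-points loaded above exist because the prefill and decode graphs quantize the KV cache with different (scale, zero_point) pairs; the requantization applied later in generate() follows the affine-quantization identity r = scale_out * (q - zp_out), so q' = (q - zp_out) * (scale_out / scale_in) + zp_in. A standalone sketch of that conversion for an 8-bit cache; requantize_cache is illustrative and the clamp is added here for safety, whereas the runner's loop casts directly.

#include <algorithm>
#include <cstdint>
#include <vector>

// Re-express values quantized as (scale_out, zp_out) on the (scale_in, zp_in)
// grid expected by the decode graph:
//   q' = (q - zp_out) * scale_out / scale_in + zp_in
void requantize_cache(
    std::vector<uint8_t>& cache,
    float scale_out, int32_t zp_out,
    float scale_in, int32_t zp_in) {
  const float ratio = scale_out / scale_in;
  for (auto& q : cache) {
    float r = (static_cast<int32_t>(q) - zp_out) * ratio + zp_in;
    q = static_cast<uint8_t>(std::clamp(r, 0.0f, 255.0f));
  }
}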
for (int i = 0; i < num_layers; i++) { + std::string get_k_scale_input_name = + "get_k_scale_input_" + std::to_string(i); + std::string get_k_zero_point_input_name = + "get_k_zero_point_input_" + std::to_string(i); + std::string get_v_scale_input_name = + "get_v_scale_input_" + std::to_string(i); + std::string get_v_zero_point_input_name = + "get_v_zero_point_input_" + std::to_string(i); + if (module_->method_names()->count(get_k_scale_input_name) > 0) { + input_k_cache_scales_[i] = static_cast( + ET_UNWRAP(module_->get(get_k_scale_input_name)).toDouble()); + } else { + ET_LOG(Error, "Cannot find method %s", get_k_scale_input_name.c_str()); + return Error::Internal; + } + if (module_->method_names()->count(get_k_zero_point_input_name) > 0) { + input_k_cache_zero_points_[i] = static_cast( + ET_UNWRAP(module_->get(get_k_zero_point_input_name)).toInt()); + } else { + ET_LOG( + Error, + "Cannot find method %s", + get_k_zero_point_input_name.c_str()); + return Error::Internal; + } + if (module_->method_names()->count(get_v_scale_input_name) > 0) { + input_v_cache_scales_[i] = static_cast( + ET_UNWRAP(module_->get(get_v_scale_input_name)).toDouble()); + } else { + ET_LOG(Error, "Cannot find method %s", get_v_scale_input_name.c_str()); + return Error::Internal; + } + if (module_->method_names()->count(get_v_zero_point_input_name) > 0) { + input_v_cache_zero_points_[i] = static_cast( + ET_UNWRAP(module_->get(get_v_zero_point_input_name)).toInt()); + } else { + ET_LOG( + Error, + "Cannot find method %s", + get_v_zero_point_input_name.c_str()); + return Error::Internal; + } + } + } + + // Initialize EmbeddingGenerator + embedding_generator_ = std::make_unique( + embedding_runner_.get(), + token_embedding_method_name, + EmbeddingProcessor::Metadata{ + context_len_, + token_generator_ar_len, + vocab_size, + use_int64_token, + static_cast(dim)}); + if (eval_mode_ == EvalMode::kLookaheadDecoding) { + // Initialize TokenGenerator + token_generator_ = std::make_unique>( + tokenizer_.get(), + embedding_generator_.get(), + decoder_runner_.get(), + kv_manager_.get(), + token_generator_method_name, + std::move(eos_ids), + typename MultimodalLhdTokenGenerator::Metadata{ + context_len_, + num_heads, + num_layers, + token_generator_ar_len, + vocab_size, + use_int64_token, + ngram_, + window_, + gcap_, + sliding_window, + cache_mode_, + static_cast(dim)}, + &stats_); + } else { + token_generator_ = std::make_unique>( + tokenizer_.get(), + embedding_generator_.get(), + decoder_runner_.get(), + kv_manager_.get(), + token_generator_method_name, + std::move(eos_ids), + typename MultimodalTokenGenerator::Metadata{ + context_len_, + num_heads, + num_layers, + token_generator_ar_len, + vocab_size, + use_int64_token, + sliding_window, + cache_mode_, + static_cast(dim)}, + &stats_); + } + + buffer_manager_ = std::make_unique(); + if (shared_buffer_) { + buffer_manager_ = std::make_unique( + kv_manager_->total_cache_size_in_bytes(), + prompt_processor_->total_prompt_processor_io_size_in_bytes(), + token_generator_->total_token_generator_io_size_in_bytes(), + embedding_processor_->total_embedding_processor_io_size_in_bytes(), + embedding_generator_->total_embedding_processor_io_size_in_bytes()); + } + + ET_LOG(Info, "creating io_memory"); + // prepare io + kv_manager_->init_cache(buffer_manager_.get(), prompt_processor_ar_len); + prompt_processor_->init_io( + buffer_manager_.get(), + module_->method_meta(prompt_processor_method_name)); + token_generator_->init_io( + buffer_manager_.get(), 
module_->method_meta(token_generator_method_name)); + // Prepare io for embedding + embedding_processor_->init_io( + buffer_manager_.get(), + embedding_module_->method_meta(prompt_embedding_method_name)); + embedding_generator_->init_io( + buffer_manager_.get(), + embedding_module_->method_meta(token_embedding_method_name)); + return Error::Ok; +} + +template +Error MultimodalRunner::generate( + const std::string& prompt, + const llm::GenerationConfig& config, + std::function token_callback, + std::function stats_callback) { + return generate_from_prompt_or_file( + prompt, false, config, token_callback, stats_callback); +} + +template +Error MultimodalRunner::generate_from_prompt_or_file( + const std::string& prompt, + bool tokenized_prompt, + const llm::GenerationConfig& config, + std::function token_callback, + std::function stats_callback) { + ET_CHECK_MSG(!prompt.empty(), "prompt cannot be null"); + if (!is_loaded()) { + stats_.model_load_start_ms = time_in_ms(); + ET_CHECK_OK_OR_RETURN_ERROR(load()); + stats_.model_load_end_ms = time_in_ms(); + } + stats_.inference_start_ms = time_in_ms(); + + int32_t seq_len = config.seq_len; + if (seq_len > context_len_) { + ET_LOG( + Info, + "Warning: Requested seq_len (%d) exceeds compiled max_seq_len (%d). Clamping to %d.", + seq_len, + context_len_, + context_len_); + seq_len = context_len_; + } else if (seq_len <= 0) { + ET_LOG( + Info, + "Warning: Invalid seq_len (%d). Using compiled max_seq_len (%d).", + seq_len, + context_len_); + seq_len = context_len_; + } + // For multimodal, we will disable n_bos + int32_t n_bos = 0; + + // encode the (string) prompt into tokens sequence + std::vector prompt_tokens; + if (tokenized_prompt) { + std::ifstream inFile(prompt, std::ios::binary); + if (inFile.is_open()) { + // Get file size + inFile.seekg(0, std::ios::end); + size_t fileSize = inFile.tellg(); + inFile.seekg(0, std::ios::beg); + + // Resize vector and read raw data + prompt_tokens.resize(fileSize / sizeof(uint64_t)); + + inFile.read(reinterpret_cast(prompt_tokens.data()), fileSize); + inFile.close(); + } else { + ET_CHECK_MSG( + false, + "Unable to read tokenized prompt from file: %s", + prompt.c_str()); + } + } else { + tokenizers::Result> encode_res = + tokenizer_->encode(prompt, n_bos, 0); + ET_CHECK_TK_OK_OR_RETURN_ERROR( + encode_res.error(), "failed to encode prompt %s", prompt.c_str()); + prompt_tokens = encode_res.get(); + } + int num_prompt_tokens = prompt_tokens.size(); + ET_CHECK_MSG(num_prompt_tokens >= 1, "Expected at least 1 prompt token"); + ET_CHECK_MSG( + cur_pos_ + num_prompt_tokens < seq_len, + "sequence length exceeded - please increase the seq_len value"); + + // Prompt Processor first + if (token_callback && config.echo) { + token_callback(prompt); + } + bool dump_logits = dump_logits_path_.empty() ? 
false : true; + embedding_processor_->prefill(prompt_tokens); + const TensorStruct& text_embeddings = + embedding_processor_->get_prompt_embeddings(); + int64_t embedding_dim = text_embeddings.tensor->size(2); + + uint64_t placeholder_token_id = 0; + if (module_->method_names()->count("modality_placeholder_token_id") > 0) { + placeholder_token_id = + module_->get("modality_placeholder_token_id")->toInt(); + } + + ET_LOG(Info, "Merging text embeddings with image hidden states"); + merge_multimodal_embeddings( + prompt_tokens, text_embeddings, placeholder_token_id); + + auto prefill_res = + prompt_processor_->prefill(merged_embeddings_, cur_pos_, dump_logits); + ET_CHECK_OK_OR_RETURN_ERROR(prefill_res.error()); + uint64_t cur_token = prefill_res.get(); + cur_pos_ += num_prompt_tokens; + stats_.first_token_ms = time_in_ms(); + stats_.prompt_eval_end_ms = time_in_ms(); + + // print the first token from prefill. No prev_token so use cur_token for + // it. + if (token_callback) { + token_callback( + ET_UNWRAP_TOKENIZER(tokenizer_->decode(cur_token, cur_token))); + } + ET_LOG( + Info, + "RSS after prompt prefill: %f MiB (0 if unsupported)", + get_rss_bytes() / 1024.0 / 1024.0); + + // start the main loop + prompt_tokens.push_back(cur_token); + + // Requant kv cache for prefill decode I/O + if (eval_mode_ == EvalMode::kLookaheadDecoding || + eval_mode_ == EvalMode::kHybrid) { + int64_t num_heads = prompt_processor_->get_num_heads(); + int64_t num_layers = prompt_processor_->get_num_layers(); + int64_t head_dim = kv_manager_->get_head_dim(); + std::vector> k_cache_ptrs = kv_manager_->get_k_cache_(); + std::vector> v_cache_ptrs = kv_manager_->get_v_cache_(); + + const int64_t num_elems_per_layer = + (context_len_ - 1) * num_heads * head_dim; + // Requant kv cache from prefill output scale/zero_point to decode input + // scale/zero_point + for (int layer_idx = 0; layer_idx < num_layers; layer_idx++) { + T* k_cache_data = k_cache_ptrs[layer_idx].buffer; + T* v_cache_data = v_cache_ptrs[layer_idx].buffer; + + const float scale_ratio_k = + output_k_cache_scales_[layer_idx] / input_k_cache_scales_[layer_idx]; + const float scale_ratio_v = + output_v_cache_scales_[layer_idx] / input_v_cache_scales_[layer_idx]; + + for (int64_t i = 0; i < num_elems_per_layer; i++) { + // Requant k_cache_data from prefill output scale/zero_point to decode + // input scale/zero_point + k_cache_data[i] = static_cast( + (k_cache_data[i] - output_k_cache_zero_points_[layer_idx]) * + scale_ratio_k + + input_k_cache_zero_points_[layer_idx]); + + // Requant v_cache_data from prefill output scale/zero_point to decode + // input scale/zero_point + v_cache_data[i] = static_cast( + (v_cache_data[i] - output_v_cache_zero_points_[layer_idx]) * + scale_ratio_v + + input_v_cache_zero_points_[layer_idx]); + } + } + } + + int64_t num_generated_tokens = ET_UNWRAP(token_generator_->generate( + prompt_tokens, cur_pos_, seq_len, token_callback, dump_logits)); + stats_.inference_end_ms = time_in_ms(); + ET_LOG( + Info, + "RSS after finishing text generation: %f MiB (0 if unsupported)", + get_rss_bytes() / 1024.0 / 1024.0); + cur_pos_ += num_generated_tokens; + if (cur_pos_ == seq_len) { + ET_LOG(Info, "Sequence length (%i tokens) reached!", seq_len); + } + + stats_.num_prompt_tokens = num_prompt_tokens; + stats_.num_generated_tokens = num_generated_tokens; + print_report(stats_); + print_performance_report(stats_, performance_output_path_); + if (dump_logits) { + save_logits( + dump_logits_path_, + prompt_processor_->get_all_logits(), + 
token_generator_->get_all_logits()); + } + if (stats_callback) { + stats_callback(stats_); + } + return Error::Ok; +} + +template +void MultimodalRunner::merge_multimodal_embeddings( + const std::vector& input_ids, + const TensorStruct& text_embeddings, + uint64_t placeholder_token_id) { + // This implements the modality_inputs_merger logic from decoder_utils.py + // Find positions where placeholder tokens appear + std::vector placeholder_positions; + for (size_t i = 0; i < input_ids.size(); ++i) { + if (input_ids[i] == placeholder_token_id) { + placeholder_positions.push_back(i); + } + } + + int64_t embedding_dim; + int64_t num_tokens = input_ids.size(); + if (text_embeddings.tensor) { + embedding_dim = text_embeddings.tensor->size(2); + num_tokens = text_embeddings.tensor->size(1); + } else { + ET_CHECK_MSG( + false, + "text_embeddings.tensor is null; cannot determine embedding dim during multimodal embedding merge"); + } + + // Allocate new buffer for merged embeddings + size_t total_elements = num_tokens * embedding_dim; + multimodal_embeddings_buffer_.resize(total_elements); + + // First, copy all text embeddings to the new buffer + std::memcpy( + multimodal_embeddings_buffer_.data(), + text_embeddings.data, + total_elements * sizeof(float)); + + // Then replace placeholder positions with image hidden states + auto* image_data = image_hidden_states_->const_data_ptr(); + auto* merged_data = multimodal_embeddings_buffer_.data(); + + int64_t image_seq_len = image_hidden_states_->size(1); + + // Copy image hidden states to placeholder positions + for (int32_t i = 0; i < placeholder_positions.size(); ++i) { + int32_t pos = placeholder_positions[i]; + std::memcpy( + merged_data + pos * embedding_dim, + image_data + i * embedding_dim, + embedding_dim * sizeof(float)); + } + + merged_embeddings_.data = multimodal_embeddings_buffer_.data(); + merged_embeddings_.size = total_elements * sizeof(float); + + // Create TensorImpl with proper shape [1, num_tokens, embedding_dim] + multimodal_embeddings_sizes_ = { + 1, static_cast(num_tokens), static_cast(embedding_dim)}; + multimodal_embeddings_dim_order_ = {0, 1, 2}; + merged_embeddings_.tensor = std::make_unique( + executorch::aten::ScalarType::Float, + multimodal_embeddings_sizes_.size(), + multimodal_embeddings_sizes_.data(), + merged_embeddings_.data, + multimodal_embeddings_dim_order_.data()); + + ET_LOG(Info, "Multimodal embeddings merged successfully"); +} + +template +Result +MultimodalRunner::get_decoder_model_version() { + if (!is_loaded()) { + stats_.model_load_start_ms = time_in_ms(); + ET_CHECK_OK_OR_RETURN_ERROR(load()); + stats_.model_load_end_ms = time_in_ms(); + } + return decoder_model_version_; +} + +// Explicit instantiations +template class MultimodalRunner; +template class MultimodalRunner; + +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_runner.h b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_runner.h new file mode 100644 index 00000000000..b7967964de2 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_runner.h @@ -0,0 +1,157 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +// Multimodal runner that extends the base llama runner with vision capabilities + +#pragma once + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace example { + +// Extend DecoderModelVersion enum with multimodal models +enum MultimodalDecoderModelVersion { + kSmolvlm = 0, + kInternvl3, +}; + +enum KvBitWidth { + kWidth8 = 8, + kWidth16 = 16, +}; + +template +class MultimodalRunner : public executorch::extension::llm::IRunner { + public: + explicit MultimodalRunner( + std::unique_ptr module, + std::unique_ptr embedding_module, + const std::string& decoder_model, + const std::string& model_path, + const std::string& tokenizer_path, + const std::string& performance_output_path, + const std::string& dump_logits_path, + const float temperature = 0.8f, + const int eval_mode = EvalMode::kHybrid, + const bool shared_buffer = false, + const int ngram = 0, + const int window = 0, + const int gcap = 0, + std::unique_ptr image_hidden_states = nullptr); + + bool is_loaded() const override; + executorch::runtime::Error load() override; + + // Override generate to support multimodal inputs + executorch::runtime::Error generate( + const std::string& prompt, + const executorch::extension::llm::GenerationConfig& config, + std::function token_callback = {}, + std::function stats_callback = {}) + override; + + // Multimodal-specific generation with image embeddings + executorch::runtime::Error generate_from_prompt_or_file( + const std::string& prompt, + bool tokenized_prompt, + const executorch::extension::llm::GenerationConfig& config, + std::function token_callback = {}, + std::function stats_callback = {}); + void stop() override {}; + void reset() override {}; + executorch::runtime::Result + get_decoder_model_version(); + + // Multimodal-specific method for merging embeddings + void merge_multimodal_embeddings( + const std::vector& input_ids, + const TensorStruct& text_embeddings, + uint64_t placeholder_token_id); + + private: + enum EvalMode { + kKVCached = 0, + kHybrid, + kLookaheadDecoding, + kUnsupported, + }; + + // Modules + std::unique_ptr module_; + std::unique_ptr embedding_module_; + + int32_t context_len_{0}; + + int ngram_{0}; + int window_{0}; + int gcap_{0}; + + // Defaults to StaticCahce, indicating that the model does not use a + // global/local architecture. 
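+  // When the exported graph uses a global/local (sliding-window) attention
+  // layout, this is expected to be CacheMode::HybridCache instead; in that
+  // case the token generator allocates an extra window attention mask input
+  // and load() reads the window size from the optional get_sliding_window
+  // metadata method.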
+ CacheMode cache_mode_{CacheMode::StaticCahce}; + int64_t cur_pos_{0}; + + std::string tokenizer_path_; + std::string performance_output_path_; + std::string dump_logits_path_; + float temperature_; + EvalMode eval_mode_; + bool shared_buffer_; + + MultimodalDecoderModelVersion decoder_model_version_; + std::unique_ptr buffer_manager_; + std::unique_ptr> kv_manager_; + std::unique_ptr tokenizer_; + std::unique_ptr decoder_runner_; + std::unique_ptr> prompt_processor_; + std::unique_ptr> token_generator_; + std::unique_ptr embedding_runner_; + std::unique_ptr embedding_processor_; + std::unique_ptr embedding_generator_; + + // Image hidden states storage + std::unique_ptr image_hidden_states_; + + // Multimodal embeddings storage + std::vector multimodal_embeddings_buffer_; + std::vector + multimodal_embeddings_sizes_; + std::vector + multimodal_embeddings_dim_order_; + TensorStruct merged_embeddings_; + + // scale and zero point for quantized KV cache + std::vector input_k_cache_scales_; + std::vector input_k_cache_zero_points_; + std::vector input_v_cache_scales_; + std::vector input_v_cache_zero_points_; + std::vector output_k_cache_scales_; + std::vector output_k_cache_zero_points_; + std::vector output_v_cache_scales_; + std::vector output_v_cache_zero_points_; + + // stats + executorch::llm::Stats stats_; +}; +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_token_generator.cpp b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_token_generator.cpp new file mode 100644 index 00000000000..89b8614d407 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_token_generator.cpp @@ -0,0 +1,216 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +using executorch::aten::TensorImpl; +using executorch::runtime::MethodMeta; +using executorch::runtime::Result; +using executorch::runtime::TensorInfo; + +namespace example { +// Constructor with embedding runner support +template +MultimodalTokenGenerator::MultimodalTokenGenerator( + tokenizers::Tokenizer* tokenizer, + EmbeddingProcessor* embedding_runner, + DecoderRunner* decoder_runner, + KVManager* kv_manager, + const std::string& method_name, + std::unique_ptr>&& eos_ids, + Metadata metadata, + executorch::llm::Stats* stats) + : TokenGenerator( + tokenizer, + decoder_runner, + kv_manager, + method_name, + std::move(eos_ids), + {metadata.context_len, + metadata.num_heads, + metadata.num_layers, + metadata.ar_len, + metadata.vocab_size, + metadata.use_int64_token, + metadata.sliding_window, + metadata.cache_mode}, + stats), + embedding_runner_(embedding_runner), + metadata_(metadata) { + // Set input_toks_.size to 0 since we use embeddings instead + input_toks_.size = 0; + input_embedding_.size = + metadata_.ar_len * metadata_.embedding_dim * sizeof(float); +} + +template +void MultimodalTokenGenerator::init_io( + IMemAlloc* buffer_manager, + Result method_meta) { + size_t idx = 0; + input_tensors_.reserve(method_meta->num_inputs()); + output_tensors_.reserve(method_meta->num_outputs()); + + // [I]: input embedding + Result input_embedding = method_meta->input_tensor_meta(idx++); + input_embedding_.data = + reinterpret_cast(buffer_manager->allocate(input_embedding_.size)); + input_embedding_.tensor = std::make_unique( + input_embedding->scalar_type(), + input_embedding->sizes().size(), + const_cast(input_embedding->sizes().data()), + input_embedding_.data, + const_cast( + input_embedding->dim_order().data())); + input_tensors_.emplace_back(input_embedding_.tensor.get()); + buffer_manager->add_memory_info( + input_embedding_.data, input_embedding_.size, input_embedding.get()); + + // [I]: attention_mask + Result attention_mask = method_meta->input_tensor_meta(idx++); + attention_mask_.data = reinterpret_cast( + buffer_manager->allocate(attention_mask_.size)); + attention_mask_.tensor = std::make_unique( + attention_mask->scalar_type(), + attention_mask->sizes().size(), + const_cast(attention_mask->sizes().data()), + attention_mask_.data, + const_cast( + attention_mask->dim_order().data())); + input_tensors_.emplace_back(attention_mask_.tensor.get()); + buffer_manager->add_memory_info( + attention_mask_.data, attention_mask_.size, attention_mask.get()); + + // [I]: sliding window attention_mask + if (metadata_.cache_mode == CacheMode::HybridCache) { + Result window_attention_mask = + method_meta->input_tensor_meta(idx++); + window_attention_mask_.data = reinterpret_cast( + buffer_manager->allocate(window_attention_mask_.size)); + window_attention_mask_.tensor = std::make_unique( + window_attention_mask->scalar_type(), + window_attention_mask->sizes().size(), + const_cast( + window_attention_mask->sizes().data()), + window_attention_mask_.data, + const_cast( + window_attention_mask->dim_order().data())); + input_tensors_.emplace_back(window_attention_mask_.tensor.get()); + buffer_manager->add_memory_info( + window_attention_mask_.data, + window_attention_mask_.size, + window_attention_mask.get()); + } + + // [I]: input_pos + Result input_pos = method_meta->input_tensor_meta(idx++); + input_pos_.data = + reinterpret_cast(buffer_manager->allocate(input_pos_.size)); + input_pos_.tensor = std::make_unique( + input_pos->scalar_type(), + input_pos->sizes().size(), + 
const_cast(input_pos->sizes().data()), + input_pos_.data, + const_cast(input_pos->dim_order().data())); + input_tensors_.emplace_back(input_pos_.tensor.get()); + buffer_manager->add_memory_info( + input_pos_.data, input_pos_.size, input_pos.get()); + + // [I] kv_cache + size_t index = idx; // bypass input_tokens, atten_mask, input_pos + for (int cache_group = 0; cache_group < 2; ++cache_group) { + std::vector>& cache = + (cache_group == 0 ? k_cache_in_ : v_cache_in_); + std::vector> cache_ptrs = (cache_group == 0) + ? kv_manager_->get_k_cache_() + : kv_manager_->get_v_cache_(); + for (int layer = 0; layer < metadata_.num_layers; ++layer, ++index) { + Result kv_cache = method_meta->input_tensor_meta(index); + + T* cache_ptr = cache_ptrs[layer].buffer; + + cache[layer] = std::make_unique( + kv_cache->scalar_type(), + kv_cache->sizes().size(), + const_cast(kv_cache->sizes().data()), + cache_ptr, + const_cast(kv_cache->dim_order().data())); + input_tensors_.emplace_back(cache[layer].get()); + buffer_manager->add_memory_info( + cache_ptr, cache[layer]->nbytes(), kv_cache.get()); + } + } + + // [O]: logits + Result logits = method_meta->output_tensor_meta(0); + logits_.data = + reinterpret_cast(buffer_manager->allocate(logits_.size)); + logits_.tensor = std::make_unique( + logits->scalar_type(), + logits->sizes().size(), + const_cast(logits->sizes().data()), + logits_.data, + const_cast(logits->dim_order().data())); + output_tensors_.emplace_back(logits_.tensor.get()); + buffer_manager->add_memory_info(logits_.data, logits_.size, logits.get()); + + // [O] kv_cache + index = 1; + for (int cache_group = 0; cache_group < 2; ++cache_group) { + std::vector>& cache = + (cache_group == 0 ? k_cache_out_ : v_cache_out_); + std::vector> cache_ptrs = (cache_group == 0) + ? kv_manager_->get_k_cache_() + : kv_manager_->get_v_cache_(); + for (int layer = 0; layer < metadata_.num_layers; ++layer, ++index) { + Result kv_cache = method_meta->output_tensor_meta(index); + T* cache_ptr = cache_ptrs[layer].output_buffer; + cache[layer] = std::make_unique( + kv_cache->scalar_type(), + kv_cache->sizes().size(), + const_cast(kv_cache->sizes().data()), + cache_ptr, + const_cast(kv_cache->dim_order().data())); + output_tensors_.emplace_back(cache[layer].get()); + buffer_manager->add_memory_info( + cache_ptr, cache[layer]->nbytes(), kv_cache.get()); + } + } + + // Prepare the vector of EValue to run inference + inputs_.reserve(input_tensors_.size()); + for (auto& input_tensor : input_tensors_) { + inputs_.emplace_back(std::move(input_tensor)); + } +} + +// This function only considers the case where token_generator_ar_len equals 1. 
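+// Unlike the text-only TokenGenerator, which feeds raw token ids, this
+// override has to turn the freshly sampled token into an embedding before
+// every decode step: the token is pushed through the token-embedding graph
+// via the embedding runner, and the resulting ar_len x embedding_dim block of
+// floats is copied into the preallocated input buffer. Only the position id
+// then needs refreshing, since the KV-cache tensors were already bound in
+// init_io().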
+template +void MultimodalTokenGenerator::prepare_io( + uint64_t cur_token, + int64_t start_pos) { + // Generate embedding for current token using embedding runner + embedding_runner_->prefill({cur_token}); + const TensorStruct& text_embeddings = + embedding_runner_->get_prompt_embeddings(); + int64_t embedding_dim = text_embeddings.tensor->size(2); + // Copy embedding to input buffer + std::memcpy( + input_embedding_.data, + text_embeddings.data, + metadata_.ar_len * embedding_dim * sizeof(float)); + + // update position_ids + *input_pos_.data = static_cast(start_pos); +} + +// Explicit instantiations +template class MultimodalTokenGenerator; +template class MultimodalTokenGenerator; + +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_token_generator.h b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_token_generator.h new file mode 100644 index 00000000000..b010bf3748e --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_token_generator.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once +#include +#include + +namespace example { + +/** + * @class MultimodalTokenGenerator + * @brief Extended TokenGenerator with multimodal embedding support + */ +template +class MultimodalTokenGenerator : public example::TokenGenerator { + public: + struct Metadata { + int32_t context_len; + int64_t num_heads; + int64_t num_layers; + int32_t ar_len; + int32_t vocab_size; + bool use_int64_token; + int sliding_window; + CacheMode cache_mode; + int32_t embedding_dim = 0; + }; + + // Constructor with embedding generator support + MultimodalTokenGenerator( + tokenizers::Tokenizer* tokenizer, + EmbeddingProcessor* embedding_runner, + DecoderRunner* decoder_runner, + KVManager* kv_manager, + const std::string& method_name, + std::unique_ptr>&& eos_ids, + Metadata metadata, + executorch::llm::Stats* stats); + + virtual ~MultimodalTokenGenerator() = default; + + /** + * @brief Initialize I/O tensor and allocate I/O data buffer with embedding + * support. 
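+   *
+   * The expected I/O layout, as wired up by this method, is: inputs =
+   * [input embedding, attention mask, (window attention mask when
+   * cache_mode is HybridCache), input position, per-layer K caches,
+   * per-layer V caches]; outputs = [logits, per-layer K caches,
+   * per-layer V caches].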
+   */
+  void init_io(
+      IMemAlloc* buffer_manager,
+      executorch::runtime::Result<executorch::runtime::MethodMeta> method_meta)
+      override;
+
+  inline const size_t total_token_generator_io_size_in_bytes() const {
+    if (metadata_.cache_mode == CacheMode::HybridCache) {
+      return input_toks_.size + input_pos_.size + attention_mask_.size +
+          window_attention_mask_.size + logits_.size + input_embedding_.size;
+    } else {
+      return input_toks_.size + input_pos_.size + attention_mask_.size +
+          logits_.size + input_embedding_.size;
+    }
+  }
+
+ protected:
+  // Reuse members from token_generator
+  using TokenGenerator<T>::kv_manager_;
+  using TokenGenerator<T>::input_pos_;
+  using TokenGenerator<T>::attention_mask_;
+  using TokenGenerator<T>::window_attention_mask_;
+  using TokenGenerator<T>::inputs_;
+  using TokenGenerator<T>::input_tensors_;
+  using TokenGenerator<T>::output_tensors_;
+
+  // Additional members specific to multimodal
+  TensorStruct<float> input_embedding_;
+
+ private:
+  // Reuse members from token_generator
+  using TokenGenerator<T>::input_toks_;
+  using TokenGenerator<T>::logits_;
+  using TokenGenerator<T>::k_cache_in_;
+  using TokenGenerator<T>::v_cache_in_;
+  using TokenGenerator<T>::k_cache_out_;
+  using TokenGenerator<T>::v_cache_out_;
+
+  // Additional members specific to multimodal
+  EmbeddingProcessor* embedding_runner_;
+
+  /**
+   * @brief Fill in I/O buffers with the embedding of the current token and
+   * its position.
+   * @param cur_token Current token.
+   * @param start_pos Starting position.
+   */
+  void prepare_io(uint64_t cur_token, int64_t start_pos) override;
+
+  // metadata specific to multimodal
+  Metadata metadata_;
+};
+} // namespace example
diff --git a/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.h b/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.h
index 5c97e510987..bd2e1cfc8d8 100644
--- a/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.h
+++ b/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.h
@@ -84,7 +84,7 @@ class PromptProcessor {
     }
   }
 
- private:
+ protected:
   // If the cache length is zero, it indicates a BERT model, which does not use
   // position ids or KV cache inputs.
   bool is_bert() const {
diff --git a/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.cpp b/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.cpp
index 67d7ec80aab..f89ee7bde66 100644
--- a/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.cpp
+++ b/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.cpp
@@ -23,6 +23,19 @@ RpcMem::RpcMem(
   shared_buffer_base_ptr_ = QnnExecuTorchAllocCustomMem(
       total_bytes, MemoryAllocator::kDefaultAlignment);
 }
+RpcMem::RpcMem(
+    const size_t total_cache_size,
+    const size_t total_prompt_processor_io_size,
+    const size_t total_token_generator_io_size,
+    const size_t total_embedding_processor_io_size,
+    const size_t total_embedding_generator_io_size)
+    : calculated_offsets_(0) {
+  size_t total_bytes = total_cache_size + total_prompt_processor_io_size +
+      total_token_generator_io_size + total_embedding_processor_io_size +
+      total_embedding_generator_io_size;
+  shared_buffer_base_ptr_ = QnnExecuTorchAllocCustomMem(
+      total_bytes, MemoryAllocator::kDefaultAlignment);
+}
 RpcMem::~RpcMem() {
   QnnExecuTorchFreeCustomMem(shared_buffer_base_ptr_);
 }
diff --git a/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.h b/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.h
index 99e9cb1dec1..c3dd54d00c7 100644
--- a/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.h
+++ b/examples/qualcomm/oss_scripts/llama/runner/rpc_mem.h
@@ -19,16 +19,27 @@ namespace example {
  */
 class RpcMem final : public IMemAlloc {
  public:
+  RpcMem(
+      const size_t total_cache_size,
+      const size_t total_prompt_processor_io_size,
+      const size_t total_token_generator_io_size);
   /**
    * @brief Constructor to allocate RpcMem with total sizes.
    * @param total_cache_size Total size of the cache.
    * @param total_prompt_processor_io_size Total size for prompt processor I/O.
    * @param total_token_generator_io_size Total size for token generator I/O.
-   */
+   * @param total_embedding_processor_io_size Total size for embedding prompt
+   *     processor I/O.
+   * @param total_embedding_generator_io_size Total size for embedding
+   *     generator I/O.
+   */
   RpcMem(
       const size_t total_cache_size,
       const size_t total_prompt_processor_io_size,
-      const size_t total_token_generator_io_size);
+      const size_t total_token_generator_io_size,
+      const size_t total_embedding_processor_io_size,
+      const size_t total_embedding_generator_io_size);
+
   // Disable copy constructors, r-value referencing, etc
   RpcMem(const RpcMem&) = delete;
   RpcMem& operator=(const RpcMem&) = delete;
diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp
index 54a5c0b1d5c..aae61cec245 100644
--- a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp
+++ b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp
@@ -214,7 +214,6 @@ Error Runner::load() {
     eos_ids->insert(tokenizer_->encode("<|user|>", 0, 0).get()[0]);
   }
 
-  // Try avoid getMetadataHelper as it is time consuming.
   Result<MethodMeta> method_meta =
       module_->method_meta(token_generator_method_name);
 
diff --git a/examples/qualcomm/oss_scripts/llama/runner/token_generator.h b/examples/qualcomm/oss_scripts/llama/runner/token_generator.h
index 329a4d49cc6..b74e0ac65aa 100644
--- a/examples/qualcomm/oss_scripts/llama/runner/token_generator.h
+++ b/examples/qualcomm/oss_scripts/llama/runner/token_generator.h
@@ -112,14 +112,14 @@ class TokenGenerator {
   // stats
   executorch::llm::Stats* stats_;
 
- private:
   /**
    * @brief Fill in I/O buffers with prompt token and position.
    * @param cur_token Current token.
    * @param start_pos Starting position.
*/ - void prepare_io(uint64_t cur_token, int64_t start_pos); + virtual void prepare_io(uint64_t cur_token, int64_t start_pos); + private: // metadata Metadata metadata_; diff --git a/examples/qualcomm/oss_scripts/llama/wrappers.py b/examples/qualcomm/oss_scripts/llama/wrappers.py index 61190c5d7e9..6be5b702f80 100644 --- a/examples/qualcomm/oss_scripts/llama/wrappers.py +++ b/examples/qualcomm/oss_scripts/llama/wrappers.py @@ -85,6 +85,37 @@ from transformers import AutoConfig, AutoModel +def is_node_src_start_with_name(node: torch.fx.Node, prefix: str) -> bool: + """ + Return True if any NodeSource in node.meta['from_node'] + has a `name` starting with `prefix`. + """ + + def has_source_name_prefix( + node_src: torch.fx.traceback.NodeSource, prefix: str + ) -> bool: + + name = getattr(node_src, "name", None) + if isinstance(name, str) and name.startswith(prefix): + return True + + children = getattr(node_src, "from_node", None) + if not children: + return False + + for src in children: + if has_source_name_prefix(src, prefix): + return True + + return False + + node_srcs = node.meta.get("from_node", None) + if not node_srcs: + return False + + return any(has_source_name_prefix(node_src, prefix) for node_src in node_srcs) + + def log_info(func): class TimeIt: def __init__(self, event): @@ -173,7 +204,7 @@ def __init__( self.mode = mode self.passes_job = get_capture_program_passes() self.dep_table = get_passes_dependency_for_capture_program() - self.meta = None + self.meta = {} self.quant_recipe: StaticLLMQuantRecipe = ( self.config.quant_recipe(True) if self.config.quant_recipe else None ) @@ -203,7 +234,7 @@ def __init__( self.meta = self.decoder.get_metadata() # check if sharding required - if instance and self.config.num_sharding > 1: + if self.decoder and self.config.num_sharding > 1: SplitGraph, setting = model_sharding.get_split_graph_pass( self.meta["get_n_layers"], shares=self.config.num_sharding, @@ -237,7 +268,6 @@ def _prepare_model(self): # noqa: C901 if (instance := self._get_model_instance()) is None: return None tok_embedding, decoder = instance - # load parameters for HF models if self.control_args.checkpoint is None: checkpoint = download_and_convert_hf_checkpoint( @@ -343,7 +373,6 @@ def permute(w, heads, partial_rotary_dim): self.passes_job[I64toI32][QCOM_PASS_ARGS_KWARGS_DEFAULTS_KEY][ "skip_node" ] = {"tokens"} - if self.apply_embedding: tok_embedding = get_quant_embedding_transform( embedding_quantize=self.control_args.embedding_quantize @@ -360,13 +389,11 @@ def permute(w, heads, partial_rotary_dim): def _get_model_specific_kwargs(self): """ Retrieve model-specific config required for Static LLaMA. - This method handles architecture-specific requirements for both Vision-Language Models (VLMs) and Language-only Models (LLMs), extracting necessary config from HuggingFace configs. """ kwargs = {} - # Vision-Language Model (VLM) # For multimodal models, we need the special token ID that represents image placeholders # in the input sequence. This token is used to mark positions where image embeddings @@ -374,14 +401,12 @@ def _get_model_specific_kwargs(self): if hasattr(self.config, VISION_ENCODER): hf_config = AutoConfig.from_pretrained(self.config.repo_id) kwargs["modality_placeholder_token_id"] = hf_config.image_token_id - # TODO: Support Audio modality elif hasattr(self.config, AUDIO_ENCODER): raise NotImplementedError( "Audio encoder modality is not currently supported. " "Please provide a valid modality_placeholder_token_id in kwargs." 
) - # Language-only Model (LLM) configuration # Handle architecture-specific parameters for models that require special configurations # beyond the general Static LLaMA architecture @@ -440,14 +465,13 @@ def _get_model_instance(self) -> LlamaModel: self.config.repo_id, _attn_implementation="eager" ) tok_embedding = TextEmbedding( - auto_model.get_input_embeddings(), + auto_model.get_input_embeddings().to(torch.float32), self.model_args.max_batch_size, ar_len, self.model_args.vocab_size, self.model_args.dim, use_i64_token, ) - # get decoder model self.model_args.max_batch_size = 1 self.model_args.max_seq_len = self.control_args.max_seq_len @@ -471,6 +495,7 @@ def _get_model_instance(self) -> LlamaModel: # get example input self.meta = decoder.get_metadata() self.example_input = decoder.get_example_inputs() + self.get_example_inputs = decoder.get_example_inputs self.export_input = ( self.example_input[0], # tokens or hidden_states *self.example_input[1], # attn_mask @@ -508,6 +533,62 @@ def _save_logits_quant_attrs(self): self.meta["get_logits_zero_point"] = output_node.args[2] break + def _save_input_kv_cache_quant_attrs(self): + input_kv_cache_shape = { + # single head, k input + ( + self.meta["get_head_dim"], + self.meta["get_max_seq_len"] - self.meta["get_ar_len"], + ), + # single head, v input + ( + self.meta["get_max_seq_len"] - self.meta["get_ar_len"], + self.meta["get_head_dim"], + ), + } + + idx = 0 + for node in self.decoder.graph.nodes: + if ( + node.op == "placeholder" + and len(users := list(node.users)) == 1 + and "val" in node.meta + ): + if node.meta["val"].size()[-2:] in input_kv_cache_shape: + scale_cache_name = f"get_k_scale_input_{idx}" + zero_point_cache_name = f"get_k_zero_point_input_{idx}" + if idx >= self.meta["get_n_layers"]: + scale_cache_name = ( + f"get_v_scale_input_{idx % self.meta['get_n_layers']}" + ) + zero_point_cache_name = ( + f"get_v_zero_point_input_{idx % self.meta['get_n_layers']}" + ) + self.meta[scale_cache_name] = users[0].args[1] + self.meta[zero_point_cache_name] = users[0].args[2] + idx += 1 + + def _save_output_kv_cache_quant_attrs(self): + output_kv_cache_shape = { + (self.meta["get_head_dim"], self.meta["get_ar_len"]), + (self.meta["get_ar_len"], self.meta["get_head_dim"]), + } + k_idx = 0 + v_idx = 0 + for node in self.decoder.graph.nodes: + if not is_graph_output(node): + continue + cache_output_node = node.args[0].args[0] + if cache_output_node.meta["val"].size()[-2:] in output_kv_cache_shape: + if is_node_src_start_with_name(cache_output_node, "k_"): + self.meta[f"get_k_scale_output_{k_idx}"] = node.args[1] + self.meta[f"get_k_zero_point_output_{k_idx}"] = node.args[2] + k_idx += 1 + elif is_node_src_start_with_name(cache_output_node, "v_"): + self.meta[f"get_v_scale_output_{v_idx}"] = node.args[1] + self.meta[f"get_v_zero_point_output_{v_idx}"] = node.args[2] + v_idx += 1 + def _tag_ios(self, node, fixed_point_type): # shape of k caches and v caches kv_cache_shape = { @@ -601,7 +682,7 @@ def _calibrate( if has_task_calibration and not is_multimodal: graph_module_inference( use_kv_cache=self.meta["get_use_kv_cache"], - example_input=self.example_input, + get_example_inputs=self.get_example_inputs, module=model, tokenizer=tokenizer, ar_len=self.meta["get_ar_len"], @@ -627,7 +708,7 @@ def _calibrate( for prompt in user_calibration_data: graph_module_inference( use_kv_cache=self.meta["get_use_kv_cache"], - example_input=self.example_input, + get_example_inputs=self.get_example_inputs, hidden_states=intermediate_outputs, # hidden_states for 
multimodal module=model, tok_embedding=tok_embedding, @@ -739,26 +820,36 @@ def quantize(self, request: Request): # noqa: C901 intermediate_outputs=image_embedding, ) - # propagate kv cache quantization attributes for prefill model - if self.mode == self.Mode.DECODE: - kv_quant_attrs, output_indices = {}, 0 - for node in self.decoder.graph.nodes: - if node.op == "output": - for output in node.args[0]: - kv_quant_attrs[output_indices] = output.args[1:] - output_indices += 1 - break - - data.custom_annotation += ( - partial( - annotate_prefill_kv_output, - kv_quant_attrs=kv_quant_attrs, - ), - ) - # save logit's quantization attributes to meta self._save_logits_quant_attrs() + # LLM: propagate kv cache quantization attributes for prefill model + if not self.apply_embedding: + if self.mode == self.Mode.DECODE: + kv_quant_attrs, output_indices = {}, 0 + for node in self.decoder.graph.nodes: + if node.op == "output": + for output in node.args[0]: + kv_quant_attrs[output_indices] = output.args[1:] + output_indices += 1 + break + + data.custom_annotation += ( + partial( + annotate_prefill_kv_output, + kv_quant_attrs=kv_quant_attrs, + ), + ) + # MultiModal: save kv cache IO quantization attributes to requant kv cache from prefill output scale/zero_point to decode input scale/zero_point + else: + # save input kv cache's quantization attributes to meta + if self.mode == self.Mode.DECODE: + self._save_input_kv_cache_quant_attrs() + + # save output kv cache's quantization attributes to meta + if self.mode == self.Mode.PREFILL: + self._save_output_kv_cache_quant_attrs() + # setup quantized IO self.passes_job[TagQuantIO][QCOM_PASS_ACTIVATE_KEY] = True self.passes_job[TagQuantIO][QCOM_PASS_ARGS_KWARGS_DEFAULTS_KEY][ @@ -909,7 +1000,7 @@ def compile(self, request: Request): # noqa: C901 module=dict(zip(graph_names, [model.decoder for model in models])), inputs=dict(zip(graph_names, example_inputs)), compiler_specs=dict(zip(graph_names, data.compile_spec)), - constant_methods=self.decode.meta, + constant_methods={**self.prefill.meta, **self.decode.meta}, dep_table=dict(zip(graph_names, [model.dep_table for model in models])), passes_job=dict(zip(graph_names, [model.passes_job for model in models])), skip_node_op_set={"llama.fallback.default"},