backends/qualcomm/tests/test_qnn_delegate.py (137 changes: 74 additions & 63 deletions)
@@ -6529,70 +6529,55 @@ def test_qwen2_5(self):


 class TestExampleMultimodalityScript(TestQNN):
-    def test_smolvlm_500m_instruct(self):
-        if not self.required_envs():
-            self.skipTest("missing required envs")
-
-        prompt = "Can you describe this image?"
-        cmds = [
-            "python",
-            f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
-            "--artifact",
-            self.artifact_dir,
-            "--build_folder",
-            self.build_folder,
-            "--model",
-            self.model,
-            "--ip",
-            self.ip,
-            "--port",
-            str(self.port),
-            "--prompt",
-            prompt,
-            "--temperature",
-            "0",
-            "--decoder_model",
-            "smolvlm_500m_instruct",
-            "--model_mode",
-            "kv",
-            "--max_seq_len",
-            "128",
-        ]
-        if self.compile_only:
-            cmds.extend(["--compile_only"])
-        elif self.device:
-            cmds.extend(["--device", self.device])
-        if self.host:
-            cmds.extend(["--host", self.host])
-        elif self.enable_x86_64:
-            cmds.extend(["--enable_x86_64"])
-        if self.pre_gen_pte:
-            cmds.extend(["--pre_gen_pte", self.pre_gen_pte])
+    @dataclass(frozen=True)
+    class MLLMSpecs:
+        max_seq_len: int
+        sm8650_token_rate: float
+        sm8750_token_rate: float
+        encoder_pte_size: float
+        text_embedding_pte_size: float
+        decoder_pte_size: float
 
-        p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
-        with Listener((self.ip, self.port)) as listener:
-            conn = listener.accept()
-            p.communicate()
-            msg = json.loads(conn.recv())
-            if "Error" in msg:
-                self.fail(msg["Error"])
-            else:
-                if not self.enable_x86_64:
-                    encoder_pte_size = msg["encoder_pte_size"]
-                    text_embedding_pte_size = msg["text_embedding_pte_size"]
-                    decoder_pte_size = msg["pte_size"]
-                    self.assertLessEqual(encoder_pte_size, 110_000_000) # 110MB
-                    self.assertLessEqual(text_embedding_pte_size, 100_000_000) # 100MB
-                    self.assertLessEqual(decoder_pte_size, 400_000_000) # 400MB
-                    print(f"Encoder PTE Size: {encoder_pte_size} bytes")
-                    print(f"Text Embedding PTE Size: {text_embedding_pte_size} bytes")
-                    print(f"Decoder PTE Size: {decoder_pte_size} bytes")
+    @dataclass(frozen=True)
+    class VLMSpecs(MLLMSpecs):
+        image_path: str
+        golden_image_feature: str
 
-    def test_internvl3_1b(self):
-        if not self.required_envs():
+    # TODO: refactor to support different backends
+    def setUp(self):
+        self.vlm_specs = {
+            "smolvlm_500m_instruct": TestExampleMultimodalityScript.VLMSpecs(
+                max_seq_len=128,
+                sm8650_token_rate=50,
+                sm8750_token_rate=55,
+                encoder_pte_size=110_000_000, # 110MB
+                text_embedding_pte_size=100_000_000, # 100MB
+                decoder_pte_size=400_000_000, # 400MB
+                image_path="https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg", # New York Bay
+                golden_image_feature="city",
+            ),
+            "internvl3_1b": TestExampleMultimodalityScript.VLMSpecs(
+                max_seq_len=320,
+                sm8650_token_rate=11,
+                sm8750_token_rate=13,
+                encoder_pte_size=425_000_000, # 425MB
+                text_embedding_pte_size=300_000_000, # 300MB
+                decoder_pte_size=550_000_000, # 550MB
+                image_path="http://images.cocodataset.org/val2017/000000039769.jpg", # Two cats lying on a blanket
+                golden_image_feature="cats",
+            ),
+        }
+
+    def test_static_vlm(self):
+        if not self.required_envs([self.model_name]):
             self.skipTest("missing required envs")
 
+        vlm_specs: TestExampleMultimodalityScript.VLMSpecs = self.vlm_specs[
+            self.model_name
+        ]
         prompt = "Can you describe this image?"
+        image_path = vlm_specs.image_path
         cmds = [
             "python",
             f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
@@ -6608,14 +6593,16 @@ def test_internvl3_1b(self):
             str(self.port),
             "--prompt",
             prompt,
+            "--image_path",
+            image_path,
             "--temperature",
             "0",
             "--decoder_model",
-            "internvl3_1b",
+            f"{self.model_name}",
             "--model_mode",
             "kv",
             "--max_seq_len",
-            "320",
+            f"{vlm_specs.max_seq_len}",
         ]
         if self.compile_only:
             cmds.extend(["--compile_only"])
@@ -6636,17 +6623,41 @@ def test_internvl3_1b(self):
if "Error" in msg:
self.fail(msg["Error"])
else:
if not self.compile_only:
model_out = msg["result"][0]
self.assertTrue(
vlm_specs.golden_image_feature in model_out,
f"Expected Output contains feature: '{vlm_specs.golden_image_feature}' Actual Output: '{model_out}'",
)
print(f"Image Path: {image_path}")
print(f"Query: {prompt}")
print(f"Answer: {model_out}")
if not self.enable_x86_64:
encoder_pte_size = msg["encoder_pte_size"]
text_embedding_pte_size = msg["text_embedding_pte_size"]
decoder_pte_size = msg["pte_size"]
self.assertLessEqual(encoder_pte_size, 425_000_000) # 425MB
self.assertLessEqual(text_embedding_pte_size, 300_000_000) # 300MB
self.assertLessEqual(decoder_pte_size, 550_000_000) # 550MB
self.assertLessEqual(encoder_pte_size, vlm_specs.encoder_pte_size)
self.assertLessEqual(
text_embedding_pte_size, vlm_specs.text_embedding_pte_size
)
self.assertLessEqual(decoder_pte_size, vlm_specs.decoder_pte_size)
print(f"Encoder PTE Size: {encoder_pte_size} bytes")
print(f"Text Embedding PTE Size: {text_embedding_pte_size} bytes")
print(f"Decoder PTE Size: {decoder_pte_size} bytes")

attr_name = f"{self.model.lower()}_token_rate"
if (
not self.compile_only
and not self.enable_x86_64
and hasattr(vlm_specs, attr_name)
):
device_inference_speed = msg["inference_speed"]
expected_inference_speed = getattr(vlm_specs, attr_name)
print(f"Prompt Evaluation: {device_inference_speed} tokens/second")
self.assertGreaterEqual(
device_inference_speed, expected_inference_speed
)


class TestExampleOssScript(TestQNN):
def test_albert(self):
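For context, the test refactor above replaces the two per-model tests with a single spec-driven test_static_vlm: each decoder model maps to a frozen VLMSpecs dataclass of limits, and the per-SoC token-rate floor is resolved by attribute name with getattr. Below is a minimal standalone sketch of that lookup pattern; the names VLM_SPECS and expected_token_rate are illustrative, and the assumption that self.model carries a SoC string such as "SM8650" is inferred from the sm8650_token_rate/sm8750_token_rate field names rather than stated in the diff.

    from dataclasses import dataclass
    from typing import Optional


    @dataclass(frozen=True)
    class VLMSpecs:
        max_seq_len: int
        sm8650_token_rate: float
        sm8750_token_rate: float


    # Mirrors the spec table built in setUp above (PTE-size fields omitted for brevity).
    VLM_SPECS = {
        "smolvlm_500m_instruct": VLMSpecs(max_seq_len=128, sm8650_token_rate=50, sm8750_token_rate=55),
        "internvl3_1b": VLMSpecs(max_seq_len=320, sm8650_token_rate=11, sm8750_token_rate=13),
    }


    def expected_token_rate(model_name: str, soc_model: str) -> Optional[float]:
        # Same resolution the test performs: lower-case the SoC name to form the
        # attribute name ("SM8650" -> "sm8650_token_rate"), then fetch it with getattr.
        specs = VLM_SPECS[model_name]
        attr_name = f"{soc_model.lower()}_token_rate"
        return getattr(specs, attr_name) if hasattr(specs, attr_name) else None


    assert expected_token_rate("internvl3_1b", "SM8650") == 11
    assert expected_token_rate("smolvlm_500m_instruct", "SM8750") == 55

Because hasattr guards the lookup, a SoC without a corresponding field simply skips the speed check, which is why the test only asserts on inference_speed when the attribute exists.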
examples/qualcomm/oss_scripts/llama/CMakeLists.txt (57 changes: 57 additions & 0 deletions)
@@ -81,3 +81,60 @@ target_compile_options(qnn_llama_runner PUBLIC ${_common_compile_options})
 set_target_properties(
   qnn_llama_runner PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'"
 )
+
+# preprocess qnn runner src files for multimodal
+set(_multimodal_runner__srcs ${_llama_runner__srcs})
+list(FILTER _multimodal_runner__srcs EXCLUDE REGEX ".*qnn_llama_runner.*")
+list(FILTER _multimodal_runner__srcs EXCLUDE REGEX ".*runner/runner\.(cpp|h)")
+list(
+  PREPEND
+  _multimodal_runner__srcs
+  ${CMAKE_CURRENT_LIST_DIR}/qnn_multimodal_runner.cpp
+  ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_runner.cpp
+  ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_runner.h
+  ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/encoder.cpp
+  ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/encoder.h
+  ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/embedding_runner.cpp
+  ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/embedding_runner.h
+  ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/embedding_processor.cpp
+  ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/embedding_processor.h
+  ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_prompt_processor.cpp
+  ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_prompt_processor.h
+  ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_token_generator.cpp
+  ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_token_generator.h
+  ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_lhd_token_generator.cpp
+  ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_lhd_token_generator.h
+)
+
+list(APPEND _multimodal_runner__srcs)
+
+# build qnn multimodal runner
+add_executable(qnn_multimodal_runner ${_multimodal_runner__srcs})
+target_include_directories(
+  qnn_multimodal_runner PUBLIC ${_common_include_directories}
+)
+target_include_directories(
+  qnn_multimodal_runner
+  PUBLIC ${EXECUTORCH_ROOT}/extension/llm/tokenizers/include
+)
+target_compile_options(qnn_multimodal_runner PUBLIC ${_common_compile_options})
+
+target_link_libraries(
+  qnn_multimodal_runner
+  qnn_executorch_backend
+  executorch_core
+  extension_data_loader
+  extension_flat_tensor
+  extension_llm_runner
+  extension_module
+  extension_tensor
+  gflags
+  custom_ops
+  quantized_ops_lib
+  quantized_kernels
+  tokenizers::tokenizers
+)
+
+set_target_properties(
+  qnn_multimodal_runner PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'"
+)