backends/qualcomm/tests/test_qnn_delegate.py (137 changes: 74 additions & 63 deletions)
@@ -6529,70 +6529,55 @@ def test_qwen2_5(self):


 class TestExampleMultimodalityScript(TestQNN):
-    def test_smolvlm_500m_instruct(self):
-        if not self.required_envs():
-            self.skipTest("missing required envs")
-
-        prompt = "Can you describe this image?"
-        cmds = [
-            "python",
-            f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
-            "--artifact",
-            self.artifact_dir,
-            "--build_folder",
-            self.build_folder,
-            "--model",
-            self.model,
-            "--ip",
-            self.ip,
-            "--port",
-            str(self.port),
-            "--prompt",
-            prompt,
-            "--temperature",
-            "0",
-            "--decoder_model",
-            "smolvlm_500m_instruct",
-            "--model_mode",
-            "kv",
-            "--max_seq_len",
-            "128",
-        ]
-        if self.compile_only:
-            cmds.extend(["--compile_only"])
-        elif self.device:
-            cmds.extend(["--device", self.device])
-        if self.host:
-            cmds.extend(["--host", self.host])
-        elif self.enable_x86_64:
-            cmds.extend(["--enable_x86_64"])
-        if self.pre_gen_pte:
-            cmds.extend(["--pre_gen_pte", self.pre_gen_pte])
+    @dataclass(frozen=True)
+    class MLLMSpecs:
+        max_seq_len: int
+        sm8650_token_rate: float
+        sm8750_token_rate: float
+        encoder_pte_size: float
+        text_embedding_pte_size: float
+        decoder_pte_size: float
 
-        p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
-        with Listener((self.ip, self.port)) as listener:
-            conn = listener.accept()
-            p.communicate()
-            msg = json.loads(conn.recv())
-            if "Error" in msg:
-                self.fail(msg["Error"])
-            else:
-                if not self.enable_x86_64:
-                    encoder_pte_size = msg["encoder_pte_size"]
-                    text_embedding_pte_size = msg["text_embedding_pte_size"]
-                    decoder_pte_size = msg["pte_size"]
-                    self.assertLessEqual(encoder_pte_size, 110_000_000) # 110MB
-                    self.assertLessEqual(text_embedding_pte_size, 100_000_000) # 100MB
-                    self.assertLessEqual(decoder_pte_size, 400_000_000) # 400MB
-                    print(f"Encoder PTE Size: {encoder_pte_size} bytes")
-                    print(f"Text Embedding PTE Size: {text_embedding_pte_size} bytes")
-                    print(f"Decoder PTE Size: {decoder_pte_size} bytes")
+    @dataclass(frozen=True)
+    class VLMSpecs(MLLMSpecs):
+        image_path: str
+        golden_image_feature: str
 
-    def test_internvl3_1b(self):
-        if not self.required_envs():
+    # TODO: refactor to support different backends
+    def setUp(self):
+        self.vlm_specs = {
+            "smolvlm_500m_instruct": TestExampleMultimodalityScript.VLMSpecs(
+                max_seq_len=128,
+                sm8650_token_rate=50,
+                sm8750_token_rate=55,
+                encoder_pte_size=110_000_000, # 110MB
+                text_embedding_pte_size=100_000_000, # 100MB
+                decoder_pte_size=400_000_000, # 400MB
+                image_path="https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg", # New York Bay
+                golden_image_feature="city",
+            ),
+            "internvl3_1b": TestExampleMultimodalityScript.VLMSpecs(
+                max_seq_len=320,
+                sm8650_token_rate=11,
+                sm8750_token_rate=13,
+                encoder_pte_size=425_000_000, # 425MB
+                text_embedding_pte_size=300_000_000, # 300MB
+                decoder_pte_size=550_000_000, # 550MB
+                image_path="http://images.cocodataset.org/val2017/000000039769.jpg", # Two cats lying on a blanket
+                golden_image_feature="cats",
+            ),
+        }
+
+    def test_static_vlm(self):
+        if not self.required_envs([self.model_name]):
             self.skipTest("missing required envs")
 
+        vlm_specs: TestExampleMultimodalityScript.VLMSpecs = self.vlm_specs[
+            self.model_name
+        ]
         prompt = "Can you describe this image?"
+        image_path = vlm_specs.image_path
         cmds = [
             "python",
             f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
@@ -6608,14 +6593,16 @@ def test_internvl3_1b(self):
             str(self.port),
             "--prompt",
             prompt,
+            "--image_path",
+            image_path,
             "--temperature",
             "0",
             "--decoder_model",
-            "internvl3_1b",
+            f"{self.model_name}",
             "--model_mode",
             "kv",
             "--max_seq_len",
-            "320",
+            f"{vlm_specs.max_seq_len}",
         ]
         if self.compile_only:
             cmds.extend(["--compile_only"])
@@ -6636,17 +6623,41 @@ def test_internvl3_1b(self):
if "Error" in msg:
self.fail(msg["Error"])
else:
if not self.compile_only:
model_out = msg["result"][0]
self.assertTrue(
vlm_specs.golden_image_feature in model_out,
f"Expected Output contains feature: '{vlm_specs.golden_image_feature}' Actual Output: '{model_out}'",
)
print(f"Image Path: {image_path}")
print(f"Query: {prompt}")
print(f"Answer: {model_out}")
if not self.enable_x86_64:
encoder_pte_size = msg["encoder_pte_size"]
text_embedding_pte_size = msg["text_embedding_pte_size"]
decoder_pte_size = msg["pte_size"]
self.assertLessEqual(encoder_pte_size, 425_000_000) # 425MB
self.assertLessEqual(text_embedding_pte_size, 300_000_000) # 300MB
self.assertLessEqual(decoder_pte_size, 550_000_000) # 550MB
self.assertLessEqual(encoder_pte_size, vlm_specs.encoder_pte_size)
self.assertLessEqual(
text_embedding_pte_size, vlm_specs.text_embedding_pte_size
)
self.assertLessEqual(decoder_pte_size, vlm_specs.decoder_pte_size)
print(f"Encoder PTE Size: {encoder_pte_size} bytes")
print(f"Text Embedding PTE Size: {text_embedding_pte_size} bytes")
print(f"Decoder PTE Size: {decoder_pte_size} bytes")

attr_name = f"{self.model.lower()}_token_rate"
if (
not self.compile_only
and not self.enable_x86_64
and hasattr(vlm_specs, attr_name)
):
device_inference_speed = msg["inference_speed"]
expected_inference_speed = getattr(vlm_specs, attr_name)
print(f"Prompt Evaluation: {device_inference_speed} tokens/second")
self.assertGreaterEqual(
device_inference_speed, expected_inference_speed
)


class TestExampleOssScript(TestQNN):
def test_albert(self):
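For context, the test refactor above replaces the two per-model tests with a single spec-driven test_static_vlm: each decoder model maps to a frozen VLMSpecs dataclass of limits, and the per-SoC token-rate floor is resolved by attribute name with getattr. Below is a minimal standalone sketch of that lookup pattern; the names VLM_SPECS and expected_token_rate are illustrative, and the assumption that self.model carries a SoC string such as "SM8650" is inferred from the sm8650_token_rate/sm8750_token_rate field names rather than stated in the diff.

    from dataclasses import dataclass
    from typing import Optional


    @dataclass(frozen=True)
    class VLMSpecs:
        max_seq_len: int
        sm8650_token_rate: float
        sm8750_token_rate: float


    # Mirrors the spec table built in setUp above (PTE-size fields omitted for brevity).
    VLM_SPECS = {
        "smolvlm_500m_instruct": VLMSpecs(max_seq_len=128, sm8650_token_rate=50, sm8750_token_rate=55),
        "internvl3_1b": VLMSpecs(max_seq_len=320, sm8650_token_rate=11, sm8750_token_rate=13),
    }


    def expected_token_rate(model_name: str, soc_model: str) -> Optional[float]:
        # Same resolution the test performs: lower-case the SoC name to form the
        # attribute name ("SM8650" -> "sm8650_token_rate"), then fetch it with getattr.
        specs = VLM_SPECS[model_name]
        attr_name = f"{soc_model.lower()}_token_rate"
        return getattr(specs, attr_name) if hasattr(specs, attr_name) else None


    assert expected_token_rate("internvl3_1b", "SM8650") == 11
    assert expected_token_rate("smolvlm_500m_instruct", "SM8750") == 55

Because hasattr guards the lookup, a SoC without a corresponding field simply skips the speed check, which is why the test only asserts on inference_speed when the attribute exists.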
examples/qualcomm/oss_scripts/llama/CMakeLists.txt (57 changes: 57 additions & 0 deletions)
@@ -81,3 +81,60 @@ target_compile_options(qnn_llama_runner PUBLIC ${_common_compile_options})
 set_target_properties(
   qnn_llama_runner PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'"
 )
+
+# preprocess qnn runner src files for multimodal
+set(_multimodal_runner__srcs ${_llama_runner__srcs})
+list(FILTER _multimodal_runner__srcs EXCLUDE REGEX ".*qnn_llama_runner.*")
+list(FILTER _multimodal_runner__srcs EXCLUDE REGEX ".*runner/runner\.(cpp|h)")
+list(
+  PREPEND
+  _multimodal_runner__srcs
+  ${CMAKE_CURRENT_LIST_DIR}/qnn_multimodal_runner.cpp
+  ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_runner.cpp
+  ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_runner.h
+  ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/encoder.cpp
+  ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/encoder.h
+  ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/embedding_runner.cpp
+  ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/embedding_runner.h
+  ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/embedding_processor.cpp
+  ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/embedding_processor.h
+  ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_prompt_processor.cpp
+  ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_prompt_processor.h
+  ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_token_generator.cpp
+  ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_token_generator.h
+  ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_lhd_token_generator.cpp
+  ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_lhd_token_generator.h
+)
+
+list(APPEND _multimodal_runner__srcs)
+
+# build qnn multimodal runner
+add_executable(qnn_multimodal_runner ${_multimodal_runner__srcs})
+target_include_directories(
+  qnn_multimodal_runner PUBLIC ${_common_include_directories}
+)
+target_include_directories(
+  qnn_multimodal_runner
+  PUBLIC ${EXECUTORCH_ROOT}/extension/llm/tokenizers/include
+)
+target_compile_options(qnn_multimodal_runner PUBLIC ${_common_compile_options})
+
+target_link_libraries(
+  qnn_multimodal_runner
+  qnn_executorch_backend
+  executorch_core
+  extension_data_loader
+  extension_flat_tensor
+  extension_llm_runner
+  extension_module
+  extension_tensor
+  gflags
+  custom_ops
+  quantized_ops_lib
+  quantized_kernels
+  tokenizers::tokenizers
+)
+
+set_target_properties(
+  qnn_multimodal_runner PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'"
+)