14 changes: 7 additions & 7 deletions backends/qualcomm/tests/test_qnn_delegate.py
@@ -6661,7 +6661,7 @@ class MLLMSpecs:
sm8650_token_rate: float
sm8750_token_rate: float
encoder_pte_size: float
text_embedding_pte_size: float
tok_embedding_pte_size: float
decoder_pte_size: float

@dataclass(frozen=True)
@@ -6677,7 +6677,7 @@ def setUp(self):
sm8650_token_rate=50,
sm8750_token_rate=55,
encoder_pte_size=110_000_000, # 110MB
text_embedding_pte_size=100_000_000, # 100MB
tok_embedding_pte_size=100_000_000, # 100MB
decoder_pte_size=400_000_000, # 400MB
image_path="https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg", # New York Bay
golden_image_feature="city",
@@ -6687,7 +6687,7 @@
sm8650_token_rate=11,
sm8750_token_rate=13,
encoder_pte_size=425_000_000, # 425MB
text_embedding_pte_size=300_000_000, # 300MB
tok_embedding_pte_size=300_000_000, # 300MB
decoder_pte_size=550_000_000, # 550 MB
image_path="http://images.cocodataset.org/val2017/000000039769.jpg", # Two cats lying on a blanket
golden_image_feature="cats",
@@ -6759,16 +6759,16 @@ def test_static_vlm(self):
print(f"Answer: {model_out}")
if not self.enable_x86_64:
encoder_pte_size = msg["encoder_pte_size"]
text_embedding_pte_size = msg["text_embedding_pte_size"]
tok_embedding_pte_size = msg["tok_embedding_pte_size"]
decoder_pte_size = msg["pte_size"]
self.assertLessEqual(encoder_pte_size, vlm_specs.encoder_pte_size)
self.assertLessEqual(
text_embedding_pte_size, vlm_specs.text_embedding_pte_size
tok_embedding_pte_size, vlm_specs.tok_embedding_pte_size
)
self.assertLessEqual(decoder_pte_size, vlm_specs.decoder_pte_size)
print(f"Encoder PTE Size: {encoder_pte_size} bytes")
print(f"Text Embedding PTE Size: {text_embedding_pte_size} bytes")
print(f"Decoder PTE Size: {decoder_pte_size} bytes")
print(f"Token Embedding PTE Size: {tok_embedding_pte_size} bytes")
print(f"Text Decoder PTE Size: {decoder_pte_size} bytes")

attr_name = f"{self.model.lower()}_token_rate"
if (
11 changes: 7 additions & 4 deletions examples/qualcomm/oss_scripts/llama/CMakeLists.txt
@@ -94,12 +94,15 @@ list(
${CMAKE_CURRENT_LIST_DIR}/qnn_multimodal_runner.cpp
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_runner.cpp
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_runner.h
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_embedding_merger.cpp
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_embedding_merger.h
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/utils.h
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/encoder.cpp
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/encoder.h
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/embedding_runner.cpp
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/embedding_runner.h
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/embedding_processor.cpp
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/embedding_processor.h
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/tok_embedding_runner.cpp
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/tok_embedding_runner.h
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/tok_embedding_processor.cpp
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/tok_embedding_processor.h
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_prompt_processor.cpp
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_prompt_processor.h
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_token_generator.cpp
31 changes: 31 additions & 0 deletions examples/qualcomm/oss_scripts/llama/README.md
@@ -308,6 +308,37 @@ If you have already compiled a VLM model, you can run inference with pre-generat
python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --decoder_model smolvlm_500m_instruct --model_mode kv --max_seq_len 1024 --prompt "Can you describe this image?" --image_path "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" --pre_gen_pte ${FOLDER_TO_PRE_GEN_PTE}
```

### Multi-Turn Conversation with VLM

The framework supports multi-turn conversations with VLMs, allowing you to conduct dialogues that can involve multiple images.

- **Multi-Turn Prompts**: To engage in a conversation, provide multiple prompts sequentially using the `--prompt` argument. Each string will be treated as a separate turn.
- **Multiple Images**: You can supply multiple images (from URLs or local paths) using the `--image_path` argument.
- **Flexible Image Placement**: Use the `<image>` token within your prompt to specify exactly where each image's embeddings should be placed. The images provided via `--image_path` will replace the `<image>` tokens in the order they appear.

**Example**:

In this example, the first turn compares two images, the second turn asks a follow-up question about the first image, and the third turn asks for a caption for the third image.

```bash
# Define image URLs and prompts for a 3-turn conversation
IMAGE1_URL="https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
IMAGE2_URL="http://images.cocodataset.org/val2017/000000039769.jpg"
IMAGE3_URL="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"

PROMPT1="<image><image>Compare these images above and list the differences."
PROMPT2="Answer the question: What's the main object in the first image?"
PROMPT3="<image>Caption this image."

# Execute the multi-turn conversation
python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --decoder_model smolvlm_500m_instruct --model_mode kv --max_seq_len 2048 --prompt "$PROMPT1" "$PROMPT2" "$PROMPT3" --image_path "$IMAGE1_URL" "$IMAGE2_URL" "$IMAGE3_URL"
```

**How it works:**
- **Turn 1**: The prompt `"<image><image>Compare these images above and list the differences."` uses the first two images (`$IMAGE1_URL`, `$IMAGE2_URL`).
- **Turn 2**: The prompt `"Answer the question: What's the main object in the first image?"` is a text-only follow-up. The conversation context is maintained from the previous turn.
- **Turn 3**: The prompt `"<image>Caption this image."` uses the third image (`$IMAGE3_URL`).
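
For reference, the sketch below is a minimal illustration (not part of the repository; the helper name and return shape are assumptions) of how prompts containing `<image>` placeholders can be paired with the supplied images in order. It loosely mirrors the per-turn `text` / `files_path` structure consumed by the calibration code in `dataset.py`.

```python
# Hypothetical sketch: pair <image> placeholders with images in the order
# they are passed on the command line. Not part of llama.py.
from typing import Dict, List


def assign_images_to_turns(prompts: List[str], image_paths: List[str]) -> List[Dict]:
    """Return one {"text", "files_path"} entry per conversation turn."""
    turns, cursor = [], 0
    for prompt in prompts:
        n_placeholders = prompt.count("<image>")
        files = image_paths[cursor : cursor + n_placeholders]
        if len(files) != n_placeholders:
            raise ValueError("Fewer images supplied than <image> tokens in the prompts.")
        cursor += n_placeholders
        turns.append({"text": prompt, "files_path": files})
    return turns


turns = assign_images_to_turns(
    [
        "<image><image>Compare these images above and list the differences.",
        "Answer the question: What's the main object in the first image?",
        "<image>Caption this image.",
    ],
    ["statue_of_liberty.jpg", "two_cats.jpg", "bee.jpg"],
)
# Turn 1 consumes the first two images, turn 2 is text-only, turn 3 consumes the third.
```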

### VLM Processing Details

The VLM inference pipeline consists of:
95 changes: 54 additions & 41 deletions examples/qualcomm/oss_scripts/llama/dataset.py
@@ -5,25 +5,22 @@
# LICENSE file in the root directory of this source tree.

import argparse
import warnings
from typing import Callable, List, Optional
from typing import Callable, Dict, List, Optional

from executorch.examples.qualcomm.oss_scripts.llama import LLMModelConfig
from executorch.examples.qualcomm.oss_scripts.llama.decoder_constants import (
AUDIO_ENCODER,
TEXT_DECODER,
TEXT_EMBEDDING,
TEXT_ENCODER,
TOK_EMBEDDING,
VISION_ENCODER,
VISION_ENCODER_INPUT_FILENAME,
)

from executorch.examples.qualcomm.oss_scripts.llama.encoder.encoder_config import (
MultiModalityConfig,
VisionModalityConfig,
)
from executorch.examples.qualcomm.oss_scripts.llama.tokenizer import TokenizerWrapper

from transformers import AutoProcessor
from transformers.image_utils import load_image

@@ -43,35 +40,30 @@ def __init__(
self.artifact = control_args.artifact
self.repo_id = config.repo_id

def _build_vision_dataset(self, config: VisionModalityConfig, prompt: str):
def _build_vision_dataset(
self, config: VisionModalityConfig, prompt: str, files_path: List[str]
):
"""
This processes images using the HuggingFace processor and returns
the processed pixel values for runtime evaluation.

Args:
config (VisionModalityConfig): containing image URL and resize parameters
prompt (str): Text prompt to be processed alongside the image
prompt (str): Text prompt
files_path (List[str]): List of file paths for images. Each path can be either a URL or a local file path.

Returns:
List of single-image input tuples, one pixel_values tensor per image
"""
# Load image from user-specified path (URL or local file)
# fall back to the default image URL if no image is provided.
image_path = self.control_args.image_path or config.img_url
if not self.control_args.image_path:
warnings.warn(
f"No image path/URL provided, using default image URL: {config.img_url}",
UserWarning,
stacklevel=1,
)
image = load_image(image_path)

images = [load_image(image_path) for image_path in files_path]

# Process image with text prompt using HuggingFace processor
# Some HF processors (e.g. InternVL3) require the text argument; omitting it causes processing to fail
processor = AutoProcessor.from_pretrained(self.repo_id)
pixel_values = processor(
text=prompt,
images=[image],
images=images,
return_tensors="pt",
crop_to_patches=False,
size={
@@ -80,19 +72,26 @@ def _build_vision_dataset(self, config: VisionModalityConfig, prompt: str):
},
).pixel_values

# save image file for runtime evaluation
pixel_values.detach().numpy().tofile(
f"{self.artifact}/{VISION_ENCODER_INPUT_FILENAME}.raw"
assert pixel_values.dim() in (4, 5), (
f"Unsupported pixel_values dim={pixel_values.dim()}); "
f"expected 5D (1,N,C,H,W) or 4D (N,C,H,W)."
)
return (pixel_values,)

# HTP Prepare fails when pixel_values is 5D, so we squeeze the batch dimension here.
if pixel_values.dim() == 5:
pixel_values = pixel_values.squeeze(0) # (N, C, H, W)

# Return one single-image input tuple per image for runtime evaluation
return [(pixel_values[i][None, ...],) for i in range(len(pixel_values))]

def _build_dataset_for_encoder(
self,
config: MultiModalityConfig,
prompt: str,
files_path: List[str],
) -> Optional[tuple]:
if issubclass(config, VisionModalityConfig):
return self._build_vision_dataset(config, prompt)
return self._build_vision_dataset(config, prompt, files_path)
else:
# Audio and text encoder dataset building are not yet implemented
# TODO: Add support for AudioModalityConfig and TextModalityConfig
@@ -106,22 +105,33 @@ def prepare_calibration_dataset(
prompts: List[str],
chat_template: Callable,
):
calibration_data = {
AUDIO_ENCODER: [],
TEXT_ENCODER: [],
VISION_ENCODER: [],
TEXT_EMBEDDING: [],
TEXT_DECODER: [],
# 1. Initialize data
# Each modality maps to a list indexed by turn.
# Each turn entry holds that turn's calibration inputs (currently one user prompt per turn).
calibration_data: Dict[str, List[List]] = {
# Encoders / embeddings: initialize an empty turn list for each prompt.
AUDIO_ENCODER: [[] for _ in range(len(prompts))],
TEXT_ENCODER: [[] for _ in range(len(prompts))],
VISION_ENCODER: [[] for _ in range(len(prompts))],
TOK_EMBEDDING: [[] for _ in range(len(prompts))],
# Decoder targets: one string per prompt.
TEXT_DECODER: ["" for _ in range(len(prompts))],
}

# 2. Prepare messages for multi-turn conversation
messages = self.tokenizer_wrapper.prepare_messages(prompts)

# 3. build dataset by modality
is_multimodal = any(
[
hasattr(self.config, AUDIO_ENCODER),
hasattr(self.config, VISION_ENCODER),
]
)
for prompt in prompts:
# Apply chat template formatting if available (for instruction-tuned/reasoning models)
for turn_idx, message in enumerate(messages):
prompt = message["text"]

# 3.1. Apply chat template formatting if available (for instruction-tuned/reasoning models)
prompt = (
self.tokenizer_wrapper.apply_prompt_template(
chat_template, prompt, self.control_args.system_prompt
@@ -130,23 +140,26 @@
else prompt
)

# Build calibration datasets for each available encoder modality
# 3.2 Build calibration datasets for each available encoder modality
for modality in [AUDIO_ENCODER, TEXT_ENCODER, VISION_ENCODER]:
if hasattr(self.config, modality):
data = self._build_dataset_for_encoder(
getattr(self.config, modality),
prompt,
)
calibration_data[modality].append(data)

# Expand multimodal tokens in prompt for decoder
if not hasattr(self.config, modality) or not message["files_path"]:
continue

data = self._build_dataset_for_encoder(
getattr(self.config, modality),
prompt,
message["files_path"],
)
calibration_data[modality][turn_idx] = data

# 3.3. Expand multimodal tokens in prompt for decoder
prompt = (
self.tokenizer_wrapper.prepare_multimodal_prompt(prompt)
if is_multimodal
else prompt
)

# Add prompt to decoder calibration data
calibration_data[TEXT_DECODER].append(prompt)
calibration_data[TEXT_DECODER][turn_idx] = prompt

return calibration_data
13 changes: 7 additions & 6 deletions examples/qualcomm/oss_scripts/llama/decoder_constants.py
@@ -12,20 +12,21 @@
TASKS_EVAL = "tasks_eval"
SQNR_EVAL = "sqnr_eval"

# filenames for vision model
VISION_ENCODER_INPUT_FILENAME = "vision_encoder_input"


# Component identifiers
AUDIO_ENCODER = "audio_encoder"
VISION_ENCODER = "vision_encoder"
TEXT_ENCODER = "text_encoder"
TEXT_EMBEDDING = "text_embedding"
TOK_EMBEDDING = "tok_embedding"
TEXT_DECODER = "text_decoder"
ATTENTION_SINK_EVICTOR = "attention_sink_evictor"

# Mapping of input flags for the runner
MODALITY_INPUT_FLAG_MAP = {
VISION_ENCODER: "image_path",
}

# Token embedding graph names
TEXT_EMBEDDING_GRAPH_NAMES = [
TOK_EMBEDDING_GRAPH_NAMES = [
"tok_embedding_kv_forward",
"tok_embedding_prefill_forward",
]