14 changes: 7 additions & 7 deletions backends/qualcomm/tests/test_qnn_delegate.py
@@ -6661,7 +6661,7 @@ class MLLMSpecs:
sm8650_token_rate: float
sm8750_token_rate: float
encoder_pte_size: float
text_embedding_pte_size: float
tok_embedding_pte_size: float
decoder_pte_size: float

@dataclass(frozen=True)
@@ -6677,7 +6677,7 @@ def setUp(self):
sm8650_token_rate=50,
sm8750_token_rate=55,
encoder_pte_size=110_000_000, # 110MB
text_embedding_pte_size=100_000_000, # 100MB
tok_embedding_pte_size=100_000_000, # 100MB
decoder_pte_size=400_000_000, # 400MB
image_path="https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg", # New York Bay
golden_image_feature="city",
@@ -6687,7 +6687,7 @@
sm8650_token_rate=11,
sm8750_token_rate=13,
encoder_pte_size=425_000_000, # 425MB
text_embedding_pte_size=300_000_000, # 300MB
tok_embedding_pte_size=300_000_000, # 300MB
decoder_pte_size=550_000_000, # 550 MB
image_path="http://images.cocodataset.org/val2017/000000039769.jpg", # Two cats lying on a blanket
golden_image_feature="cats",
@@ -6759,16 +6759,16 @@ def test_static_vlm(self):
print(f"Answer: {model_out}")
if not self.enable_x86_64:
encoder_pte_size = msg["encoder_pte_size"]
text_embedding_pte_size = msg["text_embedding_pte_size"]
tok_embedding_pte_size = msg["tok_embedding_pte_size"]
decoder_pte_size = msg["pte_size"]
self.assertLessEqual(encoder_pte_size, vlm_specs.encoder_pte_size)
self.assertLessEqual(
text_embedding_pte_size, vlm_specs.text_embedding_pte_size
tok_embedding_pte_size, vlm_specs.tok_embedding_pte_size
)
self.assertLessEqual(decoder_pte_size, vlm_specs.decoder_pte_size)
print(f"Encoder PTE Size: {encoder_pte_size} bytes")
print(f"Text Embedding PTE Size: {text_embedding_pte_size} bytes")
print(f"Decoder PTE Size: {decoder_pte_size} bytes")
print(f"Token Embedding PTE Size: {tok_embedding_pte_size} bytes")
print(f"Text Decoder PTE Size: {decoder_pte_size} bytes")

attr_name = f"{self.model.lower()}_token_rate"
if (
11 changes: 7 additions & 4 deletions examples/qualcomm/oss_scripts/llama/CMakeLists.txt
@@ -94,12 +94,15 @@ list(
${CMAKE_CURRENT_LIST_DIR}/qnn_multimodal_runner.cpp
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_runner.cpp
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_runner.h
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_embedding_merger.cpp
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_embedding_merger.h
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/utils.h
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/encoder.cpp
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/encoder.h
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/embedding_runner.cpp
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/embedding_runner.h
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/embedding_processor.cpp
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/embedding_processor.h
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/tok_embedding_runner.cpp
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/tok_embedding_runner.h
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/tok_embedding_processor.cpp
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/tok_embedding_processor.h
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_prompt_processor.cpp
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_prompt_processor.h
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_token_generator.cpp
31 changes: 31 additions & 0 deletions examples/qualcomm/oss_scripts/llama/README.md
@@ -308,6 +308,37 @@ If you have already compiled a VLM model, you can run inference with pre-generat
python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --decoder_model smolvlm_500m_instruct --model_mode kv --max_seq_len 1024 --prompt "Can you describe this image?" --image_path "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" --pre_gen_pte ${FOLDER_TO_PRE_GEN_PTE}
```

### Multi-Turn Conversation with VLM

The framework supports multi-turn conversations with VLMs, allowing you to conduct dialogues that can involve multiple images.

- **Multi-Turn Prompts**: To engage in a conversation, provide multiple prompts sequentially using the `--prompt` argument. Each string will be treated as a separate turn.
- **Multiple Images**: You can supply multiple images (from URLs or local paths) using the `--image_path` argument.
- **Flexible Image Placement**: Use the `<image>` token within your prompt to specify exactly where each image's embeddings should be placed. The images provided via `--image_path` will replace the `<image>` tokens in the order they appear.

**Example**:

In this example, the first turn compares two images, the second turn asks a follow-up question about the first image, and the third turn asks for a caption for the third image.

```bash
# Define image URLs and prompts for a 3-turn conversation
IMAGE1_URL="https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
IMAGE2_URL="http://images.cocodataset.org/val2017/000000039769.jpg"
IMAGE3_URL="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"

PROMPT1="<image><image>Compare these images above and list the differences."
PROMPT2="Answer the question: What's the main object in the first image?"
PROMPT3="<image>Caption this image."

# Execute the multi-turn conversation
python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --decoder_model smolvlm_500m_instruct --model_mode kv --max_seq_len 2048 --prompt "$PROMPT1" "$PROMPT2" "$PROMPT3" --image_path "$IMAGE1_URL" "$IMAGE2_URL" "$IMAGE3_URL"
```

**How it works:**
- **Turn 1**: The prompt `"<image><image>Compare these images above and list the differences."` uses the first two images (`$IMAGE1_URL`, `$IMAGE2_URL`).
- **Turn 2**: The prompt `"Answer the question: What's the main object in the first image?"` is a text-only follow-up. The conversation context is maintained from the previous turn.
- **Turn 3**: The prompt `"<image>Caption this image."` uses the third image (`$IMAGE3_URL`).
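
For reference, the sketch below is a minimal illustration (not part of the repository; the helper name and return shape are assumptions) of how prompts containing `<image>` placeholders can be paired with the supplied images in order. It loosely mirrors the per-turn `text` / `files_path` structure consumed by the calibration code in `dataset.py`.

```python
# Hypothetical sketch: pair <image> placeholders with images in the order
# they are passed on the command line. Not part of llama.py.
from typing import Dict, List


def assign_images_to_turns(prompts: List[str], image_paths: List[str]) -> List[Dict]:
    """Return one {"text", "files_path"} entry per conversation turn."""
    turns, cursor = [], 0
    for prompt in prompts:
        n_placeholders = prompt.count("<image>")
        files = image_paths[cursor : cursor + n_placeholders]
        if len(files) != n_placeholders:
            raise ValueError("Fewer images supplied than <image> tokens in the prompts.")
        cursor += n_placeholders
        turns.append({"text": prompt, "files_path": files})
    return turns


turns = assign_images_to_turns(
    [
        "<image><image>Compare these images above and list the differences.",
        "Answer the question: What's the main object in the first image?",
        "<image>Caption this image.",
    ],
    ["statue_of_liberty.jpg", "two_cats.jpg", "bee.jpg"],
)
# Turn 1 consumes the first two images, turn 2 is text-only, turn 3 consumes the third.
```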

### VLM Processing Details

The VLM inference pipeline consists of:
95 changes: 54 additions & 41 deletions examples/qualcomm/oss_scripts/llama/dataset.py
@@ -5,25 +5,22 @@
# LICENSE file in the root directory of this source tree.

import argparse
import warnings
from typing import Callable, List, Optional
from typing import Callable, Dict, List, Optional

from executorch.examples.qualcomm.oss_scripts.llama import LLMModelConfig
from executorch.examples.qualcomm.oss_scripts.llama.decoder_constants import (
AUDIO_ENCODER,
TEXT_DECODER,
TEXT_EMBEDDING,
TEXT_ENCODER,
TOK_EMBEDDING,
VISION_ENCODER,
VISION_ENCODER_INPUT_FILENAME,
)

from executorch.examples.qualcomm.oss_scripts.llama.encoder.encoder_config import (
MultiModalityConfig,
VisionModalityConfig,
)
from executorch.examples.qualcomm.oss_scripts.llama.tokenizer import TokenizerWrapper

from transformers import AutoProcessor
from transformers.image_utils import load_image

@@ -43,35 +40,30 @@ def __init__(
self.artifact = control_args.artifact
self.repo_id = config.repo_id

def _build_vision_dataset(self, config: VisionModalityConfig, prompt: str):
def _build_vision_dataset(
self, config: VisionModalityConfig, prompt: str, files_path: List[str]
):
"""
This processes images using the HuggingFace processor and returns
the processed pixel values for runtime evaluation.

Args:
config (VisionModalityConfig): containing image URL and resize parameters
prompt (str): Text prompt to be processed alongside the image
prompt (str): Text prompt
files_path (List[str]): List of file paths for images. Each path can be either a URL or a local file path.

Returns:
List of single-image input tuples, one pixel_values tensor per image
"""
# Load image from user-specified path (URL or local file)
# fall back to the default image URL if no image is provided.
image_path = self.control_args.image_path or config.img_url
if not self.control_args.image_path:
warnings.warn(
f"No image path/URL provided, using default image URL: {config.img_url}",
UserWarning,
stacklevel=1,
)
image = load_image(image_path)

images = [load_image(image_path) for image_path in files_path]

# Process image with text prompt using HuggingFace processor
# Some HF processors (e.g. InternVL3) require the text argument; omitting it causes processing to fail
processor = AutoProcessor.from_pretrained(self.repo_id)
pixel_values = processor(
text=prompt,
images=[image],
images=images,
return_tensors="pt",
crop_to_patches=False,
size={
@@ -80,19 +72,26 @@ def _build_vision_dataset(self, config: VisionModalityConfig, prompt: str):
},
).pixel_values

# save image file for runtime evaluation
pixel_values.detach().numpy().tofile(
f"{self.artifact}/{VISION_ENCODER_INPUT_FILENAME}.raw"
assert pixel_values.dim() in (4, 5), (
f"Unsupported pixel_values dim={pixel_values.dim()}); "
f"expected 5D (1,N,C,H,W) or 4D (N,C,H,W)."
)
return (pixel_values,)

# HTP Prepare fails when pixel_values is 5D, so we squeeze the batch dimension here.
if pixel_values.dim() == 5:
pixel_values = pixel_values.squeeze(0) # (N, C, H, W)

# Return one single-image input tuple per image for runtime evaluation
return [(pixel_values[i][None, ...],) for i in range(len(pixel_values))]

def _build_dataset_for_encoder(
self,
config: MultiModalityConfig,
prompt: str,
files_path: List[str],
) -> Optional[tuple]:
if issubclass(config, VisionModalityConfig):
return self._build_vision_dataset(config, prompt)
return self._build_vision_dataset(config, prompt, files_path)
else:
# Audio and text encoder dataset building are not yet implemented
# TODO: Add support for AudioModalityConfig and TextModalityConfig
@@ -106,22 +105,33 @@ def prepare_calibration_dataset(
prompts: List[str],
chat_template: Callable,
):
calibration_data = {
AUDIO_ENCODER: [],
TEXT_ENCODER: [],
VISION_ENCODER: [],
TEXT_EMBEDDING: [],
TEXT_DECODER: [],
# 1. Initialize data
# Each modality maps to a list indexed by turn.
# Each turn entry holds that turn's calibration inputs (currently one user prompt per turn).
calibration_data: Dict[str, List[List]] = {
# Encoders / embeddings: initialize an empty turn list for each prompt.
AUDIO_ENCODER: [[] for _ in range(len(prompts))],
TEXT_ENCODER: [[] for _ in range(len(prompts))],
VISION_ENCODER: [[] for _ in range(len(prompts))],
TOK_EMBEDDING: [[] for _ in range(len(prompts))],
# Decoder targets: one string per prompt.
TEXT_DECODER: ["" for _ in range(len(prompts))],
}

# 2. Prepare messages for multi-turn conversation
messages = self.tokenizer_wrapper.prepare_messages(prompts)

# 3. build dataset by modality
is_multimodal = any(
[
hasattr(self.config, AUDIO_ENCODER),
hasattr(self.config, VISION_ENCODER),
]
)
for prompt in prompts:
# Apply chat template formatting if available (for instruction-tuned/reasoning models)
for turn_idx, message in enumerate(messages):
prompt = message["text"]

# 3.1. Apply chat template formatting if available (for instruction-tuned/reasoning models)
prompt = (
self.tokenizer_wrapper.apply_prompt_template(
chat_template, prompt, self.control_args.system_prompt
@@ -130,23 +140,26 @@
else prompt
)

# Build calibration datasets for each available encoder modality
# 3.2 Build calibration datasets for each available encoder modality
for modality in [AUDIO_ENCODER, TEXT_ENCODER, VISION_ENCODER]:
if hasattr(self.config, modality):
data = self._build_dataset_for_encoder(
getattr(self.config, modality),
prompt,
)
calibration_data[modality].append(data)

# Expand multimodal tokens in prompt for decoder
if not hasattr(self.config, modality) or not message["files_path"]:
continue

data = self._build_dataset_for_encoder(
getattr(self.config, modality),
prompt,
message["files_path"],
)
calibration_data[modality][turn_idx] = data

# 3.3. Expand multimodal tokens in prompt for decoder
prompt = (
self.tokenizer_wrapper.prepare_multimodal_prompt(prompt)
if is_multimodal
else prompt
)

# Add prompt to decoder calibration data
calibration_data[TEXT_DECODER].append(prompt)
calibration_data[TEXT_DECODER][turn_idx] = prompt

return calibration_data
13 changes: 7 additions & 6 deletions examples/qualcomm/oss_scripts/llama/decoder_constants.py
@@ -12,20 +12,21 @@
TASKS_EVAL = "tasks_eval"
SQNR_EVAL = "sqnr_eval"

# filenames for vision model
VISION_ENCODER_INPUT_FILENAME = "vision_encoder_input"


# Component identifiers
AUDIO_ENCODER = "audio_encoder"
VISION_ENCODER = "vision_encoder"
TEXT_ENCODER = "text_encoder"
TEXT_EMBEDDING = "text_embedding"
TOK_EMBEDDING = "tok_embedding"
TEXT_DECODER = "text_decoder"
ATTENTION_SINK_EVICTOR = "attention_sink_evictor"

# Mapping of input flags for the runner
MODALITY_INPUT_FLAG_MAP = {
VISION_ENCODER: "image_path",
}

# Token embedding graph names
TEXT_EMBEDDING_GRAPH_NAMES = [
TOK_EMBEDDING_GRAPH_NAMES = [
"tok_embedding_kv_forward",
"tok_embedding_prefill_forward",
]