299 changes: 187 additions & 112 deletions examples/offline_inference/vision_language.py

Large diffs are not rendered by default.

4 changes: 3 additions & 1 deletion vllm/model_executor/models/aria.py
@@ -602,7 +602,9 @@ def _process_image_input(

         return self.multi_modal_projector(image_outputs, image_attn_mask)

-    def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
+    def get_multimodal_embeddings(
+        self, **kwargs
+    ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return None
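Across all of the model files below the change is the same: the loose `Optional[NestedTensors]` annotation is replaced by an explicit union of the container types these methods actually return. As a rough sketch of the contract (the `MultiModalEmbeddings` alias name is purely illustrative here, not something this diff introduces):

from typing import Union

import torch

# Hypothetical alias for the union spelled out in each new signature.
MultiModalEmbeddings = Union[list[torch.Tensor], torch.Tensor,
                             tuple[torch.Tensor, ...]]

def example_multimodal_embeddings() -> MultiModalEmbeddings:
    # One embedding tensor per image, each of shape
    # (num_image_tokens, hidden_size); a list is one valid container.
    return [torch.zeros(16, 4096), torch.zeros(32, 4096)]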
4 changes: 3 additions & 1 deletion vllm/model_executor/models/blip2.py
@@ -628,7 +628,9 @@ def _process_image_input(self,

         return self.language_projection(query_output)

-    def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
+    def get_multimodal_embeddings(
+        self, **kwargs
+    ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return None
4 changes: 3 additions & 1 deletion vllm/model_executor/models/chameleon.py
@@ -986,7 +986,9 @@ def _parse_and_validate_image_input(
             data=self._validate_pixel_values(pixel_values),
         )

-    def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
+    def get_multimodal_embeddings(
+        self, **kwargs
+    ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return None
4 changes: 3 additions & 1 deletion vllm/model_executor/models/deepseek_vl2.py
@@ -606,7 +606,9 @@ def _process_image_input(
         return self._pixel_values_to_embedding(
             pixel_values=pixel_values, images_spatial_crop=images_spatial_crop)

-    def get_multimodal_embeddings(self, **kwargs: object) -> torch.Tensor:
+    def get_multimodal_embeddings(
+        self, **kwargs: object
+    ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return None
4 changes: 3 additions & 1 deletion vllm/model_executor/models/florence2.py
@@ -1037,7 +1037,9 @@ def _process_image_input(
         pixel_values = image_input["data"]
         return self._encode_image(pixel_values)

-    def get_multimodal_embeddings(self, **kwargs: object) -> torch.Tensor:
+    def get_multimodal_embeddings(
+        self, **kwargs: object
+    ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return None
6 changes: 4 additions & 2 deletions vllm/model_executor/models/fuyu.py
@@ -18,7 +18,7 @@
 """ PyTorch Fuyu model."""
 import math
 from collections.abc import Iterable, Mapping, Sequence
-from typing import List, Literal, Optional, Set, Tuple, TypedDict
+from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union

 import torch
 import torch.nn as nn
@@ -327,7 +327,9 @@ def _process_image_input(
             image_patches_flat)
         return vision_embeddings_flat.split(patches_per_image, dim=0)

-    def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
+    def get_multimodal_embeddings(
+        self, **kwargs
+    ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return None
4 changes: 3 additions & 1 deletion vllm/model_executor/models/glm4v.py
@@ -595,7 +595,9 @@ def _process_image_input(

         return self.transformer.vision(pixel_values)

-    def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
+    def get_multimodal_embeddings(
+        self, **kwargs
+    ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return None
4 changes: 3 additions & 1 deletion vllm/model_executor/models/idefics3.py
@@ -617,7 +617,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.logits_processor = LogitsProcessor(config.text_config.vocab_size)
         self.sampler = get_sampler()

-    def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
+    def get_multimodal_embeddings(
+        self, **kwargs
+    ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]:
         image_input = self.model._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return None
4 changes: 3 additions & 1 deletion vllm/model_executor/models/interfaces.py
@@ -36,7 +36,9 @@ class SupportsMultiModal(Protocol):
     MRO of your model class.
     """

-    def get_multimodal_embeddings(self, **kwargs) -> Optional[T]:
+    def get_multimodal_embeddings(
+        self, **kwargs
+    ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]:
         """
         Returns multimodal embeddings generated from multimodal kwargs
         to be merged with text embeddings.

Review thread on this change:
Member: In this particular case, we should change the typevar instead.
Collaborator (Author): Done, tested.
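The fix the reviewer asked for — parameterizing the protocol's typevar rather than inlining the union at each call site — might look like the following sketch (names and the exact bound are illustrative; the final code in this PR may differ):

from typing import Optional, Protocol, TypeVar, Union

import torch

# Bound the typevar to the containers models actually return, so the
# protocol stays generic while still constraining implementations.
T = TypeVar("T",
            bound=Union[list[torch.Tensor], torch.Tensor,
                        tuple[torch.Tensor, ...]])

class SupportsMultiModal(Protocol[T]):

    def get_multimodal_embeddings(self, **kwargs) -> Optional[T]:
        ...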
4 changes: 3 additions & 1 deletion vllm/model_executor/models/internvl.py
@@ -904,7 +904,9 @@ def _set_visual_token_mask(self, input_ids: torch.Tensor) -> None:
         else:
             self.visual_token_mask = None

-    def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
+    def get_multimodal_embeddings(
+        self, **kwargs
+    ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return None
4 changes: 3 additions & 1 deletion vllm/model_executor/models/llava.py
@@ -635,7 +635,9 @@ def _process_image_input(self,
         image_features = self._process_image_pixels(image_input)
         return self.multi_modal_projector(image_features)

-    def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
+    def get_multimodal_embeddings(
+        self, **kwargs
+    ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return None
4 changes: 3 additions & 1 deletion vllm/model_executor/models/llava_next.py
@@ -479,7 +479,9 @@ def _process_image_input(
             for i, patch_features_batch in enumerate(patch_embeddings)
         ]

-    def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
+    def get_multimodal_embeddings(
+        self, **kwargs
+    ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return None
4 changes: 3 additions & 1 deletion vllm/model_executor/models/llava_next_video.py
@@ -420,7 +420,9 @@ def _process_video_pixels(self, inputs: LlavaNextVideoPixelInputs):
         raise ValueError(
             f"Unsupported type of video input {type(video_pixels)}")

-    def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
+    def get_multimodal_embeddings(
+        self, **kwargs
+    ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]:
         video_input = self._parse_and_validate_video_input(**kwargs)
         if video_input is None:
             return None
9 changes: 6 additions & 3 deletions vllm/model_executor/models/molmo.py
@@ -50,7 +50,7 @@
                                         PromptInsertion, PromptUpdate)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors
-from vllm.utils import JSONTree, json_map_leaves
+from vllm.utils import JSONTree, flatten_2d_lists, json_map_leaves

 from .interfaces import (SupportsLoRA, SupportsMultiModal, SupportsPP,
                          SupportsQuant)
@@ -1576,21 +1576,24 @@ def _get_mm_embeds(

         return embeds_in_batch

-    def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
+    def get_multimodal_embeddings(
+        self, **kwargs
+    ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return None

         image_features = self._process_image_input(image_input)

-        return [
+        nested_embeds = [
             self._get_mm_embeds(*args) for args in zip(
                 image_features,
                 image_input["feat_is_patch"],
                 image_input["num_crops"],
                 image_input["embed_is_patch"],
             )
         ]
+        return flatten_2d_lists(nested_embeds)

     def get_input_embeddings(
         self,
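`_get_mm_embeds` returns a list of tensors per input, so the comprehension above yields a list of lists; the new `flatten_2d_lists` call collapses that one level of nesting so the method's return value matches the flat containers in the new signature. Assuming the utility behaves like a standard one-level flatten, it is equivalent to:

from typing import TypeVar

T = TypeVar("T")

def flatten_2d_lists(lists: list[list[T]]) -> list[T]:
    """Flatten a list of lists into a single flat list (one level only)."""
    return [item for sublist in lists for item in sublist]

# e.g. flatten_2d_lists([[t1, t2], [t3]]) == [t1, t2, t3]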
4 changes: 3 additions & 1 deletion vllm/model_executor/models/paligemma.py
@@ -263,7 +263,9 @@ def _process_image_input(

         return self.multi_modal_projector(image_features)

-    def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
+    def get_multimodal_embeddings(
+        self, **kwargs
+    ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return None
4 changes: 3 additions & 1 deletion vllm/model_executor/models/phi3v.py
@@ -648,7 +648,9 @@ def _process_image_input(

         return image_embeds

-    def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
+    def get_multimodal_embeddings(
+        self, **kwargs
+    ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return None
4 changes: 3 additions & 1 deletion vllm/model_executor/models/pixtral.py
@@ -220,7 +220,9 @@ def sampler(self):

         return get_sampler()

-    def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
+    def get_multimodal_embeddings(
+        self, **kwargs
+    ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]:
         image_input, image_tokens = self._parse_and_validate_image_input(
             **kwargs)
         if image_input is None:
4 changes: 3 additions & 1 deletion vllm/model_executor/models/qwen2_audio.py
@@ -356,7 +356,9 @@ def _process_audio_input(self,
         return torch.split(masked_audio_features,
                            audio_output_lengths.flatten().tolist())

-    def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
+    def get_multimodal_embeddings(
+        self, **kwargs
+    ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]:
         audio_input = self._parse_and_validate_audio_input(**kwargs)
         if audio_input is None:
             return None
4 changes: 3 additions & 1 deletion vllm/model_executor/models/qwen_vl.py
@@ -740,7 +740,9 @@ def _process_image_input(self,

         return self.transformer.visual(image_input["data"])

-    def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
+    def get_multimodal_embeddings(
+        self, **kwargs
+    ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return None
4 changes: 3 additions & 1 deletion vllm/model_executor/models/ultravox.py
@@ -476,7 +476,9 @@ def _process_audio_input(

         return result

-    def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
+    def get_multimodal_embeddings(
+        self, **kwargs
+    ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]:
         audio_input = self._parse_and_validate_audio_input(**kwargs)
         if audio_input is None:
             return None
4 changes: 3 additions & 1 deletion vllm/model_executor/models/whisper.py
@@ -692,7 +692,9 @@ def forward(
         )
         return decoder_outputs

-    def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
+    def get_multimodal_embeddings(
+        self, **kwargs
+    ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]:
         # TODO: This method does not obey the interface for SupportsMultiModal.
         # Refactor this once encoder/decoder support is implemented in V1.
         audio_input = self._parse_and_validate_audio_input(**kwargs)