From f78adf9bc14175a9ccded1e082fae3dd861c26a3 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Tue, 25 Feb 2025 17:23:24 +0800 Subject: [PATCH 1/3] fix num images > 2 Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/deepseek_vl2.py | 23 +++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index c58b65d49348..2ada05e33dcc 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -138,18 +138,24 @@ def get_hf_processor(self, **kwargs: object): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} - def get_num_image_tokens(self, *, image_width: int, - image_height: int) -> int: + def get_num_image_tokens(self, + *, + image_width: int, + image_height: int, + cropping: bool = True) -> int: hf_processor = self.get_hf_processor() image_size = hf_processor.image_size patch_size = hf_processor.patch_size downsample_ratio = hf_processor.downsample_ratio - best_width, best_height = hf_processor.select_best_resolution( - (image_width, image_height)) + if cropping: + best_width, best_height = hf_processor.select_best_resolution( + (image_width, image_height)) + num_width_tiles, num_height_tiles = (best_width // image_size, + best_height // image_size) + else: + num_width_tiles = num_height_tiles = 1 - num_width_tiles, num_height_tiles = (best_width // image_size, - best_height // image_size) h = w = math.ceil((image_size // patch_size) / downsample_ratio) global_views_tokens = h * (w + 1) @@ -169,10 +175,12 @@ def get_mm_max_tokens_per_item( seq_len: int, mm_counts: Mapping[str, int], ) -> Mapping[str, int]: + num_images = mm_counts.get("image", 0) max_image_size = self.get_image_size_with_most_features() max_image_tokens = self.get_num_image_tokens( image_height=max_image_size.height, - image_width=max_image_size.width) + image_width=max_image_size.width, + cropping=num_images <= 2) return {"image": max_image_tokens} @@ -271,6 +279,7 @@ def get_replacement_deepseek_vl2(item_idx: int): num_image_tokens = self.info.get_num_image_tokens( image_width=image_size.width, image_height=image_size.height, + cropping=len(images) <= 2, ) return [image_token_id] * num_image_tokens From 28b304734d6d1721d98c47836df61519ea32ccc4 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Tue, 25 Feb 2025 19:15:25 +0800 Subject: [PATCH 2/3] disable cache with num images > 2 Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/deepseek_vl2.py | 27 +++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index 2ada05e33dcc..ea217e244404 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -25,7 +25,8 @@ from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, - BaseProcessingInfo, PromptReplacement) + BaseProcessingInfo, ProcessingCache, + PromptReplacement) from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs.deepseek_vl2 import (DeepseekVLV2Config, @@ -215,6 +216,30 @@ def get_dummy_processor_inputs( class DeepseekVL2MultiModalProcessor( 
BaseMultiModalProcessor[DeepseekVL2ProcessingInfo]): + def __init__( + self, + info: DeepseekVL2ProcessingInfo, + dummy_inputs: "BaseDummyInputsBuilder[DeepseekVL2ProcessingInfo]", + *, + cache: Optional[ProcessingCache] = None, + enable_sanity_checks: bool = True) -> None: + super().__init__( + info, + dummy_inputs, + cache=cache, + enable_sanity_checks=enable_sanity_checks, + ) + + mm_limit = self.info.ctx.model_config.multimodal_config.limit_per_prompt + if self.cache is not None and mm_limit["image"] > 2: + # The processor output depends on the number of images passed, + # making it incompatible with processing cache which is supposed + # to be invariant of how many images are passed per prompt + self.cache = None + logger.warning_once( + f"{type(self).__name__} does not support processing cache with " + "image limit larger than 2.") + def _call_hf_processor( self, prompt: str, From 451ccf29363011ce50201bc70344696826e32bb9 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Tue, 25 Feb 2025 19:34:12 +0800 Subject: [PATCH 3/3] modify h2ovl Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/h2ovl.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py index 01b721fa79e1..bab9c256b9aa 100644 --- a/vllm/model_executor/models/h2ovl.py +++ b/vllm/model_executor/models/h2ovl.py @@ -477,13 +477,15 @@ def __init__(self, enable_sanity_checks=enable_sanity_checks, ) - if self.cache is not None: + mm_limit = self.info.ctx.model_config.multimodal_config.limit_per_prompt + if self.cache is not None and mm_limit["image"] >= 2: # The processor output depends on the number of images passed, # making it incompatible with processing cache which is supposed # to be invariant of how many images are passed per prompt self.cache = None logger.warning_once( - f"{type(self).__name__} does not support processing cache.") + f"{type(self).__name__} does not support processing cache with " + "multi-image support enabled.") def _get_prompt_replacements( self,
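
Reviewer note, a minimal sketch for context (not part of the patches):

The series hinges on the fact that the DeepSeek-VL2 processor only applies tile cropping when a prompt carries at most two images, as encoded by `cropping=num_images <= 2` in patch 1. Past that threshold each image contributes a fixed token count regardless of resolution, so the per-image processor output depends on how many images share the prompt, which is exactly the invariant the per-item processing cache relies on. The standalone sketch below mirrors the shape of the patched get_num_image_tokens. The image_size/patch_size/downsample_ratio values and the local_views_tokens formula are illustrative assumptions (only `global_views_tokens = h * (w + 1)` is visible in the hunks), and rounding up to whole tiles stands in for the real select_best_resolution:

import math

def num_image_tokens_sketch(image_width: int,
                            image_height: int,
                            *,
                            cropping: bool = True) -> int:
    # Illustrative processor settings; assumed, not read from the patches.
    image_size = 384
    patch_size = 16
    downsample_ratio = 2

    if cropping:
        # <= 2 images per prompt: tile at the best-fit resolution, so the
        # token count varies with the input resolution. Rounding up to
        # whole tiles stands in for select_best_resolution here.
        best_width = math.ceil(image_width / image_size) * image_size
        best_height = math.ceil(image_height / image_size) * image_size
        num_width_tiles = best_width // image_size
        num_height_tiles = best_height // image_size
    else:
        # > 2 images per prompt: no cropping, only the single global view,
        # so every image yields the same count regardless of resolution.
        num_width_tiles = num_height_tiles = 1

    h = w = math.ceil((image_size // patch_size) / downsample_ratio)
    global_views_tokens = h * (w + 1)
    # Assumed shape of the local-views term (not shown in the hunks).
    local_views_tokens = (num_height_tiles * h) * (num_width_tiles * w + 1)
    return global_views_tokens + local_views_tokens

print(num_image_tokens_sketch(1024, 768, cropping=True))   # resolution-dependent
print(num_image_tokens_sketch(1024, 768, cropping=False))  # fixed per image

Because one and the same image can now produce different token counts depending on how many images accompany it, a ProcessingCache keyed per item could return stale results. Patches 2 and 3 therefore drop the cache whenever the configured image limit (for example `limit_mm_per_prompt={"image": 4}` on the engine, assuming the usual argument name) allows enough images per prompt to reach the non-cropping path: strictly more than 2 for deepseek_vl2, and 2 or more for h2ovl.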