 from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
                                    ImageSize, MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo, PromptReplacement)
+                                        BaseProcessingInfo, ProcessingCache,
+                                        PromptReplacement)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs.deepseek_vl2 import (DeepseekVLV2Config,
@@ -138,18 +139,24 @@ def get_hf_processor(self, **kwargs: object):
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None}

-    def get_num_image_tokens(self, *, image_width: int,
-                             image_height: int) -> int:
+    def get_num_image_tokens(self,
+                             *,
+                             image_width: int,
+                             image_height: int,
+                             cropping: bool = True) -> int:
         hf_processor = self.get_hf_processor()
         image_size = hf_processor.image_size
         patch_size = hf_processor.patch_size
         downsample_ratio = hf_processor.downsample_ratio

-        best_width, best_height = hf_processor.select_best_resolution(
-            (image_width, image_height))
+        if cropping:
+            best_width, best_height = hf_processor.select_best_resolution(
+                (image_width, image_height))
+            num_width_tiles, num_height_tiles = (best_width // image_size,
+                                                 best_height // image_size)
+        else:
+            num_width_tiles = num_height_tiles = 1

-        num_width_tiles, num_height_tiles = (best_width // image_size,
-                                             best_height // image_size)
         h = w = math.ceil((image_size // patch_size) / downsample_ratio)

         global_views_tokens = h * (w + 1)
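For intuition, the arithmetic above works out as follows. The config values in this sketch (image_size=384, patch_size=16, downsample_ratio=2) are hypothetical stand-ins for whatever the HF processor actually reports, and the per-row `+ 1` is presumably a separator token:

    import math

    # Hypothetical processor config, not taken from this diff:
    image_size, patch_size, downsample_ratio = 384, 16, 2
    h = w = math.ceil((image_size // patch_size) / downsample_ratio)  # ceil(24 / 2) = 12

    # cropping=False: the image contributes only a single global view.
    global_views_tokens = h * (w + 1)  # 12 * 13 = 156

    # cropping=True: tile counts come from the selected best resolution,
    # e.g. a 768x384 best fit gives num_width_tiles, num_height_tiles = 2, 1,
    # which the (elided) remainder of the function uses for local-view tokens.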
@@ -169,10 +176,12 @@ def get_mm_max_tokens_per_item(
         seq_len: int,
         mm_counts: Mapping[str, int],
     ) -> Mapping[str, int]:
+        num_images = mm_counts.get("image", 0)
         max_image_size = self.get_image_size_with_most_features()
         max_image_tokens = self.get_num_image_tokens(
             image_height=max_image_size.height,
-            image_width=max_image_size.width)
+            image_width=max_image_size.width,
+            cropping=num_images <= 2)

         return {"image": max_image_tokens}

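Because `cropping` switches off for prompts with more than two images, a given image's token count now depends on what else is in the prompt. A hedged sketch of that coupling, assuming `info` is a `DeepseekVL2ProcessingInfo` instance:

    # The same image typically yields more tokens when cropped into tiles
    # (alone or with one other image) than when only the global view is kept
    # (three or more images in the prompt).
    tokens_when_cropped = info.get_num_image_tokens(
        image_width=1024, image_height=768, cropping=True)
    tokens_global_only = info.get_num_image_tokens(
        image_width=1024, image_height=768, cropping=False)
    # A per-item processing cache keyed on the image alone cannot distinguish
    # these two cases, which motivates the cache opt-out added below.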
@@ -207,6 +216,30 @@ def get_dummy_processor_inputs(
 class DeepseekVL2MultiModalProcessor(
         BaseMultiModalProcessor[DeepseekVL2ProcessingInfo]):

+    def __init__(
+        self,
+        info: DeepseekVL2ProcessingInfo,
+        dummy_inputs: "BaseDummyInputsBuilder[DeepseekVL2ProcessingInfo]",
+        *,
+        cache: Optional[ProcessingCache] = None,
+        enable_sanity_checks: bool = True) -> None:
+        super().__init__(
+            info,
+            dummy_inputs,
+            cache=cache,
+            enable_sanity_checks=enable_sanity_checks,
+        )
+
+        mm_limit = self.info.ctx.model_config.multimodal_config.limit_per_prompt
+        if self.cache is not None and mm_limit["image"] > 2:
+            # The processor output depends on the number of images passed,
+            # making it incompatible with the processing cache, which is
+            # supposed to be invariant to how many images are passed per prompt.
+            self.cache = None
+            logger.warning_once(
+                f"{type(self).__name__} does not support the processing cache "
+                "with an image limit larger than 2.")
+
     def _call_hf_processor(
         self,
         prompt: str,
@@ -271,6 +304,7 @@ def get_replacement_deepseek_vl2(item_idx: int):
             num_image_tokens = self.info.get_num_image_tokens(
                 image_width=image_size.width,
                 image_height=image_size.height,
+                cropping=len(images) <= 2,
             )
             return [image_token_id] * num_image_tokens

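At the user level, the opt-out above is driven by the configured per-prompt image limit. A minimal sketch, assuming vLLM's standard `limit_mm_per_prompt` engine argument and an illustrative model id:

    from vllm import LLM

    llm = LLM(
        model="deepseek-ai/deepseek-vl2-tiny",  # illustrative model id
        limit_mm_per_prompt={"image": 3},  # limit > 2: the processor drops its
    )                                      # cache and logs the warning above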