Commit 94fa5ed

DarkLight1337 authored and garg-amit committed
[VLM] Use SequenceData.from_token_counts to create dummy data (vllm-project#8687)
Signed-off-by: Amit Garg <[email protected]>
1 parent 8ba628f commit 94fa5ed
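
The change is mechanical across the touched files: every dummy-data helper that previously assembled a flat token array by hand now delegates to the SequenceData.from_token_counts classmethod. Below is a minimal before/after sketch of that pattern, taken from the removed and added lines in the hunks further down; the concrete values are hypothetical, and the snippet assumes vllm.sequence still exports VLLM_TOKEN_ID_ARRAY_TYPE as it did before this commit.

    from array import array

    from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE, SequenceData

    # Hypothetical values, chosen only for illustration.
    seq_len, image_token_id = 32, 99
    image_feature_size, num_images = 8, 1

    # Before: expand and concatenate token arrays manually.
    token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE,
                      [image_token_id]) * image_feature_size * num_images
    token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE,
                       [0]) * (seq_len - image_feature_size * num_images)
    old_dummy = SequenceData(token_ids)

    # After: pass (token_id, count) pairs and let the classmethod expand them.
    new_dummy = SequenceData.from_token_counts(
        (image_token_id, image_feature_size * num_images),
        (0, seq_len - image_feature_size * num_images),
    )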

File tree

12 files changed, +73 -80 lines changed


vllm/inputs/registry.py

Lines changed: 1 addition & 1 deletion
@@ -125,7 +125,7 @@ def _default_dummy_data_factory(
         # Avoid circular import
         from vllm.sequence import SequenceData

-        dummy_seq_data = SequenceData.from_counts({0: seq_len})
+        dummy_seq_data = SequenceData.from_token_counts((0, seq_len))
         dummy_multi_modal_data = None

         return dummy_seq_data, dummy_multi_modal_data

vllm/model_executor/models/blip.py

Lines changed: 6 additions & 7 deletions
@@ -1,6 +1,5 @@
 """Minimal implementation of BlipVisionModel intended to be only used
 within a vision language model."""
-from array import array
 from typing import Optional, Union

 import torch
@@ -19,7 +18,7 @@
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.multimodal.utils import (cached_get_tokenizer,
                                    repeat_and_pad_placeholder_tokens)
-from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE, SequenceData
+from vllm.sequence import SequenceData

 try:
     from xformers import ops as xops
@@ -53,6 +52,7 @@ def get_max_blip_image_tokens(
 def dummy_seq_data_for_blip(
     hf_config: Union[BlipVisionConfig, Blip2VisionConfig],
     seq_len: int,
+    num_images: int,
     *,
     image_token_id: int,
     image_feature_size_override: Optional[int] = None,
@@ -62,11 +62,10 @@ def dummy_seq_data_for_blip(
     else:
         image_feature_size = image_feature_size_override

-    token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE,
-                      [image_token_id]) * image_feature_size
-    token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE,
-                       [0]) * (seq_len - image_feature_size)
-    return SequenceData(token_ids)
+    return SequenceData.from_token_counts(
+        (image_token_id, image_feature_size * num_images),
+        (0, seq_len - image_feature_size * num_images),
+    )


 def dummy_image_for_blip(

vllm/model_executor/models/blip2.py

Lines changed: 5 additions & 8 deletions
@@ -1,4 +1,3 @@
-from array import array
 from typing import (Iterable, List, Literal, Mapping, Optional, Tuple,
                     TypedDict, Union)

@@ -18,8 +17,7 @@
 from vllm.model_executor.models.opt import OPTModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors,
-                           SequenceData)
+from vllm.sequence import IntermediateTensors, SequenceData

 from .blip import (BlipVisionModel, dummy_image_for_blip,
                    get_max_blip_image_tokens)
@@ -429,11 +427,10 @@ def dummy_seq_data_for_blip2(
     else:
         image_feature_size = image_feature_size_override

-    token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE,
-                      [image_token_id]) * image_feature_size * num_images
-    token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE,
-                       [0]) * (seq_len - image_feature_size * num_images)
-    return SequenceData(token_ids)
+    return SequenceData.from_token_counts(
+        (image_token_id, image_feature_size * num_images),
+        (0, seq_len - image_feature_size * num_images),
+    )


 def dummy_data_for_blip2(ctx: InputContext, seq_len: int,

vllm/model_executor/models/chameleon.py

Lines changed: 5 additions & 8 deletions
@@ -1,4 +1,3 @@
-from array import array
 from functools import cached_property
 from typing import (Any, Dict, Iterable, List, Literal, Mapping, Optional,
                     Tuple, TypedDict)
@@ -32,8 +31,7 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.utils import (cached_get_tokenizer,
                                    repeat_and_pad_placeholder_tokens)
-from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors,
-                           SequenceData)
+from vllm.sequence import IntermediateTensors, SequenceData
 from vllm.utils import print_warning_once

 from .interfaces import SupportsMultiModal
@@ -72,11 +70,10 @@ def dummy_seq_data_for_chameleon(
     else:
         image_feature_size = image_feature_size_override

-    token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE,
-                      [image_token_id]) * image_feature_size * num_images
-    token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE,
-                       [0]) * (seq_len - image_feature_size * num_images)
-    return SequenceData(token_ids)
+    return SequenceData.from_token_counts(
+        (image_token_id, image_feature_size * num_images),
+        (0, seq_len - image_feature_size * num_images),
+    )


 def dummy_image_for_chameleon(

vllm/model_executor/models/clip.py

Lines changed: 5 additions & 7 deletions
@@ -1,6 +1,5 @@
 """Minimal implementation of CLIPVisionModel intended to be only used
 within a vision language model."""
-from array import array
 from typing import Iterable, List, Optional, Tuple, Union

 import torch
@@ -20,7 +19,7 @@
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.multimodal.utils import (cached_get_tokenizer,
                                    repeat_and_pad_placeholder_tokens)
-from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE, SequenceData
+from vllm.sequence import SequenceData

 try:
     from xformers import ops as xops
@@ -62,11 +61,10 @@ def dummy_seq_data_for_clip(
     else:
         image_feature_size = image_feature_size_override

-    token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE,
-                      [image_token_id]) * image_feature_size * num_images
-    token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE,
-                       [0]) * (seq_len - image_feature_size * num_images)
-    return SequenceData(token_ids)
+    return SequenceData.from_token_counts(
+        (image_token_id, image_feature_size * num_images),
+        (0, seq_len - image_feature_size * num_images),
+    )


 def dummy_image_for_clip(

vllm/model_executor/models/minicpmv.py

Lines changed: 2 additions & 5 deletions
@@ -23,7 +23,6 @@
 """Inference-only MiniCPM-V model compatible with HuggingFace weights."""
 import math
 import re
-from array import array
 from functools import partial
 from typing import (Any, Callable, Iterable, List, Mapping, Optional, Tuple,
                     TypedDict)
@@ -56,8 +55,7 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.image import cached_get_image_processor
 from vllm.multimodal.utils import cached_get_tokenizer
-from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors,
-                           SequenceData)
+from vllm.sequence import IntermediateTensors, SequenceData

 from .idefics2_vision_model import Idefics2VisionTransformer

@@ -259,8 +257,7 @@ def get_max_minicpmv_image_tokens(ctx: InputContext):


 def dummy_seq_data_for_minicpmv(seq_len: int, num_images: int):
-    token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, [0]) * seq_len
-    return SequenceData(token_ids)
+    return SequenceData.from_token_counts((0, seq_len))


 def dummy_image_for_minicpmv(hf_config: PretrainedConfig, num_images: int):

vllm/model_executor/models/pixtral.py

Lines changed: 5 additions & 9 deletions
@@ -1,4 +1,3 @@
-from array import array
 from dataclasses import dataclass, fields
 from itertools import tee
 from typing import Iterable, List, Mapping, Optional, Tuple, Union
@@ -24,8 +23,7 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.base import MultiModalInputs
 from vllm.multimodal.utils import cached_get_tokenizer
-from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors,
-                           SequenceData)
+from vllm.sequence import IntermediateTensors, SequenceData

 from .interfaces import SupportsMultiModal
 from .utils import init_vllm_registered_model
@@ -63,13 +61,11 @@ def dummy_data_for_pixtral(ctx: InputContext, seq_len: int,
     image_feature_size = (size**2) // (patch_size**2)

     num_image_tokens = image_feature_size * num_images
+    seq_data = SequenceData.from_token_counts(
+        (image_token_id, num_image_tokens),
+        (0, seq_len - num_image_tokens),
+    )

-    token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE,
-                      [image_token_id]) * num_image_tokens
-    token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE,
-                       [0]) * (seq_len - num_image_tokens)
-
-    seq_data = SequenceData(token_ids)
     mm_data = {"image": num_images * [image]}
     return seq_data, mm_data

vllm/model_executor/models/qwen.py

Lines changed: 5 additions & 5 deletions
@@ -7,7 +7,6 @@

 import math
 import re
-from array import array
 from functools import partial
 from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping,
                     Optional, Tuple, TypedDict, Union)
@@ -45,8 +44,7 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.base import MultiModalInputs
 from vllm.multimodal.utils import cached_get_tokenizer
-from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors,
-                           SequenceData)
+from vllm.sequence import IntermediateTensors, SequenceData
 from vllm.utils import is_list_of

 from .utils import flatten_bn, is_pp_missing_parameter, make_layers
@@ -819,7 +817,7 @@ def dummy_data_for_qwen(
     # The presence of a visual config indicates this is a multimodal model.
     # If we don't have it, the model is considered an LLM for warmup purposes.
     if not hasattr(hf_config, "visual"):
-        seq_data = SequenceData(array(VLLM_TOKEN_ID_ARRAY_TYPE, [0] * seq_len))
+        seq_data = SequenceData.from_token_counts((0, seq_len))
         mm_data = None
         return seq_data, mm_data

@@ -846,11 +844,13 @@ def dummy_data_for_qwen(
     if len(toks) < seq_len:
         toks += [0] * (seq_len - len(toks))

+    seq_data = SequenceData.from_seqs(toks)
+
     # Build the input images; width/height doesn't actually matter here since
     # the data will get resized and the # of tokens per image is constant
     image = Image.new("RGB", (224, 224), color=0)
     mm_data = {"image": image if num_images == 1 else [image] * num_images}
-    return SequenceData(array(VLLM_TOKEN_ID_ARRAY_TYPE, toks)), mm_data
+    return seq_data, mm_data


 @MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_qwen)
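
qwen.py is the one file that keeps an explicit token list: dummy_data_for_qwen pads its tokenizer output with zeros and then wraps it with SequenceData.from_seqs rather than from_token_counts. A small sketch of the difference between the two classmethods, inferred only from how this commit calls them; the token ids are hypothetical.

    from vllm.sequence import SequenceData

    # from_token_counts expands (token_id, count) pairs into a flat dummy prompt,
    # e.g. (7, 3) then (0, 2) yields the token sequence [7, 7, 7, 0, 0].
    counted = SequenceData.from_token_counts((7, 3), (0, 2))

    # from_seqs wraps an already-materialized token list, as dummy_data_for_qwen
    # does after padding its tokenizer-derived toks with zeros.
    toks = [7, 7, 7, 0, 0]  # hypothetical token ids
    explicit = SequenceData.from_seqs(toks)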

vllm/model_executor/models/qwen2_vl.py

Lines changed: 9 additions & 12 deletions
@@ -22,7 +22,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only Qwen2-VL model compatible with HuggingFace weights."""
-from array import array
 from functools import lru_cache, partial
 from typing import (Iterable, List, Mapping, Optional, Tuple, Type, TypedDict,
                     Union)
@@ -66,8 +65,7 @@
 from vllm.multimodal.base import MultiModalData
 from vllm.multimodal.image import cached_get_image_processor
 from vllm.platforms import current_platform
-from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors,
-                           SequenceData)
+from vllm.sequence import IntermediateTensors, SequenceData
 from vllm.transformers_utils.processor import get_processor

 logger = init_logger(__name__)
@@ -681,15 +679,14 @@ def dummy_data_for_qwen2_vl(
                 "--limit-mm-per-prompt.")

     hf_config = ctx.get_hf_config(Qwen2VLConfig)
-    token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE,
-                      [hf_config.vision_start_token_id])
-    token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE,
-                       [hf_config.image_token_id]) * max_llm_image_tokens
-    token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE,
-                       [hf_config.vision_end_token_id])
-    token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE,
-                       [0]) * (seq_len - max_llm_image_tokens - 2)
-    dummy_seqdata = SequenceData(token_ids)
+
+    dummy_seqdata = SequenceData.from_token_counts(
+        (hf_config.vision_start_token_id, 1),
+        (hf_config.image_token_id, max_llm_image_tokens),
+        (hf_config.vision_end_token_id, 1),
+        (0, seq_len - max_llm_image_tokens - 2),
+    )
+
     dummy_image = Image.new("RGB", (max_resized_width, max_resized_height),
                             color=0)

vllm/model_executor/models/siglip.py

Lines changed: 5 additions & 7 deletions
@@ -2,7 +2,6 @@
 within a vision language model."""

 import math
-from array import array
 from typing import Iterable, List, Optional, Tuple, Union

 import torch
@@ -24,7 +23,7 @@
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.multimodal.utils import (cached_get_tokenizer,
                                    repeat_and_pad_placeholder_tokens)
-from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE, SequenceData
+from vllm.sequence import SequenceData

 try:
     from xformers import ops as xops
@@ -67,11 +66,10 @@ def dummy_seq_data_for_siglip(
     else:
         image_feature_size = image_feature_size_override

-    token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE,
-                      [image_token_id]) * image_feature_size
-    token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE,
-                       [0]) * (seq_len - image_feature_size)
-    return SequenceData(token_ids)
+    return SequenceData.from_token_counts(
+        (image_token_id, image_feature_size * num_images),
+        (0, seq_len - image_feature_size * num_images),
+    )


 def dummy_image_for_siglip(
