
Commit 6397563

Merge branch 'vllm-project:main' into fix/flashinfer-prefill-token-limit
Parents: 978f575 + ef28354

File tree: 546 files changed (+20236 / -9948 lines)


.buildkite/release-pipeline.yaml

Lines changed: 1 addition & 1 deletion
@@ -76,7 +76,7 @@ steps:
     queue: arm64_cpu_queue_postmerge
   commands:
     - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
     - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"

 # Add job to create multi-arch manifest
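
The only functional change here is the new FLASHINFER_AOT_COMPILE=true build argument, which, going by its name, should compile FlashInfer kernels ahead of time for the listed torch_cuda_arch_list targets instead of JIT-compiling them on first use. A minimal sketch of reproducing the build locally, assuming the same docker/Dockerfile target; the output tag is illustrative, not the CI tag:

# Sketch: rebuild the release image locally with the new flag.
DOCKER_BUILDKIT=1 docker build \
  --build-arg CUDA_VERSION=12.9.1 \
  --build-arg FLASHINFER_AOT_COMPILE=true \
  --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' \
  --build-arg INSTALL_KV_CONNECTORS=true \
  --target vllm-openai \
  -f docker/Dockerfile \
  -t vllm-openai:flashinfer-aot .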

.buildkite/scripts/hardware_ci/run-cpu-test.sh

Lines changed: 2 additions & 5 deletions
@@ -58,11 +58,8 @@ function cpu_tests() {
   # pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
   # pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model

-  # Note: disable Bart until supports V1
-  pytest -x -v -s tests/models/language/generation -m cpu_model \
-    --ignore=tests/models/language/generation/test_bart.py
-  VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model \
-    --ignore=tests/models/language/generation/test_bart.py
+  pytest -x -v -s tests/models/language/generation -m cpu_model
+  VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model

   pytest -x -v -s tests/models/language/pooling -m cpu_model
   pytest -x -v -s tests/models/multimodal/generation \
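
With Bart now supported on V1, the --ignore flag (and the stale note above it) are gone, so tests/models/language/generation/test_bart.py runs again in both passes. To check just the re-enabled coverage locally (the path is taken from the removed --ignore flag):

# Run only the previously skipped Bart tests, with and without the SGL kernel path.
pytest -x -v -s tests/models/language/generation/test_bart.py -m cpu_model
VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation/test_bart.py -m cpu_model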

.buildkite/scripts/hardware_ci/run-xpu-test.sh

Lines changed: 3 additions & 4 deletions
@@ -35,16 +35,15 @@ docker run \
   python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
   python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
   python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
-  VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
+  VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
   cd tests
   pytest -v -s v1/core
   pytest -v -s v1/engine
   pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
   pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
   pytest -v -s v1/structured_output
-  pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py --ignore=v1/spec_decode/test_tree_attention.py
+  pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py
   pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
+  pytest -v -s v1/test_metrics
   pytest -v -s v1/test_serial_utils.py
-  pytest -v -s v1/test_utils.py
-  pytest -v -s v1/test_metrics_reader.py
 '
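
Three things happen in this hunk: the attention backend selector drops its _VLLM_V1 suffix, v1/spec_decode/test_eagle.py is no longer skipped, and the metrics coverage moves from v1/test_metrics_reader.py to the v1/test_metrics directory (v1/test_utils.py is dropped here as well). Scripts that pin the backend by name need the new value; whether the old name still resolves as a deprecated alias is not visible from this diff:

# Old selector: VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1 ...
# New selector:
VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py \
    --model facebook/opt-125m --block-size 64 --enforce-eager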

.buildkite/test-pipeline.yaml

Lines changed: 109 additions & 62 deletions
@@ -50,19 +50,28 @@ steps:
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - vllm/
+  - tests/multimodal
+  - tests/utils_
+  commands:
+  - pytest -v -s -m 'not cpu_test' multimodal
+  - pytest -v -s utils_
+
+- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
+  timeout_in_minutes: 10
+  source_file_dependencies:
+  - vllm/
   - tests/test_inputs.py
   - tests/test_outputs.py
   - tests/multimodal
-  - tests/utils_
   - tests/standalone_tests/lazy_imports.py
   - tests/transformers_utils
+  no_gpu: true
   commands:
   - python3 standalone_tests/lazy_imports.py
   - pytest -v -s test_inputs.py
   - pytest -v -s test_outputs.py
-  - pytest -v -s multimodal
-  - pytest -v -s utils_ # Utils
-  - pytest -v -s transformers_utils # transformers_utils
+  - pytest -v -s -m 'cpu_test' multimodal
+  - pytest -v -s transformers_utils

 - label: Python-only Installation Test # 10min
   timeout_in_minutes: 20
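
This hunk establishes the pattern the rest of the pipeline repeats: instead of duplicating suites, tests are split by a pytest marker, with the GPU job running everything except tests marked cpu_test and a new no_gpu job running only the marked subset. A sketch of the two halves, assuming cpu_test is registered as a marker in the suite's pytest configuration:

# GPU job: skip CPU-only tests.
pytest -v -s -m 'not cpu_test' multimodal
# CPU job (no_gpu: true): run only tests carrying @pytest.mark.cpu_test.
pytest -v -s -m 'cpu_test' multimodal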
@@ -159,10 +168,7 @@ steps:
   - examples/offline_inference/rlhf.py
   - examples/offline_inference/rlhf_colocate.py
   - tests/examples/offline_inference/data_parallel.py
-  - tests/v1/test_async_llm_dp.py
-  - tests/v1/test_external_lb_dp.py
-  - tests/v1/test_internal_lb_dp.py
-  - tests/v1/test_hybrid_lb_dp.py
+  - tests/v1/distributed
   - tests/v1/engine/test_engine_core_client.py
   - tests/distributed/test_symm_mem_allreduce.py
   commands:
@@ -180,10 +186,10 @@ steps:
   - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
   # test with internal dp
   - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
-  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
-  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
-  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_internal_lb_dp.py
-  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_hybrid_lb_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
+  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
+  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
   - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
   - pytest -v -s distributed/test_utils.py
   - pytest -v -s compile/test_basic_correctness.py
@@ -290,24 +296,34 @@ steps:
   - tests/v1
   commands:
   # split the test to avoid interference
-  - pytest -v -s v1/core
   - pytest -v -s v1/executor
   - pytest -v -s v1/kv_offload
   - pytest -v -s v1/sample
   - pytest -v -s v1/logits_processors
   - pytest -v -s v1/worker
-  - pytest -v -s v1/structured_output
   - pytest -v -s v1/spec_decode
-  - pytest -v -s v1/kv_connector/unit
-  - pytest -v -s v1/metrics
-  - pytest -v -s v1/test_serial_utils.py
-  - pytest -v -s v1/test_utils.py
+  - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
+  - pytest -v -s -m 'not cpu_test' v1/metrics
   - pytest -v -s v1/test_oracle.py
-  - pytest -v -s v1/test_metrics_reader.py
+  - pytest -v -s v1/test_request.py
   # Integration test for streaming correctness (requires special branch).
   - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
   - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine

+- label: V1 Test others (CPU) # 5 mins
+  source_file_dependencies:
+  - vllm/
+  - tests/v1
+  no_gpu: true
+  commands:
+  # split the test to avoid interference
+  - pytest -v -s v1/core
+  - pytest -v -s v1/structured_output
+  - pytest -v -s v1/test_serial_utils.py
+  - pytest -v -s -m 'cpu_test' v1/kv_connector/unit
+  - pytest -v -s -m 'cpu_test' v1/metrics
+
 - label: Examples Test # 30min
   timeout_in_minutes: 45
   mirror_hardwares: [amdexperimental]
@@ -463,29 +479,18 @@ steps:
   commands:
   - pytest -v -s kernels/mamba

-- label: Tensorizer Test # 14min
-  timeout_in_minutes: 25
-  mirror_hardwares: [amdexperimental]
-  source_file_dependencies:
-  - vllm/model_executor/model_loader
-  - tests/tensorizer_loader
-  - tests/entrypoints/openai/test_tensorizer_entrypoint.py
-  commands:
-  - apt-get update && apt-get install -y curl libsodium23
-  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s tensorizer_loader
-  - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
-
-- label: Model Executor Test # 7min
-  timeout_in_minutes: 20
+- label: Model Executor Test # 23min
+  timeout_in_minutes: 35
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - vllm/model_executor
   - tests/model_executor
+  - tests/entrypoints/openai/test_tensorizer_entrypoint.py
   commands:
   - apt-get update && apt-get install -y curl libsodium23
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -v -s model_executor
+  - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py

 - label: Benchmarks # 11min
   timeout_in_minutes: 20
@@ -520,7 +525,7 @@ steps:
   # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
   # we can only upgrade after this is resolved
   - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
-  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
+  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/

 - label: LM Eval Small Models # 53min
   timeout_in_minutes: 75
@@ -548,10 +553,17 @@ steps:
   source_file_dependencies:
   - vllm/
   - tests/tool_use
-  - tests/mistral_tool_use
   commands:
-  - pytest -v -s tool_use
-  - pytest -v -s mistral_tool_use
+  - pytest -v -s -m 'not cpu_test' tool_use
+
+- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
+  timeout_in_minutes: 10
+  source_file_dependencies:
+  - vllm/
+  - tests/tool_use
+  no_gpu: true
+  commands:
+  - pytest -v -s -m 'cpu_test' tool_use

 ##### models test #####

@@ -591,13 +603,19 @@ steps:
   - vllm/
   - tests/models/test_transformers.py
   - tests/models/test_registry.py
+  commands:
+  - pytest -v -s models/test_transformers.py models/test_registry.py
+
+- label: Basic Models Test (Other CPU) # 5min
+  timeout_in_minutes: 10
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
   - tests/models/test_utils.py
   - tests/models/test_vision.py
+  no_gpu: true
   commands:
-  - pytest -v -s models/test_transformers.py \
-          models/test_registry.py \
-          models/test_utils.py \
-          models/test_vision.py
+  - pytest -v -s models/test_utils.py models/test_vision.py

 - label: Language Models Tests (Standard)
   timeout_in_minutes: 25
@@ -767,11 +785,13 @@ steps:
   commands:
   - pip install --upgrade git+https://github.com/huggingface/transformers
   - pytest -v -s tests/models/test_initialization.py
+  - pytest -v -s tests/models/test_transformers.py
   - pytest -v -s tests/models/multimodal/processing/
   - pytest -v -s tests/models/multimodal/test_mapping.py
   - python3 examples/offline_inference/basic/chat.py
-  - python3 examples/offline_inference/audio_language.py --model-type whisper
   - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+  # Whisper needs spawn method to avoid deadlock
+  - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper

 - label: Blackwell Test # 38 min
   timeout_in_minutes: 60
@@ -827,6 +847,23 @@ steps:
   - uv pip install --system 'gpt-oss[eval]==0.0.5'
   - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 --server-args '--tensor-parallel-size 2'

+- label: Blackwell Quantized MoE Test
+  timeout_in_minutes: 60
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  source_file_dependencies:
+  - tests/quantization/test_blackwell_moe.py
+  - vllm/model_executor/models/deepseek_v2.py
+  - vllm/model_executor/models/gpt_oss.py
+  - vllm/model_executor/models/llama4.py
+  - vllm/model_executor/layers/fused_moe
+  - vllm/model_executor/layers/quantization/compressed_tensors
+  - vllm/model_executor/layers/quantization/modelopt.py
+  - vllm/model_executor/layers/quantization/mxfp4.py
+  - vllm/v1/attention/backends/flashinfer.py
+  commands:
+  - pytest -s -v tests/quantization/test_blackwell_moe.py
+
 ##### 1 GPU test #####
 ##### multi gpus test #####

@@ -869,48 +906,58 @@ steps:
   - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
   - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code

-- label: Distributed Tests (2 GPUs) # 110min
-  timeout_in_minutes: 150
+- label: Distributed Tests (2 GPUs) # 68min
+  timeout_in_minutes: 90
   mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
   source_file_dependencies:
+  - vllm/compilation/
   - vllm/distributed/
   - vllm/engine/
   - vllm/executor/
-  - vllm/model_executor/models/
-  - tests/distributed/
-  - vllm/compilation
   - vllm/worker/worker_base.py
-  - entrypoints/llm/test_collective_rpc.py
-  - tests/v1/test_async_llm_dp.py
-  - tests/v1/test_external_lb_dp.py
-  - tests/v1/entrypoints/openai/test_multi_api_servers.py
   - vllm/v1/engine/
   - vllm/v1/worker/
+  - tests/compile/test_basic_correctness.py
+  - tests/compile/test_wrapper.py
+  - tests/distributed/
+  - tests/entrypoints/llm/test_collective_rpc.py
+  - tests/v1/distributed
+  - tests/v1/entrypoints/openai/test_multi_api_servers.py
+  - tests/v1/shutdown
   - tests/v1/worker/test_worker_memory_snapshot.py
   commands:
-  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
-  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
   - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
   - pytest -v -s entrypoints/llm/test_collective_rpc.py
   - pytest -v -s ./compile/test_basic_correctness.py
   - pytest -v -s ./compile/test_wrapper.py
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+  - pytest -v -s distributed/test_sequence_parallel.py
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
+  - pytest -v -s v1/worker/test_worker_memory_snapshot.py
+
+- label: Distributed Model Tests (2 GPUs) # 37min
+  timeout_in_minutes: 50
+  mirror_hardwares: [amdexperimental]
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  source_file_dependencies:
+  - vllm/model_executor/model_loader/sharded_state_loader.py
+  - vllm/model_executor/models/
+  - tests/basic_correctness/
+  - tests/model_executor/model_loader/test_sharded_state_loader.py
+  - tests/models/
+  commands:
   - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py
   # Avoid importing model tests that cause CUDA reinitialization error
   - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
   - pytest models/language -v -s -m 'distributed(num_gpus=2)'
   - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
   - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
-  # test sequence parallel
-  - pytest -v -s distributed/test_sequence_parallel.py
-  # this test fails consistently.
-  # TODO: investigate and fix
-  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
-  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
-  - pytest -v -s models/multimodal/generation/test_maverick.py
-  - pytest -v -s v1/worker/test_worker_memory_snapshot.py

 - label: Plugin Tests (2 GPUs) # 40min
   timeout_in_minutes: 60
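
The former 110-minute job is split in two: core distributed plumbing stays in Distributed Tests (2 GPUs), while model-level runs (basic correctness, sharded-state loading, language and multimodal models) move to the new Distributed Model Tests (2 GPUs) job. Note that models/multimodal/generation/test_maverick.py no longer appears in either job in this hunk, and the data-parallel tests now live under tests/v1/distributed/. To run the relocated DP tests by hand on a two-GPU machine, assuming TP_SIZE and DP_SIZE parametrize the tests through the environment as they do in the pipeline:

cd tests
TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py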

.github/CODEOWNERS

Lines changed: 5 additions & 5 deletions
@@ -12,8 +12,6 @@
 /vllm/model_executor/layers/mamba @tdoublep
 /vllm/model_executor/model_loader @22quinn
 /vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
-/vllm/v1/attention @LucasWilkinson
-/vllm/v1/sample @22quinn @houseroad
 /vllm/vllm_flash_attn @LucasWilkinson
 /vllm/lora @jeejeelee
 /vllm/reasoning @aarnphm @chaunceyjiang
@@ -28,11 +26,13 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson

 # vLLM V1
 /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
-/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
-/vllm/v1/spec_decode @benchislett @luccafong
+/vllm/v1/attention @LucasWilkinson
 /vllm/v1/attention/backends/flashinfer.py @mgoin
 /vllm/v1/attention/backends/triton_attn.py @tdoublep
 /vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
+/vllm/v1/sample @22quinn @houseroad @njhill
+/vllm/v1/spec_decode @benchislett @luccafong
+/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
 /vllm/v1/kv_cache_interface.py @heheda12345
 /vllm/v1/offloading @ApostaC

@@ -54,7 +54,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/weight_loading @mgoin @youkaichao @yewentao256
 /tests/lora @jeejeelee
 /tests/models/language/generation/test_hybrid.py @tdoublep
-/tests/v1/kv_connector/nixl_integration @NickLucche
+/tests/v1/kv_connector/nixl_integration @NickLucche
 /tests/v1/kv_connector @ApostaC
 /tests/v1/offloading @ApostaC
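
These hunks consolidate the V1 ownership rules: /vllm/v1/attention and /vllm/v1/sample move from the top of the file into the vLLM V1 block (with @njhill added to /vllm/v1/sample), and the spec_decode and structured_output entries are re-sorted alphabetically rather than reassigned. The final hunk appears to be a whitespace-only fix, since the removed and added nixl_integration lines are otherwise identical. When reading these rules, keep in mind that GitHub applies the last matching CODEOWNERS pattern, so the file-level flashinfer.py and triton_attn.py entries still take precedence over the directory-level /vllm/v1/attention rule that now precedes them.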
