
Commit fa40146

Merge remote-tracking branch 'origin/dev/antropic_v2' into dev/antropic_v2
2 parents 78b6608 + 94f9731 commit fa40146

File tree: 489 files changed (+12974 / -7105 lines)

Lines changed: 3 additions & 2 deletions

@@ -1,11 +1,12 @@
 # For hf script, without -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -b 32 -l 100 -t 8
+# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 100 -t 8
 model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
 backend: "vllm-vlm"
 tasks:
 - name: "chartqa"
   metrics:
   - name: "relaxed_accuracy,none"
-    value: 0.90
+    # TODO(zhewenl): model card is 0.90, but the actual score is 0.80.
+    value: 0.80
   limit: 100
   num_fewshot: 0
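The comment in this config doubles as a how-to for reproducing the chartqa baseline by hand. A minimal sketch of that invocation, taken from the updated comment above; the flag meanings (-m model, -l sample limit, -t tensor-parallel size) are inferred from the surrounding configs and should be treated as assumptions:

# Sketch: reproduce the chartqa baseline locally (flag semantics assumed from
# the config comments: -m model, -l sample limit, -t tensor parallel size).
bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh \
  -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 \
  -l 100 -t 8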

.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml

Lines changed: 1 addition & 2 deletions

@@ -1,7 +1,6 @@
 # For hf script, without -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -b 32 -l 250 -t 8 -f 5
+# bash .buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 250 -t 8 -f 5
 model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
-backend: "vllm-vlm"
 tasks:
 - name: "mmlu_pro"
   metrics:

.buildkite/release-pipeline.yaml

Lines changed: 30 additions & 0 deletions

@@ -15,6 +15,20 @@ steps:
     env:
       DOCKER_BUILDKIT: "1"

+  # aarch64 build.
+  - label: "Build arm64 CPU wheel"
+    depends_on: ~
+    id: build-wheel-arm64-cpu
+    agents:
+      queue: arm64_cpu_queue_postmerge
+    commands:
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile.cpu ."
+      - "mkdir artifacts"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+      - "bash .buildkite/scripts/upload-wheels.sh"
+    env:
+      DOCKER_BUILDKIT: "1"
+
   - label: "Build wheel - CUDA 12.8"
     depends_on: ~
     id: build-wheel-cuda-12-8

@@ -142,6 +156,22 @@ steps:
     env:
       DOCKER_BUILDKIT: "1"

+  - block: "Build arm64 CPU release image"
+    key: block-arm64-cpu-release-image-build
+    depends_on: ~
+
+  - label: "Build and publish arm64 CPU release image"
+    depends_on: block-arm64-cpu-release-image-build
+    agents:
+      queue: arm64_cpu_queue_postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
+    env:
+      DOCKER_BUILDKIT: "1"
+
   - label: "Build and publish nightly multi-arch image to DockerHub"
     depends_on:
       - create-multi-arch-manifest
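Apart from the agent queue and the upload step, the new wheel-build step is plain docker and shell, so it can be reproduced outside Buildkite on an arm64 host. A rough sketch using only the commands above; the sccache wiring and upload-wheels.sh are CI-specific and omitted:

# Sketch: build the arm64 CPU wheel image and copy dist/ out of it
# (arm64 host assumed; USE_SCCACHE dropped since it relies on CI credentials).
DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 \
  --tag vllm-ci:build-image --target build -f docker/Dockerfile.cpu .
mkdir -p artifacts
docker run --rm -v "$(pwd)/artifacts:/artifacts_host" vllm-ci:build-image \
  bash -c 'cp -r dist /artifacts_host'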

.buildkite/scripts/hardware_ci/run-cpu-test.sh

Lines changed: 1 addition & 1 deletion

@@ -70,7 +70,7 @@ function cpu_tests() {
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
     pytest -x -s -v \
-      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
+      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"

   # Note: disable it until supports V1
   # Run AWQ test
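The only change here is dropping the bracketed parametrization suffix from the pytest node id, so the CPU job now runs every parametrization of test_compressed_tensors_w8a8_logprobs instead of the single w8a8 case. As a standalone illustration of that selection syntax (same paths as the script above):

# Old behaviour: run exactly one parametrized case of the test.
pytest -x -s -v "tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"

# New behaviour: run every parametrization of the same test.
pytest -x -s -v tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs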

.buildkite/test-amd.yaml

Lines changed: 1 addition & 0 deletions

@@ -1081,6 +1081,7 @@ steps:
   - pytest -v -s ./compile/test_basic_correctness.py
   - pytest -v -s ./compile/test_wrapper.py
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+  - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
   - pytest -v -s distributed/test_sequence_parallel.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
   - pytest -v -s v1/worker/test_worker_memory_snapshot.py

.buildkite/test-pipeline.yaml

Lines changed: 54 additions & 13 deletions

@@ -384,7 +384,12 @@ steps:
     --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
     --ignore=lora/test_chatglm3_tp.py \
     --ignore=lora/test_llama_tp.py \
-    --ignore=lora/test_llm_with_multi_loras.py
+    --ignore=lora/test_llm_with_multi_loras.py \
+    --ignore=lora/test_olmoe_tp.py \
+    --ignore=lora/test_deepseekv2_tp.py \
+    --ignore=lora/test_gptoss.py \
+    --ignore=lora/test_qwen3moe_tp.py
+
   parallelism: 4

 - label: PyTorch Compilation Unit Tests # 15min

@@ -416,15 +421,16 @@ steps:
   - pytest -v -s compile/test_basic_correctness.py
   - pytest -v -s compile/piecewise/

-- label: PyTorch Fullgraph Test # 20min
-  timeout_in_minutes: 30
+- label: PyTorch Fullgraph Test # 22min
+  timeout_in_minutes: 35
   mirror_hardwares: [amdexperimental]
   torch_nightly: true
   source_file_dependencies:
   - vllm/
   - tests/compile
   commands:
   - pytest -v -s compile/test_full_graph.py
+  - pytest -v -s compile/test_fusions_e2e.py

 - label: Kernels Core Operation Test # 48min
   timeout_in_minutes: 75

@@ -807,8 +813,8 @@ steps:
   # Whisper needs spawn method to avoid deadlock
   - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper

-- label: Blackwell Test # 38 min
-  timeout_in_minutes: 60
+- label: Blackwell Test # 21 min
+  timeout_in_minutes: 30
   working_dir: "/vllm-workspace/"
   gpu: b200
   # optional: true

@@ -821,8 +827,6 @@ steps:
   - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
   - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
   - vllm/v1/attention/backends/flashinfer.py
-  - vllm/compilation/fusion.py
-  - vllm/compilation/fusion_attn.py
   commands:
   - nvidia-smi
   - python3 examples/offline_inference/basic/chat.py

@@ -839,15 +843,32 @@ steps:
   - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
   - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
   - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
+  - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
+  - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
   - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
   - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
-  # Fusion
-  - pytest -v -s tests/compile/test_fusion_all_reduce.py
-  - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
   - pytest -v -s tests/kernels/moe/test_flashinfer.py
+
+- label: Blackwell Fusion Tests # 30 min
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/compilation/
+  # can affect pattern matching
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  commands:
+  - nvidia-smi
+  - pytest -v -s tests/compile/test_fusion_attn.py
   - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
-  - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
-  - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
+  # this runner has 2 GPUs available even though num_gpus=2 is not set
+  - pytest -v -s tests/compile/test_fusion_all_reduce.py
+  - pytest -v -s tests/compile/test_fusions_e2e.py

 - label: Blackwell GPT-OSS Eval
   timeout_in_minutes: 60

@@ -961,6 +982,7 @@ steps:
   - pytest -v -s ./compile/test_basic_correctness.py
   - pytest -v -s ./compile/test_wrapper.py
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+  - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
   - pytest -v -s distributed/test_sequence_parallel.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
   - pytest -v -s v1/worker/test_worker_memory_snapshot.py

@@ -1004,6 +1026,11 @@ steps:
   - pytest -v -s plugins_tests/test_io_processor_plugins.py
   - pip uninstall prithvi_io_processor_plugin -y
   # end io_processor plugins test
+  # begin stat_logger plugins test
+  - pip install -e ./plugins/vllm_add_dummy_stat_logger
+  - pytest -v -s plugins_tests/test_stats_logger_plugins.py
+  - pip uninstall dummy_stat_logger -y
+  # end stat_logger plugins test
   # other tests continue here:
   - pytest -v -s plugins_tests/test_scheduler_plugins.py
   - pip install -e ./plugins/vllm_add_dummy_model

@@ -1043,6 +1070,7 @@ steps:
   - pytest -v -s -x lora/test_chatglm3_tp.py
   - pytest -v -s -x lora/test_llama_tp.py
   - pytest -v -s -x lora/test_llm_with_multi_loras.py
+  - pytest -v -s -x lora/test_olmoe_tp.py


 - label: Weight Loading Multiple GPU Test # 33min

@@ -1068,6 +1096,17 @@ steps:
   - tests/weight_loading
   commands:
   - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
+
+- label: NixlConnector PD accuracy tests (Distributed) # 30min
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+  - tests/v1/kv_connector/nixl_integration/
+  commands:
+  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+  - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh


 ##### multi gpus test #####

@@ -1100,14 +1139,16 @@ steps:
   - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4

 ##### H200 test #####
-- label: Distrubted Tests (H200) # optional
+- label: Distributed Tests (H200) # optional
   gpu: h200
   optional: true
   working_dir: "/vllm-workspace/"
   num_gpus: 2
   commands:
   - pytest -v -s tests/compile/test_async_tp.py
   - pytest -v -s tests/compile/test_sequence_parallelism.py
+  - pytest -v -s tests/compile/test_fusion_all_reduce.py
+  - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
   - pytest -v -s tests/distributed/test_context_parallel.py
   - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048

.github/CODEOWNERS

Lines changed: 5 additions & 4 deletions

@@ -5,8 +5,8 @@
 /vllm/attention @LucasWilkinson
 /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
-/vllm/model_executor/layers/fused_moe @mgoin
-/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
+/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
+/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
 /vllm/model_executor/layers/mamba @tdoublep
 /vllm/model_executor/model_loader @22quinn
 /vllm/multimodal @DarkLight1337 @ywang96 @NickLucche

@@ -25,7 +25,8 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson

 # vLLM V1
 /vllm/v1/attention @LucasWilkinson
-/vllm/v1/attention/backends/flashinfer.py @mgoin
+/vllm/v1/attention/backends/mla @pavanimajety
+/vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
 /vllm/v1/attention/backends/triton_attn.py @tdoublep
 /vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
 /vllm/v1/sample @22quinn @houseroad @njhill

@@ -44,7 +45,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
 /tests/models @DarkLight1337 @ywang96
 /tests/multimodal @DarkLight1337 @ywang96 @NickLucche
-/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
+/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 @pavanimajety
 /tests/test_inputs.py @DarkLight1337 @ywang96
 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
 /tests/v1/structured_output @mgoin @russellb @aarnphm

.gitignore

Lines changed: 3 additions & 0 deletions

@@ -94,6 +94,9 @@ ipython_config.py
 # generated files
 **/generated/**

+# uv
+uv.lock
+
 # pyenv
 # For a library or package, you might want to ignore these files since the code is
 # intended to run in multiple environments; otherwise, check them in:

.markdownlint.yaml

Lines changed: 0 additions & 1 deletion

@@ -4,7 +4,6 @@ MD013: false
 MD024:
   siblings_only: true
 MD033: false
-MD042: false
 MD045: false
 MD046: false
 MD051: false

CMakeLists.txt

Lines changed: 1 addition & 0 deletions

@@ -883,6 +883,7 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
 set(VLLM_MOE_EXT_SRC
   "csrc/moe/torch_bindings.cpp"
   "csrc/moe/moe_align_sum_kernels.cu"
+  "csrc/moe/moe_lora_align_sum_kernels.cu"
   "csrc/moe/topk_softmax_kernels.cu")

 if(VLLM_GPU_LANG STREQUAL "CUDA")
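The new kernel source only takes effect once the CUDA extensions are rebuilt. A minimal sketch of such a rebuild, assuming an editable from-source install; the VLLM_TARGET_DEVICE setting and build-isolation flag are assumptions about a typical setup, not part of this commit:

# Sketch: recompile the extensions so csrc/moe/moe_lora_align_sum_kernels.cu is
# picked up (editable install from the repo root; exact flags depend on your environment).
VLLM_TARGET_DEVICE=cuda pip install -e . --no-build-isolation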
