@@ -384,7 +384,12 @@ steps:
384384 --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
385385 --ignore=lora/test_chatglm3_tp.py \
386386 --ignore=lora/test_llama_tp.py \
387- --ignore=lora/test_llm_with_multi_loras.py
387+ --ignore=lora/test_llm_with_multi_loras.py \
388+ --ignore=lora/test_olmoe_tp.py \
389+ --ignore=lora/test_deepseekv2_tp.py \
390+ --ignore=lora/test_gptoss.py \
391+ --ignore=lora/test_qwen3moe_tp.py
392+
388393 parallelism : 4
389394
390395- label : PyTorch Compilation Unit Tests # 15min
@@ -416,15 +421,16 @@ steps:
416421 - pytest -v -s compile/test_basic_correctness.py
417422 - pytest -v -s compile/piecewise/
418423
419- - label : PyTorch Fullgraph Test # 20min
420- timeout_in_minutes : 30
424+ - label : PyTorch Fullgraph Test # 22min
425+ timeout_in_minutes : 35
421426 mirror_hardwares : [amdexperimental]
422427 torch_nightly : true
423428 source_file_dependencies :
424429 - vllm/
425430 - tests/compile
426431 commands :
427432 - pytest -v -s compile/test_full_graph.py
433+ - pytest -v -s compile/test_fusions_e2e.py
428434
429435- label : Kernels Core Operation Test # 48min
430436 timeout_in_minutes : 75
@@ -807,8 +813,8 @@ steps:
807813 # Whisper needs spawn method to avoid deadlock
808814 - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
809815
810- - label : Blackwell Test # 38 min
811- timeout_in_minutes : 60
816+ - label : Blackwell Test # 21 min
817+ timeout_in_minutes : 30
812818 working_dir : " /vllm-workspace/"
813819 gpu : b200
814820 # optional: true
@@ -821,8 +827,6 @@ steps:
821827 - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
822828 - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
823829 - vllm/v1/attention/backends/flashinfer.py
824- - vllm/compilation/fusion.py
825- - vllm/compilation/fusion_attn.py
826830 commands :
827831 - nvidia-smi
828832 - python3 examples/offline_inference/basic/chat.py
@@ -839,15 +843,32 @@ steps:
839843 - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
840844 - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
841845 - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
846+ - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
847+ - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
842848 - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
843849 - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
844- # Fusion
845- - pytest -v -s tests/compile/test_fusion_all_reduce.py
846- - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
847850 - pytest -v -s tests/kernels/moe/test_flashinfer.py
851+
852+ - label : Blackwell Fusion Tests # 30 min
853+ timeout_in_minutes : 40
854+ working_dir : " /vllm-workspace/"
855+ gpu : b200
856+ source_file_dependencies :
857+ - csrc/quantization/fp4/
858+ - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
859+ - vllm/v1/attention/backends/flashinfer.py
860+ - vllm/compilation/
861+ # can affect pattern matching
862+ - vllm/model_executor/layers/layernorm.py
863+ - vllm/model_executor/layers/activation.py
864+ - vllm/model_executor/layers/quantization/input_quant_fp8.py
865+ commands :
866+ - nvidia-smi
867+ - pytest -v -s tests/compile/test_fusion_attn.py
848868 - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
849- - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
850- - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
869+ # this runner has 2 GPUs available even though num_gpus=2 is not set
870+ - pytest -v -s tests/compile/test_fusion_all_reduce.py
871+ - pytest -v -s tests/compile/test_fusions_e2e.py
851872
852873- label : Blackwell GPT-OSS Eval
853874 timeout_in_minutes : 60
@@ -961,6 +982,7 @@ steps:
961982 - pytest -v -s ./compile/test_basic_correctness.py
962983 - pytest -v -s ./compile/test_wrapper.py
963984 - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
985+ - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
964986 - pytest -v -s distributed/test_sequence_parallel.py
965987 - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
966988 - pytest -v -s v1/worker/test_worker_memory_snapshot.py
@@ -1004,6 +1026,11 @@ steps:
10041026 - pytest -v -s plugins_tests/test_io_processor_plugins.py
10051027 - pip uninstall prithvi_io_processor_plugin -y
10061028 # end io_processor plugins test
1029+ # begin stat_logger plugins test
1030+ - pip install -e ./plugins/vllm_add_dummy_stat_logger
1031+ - pytest -v -s plugins_tests/test_stats_logger_plugins.py
1032+ - pip uninstall dummy_stat_logger -y
1033+ # end stat_logger plugins test
10071034 # other tests continue here:
10081035 - pytest -v -s plugins_tests/test_scheduler_plugins.py
10091036 - pip install -e ./plugins/vllm_add_dummy_model
@@ -1043,6 +1070,7 @@ steps:
10431070 - pytest -v -s -x lora/test_chatglm3_tp.py
10441071 - pytest -v -s -x lora/test_llama_tp.py
10451072 - pytest -v -s -x lora/test_llm_with_multi_loras.py
1073+ - pytest -v -s -x lora/test_olmoe_tp.py
10461074
10471075
10481076- label : Weight Loading Multiple GPU Test # 33min
@@ -1068,6 +1096,17 @@ steps:
10681096 - tests/weight_loading
10691097 commands :
10701098 - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
1099+
1100+ - label : NixlConnector PD accuracy tests (Distributed) # 30min
1101+ timeout_in_minutes : 40
1102+ working_dir : " /vllm-workspace/tests"
1103+ num_gpus : 4
1104+ source_file_dependencies :
1105+ - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
1106+ - tests/v1/kv_connector/nixl_integration/
1107+ commands :
1108+ - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
1109+ - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
10711110
10721111
10731112# #### multi gpus test #####
@@ -1100,14 +1139,16 @@ steps:
11001139 - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
11011140
11021141# #### H200 test #####
1103- - label : Distrubted Tests (H200) # optional
1142+ - label : Distributed Tests (H200) # optional
11041143 gpu : h200
11051144 optional : true
11061145 working_dir : " /vllm-workspace/"
11071146 num_gpus : 2
11081147 commands :
11091148 - pytest -v -s tests/compile/test_async_tp.py
11101149 - pytest -v -s tests/compile/test_sequence_parallelism.py
1150+ - pytest -v -s tests/compile/test_fusion_all_reduce.py
1151+ - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
11111152 - pytest -v -s tests/distributed/test_context_parallel.py
11121153 - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
11131154