Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 25 additions & 11 deletions .buildkite/test-amd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -302,20 +302,32 @@ steps:
# OOM in the CI unless we run this separately
- pytest -v -s tokenization

- label: V1 Test e2e + engine # 30min
- label: V1 Test e2e (excluding spec_decode) + engine # 30min
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
source_file_dependencies:
- vllm/
- tests/v1
commands:
# TODO: accuracy does not match, whether setting
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
- pytest -v -s v1/e2e
# Run v1/e2e tests without AITER, excluding spec_decode tests
# - pytest -v -s v1/e2e --ignore=v1/e2e/test_spec_decode.py
- pytest -v -s v1/engine

# - label: V1 Test spec_decode with AITER # 15min
# timeout_in_minutes: 25
# mirror_hardwares: [amdexperimental, amdproduction]
# agent_pool: mi325_1
# # grade: Blocking
# source_file_dependencies:
# - vllm/
# - tests/v1/e2e/test_spec_decode.py
# commands:
# # Spec decode tests require AITER FA for ROCm
# # See: https://github.com/vllm-project/vllm/issues/27619
# - VLLM_ROCM_USE_AITER=1 pytest -v -s v1/e2e/test_spec_decode.py

- label: V1 Test entrypoints # 35min
timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
Expand Down Expand Up @@ -371,7 +383,7 @@ steps:

- label: Examples Test # 30min
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
working_dir: "/vllm-workspace/examples"
Expand All @@ -390,13 +402,15 @@ steps:
- python3 offline_inference/vision_language_pooling.py --seed 0
- python3 offline_inference/vision_language_multi_image.py --seed 0
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
- python3 offline_inference/basic/classify.py
- python3 offline_inference/basic/embed.py
- python3 offline_inference/basic/score.py
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
# Encoder-decoder and encoder-only models are not supported on AMD yet: https://github.com/vllm-project/vllm/issues/27442
# - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
# - python3 offline_inference/basic/embed.py
# - python3 offline_inference/basic/score.py
# Use AITER FA to test spec decode until triton attention is fixed: https://github.com/vllm-project/vllm/issues/27619
- VLLM_ROCM_USE_AITER=1 python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
- VLLM_ROCM_USE_AITER=1 python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
#- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048

- label: Platform Tests (CUDA) # 4min
Expand Down
28 changes: 20 additions & 8 deletions tests/v1/e2e/test_spec_decode.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,14 +255,19 @@ def test_speculators_model_integration(
True,
marks=large_gpu_mark(min_gb=80),
), # works on 4x H100
(
pytest.param(
(
"eagle",
"eagle618/deepseek-v3-random",
"eagle618/eagle-deepseek-v3-random",
1,
),
False,
marks=pytest.mark.skipif(
current_platform.is_rocm(),
reason="DeepSeek head_dim=192 not supported by "
"AiterFlashAttention on ROCm",
),
),
],
ids=[
Expand All @@ -289,6 +294,11 @@ def test_eagle_correctness(
"TREE_ATTN is flaky in the test disable for now until it can be "
"resolved (see https://github.com/vllm-project/vllm/issues/22922)"
)
if attn_backend == "TRITON_ATTN":
pytest.skip(
"TRITON_ATTN has illegal memory access issue in the test disable for now "
"until it can be resolved (see https://github.com/vllm-project/vllm/issues/27619)"
)

# Generate test prompts inside the function instead of using fixture
test_prompts = get_test_prompts(mm_enabled)
Expand All @@ -307,12 +317,6 @@ def test_eagle_correctness(
m.setenv("VLLM_MLA_DISABLE", "1")
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)

if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm():
pytest.skip(
"TRITON_ATTN does not support "
"multi-token eagle spec decode on current platform"
)

if attn_backend == "FLASH_ATTN" and current_platform.is_rocm():
m.setenv("VLLM_ROCM_USE_AITER", "1")

Expand Down Expand Up @@ -361,7 +365,15 @@ def test_eagle_correctness(
["model_setup", "mm_enabled"],
[
(("mtp", "XiaomiMiMo/MiMo-7B-Base", 1), False),
(("mtp", "ZixiQi/DeepSeek-V3-4layers-MTP-FP8", 1), False),
pytest.param(
("mtp", "ZixiQi/DeepSeek-V3-4layers-MTP-FP8", 1),
False,
marks=pytest.mark.skipif(
current_platform.is_rocm(),
reason="DeepSeek head_dim=192 not supported by "
"AiterFlashAttention on ROCm",
),
),
],
ids=["mimo", "deepseek"],
)
Expand Down
6 changes: 3 additions & 3 deletions tests/v1/spec_decode/test_eagle.py
Original file line number Diff line number Diff line change
Expand Up @@ -426,10 +426,10 @@ class _TargetModelStub(LlamaForCausalLM):
def test_propose(method, attn_backend, num_speculative_tokens, monkeypatch):
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", attn_backend)

if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm():
if attn_backend == "TRITON_ATTN":
pytest.skip(
"TRITON_ATTN does not support "
"multi-token eagle spec decode on current platform"
"TRITON_ATTN has illegal memory access issue in the test disable for now "
"until it can be resolved (see https://github.com/vllm-project/vllm/issues/27619)"
)

if attn_backend == "TREE_ATTN":
Expand Down