From 7c50b3e928916ee99e425b59a20691528464f159 Mon Sep 17 00:00:00 2001 From: Icey <1790571317@qq.com> Date: Sat, 25 Oct 2025 08:29:28 +0000 Subject: [PATCH 01/13] Upgrade to 0.11.1 newest vllm commit Signed-off-by: Icey <1790571317@qq.com> --- vllm_ascend/kv_offload/cpu_npu.py | 8 ++++++- vllm_ascend/models/qwen3_next.py | 2 +- vllm_ascend/sample/rejection_sampler.py | 26 +++++++++++++++++------ vllm_ascend/spec_decode/eagle_proposer.py | 3 ++- vllm_ascend/worker/model_runner_v1.py | 5 +++-- 5 files changed, 33 insertions(+), 11 deletions(-) diff --git a/vllm_ascend/kv_offload/cpu_npu.py b/vllm_ascend/kv_offload/cpu_npu.py index 8924ebcf74..c19ec1b0b2 100644 --- a/vllm_ascend/kv_offload/cpu_npu.py +++ b/vllm_ascend/kv_offload/cpu_npu.py @@ -2,11 +2,17 @@ import torch from vllm.attention import AttentionBackend from vllm.logger import init_logger -from vllm.utils import is_pin_memory_available from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec from vllm.v1.kv_offload.worker.worker import (OffloadingHandler, TransferResult, TransferSpec) +from vllm_ascend.utils import vllm_version_is + +if vllm_version_is("0.11.0"): + from vllm.utils import is_pin_memory_available +else: + from vllm.utils.platform_utils import is_pin_memory_available + logger = init_logger(__name__) diff --git a/vllm_ascend/models/qwen3_next.py b/vllm_ascend/models/qwen3_next.py index f5b4b8a142..aa1cd9f517 100644 --- a/vllm_ascend/models/qwen3_next.py +++ b/vllm_ascend/models/qwen3_next.py @@ -697,4 +697,4 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.num_physical_experts = example_layer.n_physical_experts self.num_local_physical_experts = example_layer.n_local_physical_experts self.num_routed_experts = example_layer.n_routed_experts - self.num_redundant_experts = example_layer.n_redundant_experts + self.num_redundant_experts = example_layer.n_redundant_experts \ No newline at end of file diff --git a/vllm_ascend/sample/rejection_sampler.py 
b/vllm_ascend/sample/rejection_sampler.py index e0d770df26..52787fa29f 100644 --- a/vllm_ascend/sample/rejection_sampler.py +++ b/vllm_ascend/sample/rejection_sampler.py @@ -5,10 +5,17 @@ import torch.nn as nn import vllm.v1.sample.rejection_sampler as rs from vllm.v1.sample.metadata import SamplingMetadata -from vllm.v1.sample.rejection_sampler import (RejectionSampler, compute_probs, +from vllm.v1.sample.rejection_sampler import (RejectionSampler, generate_uniform_probs) from vllm.v1.spec_decode.metadata import SpecDecodeMetadata +from vllm_ascend.utils import vllm_version_is + +if vllm_version_is("0.11.0"): + from vllm.v1.sample.rejection_sampler import compute_probs +else: + from vllm.v1.sample.rejection_sampler import apply_sampling_constraints + PLACEHOLDER_TOKEN_ID = -1 GREEDY_TEMPERATURE = -1 # Maximum number of speculative draft tokens allowed per request in a single @@ -82,11 +89,18 @@ def forward( # [num_tokens, vocab_size] # NOTE(woosuk): `target_logits` can be updated in place inside the # `compute_probs` function. 
- target_probs = compute_probs( - target_logits, - metadata.cu_num_draft_tokens, - sampling_metadata, - ) + if vllm_version_is("0.11.0"): + target_probs = compute_probs( + target_logits, + metadata.cu_num_draft_tokens, + sampling_metadata, + ) + else: + target_probs = apply_sampling_constraints( + target_logits, + metadata.cu_num_draft_tokens, + sampling_metadata, + ) output_token_ids = rejection_sample( metadata.draft_token_ids, diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py index 6a8bb8d69c..74e2917806 100644 --- a/vllm_ascend/spec_decode/eagle_proposer.py +++ b/vllm_ascend/spec_decode/eagle_proposer.py @@ -12,7 +12,6 @@ from vllm.model_executor.model_loader import get_model from vllm.model_executor.models import supports_multimodal from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM -from vllm.utils import is_pin_memory_available from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.spec_decode.metadata import SpecDecodeMetadata @@ -27,8 +26,10 @@ if vllm_version_is("0.11.0"): from vllm.config import CompilationLevel + from vllm.utils import is_pin_memory_available else: from vllm.config import CompilationMode + from vllm.utils.platform_utils import is_pin_memory_available PADDING_SLOT_ID = -1 diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index f30a9a39b4..88cfcd5c20 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -72,7 +72,7 @@ from vllm.sampling_params import SamplingType from vllm.sequence import IntermediateTensors from vllm.tasks import GenerationTask, PoolingTask, SupportedTask -from vllm.utils import cdiv, is_pin_memory_available +from vllm.utils import cdiv from vllm.utils.jsontree import json_map_leaves from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder from vllm.v1.attention.backends.utils import 
( @@ -159,13 +159,14 @@ if vllm_version_is("0.11.0"): from vllm.attention.layer import Attention from vllm.config import CompilationLevel - from vllm.utils import LazyLoader + from vllm.utils import LazyLoader, is_pin_memory_available from vllm_ascend.models.layers.mla import AscendMultiHeadLatentAttention else: from vllm.attention.layer import MLAAttention from vllm.config import CompilationMode from vllm.utils.import_utils import LazyLoader + from vllm.utils.platform_utils import is_pin_memory_available if TYPE_CHECKING: import xgrammar as xgr # type: ignore[import-untyped] From 94c91255d9d5930e4baf73e449b57467e0a927ce Mon Sep 17 00:00:00 2001 From: Icey <1790571317@qq.com> Date: Sat, 25 Oct 2025 08:41:38 +0000 Subject: [PATCH 02/13] change commit and fix send_delta_data Signed-off-by: Icey <1790571317@qq.com> --- .github/workflows/format_pr_body.yaml | 2 +- .github/workflows/vllm_ascend_test.yaml | 6 +- .github/workflows/vllm_ascend_test_full.yaml | 2 +- vllm_ascend/platform.py | 59 ++++++++++++++------ 4 files changed, 46 insertions(+), 23 deletions(-) diff --git a/.github/workflows/format_pr_body.yaml b/.github/workflows/format_pr_body.yaml index 2d8a729d53..a95dcc6f2d 100644 --- a/.github/workflows/format_pr_body.yaml +++ b/.github/workflows/format_pr_body.yaml @@ -36,7 +36,7 @@ jobs: - name: Get vLLM version run: | - VLLM_COMMIT=c9461e05a4ed3557cfbf4b15ded1e26761cc39ca + VLLM_COMMIT=83f478bb19489b41e9d208b47b4bb5a95ac171ac echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV - name: Checkout repository diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml index cc504b8e37..b0b1bd2079 100644 --- a/.github/workflows/vllm_ascend_test.yaml +++ b/.github/workflows/vllm_ascend_test.yaml @@ -42,7 +42,7 @@ jobs: lint: uses: ./.github/workflows/pre-commit.yml with: - vllm: c9461e05a4ed3557cfbf4b15ded1e26761cc39ca + vllm: 83f478bb19489b41e9d208b47b4bb5a95ac171ac changes: runs-on: 
ubuntu-latest @@ -83,7 +83,7 @@ jobs: VLLM_USE_MODELSCOPE: True strategy: matrix: - vllm_version: [c9461e05a4ed3557cfbf4b15ded1e26761cc39ca, v0.11.0] + vllm_version: [83f478bb19489b41e9d208b47b4bb5a95ac171ac, v0.11.0] steps: - name: Install packages run: | @@ -140,7 +140,7 @@ jobs: name: e2e-light strategy: matrix: - vllm_version: [c9461e05a4ed3557cfbf4b15ded1e26761cc39ca, v0.11.0] + vllm_version: [83f478bb19489b41e9d208b47b4bb5a95ac171ac, v0.11.0] # Note (yikun): If CI resource are limited we can split job into two chain jobs needs: [lint, changes] # only trigger e2e test after lint passed and the change is e2e related with pull request. diff --git a/.github/workflows/vllm_ascend_test_full.yaml b/.github/workflows/vllm_ascend_test_full.yaml index a821263f55..088036e6d8 100644 --- a/.github/workflows/vllm_ascend_test_full.yaml +++ b/.github/workflows/vllm_ascend_test_full.yaml @@ -69,7 +69,7 @@ jobs: name: e2e-full strategy: matrix: - vllm_version: [c9461e05a4ed3557cfbf4b15ded1e26761cc39ca, v0.11.0] + vllm_version: [83f478bb19489b41e9d208b47b4bb5a95ac171ac, v0.11.0] needs: [changes] if: ${{ needs.changes.outputs.e2e_tracker == 'true' }} uses: ./.github/workflows/_e2e_test.yaml diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index fa4e802c65..e47924f035 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -142,24 +142,47 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: "Non-MLA LLMs forcibly disable the chunked prefill feature," "as the performance of operators supporting this feature " "functionality is currently suboptimal.") - if not model_config.is_multimodal_model and \ - structured_outputs_config.backend == "auto" and \ - not getattr(scheduler_config, "scheduler_delay_factor", 0) > 0 and \ - scheduler_config.policy == "fcfs": - ascend_scheduler_config.enabled = True - chunked_prefill_enabled_in_ascend_scheduler = getattr( - ascend_scheduler_config, "enable_chunked_prefill", False) - if 
chunked_prefill_enabled_in_ascend_scheduler: - logger.warning( - "Chunked prefill feature is enabled in ascend_scheduler," - "but note that the operator supporting this feature " - "would lead to performance degradation.") - # In this situation, max_num_batched_tokens would have been rewritten. - # So we must make sure max_num_batched_tokens is not smaller than max_model_len. - if (scheduler_config.max_num_batched_tokens - < scheduler_config.max_model_len - and not chunked_prefill_enabled_in_ascend_scheduler): - scheduler_config.max_num_batched_tokens = scheduler_config.max_model_len + if vllm_version_is("0.11.0"): + if not model_config.is_multimodal_model and \ + structured_outputs_config.backend == "auto" and \ + not scheduler_config.send_delta_data and \ + not getattr(scheduler_config, "scheduler_delay_factor", 0) > 0 and \ + scheduler_config.policy == "fcfs": + ascend_scheduler_config.enabled = True + chunked_prefill_enabled_in_ascend_scheduler = getattr( + ascend_scheduler_config, "enable_chunked_prefill", + False) + if chunked_prefill_enabled_in_ascend_scheduler: + logger.warning( + "Chunked prefill feature is enabled in ascend_scheduler," + "but note that the operator supporting this feature " + "would lead to performance degradation.") + # In this situation, max_num_batched_tokens would have been rewritten. + # So we must make sure max_num_batched_tokens is not smaller than max_model_len. 
+ if (scheduler_config.max_num_batched_tokens + < scheduler_config.max_model_len and + not chunked_prefill_enabled_in_ascend_scheduler): + scheduler_config.max_num_batched_tokens = scheduler_config.max_model_len + else: + if not model_config.is_multimodal_model and \ + structured_outputs_config.backend == "auto" and \ + not getattr(scheduler_config, "scheduler_delay_factor", 0) > 0 and \ + scheduler_config.policy == "fcfs": + ascend_scheduler_config.enabled = True + chunked_prefill_enabled_in_ascend_scheduler = getattr( + ascend_scheduler_config, "enable_chunked_prefill", + False) + if chunked_prefill_enabled_in_ascend_scheduler: + logger.warning( + "Chunked prefill feature is enabled in ascend_scheduler," + "but note that the operator supporting this feature " + "would lead to performance degradation.") + # In this situation, max_num_batched_tokens would have been rewritten. + # So we must make sure max_num_batched_tokens is not smaller than max_model_len. + if (scheduler_config.max_num_batched_tokens + < scheduler_config.max_model_len and + not chunked_prefill_enabled_in_ascend_scheduler): + scheduler_config.max_num_batched_tokens = scheduler_config.max_model_len kv_cache_dtype = vllm_config.additional_config.get( "kv_cache_dtype", None) From c2dc16549d34f8f51c8622a13b9a01a88ba029e1 Mon Sep 17 00:00:00 2001 From: Icey <1790571317@qq.com> Date: Sat, 25 Oct 2025 09:54:30 +0000 Subject: [PATCH 03/13] fix init_with_cudagraph_sizes Signed-off-by: Icey <1790571317@qq.com> --- vllm_ascend/platform.py | 7 +++++-- vllm_ascend/utils.py | 12 ++++++++++-- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index e47924f035..64ffc7f6e6 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -260,8 +260,11 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: f"{vllm_config.parallel_config.tensor_parallel_size}") if len(sp_aclgraph_sizes) != len(original_sizes): 
compilation_config.cudagraph_capture_sizes = sp_aclgraph_sizes - vllm_config.compilation_config.init_with_cudagraph_sizes( - sp_aclgraph_sizes) + if vllm_version_is("0.11.0"): + compilation_config.init_with_cudagraph_sizes( + sp_aclgraph_sizes) + else: + vllm_config.compilation_config.post_init_cudagraph_sizes() # TODO: Full graph is fully supported later, and the default value will be set to full graph. if compilation_config.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE: diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index a3b908a09e..303adfa9cc 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -402,7 +402,11 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None: indices[0], indices[-1] = 0, len(original_sizes) - 1 sampled_sizes = [original_sizes[i] for i in indices] - compilation_config.init_with_cudagraph_sizes(sampled_sizes) + if vllm_version_is("0.11.0"): + compilation_config.init_with_cudagraph_sizes(sampled_sizes) + else: + compilation_config.cudagraph_capture_sizes = sampled_sizes + compilation_config.post_init_cudagraph_sizes() logger.info( "Adjusted ACL graph batch sizes for %s model (layers: %d): %d → %d sizes", @@ -433,7 +437,11 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None: if original_sizes[0] < (num_speculative_tokens + 1) * max_num_seqs: enlarged_sizes = [(num_speculative_tokens + 1) * size for size in original_sizes] - compilation_config.init_with_cudagraph_sizes(enlarged_sizes) + if vllm_version_is("0.11.0"): + compilation_config.init_with_cudagraph_sizes(enlarged_sizes) + else: + compilation_config.cudagraph_capture_sizes = enlarged_sizes + compilation_config.post_init_cudagraph_sizes() logger.info( "Adjusted ACL graphs: %s → %s for speculative decoding", original_sizes, enlarged_sizes) From 6ba3f392e526c5526a12e95413fe121131e35f50 Mon Sep 17 00:00:00 2001 From: Icey <1790571317@qq.com> Date: Mon, 27 Oct 2025 03:22:15 +0000 Subject: [PATCH 04/13] skip embed aclgraph e2e Signed-off-by: Icey
<1790571317@qq.com> --- tests/e2e/singlecard/test_embedding_aclgraph.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/e2e/singlecard/test_embedding_aclgraph.py b/tests/e2e/singlecard/test_embedding_aclgraph.py index e0851b0646..56ea57a4b0 100644 --- a/tests/e2e/singlecard/test_embedding_aclgraph.py +++ b/tests/e2e/singlecard/test_embedding_aclgraph.py @@ -24,10 +24,10 @@ from tests.e2e.utils import check_embeddings_close os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" +os.environ["VLLM_USE_MODELSCOPE"] = "True" MODELS = ["BAAI/bge-m3"] - @pytest.mark.parametrize("model_name", MODELS) def test_aclgrpah_embed_models_correctness(model_name): queries = ['What is the capital of China?', 'Explain gravity'] @@ -36,12 +36,14 @@ def test_aclgrpah_embed_models_correctness(model_name): model_name, task="embed", enforce_eager=False, + # load_format="dummy", ) as vllm_aclgraph_runner: vllm_aclgraph_outputs = vllm_aclgraph_runner.encode(queries) with VllmRunner( model_name, task="embed", + # load_format="dummy", enforce_eager=True, ) as vllm_runner: vllm_outputs = vllm_runner.encode(queries) From e8849b4c8afcdc68c349f04c51d4190bb982c153 Mon Sep 17 00:00:00 2001 From: Icey <1790571317@qq.com> Date: Mon, 27 Oct 2025 08:34:39 +0000 Subject: [PATCH 05/13] fix init_with_cudagraph_sizes Signed-off-by: Icey <1790571317@qq.com> --- vllm_ascend/platform.py | 6 ++++-- vllm_ascend/utils.py | 40 ++++++++++++++++++++++++++++++++++++---- 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index 64ffc7f6e6..d7550bf11b 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -33,7 +33,8 @@ delete_torchair_cache_file) from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, enable_sp, is_310p, prefill_context_parallel_enable, - update_aclgraph_sizes, vllm_version_is) + update_aclgraph_sizes, + update_cudagraph_capture_sizes, vllm_version_is) if TYPE_CHECKING: from vllm.config import 
ModelConfig, VllmConfig @@ -264,7 +265,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: compilation_config.init_with_cudagraph_sizes( sp_aclgraph_sizes) else: - vllm_config.compilation_config.post_init_cudagraph_sizes() + update_cudagraph_capture_sizes(vllm_config, + sp_aclgraph_sizes) # TODO: Full graph is fully supported later, and the default value will be set to full graph. if compilation_config.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE: diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index 303adfa9cc..a6a4d6d013 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -311,6 +311,40 @@ def _rec_find(d): return max(layer_counts) +# Update cudagraph capture sizes for vllm config +def update_cudagraph_capture_sizes( + vllm_config: VllmConfig, + cudagraph_capture_sizes: List[int]) -> List[int]: + valid_max_size = (cudagraph_capture_sizes[-1] + if cudagraph_capture_sizes else 0) + if (vllm_config.compilation_config.max_cudagraph_capture_size is not None + and vllm_config.compilation_config.max_cudagraph_capture_size + != valid_max_size): + if vllm_config.compilation_config.cudagraph_capture_sizes is not None: + raise ValueError( + "customized max_cudagraph_capture_size" + f"(={vllm_config.compilation_config.max_cudagraph_capture_size}) " + "should be consistent with the max value of " + f"cudagraph_capture_sizes(={valid_max_size})") + logger.warning( + "Truncating max_cudagraph_capture_size to %d", + valid_max_size, + ) + + vllm_config.compilation_config.max_cudagraph_capture_size = valid_max_size + if vllm_config.compilation_config.cudagraph_capture_sizes is not None and len( + cudagraph_capture_sizes) < len( + vllm_config.compilation_config.cudagraph_capture_sizes): + logger.warning( + ("cudagraph_capture_sizes specified in compilation_config" + " %s is overridden by config %s"), + vllm_config.compilation_config.cudagraph_capture_sizes, + cudagraph_capture_sizes, + ) + 
vllm_config.compilation_config.cudagraph_capture_sizes = cudagraph_capture_sizes + vllm_config.compilation_config.post_init_cudagraph_sizes() + + def update_aclgraph_sizes(vllm_config: VllmConfig) -> None: """Update ACL graph capture sizes based on hardware limitations""" # NOTE: Currently, we can only capture 1800 graphs at most, @@ -405,8 +439,7 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None: if vllm_version_is("0.11.0"): compilation_config.init_with_cudagraph_sizes(sampled_sizes) else: - compilation_config.cudagraph_capture_sizes = sampled_sizes - compilation_config.post_init_cudagraph_sizes() + update_cudagraph_capture_sizes(sampled_sizes) logger.info( "Adjusted ACL graph batch sizes for %s model (layers: %d): %d → %d sizes", @@ -440,8 +473,7 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None: if vllm_version_is("0.11.0"): compilation_config.init_with_cudagraph_sizes(enlarged_sizes) else: - compilation_config.cudagraph_capture_sizes = enlarged_sizes - compilation_config.post_init_cudagraph_sizes() + update_cudagraph_capture_sizes(vllm_config, enlarged_sizes) logger.info( "Adjusted ACL graphs: %s → %s for speculative decoding", original_sizes, enlarged_sizes) From 0ca98f5036d99f35416864321eb3f8dc8fe5f769 Mon Sep 17 00:00:00 2001 From: Icey <1790571317@qq.com> Date: Mon, 27 Oct 2025 08:37:01 +0000 Subject: [PATCH 06/13] change commit id to 0.11.1 Signed-off-by: Icey <1790571317@qq.com> --- .github/workflows/format_pr_body.yaml | 2 +- .github/workflows/vllm_ascend_test.yaml | 6 +++--- .github/workflows/vllm_ascend_test_full.yaml | 2 +- tests/e2e/singlecard/test_embedding_aclgraph.py | 4 +--- 4 files changed, 6 insertions(+), 8 deletions(-) diff --git a/.github/workflows/format_pr_body.yaml b/.github/workflows/format_pr_body.yaml index a95dcc6f2d..f790bb8986 100644 --- a/.github/workflows/format_pr_body.yaml +++ b/.github/workflows/format_pr_body.yaml @@ -36,7 +36,7 @@ jobs: - name: Get vLLM version run: | - 
VLLM_COMMIT=83f478bb19489b41e9d208b47b4bb5a95ac171ac + VLLM_COMMIT=releases/v0.11.1 echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV - name: Checkout repository diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml index b0b1bd2079..da76175086 100644 --- a/.github/workflows/vllm_ascend_test.yaml +++ b/.github/workflows/vllm_ascend_test.yaml @@ -42,7 +42,7 @@ jobs: lint: uses: ./.github/workflows/pre-commit.yml with: - vllm: 83f478bb19489b41e9d208b47b4bb5a95ac171ac + vllm: releases/v0.11.1 changes: runs-on: ubuntu-latest @@ -83,7 +83,7 @@ jobs: VLLM_USE_MODELSCOPE: True strategy: matrix: - vllm_version: [83f478bb19489b41e9d208b47b4bb5a95ac171ac, v0.11.0] + vllm_version: [releases/v0.11.1, v0.11.0] steps: - name: Install packages run: | @@ -140,7 +140,7 @@ jobs: name: e2e-light strategy: matrix: - vllm_version: [83f478bb19489b41e9d208b47b4bb5a95ac171ac, v0.11.0] + vllm_version: [releases/v0.11.1, v0.11.0] # Note (yikun): If CI resource are limited we can split job into two chain jobs needs: [lint, changes] # only trigger e2e test after lint passed and the change is e2e related with pull request. 
diff --git a/.github/workflows/vllm_ascend_test_full.yaml b/.github/workflows/vllm_ascend_test_full.yaml index 088036e6d8..d9c6f6c61e 100644 --- a/.github/workflows/vllm_ascend_test_full.yaml +++ b/.github/workflows/vllm_ascend_test_full.yaml @@ -69,7 +69,7 @@ jobs: name: e2e-full strategy: matrix: - vllm_version: [83f478bb19489b41e9d208b47b4bb5a95ac171ac, v0.11.0] + vllm_version: [releases/v0.11.1, v0.11.0] needs: [changes] if: ${{ needs.changes.outputs.e2e_tracker == 'true' }} uses: ./.github/workflows/_e2e_test.yaml diff --git a/tests/e2e/singlecard/test_embedding_aclgraph.py b/tests/e2e/singlecard/test_embedding_aclgraph.py index 56ea57a4b0..e0851b0646 100644 --- a/tests/e2e/singlecard/test_embedding_aclgraph.py +++ b/tests/e2e/singlecard/test_embedding_aclgraph.py @@ -24,10 +24,10 @@ from tests.e2e.utils import check_embeddings_close os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" -os.environ["VLLM_USE_MODELSCOPE"] = "True" MODELS = ["BAAI/bge-m3"] + @pytest.mark.parametrize("model_name", MODELS) def test_aclgrpah_embed_models_correctness(model_name): queries = ['What is the capital of China?', 'Explain gravity'] @@ -36,14 +36,12 @@ def test_aclgrpah_embed_models_correctness(model_name): model_name, task="embed", enforce_eager=False, - # load_format="dummy", ) as vllm_aclgraph_runner: vllm_aclgraph_outputs = vllm_aclgraph_runner.encode(queries) with VllmRunner( model_name, task="embed", - # load_format="dummy", enforce_eager=True, ) as vllm_runner: vllm_outputs = vllm_runner.encode(queries) From e8f87f6eb4ea9d6f0bbbf9f0c461f759aad27526 Mon Sep 17 00:00:00 2001 From: Icey <1790571317@qq.com> Date: Mon, 27 Oct 2025 08:56:16 +0000 Subject: [PATCH 07/13] tiny fix Signed-off-by: Icey <1790571317@qq.com> --- vllm_ascend/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index a6a4d6d013..64e2f888d0 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -314,7 +314,7 @@ def 
_rec_find(d): # Update cudagraph capture sizes for vllm config def update_cudagraph_capture_sizes( vllm_config: VllmConfig, - cudagraph_capture_sizes: List[int]) -> List[int]: + cudagraph_capture_sizes: List[int]): valid_max_size = (cudagraph_capture_sizes[-1] if cudagraph_capture_sizes else 0) if (vllm_config.compilation_config.max_cudagraph_capture_size is not None @@ -439,7 +439,7 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None: if vllm_version_is("0.11.0"): compilation_config.init_with_cudagraph_sizes(sampled_sizes) else: - update_cudagraph_capture_sizes(sampled_sizes) + update_cudagraph_capture_sizes(vllm_config, sampled_sizes) logger.info( "Adjusted ACL graph batch sizes for %s model (layers: %d): %d → %d sizes", From 8e828431f8e9d22e2163164dc171eadcf1cbb90b Mon Sep 17 00:00:00 2001 From: Icey <1790571317@qq.com> Date: Mon, 27 Oct 2025 11:03:24 +0000 Subject: [PATCH 08/13] fix eagle Signed-off-by: Icey <1790571317@qq.com> --- vllm_ascend/sample/rejection_sampler.py | 3 ++- vllm_ascend/utils.py | 5 ++--- vllm_ascend/worker/model_runner_v1.py | 10 +++++++++- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/vllm_ascend/sample/rejection_sampler.py b/vllm_ascend/sample/rejection_sampler.py index 52787fa29f..991e07fb8c 100644 --- a/vllm_ascend/sample/rejection_sampler.py +++ b/vllm_ascend/sample/rejection_sampler.py @@ -96,11 +96,12 @@ def forward( sampling_metadata, ) else: - target_probs = apply_sampling_constraints( + target_logits = apply_sampling_constraints( target_logits, metadata.cu_num_draft_tokens, sampling_metadata, ) + target_probs = target_logits.softmax(dim=-1, dtype=torch.float32) output_token_ids = rejection_sample( metadata.draft_token_ids, diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index 64e2f888d0..36960288e6 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -312,9 +312,8 @@ def _rec_find(d): # Update cudagraph capture sizes for vllm config -def update_cudagraph_capture_sizes( - vllm_config: 
VllmConfig, - cudagraph_capture_sizes: List[int]): +def update_cudagraph_capture_sizes(vllm_config: VllmConfig, + cudagraph_capture_sizes: List[int]): valid_max_size = (cudagraph_capture_sizes[-1] if cudagraph_capture_sizes else 0) if (vllm_config.compilation_config.max_cudagraph_capture_size is not None diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 88cfcd5c20..e90ed11e8a 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -387,7 +387,11 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device): self.drafter = get_spec_decode_method( self.speculative_config.method, self.vllm_config, self.device, self) - self.rejection_sampler = AscendRejectionSampler() + if vllm_version_is("0.11.0"): + self.rejection_sampler = AscendRejectionSampler() + else: + self.rejection_sampler = AscendRejectionSampler( + self.sampler) self.actual_seq_lengths_q = list( range(self.decode_token_per_req, self.max_num_tokens + 1, self.decode_token_per_req)) @@ -1882,6 +1886,9 @@ def _calc_spec_decode_metadata( # TODO: Optimize the CPU -> NPU copy. 
cu_num_draft_tokens = torch.from_numpy(cu_num_draft_tokens).to( self.device, non_blocking=True) + + cu_num_sampled_tokens = torch.from_numpy(cu_num_sampled_tokens).to( + self.device, non_blocking=True) logits_indices = torch.from_numpy(logits_indices).to(self.device, non_blocking=True) target_logits_indices = torch.from_numpy(target_logits_indices).to( @@ -1898,6 +1905,7 @@ def _calc_spec_decode_metadata( draft_token_ids=draft_token_ids, num_draft_tokens=num_draft_tokens.tolist(), cu_num_draft_tokens=cu_num_draft_tokens, + cu_num_sampled_tokens=cu_num_sampled_tokens, target_logits_indices=target_logits_indices, bonus_logits_indices=bonus_logits_indices, logits_indices=logits_indices, From c166742de0039a2e481533a9279b2d165cf586f6 Mon Sep 17 00:00:00 2001 From: Icey <1790571317@qq.com> Date: Mon, 27 Oct 2025 12:29:12 +0000 Subject: [PATCH 09/13] fix aclgraph Signed-off-by: Icey <1790571317@qq.com> --- vllm_ascend/utils.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index 36960288e6..3af82cf9c5 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -314,6 +314,7 @@ def _rec_find(d): # Update cudagraph capture sizes for vllm config def update_cudagraph_capture_sizes(vllm_config: VllmConfig, cudagraph_capture_sizes: List[int]): + valid_max_size = (cudagraph_capture_sizes[-1] if cudagraph_capture_sizes else 0) if (vllm_config.compilation_config.max_cudagraph_capture_size is not None @@ -330,17 +331,18 @@ def update_cudagraph_capture_sizes(vllm_config: VllmConfig, valid_max_size, ) - vllm_config.compilation_config.max_cudagraph_capture_size = valid_max_size - if vllm_config.compilation_config.cudagraph_capture_sizes is not None and len( - cudagraph_capture_sizes) < len( - vllm_config.compilation_config.cudagraph_capture_sizes): - logger.warning( - ("cudagraph_capture_sizes specified in compilation_config" - " %s is overridden by config %s"), - 
vllm_config.compilation_config.cudagraph_capture_sizes, - cudagraph_capture_sizes, - ) - vllm_config.compilation_config.cudagraph_capture_sizes = cudagraph_capture_sizes + vllm_config.compilation_config.max_cudagraph_capture_size = valid_max_size + + if vllm_config.compilation_config.cudagraph_capture_sizes is not None and len( + cudagraph_capture_sizes) < len( + vllm_config.compilation_config.cudagraph_capture_sizes): + logger.warning( + ("cudagraph_capture_sizes specified in compilation_config" + " %s is overridden by config %s"), + vllm_config.compilation_config.cudagraph_capture_sizes, + cudagraph_capture_sizes, + ) + vllm_config.compilation_config.cudagraph_capture_sizes = cudagraph_capture_sizes vllm_config.compilation_config.post_init_cudagraph_sizes() From 89db00760a80592581431b786ea1edb2a6ec876c Mon Sep 17 00:00:00 2001 From: Icey <1790571317@qq.com> Date: Mon, 27 Oct 2025 12:32:17 +0000 Subject: [PATCH 10/13] skip test_embedding_aclgraph test Signed-off-by: Icey <1790571317@qq.com> --- .github/workflows/_e2e_test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml index 1638e7f198..c43fe7c035 100644 --- a/.github/workflows/_e2e_test.yaml +++ b/.github/workflows/_e2e_test.yaml @@ -94,7 +94,7 @@ jobs: pytest -sv tests/e2e/singlecard/test_camem.py pytest -sv tests/e2e/singlecard/test_chunked.py pytest -sv tests/e2e/singlecard/test_embedding.py - pytest -sv tests/e2e/singlecard/test_embedding_aclgraph.py + # pytest -sv tests/e2e/singlecard/test_embedding_aclgraph.py pytest -sv tests/e2e/singlecard/test_guided_decoding.py pytest -sv tests/e2e/singlecard/test_ilama_lora.py pytest -sv tests/e2e/singlecard/test_profile_execute_duration.py From 28b130660de9fae3a051debdb36d20119a321be6 Mon Sep 17 00:00:00 2001 From: Icey <1790571317@qq.com> Date: Mon, 27 Oct 2025 13:30:59 +0000 Subject: [PATCH 11/13] tiny fix Signed-off-by: Icey <1790571317@qq.com> --- 
vllm_ascend/models/qwen3_next.py | 2 +- vllm_ascend/worker/model_runner_v1.py | 35 +++++++++++++++++---------- 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/vllm_ascend/models/qwen3_next.py b/vllm_ascend/models/qwen3_next.py index aa1cd9f517..f5b4b8a142 100644 --- a/vllm_ascend/models/qwen3_next.py +++ b/vllm_ascend/models/qwen3_next.py @@ -697,4 +697,4 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.num_physical_experts = example_layer.n_physical_experts self.num_local_physical_experts = example_layer.n_local_physical_experts self.num_routed_experts = example_layer.n_routed_experts - self.num_redundant_experts = example_layer.n_redundant_experts \ No newline at end of file + self.num_redundant_experts = example_layer.n_redundant_experts diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index e90ed11e8a..a8ed6950f6 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -1886,9 +1886,9 @@ def _calc_spec_decode_metadata( # TODO: Optimize the CPU -> NPU copy. 
cu_num_draft_tokens = torch.from_numpy(cu_num_draft_tokens).to( self.device, non_blocking=True) - - cu_num_sampled_tokens = torch.from_numpy(cu_num_sampled_tokens).to( - self.device, non_blocking=True) + if not vllm_version_is("0.11.0"): + cu_num_sampled_tokens = torch.from_numpy(cu_num_sampled_tokens).to( + self.device, non_blocking=True) logits_indices = torch.from_numpy(logits_indices).to(self.device, non_blocking=True) target_logits_indices = torch.from_numpy(target_logits_indices).to( @@ -1900,16 +1900,25 @@ def _calc_spec_decode_metadata( # draft_token_indices: [ 1, 2, 3, 105, 106, 208] draft_token_ids = self.input_ids[logits_indices] draft_token_ids = draft_token_ids[target_logits_indices + 1] - - metadata = SpecDecodeMetadata( - draft_token_ids=draft_token_ids, - num_draft_tokens=num_draft_tokens.tolist(), - cu_num_draft_tokens=cu_num_draft_tokens, - cu_num_sampled_tokens=cu_num_sampled_tokens, - target_logits_indices=target_logits_indices, - bonus_logits_indices=bonus_logits_indices, - logits_indices=logits_indices, - ) + if vllm_version_is("0.11.0"): + metadata = SpecDecodeMetadata( + draft_token_ids=draft_token_ids, + num_draft_tokens=num_draft_tokens.tolist(), + cu_num_draft_tokens=cu_num_draft_tokens, + target_logits_indices=target_logits_indices, + bonus_logits_indices=bonus_logits_indices, + logits_indices=logits_indices, + ) + else: + metadata = SpecDecodeMetadata( + draft_token_ids=draft_token_ids, + num_draft_tokens=num_draft_tokens.tolist(), + cu_num_draft_tokens=cu_num_draft_tokens, + cu_num_sampled_tokens=cu_num_sampled_tokens, + target_logits_indices=target_logits_indices, + bonus_logits_indices=bonus_logits_indices, + logits_indices=logits_indices, + ) return metadata def apply_grammar_bitmask( From 445650b1cd44bb2456eb3b9044214ecfbd19cf7b Mon Sep 17 00:00:00 2001 From: Icey <1790571317@qq.com> Date: Tue, 28 Oct 2025 02:53:55 +0000 Subject: [PATCH 12/13] fix vl Signed-off-by: Icey <1790571317@qq.com> --- vllm_ascend/models/qwen2_5_vl.py | 3 
+++ vllm_ascend/models/qwen2_vl.py | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/vllm_ascend/models/qwen2_5_vl.py b/vllm_ascend/models/qwen2_5_vl.py index b19ed87486..39613e56d2 100644 --- a/vllm_ascend/models/qwen2_5_vl.py +++ b/vllm_ascend/models/qwen2_5_vl.py @@ -40,6 +40,7 @@ Qwen2_5_VLDummyInputsBuilder, Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLMultiModalProcessor, Qwen2_5_VLProcessingInfo) from vllm.model_executor.models.utils import maybe_prefix +from vllm.model_executor.models.vision import conv3d_to_linear_weight from vllm.multimodal import MULTIMODAL_REGISTRY from vllm_ascend.utils import ACL_FORMAT_FRACTAL_ND, is_enable_nz @@ -355,6 +356,8 @@ def load_weights(self, weights: Iterable[Tuple[str, params_dict = dict(self.named_parameters(remove_duplicate=False)) loaded_params: Set[str] = set() for name, loaded_weight in weights: + if name.endswith("patch_embed.proj.weight"): + loaded_weight = conv3d_to_linear_weight(loaded_weight) for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm_ascend/models/qwen2_vl.py b/vllm_ascend/models/qwen2_vl.py index ccd461613b..f24f982364 100644 --- a/vllm_ascend/models/qwen2_vl.py +++ b/vllm_ascend/models/qwen2_vl.py @@ -38,6 +38,7 @@ Qwen2VLForConditionalGeneration, Qwen2VLMultiModalProcessor, Qwen2VLProcessingInfo) from vllm.model_executor.models.utils import maybe_prefix +from vllm.model_executor.models.vision import conv3d_to_linear_weight from vllm.multimodal import MULTIMODAL_REGISTRY from vllm_ascend.utils import ACL_FORMAT_FRACTAL_ND, is_enable_nz @@ -304,6 +305,9 @@ def load_weights(self, weights: Iterable[Tuple[str, loaded_params: Set[str] = set() for name, loaded_weight in weights: + if name.endswith("patch_embed.proj.weight"): + loaded_weight = conv3d_to_linear_weight(loaded_weight) + for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue From 5b09cc06aca4fcfd96201a20aa9cc5fe8a3786dc 
Mon Sep 17 00:00:00 2001 From: Icey <1790571317@qq.com> Date: Tue, 28 Oct 2025 03:32:12 +0000 Subject: [PATCH 13/13] tiny fix Signed-off-by: Icey <1790571317@qq.com> --- vllm_ascend/models/qwen2_5_vl.py | 12 ++++++++---- vllm_ascend/models/qwen2_vl.py | 12 ++++++++---- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/vllm_ascend/models/qwen2_5_vl.py b/vllm_ascend/models/qwen2_5_vl.py index 39613e56d2..9ccfa43f62 100644 --- a/vllm_ascend/models/qwen2_5_vl.py +++ b/vllm_ascend/models/qwen2_5_vl.py @@ -40,10 +40,13 @@ Qwen2_5_VLDummyInputsBuilder, Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLMultiModalProcessor, Qwen2_5_VLProcessingInfo) from vllm.model_executor.models.utils import maybe_prefix -from vllm.model_executor.models.vision import conv3d_to_linear_weight from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm_ascend.utils import ACL_FORMAT_FRACTAL_ND, is_enable_nz +from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, is_enable_nz, + vllm_version_is) + +if not vllm_version_is("0.11.0"): + from vllm.model_executor.models.vision import conv3d_to_linear_weight MIN_PAD_SIZE = 64 # min_size to pad weight MAX_PAD_SIZE = 128 # max_size to pad weight @@ -356,8 +359,9 @@ def load_weights(self, weights: Iterable[Tuple[str, params_dict = dict(self.named_parameters(remove_duplicate=False)) loaded_params: Set[str] = set() for name, loaded_weight in weights: - if name.endswith("patch_embed.proj.weight"): - loaded_weight = conv3d_to_linear_weight(loaded_weight) + if not vllm_version_is("0.11.0"): + if name.endswith("patch_embed.proj.weight"): + loaded_weight = conv3d_to_linear_weight(loaded_weight) for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm_ascend/models/qwen2_vl.py b/vllm_ascend/models/qwen2_vl.py index f24f982364..7b1ce44a21 100644 --- a/vllm_ascend/models/qwen2_vl.py +++ b/vllm_ascend/models/qwen2_vl.py @@ -38,10 +38,13 @@ Qwen2VLForConditionalGeneration, 
Qwen2VLMultiModalProcessor, Qwen2VLProcessingInfo) from vllm.model_executor.models.utils import maybe_prefix -from vllm.model_executor.models.vision import conv3d_to_linear_weight from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm_ascend.utils import ACL_FORMAT_FRACTAL_ND, is_enable_nz +from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, is_enable_nz, + vllm_version_is) + +if not vllm_version_is("0.11.0"): + from vllm.model_executor.models.vision import conv3d_to_linear_weight MIN_PAD_SIZE = 64 # min_size to pad weight MAX_PAD_SIZE = 128 # max_size to pad weight @@ -305,8 +308,9 @@ def load_weights(self, weights: Iterable[Tuple[str, loaded_params: Set[str] = set() for name, loaded_weight in weights: - if name.endswith("patch_embed.proj.weight"): - loaded_weight = conv3d_to_linear_weight(loaded_weight) + if not vllm_version_is("0.11.0"): + if name.endswith("patch_embed.proj.weight"): + loaded_weight = conv3d_to_linear_weight(loaded_weight) for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: