2 changes: 1 addition & 1 deletion .github/workflows/_e2e_test.yaml
@@ -94,7 +94,7 @@ jobs:
pytest -sv tests/e2e/singlecard/test_camem.py
pytest -sv tests/e2e/singlecard/test_chunked.py
pytest -sv tests/e2e/singlecard/test_embedding.py
-pytest -sv tests/e2e/singlecard/test_embedding_aclgraph.py
+# pytest -sv tests/e2e/singlecard/test_embedding_aclgraph.py
pytest -sv tests/e2e/singlecard/test_guided_decoding.py
pytest -sv tests/e2e/singlecard/test_ilama_lora.py
pytest -sv tests/e2e/singlecard/test_profile_execute_duration.py
2 changes: 1 addition & 1 deletion .github/workflows/format_pr_body.yaml
@@ -36,7 +36,7 @@ jobs:

- name: Get vLLM version
run: |
-VLLM_COMMIT=c9461e05a4ed3557cfbf4b15ded1e26761cc39ca
+VLLM_COMMIT=releases/v0.11.1
echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV

- name: Checkout repository
6 changes: 3 additions & 3 deletions .github/workflows/vllm_ascend_test.yaml
@@ -42,7 +42,7 @@ jobs:
lint:
uses: ./.github/workflows/pre-commit.yml
with:
-vllm: c9461e05a4ed3557cfbf4b15ded1e26761cc39ca
+vllm: releases/v0.11.1

changes:
runs-on: ubuntu-latest
@@ -83,7 +83,7 @@ jobs:
VLLM_USE_MODELSCOPE: True
strategy:
matrix:
-vllm_version: [c9461e05a4ed3557cfbf4b15ded1e26761cc39ca, v0.11.0]
+vllm_version: [releases/v0.11.1, v0.11.0]
steps:
- name: Install packages
run: |
@@ -140,7 +140,7 @@ jobs:
name: e2e-light
strategy:
matrix:
-vllm_version: [c9461e05a4ed3557cfbf4b15ded1e26761cc39ca, v0.11.0]
+vllm_version: [releases/v0.11.1, v0.11.0]
# Note (yikun): If CI resource are limited we can split job into two chain jobs
needs: [lint, changes]
# only trigger e2e test after lint passed and the change is e2e related with pull request.
2 changes: 1 addition & 1 deletion .github/workflows/vllm_ascend_test_full.yaml
@@ -69,7 +69,7 @@ jobs:
name: e2e-full
strategy:
matrix:
-vllm_version: [c9461e05a4ed3557cfbf4b15ded1e26761cc39ca, v0.11.0]
+vllm_version: [releases/v0.11.1, v0.11.0]
needs: [changes]
if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
uses: ./.github/workflows/_e2e_test.yaml
8 changes: 7 additions & 1 deletion vllm_ascend/kv_offload/cpu_npu.py
@@ -2,11 +2,17 @@
import torch
from vllm.attention import AttentionBackend
from vllm.logger import init_logger
-from vllm.utils import is_pin_memory_available
from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
from vllm.v1.kv_offload.worker.worker import (OffloadingHandler,
TransferResult, TransferSpec)

+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import is_pin_memory_available
+else:
+    from vllm.utils.platform_utils import is_pin_memory_available
+
logger = init_logger(__name__)


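Note: the version-gated import above is the compatibility-shim pattern this PR applies throughout the plugin (the same shape appears again in eagle_proposer.py further down): probe the installed vLLM version once at import time, then pull each relocated symbol from its old or new location. A minimal sketch of the pattern follows; the vllm_version_is body shown here is an illustration built on importlib.metadata, not the actual helper in vllm_ascend/utils.py.

# Sketch of a version-gated import shim (illustrative; vllm_ascend ships
# its own vllm_version_is helper with different internals).
from importlib.metadata import PackageNotFoundError, version


def vllm_version_is(target: str) -> bool:
    # True when the installed vLLM release exactly matches `target`.
    try:
        return version("vllm") == target
    except PackageNotFoundError:
        return False


if vllm_version_is("0.11.0"):
    # Pre-0.11.1 layout: the helper is exported from the top-level utils module.
    from vllm.utils import is_pin_memory_available
else:
    # Newer layout: the helper moved into vllm.utils.platform_utils.
    from vllm.utils.platform_utils import is_pin_memory_available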
9 changes: 8 additions & 1 deletion vllm_ascend/models/qwen2_5_vl.py
@@ -42,7 +42,11 @@
from vllm.model_executor.models.utils import maybe_prefix
from vllm.multimodal import MULTIMODAL_REGISTRY

-from vllm_ascend.utils import ACL_FORMAT_FRACTAL_ND, is_enable_nz
+from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, is_enable_nz,
+                               vllm_version_is)
+
+if not vllm_version_is("0.11.0"):
+    from vllm.model_executor.models.vision import conv3d_to_linear_weight

MIN_PAD_SIZE = 64 # min_size to pad weight
MAX_PAD_SIZE = 128 # max_size to pad weight
@@ -355,6 +359,9 @@ def load_weights(self, weights: Iterable[Tuple[str,
params_dict = dict(self.named_parameters(remove_duplicate=False))
loaded_params: Set[str] = set()
for name, loaded_weight in weights:
+if not vllm_version_is("0.11.0"):
+    if name.endswith("patch_embed.proj.weight"):
+        loaded_weight = conv3d_to_linear_weight(loaded_weight)
for (param_name, weight_name, shard_id) in stacked_params_mapping:
if weight_name not in name:
continue
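For context on the new branch above (and the identical change in qwen2_vl.py below): newer vLLM loads the vision patch embedding as a linear layer, so a checkpoint's Conv3d patch_embed weight has to be flattened before it can be copied in load_weights. The sketch below shows what that conversion amounts to; the function name is suffixed _sketch because the body and the tensor sizes are illustrative assumptions, not the upstream conv3d_to_linear_weight implementation.

import torch


def conv3d_to_linear_weight_sketch(conv_weight: torch.Tensor) -> torch.Tensor:
    # Conv3d patch-embed weight: [out_channels, in_channels, t, h, w].
    # Linear patch-embed weight: [out_channels, in_channels * t * h * w].
    return conv_weight.reshape(conv_weight.shape[0], -1)


# Hypothetical Qwen2.5-VL-style patch embedding sizes.
w = torch.randn(1280, 3, 2, 14, 14)
assert conv3d_to_linear_weight_sketch(w).shape == (1280, 3 * 2 * 14 * 14)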
10 changes: 9 additions & 1 deletion vllm_ascend/models/qwen2_vl.py
@@ -40,7 +40,11 @@
from vllm.model_executor.models.utils import maybe_prefix
from vllm.multimodal import MULTIMODAL_REGISTRY

-from vllm_ascend.utils import ACL_FORMAT_FRACTAL_ND, is_enable_nz
+from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, is_enable_nz,
+                               vllm_version_is)
+
+if not vllm_version_is("0.11.0"):
+    from vllm.model_executor.models.vision import conv3d_to_linear_weight

MIN_PAD_SIZE = 64 # min_size to pad weight
MAX_PAD_SIZE = 128 # max_size to pad weight
@@ -304,6 +308,10 @@ def load_weights(self, weights: Iterable[Tuple[str,
loaded_params: Set[str] = set()

for name, loaded_weight in weights:
+if not vllm_version_is("0.11.0"):
+    if name.endswith("patch_embed.proj.weight"):
+        loaded_weight = conv3d_to_linear_weight(loaded_weight)
+
for (param_name, weight_name, shard_id) in stacked_params_mapping:
if weight_name not in name:
continue
70 changes: 49 additions & 21 deletions vllm_ascend/platform.py
@@ -33,7 +33,8 @@
delete_torchair_cache_file)
from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, enable_sp, is_310p,
prefill_context_parallel_enable,
-                               update_aclgraph_sizes, vllm_version_is)
+                               update_aclgraph_sizes,
+                               update_cudagraph_capture_sizes, vllm_version_is)

if TYPE_CHECKING:
from vllm.config import ModelConfig, VllmConfig
@@ -142,24 +143,47 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
"Non-MLA LLMs forcibly disable the chunked prefill feature,"
"as the performance of operators supporting this feature "
"functionality is currently suboptimal.")
-if not model_config.is_multimodal_model and \
-        structured_outputs_config.backend == "auto" and \
-        not getattr(scheduler_config, "scheduler_delay_factor", 0) > 0 and \
-        scheduler_config.policy == "fcfs":
-    ascend_scheduler_config.enabled = True
-    chunked_prefill_enabled_in_ascend_scheduler = getattr(
-        ascend_scheduler_config, "enable_chunked_prefill", False)
-    if chunked_prefill_enabled_in_ascend_scheduler:
-        logger.warning(
-            "Chunked prefill feature is enabled in ascend_scheduler,"
-            "but note that the operator supporting this feature "
-            "would lead to performance degradation.")
-    # In this situation, max_num_batched_tokens would have been rewritten.
-    # So we must make sure max_num_batched_tokens is not smaller than max_model_len.
-    if (scheduler_config.max_num_batched_tokens
-            < scheduler_config.max_model_len
-            and not chunked_prefill_enabled_in_ascend_scheduler):
-        scheduler_config.max_num_batched_tokens = scheduler_config.max_model_len
+if vllm_version_is("0.11.0"):
+    if not model_config.is_multimodal_model and \
+            structured_outputs_config.backend == "auto" and \
+            not scheduler_config.send_delta_data and \
[Inline review comment from a collaborator on the line above: getattr(scheduler_config, "send_delta_data", False)]
+            not getattr(scheduler_config, "scheduler_delay_factor", 0) > 0 and \
+            scheduler_config.policy == "fcfs":
+        ascend_scheduler_config.enabled = True
+        chunked_prefill_enabled_in_ascend_scheduler = getattr(
+            ascend_scheduler_config, "enable_chunked_prefill",
+            False)
+        if chunked_prefill_enabled_in_ascend_scheduler:
+            logger.warning(
+                "Chunked prefill feature is enabled in ascend_scheduler,"
+                "but note that the operator supporting this feature "
+                "would lead to performance degradation.")
+        # In this situation, max_num_batched_tokens would have been rewritten.
+        # So we must make sure max_num_batched_tokens is not smaller than max_model_len.
+        if (scheduler_config.max_num_batched_tokens
+                < scheduler_config.max_model_len and
+                not chunked_prefill_enabled_in_ascend_scheduler):
+            scheduler_config.max_num_batched_tokens = scheduler_config.max_model_len
+else:
+    if not model_config.is_multimodal_model and \
+            structured_outputs_config.backend == "auto" and \
+            not getattr(scheduler_config, "scheduler_delay_factor", 0) > 0 and \
+            scheduler_config.policy == "fcfs":
+        ascend_scheduler_config.enabled = True
+        chunked_prefill_enabled_in_ascend_scheduler = getattr(
+            ascend_scheduler_config, "enable_chunked_prefill",
+            False)
+        if chunked_prefill_enabled_in_ascend_scheduler:
+            logger.warning(
+                "Chunked prefill feature is enabled in ascend_scheduler,"
+                "but note that the operator supporting this feature "
+                "would lead to performance degradation.")
+        # In this situation, max_num_batched_tokens would have been rewritten.
+        # So we must make sure max_num_batched_tokens is not smaller than max_model_len.
+        if (scheduler_config.max_num_batched_tokens
+                < scheduler_config.max_model_len and
+                not chunked_prefill_enabled_in_ascend_scheduler):
+            scheduler_config.max_num_batched_tokens = scheduler_config.max_model_len

kv_cache_dtype = vllm_config.additional_config.get(
"kv_cache_dtype", None)
@@ -237,8 +261,12 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
f"{vllm_config.parallel_config.tensor_parallel_size}")
if len(sp_aclgraph_sizes) != len(original_sizes):
compilation_config.cudagraph_capture_sizes = sp_aclgraph_sizes
-vllm_config.compilation_config.init_with_cudagraph_sizes(
-    sp_aclgraph_sizes)
+if vllm_version_is("0.11.0"):
+    compilation_config.init_with_cudagraph_sizes(
+        sp_aclgraph_sizes)
+else:
+    update_cudagraph_capture_sizes(vllm_config,
+                                   sp_aclgraph_sizes)

# TODO: Full graph is fully supported later, and the default value will be set to full graph.
if compilation_config.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE:
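On the inline review comment kept in the scheduler block above: reading send_delta_data defensively with getattr would let one branch serve both vLLM versions, presumably because the attribute was dropped from newer SchedulerConfig (which is why the else branch omits the check). A sketch of that variant follows; the helper name should_enable_ascend_scheduler is hypothetical and this is not what the PR merged.

# Sketch of the reviewer's suggestion: a single version-agnostic condition,
# with send_delta_data read defensively (illustrative, not the merged code).
def should_enable_ascend_scheduler(model_config, scheduler_config,
                                   structured_outputs_config) -> bool:
    return (not model_config.is_multimodal_model
            and structured_outputs_config.backend == "auto"
            and not getattr(scheduler_config, "send_delta_data", False)
            and not getattr(scheduler_config, "scheduler_delay_factor", 0) > 0
            and scheduler_config.policy == "fcfs")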
27 changes: 21 additions & 6 deletions vllm_ascend/sample/rejection_sampler.py
@@ -5,10 +5,17 @@
import torch.nn as nn
import vllm.v1.sample.rejection_sampler as rs
from vllm.v1.sample.metadata import SamplingMetadata
-from vllm.v1.sample.rejection_sampler import (RejectionSampler, compute_probs,
+from vllm.v1.sample.rejection_sampler import (RejectionSampler,
generate_uniform_probs)
from vllm.v1.spec_decode.metadata import SpecDecodeMetadata

+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.v1.sample.rejection_sampler import compute_probs
+else:
+    from vllm.v1.sample.rejection_sampler import apply_sampling_constraints
+
PLACEHOLDER_TOKEN_ID = -1
GREEDY_TEMPERATURE = -1
# Maximum number of speculative draft tokens allowed per request in a single
@@ -82,11 +89,19 @@ def forward(
# [num_tokens, vocab_size]
# NOTE(woosuk): `target_logits` can be updated in place inside the
# `compute_probs` function.
-target_probs = compute_probs(
-    target_logits,
-    metadata.cu_num_draft_tokens,
-    sampling_metadata,
-)
+if vllm_version_is("0.11.0"):
+    target_probs = compute_probs(
+        target_logits,
+        metadata.cu_num_draft_tokens,
+        sampling_metadata,
+    )
+else:
+    target_logits = apply_sampling_constraints(
+        target_logits,
+        metadata.cu_num_draft_tokens,
+        sampling_metadata,
+    )
+    target_probs = target_logits.softmax(dim=-1, dtype=torch.float32)

output_token_ids = rejection_sample(
metadata.draft_token_ids,
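The behavioural difference in the new branch above: apply_sampling_constraints hands back adjusted logits rather than probabilities, so the sampler now applies the softmax itself before rejection sampling. A tiny standalone sketch of that final step, with made-up shapes:

import torch

# [num_draft_tokens, vocab_size]; the shapes are chosen only for illustration.
target_logits = torch.randn(6, 32000)
target_probs = target_logits.softmax(dim=-1, dtype=torch.float32)

# Each row is now a proper distribution over the vocabulary.
assert torch.allclose(target_probs.sum(dim=-1), torch.ones(6))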
3 changes: 2 additions & 1 deletion vllm_ascend/spec_decode/eagle_proposer.py
@@ -12,7 +12,6 @@
from vllm.model_executor.model_loader import get_model
from vllm.model_executor.models import supports_multimodal
from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM
-from vllm.utils import is_pin_memory_available
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.sample.metadata import SamplingMetadata
from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
@@ -27,8 +26,10 @@

if vllm_version_is("0.11.0"):
from vllm.config import CompilationLevel
+from vllm.utils import is_pin_memory_available
else:
from vllm.config import CompilationMode
+from vllm.utils.platform_utils import is_pin_memory_available

PADDING_SLOT_ID = -1

45 changes: 43 additions & 2 deletions vllm_ascend/utils.py
@@ -311,6 +311,41 @@ def _rec_find(d):
return max(layer_counts)


+# Update cudagraph capture sizes for vllm config
[Inline review comment from a collaborator on the line above: "this is maybe not correct. I'll look more"]
+def update_cudagraph_capture_sizes(vllm_config: VllmConfig,
+                                   cudagraph_capture_sizes: List[int]):
+
+    valid_max_size = (cudagraph_capture_sizes[-1]
+                      if cudagraph_capture_sizes else 0)
+    if (vllm_config.compilation_config.max_cudagraph_capture_size is not None
+            and vllm_config.compilation_config.max_cudagraph_capture_size
+            != valid_max_size):
+        if vllm_config.compilation_config.cudagraph_capture_sizes is not None:
+            raise ValueError(
+                "customized max_cudagraph_capture_size"
+                f"(={vllm_config.compilation_config.max_cudagraph_capture_size}) "
+                "should be consistent with the max value of "
+                f"cudagraph_capture_sizes(={valid_max_size})")
+        logger.warning(
+            "Truncating max_cudagraph_capture_size to %d",
+            valid_max_size,
+        )
+
+    vllm_config.compilation_config.max_cudagraph_capture_size = valid_max_size
+
+    if vllm_config.compilation_config.cudagraph_capture_sizes is not None and len(
+            cudagraph_capture_sizes) < len(
+                vllm_config.compilation_config.cudagraph_capture_sizes):
+        logger.warning(
+            ("cudagraph_capture_sizes specified in compilation_config"
+             " %s is overridden by config %s"),
+            vllm_config.compilation_config.cudagraph_capture_sizes,
+            cudagraph_capture_sizes,
+        )
+    vllm_config.compilation_config.cudagraph_capture_sizes = cudagraph_capture_sizes
+    vllm_config.compilation_config.post_init_cudagraph_sizes()


def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
"""Update ACL graph capture sizes based on hardware limitations"""
# NOTE: Currently, we can only capture 1800 graphs at most,
@@ -402,7 +437,10 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
indices[0], indices[-1] = 0, len(original_sizes) - 1

sampled_sizes = [original_sizes[i] for i in indices]
-compilation_config.init_with_cudagraph_sizes(sampled_sizes)
+if vllm_version_is("0.11.0"):
+    compilation_config.init_with_cudagraph_sizes(sampled_sizes)
+else:
+    update_cudagraph_capture_sizes(vllm_config, sampled_sizes)

logger.info(
"Adjusted ACL graph batch sizes for %s model (layers: %d): %d → %d sizes",
@@ -433,7 +471,10 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
if original_sizes[0] < (num_speculative_tokens + 1) * max_num_seqs:
enlarged_sizes = [(num_speculative_tokens + 1) * size
for size in original_sizes]
-compilation_config.init_with_cudagraph_sizes(enlarged_sizes)
+if vllm_version_is("0.11.0"):
+    compilation_config.init_with_cudagraph_sizes(enlarged_sizes)
+else:
+    update_cudagraph_capture_sizes(vllm_config, enlarged_sizes)
logger.info(
"Adjusted ACL graphs: %s → %s for speculative decoding",
original_sizes, enlarged_sizes)
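To make the reconciliation rules in update_cudagraph_capture_sizes above concrete, here is a stripped-down, pure-Python restatement of its two checks with a worked example. The names below are mine; the real helper mutates vllm_config.compilation_config in place, logs warnings instead of silently returning, and finishes with post_init_cudagraph_sizes().

from typing import List, Optional


def reconcile_capture_sizes(new_sizes: List[int],
                            user_sizes: Optional[List[int]],
                            user_max: Optional[int]):
    # Rule 1: the largest valid entry wins. An explicit user max that disagrees
    # with explicitly user-provided sizes is an error; without explicit sizes
    # the max is simply truncated (the real code logs a warning).
    valid_max = new_sizes[-1] if new_sizes else 0  # sizes assumed sorted ascending
    if user_max is not None and user_max != valid_max and user_sizes is not None:
        raise ValueError(f"max_cudagraph_capture_size(={user_max}) should match "
                         f"the max of cudagraph_capture_sizes(={valid_max})")
    # Rule 2: the adjusted size list replaces any user-provided list
    # (the real code warns when the new list is shorter than the user's).
    return new_sizes, valid_max


# Worked example: the ACL-graph pass trimmed capture sizes to [1, 2, 4, 8];
# a stale max of 16 with no explicit size list is truncated to 8.
assert reconcile_capture_sizes([1, 2, 4, 8], None, 16) == ([1, 2, 4, 8], 8)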