2 changes: 1 addition & 1 deletion benchmarks/kernels/benchmark_rmsnorm.py
@@ -139,7 +139,7 @@ def calculate_diff(batch_size, seq_len, hidden_size, use_residual=True):

print(f"Naive output={output_naive}")
print(f"FlashInfer output={output_flashinfer}")
print(f"VLLM output={output_vllm}")
print(f"vLLM output={output_vllm}")

if torch.allclose(output_naive, output_flashinfer, atol=1e-2,
rtol=1e-2) and torch.allclose(
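The hunk above compares the naive, FlashInfer, and vLLM RMSNorm outputs with `torch.allclose`. As a minimal, self-contained sketch of that comparison pattern (variable names here are illustrative, not the benchmark's own):

```python
import torch

def outputs_match(reference: torch.Tensor, candidate: torch.Tensor) -> bool:
    # Kernel benchmarks tolerate small numerical differences; the
    # tolerances match the calculate_diff hunk above.
    return torch.allclose(reference, candidate, atol=1e-2, rtol=1e-2)

ref = torch.randn(4, 4096, dtype=torch.float16)
cand = ref + 1e-4 * torch.randn_like(ref)  # stand-in for another kernel's output
print("match" if outputs_match(ref, cand) else "mismatch")
```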
2 changes: 1 addition & 1 deletion docs/source/contributing/vulnerability_management.md
@@ -37,7 +37,7 @@ you may contact the following individuals:

## Slack Discussion

You may use the `#security` channel in the [VLLM Slack](https://slack.vllm.ai)
You may use the `#security` channel in the [vLLM Slack](https://slack.vllm.ai)
to discuss security-related topics. However, please do not disclose any
vulnerabilities in this channel. If you need to report a vulnerability, please
use the GitHub security advisory system or contact a VMT member privately.
2 changes: 1 addition & 1 deletion docs/source/design/v1/metrics.md
@@ -509,7 +509,7 @@ cache to complete other requests), we swap kv cache blocks out to CPU
memory. This is also known as "KV cache offloading" and is configured
with `--swap-space` and `--preemption-mode`.

In v0, [VLLM has long supported beam
In v0, [vLLM has long supported beam
search](gh-issue:6226). The
SequenceGroup encapsulated the idea of N Sequences which
all shared the same prompt kv blocks. This enabled KV cache block
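The context above notes that KV cache offloading is configured with `--swap-space` and `--preemption-mode`. A hedged sketch of the equivalent offline-API call, assuming the v0 engine arguments accept these names as keyword arguments (requires a GPU and a downloadable model):

```python
from vllm import LLM, SamplingParams

# Assumption: swap_space (GiB of CPU memory per GPU) and preemption_mode
# are accepted as keyword arguments by the v0 LLM/EngineArgs entrypoint.
llm = LLM(
    model="facebook/opt-125m",
    swap_space=4,
    preemption_mode="swap",  # swap KV blocks to CPU instead of recomputing
)
print(llm.generate(["Hello"], SamplingParams(max_tokens=8)))
```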
@@ -5,7 +5,7 @@
We will launch 2 vllm instances (GPU 0 for prefill and GPU 1 for decode),
and launch an additional LMCache server.
KV cache is transferred in the following manner:
VLLM prefill node -> LMCache server -> VLLM decode node.
vLLM prefill node -> LMCache server -> vLLM decode node.

Note that `pip install lmcache` is needed to run this example.
Learn more about LMCache in https://github.com/LMCache/LMCache.
2 changes: 1 addition & 1 deletion tests/tpu/test_quantization_accuracy.py
@@ -25,7 +25,7 @@ def get_model_args(self) -> str:
GSM8KAccuracyTestConfig(
model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
excepted_value=0.76), # no bias
# NOTE(rob): We cannot re-initialize VLLM in the same process for TPU,
# NOTE(rob): We cannot re-initialize vLLM in the same process for TPU,
# so only one of these tests can run in a single call to pytest. As
# a follow up, move this into the LM-EVAL section of the CI.
# GSM8KAccuracyTestConfig(
2 changes: 1 addition & 1 deletion vllm/attention/selector.py
@@ -51,7 +51,7 @@ def get_env_variable_attn_backend() -> Optional[_Backend]:
# (default behavior if this variable is None)
#
# THIS SELECTION TAKES PRECEDENCE OVER THE
# VLLM ATTENTION BACKEND ENVIRONMENT VARIABLE
# VLLM_ATTENTION_BACKEND ENVIRONMENT VARIABLE
forced_attn_backend: Optional[_Backend] = None


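A small sketch of the precedence the comment above describes: a programmatically forced backend wins over the `VLLM_ATTENTION_BACKEND` environment variable. Everything except the env-var name is illustrative, not vLLM's actual selector code:

```python
import os
from typing import Optional

# Set programmatically; takes precedence over the environment variable.
forced_attn_backend: Optional[str] = None

def resolve_backend() -> Optional[str]:
    if forced_attn_backend is not None:
        return forced_attn_backend
    # None here means "fall back to the default selection behavior".
    return os.environ.get("VLLM_ATTENTION_BACKEND")

print(resolve_backend())
```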
2 changes: 1 addition & 1 deletion vllm/compilation/backends.py
@@ -278,7 +278,7 @@ def call_module(self, target: torch.fx.node.Target,


class VllmBackend:
"""The compilation backend for `torch.compile` with VLLM.
"""The compilation backend for `torch.compile` with vLLM.
It is used for compilation level of `CompilationLevel.PIECEWISE`,
where we customize the compilation.

2 changes: 1 addition & 1 deletion vllm/compilation/compiler_interface.py
@@ -31,7 +31,7 @@ def initialize_cache(self, cache_dir: str, disable_cache: bool = False):

def compute_hash(self, vllm_config: VllmConfig) -> str:
"""
Gather all the relevant information from the VLLM config,
Gather all the relevant information from the vLLM config,
to compute a hash so that we can cache the compiled model.

See :meth:`VllmConfig.compute_hash` to check what information
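`compute_hash` above gathers the relevant parts of the vLLM config and hashes them so compiled artifacts can be cached. An illustrative sketch of that idea with made-up fields and a plain SHA-256 over a canonical serialization (not vLLM's actual scheme):

```python
import hashlib
import json

def compute_config_hash(config: dict) -> str:
    # Serialize only fields that affect compiled-code validity, in a
    # deterministic order, then hash the result.
    canonical = json.dumps(config, sort_keys=True)
    return hashlib.sha256(canonical.encode()).hexdigest()[:16]

print(compute_config_hash({"model": "example", "dtype": "float16"}))
```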
8 changes: 4 additions & 4 deletions vllm/config.py
@@ -3572,11 +3572,11 @@ def __str__(self):
@contextmanager
def set_current_vllm_config(vllm_config: VllmConfig, check_compile=False):
"""
Temporarily set the current VLLM config.
Temporarily set the current vLLM config.
Used during model initialization.
We save the current VLLM config in a global variable,
We save the current vLLM config in a global variable,
so that all modules can access it, e.g. custom ops
can access the VLLM config to determine how to dispatch.
can access the vLLM config to determine how to dispatch.
"""
global _current_vllm_config
old_vllm_config = _current_vllm_config
@@ -3611,7 +3611,7 @@ def get_current_vllm_config() -> VllmConfig:
# in ci, usually when we test custom ops/modules directly,
# we don't set the vllm config. In that case, we set a default
# config.
logger.warning("Current VLLM config is not set.")
logger.warning("Current vLLM config is not set.")
from vllm.config import VllmConfig
return VllmConfig()
return _current_vllm_config
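The docstring above describes saving the current config in a global so any module can read it, then restoring the previous value afterwards. A simplified, self-contained illustration of that save/restore context-manager pattern, with stand-in names rather than vLLM's real globals:

```python
from contextlib import contextmanager
from dataclasses import dataclass
from typing import Optional

@dataclass
class DummyConfig:           # stand-in for VllmConfig
    compilation_level: int = 0

_CURRENT: Optional[DummyConfig] = None   # stand-in for the module-level global

@contextmanager
def set_current_config(config: DummyConfig):
    global _CURRENT
    old = _CURRENT           # remember whatever was active before
    _CURRENT = config        # make the new config globally visible
    try:
        yield
    finally:
        _CURRENT = old       # always restore, even if initialization raises

with set_current_config(DummyConfig(compilation_level=3)):
    assert _CURRENT is not None and _CURRENT.compilation_level == 3
```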
2 changes: 1 addition & 1 deletion vllm/entrypoints/openai/protocol.py
@@ -237,7 +237,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
tool_choice: Optional[Union[Literal["none"], Literal["auto"],
ChatCompletionNamedToolChoiceParam]] = "none"

# NOTE this will be ignored by VLLM -- the model determines the behavior
# NOTE this will be ignored by vLLM -- the model determines the behavior
parallel_tool_calls: Optional[bool] = False
user: Optional[str] = None

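For reference, a hedged example of how the `tool_choice` and `parallel_tool_calls` fields above might appear in an OpenAI-compatible chat request body; the model name and tool definition are placeholders:

```python
# Plain dict mirroring the request schema; send it to a vLLM OpenAI-compatible
# server with any HTTP client.
request_body = {
    "model": "example-model",  # placeholder model name
    "messages": [{"role": "user", "content": "What's the weather in Paris?"}],
    "tools": [{
        "type": "function",
        "function": {
            "name": "get_weather",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
            },
        },
    }],
    "tool_choice": "auto",         # "none", "auto", or a named-tool object
    "parallel_tool_calls": False,  # per the NOTE above, ignored by vLLM
}
print(request_body["tool_choice"])
```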
6 changes: 3 additions & 3 deletions vllm/envs.py
@@ -164,7 +164,7 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]:
"VERBOSE":
lambda: bool(int(os.getenv('VERBOSE', '0'))),

# Root directory for VLLM configuration files
# Root directory for vLLM configuration files
# Defaults to `~/.config/vllm` unless `XDG_CONFIG_HOME` is set
# Note that this not only affects how vllm finds its configuration files
# during runtime, but also affects how vllm installs its configuration
@@ -178,7 +178,7 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]:

# ================== Runtime Env Vars ==================

# Root directory for VLLM cache files
# Root directory for vLLM cache files
# Defaults to `~/.cache/vllm` unless `XDG_CACHE_HOME` is set
"VLLM_CACHE_ROOT":
lambda: os.path.expanduser(
@@ -260,7 +260,7 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]:
"VLLM_ENGINE_ITERATION_TIMEOUT_S":
lambda: int(os.environ.get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60")),

# API key for VLLM API server
# API key for vLLM API server
"VLLM_API_KEY":
lambda: os.environ.get("VLLM_API_KEY", None),

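The envs.py hunks above use one zero-argument lambda per variable so values are read from the environment at access time rather than at import time. A minimal sketch of that table pattern, with illustrative variable names:

```python
import os
from typing import Any, Callable, Dict

# One zero-argument lambda per variable: the environment is read lazily,
# at lookup time, not when the module is imported.
_ENV_VARS: Dict[str, Callable[[], Any]] = {
    "EXAMPLE_CACHE_ROOT":
    lambda: os.path.expanduser(
        os.getenv("EXAMPLE_CACHE_ROOT", "~/.cache/example")),
    "EXAMPLE_TIMEOUT_S":
    lambda: int(os.environ.get("EXAMPLE_TIMEOUT_S", "60")),
}

def get_env(name: str) -> Any:
    return _ENV_VARS[name]()

print(get_env("EXAMPLE_TIMEOUT_S"))
```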
2 changes: 1 addition & 1 deletion vllm/model_executor/models/phi4mm.py
@@ -1414,7 +1414,7 @@ def cat_with_pad(tensors, dim, padding_value=0):
@INPUT_REGISTRY.register_input_processor(input_processor_for_phi4mm)
class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
"""
Implements the Phi-4-multimodal-instruct model in VLLM.
Implements the Phi-4-multimodal-instruct model in vLLM.
"""
packed_modules_mapping = {
"qkv_proj": [
2 changes: 1 addition & 1 deletion vllm/platforms/cuda.py
@@ -119,7 +119,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
if envs.VLLM_USE_V1:
raise NotImplementedError(
"Multi-step scheduling is not supported (and not "
"needed) on VLLM V1. Please launch without "
"needed) on vLLM V1. Please launch without "
"--num-scheduler-steps.")
else:
parallel_config.worker_cls = \
4 changes: 2 additions & 2 deletions vllm/platforms/rocm.py
@@ -173,15 +173,15 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
if envs.VLLM_USE_V1:
raise NotImplementedError(
"Multi-step scheduling is not supported (and not "
"needed) on VLLM V1. Please launch without "
"needed) on vLLM V1. Please launch without "
"--num-scheduler-steps.")
else:
parallel_config.worker_cls = \
"vllm.worker.multi_step_worker.MultiStepWorker"
elif vllm_config.speculative_config:
if envs.VLLM_USE_V1:
raise NotImplementedError(
"Speculative decoding is not yet supported on VLLM V1."
"Speculative decoding is not yet supported on vLLM V1."
)
else:
parallel_config.worker_cls = \
2 changes: 1 addition & 1 deletion vllm/transformers_utils/tokenizers/mistral.py
@@ -249,7 +249,7 @@ def _download_mistral_tokenizer_from_hf(tokenizer_name: str,
revision=revision)
return tokenizer_file

# the following attributes are set to fit VLLM's design and are used
# the following attributes are set to fit vLLM's design and are used
# by the guided structured output backends.
@property
def all_special_tokens_extended(self) -> List[str]:
2 changes: 1 addition & 1 deletion vllm/v1/engine/core_client.py
@@ -255,7 +255,7 @@ def __init__(
# TODO(rob): rather than killing the main process, we should
# figure out how to raise an AsyncEngineDeadError and
# handle at the API server level so we can return a better
# error code to the clients calling VLLM.
# error code to the clients calling vLLM.
def sigusr1_handler(signum, frame):
logger.fatal("Got fatal signal from worker processes, shutting "
"down. See stack trace above for root cause issue.")
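The snippet above installs a SIGUSR1 handler that logs a fatal message and shuts the process down when a worker dies. A hedged, POSIX-only sketch of that pattern; the shutdown callback is a placeholder, not vLLM's actual teardown:

```python
import logging
import signal
import sys

logger = logging.getLogger("engine_client")

def _shutdown() -> None:
    # Placeholder for the client's real cleanup before exiting.
    sys.exit(1)

def sigusr1_handler(signum, frame):
    logger.fatal("Got fatal signal from worker processes, shutting down. "
                 "See stack trace above for root cause issue.")
    _shutdown()

signal.signal(signal.SIGUSR1, sigusr1_handler)
```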
2 changes: 1 addition & 1 deletion vllm/v1/engine/output_processor.py
@@ -248,7 +248,7 @@ def process_outputs(

****************** NOTE FOR DEVELOPERS ******************

VLLM V1 minimizes the number of python loops over the full
vLLM V1 minimizes the number of python loops over the full
batch to ensure system overheads are minimized. This is the
only function that should loop over EngineCoreOutputs.

4 changes: 2 additions & 2 deletions vllm/v1/engine/processor.py
@@ -93,10 +93,10 @@ def _validate_supported_sampling_params(
) -> None:
# Best of not yet supported.
if params.best_of is not None and params.best_of > 1:
raise ValueError("VLLM V1 does not yet support best_of.")
raise ValueError("vLLM V1 does not yet support best_of.")
# Logits processors not supported.
if params.logits_processors:
raise ValueError("VLLM V1 does not support per request "
raise ValueError("vLLM V1 does not support per request "
"user provided logits processors.")

def _validate_params(