2 changes: 1 addition & 1 deletion benchmarks/kernels/benchmark_rmsnorm.py
@@ -139,7 +139,7 @@ def calculate_diff(batch_size, seq_len, hidden_size, use_residual=True):

print(f"Naive output={output_naive}")
print(f"FlashInfer output={output_flashinfer}")
print(f"VLLM output={output_vllm}")
print(f"vLLM output={output_vllm}")

if torch.allclose(output_naive, output_flashinfer, atol=1e-2,
rtol=1e-2) and torch.allclose(
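The hunk above compares the naive, FlashInfer, and vLLM RMSNorm outputs with `torch.allclose`. As a minimal, self-contained sketch of that comparison pattern (variable names here are illustrative, not the benchmark's own):

```python
import torch

def outputs_match(reference: torch.Tensor, candidate: torch.Tensor) -> bool:
    # Kernel benchmarks tolerate small numerical differences; the
    # tolerances match the calculate_diff hunk above.
    return torch.allclose(reference, candidate, atol=1e-2, rtol=1e-2)

ref = torch.randn(4, 4096, dtype=torch.float16)
cand = ref + 1e-4 * torch.randn_like(ref)  # stand-in for another kernel's output
print("match" if outputs_match(ref, cand) else "mismatch")
```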
2 changes: 1 addition & 1 deletion docs/source/contributing/vulnerability_management.md
@@ -37,7 +37,7 @@ you may contact the following individuals:

## Slack Discussion

You may use the `#security` channel in the [VLLM Slack](https://slack.vllm.ai)
You may use the `#security` channel in the [vLLM Slack](https://slack.vllm.ai)
to discuss security-related topics. However, please do not disclose any
vulnerabilities in this channel. If you need to report a vulnerability, please
use the GitHub security advisory system or contact a VMT member privately.
2 changes: 1 addition & 1 deletion docs/source/design/v1/metrics.md
@@ -509,7 +509,7 @@ cache to complete other requests), we swap kv cache blocks out to CPU
memory. This is also known as "KV cache offloading" and is configured
with `--swap-space` and `--preemption-mode`.

In v0, [VLLM has long supported beam
In v0, [vLLM has long supported beam
search](gh-issue:6226). The
SequenceGroup encapsulated the idea of N Sequences which
all shared the same prompt kv blocks. This enabled KV cache block
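The context above notes that KV cache offloading is configured with `--swap-space` and `--preemption-mode`. A hedged sketch of the equivalent offline-API call, assuming the v0 engine arguments accept these names as keyword arguments (requires a GPU and a downloadable model):

```python
from vllm import LLM, SamplingParams

# Assumption: swap_space (GiB of CPU memory per GPU) and preemption_mode
# are accepted as keyword arguments by the v0 LLM/EngineArgs entrypoint.
llm = LLM(
    model="facebook/opt-125m",
    swap_space=4,
    preemption_mode="swap",  # swap KV blocks to CPU instead of recomputing
)
print(llm.generate(["Hello"], SamplingParams(max_tokens=8)))
```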
@@ -5,7 +5,7 @@
We will launch 2 vllm instances (GPU 0 for prefill and GPU 1 for decode),
and launch an additional LMCache server.
KV cache is transferred in the following manner:
VLLM prefill node -> LMCache server -> VLLM decode node.
vLLM prefill node -> LMCache server -> vLLM decode node.

Note that `pip install lmcache` is needed to run this example.
Learn more about LMCache in https://github.com/LMCache/LMCache.
2 changes: 1 addition & 1 deletion tests/tpu/test_quantization_accuracy.py
@@ -25,7 +25,7 @@ def get_model_args(self) -> str:
GSM8KAccuracyTestConfig(
model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
excepted_value=0.76), # no bias
# NOTE(rob): We cannot re-initialize VLLM in the same process for TPU,
# NOTE(rob): We cannot re-initialize vLLM in the same process for TPU,
# so only one of these tests can run in a single call to pytest. As
# a follow up, move this into the LM-EVAL section of the CI.
# GSM8KAccuracyTestConfig(
2 changes: 1 addition & 1 deletion vllm/attention/selector.py
@@ -51,7 +51,7 @@ def get_env_variable_attn_backend() -> Optional[_Backend]:
# (default behavior if this variable is None)
#
# THIS SELECTION TAKES PRECEDENCE OVER THE
# VLLM ATTENTION BACKEND ENVIRONMENT VARIABLE
# VLLM_ATTENTION_BACKEND ENVIRONMENT VARIABLE
forced_attn_backend: Optional[_Backend] = None


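A small sketch of the precedence the comment above describes: a programmatically forced backend wins over the `VLLM_ATTENTION_BACKEND` environment variable. Everything except the env-var name is illustrative, not vLLM's actual selector code:

```python
import os
from typing import Optional

# Set programmatically; takes precedence over the environment variable.
forced_attn_backend: Optional[str] = None

def resolve_backend() -> Optional[str]:
    if forced_attn_backend is not None:
        return forced_attn_backend
    # None here means "fall back to the default selection behavior".
    return os.environ.get("VLLM_ATTENTION_BACKEND")

print(resolve_backend())
```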
2 changes: 1 addition & 1 deletion vllm/compilation/backends.py
@@ -278,7 +278,7 @@ def call_module(self, target: torch.fx.node.Target,


class VllmBackend:
"""The compilation backend for `torch.compile` with VLLM.
"""The compilation backend for `torch.compile` with vLLM.
It is used for compilation level of `CompilationLevel.PIECEWISE`,
where we customize the compilation.

2 changes: 1 addition & 1 deletion vllm/compilation/compiler_interface.py
@@ -31,7 +31,7 @@ def initialize_cache(self, cache_dir: str, disable_cache: bool = False):

def compute_hash(self, vllm_config: VllmConfig) -> str:
"""
Gather all the relevant information from the VLLM config,
Gather all the relevant information from the vLLM config,
to compute a hash so that we can cache the compiled model.

See :meth:`VllmConfig.compute_hash` to check what information
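`compute_hash` above gathers the relevant parts of the vLLM config and hashes them so compiled artifacts can be cached. An illustrative sketch of that idea with made-up fields and a plain SHA-256 over a canonical serialization (not vLLM's actual scheme):

```python
import hashlib
import json

def compute_config_hash(config: dict) -> str:
    # Serialize only fields that affect compiled-code validity, in a
    # deterministic order, then hash the result.
    canonical = json.dumps(config, sort_keys=True)
    return hashlib.sha256(canonical.encode()).hexdigest()[:16]

print(compute_config_hash({"model": "example", "dtype": "float16"}))
```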
8 changes: 4 additions & 4 deletions vllm/config.py
@@ -3572,11 +3572,11 @@ def __str__(self):
@contextmanager
def set_current_vllm_config(vllm_config: VllmConfig, check_compile=False):
"""
Temporarily set the current VLLM config.
Temporarily set the current vLLM config.
Used during model initialization.
We save the current VLLM config in a global variable,
We save the current vLLM config in a global variable,
so that all modules can access it, e.g. custom ops
can access the VLLM config to determine how to dispatch.
can access the vLLM config to determine how to dispatch.
"""
global _current_vllm_config
old_vllm_config = _current_vllm_config
@@ -3611,7 +3611,7 @@ def get_current_vllm_config() -> VllmConfig:
# in ci, usually when we test custom ops/modules directly,
# we don't set the vllm config. In that case, we set a default
# config.
logger.warning("Current VLLM config is not set.")
logger.warning("Current vLLM config is not set.")
from vllm.config import VllmConfig
return VllmConfig()
return _current_vllm_config
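The docstring above describes saving the current config in a global so any module can read it, then restoring the previous value afterwards. A simplified, self-contained illustration of that save/restore context-manager pattern, with stand-in names rather than vLLM's real globals:

```python
from contextlib import contextmanager
from dataclasses import dataclass
from typing import Optional

@dataclass
class DummyConfig:           # stand-in for VllmConfig
    compilation_level: int = 0

_CURRENT: Optional[DummyConfig] = None   # stand-in for the module-level global

@contextmanager
def set_current_config(config: DummyConfig):
    global _CURRENT
    old = _CURRENT           # remember whatever was active before
    _CURRENT = config        # make the new config globally visible
    try:
        yield
    finally:
        _CURRENT = old       # always restore, even if initialization raises

with set_current_config(DummyConfig(compilation_level=3)):
    assert _CURRENT is not None and _CURRENT.compilation_level == 3
```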
2 changes: 1 addition & 1 deletion vllm/entrypoints/openai/protocol.py
@@ -237,7 +237,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
tool_choice: Optional[Union[Literal["none"], Literal["auto"],
ChatCompletionNamedToolChoiceParam]] = "none"

# NOTE this will be ignored by VLLM -- the model determines the behavior
# NOTE this will be ignored by vLLM -- the model determines the behavior
parallel_tool_calls: Optional[bool] = False
user: Optional[str] = None

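For reference, a hedged example of how the `tool_choice` and `parallel_tool_calls` fields above might appear in an OpenAI-compatible chat request body; the model name and tool definition are placeholders:

```python
# Plain dict mirroring the request schema; send it to a vLLM OpenAI-compatible
# server with any HTTP client.
request_body = {
    "model": "example-model",  # placeholder model name
    "messages": [{"role": "user", "content": "What's the weather in Paris?"}],
    "tools": [{
        "type": "function",
        "function": {
            "name": "get_weather",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
            },
        },
    }],
    "tool_choice": "auto",         # "none", "auto", or a named-tool object
    "parallel_tool_calls": False,  # per the NOTE above, ignored by vLLM
}
print(request_body["tool_choice"])
```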
6 changes: 3 additions & 3 deletions vllm/envs.py
@@ -164,7 +164,7 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]:
"VERBOSE":
lambda: bool(int(os.getenv('VERBOSE', '0'))),

# Root directory for VLLM configuration files
# Root directory for vLLM configuration files
# Defaults to `~/.config/vllm` unless `XDG_CONFIG_HOME` is set
# Note that this not only affects how vllm finds its configuration files
# during runtime, but also affects how vllm installs its configuration
@@ -178,7 +178,7 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]:

# ================== Runtime Env Vars ==================

# Root directory for VLLM cache files
# Root directory for vLLM cache files
# Defaults to `~/.cache/vllm` unless `XDG_CACHE_HOME` is set
"VLLM_CACHE_ROOT":
lambda: os.path.expanduser(
@@ -260,7 +260,7 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]:
"VLLM_ENGINE_ITERATION_TIMEOUT_S":
lambda: int(os.environ.get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60")),

# API key for VLLM API server
# API key for vLLM API server
"VLLM_API_KEY":
lambda: os.environ.get("VLLM_API_KEY", None),

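The envs.py hunks above use one zero-argument lambda per variable so values are read from the environment at access time rather than at import time. A minimal sketch of that table pattern, with illustrative variable names:

```python
import os
from typing import Any, Callable, Dict

# One zero-argument lambda per variable: the environment is read lazily,
# at lookup time, not when the module is imported.
_ENV_VARS: Dict[str, Callable[[], Any]] = {
    "EXAMPLE_CACHE_ROOT":
    lambda: os.path.expanduser(
        os.getenv("EXAMPLE_CACHE_ROOT", "~/.cache/example")),
    "EXAMPLE_TIMEOUT_S":
    lambda: int(os.environ.get("EXAMPLE_TIMEOUT_S", "60")),
}

def get_env(name: str) -> Any:
    return _ENV_VARS[name]()

print(get_env("EXAMPLE_TIMEOUT_S"))
```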
2 changes: 1 addition & 1 deletion vllm/model_executor/models/phi4mm.py
@@ -1414,7 +1414,7 @@ def cat_with_pad(tensors, dim, padding_value=0):
@INPUT_REGISTRY.register_input_processor(input_processor_for_phi4mm)
class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
"""
Implements the Phi-4-multimodal-instruct model in VLLM.
Implements the Phi-4-multimodal-instruct model in vLLM.
"""
packed_modules_mapping = {
"qkv_proj": [
2 changes: 1 addition & 1 deletion vllm/platforms/cuda.py
@@ -119,7 +119,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
if envs.VLLM_USE_V1:
raise NotImplementedError(
"Multi-step scheduling is not supported (and not "
"needed) on VLLM V1. Please launch without "
"needed) on vLLM V1. Please launch without "
"--num-scheduler-steps.")
else:
parallel_config.worker_cls = \
4 changes: 2 additions & 2 deletions vllm/platforms/rocm.py
@@ -173,15 +173,15 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
if envs.VLLM_USE_V1:
raise NotImplementedError(
"Multi-step scheduling is not supported (and not "
"needed) on VLLM V1. Please launch without "
"needed) on vLLM V1. Please launch without "
"--num-scheduler-steps.")
else:
parallel_config.worker_cls = \
"vllm.worker.multi_step_worker.MultiStepWorker"
elif vllm_config.speculative_config:
if envs.VLLM_USE_V1:
raise NotImplementedError(
"Speculative decoding is not yet supported on VLLM V1."
"Speculative decoding is not yet supported on vLLM V1."
)
else:
parallel_config.worker_cls = \
2 changes: 1 addition & 1 deletion vllm/transformers_utils/tokenizers/mistral.py
@@ -249,7 +249,7 @@ def _download_mistral_tokenizer_from_hf(tokenizer_name: str,
revision=revision)
return tokenizer_file

# the following attributes are set to fit VLLM's design and are used
# the following attributes are set to fit vLLM's design and are used
# by the guided structured output backends.
@property
def all_special_tokens_extended(self) -> List[str]:
2 changes: 1 addition & 1 deletion vllm/v1/engine/core_client.py
@@ -255,7 +255,7 @@ def __init__(
# TODO(rob): rather than killing the main process, we should
# figure out how to raise an AsyncEngineDeadError and
# handle at the API server level so we can return a better
# error code to the clients calling VLLM.
# error code to the clients calling vLLM.
def sigusr1_handler(signum, frame):
logger.fatal("Got fatal signal from worker processes, shutting "
"down. See stack trace above for root cause issue.")
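The snippet above installs a SIGUSR1 handler that logs a fatal message and shuts the process down when a worker dies. A hedged, POSIX-only sketch of that pattern; the shutdown callback is a placeholder, not vLLM's actual teardown:

```python
import logging
import signal
import sys

logger = logging.getLogger("engine_client")

def _shutdown() -> None:
    # Placeholder for the client's real cleanup before exiting.
    sys.exit(1)

def sigusr1_handler(signum, frame):
    logger.fatal("Got fatal signal from worker processes, shutting down. "
                 "See stack trace above for root cause issue.")
    _shutdown()

signal.signal(signal.SIGUSR1, sigusr1_handler)
```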
2 changes: 1 addition & 1 deletion vllm/v1/engine/output_processor.py
@@ -248,7 +248,7 @@ def process_outputs(

****************** NOTE FOR DEVELOPERS ******************

VLLM V1 minimizes the number of python loops over the full
vLLM V1 minimizes the number of python loops over the full
batch to ensure system overheads are minimized. This is the
only function that should loop over EngineCoreOutputs.

4 changes: 2 additions & 2 deletions vllm/v1/engine/processor.py
@@ -93,10 +93,10 @@ def _validate_supported_sampling_params(
) -> None:
# Best of not yet supported.
if params.best_of is not None and params.best_of > 1:
raise ValueError("VLLM V1 does not yet support best_of.")
raise ValueError("vLLM V1 does not yet support best_of.")
# Logits processors not supported.
if params.logits_processors:
raise ValueError("VLLM V1 does not support per request "
raise ValueError("vLLM V1 does not support per request "
"user provided logits processors.")

def _validate_params(