Commit dfac5da

[V0 deprecation] Remove VLLM_USE_V1 usage in most modules

Signed-off-by: wangxiyuan <[email protected]>
1 parent 0ce743f

19 files changed: +108 −243 lines
docs/usage/v1_guide.md

Lines changed: 0 additions & 2 deletions

@@ -6,8 +6,6 @@
 
 V1 is now enabled by default for all supported use cases, and we will gradually enable it for every use case we plan to support. Please share any feedback on [GitHub](https://github.com/vllm-project/vllm) or in the [vLLM Slack](https://inviter.co/vllm-slack).
 
-To disable V1, please set the environment variable as: `VLLM_USE_V1=0`, and send us a GitHub issue sharing the reason!
-
 ## Why vLLM V1?
 
 vLLM V0 successfully supported a wide range of models and hardware, but as new features were developed independently, the system grew increasingly complex. This complexity made it harder to integrate new capabilities and introduced technical debt, revealing the need for a more streamlined and unified design.

tests/conftest.py

Lines changed: 0 additions & 20 deletions

@@ -154,26 +154,6 @@ def prompts(self, prompts: AudioAssetPrompts) -> list[str]:
 """Singleton instance of {class}`AudioTestAssets`."""
 
 
-@pytest.fixture(scope="function", autouse=True)
-def cleanup_VLLM_USE_V1(monkeypatch):
-    """
-    The V1 oracle sets "VLLM_USE_V1" during loading. This means
-    that each invocation of a test change the env variable.
-
-    If we touch "VLLM_USE_V1" with monkeypatch, then any changes
-    made during the test run by vLLM will be cleaned up.
-
-    This fixture is used by every test.
-    """
-
-    # If VLLM_USE_V1 is not set, set then delete. This will
-    # cause monkeypatch to clean up VLLM_USE_V1 upon exit
-    # if VLLM modifies the value of envs.VLLM_USE_V1.
-    if "VLLM_USE_V1" not in os.environ:
-        monkeypatch.setenv("VLLM_USE_V1", "")
-        monkeypatch.delenv("VLLM_USE_V1")
-
-
 @pytest.fixture(autouse=True)
 def init_test_http_connection():
     # pytest_asyncio may use a different event loop per test
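
For context, the removed fixture relied on standard pytest `monkeypatch` bookkeeping: once an environment variable is touched via `setenv`/`delenv`, its original state is recorded and restored at teardown, so later mutations made by library code are rolled back automatically. A minimal standalone sketch of that pattern (`MY_FLAG` and the test names are hypothetical, not vLLM code):

```python
import os

import pytest


@pytest.fixture(autouse=True)
def cleanup_my_flag(monkeypatch):
    """Restore MY_FLAG after each test, even if code under test mutates it."""
    # Touching the variable registers it with monkeypatch; its original
    # (absent) state is restored on teardown, mirroring the removed
    # cleanup_VLLM_USE_V1 fixture.
    if "MY_FLAG" not in os.environ:
        monkeypatch.setenv("MY_FLAG", "")
        monkeypatch.delenv("MY_FLAG")


def test_flag_mutation_is_undone():
    # Any mutation made during the test is undone after it finishes.
    os.environ["MY_FLAG"] = "1"
    assert os.environ["MY_FLAG"] == "1"
```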

tests/v1/engine/test_async_llm.py

Lines changed: 2 additions & 5 deletions

@@ -424,15 +424,12 @@ async def test_customize_loggers(monkeypatch):
 
 
 @pytest.mark.asyncio
-async def test_customize_aggregated_loggers(monkeypatch):
+async def test_customize_aggregated_loggers():
     """Test that we can customize the aggregated loggers.
     If a customized logger is provided at the init, it should
     be added to the default loggers.
     """
-
-    with monkeypatch.context() as m, ExitStack() as after:
-        m.setenv("VLLM_USE_V1", "1")
-
+    with ExitStack() as after:
         with set_default_torch_num_threads(1):
             engine = AsyncLLM.from_engine_args(
                 TEXT_ENGINE_ARGS,

tests/v1/entrypoints/llm/test_struct_output_generate.py

Lines changed: 0 additions & 3 deletions

@@ -868,11 +868,8 @@ def test_structured_output_batched_with_non_structured_outputs_requests(
 
 @pytest.mark.parametrize("guided_decoding_backend", ["xgrammar"])
 def test_structured_output_with_structural_tag(
-    monkeypatch: pytest.MonkeyPatch,
     guided_decoding_backend: str,
 ):
-    monkeypatch.setenv("VLLM_USE_V1", "1")
-
     llm = LLM(
         model="Qwen/Qwen2.5-1.5B-Instruct",
         guided_decoding_backend=guided_decoding_backend,

tests/v1/sample/test_logprobs.py

Lines changed: 59 additions & 62 deletions

@@ -530,7 +530,6 @@ def test_logprobs_mode(logprobs_mode: LogprobsMode):
 def test_spec_decode_logprobs(
     logprobs_mode: LogprobsMode,
     model_setup: tuple[str, str, str],
-    monkeypatch: pytest.MonkeyPatch,
 ):
     """Spec decode logprobs should match those of the base model.
 
@@ -541,64 +540,62 @@ def test_spec_decode_logprobs(
     """
     from vllm import LLM
 
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-        prompt = "Hello world"
-        sampling_params = SamplingParams(
-            temperature=0, logprobs=3, max_tokens=10, ignore_eos=False
-        )
-        method, model_name, spec_model_name = model_setup
-        max_model_len = 256
-
-        # Run base LLM.
-        ref_llm = LLM(
-            model=model_name,
-            max_logprobs=5,
-            max_model_len=max_model_len,
-            seed=42,
-            logprobs_mode=logprobs_mode,
-            gpu_memory_utilization=0.4,
-        )
-        ref_results = ref_llm.generate([prompt], sampling_params)
-        # Collect logprobs outputs from reference LLM.
-        ref_logprobs = []
-        for output in ref_results[0].outputs:
-            for logprobs in output.logprobs:
-                for token_id in logprobs:
-                    ref_logprobs.append(logprobs[token_id])
-        del ref_llm
-        torch.cuda.empty_cache()
-        cleanup_dist_env_and_memory()
-
-        # Run spec decode LLM.
-        spec_llm = LLM(
-            model_name,
-            speculative_config={
-                "method": method,
-                "model": spec_model_name,
-                "num_speculative_tokens": 3,
-                "max_model_len": max_model_len,
-            },
-            max_logprobs=5,
-            max_model_len=max_model_len,
-            seed=42,
-            logprobs_mode=logprobs_mode,
-            gpu_memory_utilization=0.4,
-        )
-        spec_results = spec_llm.generate([prompt], sampling_params)
-        # Collect logprobs outputs from spec decode LLM.
-        spec_logprobs = []
-        for output in spec_results[0].outputs:
-            for logprobs in output.logprobs:
-                for token_id in logprobs:
-                    spec_logprobs.append(logprobs[token_id])
-        del spec_llm
-        torch.cuda.empty_cache()
-        cleanup_dist_env_and_memory()
-
-        # Per-token logprobs are expected to be the same.
-        assert len(ref_logprobs) == len(spec_logprobs)
-        for ref_logprob, spec_logprob in zip(ref_logprobs, spec_logprobs):
-            assert math.isclose(ref_logprob.logprob, spec_logprob.logprob, abs_tol=1e-3)
-            assert ref_logprob.rank == spec_logprob.rank
-            assert ref_logprob.decoded_token == spec_logprob.decoded_token
+    prompt = "Hello world"
+    sampling_params = SamplingParams(
+        temperature=0, logprobs=3, max_tokens=10, ignore_eos=False
+    )
+    method, model_name, spec_model_name = model_setup
+    max_model_len = 256
+
+    # Run base LLM.
+    ref_llm = LLM(
+        model=model_name,
+        max_logprobs=5,
+        max_model_len=max_model_len,
+        seed=42,
+        logprobs_mode=logprobs_mode,
+        gpu_memory_utilization=0.4,
+    )
+    ref_results = ref_llm.generate([prompt], sampling_params)
+    # Collect logprobs outputs from reference LLM.
+    ref_logprobs = []
+    for output in ref_results[0].outputs:
+        for logprobs in output.logprobs:
+            for token_id in logprobs:
+                ref_logprobs.append(logprobs[token_id])
+    del ref_llm
+    torch.cuda.empty_cache()
+    cleanup_dist_env_and_memory()
+
+    # Run spec decode LLM.
+    spec_llm = LLM(
+        model_name,
+        speculative_config={
+            "method": method,
+            "model": spec_model_name,
+            "num_speculative_tokens": 3,
+            "max_model_len": max_model_len,
+        },
+        max_logprobs=5,
+        max_model_len=max_model_len,
+        seed=42,
+        logprobs_mode=logprobs_mode,
+        gpu_memory_utilization=0.4,
+    )
+    spec_results = spec_llm.generate([prompt], sampling_params)
+    # Collect logprobs outputs from spec decode LLM.
+    spec_logprobs = []
+    for output in spec_results[0].outputs:
+        for logprobs in output.logprobs:
+            for token_id in logprobs:
+                spec_logprobs.append(logprobs[token_id])
+    del spec_llm
+    torch.cuda.empty_cache()
+    cleanup_dist_env_and_memory()
+
+    # Per-token logprobs are expected to be the same.
+    assert len(ref_logprobs) == len(spec_logprobs)
+    for ref_logprob, spec_logprob in zip(ref_logprobs, spec_logprobs):
+        assert math.isclose(ref_logprob.logprob, spec_logprob.logprob, abs_tol=1e-3)
+        assert ref_logprob.rank == spec_logprob.rank
+        assert ref_logprob.decoded_token == spec_logprob.decoded_token

vllm/attention/layers/chunked_local_attention.py

Lines changed: 6 additions & 14 deletions

@@ -1,11 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import functools
 from typing import ClassVar
 
 import torch
 
-from vllm import envs
 from vllm.attention.backends.abstract import AttentionBackend, AttentionMetadata
 from vllm.attention.selector import get_attn_backend
 from vllm.config import CacheConfig
@@ -22,7 +20,6 @@
 from ..layer import Attention
 
 
-@functools.lru_cache
 def create_chunked_local_attention_backend(
     underlying_attn_backend: AttentionBackend,
     attention_chunk_size: int,
@@ -78,17 +75,12 @@ def __init__(
             kv_cache_dtype = "auto"
             block_size = 16
 
-        if envs.VLLM_USE_V1:
-            underlying_attn_backend = get_attn_backend(
-                head_size, dtype, kv_cache_dtype, block_size
-            )
-
-            attn_backend = create_chunked_local_attention_backend(
-                underlying_attn_backend, attention_chunk_size, block_size
-            )
-        else:
-            # in v0 the local attention is handled inside the backends
-            attn_backend = None
+        underlying_attn_backend = get_attn_backend(
+            head_size, dtype, kv_cache_dtype, block_size
+        )
+        attn_backend = create_chunked_local_attention_backend(
+            underlying_attn_backend, attention_chunk_size, block_size
+        )
 
         super().__init__(
             num_heads=num_heads,

vllm/attention/layers/cross_attention.py

Lines changed: 4 additions & 12 deletions

@@ -1,12 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import functools
 from copy import copy
 
 import numpy as np
 import torch
 
-from vllm import envs
 from vllm.attention.backends.abstract import (
     AttentionBackend,
     AttentionMetadata,
@@ -78,7 +76,6 @@ def _get_cross_slot_mapping(
         return torch.empty(0, dtype=torch.int64, device=device)
 
 
-@functools.lru_cache
 def create_cross_attention_backend(
     underlying_attn_backend: AttentionBackend,
 ) -> type[AttentionBackend]:
@@ -150,15 +147,10 @@ def __init__(
             kv_cache_dtype = "auto"
             block_size = 16
 
-        if envs.VLLM_USE_V1:
-            underlying_attn_backend = get_attn_backend(
-                head_size, dtype, kv_cache_dtype, block_size
-            )
-
-            attn_backend = create_cross_attention_backend(underlying_attn_backend)
-        else:
-            # in v0 cross attention is handled inside the backends
-            attn_backend = None
+        underlying_attn_backend = get_attn_backend(
+            head_size, dtype, kv_cache_dtype, block_size
+        )
+        attn_backend = create_cross_attention_backend(underlying_attn_backend)
 
         if attn_type is not None:
             assert attn_type == AttentionType.ENCODER_DECODER, (

vllm/attention/layers/encoder_only_attention.py

Lines changed: 4 additions & 13 deletions

@@ -1,11 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import functools
 from copy import copy
 
 import torch
 
-from vllm import envs
 from vllm.attention.backends.abstract import (
     AttentionBackend,
     AttentionMetadata,
@@ -22,7 +20,6 @@
 from vllm.v1.kv_cache_interface import KVCacheSpec
 
 
-@functools.lru_cache
 def create_encoder_only_attention_backend(
     underlying_attn_backend: AttentionBackend,
 ) -> type[AttentionBackend]:
@@ -74,17 +71,11 @@ def __init__(
             kv_cache_dtype = "auto"
             block_size = 16
 
-        if envs.VLLM_USE_V1:
-            underlying_attn_backend = get_attn_backend(
-                head_size, dtype, kv_cache_dtype, block_size
-            )
+        underlying_attn_backend = get_attn_backend(
+            head_size, dtype, kv_cache_dtype, block_size
+        )
 
-            attn_backend = create_encoder_only_attention_backend(
-                underlying_attn_backend
-            )
-        else:
-            # in v0 encoder only attention is handled inside the backends
-            attn_backend = None
+        attn_backend = create_encoder_only_attention_backend(underlying_attn_backend)
 
         if attn_type is not None:
             assert attn_type == AttentionType.ENCODER_ONLY, (

vllm/attention/selector.py

Lines changed: 1 addition & 5 deletions

@@ -134,16 +134,12 @@ def get_attn_backend(
     use_sparse: bool = False,
 ) -> type[AttentionBackend]:
     """Selects which attention backend to use and lazily imports it."""
-    # Accessing envs.* behind an @lru_cache decorator can cause the wrong
-    # value to be returned from the cache if the value changes between calls.
-    # To avoid this, we read envs.VLLM_USE_V1 here and pass it explicitly to the
-    # private function.
    return _cached_get_attn_backend(
        head_size=head_size,
        dtype=dtype,
        kv_cache_dtype=kv_cache_dtype,
        block_size=block_size,
-        use_v1=envs.VLLM_USE_V1,
+        use_v1=True,
        use_mla=use_mla,
        has_sink=has_sink,
        use_sparse=use_sparse,
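
The deleted comment documented why `use_v1` was resolved outside the cached helper: reading a mutable environment value inside an `@lru_cache`-decorated function can pin a stale result for the lifetime of the cache. With `VLLM_USE_V1` gone, that workaround is no longer needed. A minimal standalone sketch of the pitfall and the workaround (`MY_FLAG` and `pick_backend` are illustrative names, not vLLM code):

```python
import functools
import os


@functools.lru_cache
def pick_backend() -> str:
    # The env var is read inside the cached function, so the first call's
    # result is reused even if MY_FLAG changes afterwards.
    return "v1" if os.environ.get("MY_FLAG", "1") == "1" else "v0"


os.environ["MY_FLAG"] = "1"
print(pick_backend())  # "v1"

os.environ["MY_FLAG"] = "0"
print(pick_backend())  # still "v1" -- the cached value is returned


# The pattern the removed comment described: read the env var in the public
# wrapper and pass it explicitly, so it becomes part of the cache key.
@functools.lru_cache
def _pick_backend(use_v1: bool) -> str:
    return "v1" if use_v1 else "v0"


def pick_backend_fixed() -> str:
    return _pick_backend(os.environ.get("MY_FLAG", "1") == "1")
```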

vllm/distributed/kv_transfer/kv_connector/factory.py

Lines changed: 0 additions & 7 deletions

@@ -5,7 +5,6 @@
 from collections.abc import Callable
 from typing import TYPE_CHECKING, cast
 
-import vllm.envs as envs
 from vllm.config import VllmConfig
 from vllm.distributed.kv_transfer.kv_connector.base import (
     KVConnectorBase,
@@ -44,12 +43,6 @@ def create_connector(
         config: VllmConfig,
         role: KVConnectorRole,
     ) -> KVConnectorBase:
-        if not envs.VLLM_USE_V1:
-            raise ValueError(
-                "Attempting to initialize a V1 Connector, "
-                f"but found {envs.VLLM_USE_V1=}"
-            )
-
         kv_transfer_config = config.kv_transfer_config
         if kv_transfer_config is None:
             raise ValueError("kv_transfer_config must be set to create a connector")
