Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 0 additions & 7 deletions tests/v1/core/test_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -891,7 +891,6 @@ def test_kv_connector_basic():
scheduler = create_scheduler(
enable_prefix_caching=True,
use_kv_connector=True,
disable_hybrid_kv_cache_manager=True,
)
NUM_TOTAL_BLOCKS = scheduler.kv_cache_manager.block_pool.get_num_free_blocks()
BLOCK_SIZE = scheduler.cache_config.block_size
Expand Down Expand Up @@ -1017,7 +1016,6 @@ def test_external_prefix_cache_metrics():
scheduler = create_scheduler(
enable_prefix_caching=False,
use_kv_connector=True,
disable_hybrid_kv_cache_manager=True,
)

# Mock connector to simulate a partial external cache hit
Expand Down Expand Up @@ -1082,7 +1080,6 @@ def test_kv_connector_unable_to_allocate():
use_kv_connector=True,
block_size=BLOCK_SIZE,
num_blocks=NUM_BLOCKS,
disable_hybrid_kv_cache_manager=True,
)
NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE * 2
scheduler.connector.get_num_new_matched_tokens = Mock(name="method")
Expand Down Expand Up @@ -1166,7 +1163,6 @@ def test_kv_connector_handles_preemption():
use_kv_connector=True,
block_size=BLOCK_SIZE,
num_blocks=NUM_BLOCKS,
disable_hybrid_kv_cache_manager=True,
)

NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE
Expand Down Expand Up @@ -1383,7 +1379,6 @@ def create_scheduler_with_priority(
block_size: int = 16,
max_model_len: int | None = None,
num_speculative_tokens: int | None = None,
disable_hybrid_kv_cache_manager: bool = False,
) -> Scheduler:
"""Create scheduler with priority policy enabled.

Expand All @@ -1408,7 +1403,6 @@ def create_scheduler_with_priority(
disable_chunked_mm_input=disable_chunked_mm_input,
enable_chunked_prefill=True,
policy="priority", # Enable priority scheduling
disable_hybrid_kv_cache_manager=disable_hybrid_kv_cache_manager,
)
model_config = ModelConfig(
model=model,
Expand Down Expand Up @@ -2015,7 +2009,6 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv():
num_blocks=5, # Can hold 64 tokens (first block is null)
block_size=16, # Standard block size
use_kv_connector=True,
disable_hybrid_kv_cache_manager=True,
)

# Create a request and schedule it
Expand Down
2 changes: 0 additions & 2 deletions tests/v1/core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ def create_scheduler(
num_speculative_tokens: int | None = None,
skip_tokenizer_init: bool = False,
async_scheduling: bool = False,
disable_hybrid_kv_cache_manager: bool = False,
) -> Scheduler | AsyncScheduler:
"""Create scheduler under test.

Expand All @@ -71,7 +70,6 @@ def create_scheduler(
disable_chunked_mm_input=disable_chunked_mm_input,
enable_chunked_prefill=True,
async_scheduling=async_scheduling,
disable_hybrid_kv_cache_manager=disable_hybrid_kv_cache_manager,
)
model_config = ModelConfig(
model=model,
Expand Down
2 changes: 0 additions & 2 deletions tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,6 @@ run_tests_for_model() {
vllm serve $model_name \
--port $PORT \
--enforce-eager \
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

critical

The `NixlConnector` used by `vllm serve` does not support the Hybrid Memory Allocator (HMA). Removing `--disable-hybrid-kv-cache-manager` will cause the server to fail on startup because HMA is enabled by default. This line should be restored.

Suggested change
--enforce-eager \
--enforce-eager \
--disable-hybrid-kv-cache-manager \

--disable-hybrid-kv-cache-manager \
--gpu-memory-utilization $GPU_MEMORY_UTILIZATION \
--tensor-parallel-size $PREFILLER_TP_SIZE \
--kv-transfer-config '$KV_CONFIG'"
Expand Down Expand Up @@ -179,7 +178,6 @@ run_tests_for_model() {
--port $PORT \
--enforce-eager \
--gpu-memory-utilization $GPU_MEMORY_UTILIZATION \
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

critical

The `NixlConnector` used by `vllm serve` does not support the Hybrid Memory Allocator (HMA). Removing `--disable-hybrid-kv-cache-manager` will cause the server to fail on startup because HMA is enabled by default. This line should be restored.

Suggested change
--gpu-memory-utilization $GPU_MEMORY_UTILIZATION \
--gpu-memory-utilization $GPU_MEMORY_UTILIZATION \
--disable-hybrid-kv-cache-manager \

--disable-hybrid-kv-cache-manager \
--kv-transfer-config '$KV_CONFIG'"

# DP-EP attention mode
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,6 @@ run_tests_for_model() {
--port $PREFILL_PORT \
--enforce-eager \
--gpu-memory-utilization 0.2 \
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

critical

The `NixlConnector` used by `vllm serve` does not support the Hybrid Memory Allocator (HMA). Removing `--disable-hybrid-kv-cache-manager` will cause the server to fail on startup because HMA is enabled by default. This line should be restored.

Suggested change
--gpu-memory-utilization 0.2 \
--gpu-memory-utilization 0.2 \
--disable-hybrid-kv-cache-manager \

--disable-hybrid-kv-cache-manager \
--kv-transfer-config '$KV_CONFIG'"

if [ -n "$model_args" ]; then
Expand All @@ -104,7 +103,6 @@ run_tests_for_model() {
--port $DECODE_PORT \
--enforce-eager \
--gpu-memory-utilization 0.2 \
--disable-hybrid-kv-cache-manager \
--kv-transfer-config '$KV_CONFIG'"

if [ -n "$model_args" ]; then
Expand Down
1 change: 0 additions & 1 deletion tests/v1/kv_connector/unit/test_multi_connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,6 @@ def test_multi_shared_storage_connector_consistency():
enforce_eager=True,
gpu_memory_utilization=0.5,
kv_transfer_config=kv_transfer_config,
disable_hybrid_kv_cache_manager=True,
)
# Run generation - this should trigger saving KV cache
_ = llm.generate(PROMPTS, SAMPLING_PARAMS)
Expand Down
1 change: 0 additions & 1 deletion tests/v1/kv_connector/unit/test_nixl_connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -1020,7 +1020,6 @@ def test_abort_timeout_on_prefiller(monkeypatch, distributed_executor_backend):
"gpu_memory_utilization": 0.5,
"kv_transfer_config": kv_transfer_config,
"distributed_executor_backend": distributed_executor_backend,
"disable_hybrid_kv_cache_manager": True,
}

timeout = 6
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,6 @@ def test_shared_storage_connector_hashes(tmp_path):
enforce_eager=True,
kv_transfer_config=kv_transfer_config,
limit_mm_per_prompt={"image": 2},
disable_hybrid_kv_cache_manager=True,
)

# don't put this import at the top level
Expand Down
3 changes: 0 additions & 3 deletions tests/v1/kv_connector/unit/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,9 +91,6 @@ def create_vllm_config(
max_num_batched_tokens=max_num_batched_tokens,
max_model_len=max_model_len,
enable_chunked_prefill=enable_chunked_prefill,
# Disable hybrid KV cache manager for testing
# Should be removed after we support hybrid KV cache manager-based testing.
disable_hybrid_kv_cache_manager=True,
)
model_config = ModelConfig(
model=model,
Expand Down
Loading