Skip to content

Commit 4b68c4a

Browse files
authored
[Core][Perf] Only invoke save_new_computed_blocks when computed blocks are not empty (#27799)
Signed-off-by: Jialin Ouyang <[email protected]>
1 parent a8141fa commit 4b68c4a

File tree

2 files changed

+7
-6
lines changed

2 files changed

+7
-6
lines changed

vllm/v1/core/kv_cache_manager.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -306,11 +306,12 @@ def allocate_slots(
306306
"Computed blocks should be empty when prefix caching is disabled"
307307
)
308308

309-
# Append the new computed blocks to the request blocks until now to
310-
# avoid the case where the new blocks cannot be allocated.
311-
self.coordinator.save_new_computed_blocks(
312-
request.request_id, new_computed_block_list
313-
)
309+
if new_computed_block_list is not self.empty_kv_cache_blocks.blocks:
310+
# Append the new computed blocks to the request blocks until now to
311+
# avoid the case where the new blocks cannot be allocated.
312+
self.coordinator.save_new_computed_blocks(
313+
request.request_id, new_computed_block_list
314+
)
314315

315316
new_blocks = self.coordinator.allocate_new_blocks(
316317
request.request_id, num_tokens_need_slot, num_encoder_tokens

vllm/v1/core/single_type_kv_cache_manager.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@ def cache_blocks(self, request: Request, num_tokens: int) -> None:
151151
num_tokens: The total number of tokens that need to be cached
152152
(including tokens that are already cached).
153153
"""
154-
num_cached_blocks = self.num_cached_block[request.request_id]
154+
num_cached_blocks = self.num_cached_block.get(request.request_id, 0)
155155
num_full_blocks = num_tokens // self.block_size
156156

157157
if num_cached_blocks >= num_full_blocks:

0 commit comments

Comments (0)