Commit c663f31
Fix critical NWOR bugs causing 0% acceptance rate
Bug #1 (CRITICAL): Add missing begin() and stage() methods to KVWriteRouter
- Flash attention backend calls router.begin() and router.stage()
- KVWriteRouter only had write() and commit() methods
- Added begin() to store slot_mapping and initialize shadow buffer
- Added stage() to extract per-timestep slot and stage KV pairs
- Without these, no tokens were being staged → 0% acceptance rate

Bug #2 (MODERATE): Fix bonus token counting in accepted_lens
- valid_sampled_token_ids includes [accepted_draft_tokens..., bonus_token]
- Previous: len([bonus]) = 1, incorrectly counted as 1 accepted draft token
- Fixed: Use max(0, len(seq) - 1) to exclude bonus token from count
- Now correctly reports 0 accepted when only bonus token is present

Files modified:
- vllm/v1/kv_cache/write_router.py: Added begin() and stage() methods
- vllm/v1/worker/gpu_model_runner.py: Fixed accepted_lens calculation
1 parent a3c136b commit c663f31
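
The Bug #2 counting fix is easiest to see with concrete values. A minimal illustration (token ids invented; only the per-sequence layout follows the commit message):

import torch

# Each inner list is [accepted_draft_tokens..., bonus_token] per sequence.
valid_sampled_token_ids = [
    [11, 12, 13, 99],  # 3 draft tokens accepted + bonus -> 3
    [42],              # only the bonus token            -> 0 (previously miscounted as 1)
    [],                # nothing sampled                 -> 0 (max() clamps the -1)
]
accepted_lens = torch.tensor(
    [max(0, len(seq) - 1) for seq in valid_sampled_token_ids],
    dtype=torch.int32,
)
print(accepted_lens)  # tensor([3, 0, 0], dtype=torch.int32)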

2 files changed: +54 −2 lines

vllm/v1/kv_cache/write_router.py

Lines changed: 48 additions & 0 deletions
@@ -100,6 +100,7 @@ def __init__(self, persistent_writer: PersistentKVWriter):
         self._persistent = persistent_writer
         self._shadow = None
         self._mode = "immediate"  # or "defer"
+        self._slot_mapping = None  # Stored during begin() for use in stage()

     def immediate(self):
         """Switch to immediate write mode (normal operation)."""
@@ -116,6 +117,53 @@ def defer(self, shadow):
         self._mode = "defer"
         self._shadow = shadow

+    @torch.no_grad()
+    def begin(self, length_hint: int, slot_mapping: torch.Tensor, seg_lens: Optional[torch.Tensor] = None):
+        """
+        Begin staging for a verification window.
+        Called by flash_attn backend before staging tokens.
+
+        Args:
+            length_hint: Expected number of tokens to stage
+            slot_mapping: Slot mapping tensor for all tokens in this window
+            seg_lens: Segment lengths (optional, for context)
+        """
+        if self._mode == "defer" and self._shadow is not None:
+            # Store slot_mapping for use in stage() calls
+            self._slot_mapping = slot_mapping
+            # Initialize shadow buffer for this verification window
+            self._shadow.begin(length_hint)
+
+    @torch.no_grad()
+    def stage(self, layer_idx: int, t: int, k_slice: torch.Tensor, v_slice: torch.Tensor):
+        """
+        Stage a single timestep's KV during verification.
+        Called by flash_attn backend for each token being verified.
+
+        Args:
+            layer_idx: Transformer layer index
+            t: Position in the staging buffer (0-indexed)
+            k_slice: Key tensor [1, H, D]
+            v_slice: Value tensor [1, H, D]
+        """
+        if self._mode == "defer" and self._shadow is not None:
+            # Extract slot mapping for this specific timestep
+            if self._slot_mapping is not None:
+                slot_t = self._slot_mapping[t:t+1]
+            else:
+                # Fallback: create a dummy slot mapping
+                slot_t = torch.tensor([t], dtype=torch.int64, device=k_slice.device)
+
+            # Stage in shadow buffer
+            self._shadow.stage(layer_idx, t, k_slice, v_slice, slot_t)
+        elif self._mode == "immediate":
+            # In immediate mode, write directly to persistent cache
+            if self._slot_mapping is not None:
+                slot_t = self._slot_mapping[t:t+1]
+            else:
+                slot_t = torch.tensor([t], dtype=torch.int64, device=k_slice.device)
+            self._persistent.append_slice(layer_idx, k_slice, v_slice, slot_t)
+
     @torch.no_grad()
     def write(self,
               layer_idx: int,
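
The new methods assume a shadow buffer exposing begin(length_hint) and stage(layer_idx, t, k, v, slot); that class is not part of this diff. A minimal sketch of the per-window lifecycle, with ShadowKVBuffer as a hypothetical stand-in and invented tensor shapes:

import torch

class ShadowKVBuffer:
    """Hypothetical stand-in for the shadow buffer the router defers to;
    it only mirrors the begin()/stage() calls made by KVWriteRouter above."""

    def __init__(self):
        self.entries = []      # staged (layer_idx, t, k, v, slot) tuples
        self.length_hint = 0

    def begin(self, length_hint: int) -> None:
        # Reset staging state for a new verification window.
        self.entries.clear()
        self.length_hint = length_hint

    def stage(self, layer_idx, t, k_slice, v_slice, slot_t) -> None:
        # Record one timestep's KV for one layer; nothing is persisted
        # until the window is committed.
        self.entries.append((layer_idx, t, k_slice, v_slice, slot_t))

# Per-window lifecycle as the attention backend drives it (one layer,
# 4 draft tokens, H=8 heads, D=64 head dim -- all invented values):
shadow = ShadowKVBuffer()
slot_mapping = torch.arange(4, dtype=torch.int64)
shadow.begin(length_hint=4)
for t in range(4):
    k = torch.randn(1, 8, 64)
    v = torch.randn(1, 8, 64)
    shadow.stage(layer_idx=0, t=t, k_slice=k, v_slice=v, slot_t=slot_mapping[t:t + 1])
print(len(shadow.entries))  # 4 staged timesteps awaiting commit()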

vllm/v1/worker/gpu_model_runner.py

Lines changed: 6 additions & 2 deletions
@@ -2353,10 +2353,14 @@ def propose_draft_token_ids(sampled_token_ids):
         # NWOR: Commit accepted tokens and disarm router
         if self._router_token is not None:
             if isinstance(valid_sampled_token_ids, list):
-                accepted_lens = torch.tensor([len(seq) for seq in valid_sampled_token_ids],
+                # Compute actual draft tokens accepted (exclude bonus token)
+                # valid_sampled_token_ids includes: [accepted_draft_tokens..., bonus_token]
+                # So len(seq) - 1 gives the number of accepted draft tokens
+                accepted_lens = torch.tensor([max(0, len(seq) - 1) for seq in valid_sampled_token_ids],
                                              dtype=torch.int32, device=self.device)
             else:
-                accepted_lens = torch.tensor([1] * len(valid_sampled_token_ids),
+                # Non-list case: assume no draft tokens accepted (only bonus)
+                accepted_lens = torch.tensor([0] * len(valid_sampled_token_ids),
                                              dtype=torch.int32, device=self.device)

         self.drafter.kv_router.commit(accepted_lens)
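
commit() itself is not shown in this diff; a plausible reading of what it does with accepted_lens, sketched for a single sequence. FakePersistentWriter and commit_window are illustrative stand-ins (though append_slice matches the call the router makes above):

import torch

class FakePersistentWriter:
    """Illustrative stand-in for PersistentKVWriter; records writes."""

    def __init__(self):
        self.written = []

    def append_slice(self, layer_idx, k, v, slot) -> None:
        self.written.append((layer_idx, int(slot[0])))

def commit_window(staged, persistent, accepted_len: int) -> None:
    # Persist only timesteps below accepted_len (the accepted draft
    # tokens); discard the rest and reset for the next window.
    for layer_idx, t, k, v, slot in staged:
        if t < accepted_len:
            persistent.append_slice(layer_idx, k, v, slot)
    staged.clear()

persistent = FakePersistentWriter()
staged = [(0, t, torch.zeros(1, 8, 64), torch.zeros(1, 8, 64),
           torch.tensor([t], dtype=torch.int64)) for t in range(4)]
commit_window(staged, persistent, accepted_len=2)
print(persistent.written)  # [(0, 0), (0, 1)] -> only accepted timesteps persisted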
