
Commit 65f57a3

Restore Bug #4 and #5 fixes: nucleus and smoothing diagnostics
Bug #4 fix: change the nucleus top_p fallback from 1.0 to 0.95 and add [NUCLEUS_DEBUG] diagnostic logging. This ensures nucleus filtering runs even when the config attribute is missing, instead of leaving all 32000 tokens (the full vocabulary) as survivors. Bug #5 fix: add [SMOOTH_DEBUG] diagnostic logging for the smoothing lambda.

These fixes were accidentally removed during the bug #2 draft-anchored rewrite (commit 595a371). Restoring them does not affect bug #2's core algorithm; they only improve fallback behavior and diagnostics.
1 parent 595a371 commit 65f57a3
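
For context, the fallback pattern used by the Bug #4 fix reads the config value with getattr and then coerces falsy values with `or`, so a missing attribute, an explicit None, and 0.0 all resolve to 0.95 and nucleus filtering still runs. A minimal sketch of the idiom, using a stand-in config object rather than the real vLLM config class:

    class FakeConfig:
        pass  # stand-in config; note draft_top_p is deliberately absent

    cfg = FakeConfig()

    # getattr supplies 0.95 when the attribute is missing; `or 0.95`
    # additionally catches an explicit None or 0.0, either of which
    # would otherwise disable nucleus filtering.
    top_p = float(getattr(cfg, "draft_top_p", 0.95) or 0.95)
    assert top_p == 0.95

    cfg.draft_top_p = None  # attribute present but unset
    top_p = float(getattr(cfg, "draft_top_p", 0.95) or 0.95)
    assert top_p == 0.95    # still falls back, so 0.0 < top_p < 1.0 holds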

File tree: 1 file changed, +5 -1 lines

vllm/v1/spec_decode/eagle.py

Lines changed: 5 additions & 1 deletion
@@ -258,7 +258,9 @@ def _sample_draft_tokens(
         x = torch.full_like(x, float("-inf")).scatter(-1, idx, vals)
 
         # Top-p (nucleus) with correct boundary rule
-        top_p = float(getattr(self.opt_config, "draft_top_p", 1.0) or 1.0)
+        top_p = float(getattr(self.opt_config, "draft_top_p", 0.95) or 0.95)
+        print(f"[NUCLEUS_DEBUG] draft_top_p from config: {top_p}, will run nucleus: {0.0 < top_p < 1.0}",
+              file=sys.stderr, flush=True)
         if 0.0 < top_p < 1.0:
             p = torch.softmax(x, dim=-1)
             sp, si = torch.sort(p, dim=-1, descending=True)
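
The hunk ends just after the sort, so the boundary rule itself is not visible here. In most top-p implementations, a "correct boundary rule" means keeping the first token whose cumulative probability crosses top_p rather than dropping it, which guarantees at least one survivor. A self-contained sketch of that convention (an illustration under that assumption, not the actual eagle.py code):

    import torch

    def nucleus_filter(logits: torch.Tensor, top_p: float) -> torch.Tensor:
        # Work on probabilities sorted in descending order.
        p = torch.softmax(logits, dim=-1)
        sp, si = torch.sort(p, dim=-1, descending=True)
        cum = torch.cumsum(sp, dim=-1)
        # Drop a token only if the cumulative mass *before* it already
        # exceeds top_p; the token that crosses the boundary is kept,
        # so the first (highest-probability) token always survives.
        drop_sorted = (cum - sp) > top_p
        # Scatter the sorted-order mask back to vocabulary order.
        drop = torch.zeros_like(drop_sorted).scatter(-1, si, drop_sorted)
        return logits.masked_fill(drop, float("-inf"))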
@@ -272,6 +274,8 @@ def _sample_draft_tokens(
         # Optional smoothing with untempered baseline
         probs_full = torch.softmax(x, dim=-1)
         lam = float(getattr(self.opt_config, "draft_mix_lambda_max", 0.0) or 0.0)
+        print(f"[SMOOTH_DEBUG] lambda_max from config: {lam}, will run smoothing: {lam > 0.0}",
+              file=sys.stderr, flush=True)
         if lam > 0.0:
             base = torch.softmax(logits_f32, dim=-1)  # untempered baseline
             probs_full = (1.0 - lam) * probs_full + lam * base
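
The smoothing step is a convex combination of the tempered draft distribution and the untempered baseline, so the result remains a valid probability distribution whenever both inputs are. A toy check with made-up tensors (not eagle.py internals):

    import torch

    lam = 0.1
    probs_full = torch.tensor([0.7, 0.2, 0.1])  # tempered draft distribution
    base = torch.tensor([0.4, 0.35, 0.25])      # untempered baseline

    mixed = (1.0 - lam) * probs_full + lam * base
    print(mixed)  # tensor([0.6700, 0.2150, 0.1150])
    assert torch.isclose(mixed.sum(), torch.tensor(1.0))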
