
Commit 37f013e (parent: f49a5ea)

Add tests for quantized target / draft model

Signed-off-by: Tomas Ruiz <[email protected]>

3 files changed (+43, -9)

tests/v1/e2e/test_spec_decode.py (29 additions & 6 deletions)
@@ -399,14 +399,37 @@ class ArgsTest:
 
 @pytest.mark.parametrize("args", cases)
 @pytest.mark.parametrize("enforce_eager", [True, False])
-def test_draft_model_correctness(
-    args: ArgsTest,
-    enforce_eager: bool,
-    monkeypatch: pytest.MonkeyPatch,
-):
+def test_draft_model_correctness(args: ArgsTest, enforce_eager: bool):
+    assert_draft_model_correctness(args, enforce_eager)
+
+
+@pytest.mark.parametrize(
+    "models",
+    [
+        # target_model, draft_model
+        ("Qwen/Qwen3-1.7B-FP8", "Qwen/Qwen3-0.6B"),  # target quantized
+        ("Qwen/Qwen3-1.7B", "Qwen/Qwen3-0.6B-FP8"),  # draft quantized
+    ],
+    ids=["target_quantized", "draft_quantized"],
+)
+@pytest.mark.parametrize("enforce_eager", [True, False])
+def test_draft_model_quantization(models: tuple[str, str], enforce_eager: bool):
+    tgt_model, draft_model = models
+    sd_case = ArgsTest(
+        model=tgt_model,
+        draft_model=draft_model,
+        sampling_config=greedy_sampling(),
+        num_speculative_tokens=3,
+        expected_acceptance_len=2.95 + 1,
+        expected_acceptance_rate=0.95,
+        expected_same_output_fraction=0.95,
+    )
+    assert_draft_model_correctness(sd_case, enforce_eager)
+
+
+def assert_draft_model_correctness(args: ArgsTest, enforce_eager: bool):
     """Compare the outputs using and not using speculative decoding.
     In the greedy decoding case, the outputs must match EXACTLY."""
-    monkeypatch.setenv("VLLM_USE_V1", "1")
     test_prompts = get_test_prompts(mm_enabled=False, quiet=True)
 
     spec_llm = LLM(

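For readers who want to try the new quantization pairing outside the test harness, here is a minimal sketch against vLLM's public LLM API, mirroring the "draft_quantized" case (unquantized target, FP8 draft). The speculative_config keys follow vLLM's documented dict format, but the exact keys, prompt, and max_tokens here are illustrative assumptions, not code from this commit:

    from vllm import LLM, SamplingParams

    # Sketch of the "draft_quantized" case: unquantized target, FP8 draft.
    llm = LLM(
        model="Qwen/Qwen3-1.7B",
        speculative_config={
            "model": "Qwen/Qwen3-0.6B-FP8",
            "num_speculative_tokens": 3,
        },
    )
    # Greedy decoding, matching the test's greedy_sampling() config.
    params = SamplingParams(temperature=0.0, max_tokens=32)
    print(llm.generate(["The capital of France is"], params)[0].outputs[0].text)

The assertions encode the expectation that quantizing only one side of the target/draft pair barely hurts speculation quality: an expected acceptance length of 2.95 + 1 and a 95% acceptance rate.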
vllm/config/vllm.py (8 additions & 0 deletions)
@@ -757,6 +757,14 @@ def compile_debug_dump_path(self) -> Path | None:
         path = self.compilation_config.debug_dump_path / append_path
         return path
 
+    def replace(self, **kwargs):
+        """
+        Replace attributes of the config and recompute derived state.
+        dataclasses.replace() calls __init__() and __post_init__(); see:
+        https://docs.python.org/3/library/dataclasses.html#dataclasses.replace
+        """
+        return replace(self, **kwargs)
+
     def __str__(self):
         return (
             f"model={self.model_config.model!r}, "

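The docstring's pointer is the load-bearing detail here: dataclasses.replace() constructs a brand-new instance through __init__(), so __post_init__() runs again and derived state is recomputed rather than copied from the old object. A self-contained toy illustration (Cfg is invented for this example, not vLLM code):

    from dataclasses import dataclass, replace

    @dataclass
    class Cfg:
        model: str
        quant: str | None = None

        def __post_init__(self):
            # Derived state, standing in for VllmConfig's computed fields.
            self.summary = f"{self.model} quant={self.quant}"

    c1 = Cfg("Qwen3-1.7B-FP8", quant="fp8")
    c2 = replace(c1, model="Qwen3-0.6B", quant=None)
    print(c2.summary)  # "Qwen3-0.6B quant=None" -- recomputed, not copied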
vllm/v1/spec_decode/draft_model.py (6 additions & 3 deletions)
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from dataclasses import dataclass, replace
+from dataclasses import dataclass
 from typing import Any
 
 import torch
@@ -118,8 +118,11 @@ def load_model(self, target_model: Any) -> None:
         draft_model_config: ModelConfig = (
             self.vllm_config.speculative_config.draft_model_config
         )
-        vllm_config_draft: VllmConfig = replace(
-            self.vllm_config, model_config=draft_model_config
+        # Recompute quant_config, which is configured for the target model,
+        # but the draft model might not be quantized.
+        vllm_config_draft: VllmConfig = self.vllm_config.replace(
+            quant_config=None,
+            model_config=draft_model_config,
         )
 
         # This must be computed before loading the draft model

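Why passing quant_config=None matters: VllmConfig.__post_init__ derives a quantization config from the active model_config when quant_config is unset, so clearing it forces re-derivation for the draft model, whereas the previous bare dataclasses.replace() carried the target's quant_config over verbatim. A runnable toy model of the before/after behavior (ToyVllmConfig and its FP8 check are invented for illustration; the real derivation lives in VllmConfig.__post_init__):

    from dataclasses import dataclass, replace

    @dataclass
    class ToyVllmConfig:
        model_config: str
        quant_config: str | None = None

        def __post_init__(self):
            # Mirrors VllmConfig: derive quant_config only when it is unset.
            if self.quant_config is None and self.model_config.endswith("-FP8"):
                self.quant_config = "fp8"

        def replace(self, **kwargs):
            return replace(self, **kwargs)

    target = ToyVllmConfig("Qwen3-1.7B-FP8")       # quant_config == "fp8"

    # Old behavior: quant_config is copied verbatim, mislabeling the draft.
    stale = replace(target, model_config="Qwen3-0.6B")
    assert stale.quant_config == "fp8"             # stale target setting

    # New behavior: clear quant_config so __post_init__ re-derives it.
    fresh = target.replace(quant_config=None, model_config="Qwen3-0.6B")
    assert fresh.quant_config is None              # draft is unquantized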