vllm-project · njhill · Nov 1, 2025 · Oct 13, 2025 · Oct 15, 2025 · Oct 16, 2025
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -6,6 +6,9 @@
 
 from tblib import pickling_support
 
+# Re-export the sample_json_schema fixture so tests outside v1/entrypoints can use it
+from tests.v1.entrypoints.conftest import sample_json_schema  # noqa
+
 # ruff: noqa
 
 # Install support for pickling exceptions so that we can nicely propagate

@@ -337,8 +337,6 @@ def test_stop_via_update_from_output():
         num_common_prefix_blocks=[],
         finished_req_ids=set(),
         free_encoder_mm_hashes=[],
-        structured_output_request_ids=[],
-        grammar_bitmask=None,
     )
 
     model_output = ModelRunnerOutput(
@@ -385,8 +383,6 @@ def test_stop_via_update_from_output():
         num_common_prefix_blocks=[],
         finished_req_ids=set(),
         free_encoder_mm_hashes=[],
-        structured_output_request_ids=[],
-        grammar_bitmask=None,
     )
 
     model_output = ModelRunnerOutput(
@@ -431,8 +427,6 @@ def test_stop_via_update_from_output():
         num_common_prefix_blocks=[],
         finished_req_ids=set(),
         free_encoder_mm_hashes=[],
-        structured_output_request_ids=[],
-        grammar_bitmask=None,
     )
 
     model_output = ModelRunnerOutput(
@@ -472,8 +466,6 @@ def test_stop_via_update_from_output():
         num_common_prefix_blocks=[],
         finished_req_ids=set(),
         free_encoder_mm_hashes=[],
-        structured_output_request_ids=[],
-        grammar_bitmask=None,
     )
 
     model_output = ModelRunnerOutput(
@@ -1988,7 +1980,6 @@ def test_schedule_skip_tokenizer_init():
         scheduler.add_request(request)
     output = scheduler.schedule()
     assert len(output.scheduled_new_reqs) == len(requests)
-    assert output.grammar_bitmask is None
 
 
 def test_schedule_skip_tokenizer_init_structured_output_request():

diff --git a/tests/v1/e2e/test_async_sched_and_preempt.py b/tests/v1/e2e/test_async_scheduling.py
rename from tests/v1/e2e/test_async_sched_and_preempt.py
rename to tests/v1/e2e/test_async_scheduling.py
@@ -7,6 +7,7 @@
 
 from vllm import SamplingParams
 from vllm.logprobs import Logprob
+from vllm.sampling_params import StructuredOutputsParams
 
 from ...conftest import VllmRunner
 from ...models.utils import check_outputs_equal
@@ -15,9 +16,12 @@
 
 
 @dynamo_config.patch(cache_size_limit=16)
-def test_preempt_and_async_scheduling_e2e(monkeypatch: pytest.MonkeyPatch):
+def test_preempt_and_async_scheduling_e2e(
+    sample_json_schema, monkeypatch: pytest.MonkeyPatch
+):
     """Test consistency of combos of async scheduling, preemption,
-    uni/multiproc executor, and various sampling parameters."""
+    uni/multiproc executor, and various sampling parameters
+    including structured outputs."""
 
     first_prompt = (
         "The following numbers of the sequence "
@@ -35,6 +39,12 @@ def test_preempt_and_async_scheduling_e2e(monkeypatch: pytest.MonkeyPatch):
         dict(bad_words=["the", " the"]),
         dict(logprobs=2),
         dict(logprobs=2, presence_penalty=-1.0),
+        dict(structured_outputs=StructuredOutputsParams(json=sample_json_schema)),
+        dict(
+            structured_outputs=StructuredOutputsParams(json=sample_json_schema),
+            logprobs=2,
+            presence_penalty=-1.0,
+        ),
     ]
 
     default_params = dict(

diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py
@@ -248,7 +248,7 @@ def execute_model(
             self,
             scheduler_output,
             non_block=False,
-        ) -> Future[ModelRunnerOutput]:
+        ) -> Future[ModelRunnerOutput | None]:
             """Make execute_model non-blocking."""
 
             # DummyExecutor used only for testing async case.
@@ -263,6 +263,23 @@ def _execute():
             # Use the thread pool instead of creating a new thread
             return self.thread_pool.submit(_execute)
 
+        def sample_tokens(
+            self, grammar_output, non_block=False
+        ) -> Future[ModelRunnerOutput]:
+            """Make sample_tokens non-blocking."""
+
+            # DummyExecutor only exercises the async path, so non_block must be set.
+            assert non_block
+
+            def _execute():
+                output = self.collective_rpc("sample_tokens", args=(grammar_output,))
+                # Return a deep copy of output[0], since the original object
+                # may be reused by the next batch.
+                return copy.deepcopy(output[0])
+
+            # Run on the executor's thread pool rather than spawning a new thread.
+            return self.thread_pool.submit(_execute)
+
         @property
         def max_concurrent_batches(self) -> int:
             return 2

diff --git a/tests/v1/executor/test_executor.py b/tests/v1/executor/test_executor.py
@@ -31,7 +31,9 @@ def collective_rpc(
         # Drop marker to show that this was run
         with open(".marker", "w"):
             ...
-        return super().collective_rpc(method, timeout, args, kwargs)
+        return super().collective_rpc(
+            method, timeout, args, kwargs, non_block, unique_reply_rank
+        )
 
 
 CustomMultiprocExecutorAsync = CustomMultiprocExecutor

@@ -26,8 +26,6 @@ def _make_empty_scheduler_output():
         num_common_prefix_blocks=[],
         finished_req_ids=set(),
         free_encoder_mm_hashes=[],
-        structured_output_request_ids=[],
-        grammar_bitmask=None,
         kv_connector_metadata=SharedStorageConnectorMetadata(),
     )
 

@@ -981,9 +981,7 @@ def test_scheduler_kv_connector_stats_aggregation():
         scheduled_encoder_inputs={},
         num_common_prefix_blocks=[0],
         finished_req_ids=set(),
-        free_encoder_mm_hashes=set(),
-        structured_output_request_ids={},
-        grammar_bitmask=None,
+        free_encoder_mm_hashes=[],
     )
 
     engine_core_outputs = scheduler.update_from_output(scheduler_output, model_output)

diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py
@@ -92,8 +92,6 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput:
         num_common_prefix_blocks=[],
         finished_req_ids=set(),
         free_encoder_mm_hashes=[],
-        structured_output_request_ids=[],
-        grammar_bitmask=None,
     )
 
 
@@ -171,8 +169,6 @@ def test_update_states_request_finished(model_runner):
         num_common_prefix_blocks=[],
         finished_req_ids={req_id},
         free_encoder_mm_hashes=[],
-        structured_output_request_ids=[],
-        grammar_bitmask=None,
     )
 
     model_runner._update_states(scheduler_output)
@@ -201,8 +197,6 @@ def test_update_states_request_resumed(model_runner):
         num_common_prefix_blocks=[],
         finished_req_ids=set(),
         free_encoder_mm_hashes=[],
-        structured_output_request_ids=[],
-        grammar_bitmask=None,
     )
 
     model_runner._update_states(scheduler_output)
@@ -230,8 +224,6 @@ def test_update_states_request_resumed(model_runner):
         num_common_prefix_blocks=[],
         finished_req_ids=set(),
         free_encoder_mm_hashes=[],
-        structured_output_request_ids=[],
-        grammar_bitmask=None,
     )
 
     model_runner._update_states(scheduler_output)
@@ -261,8 +253,6 @@ def test_update_states_no_changes(model_runner):
         num_common_prefix_blocks=[],
         finished_req_ids=set(),
         free_encoder_mm_hashes=[],
-        structured_output_request_ids=[],
-        grammar_bitmask=None,
     )
 
     model_runner._update_states(scheduler_output)
@@ -296,8 +286,6 @@ def test_update_states_request_unscheduled(model_runner):
         num_common_prefix_blocks=[],
         finished_req_ids=set(),
         free_encoder_mm_hashes=[],
-        structured_output_request_ids=[],
-        grammar_bitmask=None,
     )
 
     model_runner._update_states(scheduler_output)

diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py
@@ -150,8 +150,6 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput:
         num_common_prefix_blocks=[],
         finished_req_ids=set(),
         free_encoder_mm_hashes=[],
-        structured_output_request_ids=[],
-        grammar_bitmask=None,
     )
 
 
@@ -216,8 +214,6 @@ def test_update_states_request_finished(model_runner, dist_init):
         num_common_prefix_blocks=[],
         finished_req_ids={req_id},
         free_encoder_mm_hashes=[],
-        structured_output_request_ids=[],
-        grammar_bitmask=None,
     )
 
     metadata_before = model_runner.input_batch.sampling_metadata
@@ -248,8 +244,6 @@ def test_update_states_request_resumed(model_runner, dist_init):
         num_common_prefix_blocks=[],
         finished_req_ids=set(),
         free_encoder_mm_hashes=[],
-        structured_output_request_ids=[],
-        grammar_bitmask=None,
     )
 
     model_runner._update_states(scheduler_output)
@@ -277,8 +271,6 @@ def test_update_states_request_resumed(model_runner, dist_init):
         num_common_prefix_blocks=[],
         finished_req_ids=set(),
         free_encoder_mm_hashes=[],
-        structured_output_request_ids=[],
-        grammar_bitmask=None,
     )
 
     metadata_before = model_runner.input_batch.sampling_metadata
@@ -370,8 +362,6 @@ def test_update_states_no_changes(model_runner, dist_init):
         num_common_prefix_blocks=[],
         finished_req_ids=set(),
         free_encoder_mm_hashes=[],
-        structured_output_request_ids=[],
-        grammar_bitmask=None,
     )
 
     metadata_before = model_runner.input_batch.sampling_metadata
@@ -407,8 +397,6 @@ def test_update_states_request_unscheduled(model_runner, dist_init):
         num_common_prefix_blocks=[],
         finished_req_ids=set(),
         free_encoder_mm_hashes=[],
-        structured_output_request_ids=[],
-        grammar_bitmask=None,
     )
 
     metadata_before = model_runner._update_states(scheduler_output)

@@ -6,7 +6,7 @@
 
 from collections.abc import Sequence
 from concurrent.futures import CancelledError, Future
-from typing import TYPE_CHECKING, Literal, cast
+from typing import TYPE_CHECKING, Literal
 
 import torch
 
@@ -138,8 +138,11 @@ def from_connector(cls, connector: "KVConnectorBase", world_size: int):
         return cls(connector.get_finished_count() or world_size)
 
     def aggregate(
-        self, outputs: list[ModelRunnerOutput], output_rank: int = 0
-    ) -> ModelRunnerOutput:
+        self, outputs: list[ModelRunnerOutput | None], output_rank: int = 0
+    ) -> ModelRunnerOutput | None:
+        if not outputs[output_rank]:
+            return None
+
         # Aggregate kv_connector_output from all workers
 
         def update_finished_set(
@@ -161,6 +164,7 @@ def update_finished_set(
         aggregated_kv_connector_stats = None
         invalid_block_ids = set[int]()
         for model_runner_output in outputs:
+            assert model_runner_output is not None
             kv_output = model_runner_output.kv_connector_output
             if not kv_output:
                 continue
@@ -204,6 +208,7 @@ def update_finished_set(
         # select output of the worker specified by output_rank
         output = outputs[output_rank]
 
+        assert output is not None
         output.kv_connector_output = KVConnectorOutput(
             finished_sending=finished_sending or None,
             finished_recving=finished_recving or None,
@@ -215,13 +220,16 @@ def update_finished_set(
         return output
 
     def async_aggregate(
-        self, output_futures: Sequence[Future[ModelRunnerOutput]], output_rank: int = 0
-    ) -> Future[ModelRunnerOutput]:
+        self,
+        output_futures: Sequence[Future[ModelRunnerOutput | None]],
+        output_rank: int = 0,
+    ) -> Future[ModelRunnerOutput | None]:
         """Takes a list of futures and returns a single future which resolves
         to the respective list of outputs."""
-        result_future: Future[ModelRunnerOutput] = Future()
+        result_future: Future[ModelRunnerOutput | None] = Future()
 
         outputs: list[ModelRunnerOutput | None] = [None] * len(output_futures)
+        remaining = len(output_futures)
 
         def make_callback(idx):
             def callback(fut):
@@ -236,12 +244,10 @@ def callback(fut):
                     result_future.set_exception(e)
 
                 # this check assumes io_thread_pool uses a single thread
-                if all(outputs):
-                    result_future.set_result(
-                        self.aggregate(
-                            cast(list[ModelRunnerOutput], outputs), output_rank
-                        )
-                    )
+                nonlocal remaining
+                remaining -= 1
+                if not remaining:
+                    result_future.set_result(self.aggregate(outputs, output_rank))
 
             return callback
 

@@ -15,8 +15,12 @@ def _update_after_schedule(
         scheduler_output: SchedulerOutput,
     ) -> None:
         super()._update_after_schedule(scheduler_output)
+        pending_structured_output_tokens = False
         for req_id in scheduler_output.num_scheduled_tokens:
             request = self.requests[req_id]
+            pending_structured_output_tokens |= (
+                request.use_structured_output and request.num_output_placeholders > 0
+            )
             if (
                 request.num_computed_tokens
                 == request.num_tokens + request.num_output_placeholders
@@ -25,6 +29,10 @@ def _update_after_schedule(
                 # TODO(woosuk): Support speculative decoding.
                 request.num_output_placeholders += 1
 
+        scheduler_output.pending_structured_output_tokens = (
+            pending_structured_output_tokens
+        )
+
     def _update_request_with_output(
         self,
         request: Request,

@@ -6,7 +6,7 @@
 
 if TYPE_CHECKING:
     from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1
-    from vllm.v1.core.sched.output import SchedulerOutput
+    from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
     from vllm.v1.engine import EngineCoreOutputs
     from vllm.v1.metrics.stats import SchedulerStats
     from vllm.v1.outputs import DraftTokenIds, ModelRunnerOutput
@@ -40,6 +40,12 @@ def schedule(self) -> "SchedulerOutput":
         """
         raise NotImplementedError
 
+    @abstractmethod
+    def get_grammar_bitmask(
+        self, scheduler_output: "SchedulerOutput"
+    ) -> "GrammarOutput | None":
+        raise NotImplementedError  # abstract hook: subclasses must override
+
     @abstractmethod
     def update_from_output(
         self,