
Commit 04e4a88

support stream interval
Signed-off-by: elvischenv <[email protected]>
1 parent 33a0ea5 commit 04e4a88

5 files changed: +57 −4 lines changed

vllm/config/scheduler.py

Lines changed: 6 additions & 0 deletions
@@ -137,6 +137,12 @@ class SchedulerConfig:
     structured outputs, speculative decoding, and pipeline parallelism.
     """

+    stream_interval: int = field(default=1, ge=1)
+    """The interval (or buffer size) for streaming in terms of token length.
+    A smaller value (1) makes streaming smoother by sending each token immediately,
+    while a larger value (e.g., 10) reduces host overhead and increases throughput
+    by batching multiple tokens before sending."""
+
     def compute_hash(self) -> str:
         """
         WARNING: Whenever a new field is added to this config,
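
The new field is declared with ge=1, so values below 1 should be rejected when the config is validated. A minimal sketch of the intended semantics, assuming SchedulerConfig can be constructed standalone with its defaults and is importable from vllm.config:

from vllm.config import SchedulerConfig

# stream_interval=1 (the default) flushes every token to the client as soon as it
# is generated; larger values buffer tokens on the host and flush them in chunks.
cfg = SchedulerConfig(stream_interval=4)
assert cfg.stream_interval == 4
# SchedulerConfig(stream_interval=0) is expected to fail the ge=1 constraint.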

vllm/engine/arg_utils.py

Lines changed: 6 additions & 0 deletions
@@ -551,6 +551,8 @@ class EngineArgs:

     async_scheduling: bool = SchedulerConfig.async_scheduling

+    stream_interval: int = SchedulerConfig.stream_interval
+
     kv_sharing_fast_prefill: bool = CacheConfig.kv_sharing_fast_prefill

     def __post_init__(self):

@@ -1044,6 +1046,9 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         scheduler_group.add_argument(
             "--async-scheduling", **scheduler_kwargs["async_scheduling"]
         )
+        scheduler_group.add_argument(
+            "--stream-interval", **scheduler_kwargs["stream_interval"]
+        )

         # Compilation arguments
         compilation_kwargs = get_kwargs(CompilationConfig)

@@ -1588,6 +1593,7 @@ def create_engine_config(
             long_prefill_token_threshold=self.long_prefill_token_threshold,
             disable_hybrid_kv_cache_manager=self.disable_hybrid_kv_cache_manager,
             async_scheduling=self.async_scheduling,
+            stream_interval=self.stream_interval,
         )

         if not model_config.is_multimodal_model and self.default_mm_loras:
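
With the flag plumbed through EngineArgs, the interval can be set from the CLI or when building the engine config programmatically. A hedged usage sketch: the model name is only illustrative, and create_engine_config() is assumed to be callable without arguments here.

# CLI (assumed standard vLLM serve entrypoint):
#   vllm serve facebook/opt-125m --stream-interval 4

from vllm.engine.arg_utils import EngineArgs

engine_args = EngineArgs(model="facebook/opt-125m", stream_interval=4)
# create_engine_config() forwards the value into SchedulerConfig, per the hunk above.
vllm_config = engine_args.create_engine_config()
print(vllm_config.scheduler_config.stream_interval)  # expected: 4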

vllm/v1/engine/async_llm.py

Lines changed: 2 additions & 1 deletion
@@ -128,8 +128,9 @@ def __init__(
         )

         # OutputProcessor (converts EngineCoreOutputs --> RequestOutput).
+        stream_interval = self.vllm_config.scheduler_config.stream_interval
         self.output_processor = OutputProcessor(
-            self.tokenizer, log_stats=self.log_stats
+            self.tokenizer, log_stats=self.log_stats, stream_interval=stream_interval
         )
         endpoint = self.observability_config.otlp_traces_endpoint
         if endpoint is not None:
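
Both engine front-ends (async and sync) read the interval from the scheduler config and hand it to the OutputProcessor, which defaults to 1 so existing callers keep per-token streaming. A minimal construction sketch; the tokenizer setup is illustrative and not what AsyncLLM itself does:

from transformers import AutoTokenizer
from vllm.v1.engine.output_processor import OutputProcessor

tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")  # illustrative tokenizer
processor = OutputProcessor(tokenizer, log_stats=False, stream_interval=4)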

vllm/v1/engine/llm_engine.py

Lines changed: 2 additions & 1 deletion
@@ -109,8 +109,9 @@ def __init__(
         )

         # OutputProcessor (convert EngineCoreOutputs --> RequestOutput).
+        stream_interval = self.vllm_config.scheduler_config.stream_interval
         self.output_processor = OutputProcessor(
-            self.tokenizer, log_stats=self.log_stats
+            self.tokenizer, log_stats=self.log_stats, stream_interval=stream_interval
         )
         endpoint = self.observability_config.otlp_traces_endpoint
         if endpoint is not None:

vllm/v1/engine/output_processor.py

Lines changed: 41 additions & 2 deletions
@@ -99,6 +99,7 @@ def __init__(
         arrival_time: float,
         queue: RequestOutputCollector | None,
         log_stats: bool,
+        stream_interval: int,
         top_p: float | None = None,
         n: int | None = None,
         temperature: float | None = None,

@@ -126,6 +127,11 @@ def __init__(

         self.stats = RequestStateStats(arrival_time=arrival_time) if log_stats else None

+        # Stream Interval
+        self.stream_interval = stream_interval
+        self.total_num_output_tokens = 0  # Track total num of output tokens
+        self.sent_tokens_offset = 0  # Offset of sent tokens
+
     @classmethod
     def from_new_request(
         cls,

@@ -136,6 +142,7 @@ def from_new_request(
         request_index: int,
         queue: RequestOutputCollector | None,
         log_stats: bool,
+        stream_interval: int,
     ) -> "RequestState":
         if sampling_params := request.sampling_params:
             if not sampling_params.detokenize:

@@ -183,6 +190,7 @@ def from_new_request(
             arrival_time=request.arrival_time,
             queue=queue,
             log_stats=log_stats,
+            stream_interval=stream_interval,
         )

     def make_request_output(

@@ -200,13 +208,40 @@ def make_request_output(
             # Only the final output is required in FINAL_ONLY mode.
             return None

+        # Stream Interval buffering: only apply for DELTA mode and stream_interval > 1
+        is_delta_streaming = self.output_kind == RequestOutputKind.DELTA
+        if is_delta_streaming and self.stream_interval > 1:
+            # Track total tokens generated
+            self.total_num_output_tokens += len(new_token_ids)
+
+            # should send output when it is the first token or reach the stream interval
+            should_send_output = (
+                self.sent_tokens_offset == 0
+                or self.total_num_output_tokens - self.sent_tokens_offset
+                >= self.stream_interval
+            )
+
+            # Do NOT send output if not finished and should not send output
+            if not finished and not should_send_output:
+                return None
+
+            # Send tokens from the offset
+            assert self.detokenizer is not None
+            tokens_to_send = self.detokenizer.output_token_ids[
+                self.sent_tokens_offset :
+            ]
+            self.sent_tokens_offset = len(self.detokenizer.output_token_ids)
+        else:
+            # Send tokens immediately
+            tokens_to_send = new_token_ids
+
         request_id = self.request_id
         if pooling_output is not None:
             return self._new_request_output(
                 request_id, [self._new_pooling_output(pooling_output)], finished
             )

-        output = self._new_completion_output(new_token_ids, finish_reason, stop_reason)
+        output = self._new_completion_output(tokens_to_send, finish_reason, stop_reason)

         if self.parent_req is None:
             outputs = [output]

@@ -305,9 +340,12 @@ def _new_pooling_output(
 class OutputProcessor:
     """Process EngineCoreOutputs into RequestOutputs."""

-    def __init__(self, tokenizer: AnyTokenizer, log_stats: bool):
+    def __init__(
+        self, tokenizer: AnyTokenizer, log_stats: bool, stream_interval: int = 1
+    ):
         self.log_stats = log_stats
         self.tokenizer = tokenizer
+        self.stream_interval = stream_interval
         self.request_states: dict[str, RequestState] = {}
         self.parent_requests: dict[str, ParentRequest] = {}
         self.lora_states = LoRARequestStates()

@@ -380,6 +418,7 @@ def add_request(
             request_index=request_index,
             queue=queue,
             log_stats=self.log_stats,
+            stream_interval=self.stream_interval,
         )
         self.request_states[request_id] = req_state
         self.lora_states.add_request(req_state)