10 changes: 10 additions & 0 deletions vllm/config/scheduler.py
@@ -159,6 +159,12 @@ class SchedulerConfig:
structured outputs, speculative decoding, and pipeline parallelism.
"""

async_execute_model: bool = False
"""EXPERIMENTAL: If set to True, perform async model execution.
This may help reduce the CPU overheads, leading to better latency
and throughput. Moreover, this rely on async scheduling.
"""

def compute_hash(self) -> str:
"""
WARNING: Whenever a new field is added to this config,
@@ -247,6 +253,10 @@ def __post_init__(self) -> None:
self.scheduler_cls = (
"vllm.v1.core.sched.async_scheduler.AsyncScheduler")

if self.async_execute_model:
assert self.async_scheduling, (
"async_execute_model requires async_scheduling to be True.")

@model_validator(mode='after')
def _verify_args(self) -> Self:
if (self.max_num_batched_tokens < self.max_model_len
45 changes: 30 additions & 15 deletions vllm/distributed/device_communicators/shm_broadcast.py
@@ -33,8 +33,8 @@ class SpinTimer:
def record_activity(self):
pass

def spin(self):
sched_yield()
def spin(self, sleep_time: Optional[float] = None):
sched_yield(sleep_time)


class SpinSleepTimer(SpinTimer):
@@ -370,7 +370,11 @@ def wait_until_ready(self):
assert recv == b"READY"

@contextmanager
def acquire_write(self, timeout: Optional[float] = None):
def acquire_write(
self,
timeout: Optional[float] = None,
sleep_time: Optional[float] = None,
):
assert self._is_writer, "Only writers can acquire write"
start_time = time.monotonic()
n_warning = 1
@@ -385,7 +389,7 @@ def acquire_write(self, timeout: Optional[float] = None):
# we need to wait until it is read by all readers

# Release the processor to other threads
sched_yield()
sched_yield(sleep_time)

# if we wait for a long time, log a message
if (time.monotonic() - start_time
@@ -428,9 +432,12 @@ def acquire_read(self,
break

@contextmanager
def acquire_read(self,
timeout: Optional[float] = None,
cancel: Optional[Event] = None):
def acquire_read(
self,
timeout: Optional[float] = None,
cancel: Optional[Event] = None,
sleep_time: Optional[float] = None,
):
assert self._is_local_reader, "Only readers can acquire read"
start_time = time.monotonic()
n_warning = 1
@@ -448,7 +455,7 @@ def acquire_read(self,
# we need to wait until it is written

# Release the processor to other threads
self._read_spin_timer.spin()
self._read_spin_timer.spin(sleep_time)

# if we wait for a long time, log a message
if (time.monotonic() - start_time
@@ -483,28 +490,36 @@ def acquire_read(self,
self._read_spin_timer.record_activity()
break

def enqueue(self, obj, timeout: Optional[float] = None):
def enqueue(
self,
obj,
timeout: Optional[float] = None,
sleep_time: Optional[float] = None,
):
""" Write to message queue with optional timeout (in seconds) """
assert self._is_writer, "Only writers can enqueue"
serialized_obj = pickle.dumps(obj, protocol=pickle.HIGHEST_PROTOCOL)
if self.n_local_reader > 0:
if len(serialized_obj) >= self.buffer.max_chunk_bytes:
with self.acquire_write(timeout) as buf:
with self.acquire_write(timeout, sleep_time) as buf:
buf[0] = 1 # overflow
self.local_socket.send(serialized_obj)
else:
with self.acquire_write(timeout) as buf:
with self.acquire_write(timeout, sleep_time) as buf:
buf[0] = 0 # not overflow
buf[1:len(serialized_obj) + 1] = serialized_obj
if self.n_remote_reader > 0:
self.remote_socket.send(serialized_obj)

def dequeue(self,
timeout: Optional[float] = None,
cancel: Optional[Event] = None):
def dequeue(
self,
timeout: Optional[float] = None,
cancel: Optional[Event] = None,
sleep_time: Optional[float] = None,
) -> Any:
""" Read from message queue with optional timeout (in seconds) """
if self._is_local_reader:
with self.acquire_read(timeout, cancel) as buf:
with self.acquire_read(timeout, cancel, sleep_time) as buf:
overflow = buf[0] == 1
if not overflow:
# no need to know the size of serialized object
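For context, a minimal usage sketch of the new sleep_time knob, assuming an already constructed MessageQueue named mq (construction elided); the parameter names come from this diff, while the 50-microsecond value is only an illustrative choice:

# Writer: yield roughly 50 microseconds per spin instead of busy-waiting,
# freeing the core for other threads in the worker process.
mq.enqueue(obj, timeout=1.0, sleep_time=5e-5)

# Local reader: the same knob applies to the read spin loop.
obj = mq.dequeue(timeout=1.0, sleep_time=5e-5)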
11 changes: 9 additions & 2 deletions vllm/distributed/utils.py
@@ -38,8 +38,15 @@
and sys.version_info[2] >= 8))


def sched_yield():
if USE_SCHED_YIELD:
def sched_yield(sleep_time: Optional[float] = None):
# When the worker process runs more than one thread, os.sched_yield()
# and time.sleep(0) only move the calling thread to the ready state, and
# the CPU may reschedule it immediately. Sleeping for a small amount of
# time puts the thread into the blocked state so that the CPU can
# schedule other threads.
if sleep_time is not None:
time.sleep(sleep_time)
elif USE_SCHED_YIELD:
os.sched_yield()
else:
time.sleep(0)
11 changes: 11 additions & 0 deletions vllm/engine/arg_utils.py
@@ -445,6 +445,8 @@ class EngineArgs:

async_scheduling: bool = SchedulerConfig.async_scheduling

async_execute_model: bool = SchedulerConfig.async_execute_model

kv_sharing_fast_prefill: bool = \
CacheConfig.kv_sharing_fast_prefill

@@ -864,6 +866,9 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
scheduler_group.add_argument("--async-scheduling",
**scheduler_kwargs["async_scheduling"])

scheduler_group.add_argument("--async-execute-model",
**scheduler_kwargs["async_execute_model"])

# vLLM arguments
vllm_kwargs = get_kwargs(VllmConfig)
vllm_group = parser.add_argument_group(
@@ -1254,6 +1259,12 @@ def create_engine_config(
raise ValueError("Async scheduling is not supported with "
"pipeline-parallel-size > 1.")

if self.async_execute_model:
# TODO(Ronald1995): Support async execute model with ray.
if self.distributed_executor_backend != "mp":
raise ValueError("Async execute model is only supported with "
"mp-based distributed executor backend.")

# Currently, async scheduling does not support speculative decoding.
# TODO(woosuk): Support it.
if self.speculative_config is not None:
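A short sketch of enabling the new flag programmatically, based only on the fields and checks added in this diff; the model name is a placeholder and the snippet is illustrative rather than a recommended configuration:

from vllm.engine.arg_utils import EngineArgs

# async_execute_model requires async_scheduling (SchedulerConfig assert),
# and create_engine_config() rejects it with a non-mp executor backend.
args = EngineArgs(
    model="facebook/opt-125m",          # placeholder model
    async_scheduling=True,              # prerequisite for async_execute_model
    async_execute_model=True,           # new EXPERIMENTAL flag from this PR
    distributed_executor_backend="mp",  # ray is not yet supported
)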
8 changes: 8 additions & 0 deletions vllm/v1/engine/core.py
@@ -155,6 +155,7 @@ def __init__(self,

self.request_block_hasher = get_request_block_hasher(
block_size, caching_hash_fn)
self.async_execute_model = self.vllm_config.scheduler_config.async_execute_model

def _initialize_kv_caches(
self, vllm_config: VllmConfig) -> tuple[int, int, KVCacheConfig]:
@@ -341,6 +342,13 @@ def step_with_batch_queue(
# but peeking the first element in a queue is not thread-safe,
# so we need more work.
if not scheduled_batch and not self.batch_queue.empty():
# When async_execute_model is enabled, do not block on the future
# when total_num_scheduled_tokens is 0, because in that case no
# execute_model task has been sent to the workers.
if (self.async_execute_model
and scheduler_output.total_num_scheduled_tokens == 0):
return engine_core_outputs, scheduled_batch
Comment on lines +348 to +350
critical

This new conditional block introduces two critical issues:

  1. AttributeError Bug: scheduler_output can be None when this block is reached. This occurs if self.batch_queue is full, because scheduler.schedule() is not called, and scheduler_output remains None. Accessing scheduler_output.total_num_scheduled_tokens will then raise an AttributeError.

  2. Potential Livelock: Even if the AttributeError is fixed (e.g., by checking scheduler_output is not None), a logical flaw remains. If this condition is met, the function returns without processing items from self.batch_queue. Since the state that led to this condition might not change, subsequent calls to step_with_batch_queue could repeatedly hit the same condition, causing items in the queue to be starved and leading to a livelock.

The logic for when to process items from the queue versus returning early needs to be reconsidered to avoid these problems.
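A minimal guard for the first issue, using only names visible in this hunk (an illustrative sketch, not the author's fix); the starvation concern still requires rethinking when the early return is allowed at all:

# Sketch: skip the early return when batch_queue was full and
# scheduler.schedule() never ran, i.e. scheduler_output is None.
# Note this alone does not resolve the starvation risk described above.
if (self.async_execute_model
        and scheduler_output is not None
        and scheduler_output.total_num_scheduled_tokens == 0):
    return engine_core_outputs, scheduled_batch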


future, scheduler_output = self.batch_queue.get_nowait()

# Blocking until the first result is available.
38 changes: 37 additions & 1 deletion vllm/v1/executor/multiproc_executor.py
@@ -16,6 +16,7 @@
from multiprocessing.process import BaseProcess
from threading import Thread
from typing import Any, Callable, Optional, Union, cast
import queue

import cloudpickle

@@ -403,6 +404,12 @@ def __init__(
# Initializes a message queue for sending the model output
self.worker_response_mq = MessageQueue(1, 1)

# The queue size and thread pool size are set to 2 to match the
# executor's max_concurrent_batches when async scheduling is enabled.
self.exe_queue = queue.Queue(2)
self.exe_thread_pool = ThreadPoolExecutor(
max_workers=2, thread_name_prefix="execute_model")

# Initialize device and loads weights
self.worker.init_device()
self.worker.load_model()
@@ -586,6 +593,12 @@ class ResponseStatus(Enum):

def worker_busy_loop(self):
"""Main busy loop for Multiprocessing Workers"""
async_execute_model = self.worker.vllm_config.scheduler_config.async_execute_model
events = {
"d2h_copy_event": threading.Event(),
"update_sampled_tokens_event": threading.Event()
}
exe_count = 0
while True:
method, args, kwargs, output_rank = self.rpc_broadcast_mq.dequeue()

@@ -594,7 +607,19 @@ def worker_busy_loop(self):
func = getattr(self.worker, method)
elif isinstance(method, bytes):
func = partial(cloudpickle.loads(method), self.worker)
output = func(*args, **kwargs)

if async_execute_model and func.__name__ == "execute_model":
args = (*args, exe_count, events)
output = self.execute_model_with_queue(
func,
*args,
**kwargs,
)
exe_count += 1
if not output:
continue
else:
output = func(*args, **kwargs)
except Exception as e:
# Notes have been introduced in python 3.11
if hasattr(e, "add_note"):
@@ -610,3 +635,14 @@
if output_rank is None or self.rank == output_rank:
self.worker_response_mq.enqueue(
(WorkerProc.ResponseStatus.SUCCESS, output))

def execute_model_with_queue(self, func, *args, **kwargs):
"""Execute model with a queue for async execution."""
output = None
if not self.exe_queue.full():
output_future = self.exe_thread_pool.submit(func, *args, **kwargs)
self.exe_queue.put_nowait(output_future)
if self.exe_queue.full():
output = self.exe_queue.get().result()
self.exe_queue.task_done()
return output
Comment on lines +639 to +648
critical

The current implementation of execute_model_with_queue will lead to a deadlock. Here's why:

  1. On the first call to execute_model_with_queue, self.exe_queue is empty, so it's not full. A future is submitted and added to the queue. The function then returns None.
  2. In worker_busy_loop, because the output from execute_model_with_queue is None, the loop continues to the next iteration without sending a response back to the main process via self.worker_response_mq.enqueue() (due to the if not output: continue check).
  3. The MultiprocExecutor in the main process, which made the collective_rpc call, will hang indefinitely waiting for a response that will never arrive.

To prevent this deadlock, execute_model_with_queue must ensure that a response is sent for every execute_model RPC call. The pipelining logic needs to be revised to guarantee a reply, even for the first call that primes the pipeline.
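One illustrative shape for guaranteeing a reply per RPC, sketched under the explicit assumption that the engine core is taught to recognize a priming sentinel instead of real model output (the sentinel and that protocol change are hypothetical, not part of this PR):

# Hypothetical module-level sentinel; a plain string survives pickling
# across worker_response_mq, unlike an object() identity sentinel.
PIPELINE_PRIMED = "PIPELINE_PRIMED"

def execute_model_with_queue(self, func, *args, **kwargs):
    """Pipelined execution that still produces one reply per RPC (sketch)."""
    # Submit the new batch; at most two batches are ever in flight.
    future = self.exe_thread_pool.submit(func, *args, **kwargs)
    self.exe_queue.put_nowait(future)
    if self.exe_queue.qsize() < 2:
        # Priming call: reply with the sentinel instead of skipping the
        # response, so collective_rpc in the main process never hangs.
        return PIPELINE_PRIMED
    # Steady state: block on the oldest in-flight batch and return its
    # output, leaving one batch in flight to overlap with the next RPC.
    oldest = self.exe_queue.get()
    result = oldest.result()
    self.exe_queue.task_done()
    return result

worker_busy_loop would then enqueue whatever this returns instead of taking the "if not output: continue" path, and the engine core would need to understand the sentinel; whether that protocol change is acceptable is a design question for the author.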

2 changes: 1 addition & 1 deletion vllm/v1/sample/rejection_sampler.py
@@ -121,7 +121,7 @@ def parse_output(
Returns:
A list of lists of token IDs.
"""
output_token_ids_np = output_token_ids.cpu().numpy()
output_token_ids_np = output_token_ids.numpy()
# Create mask for valid tokens.
valid_mask = ((output_token_ids_np != PLACEHOLDER_TOKEN_ID) &
(output_token_ids_np < vocab_size))