[WIP] implement overlap of prepare_input during execute_model #23811
Changes from all commits
```diff
@@ -16,6 +16,7 @@
 from multiprocessing.process import BaseProcess
 from threading import Thread
 from typing import Any, Callable, Optional, Union, cast
+import queue

 import cloudpickle
```
```diff
@@ -403,6 +404,12 @@ def __init__(
         # Initializes a message queue for sending the model output
         self.worker_response_mq = MessageQueue(1, 1)

+        # Queue and thread pool sizes are set to 2 to match the executor's
+        # max_concurrent_batches when async scheduling is enabled.
+        self.exe_queue = queue.Queue(2)
+        self.exe_thread_pool = ThreadPoolExecutor(
+            max_workers=2, thread_name_prefix="execute_model")
+
         # Initialize device and load weights
         self.worker.init_device()
         self.worker.load_model()
```
```diff
@@ -586,6 +593,12 @@ class ResponseStatus(Enum):
     def worker_busy_loop(self):
         """Main busy loop for Multiprocessing Workers"""
+        async_execute_model = self.worker.vllm_config.scheduler_config.async_execute_model
+        events = {
+            "d2h_copy_event": threading.Event(),
+            "update_sampled_tokens_event": threading.Event()
+        }
+        exe_count = 0
         while True:
             method, args, kwargs, output_rank = self.rpc_broadcast_mq.dequeue()
```
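The two events give the overlapped executions a way to synchronize on each other's intermediate state. The PR does not show how the model runner consumes them, but one plausible reading (an assumption, not the PR's code; the helper names, payloads, and sleep below are illustrative) is that batch N+1's input preparation can start immediately and only blocks at the point where it needs batch N's sampled tokens:

```python
# Illustrative sketch only: how update_sampled_tokens_event could gate
# the overlap of prepare_input with a running forward pass. The helper
# names and the 50 ms sleep are stand-ins, not code from the PR.
import threading
import time
from concurrent.futures import ThreadPoolExecutor

update_sampled_tokens_event = threading.Event()
sampled_tokens: list[int] = []

def execute_batch_n() -> list[int]:
    time.sleep(0.05)                    # stands in for the GPU forward pass
    sampled_tokens.extend([101, 102])   # batch N's sampled tokens
    update_sampled_tokens_event.set()   # publish them to the next batch
    return sampled_tokens

def prepare_batch_n_plus_1() -> dict:
    inputs = {"positions": [5, 6]}      # work independent of batch N
    # Only the token-dependent part has to wait on batch N.
    update_sampled_tokens_event.wait()
    inputs["new_token_ids"] = list(sampled_tokens)
    return inputs

with ThreadPoolExecutor(max_workers=2) as pool:
    fut_n = pool.submit(execute_batch_n)
    fut_n1 = pool.submit(prepare_batch_n_plus_1)
    print(fut_n.result(), fut_n1.result())
```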
```diff
@@ -594,7 +607,19 @@ def worker_busy_loop(self):
                     func = getattr(self.worker, method)
                 elif isinstance(method, bytes):
                     func = partial(cloudpickle.loads(method), self.worker)
-                output = func(*args, **kwargs)
+                if async_execute_model and func.__name__ == "execute_model":
+                    args = (*args, exe_count, events)
+                    output = self.execute_model_with_queue(
+                        func,
+                        *args,
+                        **kwargs,
+                    )
+                    exe_count += 1
+                    if not output:
+                        continue
+                else:
+                    output = func(*args, **kwargs)
             except Exception as e:
                 # Notes have been introduced in python 3.11
                 if hasattr(e, "add_note"):
```
```diff
@@ -610,3 +635,14 @@ def worker_busy_loop(self):
             if output_rank is None or self.rank == output_rank:
                 self.worker_response_mq.enqueue(
                     (WorkerProc.ResponseStatus.SUCCESS, output))
+
+    def execute_model_with_queue(self, func, *args, **kwargs):
+        """Execute model with a queue for async execution."""
+        output = None
+        if not self.exe_queue.full():
+            output_future = self.exe_thread_pool.submit(func, *args, **kwargs)
+            self.exe_queue.put_nowait(output_future)
+        if self.exe_queue.full():
+            output = self.exe_queue.get().result()
+            self.exe_queue.task_done()
+        return output
```
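The queueing behavior is easy to reproduce outside vLLM. In the self-contained sketch below (`fake_execute_model` is a stand-in for the worker's real `execute_model`), the first call only submits work and returns `None`; every later call submits the next batch before blocking on the oldest in-flight one, so batch N executes while batch N-1's result is collected:

```python
# Self-contained sketch of the two-slot pipelining above;
# fake_execute_model is a stand-in for the worker's execute_model.
import queue
import time
from concurrent.futures import Future, ThreadPoolExecutor

exe_queue: "queue.Queue[Future]" = queue.Queue(2)
exe_thread_pool = ThreadPoolExecutor(max_workers=2)

def fake_execute_model(batch_id: int) -> str:
    time.sleep(0.1)  # stands in for GPU execution
    return f"output-{batch_id}"

def execute_model_with_queue(func, *args):
    output = None
    if not exe_queue.full():
        # A slot is free: submit without waiting, so this batch runs
        # concurrently with any batch already in flight.
        exe_queue.put_nowait(exe_thread_pool.submit(func, *args))
    if exe_queue.full():
        # Both slots are occupied: block on the oldest future.
        output = exe_queue.get().result()
        exe_queue.task_done()
    return output

print(execute_model_with_queue(fake_execute_model, 0))  # None: pipeline filling
print(execute_model_with_queue(fake_execute_model, 1))  # output-0
print(execute_model_with_queue(fake_execute_model, 2))  # output-1
```

Note that in steady state the queue never holds two futures between calls: each call that fills the second slot immediately drains the oldest one, so the busy loop's `if not output: continue` only fires while the pipeline is filling.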
Comment on lines +639 to +648
Contributor
The current implementation of [...] To prevent this deadlock, [...]
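The reviewer's full comment is cut off above, but it evidently concerns `execute_model_with_queue` blocking indefinitely on `.result()`. One hypothetical mitigation, an assumption on my part rather than the fix proposed in the PR, is to bound the wait so a stuck batch surfaces as an error instead of hanging the busy loop (`RESULT_TIMEOUT_S` is an invented constant):

```python
# Hypothetical sketch, not the PR's fix: a drop-in variant of
# execute_model_with_queue that bounds the blocking wait.
from concurrent.futures import TimeoutError as FutureTimeoutError

RESULT_TIMEOUT_S = 30.0  # assumed value; tune for the deployment

def execute_model_with_queue(self, func, *args, **kwargs):
    output = None
    if not self.exe_queue.full():
        self.exe_queue.put_nowait(
            self.exe_thread_pool.submit(func, *args, **kwargs))
    if self.exe_queue.full():
        future = self.exe_queue.get()
        try:
            output = future.result(timeout=RESULT_TIMEOUT_S)
        except FutureTimeoutError:
            # Fail loudly rather than deadlock the worker busy loop.
            raise RuntimeError("execute_model batch timed out") from None
        finally:
            self.exe_queue.task_done()
    return output
```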
This new conditional block introduces two critical issues:

1. `AttributeError` bug: `scheduler_output` can be `None` when this block is reached. This occurs if `self.batch_queue` is full, because `scheduler.schedule()` is not called and `scheduler_output` remains `None`. Accessing `scheduler_output.total_num_scheduled_tokens` will then raise an `AttributeError`.

2. Potential livelock: even if the `AttributeError` is fixed (e.g., by checking `scheduler_output is not None`), a logical flaw remains. If this condition is met, the function returns without processing items from `self.batch_queue`. Since the state that led to this condition might not change, subsequent calls to `step_with_batch_queue` could repeatedly hit the same condition, causing items in the queue to be starved and leading to a livelock.

The logic for when to process items from the queue versus returning early needs to be reconsidered to avoid these problems.
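The `step_with_batch_queue` hunk the reviewer is discussing is not shown on this page, but the failure mode is easy to reproduce in isolation. The toy sketch below (not vLLM code; `batch_queue` and the string payloads are stand-ins) shows why an early return that skips draining the queue can repeat forever, and how always draining restores progress:

```python
# Toy reproduction of the reviewer's livelock concern; not vLLM code.
import queue
from typing import Optional

batch_queue: "queue.Queue[str]" = queue.Queue(2)

def step_early_return() -> Optional[str]:
    # Mirrors the flawed flow: scheduling is skipped when the queue is
    # full, and the early return never drains queued batches.
    scheduler_output = None if batch_queue.full() else "new-batch"
    if scheduler_output is None:
        return None  # BUG: state never changes, so this repeats forever
    batch_queue.put(scheduler_output)
    return None

def step_always_drain() -> Optional[str]:
    scheduler_output = None if batch_queue.full() else "new-batch"
    if scheduler_output is not None:   # also avoids the None access
        batch_queue.put(scheduler_output)
    if batch_queue.full():
        return batch_queue.get()       # always make progress on the queue
    return None

batch_queue.put("old-1")
batch_queue.put("old-2")               # queue is now full
print(step_early_return())             # None, and on every future call too
print(step_always_drain())             # "old-1": queued work progresses
```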