2727 IPC_OUTPUT_EXT , RPC_REQUEST_T ,
2828 VLLM_RPC_SUCCESS_STR , RPCAbortRequest ,
2929 RPCAdapterLoadedResponse , RPCError ,
30+ RPCIsSleepingRequest ,
31+ RPCIsSleepingResponse ,
3032 RPCLoadAdapterRequest ,
3133 RPCProcessRequest ,
3234 RPCResetPrefixCacheRequest ,
@@ -246,7 +248,9 @@ async def run_output_handler_loop(self):
246248 if queue is not None :
247249 queue .put_nowait (exception )
248250 # Put each output into the appropriate queue.
249- elif isinstance (request_outputs , RPCAdapterLoadedResponse ):
251+ elif isinstance (
252+ request_outputs ,
253+ (RPCAdapterLoadedResponse , RPCIsSleepingResponse )):
250254 self ._add_output (request_outputs )
251255 else :
252256 for request_output in request_outputs :
@@ -256,7 +260,8 @@ async def run_output_handler_loop(self):
256260 logger .debug ("Shutting down MQLLMEngineClient output handler." )
257261
def _add_output(self, request_output: Union[RequestOutput,
                                            RPCAdapterLoadedResponse,
                                            RPCIsSleepingResponse]):
    """Deliver ``request_output`` to the queue registered for its request.

    The destination queue is looked up in ``self.output_queues`` using
    ``request_output.request_id``.
    """
    queue = self.output_queues.get(request_output.request_id)
    # ``.get`` yields None when no queue is registered under this id; in
    # that case nothing is enqueued here.
    if queue is not None:
        # Non-awaiting enqueue of the output for whoever reads the queue.
        queue.put_nowait(request_output)
@@ -696,6 +701,24 @@ async def wake_up(self) -> None:
696701 return await self ._send_one_way_rpc_request (
697702 request = RPCWakeUpRequest .WAKE_UP , socket = self .input_socket )
698703
async def is_sleeping(self) -> bool:
    """Check whether the engine is sleeping.

    Builds an ``RPCIsSleepingRequest``, registers a fresh queue for it in
    ``self.output_queues`` under the request's ``request_id``, sends the
    pickled request over ``self.input_socket`` and waits for one item to
    arrive on that queue.

    Returns:
        The ``is_sleeping`` attribute of the response received on the queue.

    Raises:
        BaseException: If the item delivered on the queue is an exception
            instance, it is re-raised as-is.
    """
    request = RPCIsSleepingRequest()

    queue: asyncio.Queue[Union[BaseException,
                               RPCIsSleepingResponse]] = asyncio.Queue()
    self.output_queues[request.request_id] = queue

    try:
        request_bytes = pickle.dumps(request)
        await self.input_socket.send_multipart((request_bytes, ), copy=False)

        request_output = await queue.get()
    finally:
        # Always unregister the queue. Without this, a failed send or a
        # cancellation while awaiting the reply would leave the entry in
        # ``output_queues`` forever. The default avoids a KeyError masking
        # the original exception if the entry is already gone.
        self.output_queues.pop(request.request_id, None)

    if isinstance(request_output, BaseException):
        raise request_output
    return request_output.is_sleeping
721+
699722 async def add_lora (self , lora_request : LoRARequest ) -> None :
700723 """Load a new LoRA adapter into the engine for future requests."""
701724 # Uses the same I/O as generate requests
0 commit comments