Commit 6eba30b

waltforme authored and shreyankg committed

[Core] Expose API endpoint /is_sleeping (vllm-project#14312)
Signed-off-by: Jun Duan <[email protected]>

1 parent 30710a8 commit 6eba30b

File tree

12 files changed (+100, -4 lines)

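At a glance, the change threads one query through every layer of the stack: a new GET /is_sleeping route in api_server.py calls EngineClient.is_sleeping(), the multiprocessing client forwards it over RPC to the engine process, and both the V0 LLMEngine and the V1 EngineCore answer by reading model_executor.is_sleeping.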

tests/entrypoints/openai/test_sleep.py

Lines changed: 7 additions & 0 deletions
@@ -28,5 +28,12 @@ def test_sleep_mode():
         response = requests.post(remote_server.url_for("/sleep"),
                                  data={"level": "1"})
         assert response.status_code == 200
+        response = requests.get(remote_server.url_for("/is_sleeping"))
+        assert response.status_code == 200
+        assert response.json().get("is_sleeping") is True
+
         response = requests.post(remote_server.url_for("/wake_up"))
         assert response.status_code == 200
+        response = requests.get(remote_server.url_for("/is_sleeping"))
+        assert response.status_code == 200
+        assert response.json().get("is_sleeping") is False
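
For a quick manual check outside the test harness, the same sequence can be driven with plain requests against a running server. This is a minimal sketch, not part of the commit; it assumes an OpenAI-compatible server on localhost:8000 started with sleep mode enabled (e.g. the --enable-sleep-mode engine flag):

    import requests

    BASE = "http://localhost:8000"  # assumed server address

    # Put the engine to sleep (the test sends form data, level "1").
    requests.post(f"{BASE}/sleep", data={"level": "1"}).raise_for_status()
    print(requests.get(f"{BASE}/is_sleeping").json())  # {'is_sleeping': True}

    # Wake it up and confirm the flag flips back.
    requests.post(f"{BASE}/wake_up").raise_for_status()
    print(requests.get(f"{BASE}/is_sleeping").json())  # {'is_sleeping': False}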

vllm/engine/async_llm_engine.py

Lines changed: 3 additions & 0 deletions
@@ -1225,6 +1225,9 @@ async def sleep(self, level: int = 1) -> None:
     async def wake_up(self) -> None:
         self.engine.wake_up()
 
+    async def is_sleeping(self) -> bool:
+        return self.engine.is_sleeping()
+
     async def add_lora(self, lora_request: LoRARequest) -> None:
         self.engine.add_lora(lora_request)

vllm/engine/llm_engine.py

Lines changed: 3 additions & 0 deletions
@@ -1948,6 +1948,9 @@ def wake_up(self) -> None:
                 "Sleep mode is not enabled in the model config")
         self.model_executor.wake_up()
 
+    def is_sleeping(self) -> bool:
+        return self.model_executor.is_sleeping
+
     def check_health(self) -> None:
         if self.tokenizer:
             self.tokenizer.check_health()
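
One detail worth noticing here: model_executor.is_sleeping is accessed as a property (no call parentheses), while the engine- and client-level wrappers expose it as an is_sleeping() method.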

vllm/engine/multiprocessing/__init__.py

Lines changed: 14 additions & 2 deletions
@@ -136,6 +136,18 @@ class RPCWakeUpRequest(Enum):
     WAKE_UP = 1
 
 
+@dataclass
+class RPCIsSleepingRequest:
+    # Set the default value of request_id to a new UUID
+    request_id: str = field(default_factory=lambda: str(uuid.uuid4()))
+
+
+@dataclass
+class RPCIsSleepingResponse:
+    request_id: str
+    is_sleeping: bool
+
+
 @dataclass
 class RPCLoadAdapterRequest:
     lora_request: LoRARequest
@@ -151,10 +163,10 @@ class RPCAdapterLoadedResponse:
 RPC_REQUEST_T = Union[RPCProcessRequest, RPCAbortRequest, RPCStartupRequest,
                       RPCUProfileRequest, RPCLoadAdapterRequest,
                       RPCResetPrefixCacheRequest, RPCSleepRequest,
-                      RPCWakeUpRequest]
+                      RPCWakeUpRequest, RPCIsSleepingRequest]
 
 REQUEST_OUTPUTS_T = Union[List[RequestOutput], RPCAdapterLoadedResponse,
-                          RPCError]
+                          RPCIsSleepingResponse, RPCError]
 
 
 def ENGINE_DEAD_ERROR(
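
The request carries its own correlation ID, generated per instance via field(default_factory=...). A minimal standalone sketch of that pattern (illustrative names, not vLLM code):

    import uuid
    from dataclasses import dataclass, field

    @dataclass
    class PingRequest:
        # default_factory runs once per instance, so every request gets a
        # fresh UUID; a plain str(uuid.uuid4()) default would be evaluated
        # once at class-definition time and shared by all instances.
        request_id: str = field(default_factory=lambda: str(uuid.uuid4()))

    a, b = PingRequest(), PingRequest()
    assert a.request_id != b.request_id  # distinct IDs per request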

vllm/engine/multiprocessing/client.py

Lines changed: 25 additions & 2 deletions
@@ -27,6 +27,8 @@
                                          IPC_OUTPUT_EXT, RPC_REQUEST_T,
                                          VLLM_RPC_SUCCESS_STR, RPCAbortRequest,
                                          RPCAdapterLoadedResponse, RPCError,
+                                         RPCIsSleepingRequest,
+                                         RPCIsSleepingResponse,
                                          RPCLoadAdapterRequest,
                                          RPCProcessRequest,
                                          RPCResetPrefixCacheRequest,
@@ -246,7 +248,9 @@ async def run_output_handler_loop(self):
                     if queue is not None:
                         queue.put_nowait(exception)
                 # Put each output into the appropriate queue.
-                elif isinstance(request_outputs, RPCAdapterLoadedResponse):
+                elif isinstance(
+                        request_outputs,
+                        (RPCAdapterLoadedResponse, RPCIsSleepingResponse)):
                     self._add_output(request_outputs)
                 else:
                     for request_output in request_outputs:
@@ -256,7 +260,8 @@ async def run_output_handler_loop(self):
         logger.debug("Shutting down MQLLMEngineClient output handler.")
 
     def _add_output(self, request_output: Union[RequestOutput,
-                                                RPCAdapterLoadedResponse]):
+                                                RPCAdapterLoadedResponse,
+                                                RPCIsSleepingResponse]):
         queue = self.output_queues.get(request_output.request_id)
         if queue is not None:
             queue.put_nowait(request_output)
@@ -696,6 +701,24 @@ async def wake_up(self) -> None:
         return await self._send_one_way_rpc_request(
             request=RPCWakeUpRequest.WAKE_UP, socket=self.input_socket)
 
+    async def is_sleeping(self) -> bool:
+        """Check whether the engine is sleeping"""
+        request = RPCIsSleepingRequest()
+
+        queue: asyncio.Queue[Union[BaseException,
+                                   RPCIsSleepingResponse]] = asyncio.Queue()
+        self.output_queues[request.request_id] = queue
+
+        request_bytes = pickle.dumps(request)
+        await self.input_socket.send_multipart((request_bytes, ), copy=False)
+
+        request_output = await queue.get()
+        self.output_queues.pop(request.request_id)
+
+        if isinstance(request_output, BaseException):
+            raise request_output
+        return request_output.is_sleeping
+
     async def add_lora(self, lora_request: LoRARequest) -> None:
         """Load a new LoRA adapter into the engine for future requests."""
         # Uses the same I/O as generate requests
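
Unlike sleep() and wake_up(), which go through the one-way RPC helper, is_sleeping() needs a value back, so the client reuses the queue-per-request correlation machinery that adapter loading already uses. Below is a self-contained sketch of that pattern (toy names, with an asyncio.Queue standing in for the ZMQ sockets), not vLLM code:

    import asyncio
    import uuid

    output_queues: dict[str, asyncio.Queue] = {}
    transport: asyncio.Queue = asyncio.Queue()  # stand-in for the input socket

    async def fake_engine():
        # Replies to every request with a response carrying the same ID.
        while True:
            request = await transport.get()
            response = {"request_id": request["request_id"], "is_sleeping": False}
            queue = output_queues.get(response["request_id"])
            if queue is not None:
                queue.put_nowait(response)  # route to the waiting caller

    async def is_sleeping() -> bool:
        request_id = str(uuid.uuid4())
        queue: asyncio.Queue = asyncio.Queue()
        output_queues[request_id] = queue       # register before sending
        await transport.put({"request_id": request_id})
        response = await queue.get()            # block until routed back
        output_queues.pop(request_id)
        return response["is_sleeping"]

    async def main():
        engine = asyncio.create_task(fake_engine())
        print(await is_sleeping())              # False
        engine.cancel()

    asyncio.run(main())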

vllm/engine/multiprocessing/engine.py

Lines changed: 13 additions & 0 deletions
@@ -18,6 +18,8 @@
                                         IPC_OUTPUT_EXT, REQUEST_OUTPUTS_T,
                                         VLLM_RPC_SUCCESS_STR, RPCAbortRequest,
                                         RPCAdapterLoadedResponse, RPCError,
+                                        RPCIsSleepingRequest,
+                                        RPCIsSleepingResponse,
                                         RPCLoadAdapterRequest,
                                         RPCProcessRequest,
                                         RPCResetPrefixCacheRequest,
@@ -271,6 +273,8 @@ def handle_new_input(self):
                     self.sleep(request.value)
                 elif isinstance(request, RPCWakeUpRequest):
                     self.wake_up()
+                elif isinstance(request, RPCIsSleepingRequest):
+                    self._handle_is_sleeping_request(request)
                 else:
                     raise ValueError("Unknown RPCRequest Type: "
                                      f"{type(request)}")
@@ -337,6 +341,12 @@ def _handle_load_adapter_request(self, request: RPCLoadAdapterRequest):
         self._send_outputs(
             RPCAdapterLoadedResponse(request_id=request.request_id))
 
+    def _handle_is_sleeping_request(self, request: RPCIsSleepingRequest):
+        is_sleeping = self.is_sleeping()
+        self._send_outputs(
+            RPCIsSleepingResponse(request_id=request.request_id,
+                                  is_sleeping=is_sleeping))
+
     def _health_check(self):
         # Send unhealthy if engine has already errored
         if self._errored_with is not None:
@@ -406,6 +416,9 @@ def sleep(self, level: int = 1) -> None:
     def wake_up(self) -> None:
         self.engine.wake_up()
 
+    def is_sleeping(self) -> bool:
+        return self.engine.is_sleeping()
+
 
 def signal_handler(*_) -> None:
     raise KeyboardInterrupt("MQLLMEngine terminated")

vllm/engine/protocol.py

Lines changed: 5 additions & 0 deletions
@@ -288,6 +288,11 @@ async def wake_up(self) -> None:
         """Wake up the engine"""
         ...
 
+    @abstractmethod
+    async def is_sleeping(self) -> bool:
+        """Check whether the engine is sleeping"""
+        ...
+
     @abstractmethod
     async def add_lora(self, lora_request: LoRARequest) -> None:
         """Load a new LoRA adapter into the engine for future requests."""

vllm/entrypoints/openai/api_server.py

Lines changed: 6 additions & 0 deletions
@@ -694,6 +694,12 @@ async def wake_up(raw_request: Request):
     # is sent but does not finish yet when we return a response.
     return Response(status_code=200)
 
+@router.get("/is_sleeping")
+async def is_sleeping(raw_request: Request):
+    logger.info("check whether the engine is sleeping")
+    is_sleeping = await engine_client(raw_request).is_sleeping()
+    return JSONResponse(content={"is_sleeping": is_sleeping})
+
 
 @router.post("/invocations", dependencies=[Depends(validate_json_request)])
 async def invocations(raw_request: Request):
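
The route itself is thin: await the engine client, wrap the boolean in JSON. A self-contained sketch of the same route shape (dummy engine, assumed module name sketch.py; not the vLLM wiring):

    from fastapi import FastAPI
    from fastapi.responses import JSONResponse

    app = FastAPI()

    class DummyEngine:
        # Stand-in for the per-request engine client that vLLM resolves
        # via engine_client(raw_request).
        async def is_sleeping(self) -> bool:
            return False

    engine = DummyEngine()

    @app.get("/is_sleeping")
    async def is_sleeping():
        return JSONResponse(content={"is_sleeping": await engine.is_sleeping()})

    # Run with: uvicorn sketch:app --port 8000
    # Check:    curl http://localhost:8000/is_sleeping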

vllm/v1/engine/async_llm.py

Lines changed: 3 additions & 0 deletions
@@ -407,6 +407,9 @@ async def sleep(self, level: int = 1) -> None:
     async def wake_up(self) -> None:
         await self.engine_core.wake_up_async()
 
+    async def is_sleeping(self) -> bool:
+        return await self.engine_core.is_sleeping_async()
+
     async def add_lora(self, lora_request: LoRARequest) -> bool:
         """Load a new LoRA adapter into the engine for future requests."""
         return await self.engine_core.add_lora_async(lora_request)

vllm/v1/engine/core.py

Lines changed: 3 additions & 0 deletions
@@ -253,6 +253,9 @@ def sleep(self, level: int = 1):
     def wake_up(self):
         self.model_executor.wake_up()
 
+    def is_sleeping(self) -> bool:
+        return self.model_executor.is_sleeping
+
     def execute_dummy_batch(self):
         self.model_executor.collective_rpc("execute_dummy_batch")
258261
