3 changes: 2 additions & 1 deletion vllm/compilation/compiler_interface.py
@@ -39,7 +39,8 @@ def compute_hash(self, vllm_config: VllmConfig) -> str:
Gather all the relevant information from the vLLM config,
to compute a hash so that we can cache the compiled model.

See {meth}`VllmConfig.compute_hash` to check what information
See [`VllmConfig.compute_hash`][vllm.config.VllmConfig.compute_hash]
to check what information
is already considered by default. This function should only
consider the information that is specific to the compiler.
"""
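The docstring above defines the contract for compiler-specific cache keys. As a rough illustration, here is a hedged sketch of what a subclass might hash; the class and its fields are hypothetical and not vLLM API, and only `compute_hash`/`VllmConfig` come from the snippet.

```python
import hashlib


class MyCompiler:  # hypothetical CompilerInterface subclass, for illustration
    name = "my_compiler"
    version = "0.1.0"

    def compute_hash(self, vllm_config) -> str:
        # Hash only compiler-specific inputs; the general vLLM config is
        # already folded in by VllmConfig.compute_hash.
        factors = [self.name, self.version]
        return hashlib.sha256(str(factors).encode()).hexdigest()[:10]
```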
32 changes: 18 additions & 14 deletions vllm/engine/async_llm_engine.py
@@ -475,7 +475,8 @@ async def add_request_async(
*,
inputs: Optional[PromptType] = None, # DEPRECATED
) -> None:
"""Async version of {meth}`add_request`."""
"""Async version of
[`add_request`][vllm.engine.llm_engine.LLMEngine.add_request]."""
if inputs is not None:
prompt = inputs
assert prompt is not None and params is not None
@@ -582,20 +583,21 @@ async def build_guided_decoding_logits_processor_async(


class AsyncLLMEngine(EngineClient):
"""An asynchronous wrapper for {class}`LLMEngine`.
"""An asynchronous wrapper for [`LLMEngine`][vllm.LLMEngine].

This class is used to wrap the {class}`LLMEngine` class to make it
asynchronous. It uses asyncio to create a background loop that keeps
processing incoming requests. The {class}`LLMEngine` is kicked by the
generate method when there are requests in the waiting queue. The generate
method yields the outputs from the {class}`LLMEngine` to the caller.
This class is used to wrap the [`LLMEngine`][vllm.LLMEngine] class to
make it asynchronous. It uses asyncio to create a background loop that keeps
processing incoming requests. The [`LLMEngine`][vllm.LLMEngine] is kicked
by the generate method when there are requests in the waiting queue. The
generate method yields the outputs from the [`LLMEngine`][vllm.LLMEngine]
to the caller.

Args:
log_requests: Whether to log the requests.
start_engine_loop: If True, the background task to run the engine
will be automatically started in the generate call.
*args: Arguments for {class}`LLMEngine`.
**kwargs: Arguments for {class}`LLMEngine`.
*args: Arguments for [`LLMEngine`][vllm.LLMEngine].
**kwargs: Arguments for [`LLMEngine`][vllm.LLMEngine].
"""

_engine_class: Type[_AsyncLLMEngine] = _AsyncLLMEngine
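For orientation, a minimal usage sketch of the wrapper described in the docstring above, assuming a recent vLLM install; the model name is illustrative and error handling is omitted.

```python
import asyncio

from vllm import SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine


async def main():
    # from_engine_args builds the wrapped LLMEngine; the background loop
    # starts lazily on the first generate() call when start_engine_loop=True.
    engine = AsyncLLMEngine.from_engine_args(
        AsyncEngineArgs(model="facebook/opt-125m"))
    params = SamplingParams(temperature=0.8, max_tokens=32)

    final = None
    async for output in engine.generate("Hello, my name is", params,
                                        request_id="req-0"):
        final = output  # intermediate RequestOutputs stream in here
    print(final.outputs[0].text)


asyncio.run(main())
```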
@@ -985,8 +987,9 @@ async def generate(
from the LLMEngine to the caller.

Args:
prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType`
for more details about the format of each input.
prompt: The prompt to the LLM. See
[`PromptType`][vllm.inputs.PromptType] for more details about
the format of each input.
sampling_params: The sampling parameters of the request.
request_id: The unique id of the request.
lora_request: LoRA request to use for generation, if any.
@@ -1003,7 +1006,7 @@
Details:
- If the engine is not running, start the background loop,
which iteratively invokes
{meth}`~vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step`
[`engine_step`][vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step]
to process the waiting requests.
- Add the request to the engine's `RequestTracker`.
On the next background loop, this request will be sent to
@@ -1075,8 +1078,9 @@ async def encode(
from the LLMEngine to the caller.

Args:
prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType`
for more details about the format of each input.
prompt: The prompt to the LLM. See
[`PromptType`][vllm.inputs.PromptType] for more details about
the format of each input.
pooling_params: The pooling parameters of the request.
request_id: The unique id of the request.
lora_request: LoRA request to use for generation, if any.
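The `encode()` path mirrors `generate()` but targets pooling models. A hedged sketch, assuming `engine` is an `AsyncLLMEngine` built as above from an embedding model; the output attribute names follow `PoolingRequestOutput` as I understand it and should be double-checked.

```python
from vllm import PoolingParams


async def embed(engine, text: str, request_id: str):
    final = None
    # encode() is an async generator like generate(), but yields pooling
    # outputs instead of sampled tokens.
    async for output in engine.encode(text, PoolingParams(), request_id):
        final = output
    return final.outputs.data  # pooled tensor for the prompt (assumed field)
```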
8 changes: 4 additions & 4 deletions vllm/engine/llm_engine.py
@@ -130,11 +130,11 @@ class LLMEngine:
iteration-level scheduling and efficient memory management to maximize the
serving throughput.

The [LLM][vllm.LLM] class wraps this class for offline batched inference
and the [AsyncLLMEngine][] class wraps this class for online serving.
The [`LLM`][vllm.LLM] class wraps this class for offline batched inference
and the [`AsyncLLMEngine`][vllm.engine.async_llm_engine.AsyncLLMEngine]
class wraps this class for online serving.

The config arguments are derived from [EngineArgs][vllm.EngineArgs]. (See
[engine-args][])
The config arguments are derived from [`EngineArgs`][vllm.EngineArgs].

Args:
vllm_config: The configuration for initializing and running vLLM.
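As a quick reference for the relationship the docstring describes, a minimal sketch of driving `LLMEngine` directly (the `LLM` class normally does this for you); the model name is illustrative.

```python
from vllm import EngineArgs, LLMEngine, SamplingParams

# The config arguments come from EngineArgs, as noted above.
engine = LLMEngine.from_engine_args(EngineArgs(model="facebook/opt-125m"))
engine.add_request("req-0", "The capital of France is",
                   SamplingParams(max_tokens=16))

# step() performs one iteration of scheduling plus model execution.
while engine.has_unfinished_requests():
    for request_output in engine.step():
        if request_output.finished:
            print(request_output.outputs[0].text)
```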
10 changes: 6 additions & 4 deletions vllm/engine/multiprocessing/client.py
@@ -492,8 +492,9 @@ def generate(
from the LLMEngine to the caller.
Args:
prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType`
for more details about the format of each input.
prompt: The prompt to the LLM. See
[`PromptType`][vllm.inputs.PromptType] for more details about
the format of each input.
sampling_params: The sampling parameters of the request.
request_id: The unique id of the request.
lora_request: LoRA request to use for generation, if any.
@@ -561,8 +562,9 @@ def encode(
from the LLMEngine to the caller.
Args:
prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType`
for more details about the format of each input.
prompt: The prompt to the LLM. See
[`PromptType`][vllm.inputs.PromptType] for more details about
the format of each input.
pooling_params: The pooling parameters of the request.
request_id: The unique id of the request.
lora_request: LoRA request to use for generation, if any.
19 changes: 11 additions & 8 deletions vllm/engine/multiprocessing/engine.py
@@ -42,19 +42,22 @@


class MQLLMEngine:
"""A multiprocessing wrapper for {class}`LLMEngine`.
"""A multiprocessing wrapper for
[`LLMEngine`][vllm.engine.llm_engine.LLMEngine].

This class is used to wrap the {class}`LLMEngine` class to enable use
This class is used to wrap the
[`LLMEngine`][vllm.engine.llm_engine.LLMEngine] class to enable use
in a concurrent manner. It runs a background loop and uses zeromq to
receive new requests and stream outputs incrementally via ipc.

The {class}`LLMEngine` generate or encode process is kicked off when a new
RPCProcessRequest is received by the input_socket.
The [`LLMEngine`][vllm.engine.llm_engine.LLMEngine] generate or encode
process is kicked off when a new RPCProcessRequest is received by the
input_socket.

The self.engine_loop checks the input_socket for new requests,
adds them to the LLMEngine if there are any, calls the internal
{class}`LLMEngine.step()`, and sends the RequestOutputs back over
the output_socket.
[`LLMEngine.step()`][vllm.engine.llm_engine.LLMEngine.step], and sends
the RequestOutputs back over the output_socket.

If use_async_sockets is set, the logic associated with reading new
requests from the socket and sending data to the socket is passed
@@ -65,8 +68,8 @@ class MQLLMEngine:
ipc_path: Base path for zeromq interprocess messaging
use_async_sockets: Whether to make send/recv async with GPU
log_requests: Whether to log the requests.
*args: Arguments for {class}`LLMEngine`.
**kwargs: Arguments for {class}`LLMEngine`.
*args: Arguments for [`LLMEngine`][vllm.engine.llm_engine.LLMEngine].
**kwargs: Arguments for [`LLMEngine`][vllm.engine.llm_engine.LLMEngine].
"""

def __init__(self,
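To make the docstring's loop concrete, here is a deliberately simplified conceptual sketch, not the real `MQLLMEngine` implementation: the socket suffixes, message format, and serialization are invented, and the real engine also handles health checks, aborts, and the async-socket path.

```python
import pickle

import zmq


def engine_loop(engine, ipc_path: str):
    # ipc_path is assumed to be a zmq endpoint prefix, e.g. "ipc:///tmp/vllm".
    ctx = zmq.Context()
    input_socket = ctx.socket(zmq.PULL)
    input_socket.bind(f"{ipc_path}_input")
    output_socket = ctx.socket(zmq.PUSH)
    output_socket.bind(f"{ipc_path}_output")

    while True:
        # Drain newly arrived requests (stand-ins for RPCProcessRequest).
        while input_socket.poll(timeout=0):
            req = pickle.loads(input_socket.recv())
            engine.add_request(req["request_id"], req["prompt"], req["params"])

        # One LLMEngine.step(); stream whatever outputs it produced back.
        # (A real loop would block while idle instead of spinning.)
        for request_output in engine.step():
            output_socket.send(pickle.dumps(request_output))
```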
7 changes: 5 additions & 2 deletions vllm/engine/output_processor/multi_step.py
@@ -56,8 +56,11 @@ def process_prompt_logprob(self, seq_group: SequenceGroup,
scheduled computation.

Args:
seq_group: the outputs are associated with this {class}`SequenceGroup`
outputs: the {class}`SequenceGroupOutput`s for all scheduler steps
seq_group: the outputs are associated with this
[`SequenceGroup`][vllm.sequence.SequenceGroup]
outputs: the
[`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput]s
for all scheduler steps
"""
for output in outputs:
# Concatenate single-step prompt logprob processing results.
21 changes: 14 additions & 7 deletions vllm/engine/output_processor/single_step.py
@@ -19,17 +19,21 @@
def single_step_process_prompt_logprob(
sg_output_proc: SequenceGroupOutputProcessor, seq_group: SequenceGroup,
output: CompletionSequenceGroupOutput) -> None:
"""Process prompt logprobs associated with the {class}`SequenceGroupOutput`
for a given step.
"""Process prompt logprobs associated with the
[`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput] for a given step.

Do nothing if the output has no prompt logprobs.

Account for the fact that transformers do not compute first-token logprobs.

Args:
sg_output_proc: {class}`SequenceGroupOutputProcessor` instance
seq_group: the output is associated with this {class}`SequenceGroup`
output: the {class}`SequenceGroupOutput` for a single scheduler step
sg_output_proc:
[`SequenceGroupOutputProcessor`][vllm.engine.output_processor.interfaces.SequenceGroupOutputProcessor]
instance
seq_group: the output is associated with this
[`SequenceGroup`][vllm.sequence.SequenceGroup]
output: the [`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput]
for a single scheduler step
"""
prompt_logprobs = output.prompt_logprobs

@@ -103,8 +107,11 @@ def process_prompt_logprob(self, seq_group: SequenceGroup,
scheduled computation.

Args:
seq_group: the output is associated with this {class}`SequenceGroup`
outputs: the {class}`SequenceGroupOutput` for a single scheduler step
seq_group: the output is associated with this
[`SequenceGroup`][vllm.sequence.SequenceGroup]
outputs: the
[`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput]
for a single scheduler step
"""
assert len(outputs) == 1, "Single step should only have 1 output."
output = outputs[0]
5 changes: 2 additions & 3 deletions vllm/entrypoints/llm.py
@@ -129,8 +129,7 @@ class LLM:
compilation_config: Either an integer or a dictionary. If it is an
integer, it is used as the level of compilation optimization. If it
is a dictionary, it can specify the full compilation configuration.
**kwargs: Arguments for [EngineArgs][vllm.EngineArgs]. (See
[engine-args][])
**kwargs: Arguments for [`EngineArgs`][vllm.EngineArgs].

Note:
This class is intended to be used for offline inference. For online
@@ -494,7 +493,7 @@ def collective_rpc(self,
`self` argument, in addition to the arguments passed in `args`
and `kwargs`. The `self` argument will be the worker object.
timeout: Maximum time in seconds to wait for execution. Raises a
{exc}`TimeoutError` on timeout. `None` means wait indefinitely.
[`TimeoutError`][] on timeout. `None` means wait indefinitely.
args: Positional arguments to pass to the worker method.
kwargs: Keyword arguments to pass to the worker method.

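A hedged usage sketch for `collective_rpc` as documented above, assuming an existing `LLM` instance named `llm`; `echo_rank` is a made-up callable, and the `rank` attribute on the worker object is an assumption rather than guaranteed API.

```python
def echo_rank(worker) -> int:
    # Runs in every worker process; `worker` takes the place of `self`.
    return worker.rank  # assumed attribute, for illustration only


ranks = llm.collective_rpc(echo_rank, timeout=10)
print(ranks)  # one entry per worker, e.g. [0, 1] with tensor_parallel_size=2
```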
6 changes: 4 additions & 2 deletions vllm/entrypoints/openai/serving_engine.py
@@ -582,7 +582,8 @@ def _tokenize_prompt_input(
add_special_tokens: bool = True,
) -> TextTokensPrompt:
"""
A simpler implementation of {meth}`_tokenize_prompt_input_or_inputs`
A simpler implementation of
[`_tokenize_prompt_input_or_inputs`][vllm.entrypoints.openai.serving_engine.OpenAIServing._tokenize_prompt_input_or_inputs]
that assumes single input.
"""
return next(
@@ -603,7 +604,8 @@ def _tokenize_prompt_inputs(
add_special_tokens: bool = True,
) -> Iterator[TextTokensPrompt]:
"""
A simpler implementation of {meth}`_tokenize_prompt_input_or_inputs`
A simpler implementation of
[`_tokenize_prompt_input_or_inputs`][vllm.entrypoints.openai.serving_engine.OpenAIServing._tokenize_prompt_input_or_inputs]
that assumes multiple inputs.
"""
for text in prompt_inputs:
2 changes: 1 addition & 1 deletion vllm/executor/executor_base.py
@@ -74,7 +74,7 @@ def collective_rpc(self,
`self` argument, in addition to the arguments passed in `args`
and `kwargs`. The `self` argument will be the worker object.
timeout: Maximum time in seconds to wait for execution. Raises a
{exc}`TimeoutError` on timeout. `None` means wait indefinitely.
[`TimeoutError`][] on timeout. `None` means wait indefinitely.
args: Positional arguments to pass to the worker method.
kwargs: Keyword arguments to pass to the worker method.

5 changes: 3 additions & 2 deletions vllm/inputs/__init__.py
@@ -10,8 +10,9 @@

INPUT_REGISTRY = InputRegistry()
"""
The global {class}`~InputRegistry` which is used by {class}`~vllm.LLMEngine`
to dispatch data processing according to the target model.
The global [`InputRegistry`][vllm.inputs.registry.InputRegistry] which is used
by [`LLMEngine`][vllm.LLMEngine] to dispatch data processing according to the
target model.
"""

__all__ = [