 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                               CompletionRequest,
+                                              DetokenizeRequest,
                                               EmbeddingRequest, ErrorResponse,
                                               ModelCard, ModelList,
-                                              ModelPermission)
+                                              ModelPermission, TokenizeRequest)
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.sequence import Logprob
@@ -99,8 +100,9 @@ def create_streaming_error_response(
         return json_str

     async def _check_model(
-        self, request: Union[CompletionRequest, ChatCompletionRequest,
-                             EmbeddingRequest]
+        self, request: Union[ChatCompletionRequest, CompletionRequest,
+                             DetokenizeRequest, EmbeddingRequest,
+                             TokenizeRequest]
     ) -> Optional[ErrorResponse]:
         if request.model in self.served_model_names:
             return None
@@ -126,7 +128,8 @@ def _maybe_get_lora(
     def _validate_prompt_and_tokenize(
         self,
         request: Union[ChatCompletionRequest, CompletionRequest,
-                       EmbeddingRequest],
+                       DetokenizeRequest, EmbeddingRequest,
+                       TokenizeRequest],
         prompt: Optional[str] = None,
         prompt_ids: Optional[List[int]] = None,
         truncate_prompt_tokens: Optional[Annotated[int,
@@ -174,6 +177,11 @@ def _validate_prompt_and_tokenize(
                     f"generation. Please reduce the length of the input.", )
             return input_ids, input_text

+        # Note: TokenizeRequest and DetokenizeRequest don't have max_tokens
+        # and do not require model context length validation
+        if isinstance(request, (TokenizeRequest, DetokenizeRequest)):
+            return input_ids, input_text
+
         if request.max_tokens is None:
             if token_num >= self.max_model_len:
                 raise ValueError(
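
For context, here is a minimal standalone sketch of the behavior the last hunk introduces. It is not part of the change: the dataclasses below are stand-ins for the request models in vllm.entrypoints.openai.protocol, and validate() only mimics the relevant branch of _validate_prompt_and_tokenize.

# Standalone sketch (illustrative only): tokenize/detokenize requests have no
# max_tokens, so validation returns the tokenized prompt without checking the
# model context length.
from dataclasses import dataclass
from typing import List, Optional, Tuple


@dataclass
class TokenizeRequest:      # stand-in for the protocol's TokenizeRequest
    model: str
    prompt: str


@dataclass
class DetokenizeRequest:    # stand-in for the protocol's DetokenizeRequest
    model: str
    tokens: List[int]


@dataclass
class CompletionRequest:    # stand-in; only the field used below
    model: str
    prompt: str
    max_tokens: Optional[int] = None


def validate(request, input_ids: List[int], input_text: str,
             max_model_len: int = 4096) -> Tuple[List[int], str]:
    token_num = len(input_ids)

    # Mirrors the new isinstance() branch: skip the context-length check.
    if isinstance(request, (TokenizeRequest, DetokenizeRequest)):
        return input_ids, input_text

    if request.max_tokens is None and token_num >= max_model_len:
        raise ValueError("Prompt already exceeds the model context length.")
    return input_ids, input_text


# A TokenizeRequest passes even when the prompt fills the whole context window,
# whereas the same token count would raise for a CompletionRequest.
ids = list(range(4096))
assert validate(TokenizeRequest(model="m", prompt="..."), ids, "...") == (ids, "...")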