47 changes: 41 additions & 6 deletions tests/entrypoints/openai/test_score.py
@@ -20,8 +20,7 @@ def server():

@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_text_1_str_text_2_list(server: RemoteOpenAIServer,
model_name: str):
def test_text_1_str_text_2_list(server: RemoteOpenAIServer, model_name: str):
text_1 = "What is the capital of France?"
text_2 = [
"The capital of Brazil is Brasilia.", "The capital of France is Paris."
@@ -45,8 +44,7 @@ async def test_text_1_str_text_2_list(server: RemoteOpenAIServer,

@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_text_1_list_text_2_list(server: RemoteOpenAIServer,
model_name: str):
def test_text_1_list_text_2_list(server: RemoteOpenAIServer, model_name: str):
text_1 = [
"What is the capital of the United States?",
"What is the capital of France?"
@@ -73,8 +71,7 @@ async def test_text_1_list_text_2_list(server: RemoteOpenAIServer,

@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_text_1_str_text_2_str(server: RemoteOpenAIServer,
model_name: str):
def test_text_1_str_text_2_str(server: RemoteOpenAIServer, model_name: str):
text_1 = "What is the capital of France?"
text_2 = "The capital of France is Paris."

@@ -91,3 +88,41 @@ async def test_text_1_str_text_2_str(server: RemoteOpenAIServer,
assert score.data is not None
assert len(score.data) == 1
assert score.data[0].score >= 0.9


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_score_max_model_len(model_name: str):

args = ["--enforce-eager", "--max-model-len", "5"]

with RemoteOpenAIServer(model_name, args) as remote_server:

text_1 = "What is the capital of France?"
text_2 = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris."
]

score_response = requests.post(remote_server.url_for("score"),
json={
"model": model_name,
"text_1": text_1,
"text_2": text_2,
})
assert score_response.status_code == 400
# Assert just a small fragment of the response
assert "Please reduce the length of the input." in \
score_response.text

# Test truncation
score_response = requests.post(remote_server.url_for("score"),
json={
"model": model_name,
"text_1": text_1,
"text_2": text_2,
"truncate_prompt_tokens": 10
})
assert score_response.status_code == 400
assert "Please, select a smaller truncation size." in \
score_response.text
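
For contrast, a hedged sketch of the accepted case: a truncation size that fits within --max-model-len should pass the validation added in this PR. The value 4 is an assumed example; this case is not asserted by the tests in this PR.

# Sketch only: truncate_prompt_tokens <= max_model_len, so this request
# is expected to pass the length checks added in this PR.
score_response = requests.post(remote_server.url_for("score"),
                               json={
                                   "model": model_name,
                                   "text_1": text_1,
                                   "text_2": text_2,
                                   "truncate_prompt_tokens": 4
                               })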
20 changes: 19 additions & 1 deletion vllm/entrypoints/openai/serving_score.py
@@ -101,6 +101,13 @@ async def create_score(
if not self.model_config.is_cross_encoder:
raise ValueError("Model is not cross encoder.")

if truncate_prompt_tokens is not None and \
truncate_prompt_tokens > self.max_model_len:
raise ValueError(
f"truncate_prompt_tokens value ({truncate_prompt_tokens}) "
f"is greater than max_model_len ({self.max_model_len})."
f" Please, select a smaller truncation size.")

except ValueError as e:
logger.exception("Error in preprocessing prompt inputs")
return self.create_error_response(str(e))
@@ -123,8 +130,19 @@ async def create_score(
prompt_inputs = await tokenize_async(text=q,
text_pair=t,
**tokenization_kwargs)

input_ids = prompt_inputs["input_ids"]
token_num = len(input_ids)
if len(input_ids) > self.max_model_len:
Review comment on this line:

@DarkLight1337 (Member), Jan 16, 2025:
Can we call _validate_input directly inside serving_score.py? (Might need to update the function to handle ScoreRequest correctly)

Contributor (Author):
Hey @DarkLight1337, thanks for reviewing it!

Maybe... However, _validate_input returns TextTokensPrompt, while for score we need TokensPrompt, and they have different structures. We could change it to output both types, but I am not sure it's worth making this change. I think we would need to write more code and more checks, and it may increase the complexity of this method. What do you think?

@DarkLight1337 (Member), Jan 17, 2025:
It should be straightforward to convert it to TokensPrompt by only extracting the prompt_token_ids from the TextTokensPrompt.

Contributor (Author):
Ok, did that!
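
A minimal sketch of that change, assuming _validate_input(request, input_ids, input_text) returns a TextTokensPrompt dict with a prompt_token_ids key; the signature and surrounding names are assumptions based on this thread, not the exact code of a later commit:

# Sketch only: reuse the shared length/truncation validation and convert
# its TextTokensPrompt result into the TokensPrompt the scoring path needs.
text_token_prompt = self._validate_input(request, input_ids, request_prompt)
engine_prompt = TokensPrompt(
    prompt_token_ids=text_token_prompt["prompt_token_ids"],
    token_type_ids=prompt_inputs.get("token_type_ids"))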

err_msg = (
f"This model's maximum context length is "
f"{self.max_model_len} tokens. However, you requested "
f"{token_num} tokens in the input for score. "
f"Please reduce the length of the input.")
logger.error(err_msg)
return self.create_error_response(err_msg)
engine_prompt = TokensPrompt(
prompt_token_ids=prompt_inputs["input_ids"],
prompt_token_ids=input_ids,
token_type_ids=prompt_inputs.get("token_type_ids"))

request_prompts.append(request_prompt)