Merged

35 commits
92a6d2e
init
noooop Oct 17, 2025
e54fe8b
init
noooop Oct 17, 2025
83803c4
+ response_compression_pooling_output
noooop Oct 17, 2025
fc826d1
fix
noooop Oct 17, 2025
3b10e99
support endianness
noooop Oct 19, 2025
fa03e6c
+ tensor_serial
noooop Oct 20, 2025
b033bda
+ embedding_requests_base64_client
noooop Oct 20, 2025
24bb3cf
Merge branch 'main' into binary_response
noooop Oct 20, 2025
63502bb
fix
noooop Oct 20, 2025
c8a6d0f
typo
noooop Oct 20, 2025
f4882a4
Support binary embedding response
noooop Oct 20, 2025
836fc05
+ embedding_requests_bytes_client.py
noooop Oct 20, 2025
687a889
fix
noooop Oct 20, 2025
1b7790b
fix
noooop Oct 20, 2025
f8e37b6
Update tests/utils_/test_tensor_serial.py
noooop Oct 20, 2025
9bc193f
Update vllm/entrypoints/openai/serving_embedding.py
noooop Oct 20, 2025
5b85cf5
fix tests
noooop Oct 20, 2025
6d81371
fix
noooop Oct 20, 2025
9431501
fix
noooop Oct 20, 2025
861738e
clean up
noooop Oct 20, 2025
2aabfdf
fix
noooop Oct 20, 2025
8fe01ff
Merge branch 'main' into binary_response
noooop Oct 20, 2025
b825588
Update vllm/utils/tensor_serial.py
noooop Oct 20, 2025
ba217ea
fix
noooop Oct 20, 2025
dacda88
Merge branch 'main' into binary_response
noooop Oct 20, 2025
0f6382e
Update vllm/utils/serial_utils.py
noooop Oct 20, 2025
4590a98
Update vllm/utils/serial_utils.py
noooop Oct 20, 2025
60c8b52
Update vllm/utils/serial_utils.py
noooop Oct 20, 2025
3cd65fc
fix
noooop Oct 20, 2025
acc4b50
fix
noooop Oct 20, 2025
3a1302b
rename
noooop Oct 21, 2025
07b2508
StreamingResponse
noooop Oct 21, 2025
68276e6
Merge branch 'main' into binary_response
noooop Oct 21, 2025
405f11d
rename
noooop Oct 21, 2025
95986f1
fix
noooop Oct 21, 2025
37 changes: 37 additions & 0 deletions tests/entrypoints/pooling/openai/test_binary_encode_and_decode.py
@@ -0,0 +1,37 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch

from tests.models.utils import check_embeddings_close
from vllm.entrypoints.openai.utils import (
EMBED_DTYPE_TO_TORCH_DTYPE,
ENDIANNESS,
binary2tenser,
tenser2binary,
)


@pytest.mark.parametrize("endianness", ENDIANNESS)
@pytest.mark.parametrize("embed_dtype", EMBED_DTYPE_TO_TORCH_DTYPE.keys())
@torch.inference_mode
def test_encode_and_decode(embed_dtype: str, endianness: str):
for i in range(10):
tenser = torch.rand(2, 3, 5, 7, 11, 13, device="cpu", dtype=torch.float32)
shape = tenser.shape
binary = tenser2binary(tenser, embed_dtype, endianness)
new_tenser = binary2tenser(binary, shape, embed_dtype, endianness).to(
torch.float32
)

if "embed_dtype" in ["float32", "float16", "bfloat16"]:
torch.testing.assert_close(tenser, new_tenser, atol=1e-7, rtol=1e-7)
else: # for fp8
torch.testing.assert_close(tenser, new_tenser, atol=0.1, rtol=0.1)
check_embeddings_close(
embeddings_0_lst=tenser.view(1, -1),
embeddings_1_lst=new_tenser.view(1, -1),
name_0="gt",
name_1="new",
tol=1e-2,
)
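
The `tenser2binary` / `binary2tenser` helpers exercised above are imported from `vllm.entrypoints.openai.utils`, but their implementation is not part of this diff. As a rough sketch of the round-trip contract the test assumes — cast to the requested dtype, byte-swap each element when the requested byte order differs from the host, and ship raw bytes — something like the following would satisfy it (a hypothetical reconstruction, not the PR's actual code):

import sys

import numpy as np
import torch

EMBED_DTYPE_TO_TORCH_DTYPE = {
    "float32": torch.float32,
    "float16": torch.float16,
    "bfloat16": torch.bfloat16,
    "fp8_e4m3": torch.float8_e4m3fn,
    "fp8_e5m2": torch.float8_e5m2,
}
ENDIANNESS = ["native", "big", "little"]


def _needs_swap(endianness: str) -> bool:
    # "native" never swaps; "big"/"little" swap only if the host differs.
    return endianness != "native" and endianness != sys.byteorder


def tenser2binary(tensor: torch.Tensor, embed_dtype: str, endianness: str) -> bytes:
    data = tensor.to(EMBED_DTYPE_TO_TORCH_DTYPE[embed_dtype]).contiguous()
    itemsize = data.element_size()
    # Reinterpret as raw bytes; the uint8 detour also covers bfloat16/fp8,
    # which have no numpy equivalent.
    raw = data.view(torch.uint8).numpy()
    if _needs_swap(endianness) and itemsize > 1:
        raw = raw.reshape(-1, itemsize)[:, ::-1]  # reverse bytes per element
    return raw.tobytes()


def binary2tenser(
    binary: bytes, shape: tuple, embed_dtype: str, endianness: str
) -> torch.Tensor:
    torch_dtype = EMBED_DTYPE_TO_TORCH_DTYPE[embed_dtype]
    itemsize = torch.tensor([], dtype=torch_dtype).element_size()
    raw = np.frombuffer(binary, dtype=np.uint8)
    if _needs_swap(endianness) and itemsize > 1:
        raw = raw.reshape(-1, itemsize)[:, ::-1]
    return torch.from_numpy(raw.copy()).view(torch_dtype).reshape(shape)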
75 changes: 47 additions & 28 deletions tests/entrypoints/pooling/openai/test_embedding.py
@@ -15,10 +15,14 @@
from tests.models.utils import check_embeddings_close
from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.openai.protocol import (
EMBED_DTYPE_TO_TORCH_DTYPE,
EmbeddingResponse,
PoolingResponse,
)
from vllm.entrypoints.openai.utils import (
EMBED_DTYPE_TO_TORCH_DTYPE,
ENDIANNESS,
binary2tenser,
)
from vllm.transformers_utils.tokenizer import get_tokenizer

MODEL_NAME = "intfloat/multilingual-e5-small"
@@ -250,7 +254,7 @@ async def test_batch_base64_embedding(

@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_base64_embed_dtype(
async def test_base64_embed_dtype_and_endianness(
hf_model, server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
):
input_texts = [
@@ -262,45 +262,44 @@ async def test_base64_embed_dtype(
)
float_data = [d.embedding for d in responses_float.data]

for embed_dtype, torch_dtype in EMBED_DTYPE_TO_TORCH_DTYPE.items():
responses_base64 = requests.post(
server.url_for("/v1/embeddings"),
json={
"model": model_name,
"input": input_texts,
"encoding_format": "base64",
"embed_dtype": embed_dtype,
},
)

base64_data = []
for data in responses_base64.json()["data"]:
base64_data.append(
torch.frombuffer(base64.b64decode(data["embedding"]), dtype=torch_dtype)
.to(torch.float32)
.tolist()
for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
for endianness in ENDIANNESS:
responses_base64 = requests.post(
server.url_for("/v1/embeddings"),
json={
"model": model_name,
"input": input_texts,
"encoding_format": "base64",
"embed_dtype": embed_dtype,
"endianness": endianness,
},
)

check_embeddings_close(
embeddings_0_lst=float_data,
embeddings_1_lst=base64_data,
name_0="float_data",
name_1="base64_data",
tol=1e-2,
)
base64_data = []
for data in responses_base64.json()["data"]:
binary = base64.b64decode(data["embedding"])
tenser = binary2tenser(binary, (-1,), embed_dtype, endianness)
base64_data.append(tenser.to(torch.float32).tolist())

check_embeddings_close(
embeddings_0_lst=float_data,
embeddings_1_lst=base64_data,
name_0="float_data",
name_1="base64_data",
tol=1e-2,
)


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_base64_embed_dtype_not_supported(
async def test_base64_embed_dtype_and_endianness_not_supported(
hf_model, server: RemoteOpenAIServer, model_name: str
):
input_texts = [
"The best thing about vLLM is that it supports many different models",
]

bad_embed_dtype = "bad_embed_dtype"

responses_base64 = requests.post(
server.url_for("/v1/embeddings"),
json={
@@ -316,6 +319,22 @@ async def test_base64_embed_dtype_not_supported(
f"embed_dtype={bad_embed_dtype!r} is not supported."
)

    # An unsupported endianness should be rejected at request validation.

bad_endianness = "bad_endianness"
responses_base64 = requests.post(
server.url_for("/v1/embeddings"),
json={
"model": model_name,
"input": input_texts,
"encoding_format": "base64",
"endianness": bad_endianness,
},
)

assert responses_base64.status_code == 400
assert "literal_error" in responses_base64.json()["error"]["message"]


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
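
Client code that does not want to import vLLM helpers can decode these base64 payloads with numpy's byte-order-aware dtype strings (">f4" is big-endian float32, "<f4" little-endian); bfloat16 and the fp8 formats still need the raw-byte route sketched earlier, since numpy has no such dtypes. A minimal sketch against the endpoint exercised in these tests, assuming a local server on port 8000:

import base64

import numpy as np
import requests

resp = requests.post(
    "http://localhost:8000/v1/embeddings",
    json={
        "model": "intfloat/multilingual-e5-small",
        "input": ["The best thing about vLLM is that it supports many different models"],
        "encoding_format": "base64",
        "embed_dtype": "float32",
        "endianness": "big",
    },
)
raw = base64.b64decode(resp.json()["data"][0]["embedding"])
embedding = np.frombuffer(raw, dtype=np.dtype(">f4"))  # big-endian float32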
78 changes: 50 additions & 28 deletions tests/entrypoints/pooling/openai/test_pooling.py
@@ -10,7 +10,12 @@

from tests.models.utils import check_embeddings_close
from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.openai.protocol import EMBED_DTYPE_TO_TORCH_DTYPE, PoolingResponse
from vllm.entrypoints.openai.protocol import PoolingResponse
from vllm.entrypoints.openai.utils import (
EMBED_DTYPE_TO_TORCH_DTYPE,
ENDIANNESS,
binary2tenser,
)
from vllm.transformers_utils.tokenizer import get_tokenizer

MODEL_NAME = "internlm/internlm2-1_8b-reward"
@@ -251,7 +256,9 @@ async def test_batch_base64_pooling(server: RemoteOpenAIServer, model_name: str)

@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_base64_embed_dtype(server: RemoteOpenAIServer, model_name: str):
async def test_base64_embed_dtype_and_endianness(
server: RemoteOpenAIServer, model_name: str
):
input_texts = [
"The best thing about vLLM is that it supports many different models",
]
@@ -268,45 +275,44 @@ async def test_base64_embed_dtype(server: RemoteOpenAIServer, model_name: str):
responses_float = PoolingResponse.model_validate(float_response.json())
float_data = [np.array(d.data).squeeze(-1).tolist() for d in responses_float.data]

for embed_dtype, torch_dtype in EMBED_DTYPE_TO_TORCH_DTYPE.items():
responses_base64 = requests.post(
url,
json={
"model": model_name,
"input": input_texts,
"encoding_format": "base64",
"embed_dtype": embed_dtype,
},
)

base64_data = []
for data in responses_base64.json()["data"]:
base64_data.append(
torch.frombuffer(base64.b64decode(data["data"]), dtype=torch_dtype)
.to(torch.float32)
.tolist()
for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
for endianness in ENDIANNESS:
responses_base64 = requests.post(
url,
json={
"model": model_name,
"input": input_texts,
"encoding_format": "base64",
"embed_dtype": embed_dtype,
"endianness": endianness,
},
)

check_embeddings_close(
embeddings_0_lst=float_data,
embeddings_1_lst=base64_data,
name_0="float_data",
name_1="base64_data",
tol=1e-2,
)
base64_data = []
for data in responses_base64.json()["data"]:
binary = base64.b64decode(data["data"])
tenser = binary2tenser(binary, (-1,), embed_dtype, endianness)
base64_data.append(tenser.to(torch.float32).tolist())

check_embeddings_close(
embeddings_0_lst=float_data,
embeddings_1_lst=base64_data,
name_0="float_data",
name_1="base64_data",
tol=1e-2,
)


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_base64_embed_dtype_not_supported(
async def test_base64_embed_dtype_and_endianness_not_supported(
server: RemoteOpenAIServer, model_name: str
):
input_texts = [
"The best thing about vLLM is that it supports many different models",
]

bad_embed_dtype = "bad_embed_dtype"

responses_base64 = requests.post(
server.url_for("pooling"),
json={
@@ -322,6 +328,22 @@ async def test_base64_embed_dtype_not_supported(
f"embed_dtype={bad_embed_dtype!r} is not supported."
)

    # An unsupported endianness should be rejected at request validation.

bad_endianness = "bad_endianness"
responses_base64 = requests.post(
server.url_for("pooling"),
json={
"model": model_name,
"input": input_texts,
"encoding_format": "base64",
"endianness": bad_endianness,
},
)

assert responses_base64.status_code == 400
assert "literal_error" in responses_base64.json()["error"]["message"]


@pytest.mark.asyncio
async def test_invocations(server: RemoteOpenAIServer):
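
The "literal_error" string asserted in both test files is pydantic v2's error type for a Literal field mismatch, which the server surfaces in its 400 response. A standalone reproduction (the model name below is a hypothetical stand-in for the request class):

from typing import Literal

from pydantic import BaseModel, ValidationError


class EndiannessRequest(BaseModel):  # hypothetical stand-in
    endianness: Literal["native", "big", "little"] = "native"


try:
    EndiannessRequest(endianness="bad_endianness")
except ValidationError as exc:
    # pydantic v2 tags Literal mismatches as "literal_error".
    assert exc.errors()[0]["type"] == "literal_error"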
62 changes: 36 additions & 26 deletions vllm/entrypoints/openai/protocol.py
@@ -84,18 +84,6 @@
from vllm.utils import random_uuid
from vllm.utils.import_utils import resolve_obj_by_qualname

EMBED_DTYPE_TO_TORCH_DTYPE = {
"float32": torch.float32,
"float16": torch.float16,
"bfloat16": torch.bfloat16,
# I'm not sure if other platforms' CPUs support the fp8 data format.
# EMBED_DTYPE only uses the fp8 data representation,
# does not use fp8 computation, and only occurs on the CPU.
# Apologize for any possible break.
"fp8_e4m3": torch.float8_e4m3fn,
"fp8_e5m2": torch.float8_e5m2,
}

logger = init_logger(__name__)

_LONG_INFO = torch.iinfo(torch.long)
@@ -1551,17 +1539,22 @@ class EmbeddingCompletionRequest(OpenAIBaseModel):
embed_dtype: str = Field(
default="float32",
description=(
"What dtype to use for base64 encoding. Default to using "
"float32 for base64 encoding to match the OpenAI python client behavior."
"What dtype to use for encoding. Default to using float32 for base64 "
"encoding to match the OpenAI python client behavior. "
"This parameter will affect base64 and binary_response."
),
)
endianness: Literal["native", "big", "little"] = Field(
default="native",
description=(
"What endianness to use for encoding. Default to using native for "
"base64 encoding to match the OpenAI python client behavior."
"This parameter will affect base64 and binary_response."
),
)
binary_response: bool = Field(
default=False, description="Whether to use binary response."
)
endianness: Literal["big-endian", "little-endian"] = Field(
default="big-endian",
description="Binary response endianness. Use big-endian by default.",
)
# --8<-- [end:embedding-extra-params]

def to_pooling_params(self):
@@ -1644,17 +1637,22 @@ class EmbeddingChatRequest(OpenAIBaseModel):
embed_dtype: str = Field(
default="float32",
description=(
"Which dtype to use for base64 encoding. Defaults to float32 "
"to match OpenAI API."
"What dtype to use for encoding. Default to using float32 for base64 "
"encoding to match the OpenAI python client behavior. "
"This parameter will affect base64 and binary_response."
),
)
endianness: Literal["native", "big", "little"] = Field(
default="native",
description=(
"What endianness to use for encoding. Default to using native for "
"base64 encoding to match the OpenAI python client behavior."
"This parameter will affect base64 and binary_response."
),
)
binary_response: bool = Field(
default=False, description="Whether to use binary response."
)
endianness: Literal["big-endian", "little-endian"] = Field(
default="big-endian",
description="Binary response endianness. Use big-endian by default.",
)
# --8<-- [end:chat-embedding-extra-params]

@model_validator(mode="before")
@@ -1702,10 +1700,22 @@ class IOProcessorRequest(OpenAIBaseModel, Generic[T]):
embed_dtype: str = Field(
default="float32",
description=(
"What dtype to use for base64 encoding. Default to using "
"float32 for base64 encoding to match the OpenAI python client behavior."
"What dtype to use for encoding. Default to using float32 for base64 "
"encoding to match the OpenAI python client behavior. "
"This parameter will affect base64 and binary_response."
),
)
endianness: Literal["native", "big", "little"] = Field(
default="native",
description=(
"What endianness to use for encoding. Default to using native for "
"base64 encoding to match the OpenAI python client behavior."
"This parameter will affect base64 and binary_response."
),
)
binary_response: bool = Field(
default=False, description="Whether to use binary response."
)

def to_pooling_params(self):
return PoolingParams(task="token_classify", activation=self.activation)
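
When binary_response is set, the commit history above suggests the embedding bytes are returned directly via a StreamingResponse rather than JSON. The exact wire format (headers, shape metadata, multi-input layout) is not shown in this diff, so the client sketch below assumes the simplest case: a single input whose raw tensor bytes form the whole response body.

import numpy as np
import requests

# Hedged sketch: the response layout is an assumption, not taken from this
# diff. Assumes one input, float32, little-endian.
resp = requests.post(
    "http://localhost:8000/v1/embeddings",
    json={
        "model": "intfloat/multilingual-e5-small",
        "input": ["hello"],
        "embed_dtype": "float32",
        "endianness": "little",
        "binary_response": True,
    },
)
embedding = np.frombuffer(resp.content, dtype=np.dtype("<f4"))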