
Commit e552022

FurtherAI authored and weilong.yu committed
[Model] Adding Support for Qwen2VL as an Embedding Model. Using MrLight/dse-qwen2-2b-mrl-v1 (vllm-project#9944)
Signed-off-by: FurtherAI <[email protected]>
Co-authored-by: FurtherAI <[email protected]>
1 parent 4419185 commit e552022

File tree

8 files changed, +364 -19 lines changed


docs/source/models/supported_models.rst

Lines changed: 6 additions & 0 deletions
@@ -584,6 +584,12 @@ Multimodal Embedding
       - :code:`TIGER-Lab/VLM2Vec-Full`
       - 🚧
       - ✅︎
+    * - :code:`Qwen2VLForConditionalGeneration`
+      - Qwen2-VL-based
+      - T + I
+      - :code:`MrLight/dse-qwen2-2b-mrl-v1`
+      -
+      - ✅︎
 
 .. important::
     Some model architectures support both generation and embedding tasks.
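For context beyond this diff: the new table row also implies offline embedding support. A minimal, hedged sketch, assuming vLLM's ``LLM(task="embedding")`` constructor and ``llm.encode()`` API at this version, plus a hypothetical local file ``example.jpg``; the prompt mirrors the chat template added later in this commit:

.. code-block:: python

    from PIL import Image
    from vllm import LLM

    # Prompt format follows examples/template_dse_qwen2_vl.jinja: Qwen2-VL
    # vision tokens for the image and a trailing EOS token for the embedding.
    prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
              "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
              "What is shown in this image?<|im_end|>\n"
              "<|im_start|>assistant\n<|endoftext|>")

    llm = LLM(model="MrLight/dse-qwen2-2b-mrl-v1",
              task="embedding",
              max_model_len=8192,
              trust_remote_code=True)

    outputs = llm.encode({
        "prompt": prompt,
        "multi_modal_data": {"image": Image.open("example.jpg")},
    })
    print(len(outputs[0].outputs.embedding))  # embedding dimensionality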

docs/source/models/vlm.rst

Lines changed: 17 additions & 0 deletions
@@ -310,4 +310,21 @@ Since the request schema is not defined by OpenAI client, we post a request to t
     response_json = response.json()
     print("Embedding output:", response_json["data"][0]["embedding"])
 
+Here is an example for serving the ``MrLight/dse-qwen2-2b-mrl-v1`` model.
+
+.. code-block:: bash
+
+    vllm serve MrLight/dse-qwen2-2b-mrl-v1 --task embedding \
+      --trust-remote-code --max-model-len 8192 --chat-template examples/template_dse_qwen2_vl.jinja
+
+.. important::
+
+    Like with VLM2Vec, we have to explicitly pass ``--task embedding``. Additionally, ``MrLight/dse-qwen2-2b-mrl-v1`` requires an EOS token for embeddings,
+    which is handled by the jinja template.
+
+.. important::
+
+    ``MrLight/dse-qwen2-2b-mrl-v1`` also requires a placeholder image of the minimum image size for text query embeddings. See the full code
+    example below for details.
+
 A full code example can be found in `examples/openai_chat_embedding_client_for_multimodal.py <https://github.com/vllm-project/vllm/blob/main/examples/openai_chat_embedding_client_for_multimodal.py>`_.
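As a condensed sketch of the placeholder-image note above (the full client script in the next file is the reference), a text-only query still attaches a blank image at the minimum size used by that example (56x56), encoded inline:

.. code-block:: python

    import base64
    import io

    from PIL import Image

    # Blank 56x56 placeholder image for text-only queries, base64-encoded
    # so it can be sent as a data URL in the chat-style embeddings request.
    buffer = io.BytesIO()
    Image.new("RGB", (56, 56)).save(buffer, "png")
    placeholder_b64 = base64.b64encode(buffer.getvalue()).decode("utf-8")
    image_url = f"data:image/png;base64,{placeholder_b64}"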
examples/openai_chat_embedding_client_for_multimodal.py

Lines changed: 105 additions & 18 deletions
@@ -1,33 +1,120 @@
+import argparse
+import base64
+import io
+
 import requests
+from PIL import Image
 
 image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
 
-response = requests.post(
-    "http://localhost:8000/v1/embeddings",
-    json={
-        "model":
-        "TIGER-Lab/VLM2Vec-Full",
-        "messages": [{
+
+def vlm2vec():
+    response = requests.post(
+        "http://localhost:8000/v1/embeddings",
+        json={
+            "model":
+            "TIGER-Lab/VLM2Vec-Full",
+            "messages": [{
+                "role":
+                "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": image_url
+                        }
+                    },
+                    {
+                        "type": "text",
+                        "text": "Represent the given image."
+                    },
+                ],
+            }],
+            "encoding_format":
+            "float",
+        },
+    )
+    response.raise_for_status()
+    response_json = response.json()
+
+    print("Embedding output:", response_json["data"][0]["embedding"])
+
+
+def dse_qwen2_vl(inp: dict):
+    # Embedding an Image
+    if inp["dtype"] == "image":
+        messages = [{
+            "role":
+            "user",
+            "content": [{
+                "type": "image_url",
+                "image_url": {
+                    "url": inp["image_url"],
+                }
+            }, {
+                "type": "text",
+                "text": "What is shown in this image?"
+            }]
+        }]
+    # Embedding a Text Query
+    else:
+        # MrLight/dse-qwen2-2b-mrl-v1 requires a placeholder image
+        # of the minimum input size
+        buffer = io.BytesIO()
+        image_placeholder = Image.new("RGB", (56, 56))
+        image_placeholder.save(buffer, "png")
+        buffer.seek(0)
+        image_placeholder = base64.b64encode(buffer.read()).decode('utf-8')
+        messages = [{
             "role":
             "user",
             "content": [
                 {
                     "type": "image_url",
                     "image_url": {
-                        "url": image_url
+                        "url": f"data:image/jpeg;base64,{image_placeholder}",
                     }
                 },
                 {
                     "type": "text",
-                    "text": "Represent the given image."
+                    "text": f"Query: {inp['content']}"
                 },
-            ],
-        }],
-        "encoding_format":
-        "float",
-    },
-)
-response.raise_for_status()
-response_json = response.json()
-
-print("Embedding output:", response_json["data"][0]["embedding"])
+            ]
+        }]
+
+    response = requests.post(
+        "http://localhost:8000/v1/embeddings",
+        json={
+            "model": "MrLight/dse-qwen2-2b-mrl-v1",
+            "messages": messages,
+            "encoding_format": "float",
+        },
+    )
+    response.raise_for_status()
+    response_json = response.json()
+
+    print("Embedding output:", response_json["data"][0]["embedding"])
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        "Script to call a specified VLM through the API. Make sure to serve "
+        "the model with --task embedding before running this.")
+    parser.add_argument("model",
+                        type=str,
+                        choices=["vlm2vec", "dse_qwen2_vl"],
+                        help="Which model to call.")
+    args = parser.parse_args()
+
+    if args.model == "vlm2vec":
+        vlm2vec()
+    elif args.model == "dse_qwen2_vl":
+        dse_qwen2_vl({
+            "dtype": "image",
+            "image_url": image_url,
+        })
+        dse_qwen2_vl({
+            "dtype": "text",
+            "content": "What is the weather like today?",
+        })
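Assuming a server has been started as shown in the docs change above, the refactored client is invoked with the backend name as its positional argument, e.g. ``python examples/openai_chat_embedding_client_for_multimodal.py vlm2vec`` or ``python examples/openai_chat_embedding_client_for_multimodal.py dse_qwen2_vl``.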
examples/template_dse_qwen2_vl.jinja

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}{% raw %}<|im_start|>system
+You are a helpful assistant.<|im_end|>
+{% endraw %}{% endif %}<|im_start|>{{ message['role'] }}{% raw %}
+{% endraw %}{% if message['content'] is string %}{{ message['content'] }}<|im_end|>{% raw %}
+{% endraw %}{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>{% raw %}
+{% endraw %}{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant{% raw %}
+{% endraw %}{% endif %}<|endoftext|>
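Not part of the diff, but a quick way to see what this template produces (including the trailing EOS token called out in the docs above) is to render it locally with ``jinja2``; a sketch, assuming the file is saved at ``examples/template_dse_qwen2_vl.jinja``:

.. code-block:: python

    import jinja2

    with open("examples/template_dse_qwen2_vl.jinja") as f:
        template = jinja2.Template(f.read())

    # One user turn with an image placeholder and a text query, mirroring the
    # dse_qwen2_vl text-query path in the example client above.
    messages = [{
        "role": "user",
        "content": [
            {"type": "image_url",
             "image_url": {"url": "data:image/png;base64,..."}},
            {"type": "text", "text": "Query: What is the weather like today?"},
        ],
    }]

    # Prints the ChatML-style prompt, which ends with
    # <|im_start|>assistant ... <|endoftext|>
    print(template.render(messages=messages, add_generation_prompt=True))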

tests/conftest.py

Lines changed: 3 additions & 0 deletions
@@ -243,6 +243,9 @@ def video_assets() -> _VideoAssets:
 class HfRunner:
 
     def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
+        if x is None or isinstance(x, (bool, )):
+            return x
+
         if device is None:
             device = "cpu" if current_platform.is_cpu() else "cuda"
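A standalone, hedged sketch (not vLLM's actual helper) of what the new guard does: non-tensor values such as ``None`` and booleans now pass through ``wrap_device`` unchanged instead of hitting ``.to(device)``:

.. code-block:: python

    from typing import Optional, TypeVar

    import torch

    _T = TypeVar("_T")


    def wrap_device(x: _T, device: Optional[str] = None) -> _T:
        # New guard: None and booleans cannot be moved to a device.
        if x is None or isinstance(x, (bool, )):
            return x
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        return x.to(device)


    assert wrap_device(None) is None
    assert wrap_device(True) is True
    print(wrap_device(torch.zeros(2)).device)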
