
Commit 22474aa

LiuLi1998, Liliu1997, DarkLight1337, and mgoin authored and committed
Support Anthropic API /v1/messages Endpoint (vllm-project#22627)
Signed-off-by: liuli <[email protected]>
Co-authored-by: liuli <[email protected]>
Co-authored-by: Cyrus Leung <[email protected]>
Co-authored-by: Michael Goin <[email protected]>
Signed-off-by: 0xrushi <[email protected]>
1 parent 3694915 commit 22474aa
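
Not part of the diff: a minimal end-to-end sketch of how the new /v1/messages endpoint can be exercised, mirroring the test helper added below. The launch command follows RemoteAnthropicServer's subprocess invocation; the port, host, and model names are illustrative, and the dummy API key follows the tests (the server does not check it).

# Launch the Anthropic-compatible server (mirrors the test helper; values illustrative):
#   python -m vllm.entrypoints.anthropic.api_server Qwen/Qwen3-0.6B \
#       --served-model-name claude-3-7-sonnet-latest --port 8000
import anthropic

client = anthropic.Anthropic(
    base_url="http://localhost:8000",  # the local vLLM server, not api.anthropic.com
    api_key="token-abc123",  # dummy key; vLLM's Anthropic server does not need one
)

resp = client.messages.create(
    model="claude-3-7-sonnet-latest",
    max_tokens=256,
    messages=[{"role": "user", "content": "how are you!"}],
)
print(resp.stop_reason, resp.model_dump_json())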

File tree

10 files changed: +1262 -46 lines changed


requirements/common.txt

Lines changed: 1 addition & 0 deletions
@@ -48,3 +48,4 @@ pybase64 # fast base64 implementation
 cbor2 # Required for cross-language serialization of hashable objects
 setproctitle # Used to set process names for better debugging and monitoring
 openai-harmony >= 0.0.3 # Required for gpt-oss
+anthropic == 0.71.0

tests/entrypoints/anthropic/__init__.py

Whitespace-only changes.
Lines changed: 141 additions & 0 deletions
@@ -0,0 +1,141 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import anthropic
import pytest
import pytest_asyncio

from ...utils import RemoteAnthropicServer

MODEL_NAME = "Qwen/Qwen3-0.6B"


@pytest.fixture(scope="module")
def server():  # noqa: F811
    args = [
        "--max-model-len",
        "2048",
        "--enforce-eager",
        "--enable-auto-tool-choice",
        "--tool-call-parser",
        "hermes",
        "--served-model-name",
        "claude-3-7-sonnet-latest",
    ]

    with RemoteAnthropicServer(MODEL_NAME, args) as remote_server:
        yield remote_server


@pytest_asyncio.fixture
async def client(server):
    async with server.get_async_client() as async_client:
        yield async_client


@pytest.mark.asyncio
async def test_simple_messages(client: anthropic.AsyncAnthropic):
    resp = await client.messages.create(
        model="claude-3-7-sonnet-latest",
        max_tokens=1024,
        messages=[{"role": "user", "content": "how are you!"}],
    )
    assert resp.stop_reason == "end_turn"
    assert resp.role == "assistant"

    print(f"Anthropic response: {resp.model_dump_json()}")


@pytest.mark.asyncio
async def test_system_message(client: anthropic.AsyncAnthropic):
    resp = await client.messages.create(
        model="claude-3-7-sonnet-latest",
        max_tokens=1024,
        system="you are a helpful assistant",
        messages=[{"role": "user", "content": "how are you!"}],
    )
    assert resp.stop_reason == "end_turn"
    assert resp.role == "assistant"

    print(f"Anthropic response: {resp.model_dump_json()}")


@pytest.mark.asyncio
async def test_anthropic_streaming(client: anthropic.AsyncAnthropic):
    resp = await client.messages.create(
        model="claude-3-7-sonnet-latest",
        max_tokens=1024,
        messages=[{"role": "user", "content": "how are you!"}],
        stream=True,
    )

    async for chunk in resp:
        print(chunk.model_dump_json())


@pytest.mark.asyncio
async def test_anthropic_tool_call(client: anthropic.AsyncAnthropic):
    resp = await client.messages.create(
        model="claude-3-7-sonnet-latest",
        max_tokens=1024,
        messages=[
            {"role": "user", "content": "What's the weather like in New York today?"}
        ],
        tools=[
            {
                "name": "get_current_weather",
                "description": "Useful for querying the weather in a specified city.",
                "input_schema": {
                    "type": "object",
                    "properties": {
                        "location": {
                            "type": "string",
                            "description": "City or region, for example: "
                            "New York, London, Tokyo, etc.",
                        }
                    },
                    "required": ["location"],
                },
            }
        ],
        stream=False,
    )
    assert resp.stop_reason == "tool_use"
    assert resp.role == "assistant"

    print(f"Anthropic response: {resp.model_dump_json()}")


@pytest.mark.asyncio
async def test_anthropic_tool_call_streaming(client: anthropic.AsyncAnthropic):
    resp = await client.messages.create(
        model="claude-3-7-sonnet-latest",
        max_tokens=1024,
        messages=[
            {
                "role": "user",
                "content": "What's the weather like in New York today?",
            }
        ],
        tools=[
            {
                "name": "get_current_weather",
                "description": "Useful for querying the weather "
                "in a specified city.",
                "input_schema": {
                    "type": "object",
                    "properties": {
                        "location": {
                            "type": "string",
                            "description": "City or region, for example: "
                            "New York, London, Tokyo, etc.",
                        }
                    },
                    "required": ["location"],
                },
            }
        ],
        stream=True,
    )

    async for chunk in resp:
        print(chunk.model_dump_json())
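
The streaming tests above only print each raw event. As a hedged illustration (not part of the diff) of how a consumer might assemble the streamed text, the helper below accumulates text deltas using the anthropic SDK's raw streaming event types; the client and model name are the same ones the tests use, and the function name is made up here.

import anthropic


async def collect_streamed_text(client: anthropic.AsyncAnthropic) -> str:
    # Request a streamed response, as in test_anthropic_streaming above.
    stream = await client.messages.create(
        model="claude-3-7-sonnet-latest",
        max_tokens=1024,
        messages=[{"role": "user", "content": "how are you!"}],
        stream=True,
    )
    parts: list[str] = []
    async for event in stream:
        # Text arrives in content_block_delta events carrying a text_delta payload;
        # tool-call arguments would instead arrive as input_json_delta deltas.
        if event.type == "content_block_delta" and event.delta.type == "text_delta":
            parts.append(event.delta.text)
    return "".join(parts)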

tests/utils.py

Lines changed: 126 additions & 0 deletions
@@ -23,6 +23,7 @@
 from typing import Any, Literal
 from unittest.mock import patch

+import anthropic
 import cloudpickle
 import httpx
 import openai
@@ -294,6 +295,131 @@ def __exit__(self, exc_type, exc_value, traceback):
        self.proc.kill()


class RemoteAnthropicServer:
    DUMMY_API_KEY = "token-abc123"  # vLLM's Anthropic server does not need API key

    def __init__(
        self,
        model: str,
        vllm_serve_args: list[str],
        *,
        env_dict: dict[str, str] | None = None,
        seed: int | None = 0,
        auto_port: bool = True,
        max_wait_seconds: float | None = None,
    ) -> None:
        if auto_port:
            if "-p" in vllm_serve_args or "--port" in vllm_serve_args:
                raise ValueError(
                    "You have manually specified the port when `auto_port=True`."
                )

            # Don't mutate the input args
            vllm_serve_args = vllm_serve_args + ["--port", str(get_open_port())]
        if seed is not None:
            if "--seed" in vllm_serve_args:
                raise ValueError(
                    f"You have manually specified the seed when `seed={seed}`."
                )

            vllm_serve_args = vllm_serve_args + ["--seed", str(seed)]

        parser = FlexibleArgumentParser(description="vLLM's remote Anthropic server.")
        subparsers = parser.add_subparsers(required=False, dest="subparser")
        parser = ServeSubcommand().subparser_init(subparsers)
        args = parser.parse_args(["--model", model, *vllm_serve_args])
        self.host = str(args.host or "localhost")
        self.port = int(args.port)

        self.show_hidden_metrics = args.show_hidden_metrics_for_version is not None

        # download the model before starting the server to avoid timeout
        is_local = os.path.isdir(model)
        if not is_local:
            engine_args = AsyncEngineArgs.from_cli_args(args)
            model_config = engine_args.create_model_config()
            load_config = engine_args.create_load_config()

            model_loader = get_model_loader(load_config)
            model_loader.download_model(model_config)

        env = os.environ.copy()
        # the current process might initialize cuda,
        # to be safe, we should use spawn method
        env["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
        if env_dict is not None:
            env.update(env_dict)
        self.proc = subprocess.Popen(
            [
                sys.executable,
                "-m",
                "vllm.entrypoints.anthropic.api_server",
                model,
                *vllm_serve_args,
            ],
            env=env,
            stdout=sys.stdout,
            stderr=sys.stderr,
        )
        max_wait_seconds = max_wait_seconds or 240
        self._wait_for_server(url=self.url_for("health"), timeout=max_wait_seconds)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.proc.terminate()
        try:
            self.proc.wait(8)
        except subprocess.TimeoutExpired:
            # force kill if needed
            self.proc.kill()

    def _wait_for_server(self, *, url: str, timeout: float):
        # run health check
        start = time.time()
        while True:
            try:
                if requests.get(url).status_code == 200:
                    break
            except Exception:
                # this exception can only be raised by requests.get,
                # which means the server is not ready yet.
                # the stack trace is not useful, so we suppress it
                # by using `raise from None`.
                result = self.proc.poll()
                if result is not None and result != 0:
                    raise RuntimeError("Server exited unexpectedly.") from None

                time.sleep(0.5)
                if time.time() - start > timeout:
                    raise RuntimeError("Server failed to start in time.") from None

    @property
    def url_root(self) -> str:
        return f"http://{self.host}:{self.port}"

    def url_for(self, *parts: str) -> str:
        return self.url_root + "/" + "/".join(parts)

    def get_client(self, **kwargs):
        if "timeout" not in kwargs:
            kwargs["timeout"] = 600
        return anthropic.Anthropic(
            base_url=self.url_for(),
            api_key=self.DUMMY_API_KEY,
            max_retries=0,
            **kwargs,
        )

    def get_async_client(self, **kwargs):
        if "timeout" not in kwargs:
            kwargs["timeout"] = 600
        return anthropic.AsyncAnthropic(
            base_url=self.url_for(), api_key=self.DUMMY_API_KEY, max_retries=0, **kwargs
        )


def _test_completion(
    client: openai.OpenAI,
    model: str,
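
The async tests exercise get_async_client(); the synchronous get_client() added above is not used there. Below is a hedged sketch of driving the helper directly with the blocking client, assuming the tests' import path and serve arguments; none of this is part of the diff.

from tests.utils import RemoteAnthropicServer

args = [
    "--max-model-len",
    "2048",
    "--enforce-eager",
    "--served-model-name",
    "claude-3-7-sonnet-latest",
]

# The context manager starts the api_server subprocess, waits on /health,
# and terminates it on exit (see __exit__ above).
with RemoteAnthropicServer("Qwen/Qwen3-0.6B", args) as server:
    client = server.get_client()  # synchronous anthropic.Anthropic
    resp = client.messages.create(
        model="claude-3-7-sonnet-latest",
        max_tokens=128,
        messages=[{"role": "user", "content": "ping"}],
    )
    print(resp.stop_reason, resp.role)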

vllm/entrypoints/anthropic/__init__.py

Whitespace-only changes.
