Skip to content

Commit df850c4

Browse files
[Feature][Responses API] Stream Function Call - harmony (#24317)
Signed-off-by: chaunceyjiang <[email protected]>
1 parent 720394d commit df850c4

File tree

2 files changed

+213
-70
lines changed

2 files changed

+213
-70
lines changed

tests/entrypoints/openai/test_response_api_with_harmony.py

Lines changed: 136 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,22 @@
1616

1717
MODEL_NAME = "openai/gpt-oss-20b"
1818

19+
# Shared OpenAI Responses API function-tool definition used by the
# function-calling tests in this file: "get_weather" takes latitude and
# longitude (numbers) and is described as returning the current
# temperature in celsius. Key order is preserved as sent to the API.
GET_WEATHER_SCHEMA = {
    "type": "function",
    "name": "get_weather",
    "description": "Get current temperature for provided coordinates in celsius.",  # noqa
    "parameters": {
        "type": "object",
        "properties": {
            "latitude": {"type": "number"},
            "longitude": {"type": "number"},
        },
        "required": ["latitude", "longitude"],
        "additionalProperties": False,
    },
    "strict": True,
}
34+
1935

2036
@pytest.fixture(scope="module")
2137
def server():
@@ -305,6 +321,54 @@ async def test_streaming_types(client: OpenAI, model_name: str):
305321
assert len(stack_of_event_types) == 0
306322

307323

324+
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_function_calling_with_streaming_types(client: OpenAI, model_name: str):
    """Stream a function call and verify the event nesting is well-formed.

    Each closing event ("done"/"completed") must match the most recently
    opened event of its pair ("added"/"delta"/"created"), and nothing may
    remain open once the stream finishes.
    """
    # Maps every closing event type to the opening event type it must close.
    closers_to_openers = {
        "response.completed": "response.created",
        "response.output_item.done": "response.output_item.added",
        "response.output_text.done": "response.output_text.delta",
        "response.reasoning_text.done": "response.reasoning_text.delta",
        "response.reasoning_part.done": "response.reasoning_part.added",
        "response.function_call_arguments.done": "response.function_call_arguments.delta",  # noqa
    }

    stream_response = await client.responses.create(
        model=model_name,
        input=[
            {
                "role": "user",
                "content": "What's the weather like in Paris today?",
            }
        ],
        tools=[GET_WEATHER_SCHEMA],
        stream=True,
    )

    open_events = []
    async for event in stream_response:
        kind = event.type
        if kind == "response.created":
            open_events.append(kind)
        elif kind == "response.completed":
            assert open_events[-1] == closers_to_openers[kind]
            open_events.pop()
        if kind.endswith("added"):
            open_events.append(kind)
        elif kind.endswith("delta"):
            # Consecutive deltas of the same type collapse onto one entry.
            if open_events[-1] != kind:
                open_events.append(kind)
        elif kind.endswith("done"):
            assert open_events[-1] == closers_to_openers[kind]
            open_events.pop()
    # Every opened block must have been closed by the end of the stream.
    assert not open_events
370+
371+
308372
@pytest.mark.asyncio
309373
@pytest.mark.parametrize("model_name", [MODEL_NAME])
310374
@pytest.mark.parametrize("background", [True, False])
@@ -483,23 +547,7 @@ def call_function(name, args):
483547
@pytest.mark.asyncio
484548
@pytest.mark.parametrize("model_name", [MODEL_NAME])
485549
async def test_function_calling(client: OpenAI, model_name: str):
486-
tools = [
487-
{
488-
"type": "function",
489-
"name": "get_weather",
490-
"description": "Get current temperature for provided coordinates in celsius.", # noqa
491-
"parameters": {
492-
"type": "object",
493-
"properties": {
494-
"latitude": {"type": "number"},
495-
"longitude": {"type": "number"},
496-
},
497-
"required": ["latitude", "longitude"],
498-
"additionalProperties": False,
499-
},
500-
"strict": True,
501-
}
502-
]
550+
tools = [GET_WEATHER_SCHEMA]
503551

504552
response = await client.responses.create(
505553
model=model_name,
@@ -565,21 +613,7 @@ async def test_function_calling_multi_turn(client: OpenAI, model_name: str):
565613
},
566614
"strict": True,
567615
},
568-
{
569-
"type": "function",
570-
"name": "get_weather",
571-
"description": "Get current temperature for provided coordinates in celsius.", # noqa
572-
"parameters": {
573-
"type": "object",
574-
"properties": {
575-
"latitude": {"type": "number"},
576-
"longitude": {"type": "number"},
577-
},
578-
"required": ["latitude", "longitude"],
579-
"additionalProperties": False,
580-
},
581-
"strict": True,
582-
},
616+
GET_WEATHER_SCHEMA,
583617
]
584618

585619
response = await client.responses.create(
@@ -643,23 +677,7 @@ async def test_function_calling_multi_turn(client: OpenAI, model_name: str):
643677
@pytest.mark.asyncio
644678
@pytest.mark.parametrize("model_name", [MODEL_NAME])
645679
async def test_function_calling_required(client: OpenAI, model_name: str):
646-
tools = [
647-
{
648-
"type": "function",
649-
"name": "get_weather",
650-
"description": "Get current temperature for provided coordinates in celsius.", # noqa
651-
"parameters": {
652-
"type": "object",
653-
"properties": {
654-
"latitude": {"type": "number"},
655-
"longitude": {"type": "number"},
656-
},
657-
"required": ["latitude", "longitude"],
658-
"additionalProperties": False,
659-
},
660-
"strict": True,
661-
}
662-
]
680+
tools = [GET_WEATHER_SCHEMA]
663681

664682
with pytest.raises(BadRequestError):
665683
await client.responses.create(
@@ -689,23 +707,7 @@ async def test_system_message_with_tools(client: OpenAI, model_name: str):
689707
@pytest.mark.asyncio
690708
@pytest.mark.parametrize("model_name", [MODEL_NAME])
691709
async def test_function_calling_full_history(client: OpenAI, model_name: str):
692-
tools = [
693-
{
694-
"type": "function",
695-
"name": "get_weather",
696-
"description": "Get current temperature for provided coordinates in celsius.", # noqa
697-
"parameters": {
698-
"type": "object",
699-
"properties": {
700-
"latitude": {"type": "number"},
701-
"longitude": {"type": "number"},
702-
},
703-
"required": ["latitude", "longitude"],
704-
"additionalProperties": False,
705-
},
706-
"strict": True,
707-
}
708-
]
710+
tools = [GET_WEATHER_SCHEMA]
709711

710712
input_messages = [
711713
{"role": "user", "content": "What's the weather like in Paris today?"}
@@ -745,6 +747,74 @@ async def test_function_calling_full_history(client: OpenAI, model_name: str):
745747
assert response_2.output_text is not None
746748

747749

750+
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_function_calling_with_stream(client: OpenAI, model_name: str):
    """Exercise a full streamed function-calling round trip.

    Turn 1 streams a ``get_weather`` tool call, accumulating its arguments
    from delta events and checking they match the final "done" event.
    Turn 2 feeds the tool output back and verifies the follow-up stream
    yields output text and no further function-call events.
    """
    tools = [GET_WEATHER_SCHEMA]
    input_list = [
        {
            "role": "user",
            "content": "What's the weather like in Paris today?",
        }
    ]
    stream_response = await client.responses.create(
        model=model_name,
        input=input_list,
        tools=tools,
        stream=True,
    )
    assert stream_response is not None
    final_tool_calls = {}  # output_index -> in-progress function_call item
    final_tool_calls_named = {}  # function name -> in-progress item
    async for event in stream_response:
        if event.type == "response.output_item.added":
            if event.item.type != "function_call":
                continue
            final_tool_calls[event.output_index] = event.item
            final_tool_calls_named[event.item.name] = event.item
        elif event.type == "response.function_call_arguments.delta":
            index = event.output_index
            tool_call = final_tool_calls[index]
            if tool_call:
                tool_call.arguments += event.delta
                final_tool_calls_named[tool_call.name] = tool_call
        elif event.type == "response.function_call_arguments.done":
            # The accumulated deltas must equal the final arguments.
            assert event.arguments == final_tool_calls_named[event.name].arguments

    # Initialize both so a model that never called get_weather fails the
    # assertions below cleanly instead of raising NameError.
    tool_call = None
    result = None
    for candidate in final_tool_calls.values():
        if (
            candidate
            and candidate.type == "function_call"
            and candidate.name == "get_weather"
        ):
            tool_call = candidate
            args = json.loads(tool_call.arguments)
            result = call_function(tool_call.name, args)
            input_list += [tool_call]
            break
    assert tool_call is not None
    assert result is not None
    response = await client.responses.create(
        model=model_name,
        input=input_list
        + [
            {
                "type": "function_call_output",
                "call_id": tool_call.call_id,
                "output": str(result),
            }
        ],
        tools=tools,
        stream=True,
    )
    assert response is not None
    async for event in response:
        # Check that no function-call events appear in the stream.
        assert event.type != "response.function_call_arguments.delta"
        assert event.type != "response.function_call_arguments.done"
        # Check that the completed response contains output text.
        if event.type == "response.completed":
            assert len(event.response.output) > 0
            assert event.response.output_text is not None
816+
817+
748818
@pytest.mark.asyncio
749819
@pytest.mark.parametrize("model_name", [MODEL_NAME])
750820
async def test_output_messages_enabled(client: OpenAI, model_name: str, server):

vllm/entrypoints/openai/serving_responses.py

Lines changed: 77 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@
2323
ResponseCodeInterpreterToolCallParam,
2424
ResponseContentPartAddedEvent,
2525
ResponseContentPartDoneEvent,
26+
ResponseFunctionCallArgumentsDeltaEvent,
27+
ResponseFunctionCallArgumentsDoneEvent,
2628
ResponseFunctionToolCall,
2729
ResponseFunctionWebSearch,
2830
ResponseOutputItem,
@@ -927,6 +929,11 @@ def _construct_input_messages_with_harmony(
927929
# to add the tool call request to prev_outputs so that the
928930
# parse_response_input can find the tool call request when
929931
# parsing the tool call output.
932+
if (
933+
isinstance(response_msg, dict)
934+
and response_msg.get("type") == "function_call"
935+
):
936+
response_msg = ResponseFunctionToolCall.model_validate(response_msg)
930937
if isinstance(response_msg, ResponseFunctionToolCall):
931938
prev_outputs.append(response_msg)
932939
return messages
@@ -1398,19 +1405,48 @@ async def _process_harmony_streaming_events(
13981405
current_output_index = 0
13991406
current_item_id: str = ""
14001407
sent_output_item_added = False
1401-
1408+
is_first_function_call_delta = False
14021409
async for ctx in result_generator:
14031410
assert isinstance(ctx, StreamingHarmonyContext)
14041411

14051412
if ctx.is_expecting_start():
14061413
current_output_index += 1
14071414
sent_output_item_added = False
1408-
1415+
is_first_function_call_delta = False
14091416
if len(ctx.parser.messages) > 0:
14101417
previous_item = ctx.parser.messages[-1]
14111418
if previous_item.recipient is not None:
1412-
# Deal with tool call here
1413-
pass
1419+
# Deal with tool call
1420+
if previous_item.recipient.startswith("functions."):
1421+
function_name = previous_item.recipient[len("functions.") :]
1422+
yield _increment_sequence_number_and_return(
1423+
ResponseFunctionCallArgumentsDoneEvent(
1424+
type="response.function_call_arguments.done",
1425+
arguments=previous_item.content[0].text,
1426+
name=function_name,
1427+
item_id=current_item_id,
1428+
output_index=current_output_index,
1429+
sequence_number=-1,
1430+
)
1431+
)
1432+
function_call_item = ResponseFunctionToolCall(
1433+
type="function_call",
1434+
arguments=previous_item.content[0].text,
1435+
name=function_name,
1436+
item_id=current_item_id,
1437+
output_index=current_output_index,
1438+
sequence_number=-1,
1439+
call_id=f"fc_{random_uuid()}",
1440+
status="completed",
1441+
)
1442+
yield _increment_sequence_number_and_return(
1443+
ResponseOutputItemDoneEvent(
1444+
type="response.output_item.done",
1445+
sequence_number=-1,
1446+
output_index=current_output_index,
1447+
item=function_call_item,
1448+
)
1449+
)
14141450
elif previous_item.channel == "analysis":
14151451
content = ResponseReasoningTextContent(
14161452
text=previous_item.content[0].text,
@@ -1766,6 +1802,43 @@ async def _process_harmony_streaming_events(
17661802
),
17671803
)
17681804
)
1805+
# developer tools will be triggered on the commentary channel
1806+
# and the recipient starts with "functions.TOOL_NAME"
1807+
if (
1808+
ctx.parser.current_channel == "commentary"
1809+
and ctx.parser.current_recipient
1810+
and ctx.parser.current_recipient.startswith("functions.")
1811+
):
1812+
if is_first_function_call_delta is False:
1813+
is_first_function_call_delta = True
1814+
fc_name = ctx.parser.current_recipient[len("functions.") :]
1815+
tool_call_item = ResponseFunctionToolCall(
1816+
name=fc_name,
1817+
type="function_call",
1818+
id=current_item_id,
1819+
call_id=f"call_{random_uuid()}",
1820+
arguments="",
1821+
status="in_progress",
1822+
)
1823+
current_item_id = f"fc_{random_uuid()}"
1824+
yield _increment_sequence_number_and_return(
1825+
ResponseOutputItemAddedEvent(
1826+
type="response.output_item.added",
1827+
sequence_number=-1,
1828+
output_index=current_output_index,
1829+
item=tool_call_item,
1830+
)
1831+
)
1832+
else:
1833+
yield _increment_sequence_number_and_return(
1834+
ResponseFunctionCallArgumentsDeltaEvent(
1835+
item_id=current_item_id,
1836+
delta=ctx.parser.last_content_delta,
1837+
output_index=current_output_index,
1838+
sequence_number=-1,
1839+
type="response.function_call_arguments.delta",
1840+
)
1841+
)
17691842

17701843
async def responses_stream_generator(
17711844
self,

0 commit comments

Comments
 (0)