Skip to content

Commit d5e2622

Browse files
committed
more coderabbit
1 parent 3026397 commit d5e2622

File tree

6 files changed

+369
-49
lines changed

6 files changed

+369
-49
lines changed

src/app/endpoints/query.py

Lines changed: 28 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
"""Handler for REST API call to provide answer to query."""
22

3-
import ast
43
from datetime import datetime, UTC
54
import json
65
import logging
76
import os
87
from pathlib import Path
9-
import re
108
from typing import Annotated, Any, cast
119

10+
import pydantic
11+
1212
from llama_stack_client import APIConnectionError
1313
from llama_stack_client import AsyncLlamaStackClient # type: ignore
1414
from llama_stack_client.types import UserMessage, Shield # type: ignore
@@ -43,13 +43,12 @@
4343
)
4444
from utils.mcp_headers import mcp_headers_dependency, handle_mcp_headers_with_toolgroups
4545
from utils.suid import get_suid
46+
from utils.metadata import parse_knowledge_search_metadata
4647

4748
logger = logging.getLogger("app.endpoints.handlers")
4849
router = APIRouter(tags=["query"])
4950
auth_dependency = get_auth_dependency()
5051

51-
METADATA_PATTERN = re.compile(r"^\s*Metadata:\s*(\{.*?\})\s*$", re.MULTILINE)
52-
5352

5453
def _process_knowledge_search_content(
5554
tool_response: Any, metadata_map: dict[str, dict[str, Any]]
@@ -75,17 +74,14 @@ def _process_knowledge_search_content(
7574
if not text:
7675
continue
7776

78-
for match in METADATA_PATTERN.findall(text):
79-
try:
80-
meta = ast.literal_eval(match)
81-
# Verify the result is a dict before accessing keys
82-
if isinstance(meta, dict) and "document_id" in meta:
83-
metadata_map[meta["document_id"]] = meta
84-
except (SyntaxError, ValueError): # only expected from literal_eval
85-
logger.exception(
86-
"An exception was thrown in processing %s",
87-
match,
88-
)
77+
try:
78+
parsed_metadata = parse_knowledge_search_metadata(text)
79+
metadata_map.update(parsed_metadata)
80+
except ValueError:
81+
logger.exception(
82+
"An exception was thrown in processing metadata from text: %s",
83+
text[:200] + "..." if len(text) > 200 else text,
84+
)
8985

9086

9187
def extract_referenced_documents_from_steps(steps: list) -> list[ReferencedDocument]:
@@ -113,12 +109,23 @@ def extract_referenced_documents_from_steps(steps: list) -> list[ReferencedDocum
113109

114110
_process_knowledge_search_content(tool_response, metadata_map)
115111

116-
# Extract referenced documents from metadata
117-
return [
118-
ReferencedDocument(doc_url=v["docs_url"], doc_title=v["title"])
119-
for v in metadata_map.values()
120-
if "docs_url" in v and "title" in v
121-
]
112+
# Extract referenced documents from metadata with error handling
113+
referenced_documents = []
114+
for v in metadata_map.values():
115+
if "docs_url" in v and "title" in v:
116+
try:
117+
doc = ReferencedDocument(doc_url=v["docs_url"], doc_title=v["title"])
118+
referenced_documents.append(doc)
119+
except (pydantic.ValidationError, ValueError, Exception) as e:
120+
logger.warning(
121+
"Skipping invalid referenced document with docs_url='%s', title='%s': %s",
122+
v.get("docs_url", "<missing>"),
123+
v.get("title", "<missing>"),
124+
str(e),
125+
)
126+
continue
127+
128+
return referenced_documents
122129

123130

124131
query_response: dict[int | str, dict[str, Any]] = {

src/app/endpoints/streaming_query.py

Lines changed: 35 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
"""Handler for REST API call to provide answer to streaming query."""
22

3-
import ast
43
import json
5-
import re
64
import logging
75
from typing import Annotated, Any, AsyncIterator, Iterator
86

7+
import pydantic
8+
99
from llama_stack_client import APIConnectionError
1010
from llama_stack_client import AsyncLlamaStackClient # type: ignore
1111
from llama_stack_client.types import UserMessage # type: ignore
@@ -27,6 +27,7 @@
2727
from models.responses import ReferencedDocument
2828
from utils.endpoints import check_configuration_loaded, get_agent, get_system_prompt
2929
from utils.mcp_headers import mcp_headers_dependency, handle_mcp_headers_with_toolgroups
30+
from utils.metadata import parse_knowledge_search_metadata
3031

3132
from app.endpoints.query import (
3233
get_rag_toolgroups,
@@ -46,9 +47,6 @@
4647
auth_dependency = get_auth_dependency()
4748

4849

49-
METADATA_PATTERN = re.compile(r"\nMetadata: (\{.+})\n")
50-
51-
5250
def format_stream_data(d: dict) -> str:
5351
"""Format outbound data in the Event Stream Format."""
5452
data = json.dumps(d)
@@ -79,15 +77,24 @@ def stream_end_event(metadata_map: dict) -> str:
7977
lambda v: ("docs_url" in v) and ("title" in v),
8078
metadata_map.values(),
8179
):
82-
doc = ReferencedDocument(doc_url=v["docs_url"], doc_title=v["title"])
83-
referenced_documents.append(
84-
{
85-
"doc_url": str(
86-
doc.doc_url
87-
), # Convert AnyUrl to string for JSON serialization
88-
"doc_title": doc.doc_title,
89-
}
90-
)
80+
try:
81+
doc = ReferencedDocument(doc_url=v["docs_url"], doc_title=v["title"])
82+
referenced_documents.append(
83+
{
84+
"doc_url": str(
85+
doc.doc_url
86+
), # Convert AnyUrl to string for JSON serialization
87+
"doc_title": doc.doc_title,
88+
}
89+
)
90+
except (pydantic.ValidationError, ValueError, Exception) as e:
91+
logger.warning(
92+
"Skipping invalid referenced document with docs_url='%s', title='%s': %s",
93+
v.get("docs_url", "<missing>"),
94+
v.get("title", "<missing>"),
95+
str(e),
96+
)
97+
continue
9198

9299
return format_stream_data(
93100
{
@@ -335,20 +342,20 @@ def _handle_tool_execution_event(
335342
newline_pos = summary.find("\n")
336343
if newline_pos > 0:
337344
summary = summary[:newline_pos]
338-
for match in METADATA_PATTERN.findall(text_content_item.text):
339-
try:
340-
meta = ast.literal_eval(match)
341-
# Verify the result is a dict before accessing keys
342-
if isinstance(meta, dict) and "document_id" in meta:
343-
metadata_map[meta["document_id"]] = meta
344-
except (
345-
SyntaxError,
346-
ValueError,
347-
): # only expected from literal_eval
348-
logger.exception(
349-
"An exception was thrown in processing %s",
350-
match,
351-
)
345+
try:
346+
parsed_metadata = parse_knowledge_search_metadata(
347+
text_content_item.text
348+
)
349+
metadata_map.update(parsed_metadata)
350+
except ValueError:
351+
logger.exception(
352+
"An exception was thrown in processing metadata from text: %s",
353+
(
354+
text_content_item.text[:200] + "..."
355+
if len(text_content_item.text) > 200
356+
else text_content_item.text
357+
),
358+
)
352359

353360
yield format_stream_data(
354361
{

src/utils/metadata.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
"""Shared utilities for parsing metadata from knowledge search responses."""
2+
3+
import ast
4+
import re
5+
from typing import Any
6+
7+
8+
METADATA_PATTERN = re.compile(r"^\s*Metadata:\s*(\{.*?\})\s*$", re.MULTILINE)
9+
10+
11+
def parse_knowledge_search_metadata(text: str) -> dict[str, Any]:
12+
"""Parse metadata from knowledge search text content.
13+
14+
Args:
15+
text: Text content that may contain metadata patterns
16+
17+
Returns:
18+
Dictionary of document_id -> metadata mappings
19+
20+
Raises:
21+
ValueError: If metadata parsing fails due to invalid JSON or syntax
22+
"""
23+
metadata_map: dict[str, Any] = {}
24+
25+
for match in METADATA_PATTERN.findall(text):
26+
try:
27+
meta = ast.literal_eval(match)
28+
# Verify the result is a dict before accessing keys
29+
if isinstance(meta, dict) and "document_id" in meta:
30+
metadata_map[meta["document_id"]] = meta
31+
except (SyntaxError, ValueError) as e:
32+
raise ValueError(f"Failed to parse metadata '{match}': {e}") from e
33+
34+
return metadata_map

tests/unit/app/endpoints/test_query.py

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1888,3 +1888,107 @@ async def test_retrieve_response_with_structured_content_object(
18881888
# Should convert the structured object to string representation
18891889
assert response == str(structured_content)
18901890
assert conversation_id == "fake_conversation_id"
1891+
1892+
1893+
@pytest.mark.asyncio
1894+
async def test_retrieve_response_skips_invalid_docs_url(prepare_agent_mocks, mocker):
1895+
"""Test that retrieve_response skips entries with invalid docs_url."""
1896+
mock_client, mock_agent = prepare_agent_mocks
1897+
mock_agent.create_turn.return_value.output_message.content = "LLM answer"
1898+
mock_client.shields.list.return_value = []
1899+
mock_client.vector_dbs.list.return_value = []
1900+
1901+
# Mock tool response with valid and invalid docs_url entries
1902+
invalid_docs_url_results = [
1903+
"""knowledge_search tool found 2 chunks:
1904+
BEGIN of knowledge_search tool results.
1905+
""",
1906+
"""Result 1
1907+
Content: Valid content
1908+
Metadata: {'docs_url': 'https://example.com/doc1', 'title': 'Valid Doc', 'document_id': 'doc-1'}
1909+
""",
1910+
"""Result 2
1911+
Content: Invalid content
1912+
Metadata: {'docs_url': 'not-a-valid-url', 'title': 'Invalid Doc', 'document_id': 'doc-2'}
1913+
""",
1914+
"""END of knowledge_search tool results.
1915+
""",
1916+
]
1917+
1918+
mock_tool_response = mocker.Mock()
1919+
mock_tool_response.call_id = "c1"
1920+
mock_tool_response.tool_name = "knowledge_search"
1921+
mock_tool_response.content = [
1922+
mocker.Mock(text=s, type="text") for s in invalid_docs_url_results
1923+
]
1924+
1925+
mock_tool_execution_step = mocker.Mock()
1926+
mock_tool_execution_step.step_type = "tool_execution"
1927+
mock_tool_execution_step.tool_responses = [mock_tool_response]
1928+
1929+
mock_agent.create_turn.return_value.steps = [mock_tool_execution_step]
1930+
1931+
# Mock configuration with empty MCP servers
1932+
mock_config = mocker.Mock()
1933+
mock_config.mcp_servers = []
1934+
mocker.patch("app.endpoints.query.configuration", mock_config)
1935+
mocker.patch(
1936+
"app.endpoints.query.get_agent",
1937+
return_value=(mock_agent, "fake_conversation_id", "fake_session_id"),
1938+
)
1939+
1940+
query_request = QueryRequest(query="What is OpenStack?")
1941+
model_id = "fake_model_id"
1942+
access_token = "test_token"
1943+
1944+
response, conversation_id, referenced_documents = await retrieve_response(
1945+
mock_client, model_id, query_request, access_token
1946+
)
1947+
1948+
assert response == "LLM answer"
1949+
assert conversation_id == "fake_conversation_id"
1950+
1951+
# Assert only the valid document is included, invalid one is skipped
1952+
assert len(referenced_documents) == 1
1953+
assert str(referenced_documents[0].doc_url) == "https://example.com/doc1"
1954+
assert referenced_documents[0].doc_title == "Valid Doc"
1955+
1956+
1957+
def test_extract_referenced_documents_from_steps_handles_validation_errors(
    mocker,
):
    """Test that extract_referenced_documents_from_steps handles validation errors gracefully.

    The second chunk carries a docs_url expected to fail pydantic URL
    validation; the extractor should skip it and keep the valid document.
    """
    # NOTE: converted from ``async def`` + ``@pytest.mark.asyncio`` to a plain
    # sync test — the function under test is synchronous and nothing here was
    # awaited, so the asyncio wrapper only added event-loop overhead.

    # Mock tool response with invalid docs_url that will cause pydantic validation error
    mock_tool_response = mocker.Mock()
    mock_tool_response.tool_name = "knowledge_search"
    mock_tool_response.content = [
        mocker.Mock(
            text="""Result 1
Content: Valid content
Metadata: {'docs_url': 'https://example.com/doc1', 'title': 'Valid Doc', 'document_id': 'doc-1'}
"""
        ),
        mocker.Mock(
            text="""Result 2
Content: Invalid content
Metadata: {'docs_url': 'invalid-url', 'title': 'Invalid Doc', 'document_id': 'doc-2'}
"""
        ),
    ]

    mock_tool_execution_step = mocker.Mock()
    mock_tool_execution_step.step_type = "tool_execution"
    mock_tool_execution_step.tool_responses = [mock_tool_response]

    steps = [mock_tool_execution_step]

    # Import the function directly to test it
    from app.endpoints.query import extract_referenced_documents_from_steps

    referenced_documents = extract_referenced_documents_from_steps(steps)

    # Should only return the valid document, skipping the invalid one
    assert len(referenced_documents) == 1
    assert str(referenced_documents[0].doc_url) == "https://example.com/doc1"
    assert referenced_documents[0].doc_title == "Valid Doc"

tests/unit/app/endpoints/test_streaming_query.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1104,6 +1104,44 @@ def test_stream_end_event_with_referenced_documents():
11041104
assert "Test Document 4" not in doc_titles
11051105

11061106

1107+
def test_stream_end_event_skips_invalid_docs_url():
    """Verify stream_end_event drops documents whose docs_url fails validation."""
    docs = {
        "doc-1": {
            "docs_url": "https://example.com/doc1",
            "title": "Valid Document",
            "document_id": "doc-1",
        },
        "doc-2": {
            # Malformed URL — should trigger a ValidationError and be skipped.
            "docs_url": "not-a-valid-url",
            "title": "Invalid Document",
            "document_id": "doc-2",
        },
        "doc-3": {
            # Empty URL — also expected to fail validation.
            "docs_url": "",
            "title": "Empty URL Document",
            "document_id": "doc-3",
        },
    }

    event = stream_end_event(docs)

    # Strip the SSE prefix and decode the JSON payload.
    payload = json.loads(event.replace("data: ", ""))

    assert payload["event"] == "end"
    assert "referenced_documents" in payload["data"]

    surviving = payload["data"]["referenced_documents"]
    # Only the well-formed doc-1 entry should survive.
    assert len(surviving) == 1
    assert surviving[0]["doc_url"] == "https://example.com/doc1"
    assert surviving[0]["doc_title"] == "Valid Document"
1143+
1144+
11071145
def test_stream_build_event_error():
11081146
"""Test stream_build_event function returns a 'error' when chunk contains error information."""
11091147
# Create a mock chunk without an expected payload structure

0 commit comments

Comments
 (0)