Commit 84ba517

test: Add coverage improvement test for scrapegraph-py/tests/test_localscraper.py
1 parent 49561da commit 84ba517

File tree

scrapegraph-py/tests/test_localscraper.py

1 file changed: 90 additions & 0 deletions
@@ -0,0 +1,90 @@
import pytest
from pydantic import BaseModel

from scrapegraph_py.models.localscraper import LocalScraperRequest, GetLocalScraperRequest


# Create a dummy output schema to test the conversion in model_dump.
class DummySchema(BaseModel):
    test_field: str


def test_output_schema_conversion():
    """
    Test that when an output_schema is provided in a LocalScraperRequest,
    model_dump returns a dictionary where the output_schema key holds the JSON schema
    of the provided Pydantic model.
    """
    user_prompt = "Extract company details"
    website_html = "<html><body><div>Content</div></body></html>"
    # Create a LocalScraperRequest with a dummy output_schema.
    request = LocalScraperRequest(user_prompt=user_prompt, website_html=website_html, output_schema=DummySchema)
    dumped = request.model_dump()
    # Verify that output_schema is converted properly in the dumped dictionary.
    assert "output_schema" in dumped
    assert dumped["output_schema"] == DummySchema.model_json_schema()


def test_invalid_website_html_structure():
    """
    Test that LocalScraperRequest raises a ValueError when the website_html provided
    has no parseable HTML tags. This ensures the HTML content validation catches
    non-HTML input.
    """
    # This string has no HTML tags, so BeautifulSoup.find() should return None.
    invalid_html = "Just some random text"
    with pytest.raises(ValueError, match="Invalid HTML - no parseable content found"):
        LocalScraperRequest(user_prompt="Extract info about the company", website_html=invalid_html)


def test_invalid_user_prompt_non_alnum():
    """
    Test that LocalScraperRequest raises a ValueError when the user_prompt
    does not contain any alphanumeric characters.
    """
    with pytest.raises(ValueError, match="User prompt must contain a valid prompt"):
        LocalScraperRequest(
            user_prompt="!!!",
            website_html="<html><body><div>Valid Content</div></body></html>",
        )


def test_get_localscraper_request_invalid_uuid():
    """
    Test that GetLocalScraperRequest raises a ValueError when an invalid UUID is provided.
    This ensures that the model correctly validates the request_id as a proper UUID.
    """
    invalid_uuid = "not-a-valid-uuid"
    with pytest.raises(ValueError, match="request_id must be a valid UUID"):
        GetLocalScraperRequest(request_id=invalid_uuid)


def test_website_html_exceeds_maximum_size():
    """
    Test that LocalScraperRequest raises a ValueError when the website_html content
    exceeds the maximum allowed size of 2MB. The generated HTML is valid but too large.
    """
    # Calculate the number of characters needed to exceed 2MB when encoded in UTF-8.
    max_size_bytes = 2 * 1024 * 1024
    # Create a valid HTML string that exceeds 2MB by one byte.
    base_html_prefix = "<html><body>"
    base_html_suffix = "</body></html>"
    repeated_char_length = max_size_bytes - len(base_html_prefix.encode("utf-8")) - len(base_html_suffix.encode("utf-8")) + 1
    oversized_content = "a" * repeated_char_length
    oversized_html = f"{base_html_prefix}{oversized_content}{base_html_suffix}"

    with pytest.raises(ValueError, match="Website HTML content exceeds maximum size of 2MB"):
        LocalScraperRequest(user_prompt="Extract info", website_html=oversized_html)


def test_website_html_exactly_maximum_size():
    """
    Test that LocalScraperRequest accepts website_html content exactly 2MB in size.
    This ensures that the size validation correctly allows content on the boundary.
    """
    user_prompt = "Extract info with exact size HTML"
    prefix = "<html><body>"
    suffix = "</body></html>"
    # Calculate the content length needed to reach exactly 2MB when combined with prefix and suffix.
    max_size_bytes = 2 * 1024 * 1024
    content_length = max_size_bytes - len(prefix.encode("utf-8")) - len(suffix.encode("utf-8"))
    valid_content = "a" * content_length
    html = prefix + valid_content + suffix

    # Attempt to create a valid LocalScraperRequest.
    request = LocalScraperRequest(user_prompt=user_prompt, website_html=html)

    # Verify that the HTML content is exactly 2MB in size when encoded in UTF-8.
    assert len(request.website_html.encode("utf-8")) == max_size_bytes
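
For context, here is a minimal sketch of the kind of pydantic v2 validators these tests presuppose on LocalScraperRequest and GetLocalScraperRequest. The field names and error messages mirror the assertions above; the validator bodies themselves are illustrative assumptions, not the library's actual implementation.

# A minimal sketch (not the library's actual code) of validators consistent
# with the tests above. Everything beyond the field names and error messages
# is hypothetical.
from typing import Optional, Type
from uuid import UUID

from bs4 import BeautifulSoup
from pydantic import BaseModel, field_validator


class LocalScraperRequest(BaseModel):
    user_prompt: str
    website_html: str
    output_schema: Optional[Type[BaseModel]] = None

    @field_validator("user_prompt")
    @classmethod
    def validate_user_prompt(cls, v: str) -> str:
        # Reject prompts with no alphanumeric characters (e.g. "!!!").
        if not any(c.isalnum() for c in v):
            raise ValueError("User prompt must contain a valid prompt")
        return v

    @field_validator("website_html")
    @classmethod
    def validate_website_html(cls, v: str) -> str:
        # Enforce the 2MB limit on the UTF-8 encoded payload; exactly 2MB passes.
        if len(v.encode("utf-8")) > 2 * 1024 * 1024:
            raise ValueError("Website HTML content exceeds maximum size of 2MB")
        # Require at least one parseable tag; plain text makes find() return None.
        if BeautifulSoup(v, "html.parser").find() is None:
            raise ValueError("Invalid HTML - no parseable content found")
        return v

    def model_dump(self, **kwargs) -> dict:
        # Replace the output_schema class with its JSON schema in the dump.
        data = super().model_dump(**kwargs)
        if self.output_schema is not None:
            data["output_schema"] = self.output_schema.model_json_schema()
        return data


class GetLocalScraperRequest(BaseModel):
    request_id: str

    @field_validator("request_id")
    @classmethod
    def validate_request_id(cls, v: str) -> str:
        # UUID() raises ValueError for malformed ids like "not-a-valid-uuid".
        try:
            UUID(v)
        except ValueError:
            raise ValueError("request_id must be a valid UUID")
        return v

Under these assumptions the suite above passes (pydantic's ValidationError subclasses ValueError, so pytest.raises(ValueError, ...) matches); the file can be run with, e.g., pytest scrapegraph-py/tests/test_localscraper.py from the repository root.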
