Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 10 additions & 3 deletions scrapegraph-py/scrapegraph_py/async_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,16 +445,17 @@ def new_id(prefix: str) -> str:
return {"status": "mock", "url": url, "method": method, "kwargs": kwargs}

async def markdownify(
self, website_url: str, headers: Optional[dict[str, str]] = None, mock: bool = False, render_heavy_js: bool = False, stealth: bool = False, return_toon: bool = False
self, website_url: str, headers: Optional[dict[str, str]] = None, mock: bool = False, render_heavy_js: bool = False, stealth: bool = False, wait_ms: Optional[int] = None, return_toon: bool = False
):
"""Send a markdownify request

Args:
website_url: The URL to convert to markdown
headers: Optional HTTP headers
mock: Enable mock mode for testing
render_heavy_js: Enable heavy JavaScript rendering
stealth: Enable stealth mode to avoid bot detection
wait_ms: Number of milliseconds to wait before scraping the website
return_toon: If True, return response in TOON format (reduces token usage by 30-60%)
"""
logger.info(f"🔍 Starting markdownify request for {website_url}")
Expand All @@ -467,7 +468,7 @@ async def markdownify(
if return_toon:
logger.debug("🎨 TOON format output enabled")

request = MarkdownifyRequest(website_url=website_url, headers=headers, mock=mock, render_heavy_js=render_heavy_js, stealth=stealth)
request = MarkdownifyRequest(website_url=website_url, headers=headers, mock=mock, render_heavy_js=render_heavy_js, stealth=stealth, wait_ms=wait_ms)
logger.debug("✅ Request validation passed")

result = await self._make_request(
Expand Down Expand Up @@ -504,6 +505,7 @@ async def scrape(
branding: bool = False,
headers: Optional[dict[str, str]] = None,
stealth: bool = False,
wait_ms: Optional[int] = None,
return_toon: bool = False,
):
"""Send a scrape request to get HTML content from a website
Expand All @@ -514,6 +516,7 @@ async def scrape(
branding: Whether to include branding in the response (defaults to False)
headers: Optional headers to send with the request
stealth: Enable stealth mode to avoid bot detection
wait_ms: Number of milliseconds to wait before scraping the website
return_toon: If True, return response in TOON format (reduces token usage by 30-60%)
"""
logger.info(f"🔍 Starting scrape request for {website_url}")
Expand All @@ -532,6 +535,7 @@ async def scrape(
branding=branding,
headers=headers,
stealth=stealth,
wait_ms=wait_ms,
)
logger.debug("✅ Request validation passed")

Expand Down Expand Up @@ -619,6 +623,7 @@ async def smartscraper(
plain_text: bool = False,
render_heavy_js: bool = False,
stealth: bool = False,
wait_ms: Optional[int] = None,
return_toon: bool = False,
):
"""
Expand All @@ -643,6 +648,7 @@ async def smartscraper(
plain_text: Return plain text instead of structured data
render_heavy_js: Enable heavy JavaScript rendering
stealth: Enable stealth mode to avoid bot detection
wait_ms: Number of milliseconds to wait before scraping the website
return_toon: If True, return response in TOON format (reduces token usage by 30-60%)

Returns:
Expand Down Expand Up @@ -689,6 +695,7 @@ async def smartscraper(
plain_text=plain_text,
render_heavy_js=render_heavy_js,
stealth=stealth,
wait_ms=wait_ms,
)

logger.debug("✅ Request validation passed")
Expand Down
15 changes: 11 additions & 4 deletions scrapegraph-py/scrapegraph_py/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -458,15 +458,16 @@ def new_id(prefix: str) -> str:
# Generic fallback
return {"status": "mock", "url": url, "method": method, "kwargs": kwargs}

def markdownify(self, website_url: str, headers: Optional[dict[str, str]] = None, mock: bool = False, render_heavy_js: bool = False, stealth: bool = False, return_toon: bool = False):
def markdownify(self, website_url: str, headers: Optional[dict[str, str]] = None, mock: bool = False, render_heavy_js: bool = False, stealth: bool = False, wait_ms: Optional[int] = None, return_toon: bool = False):
"""Send a markdownify request

Args:
website_url: The URL to convert to markdown
headers: Optional HTTP headers
mock: Enable mock mode for testing
render_heavy_js: Enable heavy JavaScript rendering
stealth: Enable stealth mode to avoid bot detection
wait_ms: Number of milliseconds to wait before scraping the website
return_toon: If True, return response in TOON format (reduces token usage by 30-60%)
"""
logger.info(f"🔍 Starting markdownify request for {website_url}")
Expand All @@ -479,7 +480,7 @@ def markdownify(self, website_url: str, headers: Optional[dict[str, str]] = None
if return_toon:
logger.debug("🎨 TOON format output enabled")

request = MarkdownifyRequest(website_url=website_url, headers=headers, mock=mock, render_heavy_js=render_heavy_js, stealth=stealth)
request = MarkdownifyRequest(website_url=website_url, headers=headers, mock=mock, render_heavy_js=render_heavy_js, stealth=stealth, wait_ms=wait_ms)
logger.debug("✅ Request validation passed")

result = self._make_request(
Expand Down Expand Up @@ -515,6 +516,7 @@ def scrape(
headers: Optional[dict[str, str]] = None,
mock:bool=False,
stealth:bool=False,
wait_ms: Optional[int] = None,
return_toon: bool = False,
):
"""Send a scrape request to get HTML content from a website
Expand All @@ -526,6 +528,7 @@ def scrape(
headers: Optional headers to send with the request
mock: Enable mock mode for testing
stealth: Enable stealth mode to avoid bot detection
wait_ms: Number of milliseconds to wait before scraping the website
return_toon: If True, return response in TOON format (reduces token usage by 30-60%)
"""
logger.info(f"🔍 Starting scrape request for {website_url}")
Expand All @@ -544,7 +547,8 @@ def scrape(
branding=branding,
headers=headers,
mock=mock,
stealth=stealth
stealth=stealth,
wait_ms=wait_ms,
)
logger.debug("✅ Request validation passed")

Expand Down Expand Up @@ -631,6 +635,7 @@ def smartscraper(
plain_text: bool = False,
render_heavy_js: bool = False,
stealth: bool = False,
wait_ms: Optional[int] = None,
return_toon: bool = False,
):
"""
Expand All @@ -655,6 +660,7 @@ def smartscraper(
plain_text: Return plain text instead of structured data
render_heavy_js: Enable heavy JavaScript rendering
stealth: Enable stealth mode to avoid bot detection
wait_ms: Number of milliseconds to wait before scraping the website
return_toon: If True, return response in TOON format (reduces token usage by 30-60%)

Returns:
Expand Down Expand Up @@ -701,6 +707,7 @@ def smartscraper(
plain_text=plain_text,
render_heavy_js=render_heavy_js,
stealth=stealth,
wait_ms=wait_ms,
)
logger.debug("✅ Request validation passed")

Expand Down
1 change: 1 addition & 0 deletions scrapegraph-py/scrapegraph_py/models/markdownify.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ class MarkdownifyRequest(BaseModel):
mock: bool = Field(default=False, description="Whether to use mock mode for the request")
render_heavy_js: bool = Field(default=False, description="Whether to render heavy JavaScript on the page")
stealth: bool = Field(default=False, description="Enable stealth mode to avoid bot detection")
wait_ms: Optional[int] = Field(default=None, description="The number of milliseconds to wait before scraping the website")

@model_validator(mode="after")
def validate_url(self) -> "MarkdownifyRequest":
Expand Down
3 changes: 2 additions & 1 deletion scrapegraph-py/scrapegraph_py/models/scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,10 @@ class ScrapeRequest(BaseModel):
},
description="Optional headers to send with the request, including cookies "
"and user agent",
),
)
mock: bool = Field(default=False, description="Whether to use mock mode for the request")
stealth: bool = Field(default=False, description="Enable stealth mode to avoid bot detection")
wait_ms: Optional[int] = Field(default=None, description="The number of milliseconds to wait before scraping the website")

@model_validator(mode="after")
def validate_url(self) -> "ScrapeRequest":
Expand Down
1 change: 1 addition & 0 deletions scrapegraph-py/scrapegraph_py/models/smartscraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ class SmartScraperRequest(BaseModel):
plain_text: bool = Field(default=False, description="Whether to return the result as plain text")
render_heavy_js: bool = Field(default=False, description="Whether to render heavy JavaScript on the page")
stealth: bool = Field(default=False, description="Enable stealth mode to avoid bot detection")
wait_ms: Optional[int] = Field(default=None, description="The number of milliseconds to wait before scraping the website")

@model_validator(mode="after")
def validate_user_prompt(self) -> "SmartScraperRequest":
Expand Down
29 changes: 29 additions & 0 deletions scrapegraph-py/tests/test_scrape_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,35 @@ def test_url_validation_edge_cases(self):
ScrapeRequest(website_url=url)


def test_wait_ms_default(self):
    """wait_ms should default to None when not supplied."""
    req = ScrapeRequest(website_url="https://example.com")
    assert req.wait_ms is None

def test_wait_ms_custom_value(self):
    """An explicitly supplied wait_ms value should be stored unchanged."""
    req = ScrapeRequest(website_url="https://example.com", wait_ms=5000)
    assert req.wait_ms == 5000

def test_wait_ms_serialization(self):
    """A None wait_ms should be omitted from the serialized payload."""
    dumped = ScrapeRequest(website_url="https://example.com").model_dump()
    assert "wait_ms" not in dumped

def test_wait_ms_serialization_with_value(self):
    """A set wait_ms should appear in the serialized payload."""
    req = ScrapeRequest(website_url="https://example.com", wait_ms=5000)
    assert req.model_dump()["wait_ms"] == 5000


class TestGetScrapeRequest:
"""Test GetScrapeRequest model"""

Expand Down
36 changes: 36 additions & 0 deletions scrapegraph-py/tests/test_smartscraper_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,42 @@ def test_serialization_include_all(self):
data = request.model_dump(exclude_none=False)
assert data["render_heavy_js"] is False

def test_wait_ms_default(self):
    """wait_ms should default to None when not supplied."""
    req = SmartScraperRequest(
        user_prompt="Extract data",
        website_url="https://example.com",
    )
    assert req.wait_ms is None

def test_wait_ms_custom_value(self):
    """An explicitly supplied wait_ms value should be stored unchanged."""
    req = SmartScraperRequest(
        user_prompt="Extract data",
        website_url="https://example.com",
        wait_ms=5000,
    )
    assert req.wait_ms == 5000

def test_wait_ms_serialization(self):
    """A None wait_ms should be omitted from the serialized payload."""
    dumped = SmartScraperRequest(
        user_prompt="Extract data",
        website_url="https://example.com",
    ).model_dump()
    assert "wait_ms" not in dumped

def test_wait_ms_serialization_with_value(self):
    """A set wait_ms should appear in the serialized payload."""
    req = SmartScraperRequest(
        user_prompt="Extract data",
        website_url="https://example.com",
        wait_ms=5000,
    )
    assert req.model_dump()["wait_ms"] == 5000

def test_invalid_empty_prompt(self):
"""Test smartscraper request with empty prompt"""
with pytest.raises(ValidationError):
Expand Down
Loading