From 8aaeee90163ff79514a627685c117975a58ae737 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Fri, 13 Feb 2026 15:02:50 +0100 Subject: [PATCH 1/2] feat: add wait_ms parameter to SmartScraper, Scrape, and Markdownify endpoints Adds a new optional wait_ms parameter that controls how long to wait before scraping the website. This aligns the SDK with the API change in ScrapeGraphAI/sgai-api#399. Co-Authored-By: Claude Opus 4.6 --- scrapegraph-py/scrapegraph_py/async_client.py | 13 +++++-- scrapegraph-py/scrapegraph_py/client.py | 15 +++++--- .../scrapegraph_py/models/markdownify.py | 1 + .../scrapegraph_py/models/scrape.py | 1 + .../scrapegraph_py/models/smartscraper.py | 1 + scrapegraph-py/tests/test_scrape_models.py | 29 +++++++++++++++ .../tests/test_smartscraper_models.py | 36 +++++++++++++++++++ 7 files changed, 89 insertions(+), 7 deletions(-) diff --git a/scrapegraph-py/scrapegraph_py/async_client.py b/scrapegraph-py/scrapegraph_py/async_client.py index 61ab22e..d2d5166 100644 --- a/scrapegraph-py/scrapegraph_py/async_client.py +++ b/scrapegraph-py/scrapegraph_py/async_client.py @@ -445,16 +445,17 @@ def new_id(prefix: str) -> str: return {"status": "mock", "url": url, "method": method, "kwargs": kwargs} async def markdownify( - self, website_url: str, headers: Optional[dict[str, str]] = None, mock: bool = False, render_heavy_js: bool = False, stealth: bool = False, return_toon: bool = False + self, website_url: str, headers: Optional[dict[str, str]] = None, mock: bool = False, render_heavy_js: bool = False, stealth: bool = False, wait_ms: Optional[int] = None, return_toon: bool = False ): """Send a markdownify request - + Args: website_url: The URL to convert to markdown headers: Optional HTTP headers mock: Enable mock mode for testing render_heavy_js: Enable heavy JavaScript rendering stealth: Enable stealth mode to avoid bot detection + wait_ms: Number of milliseconds to wait before scraping the website return_toon: If True, return response in TOON format (reduces token usage by 30-60%) """ logger.info(f"🔍 Starting markdownify request for {website_url}") @@ -467,7 +468,7 @@ async def markdownify( if return_toon: logger.debug("🎨 TOON format output enabled") - request = MarkdownifyRequest(website_url=website_url, headers=headers, mock=mock, render_heavy_js=render_heavy_js, stealth=stealth) + request = MarkdownifyRequest(website_url=website_url, headers=headers, mock=mock, render_heavy_js=render_heavy_js, stealth=stealth, wait_ms=wait_ms) logger.debug("✅ Request validation passed") result = await self._make_request( @@ -504,6 +505,7 @@ async def scrape( branding: bool = False, headers: Optional[dict[str, str]] = None, stealth: bool = False, + wait_ms: Optional[int] = None, return_toon: bool = False, ): """Send a scrape request to get HTML content from a website @@ -514,6 +516,7 @@ async def scrape( branding: Whether to include branding in the response (defaults to False) headers: Optional headers to send with the request stealth: Enable stealth mode to avoid bot detection + wait_ms: Number of milliseconds to wait before scraping the website return_toon: If True, return response in TOON format (reduces token usage by 30-60%) """ logger.info(f"🔍 Starting scrape request for {website_url}") @@ -532,6 +535,7 @@ async def scrape( branding=branding, headers=headers, stealth=stealth, + wait_ms=wait_ms, ) logger.debug("✅ Request validation passed") @@ -619,6 +623,7 @@ async def smartscraper( plain_text: bool = False, render_heavy_js: bool = False, stealth: bool = False, + wait_ms: Optional[int] = None, return_toon: bool = False, ): """ @@ -643,6 +648,7 @@ async def smartscraper( plain_text: Return plain text instead of structured data render_heavy_js: Enable heavy JavaScript rendering stealth: Enable stealth mode to avoid bot detection + wait_ms: Number of milliseconds to wait before scraping the website return_toon: If True, return response in TOON format (reduces token usage by 30-60%) Returns: @@ -689,6 +695,7 @@ async def smartscraper( plain_text=plain_text, render_heavy_js=render_heavy_js, stealth=stealth, + wait_ms=wait_ms, ) logger.debug("✅ Request validation passed") diff --git a/scrapegraph-py/scrapegraph_py/client.py b/scrapegraph-py/scrapegraph_py/client.py index 11e8f78..1333342 100644 --- a/scrapegraph-py/scrapegraph_py/client.py +++ b/scrapegraph-py/scrapegraph_py/client.py @@ -458,15 +458,16 @@ def new_id(prefix: str) -> str: # Generic fallback return {"status": "mock", "url": url, "method": method, "kwargs": kwargs} - def markdownify(self, website_url: str, headers: Optional[dict[str, str]] = None, mock: bool = False, render_heavy_js: bool = False, stealth: bool = False, return_toon: bool = False): + def markdownify(self, website_url: str, headers: Optional[dict[str, str]] = None, mock: bool = False, render_heavy_js: bool = False, stealth: bool = False, wait_ms: Optional[int] = None, return_toon: bool = False): """Send a markdownify request - + Args: website_url: The URL to convert to markdown headers: Optional HTTP headers mock: Enable mock mode for testing render_heavy_js: Enable heavy JavaScript rendering stealth: Enable stealth mode to avoid bot detection + wait_ms: Number of milliseconds to wait before scraping the website return_toon: If True, return response in TOON format (reduces token usage by 30-60%) """ logger.info(f"🔍 Starting markdownify request for {website_url}") @@ -479,7 +480,7 @@ def markdownify(self, website_url: str, headers: Optional[dict[str, str]] = None if return_toon: logger.debug("🎨 TOON format output enabled") - request = MarkdownifyRequest(website_url=website_url, headers=headers, mock=mock, render_heavy_js=render_heavy_js, stealth=stealth) + request = MarkdownifyRequest(website_url=website_url, headers=headers, mock=mock, render_heavy_js=render_heavy_js, stealth=stealth, wait_ms=wait_ms) logger.debug("✅ Request validation passed") result = self._make_request( @@ -515,6 +516,7 @@ def scrape( headers: Optional[dict[str, str]] = None, mock:bool=False, stealth:bool=False, + wait_ms: Optional[int] = None, return_toon: bool = False, ): """Send a scrape request to get HTML content from a website @@ -526,6 +528,7 @@ def scrape( headers: Optional headers to send with the request mock: Enable mock mode for testing stealth: Enable stealth mode to avoid bot detection + wait_ms: Number of milliseconds to wait before scraping the website return_toon: If True, return response in TOON format (reduces token usage by 30-60%) """ logger.info(f"🔍 Starting scrape request for {website_url}") @@ -544,7 +547,8 @@ def scrape( branding=branding, headers=headers, mock=mock, - stealth=stealth + stealth=stealth, + wait_ms=wait_ms, ) logger.debug("✅ Request validation passed") @@ -631,6 +635,7 @@ def smartscraper( plain_text: bool = False, render_heavy_js: bool = False, stealth: bool = False, + wait_ms: Optional[int] = None, return_toon: bool = False, ): """ @@ -655,6 +660,7 @@ def smartscraper( plain_text: Return plain text instead of structured data render_heavy_js: Enable heavy JavaScript rendering stealth: Enable stealth mode to avoid bot detection + wait_ms: Number of milliseconds to wait before scraping the website return_toon: If True, return response in TOON format (reduces token usage by 30-60%) Returns: @@ -701,6 +707,7 @@ def smartscraper( plain_text=plain_text, render_heavy_js=render_heavy_js, stealth=stealth, + wait_ms=wait_ms, ) logger.debug("✅ Request validation passed") diff --git a/scrapegraph-py/scrapegraph_py/models/markdownify.py b/scrapegraph-py/scrapegraph_py/models/markdownify.py index 31fbf90..0b95903 100644 --- a/scrapegraph-py/scrapegraph_py/models/markdownify.py +++ b/scrapegraph-py/scrapegraph_py/models/markdownify.py @@ -46,6 +46,7 @@ class MarkdownifyRequest(BaseModel): mock: bool = Field(default=False, description="Whether to use mock mode for the request") render_heavy_js: bool = Field(default=False, description="Whether to render heavy JavaScript on the page") stealth: bool = Field(default=False, description="Enable stealth mode to avoid bot detection") + wait_ms: Optional[int] = Field(default=None, description="The number of milliseconds to wait before scraping the website") @model_validator(mode="after") def validate_url(self) -> "MarkdownifyRequest": diff --git a/scrapegraph-py/scrapegraph_py/models/scrape.py b/scrapegraph-py/scrapegraph_py/models/scrape.py index a66fd1a..a8949e1 100644 --- a/scrapegraph-py/scrapegraph_py/models/scrape.py +++ b/scrapegraph-py/scrapegraph_py/models/scrape.py @@ -57,6 +57,7 @@ class ScrapeRequest(BaseModel): ), mock: bool = Field(default=False, description="Whether to use mock mode for the request") stealth: bool = Field(default=False, description="Enable stealth mode to avoid bot detection") + wait_ms: Optional[int] = Field(default=None, description="The number of milliseconds to wait before scraping the website") @model_validator(mode="after") def validate_url(self) -> "ScrapeRequest": diff --git a/scrapegraph-py/scrapegraph_py/models/smartscraper.py b/scrapegraph-py/scrapegraph_py/models/smartscraper.py index 4bd12c0..e68b2d8 100644 --- a/scrapegraph-py/scrapegraph_py/models/smartscraper.py +++ b/scrapegraph-py/scrapegraph_py/models/smartscraper.py @@ -100,6 +100,7 @@ class SmartScraperRequest(BaseModel): plain_text: bool = Field(default=False, description="Whether to return the result as plain text") render_heavy_js: bool = Field(default=False, description="Whether to render heavy JavaScript on the page") stealth: bool = Field(default=False, description="Enable stealth mode to avoid bot detection") + wait_ms: Optional[int] = Field(default=None, description="The number of milliseconds to wait before scraping the website") @model_validator(mode="after") def validate_user_prompt(self) -> "SmartScraperRequest": diff --git a/scrapegraph-py/tests/test_scrape_models.py b/scrapegraph-py/tests/test_scrape_models.py index beb2188..0112d06 100644 --- a/scrapegraph-py/tests/test_scrape_models.py +++ b/scrapegraph-py/tests/test_scrape_models.py @@ -130,6 +130,35 @@ def test_url_validation_edge_cases(self): ScrapeRequest(website_url=url) + def test_wait_ms_default(self): + """Test scrape request wait_ms defaults to None""" + request = ScrapeRequest(website_url="https://example.com") + assert request.wait_ms is None + + def test_wait_ms_custom_value(self): + """Test scrape request with custom wait_ms""" + request = ScrapeRequest( + website_url="https://example.com", + wait_ms=5000, + ) + assert request.wait_ms == 5000 + + def test_wait_ms_serialization(self): + """Test wait_ms is excluded from serialization when None""" + request = ScrapeRequest(website_url="https://example.com") + data = request.model_dump() + assert "wait_ms" not in data + + def test_wait_ms_serialization_with_value(self): + """Test wait_ms is included in serialization when set""" + request = ScrapeRequest( + website_url="https://example.com", + wait_ms=5000, + ) + data = request.model_dump() + assert data["wait_ms"] == 5000 + + class TestGetScrapeRequest: """Test GetScrapeRequest model""" diff --git a/scrapegraph-py/tests/test_smartscraper_models.py b/scrapegraph-py/tests/test_smartscraper_models.py index 3fe3248..5923f74 100644 --- a/scrapegraph-py/tests/test_smartscraper_models.py +++ b/scrapegraph-py/tests/test_smartscraper_models.py @@ -95,6 +95,42 @@ def test_serialization_include_all(self): data = request.model_dump(exclude_none=False) assert data["render_heavy_js"] is False + def test_wait_ms_default(self): + """Test smartscraper request wait_ms defaults to None""" + request = SmartScraperRequest( + user_prompt="Extract data", + website_url="https://example.com", + ) + assert request.wait_ms is None + + def test_wait_ms_custom_value(self): + """Test smartscraper request with custom wait_ms""" + request = SmartScraperRequest( + user_prompt="Extract data", + website_url="https://example.com", + wait_ms=5000, + ) + assert request.wait_ms == 5000 + + def test_wait_ms_serialization(self): + """Test wait_ms is excluded from serialization when None""" + request = SmartScraperRequest( + user_prompt="Extract data", + website_url="https://example.com", + ) + data = request.model_dump() + assert "wait_ms" not in data + + def test_wait_ms_serialization_with_value(self): + """Test wait_ms is included in serialization when set""" + request = SmartScraperRequest( + user_prompt="Extract data", + website_url="https://example.com", + wait_ms=5000, + ) + data = request.model_dump() + assert data["wait_ms"] == 5000 + def test_invalid_empty_prompt(self): """Test smartscraper request with empty prompt""" with pytest.raises(ValidationError): From a196d8de2ba7c46fec1b3a1efdec3174906e8b9c Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Fri, 13 Feb 2026 18:04:13 +0100 Subject: [PATCH 2/2] fix: remove trailing comma on headers field in ScrapeRequest model The trailing comma turned the Field assignment into a tuple, breaking model_dump() serialization (headers always present even when None). Co-Authored-By: Claude Opus 4.6 --- scrapegraph-py/scrapegraph_py/models/scrape.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapegraph-py/scrapegraph_py/models/scrape.py b/scrapegraph-py/scrapegraph_py/models/scrape.py index a8949e1..a080957 100644 --- a/scrapegraph-py/scrapegraph_py/models/scrape.py +++ b/scrapegraph-py/scrapegraph_py/models/scrape.py @@ -54,7 +54,7 @@ class ScrapeRequest(BaseModel): }, description="Optional headers to send with the request, including cookies " "and user agent", - ), + ) mock: bool = Field(default=False, description="Whether to use mock mode for the request") stealth: bool = Field(default=False, description="Enable stealth mode to avoid bot detection") wait_ms: Optional[int] = Field(default=None, description="The number of milliseconds to wait before scraping the website")