From 9d4ff1628208da4a60f938a26974e965fb34fbc8 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Mon, 30 Mar 2026 08:40:06 -0700 Subject: [PATCH 1/6] feat!: migrate python SDK to v2 API surface Port the Python SDK to the new v2 API surface, mirroring scrapegraph-js PR #11. Breaking changes: - smartscraper -> extract (POST /api/v1/extract) - searchscraper -> search (POST /api/v1/search) - scrape now uses format-specific config (markdown/html/screenshot/branding) - crawl/monitor are now namespaced: client.crawl.start(), client.monitor.create() - Removed: markdownify, agenticscraper, sitemap, healthz, feedback, scheduled jobs - Auth: sends both Authorization: Bearer and SGAI-APIKEY headers - Added X-SDK-Version header, base_url parameter for custom endpoints - Version bumped to 2.0.0 Tested against dev API (https://sgai-api-dev-v2.onrender.com/api/v1/scrape): - Scrape markdown: returns markdown content successfully - Scrape html: returns content successfully - All 72 unit tests pass with 81% coverage Co-Authored-By: Claude Opus 4.6 (1M context) --- scrapegraph-py/pyproject.toml | 2 +- scrapegraph-py/scrapegraph_py/__init__.py | 159 +- scrapegraph-py/scrapegraph_py/async_client.py | 1492 ++++------------ scrapegraph-py/scrapegraph_py/client.py | 1540 ++++------------- scrapegraph-py/scrapegraph_py/config.py | 14 +- .../scrapegraph_py/models/__init__.py | 78 +- .../scrapegraph_py/models/agenticscraper.py | 148 -- scrapegraph-py/scrapegraph_py/models/crawl.py | 230 +-- .../scrapegraph_py/models/extract.py | 47 + .../scrapegraph_py/models/feedback.py | 32 - .../scrapegraph_py/models/history.py | 30 + .../scrapegraph_py/models/markdownify.py | 80 - .../scrapegraph_py/models/monitor.py | 54 + .../scrapegraph_py/models/scheduled_jobs.py | 151 -- .../scrapegraph_py/models/schema.py | 111 +- .../scrapegraph_py/models/scrape.py | 141 +- .../scrapegraph_py/models/search.py | 37 + .../scrapegraph_py/models/searchscraper.py | 142 -- .../scrapegraph_py/models/shared.py | 66 + 
.../scrapegraph_py/models/sitemap.py | 192 -- .../scrapegraph_py/models/smartscraper.py | 186 -- .../scrapegraph_py/utils/helpers.py | 50 +- scrapegraph-py/tests/test_async_client.py | 863 ++------- .../tests/test_async_scheduled_jobs.py | 293 ---- scrapegraph-py/tests/test_client.py | 860 +++------ .../tests/test_crawl_path_filtering.py | 116 -- scrapegraph-py/tests/test_crawl_polling.py | 459 ----- scrapegraph-py/tests/test_healthz_mock.py | 135 -- scrapegraph-py/tests/test_integration_v2.py | 56 + .../tests/test_mock_async_client.py | 345 ---- scrapegraph-py/tests/test_mock_client.py | 276 --- scrapegraph-py/tests/test_mocked_apis.py | 484 ------ scrapegraph-py/tests/test_models.py | 225 +++ scrapegraph-py/tests/test_scheduled_jobs.py | 281 --- .../tests/test_schema_generation.py | 529 ------ scrapegraph-py/tests/test_schema_models.py | 453 ----- .../tests/test_scrape_comprehensive.py | 372 ---- scrapegraph-py/tests/test_scrape_models.py | 213 --- scrapegraph-py/tests/test_sitemap_models.py | 210 --- .../tests/test_smartscraper_models.py | 327 ---- scrapegraph-py/tests/test_stealth_mode.py | 469 ----- scrapegraph-py/tests/utils.py | 9 +- 42 files changed, 1820 insertions(+), 10137 deletions(-) delete mode 100644 scrapegraph-py/scrapegraph_py/models/agenticscraper.py create mode 100644 scrapegraph-py/scrapegraph_py/models/extract.py delete mode 100644 scrapegraph-py/scrapegraph_py/models/feedback.py create mode 100644 scrapegraph-py/scrapegraph_py/models/history.py delete mode 100644 scrapegraph-py/scrapegraph_py/models/markdownify.py create mode 100644 scrapegraph-py/scrapegraph_py/models/monitor.py delete mode 100644 scrapegraph-py/scrapegraph_py/models/scheduled_jobs.py create mode 100644 scrapegraph-py/scrapegraph_py/models/search.py delete mode 100644 scrapegraph-py/scrapegraph_py/models/searchscraper.py create mode 100644 scrapegraph-py/scrapegraph_py/models/shared.py delete mode 100644 scrapegraph-py/scrapegraph_py/models/sitemap.py delete mode 100644 
scrapegraph-py/scrapegraph_py/models/smartscraper.py delete mode 100644 scrapegraph-py/tests/test_async_scheduled_jobs.py delete mode 100644 scrapegraph-py/tests/test_crawl_path_filtering.py delete mode 100644 scrapegraph-py/tests/test_crawl_polling.py delete mode 100644 scrapegraph-py/tests/test_healthz_mock.py create mode 100644 scrapegraph-py/tests/test_integration_v2.py delete mode 100644 scrapegraph-py/tests/test_mock_async_client.py delete mode 100644 scrapegraph-py/tests/test_mock_client.py delete mode 100644 scrapegraph-py/tests/test_mocked_apis.py create mode 100644 scrapegraph-py/tests/test_models.py delete mode 100644 scrapegraph-py/tests/test_scheduled_jobs.py delete mode 100644 scrapegraph-py/tests/test_schema_generation.py delete mode 100644 scrapegraph-py/tests/test_schema_models.py delete mode 100644 scrapegraph-py/tests/test_scrape_comprehensive.py delete mode 100644 scrapegraph-py/tests/test_scrape_models.py delete mode 100644 scrapegraph-py/tests/test_sitemap_models.py delete mode 100644 scrapegraph-py/tests/test_smartscraper_models.py delete mode 100644 scrapegraph-py/tests/test_stealth_mode.py diff --git a/scrapegraph-py/pyproject.toml b/scrapegraph-py/pyproject.toml index 95ec2db..3ef69ed 100644 --- a/scrapegraph-py/pyproject.toml +++ b/scrapegraph-py/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "scrapegraph_py" -version = "1.12.2" +version = "2.0.0" description = "ScrapeGraph Python SDK for API" authors = [ { name = "Marco Vinciguerra", email = "marco@scrapegraphai.com" }, diff --git a/scrapegraph-py/scrapegraph_py/__init__.py b/scrapegraph-py/scrapegraph_py/__init__.py index 588effe..b559129 100644 --- a/scrapegraph-py/scrapegraph_py/__init__.py +++ b/scrapegraph-py/scrapegraph_py/__init__.py @@ -1,97 +1,62 @@ -""" -ScrapeGraphAI Python SDK - -A comprehensive Python SDK for the ScrapeGraphAI API, providing both synchronous -and asynchronous clients for all API endpoints. 
- -Main Features: - - SmartScraper: AI-powered web scraping with structured data extraction - - SearchScraper: Web research across multiple sources - - Agentic Scraper: Automated browser interactions and form filling - - Crawl: Website crawling with AI extraction or markdown conversion - - Markdownify: Convert web pages to clean markdown - - Schema Generation: AI-assisted schema creation for data extraction - - Scheduled Jobs: Automate recurring scraping tasks - -Quick Start: - >>> from scrapegraph_py import Client - >>> - >>> # Initialize client from environment variables - >>> client = Client.from_env() - >>> - >>> # Basic scraping - >>> result = client.smartscraper( - ... website_url="https://example.com", - ... user_prompt="Extract all product information" - ... ) - >>> - >>> # With context manager - >>> with Client.from_env() as client: - ... result = client.scrape(website_url="https://example.com") - -Async Usage: - >>> import asyncio - >>> from scrapegraph_py import AsyncClient - >>> - >>> async def main(): - ... async with AsyncClient.from_env() as client: - ... result = await client.smartscraper( - ... website_url="https://example.com", - ... user_prompt="Extract products" - ... 
) - >>> - >>> asyncio.run(main()) - -For more information visit: https://scrapegraphai.com -Documentation: https://docs.scrapegraphai.com -""" - -from .async_client import AsyncClient -from .client import Client - -# Scrape Models -from .models.scrape import ( - ScrapeRequest, - GetScrapeRequest, -) - -# Scheduled Jobs Models -from .models.scheduled_jobs import ( - GetJobExecutionsRequest, - GetScheduledJobRequest, - GetScheduledJobsRequest, - JobActionRequest, - JobActionResponse, - JobExecutionListResponse, - JobExecutionResponse, - JobTriggerResponse, - ScheduledJobCreate, - ScheduledJobListResponse, - ScheduledJobResponse, - ScheduledJobUpdate, - ServiceType, - TriggerJobRequest, -) - -__all__ = [ - "Client", - "AsyncClient", - # Scrape Models - "ScrapeRequest", - "GetScrapeRequest", - # Scheduled Jobs Models - "ServiceType", - "ScheduledJobCreate", - "ScheduledJobUpdate", - "ScheduledJobResponse", - "ScheduledJobListResponse", - "JobExecutionResponse", - "JobExecutionListResponse", - "JobTriggerResponse", - "JobActionResponse", - "GetScheduledJobsRequest", - "GetScheduledJobRequest", - "GetJobExecutionsRequest", - "TriggerJobRequest", - "JobActionRequest", -] +""" +ScrapeGraphAI Python SDK v2 + +A Python SDK for the ScrapeGraphAI v2 API, providing both synchronous +and asynchronous clients for intelligent web scraping powered by AI. + +Quick Start: + >>> from scrapegraph_py import Client + >>> client = Client(api_key="sgai-...") + >>> result = client.scrape("https://example.com") + >>> result = client.extract("https://example.com", prompt="Extract prices") + >>> job = client.crawl.start("https://example.com", depth=3) + +Async Usage: + >>> import asyncio + >>> from scrapegraph_py import AsyncClient + >>> async def main(): + ... async with AsyncClient(api_key="sgai-...") as client: + ... result = await client.extract( + ... url="https://example.com", + ... prompt="Extract products" + ... 
) + >>> asyncio.run(main()) +""" + +from .async_client import AsyncClient +from .client import Client +from .config import VERSION +from .models.crawl import CrawlFormat, CrawlRequest +from .models.extract import ExtractRequest +from .models.history import HistoryFilter +from .models.monitor import MonitorCreateRequest +from .models.schema import SchemaRequest +from .models.scrape import ScrapeFormat, ScrapeRequest +from .models.search import SearchRequest +from .models.shared import FetchConfig, LlmConfig + +__version__ = VERSION + +__all__ = [ + "Client", + "AsyncClient", + # Shared config + "FetchConfig", + "LlmConfig", + # Scrape + "ScrapeFormat", + "ScrapeRequest", + # Extract + "ExtractRequest", + # Search + "SearchRequest", + # Schema + "SchemaRequest", + # Crawl + "CrawlFormat", + "CrawlRequest", + # Monitor + "MonitorCreateRequest", + # History + "HistoryFilter", +] diff --git a/scrapegraph-py/scrapegraph_py/async_client.py b/scrapegraph-py/scrapegraph_py/async_client.py index 5111849..64ad7ba 100644 --- a/scrapegraph-py/scrapegraph_py/async_client.py +++ b/scrapegraph-py/scrapegraph_py/async_client.py @@ -1,111 +1,168 @@ """ -Asynchronous HTTP client for the ScrapeGraphAI API. - -This module provides an asynchronous client for interacting with all ScrapeGraphAI -API endpoints including smartscraper, searchscraper, crawl, agentic scraper, -markdownify, schema generation, scheduled jobs, and utility functions. - -The AsyncClient class supports: -- API key authentication -- SSL verification configuration -- Request timeout configuration -- Automatic retry logic with exponential backoff -- Mock mode for testing -- Async context manager support for proper resource cleanup -- Concurrent requests using asyncio +Asynchronous HTTP client for the ScrapeGraphAI v2 API. Example: - Basic usage with environment variables: - >>> import asyncio - >>> from scrapegraph_py import AsyncClient - >>> async def main(): - ... client = AsyncClient.from_env() - ... 
result = await client.smartscraper( - ... website_url="https://example.com", - ... user_prompt="Extract product information" - ... ) - ... await client.close() - >>> asyncio.run(main()) - - Using async context manager: - >>> async def main(): - ... async with AsyncClient(api_key="sgai-...") as client: - ... result = await client.scrape(website_url="https://example.com") - >>> asyncio.run(main()) + >>> import asyncio + >>> from scrapegraph_py import AsyncClient + >>> async def main(): + ... async with AsyncClient(api_key="sgai-...") as client: + ... result = await client.extract( + ... url="https://example.com", + ... prompt="Extract product information" + ... ) + ... print(result) + >>> asyncio.run(main()) """ + import asyncio -from typing import Any, Dict, Optional, Callable +from typing import Any, Dict, List, Optional from aiohttp import ClientSession, ClientTimeout, TCPConnector from aiohttp.client_exceptions import ClientError from pydantic import BaseModel -from urllib.parse import urlparse -import uuid as _uuid from scrapegraph_py.config import API_BASE_URL, DEFAULT_HEADERS from scrapegraph_py.exceptions import APIError from scrapegraph_py.logger import sgai_logger as logger -from scrapegraph_py.models.agenticscraper import ( - AgenticScraperRequest, - GetAgenticScraperRequest, -) -from scrapegraph_py.models.crawl import CrawlRequest, GetCrawlRequest -from scrapegraph_py.models.feedback import FeedbackRequest -from scrapegraph_py.models.scrape import GetScrapeRequest, ScrapeRequest -from scrapegraph_py.models.markdownify import GetMarkdownifyRequest, MarkdownifyRequest -from scrapegraph_py.models.schema import ( - GenerateSchemaRequest, - GetSchemaStatusRequest, - SchemaGenerationResponse, -) -from scrapegraph_py.models.searchscraper import ( - GetSearchScraperRequest, - SearchScraperRequest, - TimeRange, -) -from scrapegraph_py.models.sitemap import SitemapRequest, SitemapResponse -from scrapegraph_py.models.smartscraper import ( - GetSmartScraperRequest, - 
SmartScraperRequest, -) -from scrapegraph_py.models.scheduled_jobs import ( - GetJobExecutionsRequest, - GetScheduledJobRequest, - GetScheduledJobsRequest, - JobActionRequest, - ScheduledJobCreate, - ScheduledJobUpdate, - TriggerJobRequest, -) +from scrapegraph_py.models.crawl import CrawlFormat, CrawlRequest +from scrapegraph_py.models.extract import ExtractRequest +from scrapegraph_py.models.history import HistoryFilter +from scrapegraph_py.models.monitor import MonitorCreateRequest +from scrapegraph_py.models.schema import SchemaRequest +from scrapegraph_py.models.scrape import ScrapeFormat, ScrapeRequest +from scrapegraph_py.models.search import SearchRequest +from scrapegraph_py.models.shared import FetchConfig, LlmConfig from scrapegraph_py.utils.helpers import handle_async_response, validate_api_key -from scrapegraph_py.utils.toon_converter import process_response_with_toon -class AsyncClient: - """ - Asynchronous client for the ScrapeGraphAI API. +class _AsyncCrawlNamespace: + """Namespaced async crawl operations.""" + + def __init__(self, client: "AsyncClient"): + self._client = client - This class provides asynchronous methods for all ScrapeGraphAI API endpoints. - It handles authentication, request management, error handling, and supports - mock mode for testing. Uses aiohttp for efficient async HTTP requests. 
+ async def start( + self, + url: str, + depth: int = 2, + max_pages: int = 10, + format: str = "markdown", + include_patterns: Optional[List[str]] = None, + exclude_patterns: Optional[List[str]] = None, + fetch_config: Optional[FetchConfig] = None, + ) -> Dict[str, Any]: + """Start a crawl job.""" + logger.info(f"Starting crawl for {url}") + request = CrawlRequest( + url=url, + depth=depth, + max_pages=max_pages, + format=CrawlFormat(format), + include_patterns=include_patterns, + exclude_patterns=exclude_patterns, + fetch_config=fetch_config, + ) + return await self._client._make_request( + "POST", f"{self._client.base_url}/crawl", json=request.model_dump() + ) - Attributes: - api_key (str): The API key for authentication - headers (dict): Default headers including API key - timeout (ClientTimeout): Request timeout configuration - max_retries (int): Maximum number of retry attempts - retry_delay (float): Base delay between retries in seconds - mock (bool): Whether mock mode is enabled - session (ClientSession): Aiohttp session for connection pooling + async def status(self, crawl_id: str) -> Dict[str, Any]: + """Get crawl job status and results.""" + logger.info(f"Fetching crawl status for {crawl_id}") + return await self._client._make_request( + "GET", f"{self._client.base_url}/crawl/{crawl_id}" + ) + + async def stop(self, crawl_id: str) -> Dict[str, Any]: + """Stop a running crawl job.""" + logger.info(f"Stopping crawl {crawl_id}") + return await self._client._make_request( + "POST", f"{self._client.base_url}/crawl/{crawl_id}/stop" + ) + + async def resume(self, crawl_id: str) -> Dict[str, Any]: + """Resume a stopped crawl job.""" + logger.info(f"Resuming crawl {crawl_id}") + return await self._client._make_request( + "POST", f"{self._client.base_url}/crawl/{crawl_id}/resume" + ) + + +class _AsyncMonitorNamespace: + """Namespaced async monitor operations.""" + + def __init__(self, client: "AsyncClient"): + self._client = client + + async def create( + self, + 
name: str, + url: str, + prompt: str, + cron: str, + output_schema: Optional[Dict[str, Any]] = None, + fetch_config: Optional[FetchConfig] = None, + llm_config: Optional[LlmConfig] = None, + ) -> Dict[str, Any]: + """Create a new monitor.""" + logger.info(f"Creating monitor '{name}' for {url}") + request = MonitorCreateRequest( + name=name, + url=url, + prompt=prompt, + cron=cron, + output_schema=output_schema, + fetch_config=fetch_config, + llm_config=llm_config, + ) + return await self._client._make_request( + "POST", f"{self._client.base_url}/monitor", json=request.model_dump() + ) + + async def list(self) -> Dict[str, Any]: + """List all monitors.""" + logger.info("Listing monitors") + return await self._client._make_request("GET", f"{self._client.base_url}/monitor") + + async def get(self, monitor_id: str) -> Dict[str, Any]: + """Get a specific monitor.""" + logger.info(f"Fetching monitor {monitor_id}") + return await self._client._make_request( + "GET", f"{self._client.base_url}/monitor/{monitor_id}" + ) + + async def pause(self, monitor_id: str) -> Dict[str, Any]: + """Pause a monitor.""" + logger.info(f"Pausing monitor {monitor_id}") + return await self._client._make_request( + "POST", f"{self._client.base_url}/monitor/{monitor_id}/pause" + ) + + async def resume(self, monitor_id: str) -> Dict[str, Any]: + """Resume a paused monitor.""" + logger.info(f"Resuming monitor {monitor_id}") + return await self._client._make_request( + "POST", f"{self._client.base_url}/monitor/{monitor_id}/resume" + ) + + async def delete(self, monitor_id: str) -> Dict[str, Any]: + """Delete a monitor.""" + logger.info(f"Deleting monitor {monitor_id}") + return await self._client._make_request( + "DELETE", f"{self._client.base_url}/monitor/{monitor_id}" + ) + + +class AsyncClient: + """Asynchronous client for the ScrapeGraphAI v2 API. Example: - >>> async def example(): - ... async with AsyncClient.from_env() as client: - ... result = await client.smartscraper( - ... 
website_url="https://example.com", - ... user_prompt="Extract all products" - ... ) + >>> async with AsyncClient(api_key="sgai-...") as client: + ... result = await client.scrape("https://example.com") + ... result = await client.extract("https://example.com", prompt="Extract prices") + ... job = await client.crawl.start("https://example.com", depth=3) """ + @classmethod def from_env( cls, @@ -113,67 +170,42 @@ def from_env( timeout: Optional[float] = None, max_retries: int = 3, retry_delay: float = 1.0, - mock: Optional[bool] = None, - mock_handler: Optional[Callable[[str, str, Dict[str, Any]], Any]] = None, - mock_responses: Optional[Dict[str, Any]] = None, - ): - """Initialize AsyncClient using API key from environment variable. - - Args: - verify_ssl: Whether to verify SSL certificates - timeout: Request timeout in seconds. None means no timeout (infinite) - max_retries: Maximum number of retry attempts - retry_delay: Delay between retries in seconds - """ + ) -> "AsyncClient": + """Initialize AsyncClient using SGAI_API_KEY environment variable.""" from os import getenv - # Allow enabling mock mode from environment if not explicitly provided - if mock is None: - mock_env = getenv("SGAI_MOCK", "0").strip().lower() - mock = mock_env in {"1", "true", "yes", "on"} - api_key = getenv("SGAI_API_KEY") - # In mock mode, we don't need a real API key if not api_key: - if mock: - api_key = "sgai-00000000-0000-0000-0000-000000000000" - else: - raise ValueError("SGAI_API_KEY environment variable not set") + raise ValueError("SGAI_API_KEY environment variable not set") return cls( api_key=api_key, verify_ssl=verify_ssl, timeout=timeout, max_retries=max_retries, retry_delay=retry_delay, - mock=bool(mock), - mock_handler=mock_handler, - mock_responses=mock_responses, ) def __init__( self, api_key: str = None, + base_url: Optional[str] = None, verify_ssl: bool = True, timeout: Optional[float] = None, max_retries: int = 3, retry_delay: float = 1.0, - mock: bool = False, - 
mock_handler: Optional[Callable[[str, str, Dict[str, Any]], Any]] = None, - mock_responses: Optional[Dict[str, Any]] = None, ): - """Initialize AsyncClient with configurable parameters. + """Initialize AsyncClient. Args: - api_key: API key for authentication. If None, will try to - load from environment + api_key: API key for authentication. If None, reads from SGAI_API_KEY env var + base_url: Override the default API base URL verify_ssl: Whether to verify SSL certificates - timeout: Request timeout in seconds. None means no timeout (infinite) - max_retries: Maximum number of retry attempts - retry_delay: Delay between retries in seconds + timeout: Request timeout in seconds (None = no timeout) + max_retries: Maximum retry attempts on server errors + retry_delay: Base delay between retries in seconds """ - logger.info("πŸ”‘ Initializing AsyncClient") + logger.info("Initializing AsyncClient") - # Try to get API key from environment if not provided if api_key is None: from os import getenv @@ -184,1141 +216,249 @@ def __init__( ) validate_api_key(api_key) - logger.debug( - f"πŸ› οΈ Configuration: verify_ssl={verify_ssl}, " - f"timeout={timeout}, max_retries={max_retries}" - ) + self.api_key = api_key - self.headers = {**DEFAULT_HEADERS, "SGAI-APIKEY": api_key} + self.base_url = (base_url or API_BASE_URL).rstrip("/") + self.headers = { + **DEFAULT_HEADERS, + "Authorization": f"Bearer {api_key}", + "SGAI-APIKEY": api_key, + } self.max_retries = max_retries self.retry_delay = retry_delay - self.mock = bool(mock) - self.mock_handler = mock_handler - self.mock_responses = mock_responses or {} ssl = None if verify_ssl else False self.timeout = ClientTimeout(total=timeout) if timeout is not None else None self.session = ClientSession( - headers=self.headers, connector=TCPConnector(ssl=ssl), timeout=self.timeout + headers=self.headers, + connector=TCPConnector(ssl=ssl), + timeout=self.timeout, ) - logger.info("βœ… AsyncClient initialized successfully") - - async def 
_make_request(self, method: str, url: str, **kwargs) -> Any: - """ - Make asynchronous HTTP request with retry logic and error handling. - - Args: - method: HTTP method (GET, POST, etc.) - url: Full URL for the request - **kwargs: Additional arguments to pass to aiohttp - - Returns: - Parsed JSON response data + # Namespaced sub-clients + self.crawl = _AsyncCrawlNamespace(self) + self.monitor = _AsyncMonitorNamespace(self) - Raises: - APIError: If the API returns an error response - ConnectionError: If unable to connect after all retries + logger.info("AsyncClient initialized successfully") - Note: - In mock mode, this method returns deterministic responses without - making actual HTTP requests. - """ - # Short-circuit when mock mode is enabled - if getattr(self, "mock", False): - return self._mock_response(method, url, **kwargs) + async def _make_request(self, method: str, url: str, **kwargs: Any) -> Any: + """Make async HTTP request with retry logic.""" for attempt in range(self.max_retries): try: - logger.info( - f"πŸš€ Making {method} request to {url} " - f"(Attempt {attempt + 1}/{self.max_retries})" + logger.debug( + f"Making {method} request to {url} " + f"(attempt {attempt + 1}/{self.max_retries})" ) - logger.debug(f"πŸ” Request parameters: {kwargs}") - async with self.session.request(method, url, **kwargs) as response: - logger.debug(f"πŸ“₯ Response status: {response.status}") - result = await handle_async_response(response) - logger.info(f"βœ… Request completed successfully: {method} {url}") - return result + return await handle_async_response(response) except ClientError as e: - logger.warning(f"⚠️ Request attempt {attempt + 1} failed: {str(e)}") + logger.warning(f"Request attempt {attempt + 1} failed: {e}") if hasattr(e, "status") and e.status is not None: try: error_data = await e.response.json() error_msg = error_data.get("error", str(e)) - logger.error(f"πŸ”΄ API Error: {error_msg}") raise APIError(error_msg, status_code=e.status) - except 
ValueError: - logger.error("πŸ”΄ Could not parse error response") + except (ValueError, AttributeError): raise APIError( str(e), - status_code=e.status if hasattr(e, "status") else None, + status_code=getattr(e, "status", None), ) if attempt == self.max_retries - 1: - logger.error(f"❌ All retry attempts failed for {method} {url}") - raise ConnectionError(f"Failed to connect to API: {str(e)}") + raise ConnectionError(f"Failed to connect to API: {e}") retry_delay = self.retry_delay * (attempt + 1) - logger.info(f"⏳ Waiting {retry_delay}s before retry {attempt + 2}") + logger.info(f"Waiting {retry_delay}s before retry {attempt + 2}") await asyncio.sleep(retry_delay) - def _mock_response(self, method: str, url: str, **kwargs) -> Any: - """Return a deterministic mock response without performing network I/O. - - Resolution order: - 1) If a custom mock_handler is provided, delegate to it - 2) If mock_responses contains a key for the request path, use it - 3) Fallback to built-in defaults per endpoint family - """ - logger.info(f"πŸ§ͺ Mock mode active. Returning stub for {method} {url}") - - # 1) Custom handler - if self.mock_handler is not None: - try: - return self.mock_handler(method, url, kwargs) - except Exception as handler_error: - logger.warning(f"Custom mock_handler raised: {handler_error}. 
Falling back to defaults.") - - # 2) Path-based override - try: - parsed = urlparse(url) - path = parsed.path.rstrip("/") - except Exception: - path = url - - override = self.mock_responses.get(path) - if override is not None: - return override() if callable(override) else override - - # 3) Built-in defaults - def new_id(prefix: str) -> str: - return f"{prefix}-{_uuid.uuid4()}" - - upper_method = method.upper() - - # Credits endpoint - if path.endswith("/credits") and upper_method == "GET": - return {"remaining_credits": 1000, "total_credits_used": 0} - - # Health check endpoint - if path.endswith("/healthz") and upper_method == "GET": - return {"status": "healthy", "message": "Service is operational"} - - # Feedback acknowledge - if path.endswith("/feedback") and upper_method == "POST": - return {"status": "success"} - - # Create-like endpoints (POST) - if upper_method == "POST": - if path.endswith("/crawl"): - return {"crawl_id": new_id("mock-crawl")} - elif path.endswith("/scheduled-jobs"): - return { - "id": new_id("mock-job"), - "user_id": new_id("mock-user"), - "job_name": "Mock Scheduled Job", - "service_type": "smartscraper", - "cron_expression": "0 9 * * 1", - "job_config": {"mock": "config"}, - "is_active": True, - "created_at": "2024-01-01T00:00:00Z", - "updated_at": "2024-01-01T00:00:00Z", - "next_run_at": "2024-01-08T09:00:00Z" - } - elif "/pause" in path: - return { - "message": "Job paused successfully", - "job_id": new_id("mock-job"), - "is_active": False - } - elif "/resume" in path: - return { - "message": "Job resumed successfully", - "job_id": new_id("mock-job"), - "is_active": True, - "next_run_at": "2024-01-08T09:00:00Z" - } - elif "/trigger" in path: - task_id = new_id("mock-task") - return { - "execution_id": task_id, - "scheduled_job_id": new_id("mock-job"), - "triggered_at": "2024-01-01T00:00:00Z", - "message": f"Job triggered successfully. 
Task ID: {task_id}" - } - # All other POST endpoints return a request id - return {"request_id": new_id("mock-req")} - - # Status-like endpoints (GET) - if upper_method == "GET": - if "markdownify" in path: - return {"status": "completed", "content": "# Mock markdown\n\n..."} - if "smartscraper" in path: - return {"status": "completed", "result": [{"field": "value"}]} - if "searchscraper" in path: - return { - "status": "completed", - "results": [{"url": "https://example.com"}], - "markdown_content": "# Mock Markdown Content\n\nThis is mock markdown content for testing purposes.\n\n## Section 1\n\nSome content here.\n\n## Section 2\n\nMore content here.", - "reference_urls": ["https://example.com", "https://example2.com"] - } - if "crawl" in path: - return {"status": "completed", "pages": []} - if "agentic-scrapper" in path: - return {"status": "completed", "actions": []} - if "scheduled-jobs" in path: - if "/executions" in path: - return { - "executions": [ - { - "id": new_id("mock-exec"), - "scheduled_job_id": new_id("mock-job"), - "execution_id": new_id("mock-task"), - "status": "completed", - "started_at": "2024-01-01T00:00:00Z", - "completed_at": "2024-01-01T00:01:00Z", - "result": {"mock": "result"}, - "credits_used": 10 - } - ], - "total": 1, - "page": 1, - "page_size": 20 - } - elif path.endswith("/scheduled-jobs"): # List jobs endpoint - return { - "jobs": [ - { - "id": new_id("mock-job"), - "user_id": new_id("mock-user"), - "job_name": "Mock Scheduled Job", - "service_type": "smartscraper", - "cron_expression": "0 9 * * 1", - "job_config": {"mock": "config"}, - "is_active": True, - "created_at": "2024-01-01T00:00:00Z", - "updated_at": "2024-01-01T00:00:00Z", - "next_run_at": "2024-01-08T09:00:00Z" - } - ], - "total": 1, - "page": 1, - "page_size": 20 - } - else: # Single job endpoint - return { - "id": new_id("mock-job"), - "user_id": new_id("mock-user"), - "job_name": "Mock Scheduled Job", - "service_type": "smartscraper", - "cron_expression": "0 9 * * 
1", - "job_config": {"mock": "config"}, - "is_active": True, - "created_at": "2024-01-01T00:00:00Z", - "updated_at": "2024-01-01T00:00:00Z", - "next_run_at": "2024-01-08T09:00:00Z" - } - - # Update operations (PATCH/PUT) - if upper_method in ["PATCH", "PUT"] and "scheduled-jobs" in path: - return { - "id": new_id("mock-job"), - "user_id": new_id("mock-user"), - "job_name": "Updated Mock Scheduled Job", - "service_type": "smartscraper", - "cron_expression": "0 10 * * 1", - "job_config": {"mock": "updated_config"}, - "is_active": True, - "created_at": "2024-01-01T00:00:00Z", - "updated_at": "2024-01-01T01:00:00Z", - "next_run_at": "2024-01-08T10:00:00Z" - } - - # Delete operations - if upper_method == "DELETE" and "scheduled-jobs" in path: - return {"message": "Scheduled job deleted successfully"} - - # Generic fallback - return {"status": "mock", "url": url, "method": method, "kwargs": kwargs} - - async def markdownify( - self, website_url: str, headers: Optional[dict[str, str]] = None, mock: bool = False, render_heavy_js: bool = False, stealth: bool = False, wait_ms: Optional[int] = None, return_toon: bool = False - ): - """Send a markdownify request - - Args: - website_url: The URL to convert to markdown - headers: Optional HTTP headers - mock: Enable mock mode for testing - render_heavy_js: Enable heavy JavaScript rendering - stealth: Enable stealth mode to avoid bot detection - wait_ms: Number of milliseconds to wait before scraping the website - return_toon: If True, return response in TOON format (reduces token usage by 30-60%) - """ - logger.info(f"πŸ” Starting markdownify request for {website_url}") - if headers: - logger.debug("πŸ”§ Using custom headers") - if stealth: - logger.debug("πŸ₯· Stealth mode enabled") - if render_heavy_js: - logger.debug("⚑ Heavy JavaScript rendering enabled") - if return_toon: - logger.debug("🎨 TOON format output enabled") - - request = MarkdownifyRequest(website_url=website_url, headers=headers, mock=mock, 
render_heavy_js=render_heavy_js, stealth=stealth, wait_ms=wait_ms) - logger.debug("βœ… Request validation passed") - - result = await self._make_request( - "POST", f"{API_BASE_URL}/markdownify", json=request.model_dump() - ) - logger.info("✨ Markdownify request completed successfully") - return process_response_with_toon(result, return_toon) - - async def get_markdownify(self, request_id: str, return_toon: bool = False): - """Get the result of a previous markdownify request - - Args: - request_id: The request ID to fetch - return_toon: If True, return response in TOON format (reduces token usage by 30-60%) - """ - logger.info(f"πŸ” Fetching markdownify result for request {request_id}") - if return_toon: - logger.debug("🎨 TOON format output enabled") - - # Validate input using Pydantic model - GetMarkdownifyRequest(request_id=request_id) - logger.debug("βœ… Request ID validation passed") - - result = await self._make_request( - "GET", f"{API_BASE_URL}/markdownify/{request_id}" - ) - logger.info(f"✨ Successfully retrieved result for request {request_id}") - return process_response_with_toon(result, return_toon) + # ------------------------------------------------------------------ + # Scrape + # ------------------------------------------------------------------ async def scrape( self, - website_url: str, - render_heavy_js: bool = False, - branding: bool = False, - headers: Optional[dict[str, str]] = None, - stealth: bool = False, - wait_ms: Optional[int] = None, - return_toon: bool = False, - ): - """Send a scrape request to get HTML content from a website + url: str, + format: str = "markdown", + fetch_config: Optional[FetchConfig] = None, + ) -> Dict[str, Any]: + """Scrape a page and return it in the specified format. 
Args: - website_url: The URL of the website to get HTML from - render_heavy_js: Whether to render heavy JavaScript (defaults to False) - branding: Whether to include branding in the response (defaults to False) - headers: Optional headers to send with the request - stealth: Enable stealth mode to avoid bot detection - wait_ms: Number of milliseconds to wait before scraping the website - return_toon: If True, return response in TOON format (reduces token usage by 30-60%) + url: URL to scrape + format: Output format - 'markdown', 'html', 'screenshot', or 'branding' + fetch_config: Fetch configuration options """ - logger.info(f"πŸ” Starting scrape request for {website_url}") - logger.debug(f"πŸ”§ Render heavy JS: {render_heavy_js}") - logger.debug(f"πŸ”§ Branding: {branding}") - if headers: - logger.debug("πŸ”§ Using custom headers") - if stealth: - logger.debug("πŸ₯· Stealth mode enabled") - if return_toon: - logger.debug("🎨 TOON format output enabled") - + logger.info(f"Scraping {url} (format={format})") request = ScrapeRequest( - website_url=website_url, - render_heavy_js=render_heavy_js, - branding=branding, - headers=headers, - stealth=stealth, - wait_ms=wait_ms, + url=url, + format=ScrapeFormat(format), + fetch_config=fetch_config, ) - logger.debug("βœ… Request validation passed") - - result = await self._make_request( - "POST", f"{API_BASE_URL}/scrape", json=request.model_dump() + return await self._make_request( + "POST", f"{self.base_url}/scrape", json=request.model_dump() ) - logger.info("✨ Scrape request completed successfully") - return process_response_with_toon(result, return_toon) - - async def get_scrape(self, request_id: str, return_toon: bool = False): - """Get the result of a previous scrape request - - Args: - request_id: The request ID to fetch - return_toon: If True, return response in TOON format (reduces token usage by 30-60%) - """ - logger.info(f"πŸ” Fetching scrape result for request {request_id}") - if return_toon: - logger.debug("🎨 TOON 
format output enabled") - # Validate input using Pydantic model - GetScrapeRequest(request_id=request_id) - logger.debug("βœ… Request ID validation passed") + # ------------------------------------------------------------------ + # Extract (replaces SmartScraper) + # ------------------------------------------------------------------ - result = await self._make_request( - "GET", f"{API_BASE_URL}/scrape/{request_id}") - logger.info(f"✨ Successfully retrieved result for request {request_id}") - return process_response_with_toon(result, return_toon) - - async def sitemap( + async def extract( self, - website_url: str, - mock: bool = False, - ) -> SitemapResponse: - """Extract all URLs from a website's sitemap. - - Automatically discovers sitemap from robots.txt or common sitemap locations. + url: str, + prompt: str, + output_schema: Optional[Any] = None, + fetch_config: Optional[FetchConfig] = None, + llm_config: Optional[LlmConfig] = None, + ) -> Dict[str, Any]: + """Extract structured data from a page using AI. Args: - website_url: The URL of the website to extract sitemap from - mock: Whether to use mock mode for this request - - Returns: - SitemapResponse: Object containing list of URLs extracted from sitemap - - Raises: - ValueError: If website_url is invalid - APIError: If the API request fails - - Examples: - >>> async with AsyncClient(api_key="your-api-key") as client: - ... response = await client.sitemap("https://example.com") - ... print(f"Found {len(response.urls)} URLs") - ... for url in response.urls[:5]: - ... 
print(url) + url: URL to extract data from + prompt: Natural language prompt describing what to extract + output_schema: JSON Schema dict or Pydantic BaseModel class for output structure + fetch_config: Fetch configuration options + llm_config: LLM configuration options """ - logger.info(f"πŸ—ΊοΈ Starting sitemap extraction for {website_url}") + logger.info(f"Extracting from {url}") + + schema_dict = None + if output_schema is not None: + if isinstance(output_schema, type) and issubclass(output_schema, BaseModel): + schema_dict = output_schema.model_json_schema() + elif isinstance(output_schema, dict): + schema_dict = output_schema + else: + raise ValueError( + "output_schema must be a dict (JSON Schema) or a Pydantic BaseModel class" + ) - request = SitemapRequest( - website_url=website_url, - mock=mock + request = ExtractRequest( + url=url, + prompt=prompt, + output_schema=schema_dict, + fetch_config=fetch_config, + llm_config=llm_config, ) - logger.debug("βœ… Request validation passed") - - result = await self._make_request( - "POST", f"{API_BASE_URL}/sitemap", json=request.model_dump() + return await self._make_request( + "POST", f"{self.base_url}/extract", json=request.model_dump() ) - logger.info(f"✨ Sitemap extraction completed successfully - found {len(result.get('urls', []))} URLs") - # Parse response into SitemapResponse model - return SitemapResponse(**result) + # ------------------------------------------------------------------ + # Search (replaces SearchScraper) + # ------------------------------------------------------------------ - async def smartscraper( + async def search( self, - user_prompt: str, - website_url: Optional[str] = None, - website_html: Optional[str] = None, - website_markdown: Optional[str] = None, - headers: Optional[dict[str, str]] = None, - cookies: Optional[Dict[str, str]] = None, - output_schema: Optional[BaseModel] = None, - number_of_scrolls: Optional[int] = None, - total_pages: Optional[int] = None, - mock: bool = False, - 
plain_text: bool = False, - render_heavy_js: bool = False, - stealth: bool = False, - wait_ms: Optional[int] = None, - return_toon: bool = False, - ): - """ - Send a smartscraper request with optional pagination support and cookies. - - Supports three types of input (must provide exactly one): - - website_url: Scrape from a URL - - website_html: Process local HTML content - - website_markdown: Process local Markdown content - - Args: - user_prompt: Natural language prompt describing what to extract - website_url: URL to scrape (optional) - website_html: Raw HTML content to process (optional, max 2MB) - website_markdown: Markdown content to process (optional, max 2MB) - headers: Optional HTTP headers - cookies: Optional cookies for authentication - output_schema: Optional Pydantic model for structured output - number_of_scrolls: Number of times to scroll (0-100) - total_pages: Number of pages to scrape (1-10) - mock: Enable mock mode for testing - plain_text: Return plain text instead of structured data - render_heavy_js: Enable heavy JavaScript rendering - stealth: Enable stealth mode to avoid bot detection - wait_ms: Number of milliseconds to wait before scraping the website - return_toon: If True, return response in TOON format (reduces token usage by 30-60%) - - Returns: - Dictionary containing the scraping results, or TOON formatted string if return_toon=True - - Raises: - ValueError: If validation fails or invalid parameters provided - APIError: If the API request fails - """ - logger.info("πŸ” Starting smartscraper request") - if website_url: - logger.debug(f"🌐 URL: {website_url}") - if website_html: - logger.debug("πŸ“„ Using provided HTML content") - if website_markdown: - logger.debug("πŸ“ Using provided Markdown content") - if headers: - logger.debug("πŸ”§ Using custom headers") - if cookies: - logger.debug("πŸͺ Using cookies for authentication/session management") - if number_of_scrolls is not None: - logger.debug(f"πŸ”„ Number of scrolls: 
{number_of_scrolls}") - if total_pages is not None: - logger.debug(f"πŸ“„ Total pages to scrape: {total_pages}") - if stealth: - logger.debug("πŸ₯· Stealth mode enabled") - if render_heavy_js: - logger.debug("⚑ Heavy JavaScript rendering enabled") - if return_toon: - logger.debug("🎨 TOON format output enabled") - logger.debug(f"πŸ“ Prompt: {user_prompt}") - - request = SmartScraperRequest( - website_url=website_url, - website_html=website_html, - website_markdown=website_markdown, - headers=headers, - cookies=cookies, - user_prompt=user_prompt, - output_schema=output_schema, - number_of_scrolls=number_of_scrolls, - total_pages=total_pages, - mock=mock, - plain_text=plain_text, - render_heavy_js=render_heavy_js, - stealth=stealth, - wait_ms=wait_ms, - ) - - logger.debug("βœ… Request validation passed") + query: str, + num_results: int = 5, + output_schema: Optional[Any] = None, + llm_config: Optional[LlmConfig] = None, + ) -> Dict[str, Any]: + """Search the web and extract structured results. 
- result = await self._make_request( - "POST", f"{API_BASE_URL}/smartscraper", json=request.model_dump() - ) - logger.info("✨ Smartscraper request completed successfully") - return process_response_with_toon(result, return_toon) - - async def get_smartscraper(self, request_id: str, return_toon: bool = False): - """Get the result of a previous smartscraper request - Args: - request_id: The request ID to fetch - return_toon: If True, return response in TOON format (reduces token usage by 30-60%) + query: The search query + num_results: Number of results (3-20, default 5) + output_schema: JSON Schema dict or Pydantic BaseModel class for output structure + llm_config: LLM configuration options """ - logger.info(f"πŸ” Fetching smartscraper result for request {request_id}") - if return_toon: - logger.debug("🎨 TOON format output enabled") - - # Validate input using Pydantic model - GetSmartScraperRequest(request_id=request_id) - logger.debug("βœ… Request ID validation passed") - - result = await self._make_request( - "GET", f"{API_BASE_URL}/smartscraper/{request_id}" - ) - logger.info(f"✨ Successfully retrieved result for request {request_id}") - return process_response_with_toon(result, return_toon) - - async def submit_feedback( - self, request_id: str, rating: int, feedback_text: Optional[str] = None - ): - """Submit feedback for a request""" - logger.info(f"πŸ“ Submitting feedback for request {request_id}") - logger.debug(f"⭐ Rating: {rating}, Feedback: {feedback_text}") - - feedback = FeedbackRequest( - request_id=request_id, rating=rating, feedback_text=feedback_text - ) - logger.debug("βœ… Feedback validation passed") - - result = await self._make_request( - "POST", f"{API_BASE_URL}/feedback", json=feedback.model_dump() - ) - logger.info("✨ Feedback submitted successfully") - return result - - async def get_credits(self): - """Get credits information""" - logger.info("πŸ’³ Fetching credits information") - - result = await self._make_request( - "GET", - 
f"{API_BASE_URL}/credits", - ) - logger.info( - f"✨ Credits info retrieved: " - f"{result.get('remaining_credits')} credits remaining" - ) - return result - - async def healthz(self): - """Check the health status of the service - - This endpoint is useful for monitoring and ensuring the service is operational. - It returns a JSON response indicating the service's health status. - - Returns: - dict: Health status information - - Example: - >>> async with AsyncClient.from_env() as client: - ... health = await client.healthz() - ... print(health) - """ - logger.info("πŸ₯ Checking service health") - - result = await self._make_request( - "GET", - f"{API_BASE_URL}/healthz", - ) - logger.info("✨ Health check completed successfully") - return result - - async def searchscraper( - self, - user_prompt: str, - num_results: Optional[int] = 3, - headers: Optional[dict[str, str]] = None, - output_schema: Optional[BaseModel] = None, - extraction_mode: bool = True, - stealth: bool = False, - location_geo_code: Optional[str] = None, - time_range: Optional[TimeRange] = None, - return_toon: bool = False, - ): - """Send a searchscraper request + logger.info(f"Searching: {query}") + + schema_dict = None + if output_schema is not None: + if isinstance(output_schema, type) and issubclass(output_schema, BaseModel): + schema_dict = output_schema.model_json_schema() + elif isinstance(output_schema, dict): + schema_dict = output_schema + else: + raise ValueError( + "output_schema must be a dict (JSON Schema) or a Pydantic BaseModel class" + ) - Args: - user_prompt: The search prompt string - num_results: Number of websites to scrape (3-20). Default is 3. - More websites provide better research depth but cost more - credits. Credit calculation: 30 base + 10 per additional - website beyond 3. - headers: Optional headers to send with the request - output_schema: Optional schema to structure the output - extraction_mode: Whether to use AI extraction (True) or markdown conversion (False). 
- AI extraction costs 10 credits per page, markdown conversion costs 2 credits per page. - stealth: Enable stealth mode to avoid bot detection - location_geo_code: Optional geo code of the location to search in (e.g., "us") - time_range: Optional time range filter for search results (e.g., TimeRange.PAST_WEEK) - return_toon: If True, return response in TOON format (reduces token usage by 30-60%) - """ - logger.info("πŸ” Starting searchscraper request") - logger.debug(f"πŸ“ Prompt: {user_prompt}") - logger.debug(f"🌐 Number of results: {num_results}") - logger.debug(f"πŸ€– Extraction mode: {'AI extraction' if extraction_mode else 'Markdown conversion'}") - if headers: - logger.debug("πŸ”§ Using custom headers") - if stealth: - logger.debug("πŸ₯· Stealth mode enabled") - if location_geo_code: - logger.debug(f"🌍 Location geo code: {location_geo_code}") - if time_range: - logger.debug(f"πŸ“… Time range: {time_range.value}") - if return_toon: - logger.debug("🎨 TOON format output enabled") - - request = SearchScraperRequest( - user_prompt=user_prompt, + request = SearchRequest( + query=query, num_results=num_results, - headers=headers, - output_schema=output_schema, - extraction_mode=extraction_mode, - stealth=stealth, - location_geo_code=location_geo_code, - time_range=time_range, + output_schema=schema_dict, + llm_config=llm_config, ) - logger.debug("βœ… Request validation passed") - - result = await self._make_request( - "POST", f"{API_BASE_URL}/searchscraper", json=request.model_dump() + return await self._make_request( + "POST", f"{self.base_url}/search", json=request.model_dump() ) - logger.info("✨ Searchscraper request completed successfully") - return process_response_with_toon(result, return_toon) - async def get_searchscraper(self, request_id: str, return_toon: bool = False): - """Get the result of a previous searchscraper request - - Args: - request_id: The request ID to fetch - return_toon: If True, return response in TOON format (reduces token usage by 
30-60%) - """ - logger.info(f"πŸ” Fetching searchscraper result for request {request_id}") - if return_toon: - logger.debug("🎨 TOON format output enabled") + # ------------------------------------------------------------------ + # Schema + # ------------------------------------------------------------------ - # Validate input using Pydantic model - GetSearchScraperRequest(request_id=request_id) - logger.debug("βœ… Request ID validation passed") - - result = await self._make_request( - "GET", f"{API_BASE_URL}/searchscraper/{request_id}" - ) - logger.info(f"✨ Successfully retrieved result for request {request_id}") - return process_response_with_toon(result, return_toon) - - async def crawl( + async def schema( self, - url: str, - prompt: Optional[str] = None, - data_schema: Optional[Dict[str, Any]] = None, - extraction_mode: bool = True, - cache_website: bool = True, - depth: int = 2, - breadth: Optional[int] = None, - max_pages: int = 2, - same_domain_only: bool = True, - batch_size: Optional[int] = None, - sitemap: bool = False, - headers: Optional[dict[str, str]] = None, - render_heavy_js: bool = False, - stealth: bool = False, - include_paths: Optional[list[str]] = None, - exclude_paths: Optional[list[str]] = None, - webhook_url: Optional[str] = None, - wait_ms: Optional[int] = None, - return_toon: bool = False, - ): - """Send a crawl request with support for both AI extraction and - markdown conversion modes - - Args: - url: The starting URL to crawl - prompt: AI prompt for data extraction (required for AI extraction mode) - data_schema: Schema for structured output - extraction_mode: Whether to use AI extraction (True) or markdown (False) - cache_website: Whether to cache the website - depth: Maximum depth of link traversal - breadth: Maximum number of links to crawl per depth level. If None, unlimited (default). - Controls the 'width' of exploration at each depth. Useful for limiting crawl scope - on large sites. Note: max_pages always takes priority. 
Ignored when sitemap=True. - max_pages: Maximum number of pages to crawl - same_domain_only: Only crawl pages within the same domain - batch_size: Number of pages to process in batch - sitemap: Use sitemap for crawling - headers: Optional HTTP headers - render_heavy_js: Enable heavy JavaScript rendering - stealth: Enable stealth mode to avoid bot detection - include_paths: List of path patterns to include (e.g., ['/products/*', '/blog/**']) - Supports wildcards: * matches any characters, ** matches any path segments - exclude_paths: List of path patterns to exclude (e.g., ['/admin/*', '/api/*']) - Supports wildcards and takes precedence over include_paths - webhook_url: URL to receive webhook notifications when the crawl completes - wait_ms: Number of milliseconds to wait before scraping each page - return_toon: If True, return response in TOON format (reduces token usage by 30-60%) - """ - logger.info("πŸ” Starting crawl request") - logger.debug(f"🌐 URL: {url}") - logger.debug( - f"πŸ€– Extraction mode: {'AI' if extraction_mode else 'Markdown conversion'}" - ) - if extraction_mode: - logger.debug(f"πŸ“ Prompt: {prompt}") - logger.debug(f"πŸ“Š Schema provided: {bool(data_schema)}") - else: - logger.debug( - "πŸ“„ Markdown conversion mode - no AI processing, 2 credits per page" - ) - logger.debug(f"πŸ’Ύ Cache website: {cache_website}") - logger.debug(f"πŸ” Depth: {depth}") - if breadth is not None: - logger.debug(f"πŸ“ Breadth: {breadth}") - logger.debug(f"πŸ“„ Max pages: {max_pages}") - logger.debug(f"🏠 Same domain only: {same_domain_only}") - logger.debug(f"πŸ—ΊοΈ Use sitemap: {sitemap}") - if stealth: - logger.debug("πŸ₯· Stealth mode enabled") - if render_heavy_js: - logger.debug("⚑ Heavy JavaScript rendering enabled") - if batch_size is not None: - logger.debug(f"πŸ“¦ Batch size: {batch_size}") - if include_paths: - logger.debug(f"βœ… Include paths: {include_paths}") - if exclude_paths: - logger.debug(f"❌ Exclude paths: {exclude_paths}") - if webhook_url: 
- logger.debug(f"πŸ”” Webhook URL: {webhook_url}") - if wait_ms is not None: - logger.debug(f"⏱️ Wait ms: {wait_ms}") - if return_toon: - logger.debug("🎨 TOON format output enabled") - - # Build request data, excluding None values - request_data = { - "url": url, - "extraction_mode": extraction_mode, - "cache_website": cache_website, - "depth": depth, - "max_pages": max_pages, - "same_domain_only": same_domain_only, - "sitemap": sitemap, - "render_heavy_js": render_heavy_js, - "stealth": stealth, - } - - # Add optional parameters only if provided - if prompt is not None: - request_data["prompt"] = prompt - if data_schema is not None: - request_data["data_schema"] = data_schema - if breadth is not None: - request_data["breadth"] = breadth - if batch_size is not None: - request_data["batch_size"] = batch_size - if headers is not None: - request_data["headers"] = headers - if include_paths is not None: - request_data["include_paths"] = include_paths - if exclude_paths is not None: - request_data["exclude_paths"] = exclude_paths - if webhook_url is not None: - request_data["webhook_url"] = webhook_url - if wait_ms is not None: - request_data["wait_ms"] = wait_ms - - request = CrawlRequest(**request_data) - logger.debug("βœ… Request validation passed") - - # Serialize the request, excluding None values - request_json = request.model_dump(exclude_none=True) - result = await self._make_request( - "POST", f"{API_BASE_URL}/crawl", json=request_json - ) - logger.info("✨ Crawl request completed successfully") - return process_response_with_toon(result, return_toon) - - async def get_crawl(self, crawl_id: str, return_toon: bool = False): - """Get the result of a previous crawl request - - Args: - crawl_id: The crawl ID to fetch - return_toon: If True, return response in TOON format (reduces token usage by 30-60%) - """ - logger.info(f"πŸ” Fetching crawl result for request {crawl_id}") - if return_toon: - logger.debug("🎨 TOON format output enabled") - - # Validate input using 
Pydantic model - GetCrawlRequest(crawl_id=crawl_id) - logger.debug("βœ… Request ID validation passed") - - result = await self._make_request("GET", f"{API_BASE_URL}/crawl/{crawl_id}") - logger.info(f"✨ Successfully retrieved result for request {crawl_id}") - return process_response_with_toon(result, return_toon) - - async def agenticscraper( - self, - url: str, - steps: list[str], - use_session: bool = True, - user_prompt: Optional[str] = None, - output_schema: Optional[Dict[str, Any]] = None, - ai_extraction: bool = False, - stealth: bool = False, - return_toon: bool = False, - ): - """Send an agentic scraper request to perform automated actions on a webpage + prompt: str, + existing_schema: Optional[Dict[str, Any]] = None, + ) -> Dict[str, Any]: + """Generate a JSON schema from a natural language prompt. Args: - url: The URL to scrape - steps: List of steps to perform on the webpage - use_session: Whether to use session for the scraping (default: True) - user_prompt: Prompt for AI extraction (required when ai_extraction=True) - output_schema: Schema for structured data extraction (optional, used with ai_extraction=True) - ai_extraction: Whether to use AI for data extraction from the scraped content (default: False) - stealth: Enable stealth mode to avoid bot detection - return_toon: If True, return response in TOON format (reduces token usage by 30-60%) + prompt: Description of the schema to generate + existing_schema: Optional existing schema to modify/extend """ - logger.info(f"πŸ€– Starting agentic scraper request for {url}") - logger.debug(f"πŸ”§ Use session: {use_session}") - logger.debug(f"πŸ“‹ Steps: {steps}") - logger.debug(f"🧠 AI extraction: {ai_extraction}") - if ai_extraction: - logger.debug(f"πŸ’­ User prompt: {user_prompt}") - logger.debug(f"πŸ“‹ Output schema provided: {output_schema is not None}") - if stealth: - logger.debug("πŸ₯· Stealth mode enabled") - if return_toon: - logger.debug("🎨 TOON format output enabled") - - request = 
AgenticScraperRequest( - url=url, - steps=steps, - use_session=use_session, - user_prompt=user_prompt, - output_schema=output_schema, - ai_extraction=ai_extraction, - stealth=stealth, + logger.info("Generating schema") + request = SchemaRequest( + prompt=prompt, + existing_schema=existing_schema, ) - logger.debug("βœ… Request validation passed") - - result = await self._make_request( - "POST", f"{API_BASE_URL}/agentic-scrapper", json=request.model_dump() + return await self._make_request( + "POST", f"{self.base_url}/schema", json=request.model_dump() ) - logger.info("✨ Agentic scraper request completed successfully") - return process_response_with_toon(result, return_toon) - async def get_agenticscraper(self, request_id: str, return_toon: bool = False): - """Get the result of a previous agentic scraper request - - Args: - request_id: The request ID to fetch - return_toon: If True, return response in TOON format (reduces token usage by 30-60%) - """ - logger.info(f"πŸ” Fetching agentic scraper result for request {request_id}") - if return_toon: - logger.debug("🎨 TOON format output enabled") + # ------------------------------------------------------------------ + # Credits + # ------------------------------------------------------------------ - # Validate input using Pydantic model - GetAgenticScraperRequest(request_id=request_id) - logger.debug("βœ… Request ID validation passed") + async def credits(self) -> Dict[str, Any]: + """Get remaining API credits.""" + logger.info("Fetching credits") + return await self._make_request("GET", f"{self.base_url}/credits") - result = await self._make_request("GET", f"{API_BASE_URL}/agentic-scrapper/{request_id}") - logger.info(f"✨ Successfully retrieved result for request {request_id}") - return process_response_with_toon(result, return_toon) + # ------------------------------------------------------------------ + # History + # ------------------------------------------------------------------ - async def generate_schema( + 
async def history( self, - user_prompt: str, - existing_schema: Optional[Dict[str, Any]] = None, - ): - """Generate a JSON schema from a user prompt - - Args: - user_prompt: The user's search query to be refined into a schema - existing_schema: Optional existing JSON schema to modify/extend - """ - logger.info("πŸ”§ Starting schema generation request") - logger.debug(f"πŸ’­ User prompt: {user_prompt}") - if existing_schema: - logger.debug(f"πŸ“‹ Existing schema provided: {existing_schema is not None}") - - request = GenerateSchemaRequest( - user_prompt=user_prompt, - existing_schema=existing_schema, - ) - logger.debug("βœ… Request validation passed") - - result = await self._make_request( - "POST", f"{API_BASE_URL}/generate_schema", json=request.model_dump() - ) - logger.info("✨ Schema generation request completed successfully") - return result + endpoint: Optional[str] = None, + status: Optional[str] = None, + limit: Optional[int] = None, + offset: Optional[int] = None, + ) -> Dict[str, Any]: + """Retrieve request history. - async def get_schema_status(self, request_id: str): - """Get the result of a previous schema generation request - Args: - request_id: The request ID returned from generate_schema + endpoint: Filter by endpoint name (e.g. 
'scrape', 'extract') + status: Filter by request status + limit: Maximum number of results (1-100) + offset: Number of results to skip """ - logger.info(f"πŸ” Fetching schema generation status for request {request_id}") - - # Validate input using Pydantic model - GetSchemaStatusRequest(request_id=request_id) - logger.debug("βœ… Request ID validation passed") - - result = await self._make_request("GET", f"{API_BASE_URL}/generate_schema/{request_id}") - logger.info(f"✨ Successfully retrieved schema status for request {request_id}") - return result - - async def create_scheduled_job( - self, - job_name: str, - service_type: str, - cron_expression: str, - job_config: dict, - is_active: bool = True, - ): - """Create a new scheduled job""" - logger.info(f"πŸ“… Creating scheduled job: {job_name}") - - request = ScheduledJobCreate( - job_name=job_name, - service_type=service_type, - cron_expression=cron_expression, - job_config=job_config, - is_active=is_active, - ) - - result = await self._make_request( - "POST", f"{API_BASE_URL}/scheduled-jobs", json=request.model_dump() - ) - logger.info("✨ Scheduled job created successfully") - return result - - async def get_scheduled_jobs( - self, - page: int = 1, - page_size: int = 20, - service_type: Optional[str] = None, - is_active: Optional[bool] = None, - ): - """Get list of scheduled jobs with pagination""" - logger.info("πŸ“‹ Fetching scheduled jobs") - - GetScheduledJobsRequest( - page=page, - page_size=page_size, - service_type=service_type, - is_active=is_active, - ) - - params = {"page": page, "page_size": page_size} - if service_type: - params["service_type"] = service_type - if is_active is not None: - params["is_active"] = is_active - - result = await self._make_request("GET", f"{API_BASE_URL}/scheduled-jobs", params=params) - logger.info(f"✨ Successfully retrieved {len(result.get('jobs', []))} scheduled jobs") - return result - - async def get_scheduled_job(self, job_id: str): - """Get details of a specific scheduled 
job""" - logger.info(f"πŸ” Fetching scheduled job {job_id}") - - GetScheduledJobRequest(job_id=job_id) - - result = await self._make_request("GET", f"{API_BASE_URL}/scheduled-jobs/{job_id}") - logger.info(f"✨ Successfully retrieved scheduled job {job_id}") - return result - - async def update_scheduled_job( - self, - job_id: str, - job_name: Optional[str] = None, - cron_expression: Optional[str] = None, - job_config: Optional[dict] = None, - is_active: Optional[bool] = None, - ): - """Update an existing scheduled job (partial update)""" - logger.info(f"πŸ“ Updating scheduled job {job_id}") - - update_data = {} - if job_name is not None: - update_data["job_name"] = job_name - if cron_expression is not None: - update_data["cron_expression"] = cron_expression - if job_config is not None: - update_data["job_config"] = job_config - if is_active is not None: - update_data["is_active"] = is_active - - ScheduledJobUpdate(**update_data) - - result = await self._make_request( - "PATCH", f"{API_BASE_URL}/scheduled-jobs/{job_id}", json=update_data + logger.info("Fetching history") + filter_obj = HistoryFilter( + endpoint=endpoint, status=status, limit=limit, offset=offset ) - logger.info(f"✨ Successfully updated scheduled job {job_id}") - return result - - async def replace_scheduled_job( - self, - job_id: str, - job_name: str, - cron_expression: str, - job_config: dict, - is_active: bool = True, - ): - """Replace an existing scheduled job (full update)""" - logger.info(f"πŸ”„ Replacing scheduled job {job_id}") - - request_data = { - "job_name": job_name, - "cron_expression": cron_expression, - "job_config": job_config, - "is_active": is_active, - } - - result = await self._make_request( - "PUT", f"{API_BASE_URL}/scheduled-jobs/{job_id}", json=request_data - ) - logger.info(f"✨ Successfully replaced scheduled job {job_id}") - return result - - async def delete_scheduled_job(self, job_id: str): - """Delete a scheduled job""" - logger.info(f"πŸ—‘οΈ Deleting scheduled job 
{job_id}") - - JobActionRequest(job_id=job_id) - - result = await self._make_request("DELETE", f"{API_BASE_URL}/scheduled-jobs/{job_id}") - logger.info(f"✨ Successfully deleted scheduled job {job_id}") - return result - - async def pause_scheduled_job(self, job_id: str): - """Pause a scheduled job""" - logger.info(f"⏸️ Pausing scheduled job {job_id}") - - JobActionRequest(job_id=job_id) - - result = await self._make_request("POST", f"{API_BASE_URL}/scheduled-jobs/{job_id}/pause") - logger.info(f"✨ Successfully paused scheduled job {job_id}") - return result - - async def resume_scheduled_job(self, job_id: str): - """Resume a paused scheduled job""" - logger.info(f"▢️ Resuming scheduled job {job_id}") - - JobActionRequest(job_id=job_id) - - result = await self._make_request("POST", f"{API_BASE_URL}/scheduled-jobs/{job_id}/resume") - logger.info(f"✨ Successfully resumed scheduled job {job_id}") - return result - - async def trigger_scheduled_job(self, job_id: str): - """Manually trigger a scheduled job""" - logger.info(f"πŸš€ Manually triggering scheduled job {job_id}") - - TriggerJobRequest(job_id=job_id) - - result = await self._make_request("POST", f"{API_BASE_URL}/scheduled-jobs/{job_id}/trigger") - logger.info(f"✨ Successfully triggered scheduled job {job_id}") - return result - - async def get_job_executions( - self, - job_id: str, - page: int = 1, - page_size: int = 20, - status: Optional[str] = None, - ): - """Get execution history for a scheduled job""" - logger.info(f"πŸ“Š Fetching execution history for job {job_id}") - - GetJobExecutionsRequest( - job_id=job_id, - page=page, - page_size=page_size, - status=status, + params = filter_obj.to_params() + return await self._make_request( + "GET", f"{self.base_url}/history", params=params or None ) - params = {"page": page, "page_size": page_size} - if status: - params["status"] = status - - result = await self._make_request( - "GET", f"{API_BASE_URL}/scheduled-jobs/{job_id}/executions", params=params - ) - 
logger.info(f"✨ Successfully retrieved execution history for job {job_id}") - return result + # ------------------------------------------------------------------ + # Lifecycle + # ------------------------------------------------------------------ - async def close(self): - """Close the session to free up resources""" - logger.info("πŸ”’ Closing AsyncClient session") + async def close(self) -> None: + """Close the HTTP session.""" + logger.info("Closing AsyncClient session") await self.session.close() - logger.debug("βœ… Session closed successfully") - async def __aenter__(self): + async def __aenter__(self) -> "AsyncClient": return self - async def __aexit__(self, exc_type, exc_val, exc_tb): + async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: await self.close() diff --git a/scrapegraph-py/scrapegraph_py/client.py b/scrapegraph-py/scrapegraph_py/client.py index 28fc1bf..b4c9c27 100644 --- a/scrapegraph-py/scrapegraph_py/client.py +++ b/scrapegraph-py/scrapegraph_py/client.py @@ -1,34 +1,21 @@ """ -Synchronous HTTP client for the ScrapeGraphAI API. - -This module provides a synchronous client for interacting with all ScrapeGraphAI -API endpoints including smartscraper, searchscraper, crawl, agentic scraper, -markdownify, schema generation, scheduled jobs, and utility functions. - -The Client class supports: -- API key authentication -- SSL verification configuration -- Request timeout configuration -- Automatic retry logic with exponential backoff -- Mock mode for testing -- Context manager support for proper resource cleanup +Synchronous HTTP client for the ScrapeGraphAI v2 API. Example: - Basic usage with environment variables: - >>> from scrapegraph_py import Client - >>> client = Client.from_env() - >>> result = client.smartscraper( - ... website_url="https://example.com", - ... user_prompt="Extract product information" - ... ) - - Using context manager: - >>> with Client(api_key="sgai-...") as client: - ... 
result = client.scrape(website_url="https://example.com") + >>> from scrapegraph_py import Client + >>> client = Client(api_key="sgai-...") + >>> result = client.extract( + ... url="https://example.com", + ... prompt="Extract product information" + ... ) + >>> print(result) + + >>> # Namespaced crawl/monitor + >>> job = client.crawl.start("https://example.com", depth=3) + >>> status = client.crawl.status(job["id"]) """ -import uuid as _uuid -from typing import Any, Callable, Dict, Optional -from urllib.parse import urlparse + +from typing import Any, Callable, Dict, List, Optional import requests import urllib3 @@ -38,71 +25,191 @@ from scrapegraph_py.config import API_BASE_URL, DEFAULT_HEADERS from scrapegraph_py.exceptions import APIError from scrapegraph_py.logger import sgai_logger as logger -from scrapegraph_py.models.agenticscraper import ( - AgenticScraperRequest, - GetAgenticScraperRequest, -) -from scrapegraph_py.models.crawl import CrawlRequest, GetCrawlRequest -from scrapegraph_py.models.feedback import FeedbackRequest -from scrapegraph_py.models.markdownify import GetMarkdownifyRequest, MarkdownifyRequest -from scrapegraph_py.models.schema import ( - GenerateSchemaRequest, - GetSchemaStatusRequest, - SchemaGenerationResponse, -) -from scrapegraph_py.models.scrape import GetScrapeRequest, ScrapeRequest -from scrapegraph_py.models.searchscraper import ( - GetSearchScraperRequest, - SearchScraperRequest, - TimeRange, -) -from scrapegraph_py.models.sitemap import SitemapRequest, SitemapResponse -from scrapegraph_py.models.smartscraper import ( - GetSmartScraperRequest, - SmartScraperRequest, -) -from scrapegraph_py.models.scheduled_jobs import ( - GetJobExecutionsRequest, - GetScheduledJobRequest, - GetScheduledJobsRequest, - JobActionRequest, - JobActionResponse, - JobExecutionListResponse, - JobTriggerResponse, - ScheduledJobCreate, - ScheduledJobListResponse, - ScheduledJobResponse, - ScheduledJobUpdate, - TriggerJobRequest, -) +from 
scrapegraph_py.models.crawl import CrawlFormat, CrawlRequest +from scrapegraph_py.models.extract import ExtractRequest +from scrapegraph_py.models.history import HistoryFilter +from scrapegraph_py.models.monitor import MonitorCreateRequest +from scrapegraph_py.models.schema import SchemaRequest +from scrapegraph_py.models.scrape import ScrapeFormat, ScrapeRequest +from scrapegraph_py.models.search import SearchRequest +from scrapegraph_py.models.shared import FetchConfig, LlmConfig from scrapegraph_py.utils.helpers import handle_sync_response, validate_api_key -from scrapegraph_py.utils.toon_converter import process_response_with_toon -class Client: - """ - Synchronous client for the ScrapeGraphAI API. +class _CrawlNamespace: + """Namespaced crawl operations: client.crawl.start(), .status(), .stop(), .resume().""" + + def __init__(self, client: "Client"): + self._client = client + + def start( + self, + url: str, + depth: int = 2, + max_pages: int = 10, + format: str = "markdown", + include_patterns: Optional[List[str]] = None, + exclude_patterns: Optional[List[str]] = None, + fetch_config: Optional[FetchConfig] = None, + ) -> Dict[str, Any]: + """Start a crawl job. + + Args: + url: The starting URL for the crawl + depth: Maximum crawl depth (1-10) + max_pages: Maximum pages to crawl (1-100) + format: Output format - 'markdown' or 'html' + include_patterns: URL patterns to include + exclude_patterns: URL patterns to exclude + fetch_config: Fetch configuration options + """ + logger.info(f"Starting crawl for {url}") + request = CrawlRequest( + url=url, + depth=depth, + max_pages=max_pages, + format=CrawlFormat(format), + include_patterns=include_patterns, + exclude_patterns=exclude_patterns, + fetch_config=fetch_config, + ) + return self._client._make_request( + "POST", f"{self._client.base_url}/crawl", json=request.model_dump() + ) + + def status(self, crawl_id: str) -> Dict[str, Any]: + """Get crawl job status and results. 
+ + Args: + crawl_id: The crawl job ID + """ + logger.info(f"Fetching crawl status for {crawl_id}") + return self._client._make_request("GET", f"{self._client.base_url}/crawl/{crawl_id}") + + def stop(self, crawl_id: str) -> Dict[str, Any]: + """Stop a running crawl job. + + Args: + crawl_id: The crawl job ID to stop + """ + logger.info(f"Stopping crawl {crawl_id}") + return self._client._make_request( + "POST", f"{self._client.base_url}/crawl/{crawl_id}/stop" + ) + + def resume(self, crawl_id: str) -> Dict[str, Any]: + """Resume a stopped crawl job. + + Args: + crawl_id: The crawl job ID to resume + """ + logger.info(f"Resuming crawl {crawl_id}") + return self._client._make_request( + "POST", f"{self._client.base_url}/crawl/{crawl_id}/resume" + ) - This class provides synchronous methods for all ScrapeGraphAI API endpoints. - It handles authentication, request management, error handling, and supports - mock mode for testing. - Attributes: - api_key (str): The API key for authentication - headers (dict): Default headers including API key - timeout (Optional[float]): Request timeout in seconds - max_retries (int): Maximum number of retry attempts - retry_delay (float): Delay between retries in seconds - mock (bool): Whether mock mode is enabled - session (requests.Session): HTTP session for connection pooling +class _MonitorNamespace: + """Namespaced monitor operations: client.monitor.create(), .list(), .get(), etc.""" + + def __init__(self, client: "Client"): + self._client = client + + def create( + self, + name: str, + url: str, + prompt: str, + cron: str, + output_schema: Optional[Dict[str, Any]] = None, + fetch_config: Optional[FetchConfig] = None, + llm_config: Optional[LlmConfig] = None, + ) -> Dict[str, Any]: + """Create a new monitor. 
+ + Args: + name: Name of the monitor + url: URL to monitor + prompt: Prompt for AI extraction + cron: Cron expression (5 fields) + output_schema: Optional JSON Schema for structured output + fetch_config: Fetch configuration options + llm_config: LLM configuration options + """ + logger.info(f"Creating monitor '{name}' for {url}") + request = MonitorCreateRequest( + name=name, + url=url, + prompt=prompt, + cron=cron, + output_schema=output_schema, + fetch_config=fetch_config, + llm_config=llm_config, + ) + return self._client._make_request( + "POST", f"{self._client.base_url}/monitor", json=request.model_dump() + ) + + def list(self) -> Dict[str, Any]: + """List all monitors.""" + logger.info("Listing monitors") + return self._client._make_request("GET", f"{self._client.base_url}/monitor") + + def get(self, monitor_id: str) -> Dict[str, Any]: + """Get a specific monitor. + + Args: + monitor_id: The monitor ID + """ + logger.info(f"Fetching monitor {monitor_id}") + return self._client._make_request( + "GET", f"{self._client.base_url}/monitor/{monitor_id}" + ) + + def pause(self, monitor_id: str) -> Dict[str, Any]: + """Pause a monitor. + + Args: + monitor_id: The monitor ID to pause + """ + logger.info(f"Pausing monitor {monitor_id}") + return self._client._make_request( + "POST", f"{self._client.base_url}/monitor/{monitor_id}/pause" + ) + + def resume(self, monitor_id: str) -> Dict[str, Any]: + """Resume a paused monitor. + + Args: + monitor_id: The monitor ID to resume + """ + logger.info(f"Resuming monitor {monitor_id}") + return self._client._make_request( + "POST", f"{self._client.base_url}/monitor/{monitor_id}/resume" + ) + + def delete(self, monitor_id: str) -> Dict[str, Any]: + """Delete a monitor. 
+ + Args: + monitor_id: The monitor ID to delete + """ + logger.info(f"Deleting monitor {monitor_id}") + return self._client._make_request( + "DELETE", f"{self._client.base_url}/monitor/{monitor_id}" + ) + + +class Client: + """Synchronous client for the ScrapeGraphAI v2 API. Example: - >>> client = Client.from_env() - >>> result = client.smartscraper( - ... website_url="https://example.com", - ... user_prompt="Extract all products" - ... ) + >>> client = Client(api_key="sgai-...") + >>> result = client.scrape("https://example.com") + >>> result = client.extract("https://example.com", prompt="Extract prices") + >>> job = client.crawl.start("https://example.com", depth=3) """ + @classmethod def from_env( cls, @@ -110,75 +217,42 @@ def from_env( timeout: Optional[float] = None, max_retries: int = 3, retry_delay: float = 1.0, - mock: Optional[bool] = None, - mock_handler: Optional[Callable[[str, str, Dict[str, Any]], Any]] = None, - mock_responses: Optional[Dict[str, Any]] = None, - ): - """Initialize Client using API key from environment variable. - - Args: - verify_ssl: Whether to verify SSL certificates - timeout: Request timeout in seconds. None means no timeout (infinite) - max_retries: Maximum number of retry attempts - retry_delay: Delay between retries in seconds - mock: If True, the client will not perform real HTTP requests and - will return stubbed responses. If None, reads from SGAI_MOCK env. 
- """ + ) -> "Client": + """Initialize Client using SGAI_API_KEY environment variable.""" from os import getenv - # Allow enabling mock mode from environment if not explicitly provided - if mock is None: - mock_env = getenv("SGAI_MOCK", "0").strip().lower() - mock = mock_env in {"1", "true", "yes", "on"} - api_key = getenv("SGAI_API_KEY") - # In mock mode, we don't need a real API key if not api_key: - if mock: - api_key = "sgai-00000000-0000-0000-0000-000000000000" - else: - raise ValueError("SGAI_API_KEY environment variable not set") + raise ValueError("SGAI_API_KEY environment variable not set") return cls( api_key=api_key, verify_ssl=verify_ssl, timeout=timeout, max_retries=max_retries, retry_delay=retry_delay, - mock=bool(mock), - mock_handler=mock_handler, - mock_responses=mock_responses, ) def __init__( self, api_key: str = None, + base_url: Optional[str] = None, verify_ssl: bool = True, timeout: Optional[float] = None, max_retries: int = 3, retry_delay: float = 1.0, - mock: bool = False, - mock_handler: Optional[Callable[[str, str, Dict[str, Any]], Any]] = None, - mock_responses: Optional[Dict[str, Any]] = None, ): - """Initialize Client with configurable parameters. + """Initialize Client. Args: - api_key: API key for authentication. If None, will try to load - from environment + api_key: API key for authentication. If None, reads from SGAI_API_KEY env var + base_url: Override the default API base URL verify_ssl: Whether to verify SSL certificates - timeout: Request timeout in seconds. None means no timeout (infinite) - max_retries: Maximum number of retry attempts - retry_delay: Delay between retries in seconds - mock: If True, the client will bypass HTTP calls and return - deterministic mock responses - mock_handler: Optional callable to generate custom mock responses - given (method, url, request_kwargs) - mock_responses: Optional mapping of path (e.g. 
"/v1/credits") to - static response or callable returning a response + timeout: Request timeout in seconds (None = no timeout) + max_retries: Maximum retry attempts on server errors + retry_delay: Base delay between retries in seconds """ - logger.info("πŸ”‘ Initializing Client") + logger.info("Initializing Client") - # Try to get API key from environment if not provided if api_key is None: from os import getenv @@ -189,28 +263,25 @@ def __init__( ) validate_api_key(api_key) - logger.debug( - f"πŸ› οΈ Configuration: verify_ssl={verify_ssl}, timeout={timeout}, " - f"max_retries={max_retries}" - ) self.api_key = api_key - self.headers = {**DEFAULT_HEADERS, "SGAI-APIKEY": api_key} + self.base_url = (base_url or API_BASE_URL).rstrip("/") + self.headers = { + **DEFAULT_HEADERS, + "Authorization": f"Bearer {api_key}", + "SGAI-APIKEY": api_key, + } self.timeout = timeout self.max_retries = max_retries self.retry_delay = retry_delay - self.mock = bool(mock) - self.mock_handler = mock_handler - self.mock_responses = mock_responses or {} - # Create a session for connection pooling + # HTTP session with connection pooling and retry self.session = requests.Session() self.session.headers.update(self.headers) self.session.verify = verify_ssl - # Configure retries adapter = requests.adapters.HTTPAdapter( - max_retries=requests.urllib3.Retry( + max_retries=urllib3.Retry( total=max_retries, backoff_factor=retry_delay, status_forcelist=[500, 502, 503, 504], @@ -219,1117 +290,222 @@ def __init__( self.session.mount("http://", adapter) self.session.mount("https://", adapter) - # Add warning suppression if verify_ssl is False if not verify_ssl: urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) - logger.info("βœ… Client initialized successfully") - - def _make_request(self, method: str, url: str, **kwargs) -> Any: - """ - Make HTTP request with error handling and retry logic. 
+ # Namespaced sub-clients + self.crawl = _CrawlNamespace(self) + self.monitor = _MonitorNamespace(self) - Args: - method: HTTP method (GET, POST, etc.) - url: Full URL for the request - **kwargs: Additional arguments to pass to requests + logger.info("Client initialized successfully") - Returns: - Parsed JSON response data - - Raises: - APIError: If the API returns an error response - ConnectionError: If unable to connect to the API - - Note: - In mock mode, this method returns deterministic responses without - making actual HTTP requests. - """ - # Short-circuit when mock mode is enabled - if getattr(self, "mock", False): - return self._mock_response(method, url, **kwargs) + def _make_request(self, method: str, url: str, **kwargs: Any) -> Any: + """Make HTTP request with error handling.""" try: - logger.info(f"πŸš€ Making {method} request to {url}") - logger.debug(f"πŸ” Request parameters: {kwargs}") - + logger.debug(f"Making {method} request to {url}") response = self.session.request(method, url, timeout=self.timeout, **kwargs) - logger.debug(f"πŸ“₯ Response status: {response.status_code}") - - result = handle_sync_response(response) - logger.info(f"βœ… Request completed successfully: {method} {url}") - return result - + return handle_sync_response(response) except RequestException as e: - logger.error(f"❌ Request failed: {str(e)}") + logger.error(f"Request failed: {e}") if hasattr(e, "response") and e.response is not None: try: error_data = e.response.json() error_msg = error_data.get("error", str(e)) - logger.error(f"πŸ”΄ API Error: {error_msg}") raise APIError(error_msg, status_code=e.response.status_code) except ValueError: - logger.error("πŸ”΄ Could not parse error response") raise APIError( str(e), - status_code=( - e.response.status_code - if hasattr(e.response, "status_code") - else None - ), + status_code=getattr(e.response, "status_code", None), ) - logger.error(f"πŸ”΄ Connection Error: {str(e)}") - raise ConnectionError(f"Failed to connect to API: 
{str(e)}") - - def _mock_response(self, method: str, url: str, **kwargs) -> Any: - """Return a deterministic mock response without performing network I/O. - - Resolution order: - 1) If a custom mock_handler is provided, delegate to it - 2) If mock_responses contains a key for the request path, use it - 3) Fallback to built-in defaults per endpoint family - """ - logger.info(f"πŸ§ͺ Mock mode active. Returning stub for {method} {url}") - - # 1) Custom handler - if self.mock_handler is not None: - try: - return self.mock_handler(method, url, kwargs) - except Exception as handler_error: - logger.warning(f"Custom mock_handler raised: {handler_error}. Falling back to defaults.") + raise ConnectionError(f"Failed to connect to API: {e}") - # 2) Path-based override - try: - parsed = urlparse(url) - path = parsed.path.rstrip("/") - except Exception: - path = url - - override = self.mock_responses.get(path) - if override is not None: - return override() if callable(override) else override - - # 3) Built-in defaults - def new_id(prefix: str) -> str: - return f"{prefix}-{_uuid.uuid4()}" - - upper_method = method.upper() - - # Credits endpoint - if path.endswith("/credits") and upper_method == "GET": - return {"remaining_credits": 1000, "total_credits_used": 0} - - # Health check endpoint - if path.endswith("/healthz") and upper_method == "GET": - return {"status": "healthy", "message": "Service is operational"} - - # Feedback acknowledge - if path.endswith("/feedback") and upper_method == "POST": - return {"status": "success"} - - # Create-like endpoints (POST) - if upper_method == "POST": - if path.endswith("/crawl"): - return {"crawl_id": new_id("mock-crawl")} - elif path.endswith("/scheduled-jobs"): - return { - "id": new_id("mock-job"), - "user_id": new_id("mock-user"), - "job_name": "Mock Scheduled Job", - "service_type": "smartscraper", - "cron_expression": "0 9 * * 1", - "job_config": {"mock": "config"}, - "is_active": True, - "created_at": "2024-01-01T00:00:00Z", - 
"updated_at": "2024-01-01T00:00:00Z", - "next_run_at": "2024-01-08T09:00:00Z" - } - elif "/pause" in path: - return { - "message": "Job paused successfully", - "job_id": new_id("mock-job"), - "is_active": False - } - elif "/resume" in path: - return { - "message": "Job resumed successfully", - "job_id": new_id("mock-job"), - "is_active": True, - "next_run_at": "2024-01-08T09:00:00Z" - } - elif "/trigger" in path: - return { - "execution_id": new_id("mock-task"), - "scheduled_job_id": new_id("mock-job"), - "triggered_at": "2024-01-01T00:00:00Z", - "message": f"Job triggered successfully. Task ID: {new_id('mock-task')}" - } - # All other POST endpoints return a request id - return {"request_id": new_id("mock-req")} - - # Status-like endpoints (GET) - if upper_method == "GET": - if "markdownify" in path: - return {"status": "completed", "content": "# Mock markdown\n\n..."} - if "smartscraper" in path: - return {"status": "completed", "result": [{"field": "value"}]} - if "searchscraper" in path: - return { - "status": "completed", - "results": [{"url": "https://example.com"}], - "markdown_content": "# Mock Markdown Content\n\nThis is mock markdown content for testing purposes.\n\n## Section 1\n\nSome content here.\n\n## Section 2\n\nMore content here.", - "reference_urls": ["https://example.com", "https://example2.com"] - } - if "crawl" in path: - return {"status": "completed", "pages": []} - if "agentic-scrapper" in path: - return {"status": "completed", "actions": []} - if "scheduled-jobs" in path: - if "/executions" in path: - return { - "executions": [ - { - "id": new_id("mock-exec"), - "scheduled_job_id": new_id("mock-job"), - "execution_id": new_id("mock-task"), - "status": "completed", - "started_at": "2024-01-01T00:00:00Z", - "completed_at": "2024-01-01T00:01:00Z", - "result": {"mock": "result"}, - "credits_used": 10 - } - ], - "total": 1, - "page": 1, - "page_size": 20 - } - elif path.endswith("/scheduled-jobs"): # List jobs endpoint - return { - "jobs": [ - { 
- "id": new_id("mock-job"), - "user_id": new_id("mock-user"), - "job_name": "Mock Scheduled Job", - "service_type": "smartscraper", - "cron_expression": "0 9 * * 1", - "job_config": {"mock": "config"}, - "is_active": True, - "created_at": "2024-01-01T00:00:00Z", - "updated_at": "2024-01-01T00:00:00Z", - "next_run_at": "2024-01-08T09:00:00Z" - } - ], - "total": 1, - "page": 1, - "page_size": 20 - } - else: # Single job endpoint - return { - "id": new_id("mock-job"), - "user_id": new_id("mock-user"), - "job_name": "Mock Scheduled Job", - "service_type": "smartscraper", - "cron_expression": "0 9 * * 1", - "job_config": {"mock": "config"}, - "is_active": True, - "created_at": "2024-01-01T00:00:00Z", - "updated_at": "2024-01-01T00:00:00Z", - "next_run_at": "2024-01-08T09:00:00Z" - } - - # Update operations (PATCH/PUT) - if upper_method in ["PATCH", "PUT"] and "scheduled-jobs" in path: - return { - "id": new_id("mock-job"), - "user_id": new_id("mock-user"), - "job_name": "Updated Mock Scheduled Job", - "service_type": "smartscraper", - "cron_expression": "0 10 * * 1", - "job_config": {"mock": "updated_config"}, - "is_active": True, - "created_at": "2024-01-01T00:00:00Z", - "updated_at": "2024-01-01T01:00:00Z", - "next_run_at": "2024-01-08T10:00:00Z" - } - - # Delete operations - if upper_method == "DELETE" and "scheduled-jobs" in path: - return {"message": "Scheduled job deleted successfully"} - - # Generic fallback - return {"status": "mock", "url": url, "method": method, "kwargs": kwargs} - - def markdownify(self, website_url: str, headers: Optional[dict[str, str]] = None, mock: bool = False, render_heavy_js: bool = False, stealth: bool = False, wait_ms: Optional[int] = None, return_toon: bool = False): - """Send a markdownify request - - Args: - website_url: The URL to convert to markdown - headers: Optional HTTP headers - mock: Enable mock mode for testing - render_heavy_js: Enable heavy JavaScript rendering - stealth: Enable stealth mode to avoid bot detection - 
wait_ms: Number of milliseconds to wait before scraping the website - return_toon: If True, return response in TOON format (reduces token usage by 30-60%) - """ - logger.info(f"πŸ” Starting markdownify request for {website_url}") - if headers: - logger.debug("πŸ”§ Using custom headers") - if stealth: - logger.debug("πŸ₯· Stealth mode enabled") - if render_heavy_js: - logger.debug("⚑ Heavy JavaScript rendering enabled") - if return_toon: - logger.debug("🎨 TOON format output enabled") - - request = MarkdownifyRequest(website_url=website_url, headers=headers, mock=mock, render_heavy_js=render_heavy_js, stealth=stealth, wait_ms=wait_ms) - logger.debug("βœ… Request validation passed") - - result = self._make_request( - "POST", f"{API_BASE_URL}/markdownify", json=request.model_dump() - ) - logger.info("✨ Markdownify request completed successfully") - return process_response_with_toon(result, return_toon) - - def get_markdownify(self, request_id: str, return_toon: bool = False): - """Get the result of a previous markdownify request - - Args: - request_id: The request ID to fetch - return_toon: If True, return response in TOON format (reduces token usage by 30-60%) - """ - logger.info(f"πŸ” Fetching markdownify result for request {request_id}") - if return_toon: - logger.debug("🎨 TOON format output enabled") - - # Validate input using Pydantic model - GetMarkdownifyRequest(request_id=request_id) - logger.debug("βœ… Request ID validation passed") - - result = self._make_request("GET", f"{API_BASE_URL}/markdownify/{request_id}") - logger.info(f"✨ Successfully retrieved result for request {request_id}") - return process_response_with_toon(result, return_toon) + # ------------------------------------------------------------------ + # Scrape + # ------------------------------------------------------------------ def scrape( self, - website_url: str, - render_heavy_js: bool = False, - branding: bool = False, - headers: Optional[dict[str, str]] = None, - mock:bool=False, - 
stealth:bool=False, - wait_ms: Optional[int] = None, - return_toon: bool = False, - ): - """Send a scrape request to get HTML content from a website + url: str, + format: str = "markdown", + fetch_config: Optional[FetchConfig] = None, + ) -> Dict[str, Any]: + """Scrape a page and return it in the specified format. Args: - website_url: The URL of the website to get HTML from - render_heavy_js: Whether to render heavy JavaScript (defaults to False) - branding: Whether to include branding in the response (defaults to False) - headers: Optional headers to send with the request - mock: Enable mock mode for testing - stealth: Enable stealth mode to avoid bot detection - wait_ms: Number of milliseconds to wait before scraping the website - return_toon: If True, return response in TOON format (reduces token usage by 30-60%) + url: URL to scrape + format: Output format - 'markdown', 'html', 'screenshot', or 'branding' + fetch_config: Fetch configuration options """ - logger.info(f"πŸ” Starting scrape request for {website_url}") - logger.debug(f"πŸ”§ Render heavy JS: {render_heavy_js}") - logger.debug(f"πŸ”§ Branding: {branding}") - if headers: - logger.debug("πŸ”§ Using custom headers") - if stealth: - logger.debug("πŸ₯· Stealth mode enabled") - if return_toon: - logger.debug("🎨 TOON format output enabled") - + logger.info(f"Scraping {url} (format={format})") request = ScrapeRequest( - website_url=website_url, - render_heavy_js=render_heavy_js, - branding=branding, - headers=headers, - mock=mock, - stealth=stealth, - wait_ms=wait_ms, + url=url, + format=ScrapeFormat(format), + fetch_config=fetch_config, ) - logger.debug("βœ… Request validation passed") - - result = self._make_request( - "POST", f"{API_BASE_URL}/scrape", json=request.model_dump() + return self._make_request( + "POST", f"{self.base_url}/scrape", json=request.model_dump() ) - logger.info("✨ Scrape request completed successfully") - return process_response_with_toon(result, return_toon) - def get_scrape(self, 
request_id: str, return_toon: bool = False): - """Get the result of a previous scrape request - - Args: - request_id: The request ID to fetch - return_toon: If True, return response in TOON format (reduces token usage by 30-60%) - """ - logger.info(f"πŸ” Fetching scrape result for request {request_id}") - if return_toon: - logger.debug("🎨 TOON format output enabled") - - # Validate input using Pydantic model - GetScrapeRequest(request_id=request_id) - logger.debug("βœ… Request ID validation passed") - - result = self._make_request("GET", f"{API_BASE_URL}/scrape/{request_id}") - logger.info(f"✨ Successfully retrieved result for request {request_id}") - return process_response_with_toon(result, return_toon) + # ------------------------------------------------------------------ + # Extract (replaces SmartScraper) + # ------------------------------------------------------------------ - def sitemap( + def extract( self, - website_url: str, - mock: bool = False, - ) -> SitemapResponse: - """Extract all URLs from a website's sitemap. - - Automatically discovers sitemap from robots.txt or common sitemap locations. + url: str, + prompt: str, + output_schema: Optional[Any] = None, + fetch_config: Optional[FetchConfig] = None, + llm_config: Optional[LlmConfig] = None, + ) -> Dict[str, Any]: + """Extract structured data from a page using AI. Args: - website_url: The URL of the website to extract sitemap from - mock: Whether to use mock mode for this request - - Returns: - SitemapResponse: Object containing list of URLs extracted from sitemap - - Raises: - ValueError: If website_url is invalid - APIError: If the API request fails - - Examples: - >>> client = Client(api_key="your-api-key") - >>> response = client.sitemap("https://example.com") - >>> print(f"Found {len(response.urls)} URLs") - >>> for url in response.urls[:5]: - ... 
print(url) + url: URL to extract data from + prompt: Natural language prompt describing what to extract + output_schema: JSON Schema dict or Pydantic BaseModel class for output structure + fetch_config: Fetch configuration options + llm_config: LLM configuration options """ - logger.info(f"πŸ—ΊοΈ Starting sitemap extraction for {website_url}") + logger.info(f"Extracting from {url}") + + # Convert Pydantic model class to JSON schema + schema_dict = None + if output_schema is not None: + if isinstance(output_schema, type) and issubclass(output_schema, BaseModel): + schema_dict = output_schema.model_json_schema() + elif isinstance(output_schema, dict): + schema_dict = output_schema + else: + raise ValueError( + "output_schema must be a dict (JSON Schema) or a Pydantic BaseModel class" + ) - request = SitemapRequest( - website_url=website_url, - mock=mock + request = ExtractRequest( + url=url, + prompt=prompt, + output_schema=schema_dict, + fetch_config=fetch_config, + llm_config=llm_config, ) - logger.debug("βœ… Request validation passed") - - result = self._make_request( - "POST", f"{API_BASE_URL}/sitemap", json=request.model_dump() + return self._make_request( + "POST", f"{self.base_url}/extract", json=request.model_dump() ) - logger.info(f"✨ Sitemap extraction completed successfully - found {len(result.get('urls', []))} URLs") - # Parse response into SitemapResponse model - return SitemapResponse(**result) + # ------------------------------------------------------------------ + # Search (replaces SearchScraper) + # ------------------------------------------------------------------ - def smartscraper( + def search( self, - user_prompt: str, - website_url: Optional[str] = None, - website_html: Optional[str] = None, - website_markdown: Optional[str] = None, - headers: Optional[dict[str, str]] = None, - cookies: Optional[Dict[str, str]] = None, - output_schema: Optional[BaseModel] = None, - number_of_scrolls: Optional[int] = None, - total_pages: Optional[int] = None, 
- mock: bool = False, - plain_text: bool = False, - render_heavy_js: bool = False, - stealth: bool = False, - wait_ms: Optional[int] = None, - return_toon: bool = False, - ): - """ - Send a smartscraper request with optional pagination support and cookies. - - Supports three types of input (must provide exactly one): - - website_url: Scrape from a URL - - website_html: Process local HTML content - - website_markdown: Process local Markdown content + query: str, + num_results: int = 5, + output_schema: Optional[Any] = None, + llm_config: Optional[LlmConfig] = None, + ) -> Dict[str, Any]: + """Search the web and extract structured results. Args: - user_prompt: Natural language prompt describing what to extract - website_url: URL to scrape (optional) - website_html: Raw HTML content to process (optional, max 2MB) - website_markdown: Markdown content to process (optional, max 2MB) - headers: Optional HTTP headers - cookies: Optional cookies for authentication - output_schema: Optional Pydantic model for structured output - number_of_scrolls: Number of times to scroll (0-100) - total_pages: Number of pages to scrape (1-10) - mock: Enable mock mode for testing - plain_text: Return plain text instead of structured data - render_heavy_js: Enable heavy JavaScript rendering - stealth: Enable stealth mode to avoid bot detection - wait_ms: Number of milliseconds to wait before scraping the website - return_toon: If True, return response in TOON format (reduces token usage by 30-60%) - - Returns: - Dictionary containing the scraping results, or TOON formatted string if return_toon=True - - Raises: - ValueError: If validation fails or invalid parameters provided - APIError: If the API request fails + query: The search query + num_results: Number of results (3-20, default 5) + output_schema: JSON Schema dict or Pydantic BaseModel class for output structure + llm_config: LLM configuration options """ - logger.info("πŸ” Starting smartscraper request") - if website_url: - 
logger.debug(f"🌐 URL: {website_url}") - if website_html: - logger.debug("πŸ“„ Using provided HTML content") - if website_markdown: - logger.debug("πŸ“ Using provided Markdown content") - if headers: - logger.debug("πŸ”§ Using custom headers") - if cookies: - logger.debug("πŸͺ Using cookies for authentication/session management") - if number_of_scrolls is not None: - logger.debug(f"πŸ”„ Number of scrolls: {number_of_scrolls}") - if total_pages is not None: - logger.debug(f"πŸ“„ Total pages to scrape: {total_pages}") - if stealth: - logger.debug("πŸ₯· Stealth mode enabled") - if render_heavy_js: - logger.debug("⚑ Heavy JavaScript rendering enabled") - if return_toon: - logger.debug("🎨 TOON format output enabled") - logger.debug(f"πŸ“ Prompt: {user_prompt}") - - request = SmartScraperRequest( - website_url=website_url, - website_html=website_html, - website_markdown=website_markdown, - headers=headers, - cookies=cookies, - user_prompt=user_prompt, - output_schema=output_schema, - number_of_scrolls=number_of_scrolls, - total_pages=total_pages, - mock=mock, - plain_text=plain_text, - render_heavy_js=render_heavy_js, - stealth=stealth, - wait_ms=wait_ms, - ) - logger.debug("βœ… Request validation passed") - - result = self._make_request( - "POST", f"{API_BASE_URL}/smartscraper", json=request.model_dump() - ) - logger.info("✨ Smartscraper request completed successfully") - return process_response_with_toon(result, return_toon) - - def get_smartscraper(self, request_id: str, return_toon: bool = False): - """Get the result of a previous smartscraper request - - Args: - request_id: The request ID to fetch - return_toon: If True, return response in TOON format (reduces token usage by 30-60%) - """ - logger.info(f"πŸ” Fetching smartscraper result for request {request_id}") - if return_toon: - logger.debug("🎨 TOON format output enabled") - - # Validate input using Pydantic model - GetSmartScraperRequest(request_id=request_id) - logger.debug("βœ… Request ID validation 
passed") - - result = self._make_request("GET", f"{API_BASE_URL}/smartscraper/{request_id}") - logger.info(f"✨ Successfully retrieved result for request {request_id}") - return process_response_with_toon(result, return_toon) - - def submit_feedback( - self, request_id: str, rating: int, feedback_text: Optional[str] = None - ): - """Submit feedback for a request""" - logger.info(f"πŸ“ Submitting feedback for request {request_id}") - logger.debug(f"⭐ Rating: {rating}, Feedback: {feedback_text}") - - feedback = FeedbackRequest( - request_id=request_id, rating=rating, feedback_text=feedback_text - ) - logger.debug("βœ… Feedback validation passed") - - result = self._make_request( - "POST", f"{API_BASE_URL}/feedback", json=feedback.model_dump() - ) - logger.info("✨ Feedback submitted successfully") - return result - - def get_credits(self): - """Get credits information""" - logger.info("πŸ’³ Fetching credits information") - - result = self._make_request( - "GET", - f"{API_BASE_URL}/credits", - ) - logger.info( - f"✨ Credits info retrieved: {result.get('remaining_credits')} " - f"credits remaining" - ) - return result - - def healthz(self): - """Check the health status of the service - - This endpoint is useful for monitoring and ensuring the service is operational. - It returns a JSON response indicating the service's health status. 
- - Returns: - dict: Health status information - - Example: - >>> client = Client.from_env() - >>> health = client.healthz() - >>> print(health) - """ - logger.info("πŸ₯ Checking service health") - - result = self._make_request( - "GET", - f"{API_BASE_URL}/healthz", - ) - logger.info("✨ Health check completed successfully") - return result - - def searchscraper( - self, - user_prompt: str, - num_results: Optional[int] = 3, - headers: Optional[dict[str, str]] = None, - output_schema: Optional[BaseModel] = None, - extraction_mode: bool = True, - mock: bool = False, - stealth: bool = False, - location_geo_code: Optional[str] = None, - time_range: Optional[TimeRange] = None, - return_toon: bool = False, - ): - """Send a searchscraper request + logger.info(f"Searching: {query}") + + schema_dict = None + if output_schema is not None: + if isinstance(output_schema, type) and issubclass(output_schema, BaseModel): + schema_dict = output_schema.model_json_schema() + elif isinstance(output_schema, dict): + schema_dict = output_schema + else: + raise ValueError( + "output_schema must be a dict (JSON Schema) or a Pydantic BaseModel class" + ) - Args: - user_prompt: The search prompt string - num_results: Number of websites to scrape (3-20). Default is 3. - More websites provide better research depth but cost more - credits. Credit calculation: 30 base + 10 per additional - website beyond 3. - headers: Optional headers to send with the request - output_schema: Optional schema to structure the output - extraction_mode: Whether to use AI extraction (True) or markdown conversion (False). - AI extraction costs 10 credits per page, markdown conversion costs 2 credits per page. 
- mock: Enable mock mode for testing - stealth: Enable stealth mode to avoid bot detection - location_geo_code: Optional geo code of the location to search in (e.g., "us") - time_range: Optional time range filter for search results (e.g., TimeRange.PAST_WEEK) - return_toon: If True, return response in TOON format (reduces token usage by 30-60%) - """ - logger.info("πŸ” Starting searchscraper request") - logger.debug(f"πŸ“ Prompt: {user_prompt}") - logger.debug(f"🌐 Number of results: {num_results}") - logger.debug(f"πŸ€– Extraction mode: {'AI extraction' if extraction_mode else 'Markdown conversion'}") - if headers: - logger.debug("πŸ”§ Using custom headers") - if stealth: - logger.debug("πŸ₯· Stealth mode enabled") - if location_geo_code: - logger.debug(f"🌍 Location geo code: {location_geo_code}") - if time_range: - logger.debug(f"πŸ“… Time range: {time_range.value}") - if return_toon: - logger.debug("🎨 TOON format output enabled") - - request = SearchScraperRequest( - user_prompt=user_prompt, + request = SearchRequest( + query=query, num_results=num_results, - headers=headers, - output_schema=output_schema, - extraction_mode=extraction_mode, - mock=mock, - stealth=stealth, - location_geo_code=location_geo_code, - time_range=time_range, + output_schema=schema_dict, + llm_config=llm_config, ) - logger.debug("βœ… Request validation passed") - - result = self._make_request( - "POST", f"{API_BASE_URL}/searchscraper", json=request.model_dump() + return self._make_request( + "POST", f"{self.base_url}/search", json=request.model_dump() ) - logger.info("✨ Searchscraper request completed successfully") - return process_response_with_toon(result, return_toon) - - def get_searchscraper(self, request_id: str, return_toon: bool = False): - """Get the result of a previous searchscraper request - - Args: - request_id: The request ID to fetch - return_toon: If True, return response in TOON format (reduces token usage by 30-60%) - """ - logger.info(f"πŸ” Fetching searchscraper 
result for request {request_id}") - if return_toon: - logger.debug("🎨 TOON format output enabled") - # Validate input using Pydantic model - GetSearchScraperRequest(request_id=request_id) - logger.debug("βœ… Request ID validation passed") + # ------------------------------------------------------------------ + # Schema + # ------------------------------------------------------------------ - result = self._make_request("GET", f"{API_BASE_URL}/searchscraper/{request_id}") - logger.info(f"✨ Successfully retrieved result for request {request_id}") - return process_response_with_toon(result, return_toon) - - def crawl( + def schema( self, - url: str, - prompt: Optional[str] = None, - data_schema: Optional[Dict[str, Any]] = None, - extraction_mode: bool = True, - cache_website: bool = True, - depth: int = 2, - breadth: Optional[int] = None, - max_pages: int = 2, - same_domain_only: bool = True, - batch_size: Optional[int] = None, - sitemap: bool = False, - headers: Optional[dict[str, str]] = None, - render_heavy_js: bool = False, - stealth: bool = False, - include_paths: Optional[list[str]] = None, - exclude_paths: Optional[list[str]] = None, - webhook_url: Optional[str] = None, - wait_ms: Optional[int] = None, - return_toon: bool = False, - ): - """Send a crawl request with support for both AI extraction and - markdown conversion modes - - Args: - url: The starting URL to crawl - prompt: AI prompt for data extraction (required for AI extraction mode) - data_schema: Schema for structured output - extraction_mode: Whether to use AI extraction (True) or markdown (False) - cache_website: Whether to cache the website - depth: Maximum depth of link traversal - breadth: Maximum number of links to crawl per depth level. If None, unlimited (default). - Controls the 'width' of exploration at each depth. Useful for limiting crawl scope - on large sites. Note: max_pages always takes priority. Ignored when sitemap=True. 
- max_pages: Maximum number of pages to crawl - same_domain_only: Only crawl pages within the same domain - batch_size: Number of pages to process in batch - sitemap: Use sitemap for crawling - headers: Optional HTTP headers - render_heavy_js: Enable heavy JavaScript rendering - stealth: Enable stealth mode to avoid bot detection - include_paths: List of path patterns to include (e.g., ['/products/*', '/blog/**']) - Supports wildcards: * matches any characters, ** matches any path segments - exclude_paths: List of path patterns to exclude (e.g., ['/admin/*', '/api/*']) - Supports wildcards and takes precedence over include_paths - webhook_url: URL to receive webhook notifications when the crawl completes - wait_ms: Number of milliseconds to wait before scraping each page - return_toon: If True, return response in TOON format (reduces token usage by 30-60%) - """ - logger.info("πŸ” Starting crawl request") - logger.debug(f"🌐 URL: {url}") - logger.debug( - f"πŸ€– Extraction mode: {'AI' if extraction_mode else 'Markdown conversion'}" - ) - if extraction_mode: - logger.debug(f"πŸ“ Prompt: {prompt}") - logger.debug(f"πŸ“Š Schema provided: {bool(data_schema)}") - else: - logger.debug( - "πŸ“„ Markdown conversion mode - no AI processing, 2 credits per page" - ) - logger.debug(f"πŸ’Ύ Cache website: {cache_website}") - logger.debug(f"πŸ” Depth: {depth}") - if breadth is not None: - logger.debug(f"πŸ“ Breadth: {breadth}") - logger.debug(f"πŸ“„ Max pages: {max_pages}") - logger.debug(f"🏠 Same domain only: {same_domain_only}") - logger.debug(f"πŸ—ΊοΈ Use sitemap: {sitemap}") - if stealth: - logger.debug("πŸ₯· Stealth mode enabled") - if render_heavy_js: - logger.debug("⚑ Heavy JavaScript rendering enabled") - if batch_size is not None: - logger.debug(f"πŸ“¦ Batch size: {batch_size}") - if include_paths: - logger.debug(f"βœ… Include paths: {include_paths}") - if exclude_paths: - logger.debug(f"❌ Exclude paths: {exclude_paths}") - if webhook_url: - logger.debug(f"πŸ”” 
Webhook URL: {webhook_url}") - if wait_ms is not None: - logger.debug(f"⏱️ Wait ms: {wait_ms}") - if return_toon: - logger.debug("🎨 TOON format output enabled") - - # Build request data, excluding None values - request_data = { - "url": url, - "extraction_mode": extraction_mode, - "cache_website": cache_website, - "depth": depth, - "max_pages": max_pages, - "same_domain_only": same_domain_only, - "sitemap": sitemap, - "render_heavy_js": render_heavy_js, - "stealth": stealth, - } - - # Add optional parameters only if provided - if prompt is not None: - request_data["prompt"] = prompt - if data_schema is not None: - request_data["data_schema"] = data_schema - if breadth is not None: - request_data["breadth"] = breadth - if batch_size is not None: - request_data["batch_size"] = batch_size - if headers is not None: - request_data["headers"] = headers - if include_paths is not None: - request_data["include_paths"] = include_paths - if exclude_paths is not None: - request_data["exclude_paths"] = exclude_paths - if webhook_url is not None: - request_data["webhook_url"] = webhook_url - if wait_ms is not None: - request_data["wait_ms"] = wait_ms - - request = CrawlRequest(**request_data) - logger.debug("βœ… Request validation passed") - - # Serialize the request, excluding None values - request_json = request.model_dump(exclude_none=True) - result = self._make_request("POST", f"{API_BASE_URL}/crawl", json=request_json) - logger.info("✨ Crawl request completed successfully") - return process_response_with_toon(result, return_toon) - - def get_crawl(self, crawl_id: str, return_toon: bool = False): - """Get the result of a previous crawl request - - Args: - crawl_id: The crawl ID to fetch - return_toon: If True, return response in TOON format (reduces token usage by 30-60%) - """ - logger.info(f"πŸ” Fetching crawl result for request {crawl_id}") - if return_toon: - logger.debug("🎨 TOON format output enabled") - - # Validate input using Pydantic model - 
GetCrawlRequest(crawl_id=crawl_id) - logger.debug("βœ… Request ID validation passed") - - result = self._make_request("GET", f"{API_BASE_URL}/crawl/{crawl_id}") - logger.info(f"✨ Successfully retrieved result for request {crawl_id}") - return process_response_with_toon(result, return_toon) - - def agenticscraper( - self, - url: str, - steps: list[str], - use_session: bool = True, - user_prompt: Optional[str] = None, - output_schema: Optional[Dict[str, Any]] = None, - ai_extraction: bool = False, - mock: bool=False, - stealth: bool=False, - return_toon: bool = False, - ): - """Send an agentic scraper request to perform automated actions on a webpage + prompt: str, + existing_schema: Optional[Dict[str, Any]] = None, + ) -> Dict[str, Any]: + """Generate a JSON schema from a natural language prompt. Args: - url: The URL to scrape - steps: List of steps to perform on the webpage - use_session: Whether to use session for the scraping (default: True) - user_prompt: Prompt for AI extraction (required when ai_extraction=True) - output_schema: Schema for structured data extraction (optional, used with ai_extraction=True) - ai_extraction: Whether to use AI for data extraction from the scraped content (default: False) - mock: Enable mock mode for testing - stealth: Enable stealth mode to avoid bot detection - return_toon: If True, return response in TOON format (reduces token usage by 30-60%) + prompt: Description of the schema to generate + existing_schema: Optional existing schema to modify/extend """ - logger.info(f"πŸ€– Starting agentic scraper request for {url}") - logger.debug(f"πŸ”§ Use session: {use_session}") - logger.debug(f"πŸ“‹ Steps: {steps}") - logger.debug(f"🧠 AI extraction: {ai_extraction}") - if ai_extraction: - logger.debug(f"πŸ’­ User prompt: {user_prompt}") - logger.debug(f"πŸ“‹ Output schema provided: {output_schema is not None}") - if stealth: - logger.debug("πŸ₯· Stealth mode enabled") - if return_toon: - logger.debug("🎨 TOON format output enabled") - - 
request = AgenticScraperRequest( - url=url, - steps=steps, - use_session=use_session, - user_prompt=user_prompt, - output_schema=output_schema, - ai_extraction=ai_extraction, - mock=mock, - stealth=stealth + logger.info("Generating schema") + request = SchemaRequest( + prompt=prompt, + existing_schema=existing_schema, ) - logger.debug("βœ… Request validation passed") - - result = self._make_request( - "POST", f"{API_BASE_URL}/agentic-scrapper", json=request.model_dump() + return self._make_request( + "POST", f"{self.base_url}/schema", json=request.model_dump() ) - logger.info("✨ Agentic scraper request completed successfully") - return process_response_with_toon(result, return_toon) - def get_agenticscraper(self, request_id: str, return_toon: bool = False): - """Get the result of a previous agentic scraper request - - Args: - request_id: The request ID to fetch - return_toon: If True, return response in TOON format (reduces token usage by 30-60%) - """ - logger.info(f"πŸ” Fetching agentic scraper result for request {request_id}") - if return_toon: - logger.debug("🎨 TOON format output enabled") + # ------------------------------------------------------------------ + # Credits + # ------------------------------------------------------------------ - # Validate input using Pydantic model - GetAgenticScraperRequest(request_id=request_id) - logger.debug("βœ… Request ID validation passed") + def credits(self) -> Dict[str, Any]: + """Get remaining API credits.""" + logger.info("Fetching credits") + return self._make_request("GET", f"{self.base_url}/credits") - result = self._make_request("GET", f"{API_BASE_URL}/agentic-scrapper/{request_id}") - logger.info(f"✨ Successfully retrieved result for request {request_id}") - return process_response_with_toon(result, return_toon) + # ------------------------------------------------------------------ + # History + # ------------------------------------------------------------------ - def generate_schema( + def history( self, - 
user_prompt: str, - existing_schema: Optional[Dict[str, Any]] = None, - ): - """Generate a JSON schema from a user prompt - - Args: - user_prompt: The user's search query to be refined into a schema - existing_schema: Optional existing JSON schema to modify/extend - """ - logger.info("πŸ”§ Starting schema generation request") - logger.debug(f"πŸ’­ User prompt: {user_prompt}") - if existing_schema: - logger.debug(f"πŸ“‹ Existing schema provided: {existing_schema is not None}") - - request = GenerateSchemaRequest( - user_prompt=user_prompt, - existing_schema=existing_schema, - ) - logger.debug("βœ… Request validation passed") - - result = self._make_request( - "POST", f"{API_BASE_URL}/generate_schema", json=request.model_dump() - ) - logger.info("✨ Schema generation request completed successfully") - return result + endpoint: Optional[str] = None, + status: Optional[str] = None, + limit: Optional[int] = None, + offset: Optional[int] = None, + ) -> Dict[str, Any]: + """Retrieve request history. - def get_schema_status(self, request_id: str): - """Get the status of a schema generation request - Args: - request_id: The request ID returned from generate_schema + endpoint: Filter by endpoint name (e.g. 
'scrape', 'extract') + status: Filter by request status + limit: Maximum number of results (1-100) + offset: Number of results to skip """ - logger.info(f"πŸ” Fetching schema generation status for request {request_id}") - - # Validate input using Pydantic model - GetSchemaStatusRequest(request_id=request_id) - logger.debug("βœ… Request ID validation passed") - - result = self._make_request("GET", f"{API_BASE_URL}/generate_schema/{request_id}") - logger.info(f"✨ Successfully retrieved schema status for request {request_id}") - return result - - def create_scheduled_job( - self, - job_name: str, - service_type: str, - cron_expression: str, - job_config: dict, - is_active: bool = True, - ): - """Create a new scheduled job""" - logger.info(f"πŸ“… Creating scheduled job: {job_name}") - - request = ScheduledJobCreate( - job_name=job_name, - service_type=service_type, - cron_expression=cron_expression, - job_config=job_config, - is_active=is_active, - ) - - result = self._make_request( - "POST", f"{API_BASE_URL}/scheduled-jobs", json=request.model_dump() - ) - logger.info("✨ Scheduled job created successfully") - return result - - def get_scheduled_jobs( - self, - page: int = 1, - page_size: int = 20, - service_type: Optional[str] = None, - is_active: Optional[bool] = None, - ): - """Get list of scheduled jobs with pagination""" - logger.info("πŸ“‹ Fetching scheduled jobs") - - GetScheduledJobsRequest( - page=page, - page_size=page_size, - service_type=service_type, - is_active=is_active, - ) - - params = {"page": page, "page_size": page_size} - if service_type: - params["service_type"] = service_type - if is_active is not None: - params["is_active"] = is_active - - result = self._make_request("GET", f"{API_BASE_URL}/scheduled-jobs", params=params) - logger.info(f"✨ Successfully retrieved {len(result.get('jobs', []))} scheduled jobs") - return result - - def get_scheduled_job(self, job_id: str): - """Get details of a specific scheduled job""" - logger.info(f"πŸ” 
Fetching scheduled job {job_id}") - - GetScheduledJobRequest(job_id=job_id) - - result = self._make_request("GET", f"{API_BASE_URL}/scheduled-jobs/{job_id}") - logger.info(f"✨ Successfully retrieved scheduled job {job_id}") - return result - - def update_scheduled_job( - self, - job_id: str, - job_name: Optional[str] = None, - cron_expression: Optional[str] = None, - job_config: Optional[dict] = None, - is_active: Optional[bool] = None, - ): - """Update an existing scheduled job (partial update)""" - logger.info(f"πŸ“ Updating scheduled job {job_id}") - - update_data = {} - if job_name is not None: - update_data["job_name"] = job_name - if cron_expression is not None: - update_data["cron_expression"] = cron_expression - if job_config is not None: - update_data["job_config"] = job_config - if is_active is not None: - update_data["is_active"] = is_active - - ScheduledJobUpdate(**update_data) - - result = self._make_request( - "PATCH", f"{API_BASE_URL}/scheduled-jobs/{job_id}", json=update_data + logger.info("Fetching history") + filter_obj = HistoryFilter( + endpoint=endpoint, status=status, limit=limit, offset=offset ) - logger.info(f"✨ Successfully updated scheduled job {job_id}") - return result - - def replace_scheduled_job( - self, - job_id: str, - job_name: str, - cron_expression: str, - job_config: dict, - is_active: bool = True, - ): - """Replace an existing scheduled job (full update)""" - logger.info(f"πŸ”„ Replacing scheduled job {job_id}") - - request_data = { - "job_name": job_name, - "cron_expression": cron_expression, - "job_config": job_config, - "is_active": is_active, - } - - result = self._make_request( - "PUT", f"{API_BASE_URL}/scheduled-jobs/{job_id}", json=request_data + params = filter_obj.to_params() + return self._make_request( + "GET", f"{self.base_url}/history", params=params or None ) - logger.info(f"✨ Successfully replaced scheduled job {job_id}") - return result - - def delete_scheduled_job(self, job_id: str): - """Delete a scheduled 
job""" - logger.info(f"πŸ—‘οΈ Deleting scheduled job {job_id}") - - JobActionRequest(job_id=job_id) - - result = self._make_request("DELETE", f"{API_BASE_URL}/scheduled-jobs/{job_id}") - logger.info(f"✨ Successfully deleted scheduled job {job_id}") - return result - - def pause_scheduled_job(self, job_id: str): - """Pause a scheduled job""" - logger.info(f"⏸️ Pausing scheduled job {job_id}") - - JobActionRequest(job_id=job_id) - result = self._make_request("POST", f"{API_BASE_URL}/scheduled-jobs/{job_id}/pause") - logger.info(f"✨ Successfully paused scheduled job {job_id}") - return result - - def resume_scheduled_job(self, job_id: str): - """Resume a paused scheduled job""" - logger.info(f"▢️ Resuming scheduled job {job_id}") - - JobActionRequest(job_id=job_id) - - result = self._make_request("POST", f"{API_BASE_URL}/scheduled-jobs/{job_id}/resume") - logger.info(f"✨ Successfully resumed scheduled job {job_id}") - return result - - def trigger_scheduled_job(self, job_id: str): - """Manually trigger a scheduled job""" - logger.info(f"πŸš€ Manually triggering scheduled job {job_id}") - - TriggerJobRequest(job_id=job_id) - - result = self._make_request("POST", f"{API_BASE_URL}/scheduled-jobs/{job_id}/trigger") - logger.info(f"✨ Successfully triggered scheduled job {job_id}") - return result - - def get_job_executions( - self, - job_id: str, - page: int = 1, - page_size: int = 20, - status: Optional[str] = None, - ): - """Get execution history for a scheduled job""" - logger.info(f"πŸ“Š Fetching execution history for job {job_id}") - - GetJobExecutionsRequest( - job_id=job_id, - page=page, - page_size=page_size, - status=status, - ) - - params = {"page": page, "page_size": page_size} - if status: - params["status"] = status - - result = self._make_request( - "GET", f"{API_BASE_URL}/scheduled-jobs/{job_id}/executions", params=params - ) - logger.info(f"✨ Successfully retrieved execution history for job {job_id}") - return result + # 
------------------------------------------------------------------ + # Lifecycle + # ------------------------------------------------------------------ - def close(self): - """Close the session to free up resources""" - logger.info("πŸ”’ Closing Client session") + def close(self) -> None: + """Close the HTTP session.""" + logger.info("Closing Client session") self.session.close() - logger.debug("βœ… Session closed successfully") - def __enter__(self): + def __enter__(self) -> "Client": return self - def __exit__(self, exc_type, exc_val, exc_tb): + def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: self.close() diff --git a/scrapegraph-py/scrapegraph_py/config.py b/scrapegraph-py/scrapegraph_py/config.py index e7ca178..052780b 100644 --- a/scrapegraph-py/scrapegraph_py/config.py +++ b/scrapegraph-py/scrapegraph_py/config.py @@ -1,15 +1,11 @@ """ -Configuration and constants for the ScrapeGraphAI SDK. - -This module contains API configuration settings including the base URL -and default headers used for all API requests. - -Attributes: - API_BASE_URL (str): Base URL for the ScrapeGraphAI API endpoints - DEFAULT_HEADERS (dict): Default HTTP headers for API requests +Configuration and constants for the ScrapeGraphAI SDK v2. """ -API_BASE_URL = "https://api.scrapegraphai.com/v1" + +VERSION = "2.0.0" +API_BASE_URL = "https://api.scrapegraphai.com/api/v1" DEFAULT_HEADERS = { "accept": "application/json", "Content-Type": "application/json", + "X-SDK-Version": f"python@{VERSION}", } diff --git a/scrapegraph-py/scrapegraph_py/models/__init__.py b/scrapegraph-py/scrapegraph_py/models/__init__.py index 1f374b8..26a4c9f 100644 --- a/scrapegraph-py/scrapegraph_py/models/__init__.py +++ b/scrapegraph-py/scrapegraph_py/models/__init__.py @@ -1,57 +1,35 @@ """ -Pydantic models for all ScrapeGraphAI API endpoints. - -This module provides request and response models for validating and -structuring data for all API operations. 
All models use Pydantic for -data validation and serialization. - -Available Models: - - AgenticScraperRequest, GetAgenticScraperRequest: Agentic scraper operations - - CrawlRequest, GetCrawlRequest: Website crawling operations - - FeedbackRequest: User feedback submission - - ScrapeRequest, GetScrapeRequest: Basic HTML scraping - - MarkdownifyRequest, GetMarkdownifyRequest: Markdown conversion - - SearchScraperRequest, GetSearchScraperRequest: Web research - - SmartScraperRequest, GetSmartScraperRequest: AI-powered scraping - - GenerateSchemaRequest, GetSchemaStatusRequest: Schema generation - - ScheduledJob models: Job scheduling and management - -Example: - >>> from scrapegraph_py.models import SmartScraperRequest - >>> request = SmartScraperRequest( - ... website_url="https://example.com", - ... user_prompt="Extract product info" - ... ) +Pydantic models for the ScrapeGraphAI v2 API. """ -from .agenticscraper import AgenticScraperRequest, GetAgenticScraperRequest -from .crawl import CrawlRequest, GetCrawlRequest -from .feedback import FeedbackRequest -from .scrape import GetScrapeRequest, ScrapeRequest -from .markdownify import GetMarkdownifyRequest, MarkdownifyRequest -from .searchscraper import GetSearchScraperRequest, SearchScraperRequest, TimeRange -from .sitemap import SitemapRequest, SitemapResponse -from .smartscraper import GetSmartScraperRequest, SmartScraperRequest -from .schema import GenerateSchemaRequest, GetSchemaStatusRequest, SchemaGenerationResponse +from .shared import FetchConfig, LlmConfig +from .scrape import ScrapeFormat, ScrapeRequest, GetScrapeRequest +from .extract import ExtractRequest +from .search import SearchRequest +from .schema import SchemaRequest +from .crawl import CrawlFormat, CrawlRequest +from .monitor import MonitorCreateRequest +from .history import HistoryFilter __all__ = [ - "AgenticScraperRequest", - "GetAgenticScraperRequest", - "CrawlRequest", - "GetCrawlRequest", - "FeedbackRequest", - "GetScrapeRequest", + # Shared 
+ "FetchConfig", + "LlmConfig", + # Scrape + "ScrapeFormat", "ScrapeRequest", - "GetMarkdownifyRequest", - "MarkdownifyRequest", - "GetSearchScraperRequest", - "SearchScraperRequest", - "TimeRange", - "SitemapRequest", - "SitemapResponse", - "GetSmartScraperRequest", - "SmartScraperRequest", - "GenerateSchemaRequest", - "GetSchemaStatusRequest", - "SchemaGenerationResponse", + "GetScrapeRequest", + # Extract + "ExtractRequest", + # Search + "SearchRequest", + # Schema + "SchemaRequest", + # Crawl + "CrawlFormat", + "CrawlRequest", + # Monitor + "MonitorCreateRequest", + # History + "HistoryFilter", ] diff --git a/scrapegraph-py/scrapegraph_py/models/agenticscraper.py b/scrapegraph-py/scrapegraph_py/models/agenticscraper.py deleted file mode 100644 index 93b6234..0000000 --- a/scrapegraph-py/scrapegraph_py/models/agenticscraper.py +++ /dev/null @@ -1,148 +0,0 @@ -""" -Pydantic models for the Agentic Scraper API endpoint. - -This module defines request and response models for the Agentic Scraper endpoint, -which performs automated browser interactions and optional AI data extraction. - -The Agentic Scraper can: -- Execute a sequence of browser actions (click, type, scroll, etc.) -- Handle authentication flows and form submissions -- Optionally extract structured data using AI after interactions -- Maintain browser sessions across multiple steps -""" - -from typing import Any, Dict, List, Optional -from uuid import UUID - -from pydantic import BaseModel, Field, model_validator - - -class AgenticScraperRequest(BaseModel): - """ - Request model for the Agentic Scraper endpoint. - - This model validates and structures requests for automated browser - interactions with optional AI extraction. 
- - Attributes: - url: The starting URL for the scraping session - use_session: Whether to maintain browser session across steps - steps: List of actions to perform (e.g., "Type email@example.com in email input") - user_prompt: Optional prompt for AI extraction (required if ai_extraction=True) - output_schema: Optional schema for structured data extraction - ai_extraction: Whether to use AI for data extraction after interactions - headers: Optional HTTP headers - mock: Whether to use mock mode for testing - render_heavy_js: Whether to render heavy JavaScript - - Example: - >>> request = AgenticScraperRequest( - ... url="https://dashboard.example.com", - ... steps=[ - ... "Type user@example.com in email input", - ... "Type password123 in password input", - ... "Click login button" - ... ], - ... ai_extraction=True, - ... user_prompt="Extract user dashboard information" - ... ) - """ - url: str = Field( - ..., - example="https://dashboard.scrapegraphai.com/", - description="The URL to scrape" - ) - use_session: bool = Field( - default=True, - description="Whether to use session for the scraping" - ) - steps: List[str] = Field( - ..., - example=[ - "Type email@gmail.com in email input box", - "Type test-password@123 in password inputbox", - "click on login" - ], - description="List of steps to perform on the webpage" - ) - user_prompt: Optional[str] = Field( - default=None, - example="Extract user information and available dashboard sections", - description="Prompt for AI extraction (only used when ai_extraction=True)" - ) - output_schema: Optional[Dict[str, Any]] = Field( - default=None, - example={ - "user_info": { - "type": "object", - "properties": { - "username": {"type": "string"}, - "email": {"type": "string"}, - "dashboard_sections": {"type": "array", "items": {"type": "string"}} - } - } - }, - description="Schema for structured data extraction (only used when ai_extraction=True)" - ) - ai_extraction: bool = Field( - default=False, - description="Whether to 
use AI for data extraction from the scraped content" - ) - headers: Optional[dict[str, str]] = Field( - None, - example={ - "User-Agent": "scrapegraph-py", - "Cookie": "cookie1=value1; cookie2=value2", - }, - description="Optional headers to send with the request, including cookies " - "and user agent", - ) - mock: bool = Field(default=False, description="Whether to use mock mode for the request") - render_heavy_js: bool = Field(default=False, description="Whether to render heavy JavaScript on the page") - stealth: bool = Field(default=False, description="Enable stealth mode to avoid bot detection") - - @model_validator(mode="after") - def validate_url(self) -> "AgenticScraperRequest": - if not self.url.strip(): - raise ValueError("URL cannot be empty") - if not ( - self.url.startswith("http://") - or self.url.startswith("https://") - ): - raise ValueError("Invalid URL - must start with http:// or https://") - return self - - @model_validator(mode="after") - def validate_steps(self) -> "AgenticScraperRequest": - if not self.steps: - raise ValueError("Steps cannot be empty") - if any(not step.strip() for step in self.steps): - raise ValueError("All steps must contain valid instructions") - return self - - @model_validator(mode="after") - def validate_ai_extraction(self) -> "AgenticScraperRequest": - if self.ai_extraction: - if not self.user_prompt or not self.user_prompt.strip(): - raise ValueError("user_prompt is required when ai_extraction=True") - return self - - def model_dump(self, *args, **kwargs) -> dict: - # Set exclude_none=True to exclude None values from serialization - kwargs.setdefault("exclude_none", True) - return super().model_dump(*args, **kwargs) - - -class GetAgenticScraperRequest(BaseModel): - """Request model for get_agenticscraper endpoint""" - - request_id: str = Field(..., example="123e4567-e89b-12d3-a456-426614174000") - - @model_validator(mode="after") - def validate_request_id(self) -> "GetAgenticScraperRequest": - try: - # Validate the 
request_id is a valid UUID
-            UUID(self.request_id)
-        except ValueError:
-            raise ValueError("request_id must be a valid UUID")
-        return self
diff --git a/scrapegraph-py/scrapegraph_py/models/crawl.py b/scrapegraph-py/scrapegraph_py/models/crawl.py
index dd6cca9..1b7458c 100644
--- a/scrapegraph-py/scrapegraph_py/models/crawl.py
+++ b/scrapegraph-py/scrapegraph_py/models/crawl.py
@@ -1,219 +1,72 @@
-# Models for crawl endpoint
+"""
+Pydantic models for the v2 Crawl endpoints.
 
-from typing import Any, Dict, Optional
-from uuid import UUID
+POST /v2/crawl - Start a crawl job
+GET /v2/crawl/:id - Get crawl status/results
+POST /v2/crawl/:id/stop - Stop a running crawl
+POST /v2/crawl/:id/resume - Resume a stopped crawl
+"""
+
+from enum import Enum
+from typing import Any, Dict, List, Optional
 
 from pydantic import BaseModel, Field, conint, model_validator
 
+from .shared import FetchConfig
 
-class CrawlRequest(BaseModel):
-    """
-    Request model for the crawl endpoint.
 
-    The crawl endpoint supports two modes:
-    1. AI Extraction Mode (extraction_mode=True): Uses AI to extract structured data
-    2. Markdown Conversion Mode (extraction_mode=False): Converts pages to markdown (80% cheaper)
+class CrawlFormat(str, Enum):
+    """Formats in which crawled pages can be returned."""
 
-    Sitemap Support:
-    - When sitemap=True, the crawler uses sitemap.xml for better page discovery
-    - Recommended for structured websites (e-commerce, news sites, blogs)
-    - Provides more comprehensive crawling coverage
-    - Works with both AI extraction and markdown conversion modes
+    MARKDOWN = "markdown"
+    HTML = "html"
 
-    Path Filtering:
-    - include_paths: Specify which paths to crawl (e.g., ['/products/*', '/blog/**'])
-    - exclude_paths: Specify which paths to skip (e.g., ['/admin/*', '/api/*'])
-    - Supports wildcards: * (any characters), ** (any path segments)
-    - exclude_paths takes precedence over include_paths
-    """
-    url: str = Field(
-        ...,
-        example="https://scrapegraphai.com/",
-        description="The starting URL for the crawl",
-    )
-    extraction_mode: bool = Field(
-        default=True,
-        description="True for AI extraction mode, False for markdown conversion "
-        "mode (no AI/LLM processing)",
-    )
-    prompt: Optional[str] = Field(
-        default=None,
-        example="What does the company do? and I need text content from there "
-        "privacy and terms",
-        description="The prompt to guide the crawl and extraction (required when "
-        "extraction_mode=True)",
-    )
-    data_schema: Optional[Dict[str, Any]] = Field(
-        default=None,
-        description="JSON schema defining the structure of the extracted data "
-        "(required when extraction_mode=True)",
-    )
-    cache_website: bool = Field(
-        default=True, description="Whether to cache the website content"
-    )
+
+class CrawlRequest(BaseModel):
+    """Body of the POST /v2/crawl request that starts a crawl job.
+
+    ``depth`` is limited to 1-10 and ``max_pages`` to 1-100. Optional fields
+    left as ``None`` are omitted by :meth:`model_dump`.
+    """
+
+    url: str = Field(..., description="The starting URL for the crawl")
     depth: conint(ge=1, le=10) = Field(
-        default=2, description="Maximum depth of the crawl (1-10)"
-    )
-    breadth: Optional[conint(ge=1)] = Field(
-        default=None,
-        description="Maximum number of links to crawl per depth level. "
-        "If None, unlimited (default). Controls the 'width' of exploration at each depth. "
-        "Useful for limiting crawl scope on large sites. Note: max_pages always takes priority - "
-        "the total crawled pages will never exceed max_pages regardless of breadth setting. "
-        "Ignored when sitemap=True (sitemap mode uses sitemap URLs directly instead of link discovery).",
+        default=2, description="Maximum crawl depth (1-10)"
     )
     max_pages: conint(ge=1, le=100) = Field(
-        default=2, description="Maximum number of pages to crawl (1-100)"
-    )
-    same_domain_only: bool = Field(
-        default=True, description="Whether to only crawl pages from the same domain"
-    )
-    batch_size: Optional[conint(ge=1, le=10)] = Field(
-        default=None, description="Batch size for processing pages (1-10)"
-    )
-    sitemap: bool = Field(
-        default=False,
-        description="Whether to use sitemap.xml for better page discovery and more comprehensive crawling. "
-        "When enabled, the crawler will use the website's sitemap.xml to discover pages more efficiently, "
-        "providing better coverage for structured websites like e-commerce sites, news portals, and content-heavy websites."
+        default=10, description="Maximum number of pages to crawl (1-100)"
     )
-    headers: Optional[dict[str, str]] = Field(
-        None,
-        example={
-            "User-Agent": "scrapegraph-py",
-            "Cookie": "cookie1=value1; cookie2=value2",
-        },
-        description="Optional headers to send with the request, including cookies "
-        "and user agent",
+    format: CrawlFormat = Field(
+        default=CrawlFormat.MARKDOWN,
+        description="Output format: markdown or html",
     )
-    render_heavy_js: bool = Field(default=False, description="Whether to render heavy JavaScript on the page")
-    stealth: bool = Field(default=False, description="Enable stealth mode to avoid bot detection")
-    include_paths: Optional[list[str]] = Field(
+    include_patterns: Optional[List[str]] = Field(
         default=None,
-        description="List of path patterns to include (e.g., ['/products/*', '/blog/**']). "
-        "Supports wildcards: * matches any characters, ** matches any path segments. "
-        "If empty, all paths are included.",
-        example=["/products/*", "/blog/**"]
+        description="URL patterns to include (e.g. ['/products/*', '/blog/**'])",
     )
-    exclude_paths: Optional[list[str]] = Field(
+    exclude_patterns: Optional[List[str]] = Field(
         default=None,
-        description="List of path patterns to exclude (e.g., ['/admin/*', '/api/*']). "
-        "Supports wildcards: * matches any characters, ** matches any path segments. "
-        "Takes precedence over include_paths.",
-        example=["/admin/*", "/api/**"]
+        description="URL patterns to exclude (e.g. ['/admin/*', '/api/*'])",
     )
-    webhook_url: Optional[str] = Field(
-        default=None,
-        description="URL to receive webhook notifications when the crawl job completes. "
-        "The webhook will receive a POST request with the crawl results.",
-        example="https://example.com/webhook"
-    )
-    wait_ms: Optional[int] = Field(
-        default=None,
-        description="Number of milliseconds to wait before scraping each page. "
-        "Useful for pages with heavy JavaScript rendering that need extra time to load.",
+    fetch_config: Optional[FetchConfig] = Field(
+        default=None, description="Fetch configuration options"
     )
 
     @model_validator(mode="after")
     def validate_url(self) -> "CrawlRequest":
-        if not self.url.strip():
+        """Ensure ``url`` is non-blank and uses the http or https scheme.
+
+        Raises:
+            ValueError: If ``url`` is blank or starts with anything else.
+        """
+        if not self.url or not self.url.strip():
             raise ValueError("URL cannot be empty")
-        if not (self.url.startswith("http://") or self.url.startswith("https://")):
-            raise ValueError("Invalid URL - must start with http:// or https://")
-        return self
-
-    @model_validator(mode="after")
-    def validate_extraction_mode_requirements(self) -> "CrawlRequest":
-        """Validate requirements based on extraction mode"""
-        if self.extraction_mode:
-            # AI extraction mode - require prompt and data_schema
-            if not self.prompt:
-                raise ValueError("Prompt is required when extraction_mode=True")
-            if not self.prompt.strip():
-                raise ValueError("Prompt cannot be empty")
-            if not any(c.isalnum() for c in self.prompt):
-                raise ValueError("Prompt must contain valid content")
-
-            if not self.data_schema:
-                raise ValueError("Data schema is required when extraction_mode=True")
-            if not isinstance(self.data_schema, dict):
-                raise ValueError("Data schema must be a dictionary")
-            if not self.data_schema:
-                raise ValueError("Data schema cannot be empty")
-        else:
-            # Markdown conversion mode - prompt and data_schema should be None
-            if self.prompt is not None:
-                raise ValueError(
-                    "Prompt should not be provided when extraction_mode=False "
-                    "(markdown mode)"
-                )
-            if self.data_schema is not None:
-                raise ValueError(
-                    "Data schema should not be provided when extraction_mode=False "
-                    "(markdown mode)"
-                )
-
-        return self
-
-    @model_validator(mode="after")
-    def validate_batch_size(self) -> "CrawlRequest":
-        if self.batch_size is not None and (
-            self.batch_size < 1 or self.batch_size > 10
-        ):
-            raise ValueError("Batch size must be between 1 and 10")
-        return self
-
-    @model_validator(mode="after")
-    def validate_sitemap_usage(self) -> "CrawlRequest":
-        """Validate sitemap usage and provide recommendations"""
-        if self.sitemap:
-            # Log recommendation for sitemap usage
-            if self.max_pages < 5:
-                # This is just a recommendation, not an error
-                pass  # Could add logging here if needed
-        return self
-
-    @model_validator(mode="after")
-    def validate_path_patterns(self) -> "CrawlRequest":
-        """Validate path patterns start with '/'"""
-        if self.include_paths:
-            for path in self.include_paths:
-                if not path.startswith("/"):
-                    raise ValueError(f"Include path must start with '/': {path}")
-
-        if self.exclude_paths:
-            for path in self.exclude_paths:
-                if not path.startswith("/"):
-                    raise ValueError(f"Exclude path must start with '/': {path}")
-
-        return self
-
-    @model_validator(mode="after")
-    def validate_webhook_url(self) -> "CrawlRequest":
-        """Validate webhook URL format if provided"""
-        if self.webhook_url is not None:
-            if not self.webhook_url.strip():
-                raise ValueError("Webhook URL cannot be empty")
-            if not (
-                self.webhook_url.startswith("http://")
-                or self.webhook_url.startswith("https://")
-            ):
-                raise ValueError(
-                    "Invalid webhook URL - must start with http:// or https://"
-                )
+        # One startswith call with a tuple checks both accepted schemes.
+        if not self.url.startswith(("http://", "https://")):
+            raise ValueError("URL must start with http:// or https://")
         return self
 
-
-class GetCrawlRequest(BaseModel):
-    """Request model for get_crawl endpoint"""
-
-    crawl_id: str = Field(..., example="123e4567-e89b-12d3-a456-426614174000")
-
-    @model_validator(mode="after")
-    def validate_crawl_id(self) -> "GetCrawlRequest":
-        try:
-            # Validate the crawl_id is a valid UUID
-            UUID(self.crawl_id)
-        except ValueError:
-            raise ValueError("crawl_id must be a valid UUID")
-        return self
+    def model_dump(self, *args: Any, **kwargs: Any) -> Dict[str, Any]:
+        """Dump the model, omitting ``None`` fields unless the caller overrides it."""
+        kwargs.setdefault("exclude_none", True)
+        return super().model_dump(*args, **kwargs)
diff --git a/scrapegraph-py/scrapegraph_py/models/extract.py b/scrapegraph-py/scrapegraph_py/models/extract.py
new file mode 100644
index 0000000..3b3381b
---
/dev/null
+++ b/scrapegraph-py/scrapegraph_py/models/extract.py
@@ -0,0 +1,63 @@
+"""
+Pydantic models for the v2 Extract endpoint.
+
+POST /v2/extract - AI-powered data extraction (replaces SmartScraper).
+"""
+
+from typing import Any, Dict, Optional
+
+from pydantic import BaseModel, Field, model_validator
+
+from .shared import FetchConfig, LlmConfig
+
+
+class ExtractRequest(BaseModel):
+    """Request model for POST /v2/extract.
+
+    ``url`` and ``prompt`` are required and must not be blank; ``url`` must
+    start with ``http://`` or ``https://``. Optional fields left as ``None``
+    are omitted when the model is dumped.
+    """
+
+    url: str = Field(..., description="URL of the page to extract data from")
+    prompt: str = Field(..., description="Natural language prompt describing what to extract")
+    output_schema: Optional[Dict[str, Any]] = Field(
+        default=None,
+        description="JSON Schema defining the structure of the extracted data",
+    )
+    fetch_config: Optional[FetchConfig] = Field(
+        default=None, description="Fetch configuration options"
+    )
+    llm_config: Optional[LlmConfig] = Field(
+        default=None, description="LLM configuration options"
+    )
+
+    @model_validator(mode="after")
+    def validate_fields(self) -> "ExtractRequest":
+        """Check ``url`` and ``prompt`` once the fields have been parsed.
+
+        Raises:
+            ValueError: If ``url`` is blank, ``url`` does not start with
+                ``http://`` or ``https://``, or ``prompt`` is blank.
+        """
+        if not self.url or not self.url.strip():
+            raise ValueError("URL cannot be empty")
+        # str.startswith accepts a tuple, so one call covers both schemes.
+        if not self.url.startswith(("http://", "https://")):
+            raise ValueError("URL must start with http:// or https://")
+        if not self.prompt or not self.prompt.strip():
+            raise ValueError("Prompt cannot be empty")
+        return self
+
+    def to_api_payload(self) -> Dict[str, Any]:
+        """Return the request body as a dict with ``None`` fields left out.
+
+        ``output_schema`` is declared as a plain dict, so no conversion is
+        applied here; the result equals ``model_dump(exclude_none=True)``.
+        """
+        return self.model_dump(exclude_none=True)
+
+    def model_dump(self, *args: Any, **kwargs: Any) -> Dict[str, Any]:
+        """Dump the model, omitting ``None`` fields unless the caller overrides it."""
+        kwargs.setdefault("exclude_none", True)
+        return super().model_dump(*args, **kwargs)
diff --git a/scrapegraph-py/scrapegraph_py/models/feedback.py b/scrapegraph-py/scrapegraph_py/models/feedback.py
deleted file mode 100644
index 43c41ec..0000000
--- a/scrapegraph-py/scrapegraph_py/models/feedback.py
+++ /dev/null
@@ -1,32 +0,0 @@
-"""
-Pydantic models for the
Feedback API endpoint.
-
-This module defines request models for submitting user feedback about
-API requests, helping improve the service quality.
-"""
-
-from typing import Optional
-from uuid import UUID
-
-from pydantic import BaseModel, Field, model_validator
-
-
-class FeedbackRequest(BaseModel):
-    """Request model for feedback endpoint"""
-
-    request_id: str = Field(..., example="123e4567-e89b-12d3-a456-426614174000")
-    rating: int = Field(..., ge=1, le=5, example=5)
-    feedback_text: Optional[str] = Field(None, example="Great results!")
-
-    @model_validator(mode="after")
-    def validate_request_id(self) -> "FeedbackRequest":
-        try:
-            UUID(self.request_id)
-        except ValueError:
-            raise ValueError("request_id must be a valid UUID")
-        return self
-
-    def model_dump(self, *args, **kwargs) -> dict:
-        # Set exclude_none=True to exclude None values from serialization
-        kwargs.setdefault("exclude_none", True)
-        return super().model_dump(*args, **kwargs)
diff --git a/scrapegraph-py/scrapegraph_py/models/history.py b/scrapegraph-py/scrapegraph_py/models/history.py
new file mode 100644
index 0000000..164b25f
--- /dev/null
+++ b/scrapegraph-py/scrapegraph_py/models/history.py
@@ -0,0 +1,36 @@
+"""
+Pydantic models for the v2 History endpoint.
+
+GET /v2/history - Retrieve request history with optional filters.
+"""
+
+from typing import Any, Dict, Optional
+
+from pydantic import BaseModel, Field
+
+
+class HistoryFilter(BaseModel):
+    """Optional filters sent as query parameters to GET /v2/history.
+
+    Every filter defaults to ``None``; ``limit`` is bounded to 1-100 and
+    ``offset`` must be zero or greater.
+    """
+
+    endpoint: Optional[str] = Field(
+        default=None, description="Filter by endpoint name (e.g. 'scrape', 'extract')"
+    )
+    status: Optional[str] = Field(
+        default=None, description="Filter by request status"
+    )
+    limit: Optional[int] = Field(
+        default=None, ge=1, le=100, description="Maximum number of results (1-100)"
+    )
+    offset: Optional[int] = Field(
+        default=None, ge=0, description="Number of results to skip"
+    )
+
+    def to_params(self) -> Dict[str, Any]:
+        """Return the filters that are set as a query-parameter dict."""
+        # exclude_none drops exactly the keys whose value is None, which is
+        # what filtering the full dump by hand does for these flat fields.
+        return self.model_dump(exclude_none=True)
diff --git a/scrapegraph-py/scrapegraph_py/models/markdownify.py b/scrapegraph-py/scrapegraph_py/models/markdownify.py
deleted file mode 100644
index 0b95903..0000000
--- a/scrapegraph-py/scrapegraph_py/models/markdownify.py
+++ /dev/null
@@ -1,80 +0,0 @@
-"""
-Pydantic models for the Markdownify API endpoint.
-
-This module defines request and response models for the Markdownify endpoint,
-which converts web pages into clean markdown format.
-
-The Markdownify endpoint is useful for:
-- Converting HTML to markdown for easier processing
-- Extracting clean text content from websites
-- Preparing content for LLM consumption
-"""
-
-from typing import Optional
-from uuid import UUID
-
-from pydantic import BaseModel, Field, model_validator
-
-
-class MarkdownifyRequest(BaseModel):
-    """
-    Request model for the Markdownify endpoint.
-
-    This model validates and structures requests for converting web pages
-    to markdown format.
- - Attributes: - website_url: URL of the website to convert to markdown - headers: Optional HTTP headers including cookies - mock: Whether to use mock mode for testing - render_heavy_js: Whether to render heavy JavaScript on the page - stealth: Enable stealth mode to avoid bot detection - - Example: - >>> request = MarkdownifyRequest(website_url="https://example.com") - """ - website_url: str = Field(..., example="https://scrapegraphai.com/") - headers: Optional[dict[str, str]] = Field( - None, - example={ - "User-Agent": "scrapegraph-py", - "Cookie": "cookie1=value1; cookie2=value2", - }, - description="Optional headers to send with the request, including cookies " - "and user agent", - ) - mock: bool = Field(default=False, description="Whether to use mock mode for the request") - render_heavy_js: bool = Field(default=False, description="Whether to render heavy JavaScript on the page") - stealth: bool = Field(default=False, description="Enable stealth mode to avoid bot detection") - wait_ms: Optional[int] = Field(default=None, description="The number of milliseconds to wait before scraping the website") - - @model_validator(mode="after") - def validate_url(self) -> "MarkdownifyRequest": - if self.website_url is None or not self.website_url.strip(): - raise ValueError("Website URL cannot be empty") - if not ( - self.website_url.startswith("http://") - or self.website_url.startswith("https://") - ): - raise ValueError("Invalid URL") - return self - - def model_dump(self, *args, **kwargs) -> dict: - # Set exclude_none=True to exclude None values from serialization - kwargs.setdefault("exclude_none", True) - return super().model_dump(*args, **kwargs) - - -class GetMarkdownifyRequest(BaseModel): - """Request model for get_markdownify endpoint""" - - request_id: str = Field(..., example="123e4567-e89b-12d3-a456-426614174000") - - @model_validator(mode="after") - def validate_request_id(self) -> "GetMarkdownifyRequest": - try: - # Validate the request_id is a valid UUID 
-            UUID(self.request_id)
-        except ValueError:
-            raise ValueError("request_id must be a valid UUID")
-        return self
diff --git a/scrapegraph-py/scrapegraph_py/models/monitor.py b/scrapegraph-py/scrapegraph_py/models/monitor.py
new file mode 100644
index 0000000..809c7d6
--- /dev/null
+++ b/scrapegraph-py/scrapegraph_py/models/monitor.py
@@ -0,0 +1,67 @@
+"""
+Pydantic models for the v2 Monitor endpoints.
+
+POST /v2/monitor - Create a monitor
+GET /v2/monitor - List monitors
+GET /v2/monitor/:id - Get a monitor
+POST /v2/monitor/:id/pause - Pause a monitor
+POST /v2/monitor/:id/resume - Resume a monitor
+DELETE /v2/monitor/:id - Delete a monitor
+"""
+
+from typing import Any, Dict, Optional
+
+from pydantic import BaseModel, Field, model_validator
+
+from .shared import FetchConfig, LlmConfig
+
+
+class MonitorCreateRequest(BaseModel):
+    """Body of the POST /v2/monitor request that creates a monitor.
+
+    ``name``, ``url``, ``prompt`` and ``cron`` are required. Optional fields
+    left as ``None`` are omitted when the model is dumped.
+    """
+
+    name: str = Field(..., description="Name of the monitor")
+    url: str = Field(..., description="URL to monitor")
+    prompt: str = Field(..., description="Prompt for AI extraction")
+    cron: str = Field(..., description="Cron expression for scheduling (5 fields)")
+    output_schema: Optional[Dict[str, Any]] = Field(
+        default=None,
+        description="JSON Schema defining the structure of extracted data",
+    )
+    fetch_config: Optional[FetchConfig] = Field(
+        default=None, description="Fetch configuration options"
+    )
+    llm_config: Optional[LlmConfig] = Field(
+        default=None, description="LLM configuration options"
+    )
+
+    @model_validator(mode="after")
+    def validate_fields(self) -> "MonitorCreateRequest":
+        """Check the required text fields and the shape of ``cron``.
+
+        Raises:
+            ValueError: If ``name``, ``url`` or ``prompt`` is blank, ``url``
+                does not start with ``http://`` or ``https://``, or ``cron``
+                does not consist of exactly 5 whitespace-separated fields.
+        """
+        if not self.name or not self.name.strip():
+            raise ValueError("Name cannot be empty")
+        if not self.url or not self.url.strip():
+            raise ValueError("URL cannot be empty")
+        if not self.url.startswith(("http://", "https://")):
+            raise ValueError("URL must start with http:// or https://")
+        if not self.prompt or not self.prompt.strip():
+            raise ValueError("Prompt cannot be empty")
+        # split() without arguments already ignores leading/trailing whitespace.
+        field_count = len(self.cron.split())
+        if field_count != 5:
+            raise ValueError("Cron expression must have exactly 5 fields")
+        return self
+
+    def model_dump(self, *args: Any, **kwargs: Any) -> Dict[str, Any]:
+        """Dump the model, omitting ``None`` fields unless the caller overrides it."""
+        kwargs.setdefault("exclude_none", True)
+        return super().model_dump(*args, **kwargs)
diff --git a/scrapegraph-py/scrapegraph_py/models/scheduled_jobs.py b/scrapegraph-py/scrapegraph_py/models/scheduled_jobs.py
deleted file mode 100644
index 46e83d6..0000000
--- a/scrapegraph-py/scrapegraph_py/models/scheduled_jobs.py
+++ /dev/null
@@ -1,151 +0,0 @@
-"""
-Pydantic models for the Scheduled Jobs API endpoints.
-
-This module defines request and response models for managing scheduled jobs,
-which allow you to automate recurring scraping tasks using cron expressions.
-
-Scheduled Jobs support:
-- Creating recurring scraping jobs
-- Managing job lifecycle (pause, resume, delete)
-- Manually triggering jobs on demand
-- Viewing execution history
-- Filtering and pagination
-"""
-
-from typing import Any, Dict, Optional
-from enum import Enum
-from pydantic import BaseModel, Field, model_validator
-
-
-class ServiceType(str, Enum):
-    """
-    Enum defining available service types for scheduled jobs.
- - Available services: - SMART_SCRAPER: AI-powered web scraping - SEARCH_SCRAPER: Web research across multiple sources - AGENTIC_SCRAPER: Automated browser interactions - """ - SMART_SCRAPER = "smartscraper" - SEARCH_SCRAPER = "searchscraper" - AGENTIC_SCRAPER = "agenticscraper" - - -class ScheduledJobCreate(BaseModel): - """Model for creating a new scheduled job""" - job_name: str = Field(..., min_length=1, description="Name of the scheduled job") - service_type: str = Field(..., description="Type of service (smartscraper, searchscraper, etc.)") - cron_expression: str = Field(..., description="Cron expression for scheduling") - job_config: Dict[str, Any] = Field( - ..., - example={ - "website_url": "https://example.com", - "user_prompt": "Extract company information", - "headers": { - "User-Agent": "scrapegraph-py", - "Cookie": "session=abc123" - } - }, - description="Configuration for the job" - ) - is_active: bool = Field(default=True, description="Whether the job is active") - - @model_validator(mode="after") - def validate_cron_expression(self) -> "ScheduledJobCreate": - parts = self.cron_expression.strip().split() - if len(parts) != 5: - raise ValueError("Cron expression must have exactly 5 fields") - return self - - -class ScheduledJobUpdate(BaseModel): - """Model for updating a scheduled job (partial update)""" - job_name: Optional[str] = Field(None, description="Name of the scheduled job") - cron_expression: Optional[str] = Field(None, description="Cron expression for scheduling") - job_config: Optional[Dict[str, Any]] = Field(None, description="Configuration for the job") - is_active: Optional[bool] = Field(None, description="Whether the job is active") - - -class GetScheduledJobsRequest(BaseModel): - """Model for getting list of scheduled jobs""" - page: int = Field(default=1, ge=1, description="Page number") - page_size: int = Field(default=20, ge=1, le=100, description="Number of jobs per page") - service_type: Optional[str] = Field(None, 
description="Filter by service type") - is_active: Optional[bool] = Field(None, description="Filter by active status") - - -class GetScheduledJobRequest(BaseModel): - """Model for getting a specific scheduled job""" - job_id: str = Field(..., description="ID of the scheduled job") - - -class JobActionRequest(BaseModel): - """Model for job actions (pause, resume, delete)""" - job_id: str = Field(..., description="ID of the scheduled job") - - -class TriggerJobRequest(BaseModel): - """Model for manually triggering a job""" - job_id: str = Field(..., description="ID of the scheduled job") - - -class GetJobExecutionsRequest(BaseModel): - """Model for getting job execution history""" - job_id: str = Field(..., description="ID of the scheduled job") - page: int = Field(default=1, ge=1, description="Page number") - page_size: int = Field(default=20, ge=1, le=100, description="Number of executions per page") - status: Optional[str] = Field(None, description="Filter by execution status") - - -class JobActionResponse(BaseModel): - """Response model for job actions""" - success: bool = Field(..., description="Whether the action was successful") - message: str = Field(..., description="Response message") - job_id: str = Field(..., description="ID of the scheduled job") - - -class JobExecutionListResponse(BaseModel): - """Response model for job execution list""" - executions: list = Field(..., description="List of job executions") - total_count: int = Field(..., description="Total number of executions") - page: int = Field(..., description="Current page number") - page_size: int = Field(..., description="Number of executions per page") - - -class JobTriggerResponse(BaseModel): - """Response model for job trigger""" - success: bool = Field(..., description="Whether the job was triggered successfully") - message: str = Field(..., description="Response message") - job_id: str = Field(..., description="ID of the scheduled job") - execution_id: Optional[str] = Field(None, 
description="ID of the triggered execution") - - -class ScheduledJobListResponse(BaseModel): - """Response model for scheduled job list""" - jobs: list = Field(..., description="List of scheduled jobs") - total_count: int = Field(..., description="Total number of jobs") - page: int = Field(..., description="Current page number") - page_size: int = Field(..., description="Number of jobs per page") - - -class JobExecutionResponse(BaseModel): - """Response model for a single job execution""" - execution_id: str = Field(..., description="ID of the job execution") - job_id: str = Field(..., description="ID of the scheduled job") - status: str = Field(..., description="Execution status") - started_at: Optional[str] = Field(None, description="Execution start timestamp") - completed_at: Optional[str] = Field(None, description="Execution completion timestamp") - result: Optional[Dict[str, Any]] = Field(None, description="Execution result data") - error_message: Optional[str] = Field(None, description="Error message if execution failed") - - -class ScheduledJobResponse(BaseModel): - """Response model for a single scheduled job""" - job_id: str = Field(..., description="ID of the scheduled job") - job_name: str = Field(..., description="Name of the scheduled job") - service_type: str = Field(..., description="Type of service") - cron_expression: str = Field(..., description="Cron expression for scheduling") - job_config: Dict[str, Any] = Field(..., description="Configuration for the job") - is_active: bool = Field(..., description="Whether the job is active") - created_at: Optional[str] = Field(None, description="Job creation timestamp") - updated_at: Optional[str] = Field(None, description="Job last update timestamp") \ No newline at end of file diff --git a/scrapegraph-py/scrapegraph_py/models/schema.py b/scrapegraph-py/scrapegraph_py/models/schema.py index d747f4b..81083b3 100644 --- a/scrapegraph-py/scrapegraph_py/models/schema.py +++ 
b/scrapegraph-py/scrapegraph_py/models/schema.py
@@ -1,117 +1,40 @@
 """
-Pydantic models for the Schema Generation API endpoint.
+Pydantic models for the v2 Schema endpoint.
 
-This module defines request and response models for the Schema Generation endpoint,
-which uses AI to generate or refine JSON schemas based on user prompts.
-
-The Schema Generation endpoint can:
-- Generate new schemas from natural language descriptions
-- Refine and extend existing schemas
-- Create structured data models for web scraping
+POST /v2/schema - AI-powered JSON schema generation.
 """
 
 from typing import Any, Dict, Optional
-from uuid import UUID
 
 from pydantic import BaseModel, Field, model_validator
 
 
-class GenerateSchemaRequest(BaseModel):
-    """Request model for generate_schema endpoint"""
-
-    user_prompt: str = Field(
-        ...,
-        example="Find laptops with specifications like brand, processor, RAM, storage, and price",
-        description="The user's search query to be refined into a schema"
+class SchemaRequest(BaseModel):
+    """Body of the POST /v2/schema request.
+
+    ``prompt`` is required and must not be blank; ``existing_schema`` is
+    omitted from the dump while it is ``None``.
+    """
+
+    prompt: str = Field(
+        ..., description="Natural language description of the schema to generate"
     )
     existing_schema: Optional[Dict[str, Any]] = Field(
-        default=None,
-        example={
-            "$defs": {
-                "ProductSchema": {
-                    "title": "ProductSchema",
-                    "type": "object",
-                    "properties": {
-                        "name": {"title": "Name", "type": "string"},
-                        "price": {"title": "Price", "type": "number"},
-                    },
-                    "required": ["name", "price"],
-                }
-            }
-        },
-        description="Optional existing JSON schema to modify/extend"
-    )
-
-    @model_validator(mode="after")
-    def validate_user_prompt(self) -> "GenerateSchemaRequest":
-        if not self.user_prompt or not self.user_prompt.strip():
-            raise ValueError("user_prompt cannot be empty")
-        self.user_prompt = self.user_prompt.strip()
-        return self
-
-    def model_dump(self, *args, **kwargs) -> dict:
-        kwargs.setdefault("exclude_none", True)
-        return super().model_dump(*args, **kwargs)
-
-
-class GetSchemaStatusRequest(BaseModel):
-    """Request model for get_schema_status endpoint"""
-
-    request_id: str = Field(
-        ...,
-        example="123e4567-e89b-12d3-a456-426614174000",
-        description="The request ID returned from generate_schema"
+        default=None, description="Existing JSON schema to modify or extend"
     )
 
     @model_validator(mode="after")
-    def validate_request_id(self) -> "GetSchemaStatusRequest":
-        self.request_id = self.request_id.strip()
-        try:
-            # Validate the request_id is a valid UUID
-            UUID(self.request_id)
-        except ValueError:
-            raise ValueError("request_id must be a valid UUID")
+    def validate_prompt(self) -> "SchemaRequest":
+        """Reject a blank ``prompt``.
+
+        Raises:
+            ValueError: If ``prompt`` is empty or whitespace only.
+        """
+        if not (self.prompt and self.prompt.strip()):
+            raise ValueError("Prompt cannot be empty")
         return self
 
-
-class SchemaGenerationResponse(BaseModel):
-    """Response model for schema generation endpoints"""
-
-    request_id: str = Field(
-        ...,
-        description="Unique identifier for the schema generation request"
-    )
-    status: str = Field(
-        ...,
-        example="completed",
-        description="Status of the schema generation (pending, processing, completed, failed)"
-    )
-    user_prompt: str = Field(
-        ...,
-        description="The original user prompt that was processed"
-    )
-    refined_prompt: Optional[str] = Field(
-        default=None,
-        description="AI-refined version of the user prompt"
-    )
-    generated_schema: Optional[Dict[str, Any]] = Field(
-        default=None,
-        description="The generated JSON schema"
-    )
-    error: Optional[str] = Field(
-        default=None,
-        description="Error message if the request failed"
-    )
-    created_at: Optional[str] = Field(
-        default=None,
-        description="Timestamp when the request was created"
-    )
-    updated_at: Optional[str] = Field(
-        default=None,
-        description="Timestamp when the request was last updated"
-    )
-
-    def model_dump(self, *args, **kwargs) -> dict:
-        # Set exclude_none=True to exclude None values from serialization
+    def model_dump(self, *args: Any, **kwargs: Any) -> Dict[str, Any]:
+        """Dump the model, omitting ``None`` fields unless the caller overrides it."""
         kwargs.setdefault("exclude_none", True)
return super().model_dump(*args, **kwargs) diff --git a/scrapegraph-py/scrapegraph_py/models/scrape.py b/scrapegraph-py/scrapegraph_py/models/scrape.py index a080957..3155612 100644 --- a/scrapegraph-py/scrapegraph_py/models/scrape.py +++ b/scrapegraph-py/scrapegraph_py/models/scrape.py @@ -1,91 +1,96 @@ """ -Pydantic models for the Scrape API endpoint. +Pydantic models for the v2 Scrape endpoint. -This module defines request and response models for the basic Scrape endpoint, -which retrieves raw HTML content from websites. - -The Scrape endpoint is useful for: -- Getting clean HTML content from websites -- Handling JavaScript-heavy sites -- Preprocessing before AI extraction +POST /api/v1/scrape - Fetch a page in a given format (markdown, html, screenshot, branding). """ -from typing import Optional -from uuid import UUID +from enum import Enum +from typing import Any, Dict, Literal, Optional from pydantic import BaseModel, Field, model_validator +from .shared import FetchConfig + + +class ScrapeFormat(str, Enum): + """Output format for the scrape endpoint.""" + + MARKDOWN = "markdown" + HTML = "html" + SCREENSHOT = "screenshot" + BRANDING = "branding" + + +class MarkdownConfig(BaseModel): + """Configuration for markdown output.""" + + mode: str = Field(default="normal", description="Markdown mode (normal, etc.)") + + +class HtmlConfig(BaseModel): + """Configuration for html output.""" + + mode: str = Field(default="normal", description="HTML mode") + + +class ScreenshotConfig(BaseModel): + """Configuration for screenshot output.""" + + full_page: bool = Field(default=False, description="Capture full page") + class ScrapeRequest(BaseModel): + """Request model for POST /api/v1/scrape. + + The API expects a format-specific config key in the body, e.g.: + {"url": "...", "markdown": {"mode": "normal"}} + {"url": "...", "html": {"mode": "normal"}} + {"url": "...", "screenshot": {"full_page": false}} """ - Request model for the Scrape endpoint. 
- - This model validates and structures requests for basic HTML scraping - without AI extraction. - - Attributes: - website_url: URL of the website to scrape - render_heavy_js: Whether to render heavy JavaScript (default: False) - branding: Whether to include branding in the response (default: False) - headers: Optional HTTP headers including cookies - mock: Whether to use mock mode for testing - - Example: - >>> request = ScrapeRequest( - ... website_url="https://example.com", - ... render_heavy_js=True, - ... branding=True - ... ) - """ - website_url: str = Field(..., example="https://scrapegraphai.com/") - render_heavy_js: bool = Field( - False, - description="Whether to render heavy JavaScript (defaults to False)", - ) - branding: bool = Field( - False, - description="Whether to include branding in the response (defaults to False)", + + url: str = Field(..., description="URL of the page to scrape") + format: ScrapeFormat = Field( + default=ScrapeFormat.MARKDOWN, + description="Output format: markdown, html, screenshot, or branding", + exclude=True, ) - headers: Optional[dict[str, str]] = Field( - None, - example={ - "User-Agent": "scrapegraph-py", - "Cookie": "cookie1=value1; cookie2=value2", - }, - description="Optional headers to send with the request, including cookies " - "and user agent", + markdown: Optional[MarkdownConfig] = Field(default=None) + html: Optional[HtmlConfig] = Field(default=None) + screenshot: Optional[ScreenshotConfig] = Field(default=None) + branding: Optional[Dict[str, Any]] = Field(default=None) + fetch_config: Optional[FetchConfig] = Field( + default=None, description="Fetch configuration options" ) - mock: bool = Field(default=False, description="Whether to use mock mode for the request") - stealth: bool = Field(default=False, description="Enable stealth mode to avoid bot detection") - wait_ms: Optional[int] = Field(default=None, description="The number of milliseconds to wait before scraping the website") 
@model_validator(mode="after") def validate_url(self) -> "ScrapeRequest": - if self.website_url is None or not self.website_url.strip(): - raise ValueError("Website URL cannot be empty") - if not ( - self.website_url.startswith("http://") - or self.website_url.startswith("https://") - ): - raise ValueError("Invalid URL") + if not self.url or not self.url.strip(): + raise ValueError("URL cannot be empty") + if not (self.url.startswith("http://") or self.url.startswith("https://")): + raise ValueError("URL must start with http:// or https://") return self - def model_dump(self, *args, **kwargs) -> dict: - # Set exclude_none=True to exclude None values from serialization + @model_validator(mode="after") + def set_format_config(self) -> "ScrapeRequest": + """Auto-populate the format config key if none were explicitly set.""" + has_any = any([self.markdown, self.html, self.screenshot, self.branding]) + if not has_any: + if self.format == ScrapeFormat.MARKDOWN: + self.markdown = MarkdownConfig() + elif self.format == ScrapeFormat.HTML: + self.html = HtmlConfig() + elif self.format == ScrapeFormat.SCREENSHOT: + self.screenshot = ScreenshotConfig() + elif self.format == ScrapeFormat.BRANDING: + self.branding = {} + return self + + def model_dump(self, *args: Any, **kwargs: Any) -> Dict[str, Any]: kwargs.setdefault("exclude_none", True) return super().model_dump(*args, **kwargs) class GetScrapeRequest(BaseModel): - """Request model for get_scrape endpoint""" + """Request model for GET /api/v1/scrape/:id.""" - request_id: str = Field(..., example="123e4567-e89b-12d3-a456-426614174000") - - @model_validator(mode="after") - def validate_request_id(self) -> "GetScrapeRequest": - try: - # Validate the request_id is a valid UUID - UUID(self.request_id) - except ValueError: - raise ValueError("request_id must be a valid UUID") - return self + request_id: str = Field(..., description="The request ID to fetch") diff --git a/scrapegraph-py/scrapegraph_py/models/search.py 
b/scrapegraph-py/scrapegraph_py/models/search.py new file mode 100644 index 0000000..a465561 --- /dev/null +++ b/scrapegraph-py/scrapegraph_py/models/search.py @@ -0,0 +1,37 @@ +""" +Pydantic models for the v2 Search endpoint. + +POST /v2/search - Web search with AI extraction (replaces SearchScraper). +""" + +from typing import Any, Dict, Optional + +from pydantic import BaseModel, Field, conint, model_validator + +from .shared import LlmConfig + + +class SearchRequest(BaseModel): + """Request model for POST /v2/search.""" + + query: str = Field(..., description="The search query") + num_results: conint(ge=3, le=20) = Field( + default=5, description="Number of results to return (3-20)" + ) + output_schema: Optional[Dict[str, Any]] = Field( + default=None, + description="JSON Schema defining the structure of the extracted data", + ) + llm_config: Optional[LlmConfig] = Field( + default=None, description="LLM configuration options" + ) + + @model_validator(mode="after") + def validate_query(self) -> "SearchRequest": + if not self.query or not self.query.strip(): + raise ValueError("Query cannot be empty") + return self + + def model_dump(self, *args: Any, **kwargs: Any) -> Dict[str, Any]: + kwargs.setdefault("exclude_none", True) + return super().model_dump(*args, **kwargs) diff --git a/scrapegraph-py/scrapegraph_py/models/searchscraper.py b/scrapegraph-py/scrapegraph_py/models/searchscraper.py deleted file mode 100644 index d143f97..0000000 --- a/scrapegraph-py/scrapegraph_py/models/searchscraper.py +++ /dev/null @@ -1,142 +0,0 @@ -""" -Pydantic models for the SearchScraper API endpoint. - -This module defines request and response models for the SearchScraper endpoint, -which performs AI-powered web research by searching, scraping, and synthesizing -information from multiple sources. 
- -The SearchScraper: -- Searches the web for relevant pages based on a query -- Scrapes multiple websites (3-20 configurable) -- Extracts and synthesizes information using AI -- Supports both AI extraction and markdown conversion modes -""" - -from enum import Enum -from typing import Optional, Type -from uuid import UUID - -from pydantic import BaseModel, Field, model_validator - - -class TimeRange(str, Enum): - """Time range filter for search results. - - Controls how recent the search results should be. This is useful for - finding recent news, updates, or time-sensitive information. - - Values: - PAST_HOUR: Results from the past hour - PAST_24_HOURS: Results from the past 24 hours - PAST_WEEK: Results from the past week - PAST_MONTH: Results from the past month - PAST_YEAR: Results from the past year - """ - - PAST_HOUR = "past_hour" - PAST_24_HOURS = "past_24_hours" - PAST_WEEK = "past_week" - PAST_MONTH = "past_month" - PAST_YEAR = "past_year" - - -class SearchScraperRequest(BaseModel): - """ - Request model for the SearchScraper endpoint. - - This model validates and structures requests for web research and scraping - across multiple search results. - - Attributes: - user_prompt: The search query/prompt - num_results: Number of websites to scrape (3-20, default 3) - headers: Optional HTTP headers - output_schema: Optional Pydantic model for structured extraction - extraction_mode: Use AI extraction (True) or markdown (False) - mock: Whether to use mock mode for testing - render_heavy_js: Whether to render heavy JavaScript - location_geo_code: Optional geo code for location-based search (e.g., "us") - time_range: Optional time range filter for search results - - Example: - >>> request = SearchScraperRequest( - ... user_prompt="What is the latest version of Python?", - ... num_results=5, - ... extraction_mode=True - ... 
) - """ - user_prompt: str = Field(..., example="What is the latest version of Python?") - num_results: Optional[int] = Field( - default=3, - ge=3, - le=20, - example=5, - description="Number of websites to scrape (3-20). Default is 3. More " - "websites provide better research depth but cost more credits.", - ) - headers: Optional[dict[str, str]] = Field( - None, - example={ - "User-Agent": "scrapegraph-py", - "Cookie": "cookie1=value1; cookie2=value2", - }, - description="Optional headers to send with the request, including cookies " - "and user agent", - ) - output_schema: Optional[Type[BaseModel]] = None - extraction_mode: bool = Field( - default=True, - description="Whether to use AI extraction (True) or markdown conversion (False). " - "AI extraction costs 10 credits per page, markdown conversion costs 2 credits per page.", - ) - mock: bool = Field(default=False, description="Whether to use mock mode for the request") - render_heavy_js: bool = Field(default=False, description="Whether to render heavy JavaScript on the page") - stealth: bool = Field(default=False, description="Enable stealth mode to avoid bot detection") - location_geo_code: Optional[str] = Field( - None, - description="The geo code of the location to search in", - example="us", - ) - time_range: Optional[TimeRange] = Field( - None, - description="The date range to filter search results", - examples=[ - TimeRange.PAST_HOUR, - TimeRange.PAST_24_HOURS, - TimeRange.PAST_WEEK, - TimeRange.PAST_MONTH, - TimeRange.PAST_YEAR, - ], - ) - - @model_validator(mode="after") - def validate_user_prompt(self) -> "SearchScraperRequest": - if self.user_prompt is None or not self.user_prompt.strip(): - raise ValueError("User prompt cannot be empty") - if not any(c.isalnum() for c in self.user_prompt): - raise ValueError("User prompt must contain a valid prompt") - return self - - def model_dump(self, *args, **kwargs) -> dict: - # Set exclude_none=True to exclude None values from serialization - 
kwargs.setdefault("exclude_none", True) - data = super().model_dump(*args, **kwargs) - # Convert the Pydantic model schema to dict if present - if self.output_schema is not None: - data["output_schema"] = self.output_schema.model_json_schema() - return data - - -class GetSearchScraperRequest(BaseModel): - """Request model for get_searchscraper endpoint""" - - request_id: str = Field(..., example="123e4567-e89b-12d3-a456-426614174000") - - @model_validator(mode="after") - def validate_request_id(self) -> "GetSearchScraperRequest": - try: - # Validate the request_id is a valid UUID - UUID(self.request_id) - except ValueError: - raise ValueError("request_id must be a valid UUID") - return self diff --git a/scrapegraph-py/scrapegraph_py/models/shared.py b/scrapegraph-py/scrapegraph_py/models/shared.py new file mode 100644 index 0000000..dd9624e --- /dev/null +++ b/scrapegraph-py/scrapegraph_py/models/shared.py @@ -0,0 +1,66 @@ +""" +Shared configuration models for the ScrapeGraphAI v2 API. + +These models are used across multiple endpoints for fetch and LLM configuration. +""" + +from typing import Any, Dict, Optional + +from pydantic import BaseModel, Field + + +class FetchConfig(BaseModel): + """Configuration for how pages are fetched.""" + + mock: bool = Field(default=False, description="Use mock mode for testing") + stealth: bool = Field( + default=False, description="Enable stealth mode to avoid bot detection" + ) + scrolls: Optional[int] = Field( + default=None, ge=0, le=100, description="Number of scrolls to perform (0-100)" + ) + country: Optional[str] = Field( + default=None, description="Country code for geo-located requests (e.g. 
'us')" + ) + cookies: Optional[Dict[str, str]] = Field( + default=None, description="Cookies to send with the request" + ) + headers: Optional[Dict[str, str]] = Field( + default=None, description="Custom HTTP headers to send with the request" + ) + wait_ms: Optional[int] = Field( + default=None, + ge=0, + description="Milliseconds to wait before scraping for JS rendering", + ) + render_js: bool = Field( + default=False, description="Whether to render heavy JavaScript" + ) + + def model_dump(self, *args: Any, **kwargs: Any) -> Dict[str, Any]: + kwargs.setdefault("exclude_none", True) + return super().model_dump(*args, **kwargs) + + +class LlmConfig(BaseModel): + """Configuration for the LLM used in extraction.""" + + model: Optional[str] = Field( + default=None, description="LLM model to use for extraction" + ) + temperature: Optional[float] = Field( + default=None, + ge=0.0, + le=2.0, + description="Sampling temperature (0.0-2.0)", + ) + max_tokens: Optional[int] = Field( + default=None, ge=1, description="Maximum tokens in the response" + ) + chunker: Optional[str] = Field( + default=None, description="Chunking strategy for large pages" + ) + + def model_dump(self, *args: Any, **kwargs: Any) -> Dict[str, Any]: + kwargs.setdefault("exclude_none", True) + return super().model_dump(*args, **kwargs) diff --git a/scrapegraph-py/scrapegraph_py/models/sitemap.py b/scrapegraph-py/scrapegraph_py/models/sitemap.py deleted file mode 100644 index 4095cbb..0000000 --- a/scrapegraph-py/scrapegraph_py/models/sitemap.py +++ /dev/null @@ -1,192 +0,0 @@ -"""Models for sitemap endpoint""" - -from typing import Optional - -from pydantic import BaseModel, Field, model_validator - - -class SitemapRequest(BaseModel): - """Request model for sitemap endpoint. - - Extracts all URLs from a website's sitemap. Automatically discovers sitemap - from robots.txt or common sitemap locations like /sitemap.xml and sitemap - index files. 
- - The sitemap endpoint is useful for: - - Discovering all pages on a website - - Building comprehensive crawling lists - - SEO audits and analysis - - Content inventory management - - Attributes: - website_url (str): The base URL of the website to extract sitemap from. - Must start with http:// or https://. The API will automatically - discover the sitemap location. - mock (bool): Whether to use mock mode for the request. When True, returns - stubbed responses without making actual API calls. Defaults to False. - - Raises: - ValueError: If website_url is empty, None, or doesn't start with - http:// or https://. - - Examples: - Basic usage:: - - >>> request = SitemapRequest(website_url="https://example.com") - >>> print(request.website_url) - https://example.com - - With mock mode:: - - >>> request = SitemapRequest( - ... website_url="https://example.com", - ... mock=True - ... ) - >>> print(request.mock) - True - - The API automatically discovers sitemaps from: - - robots.txt directives (Sitemap: https://example.com/sitemap.xml) - - Common locations (/sitemap.xml, /sitemap_index.xml) - - Sitemap index files with nested sitemaps - - Note: - The website_url should be the base domain URL. The API will handle - sitemap discovery automatically. - """ - - website_url: str = Field( - ..., - example="https://scrapegraphai.com/", - description="The URL of the website to extract sitemap from" - ) - mock: bool = Field( - default=False, - description="Whether to use mock mode for the request" - ) - - @model_validator(mode="after") - def validate_url(self) -> "SitemapRequest": - """Validate the website URL. - - Ensures the URL is not empty and uses http:// or https:// protocol. - - Returns: - SitemapRequest: The validated instance. - - Raises: - ValueError: If URL is empty or uses invalid protocol. 
- """ - if self.website_url is None or not self.website_url.strip(): - raise ValueError("Website URL cannot be empty") - if not ( - self.website_url.startswith("http://") - or self.website_url.startswith("https://") - ): - raise ValueError("URL must start with http:// or https://") - return self - - def model_dump(self, *args, **kwargs) -> dict: - """Serialize the model to a dictionary. - - Automatically excludes None values from the serialized output to - produce cleaner JSON payloads for the API. - - Args: - *args: Positional arguments passed to parent model_dump. - **kwargs: Keyword arguments passed to parent model_dump. - If 'exclude_none' is not specified, it defaults to True. - - Returns: - dict: Dictionary representation of the model with None values excluded. - - Examples: - >>> request = SitemapRequest(website_url="https://example.com") - >>> data = request.model_dump() - >>> print(data) - {'website_url': 'https://example.com', 'mock': False} - """ - kwargs.setdefault("exclude_none", True) - return super().model_dump(*args, **kwargs) - - -class SitemapResponse(BaseModel): - """Response model for sitemap endpoint. - - Contains the complete list of URLs extracted from the website's sitemap. - The URLs are returned in the order they appear in the sitemap, which - typically reflects the website's intended structure and priority. - - This response is useful for: - - Building comprehensive URL lists for crawling - - Identifying content structure and organization - - Discovering all public pages on a website - - Planning content migration or archival - - Attributes: - urls (list[str]): Complete list of URLs extracted from the sitemap. - Each URL is a fully-qualified absolute URL string. The list may - be empty if no sitemap is found or if the sitemap contains no URLs. - URLs are deduplicated and ordered as they appear in the sitemap. - - Examples: - Basic usage:: - - >>> response = SitemapResponse(urls=[ - ... "https://example.com/", - ... 
"https://example.com/about" - ... ]) - >>> print(f"Found {len(response.urls)} URLs") - Found 2 URLs - - Iterating over URLs:: - - >>> response = SitemapResponse(urls=[ - ... "https://example.com/", - ... "https://example.com/products", - ... "https://example.com/contact" - ... ]) - >>> for url in response.urls: - ... print(url) - https://example.com/ - https://example.com/products - https://example.com/contact - - Filtering URLs:: - - >>> response = SitemapResponse(urls=[ - ... "https://example.com/", - ... "https://example.com/blog/post-1", - ... "https://example.com/blog/post-2", - ... "https://example.com/products" - ... ]) - >>> blog_urls = [url for url in response.urls if '/blog/' in url] - >>> print(f"Found {len(blog_urls)} blog posts") - Found 2 blog posts - - Empty sitemap:: - - >>> response = SitemapResponse(urls=[]) - >>> if not response.urls: - ... print("No URLs found in sitemap") - No URLs found in sitemap - - Note: - The urls list may contain various types of pages including: - - Homepage and main sections - - Blog posts and articles - - Product pages - - Category and tag pages - - Media files (images, PDFs) if included in sitemap - """ - - urls: list[str] = Field( - ..., - description="List of URLs extracted from the sitemap", - example=[ - "https://example.com/", - "https://example.com/about", - "https://example.com/products", - "https://example.com/contact" - ] - ) diff --git a/scrapegraph-py/scrapegraph_py/models/smartscraper.py b/scrapegraph-py/scrapegraph_py/models/smartscraper.py deleted file mode 100644 index e68b2d8..0000000 --- a/scrapegraph-py/scrapegraph_py/models/smartscraper.py +++ /dev/null @@ -1,186 +0,0 @@ -""" -Pydantic models for the SmartScraper API endpoint. - -This module defines request and response models for the SmartScraper endpoint, -which performs AI-powered web scraping with optional pagination and scrolling support. 
- -The SmartScraper can: -- Extract structured data from websites based on user prompts -- Handle infinite scroll scenarios -- Support pagination across multiple pages -- Accept custom output schemas for structured extraction -- Process URLs, raw HTML content, or Markdown content -""" - -from typing import Dict, Optional, Type -from uuid import UUID - -try: - from bs4 import BeautifulSoup - HAS_BS4 = True -except ImportError: - HAS_BS4 = False - -from pydantic import BaseModel, Field, conint, model_validator - - -class SmartScraperRequest(BaseModel): - """ - Request model for the SmartScraper endpoint. - - This model validates and structures requests for AI-powered web scraping. - You must provide exactly one of: website_url, website_html, or website_markdown. - - Attributes: - user_prompt: Natural language prompt describing what to extract - website_url: URL of the website to scrape (optional) - website_html: Raw HTML content to scrape (optional, max 2MB) - website_markdown: Markdown content to process (optional, max 2MB) - headers: Optional HTTP headers including cookies - cookies: Optional cookies for authentication/session management - output_schema: Optional Pydantic model defining the output structure - number_of_scrolls: Number of times to scroll (0-100) for infinite scroll pages - total_pages: Number of pages to scrape (1-10) for pagination - mock: Whether to use mock mode for testing - plain_text: Whether to return plain text instead of structured data - render_heavy_js: Whether to render heavy JavaScript content - - Example: - >>> request = SmartScraperRequest( - ... website_url="https://example.com", - ... user_prompt="Extract all product names and prices" - ... ) - """ - user_prompt: str = Field( - ..., - example="Extract info about the company", - ) - website_url: Optional[str] = Field( - default=None, example="https://scrapegraphai.com/" - ) - website_html: Optional[str] = Field( - default=None, - example="

Title

Content

", - description="HTML content, maximum size 2MB", - ) - website_markdown: Optional[str] = Field( - default=None, - example="# Title\n\nContent goes here", - description="Markdown content, maximum size 2MB", - ) - headers: Optional[dict[str, str]] = Field( - None, - example={ - "User-Agent": "scrapegraph-py", - "Cookie": "cookie1=value1; cookie2=value2", - }, - description="Optional headers to send with the request, including cookies " - "and user agent", - ) - cookies: Optional[Dict[str, str]] = Field( - None, - example={"session_id": "abc123", "user_token": "xyz789"}, - description="Dictionary of cookies to send with the request for " - "authentication or session management", - ) - output_schema: Optional[Type[BaseModel]] = None - number_of_scrolls: Optional[conint(ge=0, le=100)] = Field( - default=None, - description="Number of times to scroll the page (0-100). If None, no " - "scrolling will be performed.", - example=10, - ) - total_pages: Optional[conint(ge=1, le=10)] = Field( - default=None, - description="Number of pages to scrape (1-10). 
If None, only the first " - "page will be scraped.", - example=5, - ) - mock: bool = Field(default=False, description="Whether to use mock mode for the request") - plain_text: bool = Field(default=False, description="Whether to return the result as plain text") - render_heavy_js: bool = Field(default=False, description="Whether to render heavy JavaScript on the page") - stealth: bool = Field(default=False, description="Enable stealth mode to avoid bot detection") - wait_ms: Optional[int] = Field(default=None, description="The number of milliseconds to wait before scraping the website") - - @model_validator(mode="after") - def validate_user_prompt(self) -> "SmartScraperRequest": - if self.user_prompt is None or not self.user_prompt.strip(): - raise ValueError("User prompt cannot be empty") - if not any(c.isalnum() for c in self.user_prompt): - raise ValueError("User prompt must contain a valid prompt") - return self - - @model_validator(mode="after") - def validate_url_and_html(self) -> "SmartScraperRequest": - # Count how many input sources are provided - inputs_provided = sum([ - self.website_url is not None, - self.website_html is not None, - self.website_markdown is not None - ]) - - if inputs_provided == 0: - raise ValueError("Exactly one of website_url, website_html, or website_markdown must be provided") - elif inputs_provided > 1: - raise ValueError("Only one of website_url, website_html, or website_markdown can be provided") - - # Validate HTML content - if self.website_html is not None: - if len(self.website_html.encode("utf-8")) > 2 * 1024 * 1024: - raise ValueError("Website HTML content exceeds maximum size of 2MB") - if not HAS_BS4: - raise ImportError( - "beautifulsoup4 is required for HTML validation. 
" - "Install it with: pip install scrapegraph-py[html] or pip install beautifulsoup4" - ) - try: - soup = BeautifulSoup(self.website_html, "html.parser") - if not soup.find(): - raise ValueError("Invalid HTML - no parseable content found") - except Exception as e: - if isinstance(e, ImportError): - raise - raise ValueError(f"Invalid HTML structure: {str(e)}") - - # Validate URL - elif self.website_url is not None: - if not self.website_url.strip(): - raise ValueError("Website URL cannot be empty") - if not ( - self.website_url.startswith("http://") - or self.website_url.startswith("https://") - ): - raise ValueError("Invalid URL") - - # Validate Markdown content - elif self.website_markdown is not None: - if not self.website_markdown.strip(): - raise ValueError("Website markdown cannot be empty") - if len(self.website_markdown.encode("utf-8")) > 2 * 1024 * 1024: - raise ValueError("Website markdown content exceeds maximum size of 2MB") - - return self - - def model_dump(self, *args, **kwargs) -> dict: - # Set exclude_none=True to exclude None values from serialization - kwargs.setdefault("exclude_none", True) - data = super().model_dump(*args, **kwargs) - # Convert the Pydantic model schema to dict if present - if self.output_schema is not None: - data["output_schema"] = self.output_schema.model_json_schema() - return data - - -class GetSmartScraperRequest(BaseModel): - """Request model for get_smartscraper endpoint""" - - request_id: str = Field(..., example="123e4567-e89b-12d3-a456-426614174000") - - @model_validator(mode="after") - def validate_request_id(self) -> "GetSmartScraperRequest": - try: - # Validate the request_id is a valid UUID - UUID(self.request_id) - except ValueError: - raise ValueError("request_id must be a valid UUID") - return self diff --git a/scrapegraph-py/scrapegraph_py/utils/helpers.py b/scrapegraph-py/scrapegraph_py/utils/helpers.py index 8e0e061..04a657d 100644 --- a/scrapegraph-py/scrapegraph_py/utils/helpers.py +++ 
b/scrapegraph-py/scrapegraph_py/utils/helpers.py @@ -1,12 +1,8 @@ """ -Helper utility functions for the ScrapeGraphAI SDK. - -This module provides utility functions for API key validation and -HTTP response handling for both synchronous and asynchronous requests. +Helper utility functions for the ScrapeGraphAI SDK v2. """ from typing import Any, Dict -from uuid import UUID import aiohttp from requests import Response @@ -15,10 +11,7 @@ def validate_api_key(api_key: str) -> bool: - """ - Validate the format of a ScrapeGraphAI API key. - - API keys must follow the format: 'sgai-' followed by a valid UUID. + """Validate that an API key is present and non-empty. Args: api_key: The API key string to validate @@ -27,32 +20,18 @@ def validate_api_key(api_key: str) -> bool: True if the API key is valid Raises: - ValueError: If the API key format is invalid - - Example: - >>> validate_api_key("sgai-12345678-1234-1234-1234-123456789abc") - True - >>> validate_api_key("invalid-key") - ValueError: Invalid API key format... + ValueError: If the API key is empty or missing """ - if not api_key.startswith("sgai-"): - raise ValueError("Invalid API key format. API key must start with 'sgai-'") - uuid_part = api_key[5:] # Strip out 'sgai-' - try: - UUID(uuid_part) - except ValueError: + if not api_key or not api_key.strip(): raise ValueError( - "Invalid API key format. API key must be 'sgai-' followed by a valid UUID. " - "You can get one at https://dashboard.scrapegraphai.com/" + "API key cannot be empty. " + "Get one at https://dashboard.scrapegraphai.com/" ) return True def handle_sync_response(response: Response) -> Dict[str, Any]: - """ - Handle and parse synchronous HTTP responses. - - Parses the JSON response and raises APIError for error status codes. + """Handle and parse synchronous HTTP responses. 
Args: response: The requests Response object @@ -62,15 +41,10 @@ def handle_sync_response(response: Response) -> Dict[str, Any]: Raises: APIError: If the response status code indicates an error (>= 400) - - Example: - >>> response = requests.get("https://api.example.com/data") - >>> data = handle_sync_response(response) """ try: data = response.json() except ValueError: - # If response is not JSON, use the raw text data = {"error": response.text} if response.status_code >= 400: @@ -83,10 +57,7 @@ def handle_sync_response(response: Response) -> Dict[str, Any]: async def handle_async_response(response: aiohttp.ClientResponse) -> Dict[str, Any]: - """ - Handle and parse asynchronous HTTP responses. - - Parses the JSON response and raises APIError for error status codes. + """Handle and parse asynchronous HTTP responses. Args: response: The aiohttp ClientResponse object @@ -96,16 +67,11 @@ async def handle_async_response(response: aiohttp.ClientResponse) -> Dict[str, A Raises: APIError: If the response status code indicates an error (>= 400) - - Example: - >>> async with session.get("https://api.example.com/data") as response: - ... 
data = await handle_async_response(response) """ try: data = await response.json() text = None except ValueError: - # If response is not JSON, use the raw text text = await response.text() data = {"error": text} diff --git a/scrapegraph-py/tests/test_async_client.py b/scrapegraph-py/tests/test_async_client.py index 592cfc5..c23106b 100644 --- a/scrapegraph-py/tests/test_async_client.py +++ b/scrapegraph-py/tests/test_async_client.py @@ -1,838 +1,343 @@ -import asyncio +"""Tests for the asynchronous AsyncClient (v2 API).""" + from uuid import uuid4 import pytest +import pytest_asyncio from aioresponses import aioresponses -from pydantic import BaseModel +from pydantic import BaseModel, Field from scrapegraph_py.async_client import AsyncClient -from scrapegraph_py.exceptions import APIError +from scrapegraph_py.config import API_BASE_URL from tests.utils import generate_mock_api_key @pytest.fixture -def mock_api_key(): +def api_key(): return generate_mock_api_key() -@pytest.fixture -def mock_uuid(): - return str(uuid4()) - +@pytest_asyncio.fixture +async def client(api_key): + c = AsyncClient(api_key=api_key) + yield c + await c.close() -@pytest.mark.asyncio -async def test_smartscraper_with_url(mock_api_key): - with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/smartscraper", - payload={ - "request_id": str(uuid4()), - "status": "completed", - "result": {"description": "Example domain."}, - }, - ) - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.smartscraper( - website_url="https://example.com", user_prompt="Describe this page." 
- ) - assert response["status"] == "completed" - assert "description" in response["result"] +# ------------------------------------------------------------------ +# Auth & headers +# ------------------------------------------------------------------ @pytest.mark.asyncio -async def test_smartscraper_with_html(mock_api_key): - with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/smartscraper", - payload={ - "request_id": str(uuid4()), - "status": "completed", - "result": {"description": "Test content."}, - }, - ) +async def test_bearer_auth_header(api_key): + c = AsyncClient(api_key=api_key) + assert c.headers["Authorization"] == f"Bearer {api_key}" + assert c.headers["X-SDK-Version"].startswith("python@") + await c.close() - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.smartscraper( - website_html="

Test content

", - user_prompt="Extract info", - ) - assert response["status"] == "completed" - assert "description" in response["result"] - -@pytest.mark.asyncio -async def test_smartscraper_with_headers(mock_api_key): - with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/smartscraper", - payload={ - "request_id": str(uuid4()), - "status": "completed", - "result": {"description": "Example domain."}, - }, - ) - - headers = { - "User-Agent": "Mozilla/5.0", - "Cookie": "session=123", - } - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.smartscraper( - website_url="https://example.com", - user_prompt="Describe this page.", - headers=headers, - ) - assert response["status"] == "completed" - assert "description" in response["result"] +# ------------------------------------------------------------------ +# Scrape +# ------------------------------------------------------------------ @pytest.mark.asyncio -async def test_get_credits(mock_api_key): +async def test_scrape(client): with aioresponses() as mocked: - mocked.get( - "https://api.scrapegraphai.com/v1/credits", - payload={"remaining_credits": 100, "total_credits_used": 50}, + mocked.post( + f"{API_BASE_URL}/scrape", + payload={"request_id": str(uuid4()), "content": "# Hello"}, ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.get_credits() - assert response["remaining_credits"] == 100 - assert response["total_credits_used"] == 50 + result = await client.scrape("https://example.com") + assert "content" in result @pytest.mark.asyncio -async def test_submit_feedback(mock_api_key): +async def test_scrape_html_format(client): with aioresponses() as mocked: mocked.post( - "https://api.scrapegraphai.com/v1/feedback", payload={"status": "success"} + f"{API_BASE_URL}/scrape", + payload={"request_id": str(uuid4()), "content": "

Hello

"}, ) + result = await client.scrape("https://example.com", format="html") + assert "content" in result - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.submit_feedback( - request_id=str(uuid4()), rating=5, feedback_text="Great service!" - ) - assert response["status"] == "success" - -@pytest.mark.asyncio -async def test_get_smartscraper(mock_api_key, mock_uuid): - with aioresponses() as mocked: - mocked.get( - f"https://api.scrapegraphai.com/v1/smartscraper/{mock_uuid}", - payload={ - "request_id": mock_uuid, - "status": "completed", - "result": {"data": "test"}, - }, - ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.get_smartscraper(mock_uuid) - assert response["status"] == "completed" - assert response["request_id"] == mock_uuid +# ------------------------------------------------------------------ +# Extract +# ------------------------------------------------------------------ @pytest.mark.asyncio -async def test_smartscraper_with_pagination(mock_api_key): +async def test_extract(client): with aioresponses() as mocked: mocked.post( - "https://api.scrapegraphai.com/v1/smartscraper", - payload={ - "request_id": str(uuid4()), - "status": "completed", - "result": { - "products": [ - {"name": "Product 1", "price": "$10"}, - {"name": "Product 2", "price": "$20"}, - {"name": "Product 3", "price": "$30"}, - ] - }, - }, + f"{API_BASE_URL}/extract", + payload={"request_id": str(uuid4()), "result": {"title": "Example"}}, ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.smartscraper( - website_url="https://example.com/products", - user_prompt="Extract product information", - total_pages=3, - ) - assert response["status"] == "completed" - assert "products" in response["result"] - assert len(response["result"]["products"]) == 3 - - -@pytest.mark.asyncio -async def test_smartscraper_with_pagination_and_scrolls(mock_api_key): - with aioresponses() as mocked: - 
mocked.post( - "https://api.scrapegraphai.com/v1/smartscraper", - payload={ - "request_id": str(uuid4()), - "status": "completed", - "result": { - "products": [ - {"name": "Product 1", "price": "$10"}, - {"name": "Product 2", "price": "$20"}, - {"name": "Product 3", "price": "$30"}, - {"name": "Product 4", "price": "$40"}, - {"name": "Product 5", "price": "$50"}, - ] - }, - }, + result = await client.extract( + url="https://example.com", + prompt="Extract the title", ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.smartscraper( - website_url="https://example.com/products", - user_prompt="Extract product information from paginated results", - total_pages=5, - number_of_scrolls=10, - ) - assert response["status"] == "completed" - assert "products" in response["result"] - assert len(response["result"]["products"]) == 5 + assert result["result"]["title"] == "Example" @pytest.mark.asyncio -async def test_smartscraper_with_pagination_and_all_features(mock_api_key): +async def test_extract_with_pydantic_schema(client): + class Product(BaseModel): + name: str = Field(description="Product name") + price: float = Field(description="Product price") + with aioresponses() as mocked: mocked.post( - "https://api.scrapegraphai.com/v1/smartscraper", + f"{API_BASE_URL}/extract", payload={ "request_id": str(uuid4()), - "status": "completed", - "result": { - "products": [ - {"name": "Product 1", "price": "$10", "rating": 4.5}, - {"name": "Product 2", "price": "$20", "rating": 4.0}, - ] - }, + "result": {"name": "Widget", "price": 9.99}, }, ) - - headers = { - "User-Agent": "Mozilla/5.0", - "Cookie": "session=123", - } - - class ProductSchema(BaseModel): - name: str - price: str - rating: float - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.smartscraper( - website_url="https://example.com/products", - user_prompt="Extract product information with ratings", - headers=headers, - output_schema=ProductSchema, - 
number_of_scrolls=5, - total_pages=2, - ) - assert response["status"] == "completed" - assert "products" in response["result"] - - -@pytest.mark.asyncio -async def test_api_error(mock_api_key): - with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/smartscraper", - status=400, - payload={"error": "Bad request"}, - exception=APIError("Bad request", status_code=400), + result = await client.extract( + url="https://example.com", + prompt="Extract product info", + output_schema=Product, ) + assert result["result"]["name"] == "Widget" - async with AsyncClient(api_key=mock_api_key) as client: - with pytest.raises(APIError) as exc_info: - await client.smartscraper( - website_url="https://example.com", user_prompt="Describe this page." - ) - assert exc_info.value.status_code == 400 - assert "Bad request" in str(exc_info.value) - -@pytest.mark.asyncio -async def test_markdownify(mock_api_key): - with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/markdownify", - payload={ - "request_id": str(uuid4()), - "status": "completed", - "result": "# Example Page\n\nThis is markdown content.", - }, - ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.markdownify(website_url="https://example.com") - assert response["status"] == "completed" - assert "# Example Page" in response["result"] +# ------------------------------------------------------------------ +# Search +# ------------------------------------------------------------------ @pytest.mark.asyncio -async def test_markdownify_with_headers(mock_api_key): +async def test_search(client): with aioresponses() as mocked: mocked.post( - "https://api.scrapegraphai.com/v1/markdownify", + f"{API_BASE_URL}/search", payload={ "request_id": str(uuid4()), - "status": "completed", - "result": "# Example Page\n\nThis is markdown content.", + "results": [{"url": "https://example.com"}], }, ) + result = await client.search("best web scrapers 2025") + 
assert "results" in result - headers = { - "User-Agent": "Mozilla/5.0", - "Cookie": "session=123", - } - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.markdownify( - website_url="https://example.com", headers=headers - ) - assert response["status"] == "completed" - assert "# Example Page" in response["result"] +# ------------------------------------------------------------------ +# Schema +# ------------------------------------------------------------------ @pytest.mark.asyncio -async def test_get_markdownify(mock_api_key, mock_uuid): - with aioresponses() as mocked: - mocked.get( - f"https://api.scrapegraphai.com/v1/markdownify/{mock_uuid}", - payload={ - "request_id": mock_uuid, - "status": "completed", - "result": "# Example Page\n\nThis is markdown content.", - }, - ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.get_markdownify(mock_uuid) - assert response["status"] == "completed" - assert response["request_id"] == mock_uuid - - -@pytest.mark.asyncio -async def test_searchscraper(mock_api_key): +async def test_schema(client): with aioresponses() as mocked: mocked.post( - "https://api.scrapegraphai.com/v1/searchscraper", - payload={ - "request_id": str(uuid4()), - "status": "completed", - "result": {"answer": "Python 3.12 is the latest version."}, - "reference_urls": ["https://www.python.org/downloads/"], - }, + f"{API_BASE_URL}/schema", + payload={"request_id": str(uuid4()), "schema": {"type": "object"}}, ) + result = await client.schema("Generate a schema for product data") + assert "schema" in result + - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.searchscraper( - user_prompt="What is the latest version of Python?" 
- ) - assert response["status"] == "completed" - assert "answer" in response["result"] - assert "reference_urls" in response - assert isinstance(response["reference_urls"], list) +# ------------------------------------------------------------------ +# Credits +# ------------------------------------------------------------------ @pytest.mark.asyncio -async def test_searchscraper_with_headers(mock_api_key): +async def test_credits(client): with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/searchscraper", - payload={ - "request_id": str(uuid4()), - "status": "completed", - "result": {"answer": "Python 3.12 is the latest version."}, - "reference_urls": ["https://www.python.org/downloads/"], - }, + mocked.get( + f"{API_BASE_URL}/credits", + payload={"remaining_credits": 1000, "total_credits_used": 50}, ) + result = await client.credits() + assert result["remaining_credits"] == 1000 - headers = { - "User-Agent": "Mozilla/5.0", - "Cookie": "session=123", - } - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.searchscraper( - user_prompt="What is the latest version of Python?", - headers=headers, - ) - assert response["status"] == "completed" - assert "answer" in response["result"] - assert "reference_urls" in response - assert isinstance(response["reference_urls"], list) +# ------------------------------------------------------------------ +# History +# ------------------------------------------------------------------ @pytest.mark.asyncio -async def test_get_searchscraper(mock_api_key, mock_uuid): +async def test_history(client): with aioresponses() as mocked: mocked.get( - f"https://api.scrapegraphai.com/v1/searchscraper/{mock_uuid}", - payload={ - "request_id": mock_uuid, - "status": "completed", - "result": {"answer": "Python 3.12 is the latest version."}, - "reference_urls": ["https://www.python.org/downloads/"], - }, + f"{API_BASE_URL}/history", + payload={"requests": [], "total": 0}, ) - - async with 
AsyncClient(api_key=mock_api_key) as client: - response = await client.get_searchscraper(mock_uuid) - assert response["status"] == "completed" - assert response["request_id"] == mock_uuid - assert "answer" in response["result"] - assert "reference_urls" in response - assert isinstance(response["reference_urls"], list) + result = await client.history() + assert "requests" in result -@pytest.mark.asyncio -async def test_crawl(mock_api_key): - with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/crawl", - payload={ - "id": str(uuid4()), - "status": "processing", - "message": "Crawl job started", - }, - ) - - schema = { - "$schema": "http://json-schema.org/draft-07/schema#", - "title": "Test Schema", - "type": "object", - "properties": { - "name": {"type": "string"}, - "age": {"type": "integer"}, - }, - "required": ["name"], - } - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.crawl( - url="https://example.com", - prompt="Extract company information", - data_schema=schema, - cache_website=True, - depth=2, - max_pages=5, - same_domain_only=True, - batch_size=1, - ) - assert response["status"] == "processing" - assert "id" in response +# ------------------------------------------------------------------ +# Crawl namespace +# ------------------------------------------------------------------ @pytest.mark.asyncio -async def test_crawl_with_minimal_params(mock_api_key): +async def test_crawl_start(client): + crawl_id = str(uuid4()) with aioresponses() as mocked: mocked.post( - "https://api.scrapegraphai.com/v1/crawl", - payload={ - "id": str(uuid4()), - "status": "processing", - "message": "Crawl job started", - }, + f"{API_BASE_URL}/crawl", + payload={"id": crawl_id, "status": "running"}, ) - - schema = { - "$schema": "http://json-schema.org/draft-07/schema#", - "title": "Test Schema", - "type": "object", - "properties": { - "name": {"type": "string"}, - }, - "required": ["name"], - } - - async with 
AsyncClient(api_key=mock_api_key) as client: - response = await client.crawl( - url="https://example.com", - prompt="Extract company information", - data_schema=schema, - ) - assert response["status"] == "processing" - assert "id" in response + result = await client.crawl.start("https://example.com", depth=3) + assert result["id"] == crawl_id @pytest.mark.asyncio -async def test_get_crawl(mock_api_key, mock_uuid): +async def test_crawl_status(client): + crawl_id = str(uuid4()) with aioresponses() as mocked: mocked.get( - f"https://api.scrapegraphai.com/v1/crawl/{mock_uuid}", - payload={ - "id": mock_uuid, - "status": "completed", - "result": { - "llm_result": { - "company": { - "name": "Example Corp", - "description": "A technology company", - }, - "services": [ - { - "service_name": "Web Development", - "description": "Custom web solutions", - } - ], - "legal": { - "privacy_policy": "Privacy policy content", - "terms_of_service": "Terms of service content", - }, - } - }, - }, + f"{API_BASE_URL}/crawl/{crawl_id}", + payload={"id": crawl_id, "status": "completed", "pages": []}, ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.get_crawl(mock_uuid) - assert response["status"] == "completed" - assert response["id"] == mock_uuid - assert "result" in response - assert "llm_result" in response["result"] + result = await client.crawl.status(crawl_id) + assert result["status"] == "completed" @pytest.mark.asyncio -async def test_crawl_markdown_mode(mock_api_key): - """Test async crawl in markdown conversion mode (no AI processing)""" +async def test_crawl_stop(client): + crawl_id = str(uuid4()) with aioresponses() as mocked: mocked.post( - "https://api.scrapegraphai.com/v1/crawl", - payload={ - "id": str(uuid4()), - "status": "processing", - "message": "Markdown crawl job started", - }, + f"{API_BASE_URL}/crawl/{crawl_id}/stop", + payload={"id": crawl_id, "status": "stopped"}, ) - - async with AsyncClient(api_key=mock_api_key) as 
client: - response = await client.crawl( - url="https://example.com", - extraction_mode=False, # Markdown conversion mode - depth=2, - max_pages=3, - same_domain_only=True, - sitemap=True, - ) - assert response["status"] == "processing" - assert "id" in response + result = await client.crawl.stop(crawl_id) + assert result["status"] == "stopped" @pytest.mark.asyncio -async def test_crawl_markdown_mode_validation(mock_api_key): - """Test that async markdown mode rejects prompt and data_schema parameters""" - async with AsyncClient(api_key=mock_api_key) as client: - # Should raise validation error when prompt is provided in markdown mode - try: - await client.crawl( - url="https://example.com", - extraction_mode=False, - prompt="This should not be allowed", - ) - assert False, "Should have raised validation error" - except Exception as e: - assert "Prompt should not be provided when extraction_mode=False" in str(e) - - # Should raise validation error when data_schema is provided in markdown mode - try: - await client.crawl( - url="https://example.com", - extraction_mode=False, - data_schema={"type": "object"}, - ) - assert False, "Should have raised validation error" - except Exception as e: - assert ( - "Data schema should not be provided when extraction_mode=False" - in str(e) - ) - - -# ============================================================================ -# ASYNC SCRAPE TESTS -# ============================================================================ - - -@pytest.mark.asyncio -async def test_async_scrape_basic(mock_api_key): - """Test basic async scrape request""" +async def test_crawl_resume(client): + crawl_id = str(uuid4()) with aioresponses() as mocked: mocked.post( - "https://api.scrapegraphai.com/v1/scrape", - payload={ - "scrape_request_id": str(uuid4()), - "status": "completed", - "html": "

Example Page

This is HTML content.

", - }, + f"{API_BASE_URL}/crawl/{crawl_id}/resume", + payload={"id": crawl_id, "status": "running"}, ) + result = await client.crawl.resume(crawl_id) + assert result["status"] == "running" - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.scrape(website_url="https://example.com") - assert response["status"] == "completed" - assert "html" in response - assert "

Example Page

" in response["html"] - - -@pytest.mark.asyncio -async def test_async_scrape_with_heavy_js(mock_api_key): - """Test async scrape request with heavy JavaScript rendering""" - with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/scrape", - payload={ - "scrape_request_id": str(uuid4()), - "status": "completed", - "html": "
JavaScript rendered content
", - }, - ) - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.scrape( - website_url="https://example.com", - render_heavy_js=True - ) - assert response["status"] == "completed" - assert "html" in response - assert "JavaScript rendered content" in response["html"] +# ------------------------------------------------------------------ +# Monitor namespace +# ------------------------------------------------------------------ @pytest.mark.asyncio -async def test_async_scrape_with_headers(mock_api_key): - """Test async scrape request with custom headers""" +async def test_monitor_create(client): + monitor_id = str(uuid4()) with aioresponses() as mocked: mocked.post( - "https://api.scrapegraphai.com/v1/scrape", - payload={ - "scrape_request_id": str(uuid4()), - "status": "completed", - "html": "

Content with custom headers

", - }, + f"{API_BASE_URL}/monitor", + payload={"id": monitor_id, "name": "Price Monitor"}, ) - - headers = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", - "Cookie": "session=123" - } - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.scrape( - website_url="https://example.com", - headers=headers - ) - assert response["status"] == "completed" - assert "html" in response + result = await client.monitor.create( + name="Price Monitor", + url="https://example.com/products", + prompt="Extract product prices", + cron="0 9 * * 1", + ) + assert result["name"] == "Price Monitor" @pytest.mark.asyncio -async def test_async_scrape_with_all_options(mock_api_key): - """Test async scrape request with all options enabled""" +async def test_monitor_list(client): with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/scrape", - payload={ - "scrape_request_id": str(uuid4()), - "status": "completed", - "html": "
Full featured content
", - }, + mocked.get( + f"{API_BASE_URL}/monitor", + payload={"monitors": [], "total": 0}, ) - - headers = { - "User-Agent": "Custom Agent", - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" - } - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.scrape( - website_url="https://example.com", - render_heavy_js=True, - headers=headers - ) - assert response["status"] == "completed" - assert "html" in response + result = await client.monitor.list() + assert "monitors" in result @pytest.mark.asyncio -async def test_async_get_scrape(mock_api_key, mock_uuid): - """Test async get scrape result""" +async def test_monitor_get(client): + monitor_id = str(uuid4()) with aioresponses() as mocked: mocked.get( - f"https://api.scrapegraphai.com/v1/scrape/{mock_uuid}", - payload={ - "scrape_request_id": mock_uuid, - "status": "completed", - "html": "

Retrieved HTML content

", - }, + f"{API_BASE_URL}/monitor/{monitor_id}", + payload={"id": monitor_id, "name": "Test Monitor"}, ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.get_scrape(mock_uuid) - assert response["status"] == "completed" - assert response["scrape_request_id"] == mock_uuid - assert "html" in response + result = await client.monitor.get(monitor_id) + assert result["id"] == monitor_id @pytest.mark.asyncio -async def test_async_scrape_error_response(mock_api_key): - """Test async scrape error response handling""" +async def test_monitor_pause(client): + monitor_id = str(uuid4()) with aioresponses() as mocked: mocked.post( - "https://api.scrapegraphai.com/v1/scrape", - payload={ - "error": "Website not accessible", - "status": "error" - }, - status=400 + f"{API_BASE_URL}/monitor/{monitor_id}/pause", + payload={"id": monitor_id, "status": "paused"}, ) - - async with AsyncClient(api_key=mock_api_key) as client: - with pytest.raises(Exception): - await client.scrape(website_url="https://inaccessible-site.com") + result = await client.monitor.pause(monitor_id) + assert result["status"] == "paused" @pytest.mark.asyncio -async def test_async_scrape_processing_status(mock_api_key): - """Test async scrape processing status response""" +async def test_monitor_resume(client): + monitor_id = str(uuid4()) with aioresponses() as mocked: mocked.post( - "https://api.scrapegraphai.com/v1/scrape", - payload={ - "scrape_request_id": str(uuid4()), - "status": "processing", - "message": "Scrape job started" - }, + f"{API_BASE_URL}/monitor/{monitor_id}/resume", + payload={"id": monitor_id, "status": "active"}, ) - - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.scrape(website_url="https://example.com") - assert response["status"] == "processing" - assert "scrape_request_id" in response + result = await client.monitor.resume(monitor_id) + assert result["status"] == "active" @pytest.mark.asyncio -async def 
test_async_scrape_complex_html_response(mock_api_key): - """Test async scrape with complex HTML response""" - complex_html = """ - - - - - - Complex Page - - - -
- -
-
-

Welcome

-

This is a complex HTML page with multiple elements.

-
- Sample image - - -
Data 1Data 2
-
-
- - - - """ - +async def test_monitor_delete(client): + monitor_id = str(uuid4()) with aioresponses() as mocked: - mocked.post( - "https://api.scrapegraphai.com/v1/scrape", - payload={ - "scrape_request_id": str(uuid4()), - "status": "completed", - "html": complex_html, - }, + mocked.delete( + f"{API_BASE_URL}/monitor/{monitor_id}", + payload={"message": "deleted"}, ) + result = await client.monitor.delete(monitor_id) + assert result["message"] == "deleted" - async with AsyncClient(api_key=mock_api_key) as client: - response = await client.scrape(website_url="https://complex-example.com") - assert response["status"] == "completed" - assert "html" in response - assert "" in response["html"] - assert "Complex Page" in response["html"] - assert " - - - """ - +def test_monitor_delete(client): + monitor_id = str(uuid4()) responses.add( - responses.POST, - "https://api.scrapegraphai.com/v1/scrape", - json={ - "scrape_request_id": str(uuid4()), - "status": "completed", - "html": complex_html, - }, + responses.DELETE, + f"{API_BASE_URL}/monitor/{monitor_id}", + json={"message": "deleted"}, ) + result = client.monitor.delete(monitor_id) + assert result["message"] == "deleted" - with Client(api_key=mock_api_key) as client: - response = client.scrape(website_url="https://complex-example.com") - assert response["status"] == "completed" - assert "html" in response - assert "" in response["html"] - assert "Complex Page" in response["html"] - assert "