docs/guides/code_examples/playwright_crawler/browser_pool_page_hooks_example.py (new file)
@@ -0,0 +1,77 @@
from __future__ import annotations

import asyncio
import logging
from typing import TYPE_CHECKING, Any

from crawlee.browsers import BrowserPool
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
from crawlee.storages import KeyValueStore

if TYPE_CHECKING:
    from crawlee.browsers._browser_controller import BrowserController
    from crawlee.browsers._types import CrawleePage
    from crawlee.proxy_configuration import ProxyInfo

logger = logging.getLogger(__name__)


async def main() -> None:
    async with BrowserPool() as browser_pool:

        @browser_pool.pre_page_create_hook
        async def log_page_init(
            page_id: str,
            _browser_controller: BrowserController,
            _browser_new_context_options: dict[str, Any],
            _proxy_info: ProxyInfo | None,
        ) -> None:
            """Log when a new page is about to be created."""
            logger.info(f'Creating page {page_id}...')

        @browser_pool.post_page_create_hook
        async def set_viewport(
            crawlee_page: CrawleePage, _browser_controller: BrowserController
        ) -> None:
            """Set a fixed viewport size on each newly created page."""
            await crawlee_page.page.set_viewport_size({'width': 1280, 'height': 1024})

        @browser_pool.pre_page_close_hook
        async def save_screenshot(
            crawlee_page: CrawleePage, _browser_controller: BrowserController
        ) -> None:
            """Save a screenshot to KeyValueStore before each page is closed."""
            kvs = await KeyValueStore.open()

            screenshot = await crawlee_page.page.screenshot()
            await kvs.set_value(
                key=f'screenshot-{crawlee_page.id}',
                value=screenshot,
                content_type='image/png',
            )
            logger.info(f'Saved screenshot for page {crawlee_page.id}.')

        @browser_pool.post_page_close_hook
        async def log_page_closed(
            page_id: str, _browser_controller: BrowserController
        ) -> None:
            """Log after each page is closed."""
            logger.info(f'Page {page_id} closed successfully.')

        crawler = PlaywrightCrawler(
            browser_pool=browser_pool,
            max_requests_per_crawl=5,
        )

        @crawler.router.default_handler
        async def request_handler(context: PlaywrightCrawlingContext) -> None:
            context.log.info(f'Processing {context.request.url} ...')

            await context.enqueue_links()

        # Run the crawler with the initial list of URLs.
        await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
docs/guides/code_examples/playwright_crawler/pre_navigation_hook_example.py

@@ -17,14 +17,12 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
         await context.enqueue_links()
 
     @crawler.pre_navigation_hook
-    async def log_navigation_url(context: PlaywrightPreNavCrawlingContext) -> None:
+    async def configure_page(context: PlaywrightPreNavCrawlingContext) -> None:
         context.log.info(f'Navigating to {context.request.url} ...')
 
-        # will set a timeout for all navigation methods
-        context.page.set_default_navigation_timeout(600_000)
-
-        # will set the page size before you go to the target URL
-        await context.page.set_viewport_size({'width': 1280, 'height': 1024})
+        # block stylesheets, images, fonts and other static assets
+        # to speed up page loading
+        await context.block_requests()
 
     # Run the crawler with the initial list of URLs.
     await crawler.run(['https://crawlee.dev'])
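The `block_requests()` call above uses Crawlee's default blocklist of static-asset patterns. It can also be tuned per site; a minimal sketch, assuming the `url_patterns`/`extra_url_patterns` parameters of the `BlockRequestsFunction` protocol (verify against your Crawlee version):

```python
from crawlee.crawlers import PlaywrightCrawler, PlaywrightPreNavCrawlingContext

crawler = PlaywrightCrawler()


@crawler.pre_navigation_hook
async def block_tracking_requests(context: PlaywrightPreNavCrawlingContext) -> None:
    # Keep the default static-asset patterns and additionally block
    # analytics endpoints (illustrative patterns, not from the PR).
    await context.block_requests(extra_url_patterns=['*googletagmanager*', '*analytics*'])
```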
16 changes: 12 additions & 4 deletions docs/guides/playwright_crawler.mdx
@@ -11,7 +11,7 @@ import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';
 import MultipleLaunchExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/multiple_launch_example.py';
 import BrowserConfigurationExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/browser_configuration_example.py';
 import PreNavigationExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/pre_navigation_hook_example.py';
-
+import BrowserPoolPageHooksExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/browser_pool_page_hooks_example.py';
 import PluginBrowserConfigExample from '!!raw-loader!./code_examples/playwright_crawler/plugin_browser_configuration_example.py';
 
 A <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> is a browser-based crawler. In contrast to HTTP-based crawlers like <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> or <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>, it uses a real browser to render pages and extract data. It is built on top of the [Playwright](https://playwright.dev/python/) browser automation library. While browser-based crawlers are typically slower and less efficient than HTTP-based crawlers, they can handle dynamic, client-side rendered sites that standard HTTP-based crawlers cannot manage.
@@ -57,14 +57,22 @@ You can also configure each plugin used by <ApiLink to="class/BrowserPool">`Brow

 For an example of how to implement a custom browser plugin, see the [Camoufox example](../examples/playwright-crawler-with-camoufox). [Camoufox](https://camoufox.com/) is a stealth browser plugin designed to reduce detection by anti-scraping measures and is fully compatible with <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink>.
 
-## Page configuration with pre-navigation hooks
+## Page configuration with lifecycle page hooks
 
+For additional setup or event-driven actions around page creation and closure, the <ApiLink to="class/BrowserPool">`BrowserPool`</ApiLink> exposes four lifecycle hooks: <ApiLink to="class/BrowserPool#pre_page_create_hook">`pre_page_create_hook`</ApiLink>, <ApiLink to="class/BrowserPool#post_page_create_hook">`post_page_create_hook`</ApiLink>, <ApiLink to="class/BrowserPool#pre_page_close_hook">`pre_page_close_hook`</ApiLink>, and <ApiLink to="class/BrowserPool#post_page_close_hook">`post_page_close_hook`</ApiLink>. To use them, create a `BrowserPool` instance and pass it to <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> via the `browser_pool` argument.
+
+<RunnableCodeBlock className="language-python" language="python">
+    {BrowserPoolPageHooksExample}
+</RunnableCodeBlock>
+
+## Navigation hooks
+
-In some use cases, you may need to configure the [page](https://playwright.dev/python/docs/api/class-page) before it navigates to the target URL. For instance, you might set navigation timeouts or manipulate other page-level settings. For such cases you can use the <ApiLink to="class/PlaywrightCrawler#pre_navigation_hook">`pre_navigation_hook`</ApiLink> method of the <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink>. This method is called before the page navigates to the target URL and allows you to configure the page instance.
+Navigation hooks allow for additional configuration at specific points during page navigation. For example, the <ApiLink to="class/PlaywrightCrawler#pre_navigation_hook">`pre_navigation_hook`</ApiLink> is called before each navigation and provides <ApiLink to="class/PlaywrightPreNavCrawlingContext">`PlaywrightPreNavCrawlingContext`</ApiLink> - including the [page](https://playwright.dev/python/docs/api/class-page) instance and a <ApiLink to="class/PlaywrightPreNavCrawlingContext#block_requests">`block_requests`</ApiLink> helper for filtering unwanted resource types and URL patterns. See the [block requests example](https://crawlee.dev/python/docs/examples/playwright-crawler-with-block-requests) for a dedicated walkthrough.
 
 <RunnableCodeBlock className="language-python" language="python">
     {PreNavigationExample}
 </RunnableCodeBlock>
 
 ## Conclusion
 
-This guide introduced the <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> and explained how to configure it using <ApiLink to="class/BrowserPool">`BrowserPool`</ApiLink> and <ApiLink to="class/PlaywrightBrowserPlugin">`PlaywrightBrowserPlugin`</ApiLink>. You learned how to launch multiple browsers, configure browser and context settings, and apply pre-navigation hooks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!
+This guide introduced the <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> and explained how to configure it using <ApiLink to="class/BrowserPool">`BrowserPool`</ApiLink> and <ApiLink to="class/PlaywrightBrowserPlugin">`PlaywrightBrowserPlugin`</ApiLink>. You learned how to launch multiple browsers, configure browser and context settings, use <ApiLink to="class/BrowserPool">`BrowserPool`</ApiLink> lifecycle page hooks, and apply navigation hooks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!
92 changes: 90 additions & 2 deletions src/crawlee/browsers/_browser_pool.py
@@ -19,7 +19,7 @@
 from crawlee.browsers._types import BrowserType, CrawleePage
 
 if TYPE_CHECKING:
-    from collections.abc import Mapping, Sequence
+    from collections.abc import Awaitable, Callable, Mapping, Sequence
     from pathlib import Path
     from types import TracebackType
 
@@ -99,6 +99,13 @@ def __init__(
         self._pages = WeakValueDictionary[str, CrawleePage]()  # Track the pages in the pool
         self._plugins_cycle = itertools.cycle(self._plugins)  # Cycle through the plugins
 
+        self._pre_page_create_hooks: list[
+            Callable[[str, BrowserController, dict[str, Any], ProxyInfo | None], Awaitable[None]]
+        ] = []
+        self._post_page_create_hooks: list[Callable[[CrawleePage, BrowserController], Awaitable[None]]] = []
+        self._pre_page_close_hooks: list[Callable[[CrawleePage, BrowserController], Awaitable[None]]] = []
+        self._post_page_close_hooks: list[Callable[[str, BrowserController], Awaitable[None]]] = []
+
         # Flag to indicate the context state.
         self._active = False
 
@@ -301,9 +308,15 @@ async def _get_new_page(
         try:
             if not browser_controller:
                 browser_controller = await asyncio.wait_for(self._launch_new_browser(plugin), timeout)
+            browser_new_context_options = dict(plugin.browser_new_context_options)
+
+            await self._execute_hooks(
+                self._pre_page_create_hooks, page_id, browser_controller, browser_new_context_options, proxy_info
+            )
+
             page = await asyncio.wait_for(
                 browser_controller.new_page(
-                    browser_new_context_options=plugin.browser_new_context_options,
+                    browser_new_context_options=browser_new_context_options,
                     proxy_info=proxy_info,
                 ),
                 timeout,
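Because `_get_new_page` now copies the plugin's `browser_new_context_options` into a fresh dict and hands it to the pre-create hooks before calling `new_page`, a hook can mutate that dict to influence the context the page is created in. A minimal sketch under that reading (`locale` is just an illustrative Playwright context option; the shared-context caveat in the `pre_page_create_hook` docstring below still applies):

```python
from typing import Any

from crawlee.browsers import BrowserPool
from crawlee.browsers._browser_controller import BrowserController
from crawlee.proxy_configuration import ProxyInfo

browser_pool = BrowserPool()


@browser_pool.pre_page_create_hook
async def force_locale(
    _page_id: str,
    _browser_controller: BrowserController,
    browser_new_context_options: dict[str, Any],
    _proxy_info: ProxyInfo | None,
) -> None:
    # The mutated dict is what BrowserController.new_page() receives.
    browser_new_context_options['locale'] = 'en-US'
```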
@@ -319,6 +332,11 @@
         crawlee_page = CrawleePage(id=page_id, page=page, browser_type=plugin.browser_type)
         self._pages[page_id] = crawlee_page
         self._total_pages_count += 1
+
+        await self._execute_hooks(self._post_page_create_hooks, crawlee_page, browser_controller)
+
+        self._override_page_close(crawlee_page, browser_controller)
+
         return crawlee_page
 
     def _pick_browser_with_free_capacity(
@@ -357,3 +375,73 @@ async def _close_inactive_browsers(self) -> None:
             if not browser.pages:
                 await browser.close()
                 self._inactive_browsers.remove(browser)
+
+    async def _execute_hooks(self, hooks: list[Callable[..., Awaitable[None]]], *args: Any) -> None:
+        """Execute the provided hooks with the given arguments."""
+        for hook in hooks:
+            await hook(*args)
+
+    def _override_page_close(self, crawlee_page: CrawleePage, browser_controller: BrowserController) -> None:
+        """Override the page's close method to execute pre and post close hooks."""
+        if self._pre_page_close_hooks or self._post_page_close_hooks:
+            original_close = crawlee_page.page.close
+
+            async def close_with_hooks(*args: Any, **kwargs: Any) -> None:
+                try:
+                    await self._execute_hooks(self._pre_page_close_hooks, crawlee_page, browser_controller)
+                finally:
+                    await original_close(*args, **kwargs)
+                    await self._execute_hooks(self._post_page_close_hooks, crawlee_page.id, browser_controller)
+
+            crawlee_page.page.close: Callable[..., Awaitable[None]] = close_with_hooks
+
+    def pre_page_create_hook(
+        self, hook: Callable[[str, BrowserController, dict[str, Any], ProxyInfo | None], Awaitable[None]]
+    ) -> Callable[[str, BrowserController, dict[str, Any], ProxyInfo | None], Awaitable[None]]:
+        """Register a hook to be called just before a new page is created.
+
+        The hook receives the page ID, `BrowserController`, `browser_new_context_options`, and `ProxyInfo`.
+        Note that depending on the `BrowserController` implementation, `browser_new_context_options` may not
+        apply to every page individually. For example, `PlaywrightBrowserController` with
+        ``use_incognito_pages=False`` shares a single context across all pages, so the options are applied
+        only when the context is first created.
+        """
+        self._pre_page_create_hooks.append(hook)
+
+        return hook
+
+    def post_page_create_hook(
+        self, hook: Callable[[CrawleePage, BrowserController], Awaitable[None]]
+    ) -> Callable[[CrawleePage, BrowserController], Awaitable[None]]:
+        """Register a hook to be called right after a new page is created.
+
+        The hook receives the newly created `CrawleePage` and the `BrowserController`. Use it to apply
+        changes to all pages, such as injecting scripts or configuring request interception.
+        """
+        self._post_page_create_hooks.append(hook)
+
+        return hook
+
+    def pre_page_close_hook(
+        self, hook: Callable[[CrawleePage, BrowserController], Awaitable[None]]
+    ) -> Callable[[CrawleePage, BrowserController], Awaitable[None]]:
+        """Register a hook to be called just before a page is closed.
+
+        The hook receives the `CrawleePage` and the `BrowserController`. Use it to collect last-second data,
+        such as taking a screenshot or saving page state before the page is destroyed.
+        """
+        self._pre_page_close_hooks.append(hook)
+
+        return hook
+
+    def post_page_close_hook(
+        self, hook: Callable[[str, BrowserController], Awaitable[None]]
+    ) -> Callable[[str, BrowserController], Awaitable[None]]:
+        """Register a hook to be called right after a page is closed.
+
+        The hook receives the page ID and the `BrowserController`. Use it for cleanup or logging
+        after a page's lifecycle ends.
+        """
+        self._post_page_close_hooks.append(hook)
+
+        return hook
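Since each registration method appends the hook and then returns it unchanged, the four decorators also work as plain calls, which is handy for registering a hook defined elsewhere. A small usage sketch:

```python
from crawlee.browsers import BrowserPool
from crawlee.browsers._browser_controller import BrowserController
from crawlee.browsers._types import CrawleePage


async def log_new_page(crawlee_page: CrawleePage, _controller: BrowserController) -> None:
    print(f'Page {crawlee_page.id} created.')


browser_pool = BrowserPool()
# Equivalent to decorating log_new_page with @browser_pool.post_page_create_hook.
browser_pool.post_page_create_hook(log_new_page)
```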