diff --git a/src/askui/__init__.py b/src/askui/__init__.py index fc0938a0..0f0794c0 100644 --- a/src/askui/__init__.py +++ b/src/askui/__init__.py @@ -58,7 +58,7 @@ from .multi_device_agent import MultiDeviceAgent try: - from .web_agent import WebVisionAgent + from .web_agent import WebAgent, WebVisionAgent from .web_testing_agent import WebTestingAgent _WEB_AGENTS_AVAILABLE = True @@ -113,4 +113,4 @@ __all__ += ["AndroidAgent", "AndroidVisionAgent", "MultiDeviceAgent"] if _WEB_AGENTS_AVAILABLE: - __all__ += ["WebVisionAgent", "WebTestingAgent"] + __all__ += ["WebAgent", "WebVisionAgent", "WebTestingAgent"] diff --git a/src/askui/models/shared/__init__.py b/src/askui/models/shared/__init__.py index fe675d84..8168ab16 100644 --- a/src/askui/models/shared/__init__.py +++ b/src/askui/models/shared/__init__.py @@ -1,9 +1,11 @@ from .android_base_tool import AndroidBaseTool from .computer_base_tool import ComputerBaseTool +from .playwright_base_tool import PlaywrightBaseTool from .tool_tags import ToolTags __all__ = [ "AndroidBaseTool", "ComputerBaseTool", + "PlaywrightBaseTool", "ToolTags", ] diff --git a/src/askui/models/shared/playwright_base_tool.py b/src/askui/models/shared/playwright_base_tool.py new file mode 100644 index 00000000..1415c99a --- /dev/null +++ b/src/askui/models/shared/playwright_base_tool.py @@ -0,0 +1,57 @@ +from typing import Any + +from askui.models.shared.tool_tags import ToolTags +from askui.models.shared.tools import ToolWithAgentOS +from askui.tools.agent_os import AgentOs +from askui.tools.agent_os_type_error import AgentOsTypeError +from askui.tools.android.agent_os import AndroidAgentOs +from askui.tools.playwright.agent_os import PlaywrightAgentOs + + +class PlaywrightBaseTool(ToolWithAgentOS): + """Tool base class that has the Playwright AgentOs available.""" + + def __init__( + self, + agent_os: AgentOs | None = None, + required_tags: list[str] | None = None, + **kwargs: Any, + ) -> None: + super().__init__( + 
required_tags=[ToolTags.PLAYWRIGHT.value] + (required_tags or []), + agent_os=agent_os, + **kwargs, + ) + + @property + def agent_os(self) -> PlaywrightAgentOs: + """Get the agent OS. + + Returns: + PlaywrightAgentOs: The Playwright agent OS instance. + """ + agent_os = super().agent_os + if not isinstance(agent_os, PlaywrightAgentOs): + raise AgentOsTypeError( + expected_type=PlaywrightAgentOs, + actual_type=type(agent_os), + ) + return agent_os + + @agent_os.setter + def agent_os(self, agent_os: AgentOs | AndroidAgentOs | PlaywrightAgentOs) -> None: + """Set the agent OS. + + Args: + agent_os (AgentOs | AndroidAgentOs | PlaywrightAgentOs): The agent OS + instance to set. + + Raises: + TypeError: If the agent OS is not a `PlaywrightAgentOs` instance. + """ + if not isinstance(agent_os, PlaywrightAgentOs): + raise AgentOsTypeError( + expected_type=PlaywrightAgentOs, + actual_type=type(agent_os), + ) + self._agent_os = agent_os diff --git a/src/askui/models/shared/tool_tags.py b/src/askui/models/shared/tool_tags.py index 48abaffc..153e4bee 100644 --- a/src/askui/models/shared/tool_tags.py +++ b/src/askui/models/shared/tool_tags.py @@ -7,3 +7,4 @@ class ToolTags(str, Enum): ANDROID = "android" COMPUTER = "computer" SCALED_AGENT_OS = "scaled_agent_os" + PLAYWRIGHT = "playwright" diff --git a/src/askui/tools/playwright/agent_os.py b/src/askui/tools/playwright/agent_os.py index f4a4b064..9c200939 100644 --- a/src/askui/tools/playwright/agent_os.py +++ b/src/askui/tools/playwright/agent_os.py @@ -16,24 +16,29 @@ ) from typing_extensions import override +from askui.reporting import NULL_REPORTER, Reporter +from askui.utils.annotated_image import AnnotatedImage + from ..agent_os import AgentOs, Display, DisplaySize, InputEvent, ModifierKey, PcKey class PlaywrightAgentOs(AgentOs): - """ - Playwright-based implementation of AgentOs. + """Playwright-based implementation of `AgentOs`. 
This implementation uses Playwright's Python SDK to control browser automation and simulate user interactions. It provides mouse control, keyboard input, and screen capture functionality through a browser context. Args: + reporter (Reporter, optional): Reporter used for reporting. Defaults to + `NULL_REPORTER`. browser_type (Literal["chromium", "firefox", "webkit"], optional): The browser type to use. Defaults to `"chromium"`. headless (bool, optional): Whether to run the browser in headless mode. Defaults to `False`. - viewport_size (ViewportSize | None, optional): The viewport size. - Defaults to `None` (uses default). + viewport_size (ViewportSize | None, optional): The viewport size. When + ``None``, the browser inherits the system's native DPI and window + size (``no_viewport=True``). Defaults to `None`. slow_mo (int, optional): Slows down Playwright operations by the specified amount of milliseconds. Defaults to `0`. install_browser (bool, optional): Whether to install browser on connection. @@ -42,8 +47,11 @@ class PlaywrightAgentOs(AgentOs): (requires root permissions). Defaults to `False`. 
""" + _REPORTER_ROLE_NAME: str = "PlaywrightAgentOS" + def __init__( self, + reporter: Reporter = NULL_REPORTER, browser_type: Literal["chromium", "firefox", "webkit"] = "chromium", headless: bool = False, viewport_size: ViewportSize | None = None, @@ -63,6 +71,7 @@ def __init__( self._browser: Browser | None = None self._context: BrowserContext | None = None self._page: Page | None = None + self._reporter: Reporter = reporter # Event listening state self._listening = False @@ -112,6 +121,14 @@ def _install_system_dependencies(self) -> None: ) raise RuntimeError(error_msg) from e + def _annotated_screenshot( + self, + point_list: list[tuple[int, int]], + ) -> AnnotatedImage: + """Capture a screenshot and wrap it in an `AnnotatedImage` with annotations.""" + screenshot = self.screenshot(report=False) + return AnnotatedImage(lambda: screenshot, point_list) + @override def connect(self) -> None: """Establishes a synchronous connection to the browser.""" @@ -129,13 +146,28 @@ def connect(self) -> None: headless=self._headless, slow_mo=self._slow_mo, ) - self._context = self._browser.new_context( - viewport=self._viewport_size, - ) + if self._viewport_size is not None: + self._context = self._browser.new_context( + viewport=self._viewport_size, + ) + else: + # Use no_viewport to inherit the system's native DPI and window + # size. Without this, Playwright defaults to 1280x720 with + # deviceScaleFactor=1. On high-DPI screens (e.g. macOS Retina) + # Chromium compensates by zooming the page 2x and briefly + # un-zooming every time a screenshot is captured, causing a + # visible flicker. + self._context = self._browser.new_context( + no_viewport=True, + ) self._page = self._context.new_page() # Navigate to a blank page to ensure we have a working page - self._page.goto("data:text/html,") + self._page.goto("data:text/html,

Starting...

") + self._reporter.add_message( + self._REPORTER_ROLE_NAME, + "Connected to playwright browser", + ) @override def disconnect(self) -> None: @@ -159,10 +191,14 @@ def disconnect(self) -> None: self._playwright.stop() self._playwright = None + self._reporter.add_message( + self._REPORTER_ROLE_NAME, + "Disconnected from playwright os", + ) + @override def screenshot(self, report: bool = True) -> Image.Image: - """ - Captures a screenshot of the current page. + """Capture a screenshot of the current page. Args: report (bool, optional): Whether to include the screenshot in @@ -175,23 +211,34 @@ def screenshot(self, report: bool = True) -> Image.Image: error_msg = "No active page. Call connect() first." raise RuntimeError(error_msg) - screenshot_bytes = self._page.screenshot() - return Image.open(io.BytesIO(screenshot_bytes)) + screenshot_bytes = self._page.screenshot(scale="css") + screenshot = Image.open(io.BytesIO(screenshot_bytes)) + if report: + self._reporter.add_message( + self._REPORTER_ROLE_NAME, "screenshot()", screenshot + ) + return screenshot @override - def mouse_move(self, x: int, y: int, _duration: int = 500) -> None: - """ - Moves the mouse cursor to specified coordinates on the page. + def mouse_move(self, x: int, y: int, duration: int = 500) -> None: + """Move the mouse cursor to specified coordinates on the page. Args: x (int): The horizontal coordinate (in pixels) to move to. y (int): The vertical coordinate (in pixels) to move to. - _duration (int): Unused parameter as it is not applicable here. + duration (int, optional): Ignored — Playwright moves the mouse + instantly. Kept for compatibility with the base class. + Defaults to `500`. """ if not self._page: error_msg = "No active page. Call connect() first." 
raise RuntimeError(error_msg) + self._reporter.add_message( + self._REPORTER_ROLE_NAME, + f"mouse_move(x={x}, y={y})", + self._annotated_screenshot([(x, y)]), + ) self._page.mouse.move(x, y) @override @@ -208,9 +255,19 @@ def type(self, text: str, typing_speed: int = 50) -> None: error_msg = "No active page. Call connect() first." raise RuntimeError(error_msg) + self._reporter.add_message( + self._REPORTER_ROLE_NAME, + f"Typing text: '{text}'", + self.screenshot(report=False), + ) # Convert typing speed from CPM to delay between characters delay = 1000 / typing_speed if typing_speed > 0 else 0 self._page.keyboard.type(text, delay=delay) + self._reporter.add_message( + self._REPORTER_ROLE_NAME, + f"After typing text: '{text}'", + self.screenshot(report=False), + ) @override def click( @@ -224,9 +281,19 @@ def click( button to click. Defaults to `"left"`. count (int, optional): Number of times to click. Defaults to `1`. """ + self._reporter.add_message( + self._REPORTER_ROLE_NAME, + f"click(button={button}, count={count})", + self.screenshot(report=False), + ) for _ in range(count): self.mouse_down(button) self.mouse_up(button) + self._reporter.add_message( + self._REPORTER_ROLE_NAME, + f"After click(button={button}, count={count})", + self.screenshot(report=False), + ) @override def mouse_down(self, button: Literal["left", "middle", "right"] = "left") -> None: @@ -241,6 +308,10 @@ def mouse_down(self, button: Literal["left", "middle", "right"] = "left") -> Non error_msg = "No active page. Call connect() first." 
raise RuntimeError(error_msg) + self._reporter.add_message( + self._REPORTER_ROLE_NAME, + f"mouse_down(button={button})", + ) self._page.mouse.down(button=button) @override @@ -257,6 +328,10 @@ def mouse_up(self, button: Literal["left", "middle", "right"] = "left") -> None: raise RuntimeError(error_msg) self._page.mouse.up(button=button) + self._reporter.add_message( + self._REPORTER_ROLE_NAME, + f"mouse_up(button={button})", + ) @override def mouse_scroll(self, dx: int, dy: int) -> None: @@ -273,7 +348,17 @@ def mouse_scroll(self, dx: int, dy: int) -> None: error_msg = "No active page. Call connect() first." raise RuntimeError(error_msg) + self._reporter.add_message( + self._REPORTER_ROLE_NAME, + f"mouse_scroll(dx={dx}, dy={dy})", + self.screenshot(report=False), + ) self._page.mouse.wheel(delta_x=dx, delta_y=dy) + self._reporter.add_message( + self._REPORTER_ROLE_NAME, + f"After mouse_scroll(dx={dx}, dy={dy})", + self.screenshot(report=False), + ) @override def keyboard_pressed( @@ -291,6 +376,11 @@ def keyboard_pressed( error_msg = "No active page. Call connect() first." raise RuntimeError(error_msg) + self._reporter.add_message( + self._REPORTER_ROLE_NAME, + f"keyboard_pressed(key={key}, modifier_keys={modifier_keys})", + self.screenshot(report=False), + ) # Press modifier keys first if modifier_keys: for modifier in modifier_keys: @@ -323,6 +413,12 @@ def keyboard_release( for modifier in modifier_keys: self._page.keyboard.up(self._convert_key(modifier)) + self._reporter.add_message( + self._REPORTER_ROLE_NAME, + f"keyboard_release(key={key}, modifier_keys={modifier_keys})", + self.screenshot(report=False), + ) + @override def keyboard_tap( self, @@ -343,6 +439,11 @@ def keyboard_tap( error_msg = "No active page. Call connect() first." 
raise RuntimeError(error_msg) + self._reporter.add_message( + self._REPORTER_ROLE_NAME, + (f"keyboard_tap(key={key}, modifier_keys={modifier_keys}, count={count})"), + self.screenshot(report=False), + ) for _ in range(count): # Press modifier keys first if modifier_keys: @@ -357,6 +458,15 @@ def keyboard_tap( for modifier in modifier_keys: self._page.keyboard.up(self._convert_key(modifier)) + self._reporter.add_message( + self._REPORTER_ROLE_NAME, + ( + f"After keyboard_tap(key={key}, " + f"modifier_keys={modifier_keys}, count={count})" + ), + self.screenshot(report=False), + ) + @override def retrieve_active_display(self) -> Display: """ @@ -443,21 +553,50 @@ def goto(self, url: str) -> None: error_msg = "No active page. Call connect() first." raise RuntimeError(error_msg) + self._reporter.add_message( + self._REPORTER_ROLE_NAME, + f"goto(url='{url}')", + ) self._page.goto(url) + self._reporter.add_message( + self._REPORTER_ROLE_NAME, + f"After goto(url='{url}')", + self.screenshot(report=False), + ) def back(self) -> None: + """Navigate back to the previous page in the browser history.""" if not self._page: error_msg = "No active page. Call connect() first." raise RuntimeError(error_msg) + self._reporter.add_message( + self._REPORTER_ROLE_NAME, + "back()", + ) self._page.go_back() + self._reporter.add_message( + self._REPORTER_ROLE_NAME, + "After back()", + self.screenshot(report=False), + ) def forward(self) -> None: + """Navigate forward to the next page in the browser history.""" if not self._page: error_msg = "No active page. Call connect() first." raise RuntimeError(error_msg) + self._reporter.add_message( + self._REPORTER_ROLE_NAME, + "forward()", + ) self._page.go_forward() + self._reporter.add_message( + self._REPORTER_ROLE_NAME, + "After forward()", + self.screenshot(report=False), + ) def get_page_title(self) -> str: """ @@ -470,7 +609,12 @@ def get_page_title(self) -> str: error_msg = "No active page. Call connect() first." 
raise RuntimeError(error_msg) - return self._page.title() + title = self._page.title() + self._reporter.add_message( + self._REPORTER_ROLE_NAME, + f"get_page_title() -> '{title}'", + ) + return title def get_page_url(self) -> str: """ @@ -483,4 +627,29 @@ def get_page_url(self) -> str: error_msg = "No active page. Call connect() first." raise RuntimeError(error_msg) - return self._page.url + url = self._page.url + self._reporter.add_message( + self._REPORTER_ROLE_NAME, + f"get_page_url() -> '{url}'", + ) + return url + + @property + def tags(self) -> list[str]: + """Get the tags for this agent OS. + + Returns: + list[str]: A list of tags that identify this agent OS type. + """ + if not hasattr(self, "_tags"): + self._tags = ["playwright"] + return self._tags + + @tags.setter + def tags(self, tags: list[str]) -> None: + """Set the tags for this agent OS. + + Args: + tags (list[str]): A list of tags that identify this agent OS type. + """ + self._tags = tags diff --git a/src/askui/tools/playwright/agent_os_facade.py b/src/askui/tools/playwright/agent_os_facade.py new file mode 100644 index 00000000..091ff804 --- /dev/null +++ b/src/askui/tools/playwright/agent_os_facade.py @@ -0,0 +1,123 @@ +from typing import Literal + +from PIL import Image + +from askui.models.shared.tool_tags import ToolTags +from askui.tools.agent_os import Display, ModifierKey, PcKey +from askui.tools.playwright.agent_os import PlaywrightAgentOs +from askui.utils.image_utils import scale_coordinates, scale_image_to_fit + + +class PlaywrightAgentOsFacade(PlaywrightAgentOs): + """Facade for `PlaywrightAgentOs` that adds coordinate scaling. + + Screenshots are scaled down to a fixed target resolution so that the + AI model always sees a consistent image size. Coordinate-based inputs + (``mouse_move``) are scaled back up to the real page resolution before + being forwarded to the underlying agent OS. + + Args: + agent_os (PlaywrightAgentOs): The real Playwright agent OS to wrap. 
+ """ + + def __init__(self, agent_os: PlaywrightAgentOs) -> None: + self._agent_os = agent_os + self._target_resolution: tuple[int, int] = (1024, 768) + self._real_screen_resolution: tuple[int, int] | None = None + self.tags = self._agent_os.tags + [ToolTags.SCALED_AGENT_OS.value] + + def connect(self) -> None: + self._agent_os.connect() + self._real_screen_resolution = self._agent_os.screenshot( + report=False, + ).size + + def disconnect(self) -> None: + self._agent_os.disconnect() + self._real_screen_resolution = None + + def screenshot(self, report: bool = True) -> Image.Image: + screenshot = self._agent_os.screenshot(report=report) + self._real_screen_resolution = screenshot.size + return scale_image_to_fit(screenshot, self._target_resolution) + + def _scale_coordinates( + self, + x: int, + y: int, + from_agent: bool = True, + ) -> tuple[int, int]: + if self._real_screen_resolution is None: + self._real_screen_resolution = self._agent_os.screenshot( + report=False, + ).size + return scale_coordinates( + (x, y), + self._real_screen_resolution, + self._target_resolution, + inverse=from_agent, + ) + + def mouse_move(self, x: int, y: int, duration: int = 500) -> None: + scaled_x, scaled_y = self._scale_coordinates(x, y) + # scaled_x, scaled_y = x, y + self._agent_os.mouse_move(scaled_x, scaled_y, duration) + + def type(self, text: str, typing_speed: int = 50) -> None: + self._agent_os.type(text, typing_speed) + + def click( + self, + button: Literal["left", "middle", "right"] = "left", + count: int = 1, + ) -> None: + self._agent_os.click(button, count) + + def mouse_down(self, button: Literal["left", "middle", "right"] = "left") -> None: + self._agent_os.mouse_down(button) + + def mouse_up(self, button: Literal["left", "middle", "right"] = "left") -> None: + self._agent_os.mouse_up(button) + + def mouse_scroll(self, dx: int, dy: int) -> None: + self._agent_os.mouse_scroll(dx, dy) + + def keyboard_pressed( + self, + key: PcKey | ModifierKey, + modifier_keys: 
list[ModifierKey] | None = None, + ) -> None: + self._agent_os.keyboard_pressed(key, modifier_keys) + + def keyboard_release( + self, + key: PcKey | ModifierKey, + modifier_keys: list[ModifierKey] | None = None, + ) -> None: + self._agent_os.keyboard_release(key, modifier_keys) + + def keyboard_tap( + self, + key: PcKey | ModifierKey, + modifier_keys: list[ModifierKey] | None = None, + count: int = 1, + ) -> None: + self._agent_os.keyboard_tap(key, modifier_keys, count) + + def retrieve_active_display(self) -> Display: + return self._agent_os.retrieve_active_display() + + def goto(self, url: str) -> None: + self._agent_os.goto(url) + + def back(self) -> None: + self._agent_os.back() + + def forward(self) -> None: + self._agent_os.forward() + + def get_page_title(self) -> str: + return self._agent_os.get_page_title() + + def get_page_url(self) -> str: + return self._agent_os.get_page_url() diff --git a/src/askui/tools/playwright/tools.py b/src/askui/tools/playwright/tools.py index 144eec41..ba08f273 100644 --- a/src/askui/tools/playwright/tools.py +++ b/src/askui/tools/playwright/tools.py @@ -1,15 +1,420 @@ +import re +from typing import get_args + +from PIL import Image from typing_extensions import override -from askui.models.shared.tools import Tool +from askui.models.shared import PlaywrightBaseTool, ToolTags +from askui.tools.agent_os import ModifierKey, MouseButton, PcKey from askui.tools.playwright.agent_os import PlaywrightAgentOs -class PlaywrightGotoTool(Tool): +class PlaywrightScreenshotTool(PlaywrightBaseTool): + """Takes a screenshot of the current browser page.""" + + def __init__(self, agent_os: PlaywrightAgentOs | None = None) -> None: + super().__init__( + name="screenshot", + description="Take a screenshot of the current browser page.", + agent_os=agent_os, + required_tags=[ToolTags.SCALED_AGENT_OS.value], + ) + self.is_cacheable = True + + @override + def __call__(self) -> tuple[str, Image.Image]: + screenshot = self.agent_os.screenshot() + return 
"Screenshot was taken.", screenshot + + +class PlaywrightMouseMoveTool(PlaywrightBaseTool): + """Moves the mouse to a specific position on the page.""" + + def __init__(self, agent_os: PlaywrightAgentOs | None = None) -> None: + super().__init__( + name="move_mouse", + description=( + "Move the mouse to a specific position on the page. " + "Pass x and y as separate integer values, not as a combined string." + ), + input_schema={ + "type": "object", + "properties": { + "x": { + "type": "integer", + "description": ( + "The x (horizontal) pixel coordinate. " + "Must be a single integer, e.g. 330." + ), + }, + "y": { + "type": "integer", + "description": ( + "The y (vertical) pixel coordinate. " + "Must be a single integer, e.g. 182." + ), + }, + }, + "required": ["x", "y"], + }, + agent_os=agent_os, + required_tags=[ToolTags.SCALED_AGENT_OS.value], + ) + self.is_cacheable = True + + @override + def __call__(self, x: int, y: int) -> str: + # The agent occasionally passes coordinates as strings instead of ints. + # We parse them to handle both cases. + if not (isinstance(x, int) and isinstance(y, int)): + x, y = self._parse_coordinates(x, y) # type: ignore[unreachable] + self.agent_os.mouse_move(x, y) + return f"Mouse was moved to ({x}, {y})." + + @staticmethod + def _parse_coordinates(x: float | str, y: float | str) -> tuple[int, int]: + number_pattern = re.compile(r"-?\d+") + combined = f"{x},{y}" + numbers = number_pattern.findall(combined) + if not len(numbers) == 2: + error_msg = ( + "Could not parse mouse_move coordinates from provided " + f"parameters x={x}, y={y}. The parameters x and y must " + "be passed as separate integer values!" 
+ ) + raise ValueError(error_msg) + return int(numbers[0]), int(numbers[1]) + + +class PlaywrightMouseClickTool(PlaywrightBaseTool): + """Clicks the mouse button at the current position on the page.""" + + def __init__(self, agent_os: PlaywrightAgentOs | None = None) -> None: + super().__init__( + name="mouse_click", + description=( + "Click and release the mouse button at the current" + " position on the page." + ), + input_schema={ + "type": "object", + "properties": { + "mouse_button": { + "type": "string", + "description": "The mouse button to click.", + "enum": get_args(MouseButton), + }, + "number_of_clicks": { + "type": "integer", + "description": ( + "The number of times to click the mouse button." + " Defaults to 1" + ), + "default": 1, + }, + }, + "required": ["mouse_button"], + }, + agent_os=agent_os, + ) + self.is_cacheable = True + + @override + def __call__(self, mouse_button: MouseButton, number_of_clicks: int = 1) -> str: + self.agent_os.click(mouse_button, number_of_clicks) + return f"Mouse button {mouse_button} was clicked {number_of_clicks} times." + + +class PlaywrightMouseScrollTool(PlaywrightBaseTool): + """Scrolls the mouse wheel at the current position on the page.""" + + def __init__(self, agent_os: PlaywrightAgentOs | None = None) -> None: + super().__init__( + name="mouse_scroll", + description=( + "Scroll the mouse wheel at the current position on the page. " + "Positive dy scrolls down, negative dy scrolls up. " + "Start with dy=150 or dy=-150 for a normal scroll " + "and adjust based on the result." + ), + input_schema={ + "type": "object", + "properties": { + "dx": { + "type": "integer", + "description": ( + "The horizontal scroll amount. " + "Positive values scroll right, " + "negative values scroll left. " + "Use 0 if no horizontal scrolling is needed." + ), + }, + "dy": { + "type": "integer", + "description": ( + "The vertical scroll amount. " + "Positive values scroll down, negative values scroll up. 
" + "Use 0 if no vertical scrolling is needed." + ), + }, + }, + "required": ["dx", "dy"], + }, + agent_os=agent_os, + required_tags=[ToolTags.SCALED_AGENT_OS.value], + ) + self.is_cacheable = True + + @override + def __call__(self, dx: int, dy: int) -> str: + self.agent_os.mouse_scroll(dx, dy) + return f"Mouse was scrolled by ({dx}, {dy})." + + +class PlaywrightMouseHoldDownTool(PlaywrightBaseTool): + """Holds down the mouse button at the current position on the page.""" + + def __init__(self, agent_os: PlaywrightAgentOs | None = None) -> None: + super().__init__( + name="mouse_hold_down", + description=( + "Hold down the mouse button at the current position on the page." + ), + input_schema={ + "type": "object", + "properties": { + "mouse_button": { + "type": "string", + "description": "The mouse button to hold down.", + "enum": get_args(MouseButton), + }, + }, + "required": ["mouse_button"], + }, + agent_os=agent_os, + ) + self.is_cacheable = True + + @override + def __call__(self, mouse_button: MouseButton) -> str: + self.agent_os.mouse_down(mouse_button) + return f"Mouse button {mouse_button} is now held down." + + +class PlaywrightMouseReleaseTool(PlaywrightBaseTool): + """Releases the mouse button at the current position on the page.""" + + def __init__(self, agent_os: PlaywrightAgentOs | None = None) -> None: + super().__init__( + name="mouse_release", + description="Release the mouse button at the current position on the page.", + input_schema={ + "type": "object", + "properties": { + "mouse_button": { + "type": "string", + "description": "The mouse button to release.", + "enum": get_args(MouseButton), + }, + }, + "required": ["mouse_button"], + }, + agent_os=agent_os, + ) + self.is_cacheable = True + + @override + def __call__(self, mouse_button: MouseButton) -> str: + self.agent_os.mouse_up(mouse_button) + return f"Mouse button {mouse_button} was released." 
+ + +class PlaywrightTypeTool(PlaywrightBaseTool): + """Types text in the browser page.""" + + def __init__(self, agent_os: PlaywrightAgentOs | None = None) -> None: + super().__init__( + name="type", + description="Type text in the browser page.", + input_schema={ + "type": "object", + "properties": { + "text": { + "type": "string", + "description": "The text to type.", + }, + "typing_speed": { + "type": "integer", + "description": ( + "The speed of typing in characters per minute." + " Defaults to 50" + ), + "default": 50, + }, + }, + "required": ["text"], + }, + agent_os=agent_os, + ) + self.is_cacheable = True + + @override + def __call__(self, text: str, typing_speed: int = 50) -> str: + self.agent_os.type(text, typing_speed) + return f"Text '{text}' was typed." + + +class PlaywrightKeyboardTapTool(PlaywrightBaseTool): + """Taps (press and release) a keyboard key in the browser.""" + + def __init__(self, agent_os: PlaywrightAgentOs | None = None) -> None: + super().__init__( + name="keyboard_tap", + description="Tap (press and release) a keyboard key in the browser.", + input_schema={ + "type": "object", + "properties": { + "key": { + "type": "string", + "description": "The key to tap.", + "enum": list(get_args(PcKey)) + list(get_args(ModifierKey)), + }, + "modifier_keys": { + "type": "array", + "items": { + "type": "string", + "enum": list(get_args(ModifierKey)), + }, + "description": ( + "List of modifier keys to press along with the main key." + ), + }, + "count": { + "type": "integer", + "description": ( + "The number of times to tap the key. 
Defaults to 1" + ), + "default": 1, + "minimum": 1, + }, + }, + "required": ["key"], + }, + agent_os=agent_os, + ) + self.is_cacheable = True + + @override + def __call__( + self, + key: PcKey | ModifierKey, + modifier_keys: list[ModifierKey] | None = None, + count: int = 1, + ) -> str: + self.agent_os.keyboard_tap(key, modifier_keys, count) + modifier_str = ( + f" with modifiers {', '.join(modifier_keys)}" if modifier_keys else "" + ) + count_str = f" {count} time{'s' if count != 1 else ''}" + return f"Key {key} was tapped{modifier_str}{count_str}." + + +class PlaywrightKeyboardPressedTool(PlaywrightBaseTool): + """Presses and holds a keyboard key in the browser.""" + + def __init__(self, agent_os: PlaywrightAgentOs | None = None) -> None: + super().__init__( + name="keyboard_pressed", + description="Press and hold a keyboard key in the browser.", + input_schema={ + "type": "object", + "properties": { + "key": { + "type": "string", + "description": "The key to press.", + "enum": list(get_args(PcKey)) + list(get_args(ModifierKey)), + }, + "modifier_keys": { + "type": "array", + "items": { + "type": "string", + "enum": list(get_args(ModifierKey)), + }, + "description": ( + "List of modifier keys to press along with the main key." + ), + }, + }, + "required": ["key"], + }, + agent_os=agent_os, + ) + self.is_cacheable = True + + @override + def __call__( + self, + key: PcKey | ModifierKey, + modifier_keys: list[ModifierKey] | None = None, + ) -> str: + self.agent_os.keyboard_pressed(key, modifier_keys) + modifier_str = ( + f" with modifiers {', '.join(modifier_keys)}" if modifier_keys else "" + ) + return f"Key {key} is now pressed{modifier_str}." 
+ + +class PlaywrightKeyboardReleaseTool(PlaywrightBaseTool): + """Releases a keyboard key in the browser.""" + + def __init__(self, agent_os: PlaywrightAgentOs | None = None) -> None: + super().__init__( + name="keyboard_release", + description="Release a keyboard key in the browser.", + input_schema={ + "type": "object", + "properties": { + "key": { + "type": "string", + "description": "The key to release.", + "enum": list(get_args(PcKey)) + list(get_args(ModifierKey)), + }, + "modifier_keys": { + "type": "array", + "items": { + "type": "string", + "enum": list(get_args(ModifierKey)), + }, + "description": ( + "List of modifier keys to release along with the main key." + ), + }, + }, + "required": ["key"], + }, + agent_os=agent_os, + ) + self.is_cacheable = True + + @override + def __call__( + self, + key: PcKey | ModifierKey, + modifier_keys: list[ModifierKey] | None = None, + ) -> str: + self.agent_os.keyboard_release(key, modifier_keys) + modifier_str = ( + f" with modifiers {', '.join(modifier_keys)}" if modifier_keys else "" + ) + return f"Key {key} was released{modifier_str}." + + +class PlaywrightGotoTool(PlaywrightBaseTool): """ Navigates to a specific URL in the browser. """ - def __init__(self, agent_os: PlaywrightAgentOs) -> None: + def __init__(self, agent_os: PlaywrightAgentOs | None = None) -> None: super().__init__( name="playwright_goto_tool", description=( @@ -33,22 +438,20 @@ def __init__(self, agent_os: PlaywrightAgentOs) -> None: }, "required": ["url"], }, + agent_os=agent_os, ) - self._agent_os = agent_os self.is_cacheable = True @override def __call__(self, url: str) -> str: - self._agent_os.goto(url) + self.agent_os.goto(url) return f"Navigated to: {url}" -class PlaywrightBackTool(Tool): - """ - Navigates back to the previous page in the browser history. 
- """ +class PlaywrightBackTool(PlaywrightBaseTool): + """Navigates back to the previous page in the browser history.""" - def __init__(self, agent_os: PlaywrightAgentOs) -> None: + def __init__(self, agent_os: PlaywrightAgentOs | None = None) -> None: super().__init__( name="playwright_back_tool", description=( @@ -59,22 +462,20 @@ def __init__(self, agent_os: PlaywrightAgentOs) -> None: effect. """ ), + agent_os=agent_os, ) - self._agent_os = agent_os self.is_cacheable = True @override def __call__(self) -> str: - self._agent_os.back() + self.agent_os.back() return "Navigated back to the previous page" -class PlaywrightForwardTool(Tool): - """ - Navigates forward to the next page in the browser history. - """ +class PlaywrightForwardTool(PlaywrightBaseTool): + """Navigates forward to the next page in the browser history.""" - def __init__(self, agent_os: PlaywrightAgentOs) -> None: + def __init__(self, agent_os: PlaywrightAgentOs | None = None) -> None: super().__init__( name="playwright_forward_tool", description=( @@ -85,22 +486,20 @@ def __init__(self, agent_os: PlaywrightAgentOs) -> None: effect. """ ), + agent_os=agent_os, ) - self._agent_os = agent_os self.is_cacheable = True @override def __call__(self) -> str: - self._agent_os.forward() + self.agent_os.forward() return "Navigated forward to the next page" -class PlaywrightGetPageTitleTool(Tool): - """ - Gets the title of the current page. - """ +class PlaywrightGetPageTitleTool(PlaywrightBaseTool): + """Gets the title of the current page.""" - def __init__(self, agent_os: PlaywrightAgentOs) -> None: + def __init__(self, agent_os: PlaywrightAgentOs | None = None) -> None: super().__init__( name="playwright_get_page_title_tool", description=( @@ -110,22 +509,20 @@ def __init__(self, agent_os: PlaywrightAgentOs) -> None: the main heading or name of the page content. 
""" ), + agent_os=agent_os, ) - self._agent_os = agent_os self.is_cacheable = True @override def __call__(self) -> str: - title = self._agent_os.get_page_title() + title = self.agent_os.get_page_title() return f"Page title: {title}" -class PlaywrightGetPageUrlTool(Tool): - """ - Gets the URL of the current page. - """ +class PlaywrightGetPageUrlTool(PlaywrightBaseTool): + """Gets the URL of the current page.""" - def __init__(self, agent_os: PlaywrightAgentOs) -> None: + def __init__(self, agent_os: PlaywrightAgentOs | None = None) -> None: super().__init__( name="playwright_get_page_url_tool", description=( @@ -135,11 +532,11 @@ def __init__(self, agent_os: PlaywrightAgentOs) -> None: parameters. """ ), + agent_os=agent_os, ) - self._agent_os = agent_os self.is_cacheable = True @override def __call__(self) -> str: - url = self._agent_os.get_page_url() + url = self.agent_os.get_page_url() return f"Current page URL: {url}" diff --git a/src/askui/web_agent.py b/src/askui/web_agent.py index fc6eea98..fe47c5f9 100644 --- a/src/askui/web_agent.py +++ b/src/askui/web_agent.py @@ -1,29 +1,53 @@ +import warnings + from pydantic import ConfigDict, validate_call -from askui import ComputerAgent +from askui.agent_base import Agent from askui.agent_settings import AgentSettings +from askui.callbacks import ConversationCallback +from askui.container import telemetry from askui.models.shared.settings import ( ActSettings, MessageSettings, ) from askui.models.shared.tools import Tool +from askui.models.shared.truncation_strategies import TruncationStrategy from askui.prompts.act_prompts import create_web_agent_prompt from askui.tools.exception_tool import ExceptionTool from askui.tools.playwright.agent_os import PlaywrightAgentOs +from askui.tools.playwright.agent_os_facade import PlaywrightAgentOsFacade from askui.tools.playwright.tools import ( PlaywrightBackTool, PlaywrightForwardTool, PlaywrightGetPageTitleTool, PlaywrightGetPageUrlTool, PlaywrightGotoTool, + 
PlaywrightKeyboardPressedTool, + PlaywrightKeyboardReleaseTool, + PlaywrightKeyboardTapTool, + PlaywrightMouseClickTool, + PlaywrightMouseHoldDownTool, + PlaywrightMouseMoveTool, + PlaywrightMouseReleaseTool, + PlaywrightMouseScrollTool, + PlaywrightScreenshotTool, + PlaywrightTypeTool, ) -from askui.tools.toolbox import AgentToolbox -from .reporting import Reporter +from .reporting import CompositeReporter, Reporter from .retry import Retry -class WebVisionAgent(ComputerAgent): +class WebAgent(Agent): + @telemetry.record_call( + exclude={ + "reporters", + "settings", + "act_tools", + "callbacks", + "truncation_strategy", + } + ) @validate_call(config=ConfigDict(arbitrary_types_allowed=True)) def __init__( self, @@ -31,29 +55,56 @@ def __init__( settings: AgentSettings | None = None, retry: Retry | None = None, act_tools: list[Tool] | None = None, + callbacks: list[ConversationCallback] | None = None, + truncation_strategy: TruncationStrategy | None = None, ) -> None: - agent_os = PlaywrightAgentOs() - tools = AgentToolbox( - agent_os=agent_os, - ) + reporter = CompositeReporter(reporters=reporters) + self.os = PlaywrightAgentOs(reporter) + self.act_agent_os_facade = PlaywrightAgentOsFacade(self.os) super().__init__( - reporters=reporters, - settings=settings, + reporter=reporter, retry=retry, - tools=tools, - act_tools=[ - PlaywrightGotoTool(agent_os=agent_os), - PlaywrightBackTool(agent_os=agent_os), - PlaywrightForwardTool(agent_os=agent_os), - PlaywrightGetPageTitleTool(agent_os=agent_os), - PlaywrightGetPageUrlTool(agent_os=agent_os), - ExceptionTool(), - ] - + (act_tools or []), + tools=self.get_default_tools() + (act_tools or []), + agent_os=self.os, + settings=settings, + callbacks=callbacks, + truncation_strategy=truncation_strategy, ) + self.act_tool_collection.add_agent_os(self.act_agent_os_facade) self.act_settings = ActSettings( messages=MessageSettings( system=create_web_agent_prompt(), thinking={"type": "enabled", "budget_tokens": 2048}, ), ) + + 
@staticmethod + def get_default_tools() -> list[Tool]: + return [ + PlaywrightScreenshotTool(), + PlaywrightMouseMoveTool(), + PlaywrightMouseClickTool(), + PlaywrightMouseScrollTool(), + PlaywrightMouseHoldDownTool(), + PlaywrightMouseReleaseTool(), + PlaywrightTypeTool(), + PlaywrightKeyboardTapTool(), + PlaywrightKeyboardPressedTool(), + PlaywrightKeyboardReleaseTool(), + PlaywrightGotoTool(), + PlaywrightBackTool(), + PlaywrightForwardTool(), + PlaywrightGetPageTitleTool(), + PlaywrightGetPageUrlTool(), + ExceptionTool(), + ] + + +class WebVisionAgent(WebAgent): + def __init__(self, *args, **kwargs) -> None: # type: ignore + warnings.warn( + "WebVisionAgent is deprecated, use WebAgent instead.", + DeprecationWarning, + stacklevel=2, + ) + super().__init__(*args, **kwargs)