diff --git a/src/askui/__init__.py b/src/askui/__init__.py
index fc0938a0..0f0794c0 100644
--- a/src/askui/__init__.py
+++ b/src/askui/__init__.py
@@ -58,7 +58,7 @@
from .multi_device_agent import MultiDeviceAgent
try:
- from .web_agent import WebVisionAgent
+ from .web_agent import WebAgent, WebVisionAgent
from .web_testing_agent import WebTestingAgent
_WEB_AGENTS_AVAILABLE = True
@@ -113,4 +113,4 @@
__all__ += ["AndroidAgent", "AndroidVisionAgent", "MultiDeviceAgent"]
if _WEB_AGENTS_AVAILABLE:
- __all__ += ["WebVisionAgent", "WebTestingAgent"]
+ __all__ += ["WebAgent", "WebVisionAgent", "WebTestingAgent"]
diff --git a/src/askui/models/shared/__init__.py b/src/askui/models/shared/__init__.py
index fe675d84..8168ab16 100644
--- a/src/askui/models/shared/__init__.py
+++ b/src/askui/models/shared/__init__.py
@@ -1,9 +1,11 @@
from .android_base_tool import AndroidBaseTool
from .computer_base_tool import ComputerBaseTool
+from .playwright_base_tool import PlaywrightBaseTool
from .tool_tags import ToolTags
__all__ = [
"AndroidBaseTool",
"ComputerBaseTool",
+ "PlaywrightBaseTool",
"ToolTags",
]
diff --git a/src/askui/models/shared/playwright_base_tool.py b/src/askui/models/shared/playwright_base_tool.py
new file mode 100644
index 00000000..1415c99a
--- /dev/null
+++ b/src/askui/models/shared/playwright_base_tool.py
@@ -0,0 +1,57 @@
+from typing import Any
+
+from askui.models.shared.tool_tags import ToolTags
+from askui.models.shared.tools import ToolWithAgentOS
+from askui.tools.agent_os import AgentOs
+from askui.tools.agent_os_type_error import AgentOsTypeError
+from askui.tools.android.agent_os import AndroidAgentOs
+from askui.tools.playwright.agent_os import PlaywrightAgentOs
+
+
+class PlaywrightBaseTool(ToolWithAgentOS):
+ """Tool base class that has the Playwright AgentOs available."""
+
+ def __init__(
+ self,
+ agent_os: AgentOs | None = None,
+ required_tags: list[str] | None = None,
+ **kwargs: Any,
+ ) -> None:
+ super().__init__(
+ required_tags=[ToolTags.PLAYWRIGHT.value] + (required_tags or []),
+ agent_os=agent_os,
+ **kwargs,
+ )
+
+ @property
+ def agent_os(self) -> PlaywrightAgentOs:
+ """Get the agent OS.
+
+ Returns:
+ PlaywrightAgentOs: The Playwright agent OS instance.
+ """
+ agent_os = super().agent_os
+ if not isinstance(agent_os, PlaywrightAgentOs):
+ raise AgentOsTypeError(
+ expected_type=PlaywrightAgentOs,
+ actual_type=type(agent_os),
+ )
+ return agent_os
+
+ @agent_os.setter
+ def agent_os(self, agent_os: AgentOs | AndroidAgentOs | PlaywrightAgentOs) -> None:
+ """Set the agent OS.
+
+ Args:
+ agent_os (AgentOs | AndroidAgentOs | PlaywrightAgentOs): The agent OS
+ instance to set.
+
+ Raises:
+ AgentOsTypeError: If the agent OS is not a `PlaywrightAgentOs` instance.
+ """
+ if not isinstance(agent_os, PlaywrightAgentOs):
+ raise AgentOsTypeError(
+ expected_type=PlaywrightAgentOs,
+ actual_type=type(agent_os),
+ )
+ self._agent_os = agent_os
diff --git a/src/askui/models/shared/tool_tags.py b/src/askui/models/shared/tool_tags.py
index 48abaffc..153e4bee 100644
--- a/src/askui/models/shared/tool_tags.py
+++ b/src/askui/models/shared/tool_tags.py
@@ -7,3 +7,4 @@ class ToolTags(str, Enum):
ANDROID = "android"
COMPUTER = "computer"
SCALED_AGENT_OS = "scaled_agent_os"
+ PLAYWRIGHT = "playwright"
diff --git a/src/askui/tools/playwright/agent_os.py b/src/askui/tools/playwright/agent_os.py
index f4a4b064..9c200939 100644
--- a/src/askui/tools/playwright/agent_os.py
+++ b/src/askui/tools/playwright/agent_os.py
@@ -16,24 +16,29 @@
)
from typing_extensions import override
+from askui.reporting import NULL_REPORTER, Reporter
+from askui.utils.annotated_image import AnnotatedImage
+
from ..agent_os import AgentOs, Display, DisplaySize, InputEvent, ModifierKey, PcKey
class PlaywrightAgentOs(AgentOs):
- """
- Playwright-based implementation of AgentOs.
+ """Playwright-based implementation of `AgentOs`.
This implementation uses Playwright's Python SDK to control browser automation
and simulate user interactions. It provides mouse control, keyboard input,
and screen capture functionality through a browser context.
Args:
+ reporter (Reporter, optional): Reporter used for reporting. Defaults to
+ `NULL_REPORTER`.
browser_type (Literal["chromium", "firefox", "webkit"], optional): The browser
type to use. Defaults to `"chromium"`.
headless (bool, optional): Whether to run the browser in headless mode.
Defaults to `False`.
- viewport_size (ViewportSize | None, optional): The viewport size.
- Defaults to `None` (uses default).
+ viewport_size (ViewportSize | None, optional): The viewport size. When
+ `None`, the browser inherits the system's native DPI and window
+ size (`no_viewport=True`). Defaults to `None`.
slow_mo (int, optional): Slows down Playwright operations by the specified
amount of milliseconds. Defaults to `0`.
install_browser (bool, optional): Whether to install browser on connection.
@@ -42,8 +47,11 @@ class PlaywrightAgentOs(AgentOs):
(requires root permissions). Defaults to `False`.
"""
+ _REPORTER_ROLE_NAME: str = "PlaywrightAgentOS"
+
def __init__(
self,
+ reporter: Reporter = NULL_REPORTER,
browser_type: Literal["chromium", "firefox", "webkit"] = "chromium",
headless: bool = False,
viewport_size: ViewportSize | None = None,
@@ -63,6 +71,7 @@ def __init__(
self._browser: Browser | None = None
self._context: BrowserContext | None = None
self._page: Page | None = None
+ self._reporter: Reporter = reporter
# Event listening state
self._listening = False
@@ -112,6 +121,14 @@ def _install_system_dependencies(self) -> None:
)
raise RuntimeError(error_msg) from e
+ def _annotated_screenshot(
+ self,
+ point_list: list[tuple[int, int]],
+ ) -> AnnotatedImage:
+ """Capture a screenshot and wrap it in an `AnnotatedImage` with annotations."""
+ screenshot = self.screenshot(report=False)
+ return AnnotatedImage(lambda: screenshot, point_list)
+
@override
def connect(self) -> None:
"""Establishes a synchronous connection to the browser."""
@@ -129,13 +146,28 @@ def connect(self) -> None:
headless=self._headless,
slow_mo=self._slow_mo,
)
- self._context = self._browser.new_context(
- viewport=self._viewport_size,
- )
+ if self._viewport_size is not None:
+ self._context = self._browser.new_context(
+ viewport=self._viewport_size,
+ )
+ else:
+ # Use no_viewport to inherit the system's native DPI and window
+ # size. Without this, Playwright defaults to 1280x720 with
+ # deviceScaleFactor=1. On high-DPI screens (e.g. macOS Retina)
+ # Chromium compensates by zooming the page 2x and briefly
+ # un-zooming every time a screenshot is captured, causing a
+ # visible flicker.
+ self._context = self._browser.new_context(
+ no_viewport=True,
+ )
self._page = self._context.new_page()
# Navigate to a blank page to ensure we have a working page
- self._page.goto("data:text/html,
")
+ self._page.goto("data:text/html,Starting...
")
+ self._reporter.add_message(
+ self._REPORTER_ROLE_NAME,
+ "Connected to playwright browser",
+ )
@override
def disconnect(self) -> None:
@@ -159,10 +191,14 @@ def disconnect(self) -> None:
self._playwright.stop()
self._playwright = None
+ self._reporter.add_message(
+ self._REPORTER_ROLE_NAME,
+ "Disconnected from playwright os",
+ )
+
@override
def screenshot(self, report: bool = True) -> Image.Image:
- """
- Captures a screenshot of the current page.
+ """Capture a screenshot of the current page.
Args:
report (bool, optional): Whether to include the screenshot in
@@ -175,23 +211,34 @@ def screenshot(self, report: bool = True) -> Image.Image:
error_msg = "No active page. Call connect() first."
raise RuntimeError(error_msg)
- screenshot_bytes = self._page.screenshot()
- return Image.open(io.BytesIO(screenshot_bytes))
+ screenshot_bytes = self._page.screenshot(scale="css")
+ screenshot = Image.open(io.BytesIO(screenshot_bytes))
+ if report:
+ self._reporter.add_message(
+ self._REPORTER_ROLE_NAME, "screenshot()", screenshot
+ )
+ return screenshot
@override
- def mouse_move(self, x: int, y: int, _duration: int = 500) -> None:
- """
- Moves the mouse cursor to specified coordinates on the page.
+ def mouse_move(self, x: int, y: int, duration: int = 500) -> None:
+ """Move the mouse cursor to specified coordinates on the page.
Args:
x (int): The horizontal coordinate (in pixels) to move to.
y (int): The vertical coordinate (in pixels) to move to.
- _duration (int): Unused parameter as it is not applicable here.
+ duration (int, optional): Ignored — Playwright moves the mouse
+ instantly. Kept for compatibility with the base class.
+ Defaults to `500`.
"""
if not self._page:
error_msg = "No active page. Call connect() first."
raise RuntimeError(error_msg)
+ self._reporter.add_message(
+ self._REPORTER_ROLE_NAME,
+ f"mouse_move(x={x}, y={y})",
+ self._annotated_screenshot([(x, y)]),
+ )
self._page.mouse.move(x, y)
@override
@@ -208,9 +255,19 @@ def type(self, text: str, typing_speed: int = 50) -> None:
error_msg = "No active page. Call connect() first."
raise RuntimeError(error_msg)
+ self._reporter.add_message(
+ self._REPORTER_ROLE_NAME,
+ f"Typing text: '{text}'",
+ self.screenshot(report=False),
+ )
# Convert typing speed from CPM to delay between characters
delay = 1000 / typing_speed if typing_speed > 0 else 0
self._page.keyboard.type(text, delay=delay)
+ self._reporter.add_message(
+ self._REPORTER_ROLE_NAME,
+ f"After typing text: '{text}'",
+ self.screenshot(report=False),
+ )
@override
def click(
@@ -224,9 +281,19 @@ def click(
button to click. Defaults to `"left"`.
count (int, optional): Number of times to click. Defaults to `1`.
"""
+ self._reporter.add_message(
+ self._REPORTER_ROLE_NAME,
+ f"click(button={button}, count={count})",
+ self.screenshot(report=False),
+ )
for _ in range(count):
self.mouse_down(button)
self.mouse_up(button)
+ self._reporter.add_message(
+ self._REPORTER_ROLE_NAME,
+ f"After click(button={button}, count={count})",
+ self.screenshot(report=False),
+ )
@override
def mouse_down(self, button: Literal["left", "middle", "right"] = "left") -> None:
@@ -241,6 +308,10 @@ def mouse_down(self, button: Literal["left", "middle", "right"] = "left") -> Non
error_msg = "No active page. Call connect() first."
raise RuntimeError(error_msg)
+ self._reporter.add_message(
+ self._REPORTER_ROLE_NAME,
+ f"mouse_down(button={button})",
+ )
self._page.mouse.down(button=button)
@override
@@ -257,6 +328,10 @@ def mouse_up(self, button: Literal["left", "middle", "right"] = "left") -> None:
raise RuntimeError(error_msg)
self._page.mouse.up(button=button)
+ self._reporter.add_message(
+ self._REPORTER_ROLE_NAME,
+ f"mouse_up(button={button})",
+ )
@override
def mouse_scroll(self, dx: int, dy: int) -> None:
@@ -273,7 +348,17 @@ def mouse_scroll(self, dx: int, dy: int) -> None:
error_msg = "No active page. Call connect() first."
raise RuntimeError(error_msg)
+ self._reporter.add_message(
+ self._REPORTER_ROLE_NAME,
+ f"mouse_scroll(dx={dx}, dy={dy})",
+ self.screenshot(report=False),
+ )
self._page.mouse.wheel(delta_x=dx, delta_y=dy)
+ self._reporter.add_message(
+ self._REPORTER_ROLE_NAME,
+ f"After mouse_scroll(dx={dx}, dy={dy})",
+ self.screenshot(report=False),
+ )
@override
def keyboard_pressed(
@@ -291,6 +376,11 @@ def keyboard_pressed(
error_msg = "No active page. Call connect() first."
raise RuntimeError(error_msg)
+ self._reporter.add_message(
+ self._REPORTER_ROLE_NAME,
+ f"keyboard_pressed(key={key}, modifier_keys={modifier_keys})",
+ self.screenshot(report=False),
+ )
# Press modifier keys first
if modifier_keys:
for modifier in modifier_keys:
@@ -323,6 +413,12 @@ def keyboard_release(
for modifier in modifier_keys:
self._page.keyboard.up(self._convert_key(modifier))
+ self._reporter.add_message(
+ self._REPORTER_ROLE_NAME,
+ f"keyboard_release(key={key}, modifier_keys={modifier_keys})",
+ self.screenshot(report=False),
+ )
+
@override
def keyboard_tap(
self,
@@ -343,6 +439,11 @@ def keyboard_tap(
error_msg = "No active page. Call connect() first."
raise RuntimeError(error_msg)
+ self._reporter.add_message(
+ self._REPORTER_ROLE_NAME,
+ (f"keyboard_tap(key={key}, modifier_keys={modifier_keys}, count={count})"),
+ self.screenshot(report=False),
+ )
for _ in range(count):
# Press modifier keys first
if modifier_keys:
@@ -357,6 +458,15 @@ def keyboard_tap(
for modifier in modifier_keys:
self._page.keyboard.up(self._convert_key(modifier))
+ self._reporter.add_message(
+ self._REPORTER_ROLE_NAME,
+ (
+ f"After keyboard_tap(key={key}, "
+ f"modifier_keys={modifier_keys}, count={count})"
+ ),
+ self.screenshot(report=False),
+ )
+
@override
def retrieve_active_display(self) -> Display:
"""
@@ -443,21 +553,50 @@ def goto(self, url: str) -> None:
error_msg = "No active page. Call connect() first."
raise RuntimeError(error_msg)
+ self._reporter.add_message(
+ self._REPORTER_ROLE_NAME,
+ f"goto(url='{url}')",
+ )
self._page.goto(url)
+ self._reporter.add_message(
+ self._REPORTER_ROLE_NAME,
+ f"After goto(url='{url}')",
+ self.screenshot(report=False),
+ )
def back(self) -> None:
+ """Navigate back to the previous page in the browser history."""
if not self._page:
error_msg = "No active page. Call connect() first."
raise RuntimeError(error_msg)
+ self._reporter.add_message(
+ self._REPORTER_ROLE_NAME,
+ "back()",
+ )
self._page.go_back()
+ self._reporter.add_message(
+ self._REPORTER_ROLE_NAME,
+ "After back()",
+ self.screenshot(report=False),
+ )
def forward(self) -> None:
+ """Navigate forward to the next page in the browser history."""
if not self._page:
error_msg = "No active page. Call connect() first."
raise RuntimeError(error_msg)
+ self._reporter.add_message(
+ self._REPORTER_ROLE_NAME,
+ "forward()",
+ )
self._page.go_forward()
+ self._reporter.add_message(
+ self._REPORTER_ROLE_NAME,
+ "After forward()",
+ self.screenshot(report=False),
+ )
def get_page_title(self) -> str:
"""
@@ -470,7 +609,12 @@ def get_page_title(self) -> str:
error_msg = "No active page. Call connect() first."
raise RuntimeError(error_msg)
- return self._page.title()
+ title = self._page.title()
+ self._reporter.add_message(
+ self._REPORTER_ROLE_NAME,
+ f"get_page_title() -> '{title}'",
+ )
+ return title
def get_page_url(self) -> str:
"""
@@ -483,4 +627,29 @@ def get_page_url(self) -> str:
error_msg = "No active page. Call connect() first."
raise RuntimeError(error_msg)
- return self._page.url
+ url = self._page.url
+ self._reporter.add_message(
+ self._REPORTER_ROLE_NAME,
+ f"get_page_url() -> '{url}'",
+ )
+ return url
+
+ @property
+ def tags(self) -> list[str]:
+ """Get the tags for this agent OS.
+
+ Returns:
+ list[str]: A list of tags that identify this agent OS type.
+ """
+ if not hasattr(self, "_tags"):
+ self._tags = ["playwright"]
+ return self._tags
+
+ @tags.setter
+ def tags(self, tags: list[str]) -> None:
+ """Set the tags for this agent OS.
+
+ Args:
+ tags (list[str]): A list of tags that identify this agent OS type.
+ """
+ self._tags = tags
diff --git a/src/askui/tools/playwright/agent_os_facade.py b/src/askui/tools/playwright/agent_os_facade.py
new file mode 100644
index 00000000..091ff804
--- /dev/null
+++ b/src/askui/tools/playwright/agent_os_facade.py
@@ -0,0 +1,123 @@
+from typing import Literal
+
+from PIL import Image
+
+from askui.models.shared.tool_tags import ToolTags
+from askui.tools.agent_os import Display, ModifierKey, PcKey
+from askui.tools.playwright.agent_os import PlaywrightAgentOs
+from askui.utils.image_utils import scale_coordinates, scale_image_to_fit
+
+
+class PlaywrightAgentOsFacade(PlaywrightAgentOs):
+ """Facade for `PlaywrightAgentOs` that adds coordinate scaling.
+
+ Screenshots are scaled down to a fixed target resolution so that the
+ AI model always sees a consistent image size. Coordinate-based inputs
+ (`mouse_move`) are scaled back up to the real page resolution before
+ being forwarded to the underlying agent OS.
+
+ Args:
+ agent_os (PlaywrightAgentOs): The real Playwright agent OS to wrap.
+ """
+
+ def __init__(self, agent_os: PlaywrightAgentOs) -> None:
+ self._agent_os = agent_os
+ self._target_resolution: tuple[int, int] = (1024, 768)
+ self._real_screen_resolution: tuple[int, int] | None = None
+ self.tags = self._agent_os.tags + [ToolTags.SCALED_AGENT_OS.value]
+
+ def connect(self) -> None:
+ self._agent_os.connect()
+ self._real_screen_resolution = self._agent_os.screenshot(
+ report=False,
+ ).size
+
+ def disconnect(self) -> None:
+ self._agent_os.disconnect()
+ self._real_screen_resolution = None
+
+ def screenshot(self, report: bool = True) -> Image.Image:
+ screenshot = self._agent_os.screenshot(report=report)
+ self._real_screen_resolution = screenshot.size
+ return scale_image_to_fit(screenshot, self._target_resolution)
+
+ def _scale_coordinates(
+ self,
+ x: int,
+ y: int,
+ from_agent: bool = True,
+ ) -> tuple[int, int]:
+ if self._real_screen_resolution is None:
+ self._real_screen_resolution = self._agent_os.screenshot(
+ report=False,
+ ).size
+ return scale_coordinates(
+ (x, y),
+ self._real_screen_resolution,
+ self._target_resolution,
+ inverse=from_agent,
+ )
+
+ def mouse_move(self, x: int, y: int, duration: int = 500) -> None:
+ scaled_x, scaled_y = self._scale_coordinates(x, y)
+ # Forward the rescaled coordinates; `duration` is passed through unchanged.
+ self._agent_os.mouse_move(scaled_x, scaled_y, duration)
+
+ def type(self, text: str, typing_speed: int = 50) -> None:
+ self._agent_os.type(text, typing_speed)
+
+ def click(
+ self,
+ button: Literal["left", "middle", "right"] = "left",
+ count: int = 1,
+ ) -> None:
+ self._agent_os.click(button, count)
+
+ def mouse_down(self, button: Literal["left", "middle", "right"] = "left") -> None:
+ self._agent_os.mouse_down(button)
+
+ def mouse_up(self, button: Literal["left", "middle", "right"] = "left") -> None:
+ self._agent_os.mouse_up(button)
+
+ def mouse_scroll(self, dx: int, dy: int) -> None:
+ self._agent_os.mouse_scroll(dx, dy)
+
+ def keyboard_pressed(
+ self,
+ key: PcKey | ModifierKey,
+ modifier_keys: list[ModifierKey] | None = None,
+ ) -> None:
+ self._agent_os.keyboard_pressed(key, modifier_keys)
+
+ def keyboard_release(
+ self,
+ key: PcKey | ModifierKey,
+ modifier_keys: list[ModifierKey] | None = None,
+ ) -> None:
+ self._agent_os.keyboard_release(key, modifier_keys)
+
+ def keyboard_tap(
+ self,
+ key: PcKey | ModifierKey,
+ modifier_keys: list[ModifierKey] | None = None,
+ count: int = 1,
+ ) -> None:
+ self._agent_os.keyboard_tap(key, modifier_keys, count)
+
+ def retrieve_active_display(self) -> Display:
+ return self._agent_os.retrieve_active_display()
+
+ def goto(self, url: str) -> None:
+ self._agent_os.goto(url)
+
+ def back(self) -> None:
+ self._agent_os.back()
+
+ def forward(self) -> None:
+ self._agent_os.forward()
+
+ def get_page_title(self) -> str:
+ return self._agent_os.get_page_title()
+
+ def get_page_url(self) -> str:
+ return self._agent_os.get_page_url()
diff --git a/src/askui/tools/playwright/tools.py b/src/askui/tools/playwright/tools.py
index 144eec41..ba08f273 100644
--- a/src/askui/tools/playwright/tools.py
+++ b/src/askui/tools/playwright/tools.py
@@ -1,15 +1,420 @@
+import re
+from typing import get_args
+
+from PIL import Image
from typing_extensions import override
-from askui.models.shared.tools import Tool
+from askui.models.shared import PlaywrightBaseTool, ToolTags
+from askui.tools.agent_os import ModifierKey, MouseButton, PcKey
from askui.tools.playwright.agent_os import PlaywrightAgentOs
-class PlaywrightGotoTool(Tool):
+class PlaywrightScreenshotTool(PlaywrightBaseTool):
+ """Takes a screenshot of the current browser page."""
+
+ def __init__(self, agent_os: PlaywrightAgentOs | None = None) -> None:
+ super().__init__(
+ name="screenshot",
+ description="Take a screenshot of the current browser page.",
+ agent_os=agent_os,
+ required_tags=[ToolTags.SCALED_AGENT_OS.value],
+ )
+ self.is_cacheable = True
+
+ @override
+ def __call__(self) -> tuple[str, Image.Image]:
+ screenshot = self.agent_os.screenshot()
+ return "Screenshot was taken.", screenshot
+
+
+class PlaywrightMouseMoveTool(PlaywrightBaseTool):
+ """Moves the mouse to a specific position on the page."""
+
+ def __init__(self, agent_os: PlaywrightAgentOs | None = None) -> None:
+ super().__init__(
+ name="move_mouse",
+ description=(
+ "Move the mouse to a specific position on the page. "
+ "Pass x and y as separate integer values, not as a combined string."
+ ),
+ input_schema={
+ "type": "object",
+ "properties": {
+ "x": {
+ "type": "integer",
+ "description": (
+ "The x (horizontal) pixel coordinate. "
+ "Must be a single integer, e.g. 330."
+ ),
+ },
+ "y": {
+ "type": "integer",
+ "description": (
+ "The y (vertical) pixel coordinate. "
+ "Must be a single integer, e.g. 182."
+ ),
+ },
+ },
+ "required": ["x", "y"],
+ },
+ agent_os=agent_os,
+ required_tags=[ToolTags.SCALED_AGENT_OS.value],
+ )
+ self.is_cacheable = True
+
+ @override
+ def __call__(self, x: int, y: int) -> str:
+ # The agent occasionally passes coordinates as strings instead of ints.
+ # We parse them to handle both cases.
+ if not (isinstance(x, int) and isinstance(y, int)):
+ x, y = self._parse_coordinates(x, y) # type: ignore[unreachable]
+ self.agent_os.mouse_move(x, y)
+ return f"Mouse was moved to ({x}, {y})."
+
+ @staticmethod
+ def _parse_coordinates(x: float | str, y: float | str) -> tuple[int, int]:
+ number_pattern = re.compile(r"-?\d+")
+ combined = f"{x},{y}"
+ numbers = number_pattern.findall(combined)
+ if not len(numbers) == 2:
+ error_msg = (
+ "Could not parse mouse_move coordinates from provided "
+ f"parameters x={x}, y={y}. The parameters x and y must "
+ "be passed as separate integer values!"
+ )
+ raise ValueError(error_msg)
+ return int(numbers[0]), int(numbers[1])
+
+
+class PlaywrightMouseClickTool(PlaywrightBaseTool):
+ """Clicks the mouse button at the current position on the page."""
+
+ def __init__(self, agent_os: PlaywrightAgentOs | None = None) -> None:
+ super().__init__(
+ name="mouse_click",
+ description=(
+ "Click and release the mouse button at the current"
+ " position on the page."
+ ),
+ input_schema={
+ "type": "object",
+ "properties": {
+ "mouse_button": {
+ "type": "string",
+ "description": "The mouse button to click.",
+ "enum": get_args(MouseButton),
+ },
+ "number_of_clicks": {
+ "type": "integer",
+ "description": (
+ "The number of times to click the mouse button."
+ " Defaults to 1"
+ ),
+ "default": 1,
+ },
+ },
+ "required": ["mouse_button"],
+ },
+ agent_os=agent_os,
+ )
+ self.is_cacheable = True
+
+ @override
+ def __call__(self, mouse_button: MouseButton, number_of_clicks: int = 1) -> str:
+ self.agent_os.click(mouse_button, number_of_clicks)
+ return f"Mouse button {mouse_button} was clicked {number_of_clicks} times."
+
+
+class PlaywrightMouseScrollTool(PlaywrightBaseTool):
+ """Scrolls the mouse wheel at the current position on the page."""
+
+ def __init__(self, agent_os: PlaywrightAgentOs | None = None) -> None:
+ super().__init__(
+ name="mouse_scroll",
+ description=(
+ "Scroll the mouse wheel at the current position on the page. "
+ "Positive dy scrolls down, negative dy scrolls up. "
+ "Start with dy=150 or dy=-150 for a normal scroll "
+ "and adjust based on the result."
+ ),
+ input_schema={
+ "type": "object",
+ "properties": {
+ "dx": {
+ "type": "integer",
+ "description": (
+ "The horizontal scroll amount. "
+ "Positive values scroll right, "
+ "negative values scroll left. "
+ "Use 0 if no horizontal scrolling is needed."
+ ),
+ },
+ "dy": {
+ "type": "integer",
+ "description": (
+ "The vertical scroll amount. "
+ "Positive values scroll down, negative values scroll up. "
+ "Use 0 if no vertical scrolling is needed."
+ ),
+ },
+ },
+ "required": ["dx", "dy"],
+ },
+ agent_os=agent_os,
+ required_tags=[ToolTags.SCALED_AGENT_OS.value],
+ )
+ self.is_cacheable = True
+
+ @override
+ def __call__(self, dx: int, dy: int) -> str:
+ self.agent_os.mouse_scroll(dx, dy)
+ return f"Mouse was scrolled by ({dx}, {dy})."
+
+
+class PlaywrightMouseHoldDownTool(PlaywrightBaseTool):
+ """Holds down the mouse button at the current position on the page."""
+
+ def __init__(self, agent_os: PlaywrightAgentOs | None = None) -> None:
+ super().__init__(
+ name="mouse_hold_down",
+ description=(
+ "Hold down the mouse button at the current position on the page."
+ ),
+ input_schema={
+ "type": "object",
+ "properties": {
+ "mouse_button": {
+ "type": "string",
+ "description": "The mouse button to hold down.",
+ "enum": get_args(MouseButton),
+ },
+ },
+ "required": ["mouse_button"],
+ },
+ agent_os=agent_os,
+ )
+ self.is_cacheable = True
+
+ @override
+ def __call__(self, mouse_button: MouseButton) -> str:
+ self.agent_os.mouse_down(mouse_button)
+ return f"Mouse button {mouse_button} is now held down."
+
+
+class PlaywrightMouseReleaseTool(PlaywrightBaseTool):
+ """Releases the mouse button at the current position on the page."""
+
+ def __init__(self, agent_os: PlaywrightAgentOs | None = None) -> None:
+ super().__init__(
+ name="mouse_release",
+ description="Release the mouse button at the current position on the page.",
+ input_schema={
+ "type": "object",
+ "properties": {
+ "mouse_button": {
+ "type": "string",
+ "description": "The mouse button to release.",
+ "enum": get_args(MouseButton),
+ },
+ },
+ "required": ["mouse_button"],
+ },
+ agent_os=agent_os,
+ )
+ self.is_cacheable = True
+
+ @override
+ def __call__(self, mouse_button: MouseButton) -> str:
+ self.agent_os.mouse_up(mouse_button)
+ return f"Mouse button {mouse_button} was released."
+
+
+class PlaywrightTypeTool(PlaywrightBaseTool):
+ """Types text in the browser page."""
+
+ def __init__(self, agent_os: PlaywrightAgentOs | None = None) -> None:
+ super().__init__(
+ name="type",
+ description="Type text in the browser page.",
+ input_schema={
+ "type": "object",
+ "properties": {
+ "text": {
+ "type": "string",
+ "description": "The text to type.",
+ },
+ "typing_speed": {
+ "type": "integer",
+ "description": (
+ "The speed of typing in characters per minute."
+ " Defaults to 50"
+ ),
+ "default": 50,
+ },
+ },
+ "required": ["text"],
+ },
+ agent_os=agent_os,
+ )
+ self.is_cacheable = True
+
+ @override
+ def __call__(self, text: str, typing_speed: int = 50) -> str:
+ self.agent_os.type(text, typing_speed)
+ return f"Text '{text}' was typed."
+
+
+class PlaywrightKeyboardTapTool(PlaywrightBaseTool):
+ """Taps (press and release) a keyboard key in the browser."""
+
+ def __init__(self, agent_os: PlaywrightAgentOs | None = None) -> None:
+ super().__init__(
+ name="keyboard_tap",
+ description="Tap (press and release) a keyboard key in the browser.",
+ input_schema={
+ "type": "object",
+ "properties": {
+ "key": {
+ "type": "string",
+ "description": "The key to tap.",
+ "enum": list(get_args(PcKey)) + list(get_args(ModifierKey)),
+ },
+ "modifier_keys": {
+ "type": "array",
+ "items": {
+ "type": "string",
+ "enum": list(get_args(ModifierKey)),
+ },
+ "description": (
+ "List of modifier keys to press along with the main key."
+ ),
+ },
+ "count": {
+ "type": "integer",
+ "description": (
+ "The number of times to tap the key. Defaults to 1"
+ ),
+ "default": 1,
+ "minimum": 1,
+ },
+ },
+ "required": ["key"],
+ },
+ agent_os=agent_os,
+ )
+ self.is_cacheable = True
+
+ @override
+ def __call__(
+ self,
+ key: PcKey | ModifierKey,
+ modifier_keys: list[ModifierKey] | None = None,
+ count: int = 1,
+ ) -> str:
+ self.agent_os.keyboard_tap(key, modifier_keys, count)
+ modifier_str = (
+ f" with modifiers {', '.join(modifier_keys)}" if modifier_keys else ""
+ )
+ count_str = f" {count} time{'s' if count != 1 else ''}"
+ return f"Key {key} was tapped{modifier_str}{count_str}."
+
+
+class PlaywrightKeyboardPressedTool(PlaywrightBaseTool):
+ """Presses and holds a keyboard key in the browser."""
+
+ def __init__(self, agent_os: PlaywrightAgentOs | None = None) -> None:
+ super().__init__(
+ name="keyboard_pressed",
+ description="Press and hold a keyboard key in the browser.",
+ input_schema={
+ "type": "object",
+ "properties": {
+ "key": {
+ "type": "string",
+ "description": "The key to press.",
+ "enum": list(get_args(PcKey)) + list(get_args(ModifierKey)),
+ },
+ "modifier_keys": {
+ "type": "array",
+ "items": {
+ "type": "string",
+ "enum": list(get_args(ModifierKey)),
+ },
+ "description": (
+ "List of modifier keys to press along with the main key."
+ ),
+ },
+ },
+ "required": ["key"],
+ },
+ agent_os=agent_os,
+ )
+ self.is_cacheable = True
+
+ @override
+ def __call__(
+ self,
+ key: PcKey | ModifierKey,
+ modifier_keys: list[ModifierKey] | None = None,
+ ) -> str:
+ self.agent_os.keyboard_pressed(key, modifier_keys)
+ modifier_str = (
+ f" with modifiers {', '.join(modifier_keys)}" if modifier_keys else ""
+ )
+ return f"Key {key} is now pressed{modifier_str}."
+
+
+class PlaywrightKeyboardReleaseTool(PlaywrightBaseTool):
+ """Releases a keyboard key in the browser."""
+
+ def __init__(self, agent_os: PlaywrightAgentOs | None = None) -> None:
+ super().__init__(
+ name="keyboard_release",
+ description="Release a keyboard key in the browser.",
+ input_schema={
+ "type": "object",
+ "properties": {
+ "key": {
+ "type": "string",
+ "description": "The key to release.",
+ "enum": list(get_args(PcKey)) + list(get_args(ModifierKey)),
+ },
+ "modifier_keys": {
+ "type": "array",
+ "items": {
+ "type": "string",
+ "enum": list(get_args(ModifierKey)),
+ },
+ "description": (
+ "List of modifier keys to release along with the main key."
+ ),
+ },
+ },
+ "required": ["key"],
+ },
+ agent_os=agent_os,
+ )
+ self.is_cacheable = True
+
+ @override
+ def __call__(
+ self,
+ key: PcKey | ModifierKey,
+ modifier_keys: list[ModifierKey] | None = None,
+ ) -> str:
+ self.agent_os.keyboard_release(key, modifier_keys)
+ modifier_str = (
+ f" with modifiers {', '.join(modifier_keys)}" if modifier_keys else ""
+ )
+ return f"Key {key} was released{modifier_str}."
+
+
+class PlaywrightGotoTool(PlaywrightBaseTool):
"""
Navigates to a specific URL in the browser.
"""
- def __init__(self, agent_os: PlaywrightAgentOs) -> None:
+ def __init__(self, agent_os: PlaywrightAgentOs | None = None) -> None:
super().__init__(
name="playwright_goto_tool",
description=(
@@ -33,22 +438,20 @@ def __init__(self, agent_os: PlaywrightAgentOs) -> None:
},
"required": ["url"],
},
+ agent_os=agent_os,
)
- self._agent_os = agent_os
self.is_cacheable = True
@override
def __call__(self, url: str) -> str:
- self._agent_os.goto(url)
+ self.agent_os.goto(url)
return f"Navigated to: {url}"
-class PlaywrightBackTool(Tool):
- """
- Navigates back to the previous page in the browser history.
- """
+class PlaywrightBackTool(PlaywrightBaseTool):
+ """Navigates back to the previous page in the browser history."""
- def __init__(self, agent_os: PlaywrightAgentOs) -> None:
+ def __init__(self, agent_os: PlaywrightAgentOs | None = None) -> None:
super().__init__(
name="playwright_back_tool",
description=(
@@ -59,22 +462,20 @@ def __init__(self, agent_os: PlaywrightAgentOs) -> None:
effect.
"""
),
+ agent_os=agent_os,
)
- self._agent_os = agent_os
self.is_cacheable = True
@override
def __call__(self) -> str:
- self._agent_os.back()
+ self.agent_os.back()
return "Navigated back to the previous page"
-class PlaywrightForwardTool(Tool):
- """
- Navigates forward to the next page in the browser history.
- """
+class PlaywrightForwardTool(PlaywrightBaseTool):
+ """Navigates forward to the next page in the browser history."""
- def __init__(self, agent_os: PlaywrightAgentOs) -> None:
+ def __init__(self, agent_os: PlaywrightAgentOs | None = None) -> None:
super().__init__(
name="playwright_forward_tool",
description=(
@@ -85,22 +486,20 @@ def __init__(self, agent_os: PlaywrightAgentOs) -> None:
effect.
"""
),
+ agent_os=agent_os,
)
- self._agent_os = agent_os
self.is_cacheable = True
@override
def __call__(self) -> str:
- self._agent_os.forward()
+ self.agent_os.forward()
return "Navigated forward to the next page"
-class PlaywrightGetPageTitleTool(Tool):
- """
- Gets the title of the current page.
- """
+class PlaywrightGetPageTitleTool(PlaywrightBaseTool):
+ """Gets the title of the current page."""
- def __init__(self, agent_os: PlaywrightAgentOs) -> None:
+ def __init__(self, agent_os: PlaywrightAgentOs | None = None) -> None:
super().__init__(
name="playwright_get_page_title_tool",
description=(
@@ -110,22 +509,20 @@ def __init__(self, agent_os: PlaywrightAgentOs) -> None:
the main heading or name of the page content.
"""
),
+ agent_os=agent_os,
)
- self._agent_os = agent_os
self.is_cacheable = True
@override
def __call__(self) -> str:
- title = self._agent_os.get_page_title()
+ title = self.agent_os.get_page_title()
return f"Page title: {title}"
-class PlaywrightGetPageUrlTool(Tool):
- """
- Gets the URL of the current page.
- """
+class PlaywrightGetPageUrlTool(PlaywrightBaseTool):
+ """Gets the URL of the current page."""
- def __init__(self, agent_os: PlaywrightAgentOs) -> None:
+ def __init__(self, agent_os: PlaywrightAgentOs | None = None) -> None:
super().__init__(
name="playwright_get_page_url_tool",
description=(
@@ -135,11 +532,11 @@ def __init__(self, agent_os: PlaywrightAgentOs) -> None:
parameters.
"""
),
+ agent_os=agent_os,
)
- self._agent_os = agent_os
self.is_cacheable = True
@override
def __call__(self) -> str:
- url = self._agent_os.get_page_url()
+ url = self.agent_os.get_page_url()
return f"Current page URL: {url}"
diff --git a/src/askui/web_agent.py b/src/askui/web_agent.py
index fc6eea98..fe47c5f9 100644
--- a/src/askui/web_agent.py
+++ b/src/askui/web_agent.py
@@ -1,29 +1,53 @@
+import warnings
+
from pydantic import ConfigDict, validate_call
-from askui import ComputerAgent
+from askui.agent_base import Agent
from askui.agent_settings import AgentSettings
+from askui.callbacks import ConversationCallback
+from askui.container import telemetry
from askui.models.shared.settings import (
ActSettings,
MessageSettings,
)
from askui.models.shared.tools import Tool
+from askui.models.shared.truncation_strategies import TruncationStrategy
from askui.prompts.act_prompts import create_web_agent_prompt
from askui.tools.exception_tool import ExceptionTool
from askui.tools.playwright.agent_os import PlaywrightAgentOs
+from askui.tools.playwright.agent_os_facade import PlaywrightAgentOsFacade
from askui.tools.playwright.tools import (
PlaywrightBackTool,
PlaywrightForwardTool,
PlaywrightGetPageTitleTool,
PlaywrightGetPageUrlTool,
PlaywrightGotoTool,
+ PlaywrightKeyboardPressedTool,
+ PlaywrightKeyboardReleaseTool,
+ PlaywrightKeyboardTapTool,
+ PlaywrightMouseClickTool,
+ PlaywrightMouseHoldDownTool,
+ PlaywrightMouseMoveTool,
+ PlaywrightMouseReleaseTool,
+ PlaywrightMouseScrollTool,
+ PlaywrightScreenshotTool,
+ PlaywrightTypeTool,
)
-from askui.tools.toolbox import AgentToolbox
-from .reporting import Reporter
+from .reporting import CompositeReporter, Reporter
from .retry import Retry
-class WebVisionAgent(ComputerAgent):
+class WebAgent(Agent):
+ @telemetry.record_call(
+ exclude={
+ "reporters",
+ "settings",
+ "act_tools",
+ "callbacks",
+ "truncation_strategy",
+ }
+ )
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
def __init__(
self,
@@ -31,29 +55,56 @@ def __init__(
settings: AgentSettings | None = None,
retry: Retry | None = None,
act_tools: list[Tool] | None = None,
+ callbacks: list[ConversationCallback] | None = None,
+ truncation_strategy: TruncationStrategy | None = None,
) -> None:
- agent_os = PlaywrightAgentOs()
- tools = AgentToolbox(
- agent_os=agent_os,
- )
+ reporter = CompositeReporter(reporters=reporters)
+ self.os = PlaywrightAgentOs(reporter)
+ self.act_agent_os_facade = PlaywrightAgentOsFacade(self.os)
super().__init__(
- reporters=reporters,
- settings=settings,
+ reporter=reporter,
retry=retry,
- tools=tools,
- act_tools=[
- PlaywrightGotoTool(agent_os=agent_os),
- PlaywrightBackTool(agent_os=agent_os),
- PlaywrightForwardTool(agent_os=agent_os),
- PlaywrightGetPageTitleTool(agent_os=agent_os),
- PlaywrightGetPageUrlTool(agent_os=agent_os),
- ExceptionTool(),
- ]
- + (act_tools or []),
+ tools=self.get_default_tools() + (act_tools or []),
+ agent_os=self.os,
+ settings=settings,
+ callbacks=callbacks,
+ truncation_strategy=truncation_strategy,
)
+ self.act_tool_collection.add_agent_os(self.act_agent_os_facade)
self.act_settings = ActSettings(
messages=MessageSettings(
system=create_web_agent_prompt(),
thinking={"type": "enabled", "budget_tokens": 2048},
),
)
+
+ @staticmethod
+ def get_default_tools() -> list[Tool]:
+ return [
+ PlaywrightScreenshotTool(),
+ PlaywrightMouseMoveTool(),
+ PlaywrightMouseClickTool(),
+ PlaywrightMouseScrollTool(),
+ PlaywrightMouseHoldDownTool(),
+ PlaywrightMouseReleaseTool(),
+ PlaywrightTypeTool(),
+ PlaywrightKeyboardTapTool(),
+ PlaywrightKeyboardPressedTool(),
+ PlaywrightKeyboardReleaseTool(),
+ PlaywrightGotoTool(),
+ PlaywrightBackTool(),
+ PlaywrightForwardTool(),
+ PlaywrightGetPageTitleTool(),
+ PlaywrightGetPageUrlTool(),
+ ExceptionTool(),
+ ]
+
+
+class WebVisionAgent(WebAgent):
+ def __init__(self, *args, **kwargs) -> None: # type: ignore
+ warnings.warn(
+ "WebVisionAgent is deprecated, use WebAgent instead.",
+ DeprecationWarning,
+ stacklevel=2,
+ )
+ super().__init__(*args, **kwargs)