Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ place = scrape_place(
place_url,
browser_session=BrowserSessionConfig(
profile_dir=Path(".gmaps-scraper/session"),
human_mouse=True,
),
)

Expand Down
6 changes: 6 additions & 0 deletions docs/USAGE.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,12 @@ uv run gmaps-scraper \
--output place-results.json
```

Browser sessions use a weighted random desktop viewport by default. Persistent
profile sessions keep a stable viewport for that profile path. Use
`--disable-random-window-size` when you need CloakBrowser's default viewport,
and `--human-mouse` when you want CloakBrowser to add humanized mouse, keyboard,
and scroll behavior despite the extra latency.

You can also pass multiple URLs directly or pipe them on stdin:

```bash
Expand Down
46 changes: 46 additions & 0 deletions src/gmaps_scraper/browser_humanization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
"""Browser launch randomization helpers."""

from __future__ import annotations

import random
from pathlib import Path
from typing import Literal

# Accepted browser window size settings, as handled by
# resolve_browser_window_size():
#   "random"        - pick one of the weighted desktop sizes
#   (width, height) - use this explicit size (both values must be positive)
#   None            - resolve to no size at all
type BrowserWindowSize = Literal["random"] | tuple[int, int] | None

# Desktop window sizes, each repeated once per unit of weight so that a
# uniform pick over this tuple is a weighted pick over the distinct sizes.
# The weights below sum to 100, so each one reads as a percentage.
_WEIGHTED_DESKTOP_WINDOW_SIZES: tuple[tuple[int, int], ...] = tuple(
    size
    for size, weight in (
        ((1920, 1080), 35),
        ((1366, 768), 26),
        ((1536, 864), 16),
        ((1280, 720), 9),
        ((1440, 900), 9),
        ((1600, 900), 5),
    )
    for _ in range(weight)
)


def resolve_browser_window_size(
    window_size: BrowserWindowSize,
    *,
    profile_dir: Path | None,
) -> tuple[int, int] | None:
    """Resolve a configured browser window size to concrete dimensions.

    Args:
        window_size: ``None`` to resolve to no size, ``"random"`` to pick one
            of the weighted desktop sizes, or an explicit ``(width, height)``
            pair.
        profile_dir: Persistent profile directory, if any. With ``"random"``
            the pick is derived from a hash of this path, so the same path
            always resolves to the same size; when it is ``None`` a new
            random size is drawn on every call.

    Returns:
        A ``(width, height)`` tuple, or ``None`` when ``window_size`` is
        ``None``.

    Raises:
        ValueError: If ``window_size`` is a string other than ``"random"``,
            cannot be unpacked into exactly two values, or contains a
            non-positive dimension.
    """
    if window_size is None:
        return None

    if isinstance(window_size, str):
        # Reject typos such as "Random" up front; otherwise the string would
        # fall through to tuple unpacking and fail with a misleading error.
        if window_size != "random":
            raise ValueError(
                f"Unsupported browser window size {window_size!r}; "
                'expected "random", a (width, height) pair, or None.'
            )
        if profile_dir is None:
            return random.choice(_WEIGHTED_DESKTOP_WINDOW_SIZES)
        # Index by a hash of the profile path so a given profile keeps the
        # same window size across runs.
        index = _stable_hash(str(profile_dir)) % len(_WEIGHTED_DESKTOP_WINDOW_SIZES)
        return _WEIGHTED_DESKTOP_WINDOW_SIZES[index]

    try:
        width, height = window_size
    except (TypeError, ValueError) as exc:
        raise ValueError(
            "Browser window size must be a (width, height) pair."
        ) from exc
    if width <= 0 or height <= 0:
        raise ValueError("Browser window dimensions must be positive.")
    return width, height


def _stable_hash(text: str) -> int:
    """Return a deterministic 32-bit hash of ``text``.

    Each character's code point is folded into the running value with a
    multiply-and-XOR step, and the value is masked to 32 bits after every
    character. The result depends only on the characters of ``text``; an
    empty string hashes to ``0``.

    NOTE(review): presumably used instead of the built-in ``hash`` because
    string hashing is randomized per process by default - confirm.
    """
    digest = 0
    for code_point in map(ord, text):
        mixed = (digest * 281) ^ (code_point * 997)
        digest = mixed & 0xFFFFFFFF
    return digest
19 changes: 18 additions & 1 deletion src/gmaps_scraper/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,16 @@ def build_parser() -> argparse.ArgumentParser:
type=Path,
help="Reuse a persistent browser profile stored in this directory.",
)
parser.add_argument(
"--disable-random-window-size",
action="store_true",
help="Use CloakBrowser's default viewport instead of a weighted random desktop size.",
)
parser.add_argument(
"--human-mouse",
action="store_true",
help="Enable CloakBrowser humanized mouse, keyboard, and scroll behavior.",
)
parser.add_argument(
"--proxy",
default=os.environ.get("GMAPS_SCRAPER_PROXY"),
Expand Down Expand Up @@ -232,10 +242,17 @@ def main() -> int:
if not args.urls and args.input is None:
parser.error("at least one URL or --input is required.")
browser_session = None
if args.session_dir is not None or args.proxy is not None:
if (
args.session_dir is not None
or args.proxy is not None
or args.disable_random_window_size
or args.human_mouse
):
browser_session = BrowserSessionConfig(
profile_dir=args.session_dir,
proxy=args.proxy,
window_size=None if args.disable_random_window_size else "random",
human_mouse=args.human_mouse,
)
http_session = None
if args.http_cookie_jar is not None or args.proxy is not None:
Expand Down
2 changes: 2 additions & 0 deletions src/gmaps_scraper/place_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -2038,6 +2038,8 @@ def _browser_session_for_parallel_worker(
return BrowserSessionConfig(
profile_dir=browser_session.profile_dir / f"worker-{worker_index + 1}",
proxy=browser_session.proxy,
window_size=browser_session.window_size,
human_mouse=browser_session.human_mouse,
)


Expand Down
27 changes: 21 additions & 6 deletions src/gmaps_scraper/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@
from typing import Any, Literal, Required, TypedDict
from urllib.parse import urljoin

from gmaps_scraper.browser_humanization import (
BrowserWindowSize,
resolve_browser_window_size,
)
from gmaps_scraper.models import SavedList
from gmaps_scraper.parser import JSONValue, ParseError, parse_saved_list_artifacts

Expand Down Expand Up @@ -92,6 +96,8 @@ class BrowserSessionConfig:

profile_dir: Path | None = None
proxy: str | BrowserProxyConfig | None = None
window_size: BrowserWindowSize = "random"
human_mouse: bool = False


@dataclass(slots=True, frozen=True)
Expand Down Expand Up @@ -325,19 +331,28 @@ def _launch_browser_context(
except ImportError as exc: # pragma: no cover - dependency error path
raise ScrapeError("CloakBrowser is not installed. Run `uv sync`.") from exc

session = browser_session or BrowserSessionConfig()
launch_kwargs: dict[str, Any] = {
"headless": headless,
"humanize": True,
"humanize": session.human_mouse,
"locale": "en-US",
"extra_http_headers": {"Accept-Language": "en-US,en;q=0.9"},
}
if browser_session is not None and browser_session.proxy is not None:
launch_kwargs["proxy"] = browser_session.proxy
if browser_session is None or browser_session.profile_dir is None:
window_size = resolve_browser_window_size(
session.window_size,
profile_dir=session.profile_dir,
)
if window_size is not None:
width, height = window_size
launch_kwargs["args"] = [f"--window-size={width},{height}"]
launch_kwargs["viewport"] = {"width": width, "height": height}
if session.proxy is not None:
launch_kwargs["proxy"] = session.proxy
if session.profile_dir is None:
return launch_context(**launch_kwargs)

browser_session.profile_dir.mkdir(parents=True, exist_ok=True)
return launch_persistent_context(browser_session.profile_dir, **launch_kwargs)
session.profile_dir.mkdir(parents=True, exist_ok=True)
return launch_persistent_context(session.profile_dir, **launch_kwargs)


def _read_resolved_url(page: Any) -> str | None:
Expand Down
36 changes: 36 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,42 @@ def test_forwards_explicit_collection_mode(self) -> None:
http_session=None,
)

def test_forwards_browser_humanization_flags(self) -> None:
    """Check that both humanization CLI flags reach the collector.

    Runs ``main()`` with ``--human-mouse`` and
    ``--disable-random-window-size`` and expects the patched collector to
    receive ``BrowserSessionConfig(window_size=None, human_mouse=True)``.
    """
    captured_stdout = io.StringIO()
    fake_artifacts = _artifacts()
    expected_payload = _parsed_payload()
    fake_result = _result(expected_payload)
    list_url = "https://maps.app.goo.gl/TestSavedListShortUrl"
    argv = [
        "gmaps-scraper",
        list_url,
        "--human-mouse",
        "--disable-random-window-size",
    ]

    with (
        patch("sys.argv", argv),
        patch(
            "gmaps_scraper.cli.collect_saved_list_result",
            return_value=(fake_artifacts, fake_result),
        ) as collector_mock,
        redirect_stdout(captured_stdout),
    ):
        status = main()

    self.assertEqual(status, 0)
    self.assertEqual(json.loads(captured_stdout.getvalue()), expected_payload)
    # Neither --session-dir nor --proxy is given, so the session config must
    # exist purely because of the two humanization flags.
    expected_session = BrowserSessionConfig(window_size=None, human_mouse=True)
    collector_mock.assert_called_once_with(
        list_url,
        headless=True,
        timeout_ms=30_000,
        settle_time_ms=3_000,
        collection_mode="auto",
        browser_session=expected_session,
        http_session=None,
    )

def test_place_kind_calls_place_scraper(self) -> None:
stdout = io.StringIO()
details = PlaceDetails(
Expand Down
12 changes: 11 additions & 1 deletion tests/test_place_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,8 @@ def close(self) -> None:
def test_scrape_places_parallel_uses_worker_scoped_session_paths(self) -> None:
seen_profile_dirs: list[Path | None] = []
seen_cookie_jar_paths: list[Path | None] = []
seen_human_mouse: list[bool] = []
seen_window_sizes: list[object] = []

def fake_scrape_places_sequential(
place_urls: list[str],
Expand All @@ -240,6 +242,8 @@ def fake_scrape_places_sequential(
self.assertIsInstance(browser_session, BrowserSessionConfig)
self.assertIsInstance(http_session, HttpSessionConfig)
seen_profile_dirs.append(browser_session.profile_dir)
seen_human_mouse.append(browser_session.human_mouse)
seen_window_sizes.append(browser_session.window_size)
seen_cookie_jar_paths.append(http_session.cookie_jar_path)
return [
PlaceScrapeResult(source_url=place_url, attempts=1)
Expand All @@ -258,7 +262,11 @@ def fake_scrape_places_sequential(
):
results = scrape_places(
["url-1", "url-2", "url-3"],
browser_session=BrowserSessionConfig(profile_dir=profile_dir),
browser_session=BrowserSessionConfig(
profile_dir=profile_dir,
window_size=None,
human_mouse=True,
),
http_session=HttpSessionConfig(cookie_jar_path=cookie_jar_path),
max_concurrency=2,
stagger_ms=10,
Expand All @@ -277,6 +285,8 @@ def fake_scrape_places_sequential(
]
),
)
self.assertEqual(seen_human_mouse, [True, True])
self.assertEqual(seen_window_sizes, [None, None])
self.assertEqual([result.source_url for result in results], ["url-1", "url-2", "url-3"])

def test_scrape_places_parallel_returns_worker_errors_per_url(self) -> None:
Expand Down
23 changes: 20 additions & 3 deletions tests/test_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,13 @@ def fake_launch_context(**kwargs: Any) -> object:
launch_persistent_context=lambda *_args, **_kwargs: None,
)

with patch.dict("sys.modules", {"cloakbrowser": fake_module}):
with (
patch.dict("sys.modules", {"cloakbrowser": fake_module}),
patch(
"gmaps_scraper.scraper.resolve_browser_window_size",
return_value=(1366, 768),
),
):
context = _launch_browser_context(
headless=False,
browser_session=BrowserSessionConfig(),
Expand All @@ -191,9 +197,11 @@ def fake_launch_context(**kwargs: Any) -> object:
[
{
"extra_http_headers": {"Accept-Language": "en-US,en;q=0.9"},
"args": ["--window-size=1366,768"],
"headless": False,
"humanize": True,
"humanize": False,
"locale": "en-US",
"viewport": {"width": 1366, "height": 768},
}
],
)
Expand All @@ -213,12 +221,19 @@ def fake_launch_persistent_context(profile_dir: Path, **kwargs: Any) -> object:

with tempfile.TemporaryDirectory() as tmp_dir:
profile_dir = Path(tmp_dir) / "session"
with patch.dict("sys.modules", {"cloakbrowser": fake_module}):
with (
patch.dict("sys.modules", {"cloakbrowser": fake_module}),
patch(
"gmaps_scraper.scraper.resolve_browser_window_size",
return_value=(1536, 864),
),
):
context = _launch_browser_context(
headless=True,
browser_session=BrowserSessionConfig(
profile_dir=profile_dir,
proxy="http://proxy.example:8080",
human_mouse=True,
),
)
self.assertTrue(profile_dir.is_dir())
Expand All @@ -231,10 +246,12 @@ def fake_launch_persistent_context(profile_dir: Path, **kwargs: Any) -> object:
profile_dir,
{
"extra_http_headers": {"Accept-Language": "en-US,en;q=0.9"},
"args": ["--window-size=1536,864"],
"headless": True,
"humanize": True,
"locale": "en-US",
"proxy": "http://proxy.example:8080",
"viewport": {"width": 1536, "height": 864},
},
)
],
Expand Down
Loading