From f21c4f06f677f4e4f07a512eb5a7270ab1ba7f57 Mon Sep 17 00:00:00 2001 From: Michael Wu Date: Thu, 21 May 2026 14:58:35 +0800 Subject: [PATCH] Add browser humanization controls --- README.md | 1 + docs/USAGE.md | 6 +++ src/gmaps_scraper/browser_humanization.py | 46 +++++++++++++++++++++++ src/gmaps_scraper/cli.py | 19 +++++++++- src/gmaps_scraper/place_scraper.py | 2 + src/gmaps_scraper/scraper.py | 27 ++++++++++--- tests/test_cli.py | 36 ++++++++++++++++++ tests/test_place_scraper.py | 12 +++++- tests/test_scraper.py | 23 ++++++++++-- 9 files changed, 161 insertions(+), 11 deletions(-) create mode 100644 src/gmaps_scraper/browser_humanization.py diff --git a/README.md b/README.md index b8b7d0f..206d730 100644 --- a/README.md +++ b/README.md @@ -117,6 +117,7 @@ place = scrape_place( place_url, browser_session=BrowserSessionConfig( profile_dir=Path(".gmaps-scraper/session"), + human_mouse=True, ), ) diff --git a/docs/USAGE.md b/docs/USAGE.md index 5f5286c..a2d9245 100644 --- a/docs/USAGE.md +++ b/docs/USAGE.md @@ -80,6 +80,12 @@ uv run gmaps-scraper \ --output place-results.json ``` +Browser sessions use a weighted random desktop viewport by default. Persistent +profile sessions keep a stable viewport for that profile path. Use +`--disable-random-window-size` when you need CloakBrowser's default viewport, +and `--human-mouse` when you want CloakBrowser to add humanized mouse, keyboard, +and scroll behavior despite the extra latency. + You can also pass multiple URLs directly or pipe them on stdin: ```bash diff --git a/src/gmaps_scraper/browser_humanization.py b/src/gmaps_scraper/browser_humanization.py new file mode 100644 index 0000000..b287c58 --- /dev/null +++ b/src/gmaps_scraper/browser_humanization.py @@ -0,0 +1,46 @@ +"""Browser launch randomization helpers.""" + +from __future__ import annotations + +import random +from pathlib import Path +from typing import Literal + +type BrowserWindowSize = Literal["random"] | tuple[int, int] | None + +_WEIGHTED_DESKTOP_WINDOW_SIZES: tuple[tuple[int, int], ...] = ( + *((1920, 1080),) * 35, + *((1366, 768),) * 26, + *((1536, 864),) * 16, + *((1280, 720),) * 9, + *((1440, 900),) * 9, + *((1600, 900),) * 5, +) + + +def resolve_browser_window_size( + window_size: BrowserWindowSize, + *, + profile_dir: Path | None, +) -> tuple[int, int] | None: + """Resolve a configured browser window size.""" + if window_size is None: + return None + if window_size == "random": + if profile_dir is None: + return random.choice(_WEIGHTED_DESKTOP_WINDOW_SIZES) + return _WEIGHTED_DESKTOP_WINDOW_SIZES[ + _stable_hash(str(profile_dir)) % len(_WEIGHTED_DESKTOP_WINDOW_SIZES) + ] + + width, height = window_size + if width <= 0 or height <= 0: + raise ValueError("Browser window dimensions must be positive.") + return width, height + + +def _stable_hash(text: str) -> int: + value = 0 + for character in text: + value = (value * 281 ^ ord(character) * 997) & 0xFFFFFFFF + return value diff --git a/src/gmaps_scraper/cli.py b/src/gmaps_scraper/cli.py index 0d1502f..7ede33c 100644 --- a/src/gmaps_scraper/cli.py +++ b/src/gmaps_scraper/cli.py @@ -108,6 +108,16 @@ def build_parser() -> argparse.ArgumentParser: type=Path, help="Reuse a persistent browser profile stored in this directory.", ) + parser.add_argument( + "--disable-random-window-size", + action="store_true", + help="Use CloakBrowser's default viewport instead of a weighted random desktop size.", + ) + parser.add_argument( + "--human-mouse", + action="store_true", + help="Enable CloakBrowser humanized mouse, keyboard, and scroll behavior.", + ) parser.add_argument( "--proxy", default=os.environ.get("GMAPS_SCRAPER_PROXY"), @@ -232,10 +242,17 @@ def main() -> int: if not args.urls and args.input is None: parser.error("at least one URL or --input is required.") browser_session = None - if args.session_dir is not None or args.proxy is not None: + if ( + args.session_dir is not None + or args.proxy is not None + or args.disable_random_window_size + or args.human_mouse + ): browser_session = BrowserSessionConfig( profile_dir=args.session_dir, proxy=args.proxy, + window_size=None if args.disable_random_window_size else "random", + human_mouse=args.human_mouse, ) http_session = None if args.http_cookie_jar is not None or args.proxy is not None: diff --git a/src/gmaps_scraper/place_scraper.py b/src/gmaps_scraper/place_scraper.py index 338a08b..2e6783f 100644 --- a/src/gmaps_scraper/place_scraper.py +++ b/src/gmaps_scraper/place_scraper.py @@ -2038,6 +2038,8 @@ def _browser_session_for_parallel_worker( return BrowserSessionConfig( profile_dir=browser_session.profile_dir / f"worker-{worker_index + 1}", proxy=browser_session.proxy, + window_size=browser_session.window_size, + human_mouse=browser_session.human_mouse, ) diff --git a/src/gmaps_scraper/scraper.py b/src/gmaps_scraper/scraper.py index 16988c3..d438970 100644 --- a/src/gmaps_scraper/scraper.py +++ b/src/gmaps_scraper/scraper.py @@ -12,6 +12,10 @@ from typing import Any, Literal, Required, TypedDict from urllib.parse import urljoin +from gmaps_scraper.browser_humanization import ( + BrowserWindowSize, + resolve_browser_window_size, +) from gmaps_scraper.models import SavedList from gmaps_scraper.parser import JSONValue, ParseError, parse_saved_list_artifacts @@ -92,6 +96,8 @@ class BrowserSessionConfig: profile_dir: Path | None = None proxy: str | BrowserProxyConfig | None = None + window_size: BrowserWindowSize = "random" + human_mouse: bool = False @dataclass(slots=True, frozen=True) @@ -325,19 +331,28 @@ def _launch_browser_context( except ImportError as exc: # pragma: no cover - dependency error path raise ScrapeError("CloakBrowser is not installed. Run `uv sync`.") from exc + session = browser_session or BrowserSessionConfig() launch_kwargs: dict[str, Any] = { "headless": headless, - "humanize": True, + "humanize": session.human_mouse, "locale": "en-US", "extra_http_headers": {"Accept-Language": "en-US,en;q=0.9"}, } - if browser_session is not None and browser_session.proxy is not None: - launch_kwargs["proxy"] = browser_session.proxy - if browser_session is None or browser_session.profile_dir is None: + window_size = resolve_browser_window_size( + session.window_size, + profile_dir=session.profile_dir, + ) + if window_size is not None: + width, height = window_size + launch_kwargs["args"] = [f"--window-size={width},{height}"] + launch_kwargs["viewport"] = {"width": width, "height": height} + if session.proxy is not None: + launch_kwargs["proxy"] = session.proxy + if session.profile_dir is None: return launch_context(**launch_kwargs) - browser_session.profile_dir.mkdir(parents=True, exist_ok=True) - return launch_persistent_context(browser_session.profile_dir, **launch_kwargs) + session.profile_dir.mkdir(parents=True, exist_ok=True) + return launch_persistent_context(session.profile_dir, **launch_kwargs) def _read_resolved_url(page: Any) -> str | None: diff --git a/tests/test_cli.py b/tests/test_cli.py index 276e291..4e94680 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -210,6 +210,42 @@ def test_forwards_explicit_collection_mode(self) -> None: http_session=None, ) + def test_forwards_browser_humanization_flags(self) -> None: + stdout = io.StringIO() + artifacts = _artifacts() + parsed_payload = _parsed_payload() + result = _result(parsed_payload) + + with ( + patch( + "sys.argv", + [ + "gmaps-scraper", + "https://maps.app.goo.gl/TestSavedListShortUrl", + "--human-mouse", + "--disable-random-window-size", + ], + ), + patch( + "gmaps_scraper.cli.collect_saved_list_result", + return_value=(artifacts, result), + ) as collect_saved_list_result, + redirect_stdout(stdout), + ): + exit_code = main() + + self.assertEqual(exit_code, 0) + self.assertEqual(json.loads(stdout.getvalue()), parsed_payload) + collect_saved_list_result.assert_called_once_with( + "https://maps.app.goo.gl/TestSavedListShortUrl", + headless=True, + timeout_ms=30_000, + settle_time_ms=3_000, + collection_mode="auto", + browser_session=BrowserSessionConfig(window_size=None, human_mouse=True), + http_session=None, + ) + def test_place_kind_calls_place_scraper(self) -> None: stdout = io.StringIO() details = PlaceDetails( diff --git a/tests/test_place_scraper.py b/tests/test_place_scraper.py index 53f9936..2a4e450 100644 --- a/tests/test_place_scraper.py +++ b/tests/test_place_scraper.py @@ -230,6 +230,8 @@ def close(self) -> None: def test_scrape_places_parallel_uses_worker_scoped_session_paths(self) -> None: seen_profile_dirs: list[Path | None] = [] seen_cookie_jar_paths: list[Path | None] = [] + seen_human_mouse: list[bool] = [] + seen_window_sizes: list[object] = [] def fake_scrape_places_sequential( place_urls: list[str], @@ -240,6 +242,8 @@ def fake_scrape_places_sequential( self.assertIsInstance(browser_session, BrowserSessionConfig) self.assertIsInstance(http_session, HttpSessionConfig) seen_profile_dirs.append(browser_session.profile_dir) + seen_human_mouse.append(browser_session.human_mouse) + seen_window_sizes.append(browser_session.window_size) seen_cookie_jar_paths.append(http_session.cookie_jar_path) return [ PlaceScrapeResult(source_url=place_url, attempts=1) @@ -258,7 +262,11 @@ def fake_scrape_places_sequential( ): results = scrape_places( ["url-1", "url-2", "url-3"], - browser_session=BrowserSessionConfig(profile_dir=profile_dir), + browser_session=BrowserSessionConfig( + profile_dir=profile_dir, + window_size=None, + human_mouse=True, + ), http_session=HttpSessionConfig(cookie_jar_path=cookie_jar_path), max_concurrency=2, stagger_ms=10, @@ -277,6 +285,8 @@ def fake_scrape_places_sequential( ] ), ) + self.assertEqual(seen_human_mouse, [True, True]) + self.assertEqual(seen_window_sizes, [None, None]) self.assertEqual([result.source_url for result in results], ["url-1", "url-2", "url-3"]) def test_scrape_places_parallel_returns_worker_errors_per_url(self) -> None: diff --git a/tests/test_scraper.py b/tests/test_scraper.py index 33c71ee..36d1500 100644 --- a/tests/test_scraper.py +++ b/tests/test_scraper.py @@ -179,7 +179,13 @@ def fake_launch_context(**kwargs: Any) -> object: launch_persistent_context=lambda *_args, **_kwargs: None, ) - with patch.dict("sys.modules", {"cloakbrowser": fake_module}): + with ( + patch.dict("sys.modules", {"cloakbrowser": fake_module}), + patch( + "gmaps_scraper.scraper.resolve_browser_window_size", + return_value=(1366, 768), + ), + ): context = _launch_browser_context( headless=False, browser_session=BrowserSessionConfig(), @@ -191,9 +197,11 @@ def fake_launch_context(**kwargs: Any) -> object: [ { "extra_http_headers": {"Accept-Language": "en-US,en;q=0.9"}, + "args": ["--window-size=1366,768"], "headless": False, - "humanize": True, + "humanize": False, "locale": "en-US", + "viewport": {"width": 1366, "height": 768}, } ], ) @@ -213,12 +221,19 @@ def fake_launch_persistent_context(profile_dir: Path, **kwargs: Any) -> object: with tempfile.TemporaryDirectory() as tmp_dir: profile_dir = Path(tmp_dir) / "session" - with patch.dict("sys.modules", {"cloakbrowser": fake_module}): + with ( + patch.dict("sys.modules", {"cloakbrowser": fake_module}), + patch( + "gmaps_scraper.scraper.resolve_browser_window_size", + return_value=(1536, 864), + ), + ): context = _launch_browser_context( headless=True, browser_session=BrowserSessionConfig( profile_dir=profile_dir, proxy="http://proxy.example:8080", + human_mouse=True, ), ) self.assertTrue(profile_dir.is_dir()) @@ -231,10 +246,12 @@ def fake_launch_persistent_context(profile_dir: Path, **kwargs: Any) -> object: profile_dir, { "extra_http_headers": {"Accept-Language": "en-US,en;q=0.9"}, + "args": ["--window-size=1536,864"], "headless": True, "humanize": True, "locale": "en-US", "proxy": "http://proxy.example:8080", + "viewport": {"width": 1536, "height": 864}, }, ) ],