From 56aa783d9891f0e3d7696659159ef83c680d06ec Mon Sep 17 00:00:00 2001 From: Michael Wu Date: Sun, 10 May 2026 17:31:41 +0900 Subject: [PATCH 1/7] Reject first-person review descriptions --- src/gmaps_scraper/place_scraper.py | 25 ++++++++++++++ tests/test_place_scraper.py | 52 ++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+) diff --git a/src/gmaps_scraper/place_scraper.py b/src/gmaps_scraper/place_scraper.py index 83f36a9..41a425d 100644 --- a/src/gmaps_scraper/place_scraper.py +++ b/src/gmaps_scraper/place_scraper.py @@ -136,8 +136,33 @@ "i'm sorry to inform", ) _DESCRIPTION_REVIEW_PROSE_MARKERS = ( + "boy was it worth", + "best place to stay", + "definitely recommend this place", + "great experience overall", + "had a great time", + "hidden gem-literally", + "i forgot his name", + "i'd just finished", + "i’d just finished", "highly recommended", + "i've tasted", + "i’ve tasted", + "it was my first attempt", + "my stay in", + "once step in", + "offered great recommendation", + "omfg", "overrated", + "so yummy", + "the katsu burger", + "the rooms were huge", + "we have ever had", + "we've ever had", + "what a great hotel", + "would recommend to everyone", + "about this data", + "get the most out of google maps", "your children", "your kids", "you should", diff --git a/tests/test_place_scraper.py b/tests/test_place_scraper.py index 5d5bbb0..75f64c0 100644 --- a/tests/test_place_scraper.py +++ b/tests/test_place_scraper.py @@ -1493,6 +1493,58 @@ def test_clean_description_text_rejects_first_person_review_prose(self) -> None: "feel like it is a must visit spot." ) ) + self.assertIsNone( + _clean_description_text( + "Best ramen we've ever had. It helps that you get to make it yourself " + "(the noodles at least). Everything tastes better when you do it yourself! " + "Date day for a Saturday morning class. Great experience overall." + ) + ) + self.assertIsNone( + _clean_description_text( + "That’s gotta be one of the best hot chocolate drink I’ve tasted in my life! " + "The long wait was definitely worth it. There was quite a line before we got " + "a seat, but boy was it worth every minute. Definitely recommend this place." + ) + ) + self.assertIsNone( + _clean_description_text( + "It was my first attempt to eat mukhata. I ordered beef combo which was " + "quite a lot with full of veggie for 1 person. Very good taste." + ) + ) + self.assertIsNone( + _clean_description_text( + "Best place to stay in Hanoi. I’d just finished a north to south Vietnam " + "cycle and had stayed in everything from dorms to old Soviet-era hotels." + ) + ) + self.assertIsNone( + _clean_description_text( + "The katsu burger!!! Omfg!!! So yummy. My bf got the shrimp burger but I " + "preferred the pork. We loved the sauce." + ) + ) + self.assertIsNone( + _clean_description_text( + "What a great hotel! The rooms were huge and clean. All the staff were very " + "friendly, helpful and sweet." + ) + ) + self.assertIsNone( + _clean_description_text( + "Had a great time here with my friends! The barkeeper made us feel welcomed " + "and we had a lot of fun." + ) + ) + for review_description in ( + "The lady was just so so lovely. My feet are just gorgeous. Would recommend to everyone.", + "My stay in Alila was wonderful. Special shout out to the staff for making it memorable.", + "The hotel have a sense of peace and tranquility once step in. The personal service was delicate.", + "The staffs also offered great recommendation for drinks based on your preference.", + "Directions Save Nearby Send to phone Share About this data Get the most out of Google Maps Sign in", + ): + self.assertIsNone(_clean_description_text(review_description)) def test_clean_description_text_keeps_first_person_business_summary(self) -> None: self.assertEqual( From f79b6a241a7d912bf3598de10ef2dd1b3fbd75ac Mon Sep 17 00:00:00 2001 From: Michael Wu Date: Tue, 12 May 2026 20:44:54 +0900 Subject: [PATCH 2/7] Reject Google Maps UI action descriptions --- src/gmaps_scraper/place_scraper.py | 171 +++++++++++++++++++-------- tests/test_place_scraper.py | 178 ++++++++++++++++++++--------- 2 files changed, 253 insertions(+), 96 deletions(-) diff --git a/src/gmaps_scraper/place_scraper.py b/src/gmaps_scraper/place_scraper.py index 41a425d..fdb69d8 100644 --- a/src/gmaps_scraper/place_scraper.py +++ b/src/gmaps_scraper/place_scraper.py @@ -99,6 +99,13 @@ "share", "website", } +_UI_ACTION_CLUSTER_LABEL_PATTERN = ( + r"(?:call|directions|save|saved|nearby|send to phone|share|website|sign in)" +) +_UI_ACTION_CLUSTER_RE = re.compile( + rf"^{_UI_ACTION_CLUSTER_LABEL_PATTERN}(?:\s+{_UI_ACTION_CLUSTER_LABEL_PATTERN}){{2,}}$", + re.IGNORECASE, +) _DESCRIPTION_STOP_MARKERS = { "photos", "about this data", @@ -136,17 +143,17 @@ "i'm sorry to inform", ) _DESCRIPTION_REVIEW_PROSE_MARKERS = ( - "boy was it worth", "best place to stay", + "boy was it worth", "definitely recommend this place", "great experience overall", "had a great time", "hidden gem-literally", + "highly recommended", "i forgot his name", "i'd just finished", - "i’d just finished", - "highly recommended", "i've tasted", + "i’d just finished", "i’ve tasted", "it was my first attempt", "my stay in", @@ -598,6 +605,67 @@ } return null; }; + const elementTop = (element) => { + const rect = element?.getBoundingClientRect?.(); + return rect && rect.height > 0 ? rect.top : null; + }; + const elementBottom = (element) => { + const rect = element?.getBoundingClientRect?.(); + return rect && rect.height > 0 ? rect.bottom : null; + }; + const descriptionBoundaryTop = () => { + const rows = Array.from(panel.querySelectorAll("[data-item-id]")) + .map(elementTop) + .filter((value) => value !== null); + if (rows.length > 0) { + return Math.min(...rows); + } + const addressRow = addressRowElement(); + const addressTop = elementTop(addressRow); + return addressTop === null ? Infinity : addressTop; + }; + const descriptionValue = () => { + const direct = firstText([".WeS02d", ".PYvSYb"]); + if (direct) { + return direct; + } + const titleBottom = Math.max( + ...[ + elementBottom(titleElement), + ...Array.from(panel.querySelectorAll("div.F7nice")).map(elementBottom), + ].filter((value) => value !== null), + 0, + ); + const boundaryTop = descriptionBoundaryTop(); + const candidates = []; + for (const element of panel.querySelectorAll("div, span")) { + const text = cleanLine(element.innerText || element.textContent || ""); + if (!text || text.includes("·")) { + continue; + } + if ( + element.closest( + "button, a, [role='button'], [data-item-id], [data-review-id], div.F7nice", + ) + ) { + continue; + } + if ( + Array.from(element.children).some( + (child) => cleanLine(child.innerText || child.textContent || "") === text, + ) + ) { + continue; + } + const top = elementTop(element); + if (top === null || top <= titleBottom || top >= boundaryTop) { + continue; + } + candidates.push({top, text}); + } + candidates.sort((left, right) => left.top - right.top); + return candidates[0]?.text || null; + }; const normalizeCount = (value) => { if (!value) { @@ -708,22 +776,19 @@ } const mainPhotoUrl = firstImageUrl([ + "div.RZ66Rb button[jsaction*='heroHeaderImage'] img", "button[jsaction*='heroHeaderImage'] img", - "button[aria-label^='Photo of'] img", - "button[aria-label^='写真'] img", - "button[jsaction*='image'] img", - "button[jsaction*='photo'] img", + "div.ZKCDEc [data-photo-index='0'] img", + "[data-photo-index='0'] img", "[data-photo-index] img", - ], document) + ]) || firstBackgroundImageUrl([ - "button[jsaction*='image']", - "button[jsaction*='photo']", + "div.RZ66Rb button[jsaction*='heroHeaderImage']", + "button[jsaction*='heroHeaderImage']", + "div.ZKCDEc [data-photo-index='0']", + "[data-photo-index='0']", "[data-photo-index]", - "[aria-label*='Photo']", - "[aria-label*='photo']", - "[aria-label*='写真']", - "[aria-label*='画像']", - ], document); + ]); const photoUrl = mainPhotoUrl || firstAttr(["meta[property='og:image']", "meta[itemprop='image']"], "content", document); @@ -1058,7 +1123,7 @@ "button[data-item-id^='phone:']", ]), plus_code: itemValue("oloc"), - description: firstText([".WeS02d", ".PYvSYb"]), + description: descriptionValue(), review_topics: collectReviewTopics(), admission_prices: collectLeafPrices(sectionRootByHeading([ "Admission", @@ -1349,24 +1414,33 @@ } return {category: null, address: null}; }; - const findDescriptionLine = (lines, excludedValues) => { + const cardDescription = (article, excludedValues) => { const excluded = new Set(excludedValues.map(cleanLine).filter(Boolean)); - return lines.find((line) => { - const text = cleanLine(line); + const rows = Array.from(article.querySelectorAll("div.W4Efsd")); + for (const row of rows) { + const text = cleanLine(row.innerText || row.textContent || ""); if (!text || excluded.has(text)) { - return false; + continue; } if (text.includes("·") || parseCardRating(text) || parseCardReviewCount(text)) { - return false; + continue; + } + if ( + row.querySelector( + ".AJB7ye, .UsdlK, [role='img'][aria-label*='star' i], a[href^='tel:']", + ) + ) { + continue; } if (/^(open|closed|temporarily closed|website|directions|saved in)\b/i.test(text)) { - return false; + continue; } if (/^[+()\d\s.-]{7,}$/.test(text)) { - return false; + continue; } - return text.length >= 12; - }) || null; + return text; + } + return null; }; const safeDecodeURIComponent = (value) => { try { @@ -1398,10 +1472,11 @@ review_count: reviewCount, category: categoryAddress.category, address: categoryAddress.address, - search_result_description: findDescriptionLine( - lines, + search_result_description: cardDescription( + article, [name, categoryAddress.category, categoryAddress.address], ), + panel_text: lines.join("\n"), body_text: lines.join("\n"), }; }; @@ -2415,9 +2490,8 @@ def _build_place_details( snapshot: Mapping[str, object], ) -> PlaceDetails: panel_lines = _body_lines(snapshot.get("panel_text")) - body_lines = _body_lines(snapshot.get("body_text")) - search_lines = panel_lines or body_lines - combined_lines = _dedupe_lines([*panel_lines, *body_lines]) + search_lines = panel_lines + combined_lines = _dedupe_lines(panel_lines) category = _clean_category_text(snapshot.get("category")) or _extract_category_from_lines( search_lines ) @@ -2509,7 +2583,7 @@ def _build_place_details( plus_code=_clean_plus_code_text(snapshot.get("plus_code")) or _extract_plus_code_from_lines(combined_lines), address_parts=_extract_address_parts(snapshot.get("address_parts")), - description=_extract_description(snapshot, combined_lines), + description=_extract_description(snapshot), search_result_description=_clean_description_text( snapshot.get("search_result_description") ), @@ -3649,18 +3723,8 @@ def _extract_plus_code_from_lines(lines: list[str]) -> str | None: return None -def _extract_description(snapshot: Mapping[str, object], lines: list[str]) -> str | None: - direct = _clean_description_text(snapshot.get("description")) - if direct is not None: - return direct - for index, line in enumerate(lines): - if line.startswith("Seasonal ") or line.startswith("Modern setting "): - return line - if line == "Share" and index + 1 < len(lines): - candidate = _clean_description_text(lines[index + 1]) - if candidate is not None and candidate.lower() not in _DESCRIPTION_STOP_MARKERS: - return candidate - return None +def _extract_description(snapshot: Mapping[str, object]) -> str | None: + return _clean_description_text(snapshot.get("description")) def _clean_description_text(value: object) -> str | None: @@ -3677,7 +3741,11 @@ def _clean_description_text(value: object) -> str | None: return None if _looks_like_status_text(normalized): return None - if _looks_like_search_results_label(normalized) or _looks_like_ui_action_label(normalized): + if ( + _looks_like_search_results_label(normalized) + or _looks_like_ui_action_label(normalized) + or _looks_like_ui_action_cluster(normalized) + ): return None if ( _looks_like_description_review_prose(normalized) @@ -3699,6 +3767,12 @@ def _clean_description_text(value: object) -> str | None: return normalized +def _looks_like_ui_action_cluster(value: str) -> bool: + text = re.sub(r"[\ue000-\uf8ff]", " ", value) + text = re.sub(r"\s+", " ", text).strip(" .") + return _UI_ACTION_CLUSTER_RE.fullmatch(text) is not None + + def _strip_description_service_options(value: str) -> str | None: segments = [_clean_description_segment(part) for part in re.split(r"[·•⋅]+", value)] cleaned_segments = [segment for segment in segments if segment] @@ -3821,6 +3895,10 @@ def _normalize_photo_url(value: object) -> str | None: "googleusercontent.com" in host or host.endswith("ggpht.com") ) and path.startswith(("/a-", "/a/")): return None + if re.fullmatch(r"lh[0-9]+\.(?:googleusercontent\.com|ggpht\.com)", host) is None: + return None + if re.search(r"(?:=|-)w[0-9]+-h[0-9]+(?:-|$)", normalized) is None: + return None return normalized @@ -4266,7 +4344,10 @@ def _looks_like_search_results_label(value: str) -> bool: normalized = _clean_text(value) if normalized is None: return False - return normalized.casefold() in _SEARCH_RESULTS_LABELS + normalized_lookup = normalized.casefold() + return normalized_lookup in _SEARCH_RESULTS_LABELS or normalized_lookup.startswith( + "sponsored " + ) def _looks_like_ui_action_label(value: str) -> bool: diff --git a/tests/test_place_scraper.py b/tests/test_place_scraper.py index 75f64c0..3bafbad 100644 --- a/tests/test_place_scraper.py +++ b/tests/test_place_scraper.py @@ -79,6 +79,18 @@ def test_place_js_extractor_skips_review_scoped_photo_nodes(self) -> None: self.assertIn('element.closest("[data-review-id]")', _PLACE_JS_EXTRACTOR) self.assertIn("root.querySelectorAll(selector)", _PLACE_JS_EXTRACTOR) self.assertIn(r"return /(^|\W)reviews?(\W|$)/i.test(label);", _PLACE_JS_EXTRACTOR) + self.assertIn("const descriptionValue = () => {", _PLACE_JS_EXTRACTOR) + self.assertIn('firstText([".WeS02d", ".PYvSYb"])', _PLACE_JS_EXTRACTOR) + self.assertIn("const descriptionBoundaryTop = () => {", _PLACE_JS_EXTRACTOR) + + def test_place_js_extractor_uses_structural_panel_photo_selectors(self) -> None: + self.assertIn("div.RZ66Rb button[jsaction*='heroHeaderImage'] img", _PLACE_JS_EXTRACTOR) + self.assertIn("div.ZKCDEc [data-photo-index='0'] img", _PLACE_JS_EXTRACTOR) + self.assertIn("[data-photo-index='0'] img", _PLACE_JS_EXTRACTOR) + self.assertNotIn("button[jsaction*='image'] img", _PLACE_JS_EXTRACTOR) + self.assertNotIn("button[jsaction*='photo'] img", _PLACE_JS_EXTRACTOR) + self.assertNotIn("button[aria-label^='Photo of'] img", _PLACE_JS_EXTRACTOR) + self.assertNotIn("], document)\n || firstBackgroundImageUrl", _PLACE_JS_EXTRACTOR) def test_collect_place_snapshot_can_skip_reviews_and_about_tabs(self) -> None: class _FakePage: @@ -451,6 +463,7 @@ def test_build_place_details_moves_localized_admission_price_out_of_price_range( "rating": "4.5", "review_count": "40,001", "price_range": "NT$320", + "admission_prices": ["NT$320"], "address": "4 Chome-2-8 Shibakoen, Minato City, Tokyo 105-0011, Japan", "body_text": "\n".join( [ @@ -624,6 +637,12 @@ def test_place_js_extractor_collects_quote_sections_separately(self) -> None: ) self.assertIn("searchResultTitleLabels", _PLACE_SEARCH_RESULT_CLICK_JS) self.assertIn("parseCardReviewCount", _PLACE_SEARCH_RESULT_CLICK_JS) + self.assertIn( + "const cardDescription = (article, excludedValues) => {", + _PLACE_SEARCH_RESULT_CLICK_JS, + ) + self.assertIn('article.querySelectorAll("div.W4Efsd")', _PLACE_SEARCH_RESULT_CLICK_JS) + self.assertNotIn("findDescriptionLine", _PLACE_SEARCH_RESULT_CLICK_JS) self.assertIn("getBoundingClientRect()", _PLACE_SEARCH_RESULT_OPEN_JS) self.assertIn("const placePanelRoot = () => {", _PLACE_JS_EXTRACTOR) self.assertIn("visibleArea", _PLACE_JS_EXTRACTOR) @@ -950,7 +969,11 @@ def test_build_place_details_prefers_selected_card_when_search_open_fails(self) assert details.diagnostics is not None self.assertEqual(details.diagnostics.field_sources.get("name"), "search_result") - def test_build_place_details_uses_dom_fields_and_body_fallbacks(self) -> None: + def test_build_place_details_uses_structured_dom_fields(self) -> None: + description = ( + "Seasonal menus of strikingly presented contemporary dishes, with wine " + "pairings, in a stylish space." + ) details = _build_place_details( "https://www.google.com/maps/place/Den", resolved_url="https://www.google.com/maps/place/Den/@35.6731762,139.7127216,17z", @@ -969,6 +992,7 @@ def test_build_place_details_uses_dom_fields_and_body_fallbacks(self) -> None: "website": "http://www.jimbochoden.com/", "phone": "+81 3-6455-5433", "plus_code": "MPF7+73 Shibuya, Tokyo, Japan", + "description": description, "limited_view": True, "body_text": "\n".join( [ @@ -976,10 +1000,7 @@ def test_build_place_details_uses_dom_fields_and_body_fallbacks(self) -> None: "傳", "4.4", "Japanese restaurant·", - ( - "Seasonal menus of strikingly presented contemporary dishes, " - "with wine pairings, in a stylish space." - ), + description, ] ), }, @@ -990,17 +1011,53 @@ def test_build_place_details_uses_dom_fields_and_body_fallbacks(self) -> None: self.assertEqual(details.category, "Japanese restaurant") self.assertEqual(details.rating, 4.4) self.assertEqual(details.review_count, 324) - self.assertEqual( - details.description, - ( - "Seasonal menus of strikingly presented contemporary dishes, with wine " - "pairings, in a stylish space." - ), - ) + self.assertEqual(details.description, description) self.assertEqual(details.lat, 35.6731762) self.assertEqual(details.lng, 139.7127216) self.assertTrue(details.limited_view) + def test_build_place_details_does_not_promote_body_text_to_description(self) -> None: + details = _build_place_details( + "https://www.google.com/maps/place/Den", + resolved_url="https://www.google.com/maps/place/Den", + snapshot={ + "name": "Den", + "category": "Japanese restaurant", + "body_text": "\n".join( + [ + "Share", + ( + "Seasonal menus of strikingly presented contemporary dishes, " + "with wine pairings, in a stylish space." + ), + ] + ), + }, + ) + + self.assertIsNone(details.description) + + def test_build_place_details_does_not_promote_share_adjacent_panel_text(self) -> None: + details = _build_place_details( + "https://www.google.com/maps/place/CANNES+sign", + resolved_url="https://www.google.com/maps/place/CANNES+sign", + snapshot={ + "name": "CANNES sign", + "category": "Tourist attraction", + "panel_text": "\n".join( + [ + "Share", + ( + "We took a very long cruise last summer from Venice to Portugal. " + "One stop was Cannes, and this sign was on the walking tour." + ), + ] + ), + }, + ) + + self.assertIsNone(details.description) + def test_build_place_details_preserves_zero_coordinates(self) -> None: details = _build_place_details( "https://www.google.com/maps/place/Null+Island", @@ -1507,42 +1564,26 @@ def test_clean_description_text_rejects_first_person_review_prose(self) -> None: "a seat, but boy was it worth every minute. Definitely recommend this place." ) ) - self.assertIsNone( - _clean_description_text( - "It was my first attempt to eat mukhata. I ordered beef combo which was " - "quite a lot with full of veggie for 1 person. Very good taste." - ) - ) - self.assertIsNone( - _clean_description_text( - "Best place to stay in Hanoi. I’d just finished a north to south Vietnam " - "cycle and had stayed in everything from dorms to old Soviet-era hotels." - ) - ) - self.assertIsNone( - _clean_description_text( - "The katsu burger!!! Omfg!!! So yummy. My bf got the shrimp burger but I " - "preferred the pork. We loved the sauce." - ) - ) - self.assertIsNone( - _clean_description_text( - "What a great hotel! The rooms were huge and clean. All the staff were very " - "friendly, helpful and sweet." - ) - ) - self.assertIsNone( - _clean_description_text( - "Had a great time here with my friends! The barkeeper made us feel welcomed " - "and we had a lot of fun." - ) - ) for review_description in ( - "The lady was just so so lovely. My feet are just gorgeous. Would recommend to everyone.", - "My stay in Alila was wonderful. Special shout out to the staff for making it memorable.", - "The hotel have a sense of peace and tranquility once step in. The personal service was delicate.", + "It was my first attempt to eat mukhata. The food was so delicious and " + "the clerks were very kind.", + "Best place to stay in Hanoi. I’d just finished Ha Giang loop and " + "needed a place to rest.", + "The katsu burger!!! Omfg!!! So yummy and the young man with blonde " + "curly hair from England.", + "What a great hotel! The rooms were huge and have balconies with a seating area.", + "Had a great time here with my friends! The barkeeper made us feel " + "welcomed and we had a lot of fun.", + "The lady was just so so lovely. My feet are just gorgeous. " + "Would recommend to everyone.", + "My stay in Alila was wonderful. Special shout out to the staff for " + "making it memorable.", + "The hotel have a sense of peace and tranquility once step in. " + "The personal service was delicate.", "The staffs also offered great recommendation for drinks based on your preference.", - "Directions Save Nearby Send to phone Share About this data Get the most out of Google Maps Sign in", + "Directions Save Nearby Send to phone Share About this data " + "Get the most out of Google Maps Sign in", + "\ue52e Directions \ue866 Save \uf05f Nearby \ue702 Send to phone \ue80d Share", ): self.assertIsNone(_clean_description_text(review_description)) @@ -1937,6 +1978,21 @@ def test_build_place_details_rejects_search_results_labels_and_rating_categories self.assertIsNone(details.name) + def test_build_place_details_rejects_sponsored_name_label(self) -> None: + details = _build_place_details( + "https://www.google.com/maps/search/?api=1&query=Nakameguro+Iguchi", + resolved_url="https://www.google.com/maps/search/?api=1&query=Nakameguro+Iguchi", + snapshot={ + "name": "Sponsored \ue5d4", + "category": "Restaurant", + "rating": "3.7", + "review_count": "186", + "body_text": "\n".join(["Sponsored \ue5d4", "Restaurant", "3.7"]), + }, + ) + + self.assertIsNone(details.name) + def test_build_place_details_rejects_ui_action_fallback_name_and_marks_diagnostics( self, ) -> None: @@ -1973,7 +2029,7 @@ def test_build_place_details_rejects_structured_name_that_matches_action_label( snapshot={ "name": "Share", "category": "Event venue", - "body_text": "\n".join( + "panel_text": "\n".join( ["Share", "Saved", "Directions", "Pooles Temple", "Event venue"] ), }, @@ -2143,14 +2199,16 @@ def test_build_place_details_rejects_invalid_address_parts(self) -> None: self.assertIsNone(details.address_parts) - def test_build_place_details_rejects_page_chrome_address_and_falls_back_to_body(self) -> None: + def test_build_place_details_rejects_page_chrome_address_and_falls_back_to_panel_text( + self, + ) -> None: details = _build_place_details( "https://www.google.com/maps/place/Bianchetto", resolved_url="https://www.google.com/maps/place/Bianchetto", snapshot={ "name": "Bianchetto", "address": "Imagery © 2026 Google TermsPrivacySend Product Feedback", - "body_text": "\n".join( + "panel_text": "\n".join( [ "Bianchetto", "Restaurant", @@ -2194,7 +2252,7 @@ def test_build_place_details_rejects_invalid_snapshot_plus_code_and_falls_back_t snapshot={ "name": "Den", "plus_code": "https://www.google.com/maps/place/Den", - "body_text": "\n".join( + "panel_text": "\n".join( [ "Den", "Japanese restaurant", @@ -2404,7 +2462,8 @@ def test_search_result_candidate_js_decodes_place_id_safely(self) -> None: ) def test_place_js_extractor_keeps_place_page_description_selectors(self) -> None: - self.assertIn('description: firstText([".WeS02d", ".PYvSYb"])', _PLACE_JS_EXTRACTOR) + self.assertIn('const direct = firstText([".WeS02d", ".PYvSYb"])', _PLACE_JS_EXTRACTOR) + self.assertIn("description: descriptionValue()", _PLACE_JS_EXTRACTOR) def test_looks_like_google_maps_place_url_accepts_google_tlds_only(self) -> None: self.assertTrue( @@ -2487,6 +2546,23 @@ def test_normalize_photo_url_rejects_google_avatar_urls(self) -> None: _normalize_photo_url("https://lh3.googleusercontent.com/p/example=s680-w680-h510"), "https://lh3.googleusercontent.com/p/example=s680-w680-h510", ) + self.assertEqual( + _normalize_photo_url( + "https://lh3.googleusercontent.com/gps-cs-s/example=w408-h306-k-no" + ), + "https://lh3.googleusercontent.com/gps-cs-s/example=w408-h306-k-no", + ) + + def test_normalize_photo_url_rejects_ad_thumbnails_and_unshaped_hosts(self) -> None: + for photo_url in ( + "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcExample&s=3", + "https://www.gstatic.com/faviconV2?url=https://example.com", + "https://www.gstatic.com/ads-travel/example.png", + "https://lh3.googleusercontent.com/p/example", + "https://lh3.googleusercontent.com.example/p/example=w680-h510-k-no", + ): + with self.subTest(photo_url=photo_url): + self.assertIsNone(_normalize_photo_url(photo_url)) def test_normalize_photo_url_rejects_google_static_map_urls(self) -> None: self.assertIsNone( From cc6350594707b6a45928ceb2e1208091ad89a944 Mon Sep 17 00:00:00 2001 From: Michael Wu Date: Wed, 13 May 2026 01:09:38 +0900 Subject: [PATCH 3/7] Ignore owner payloads when extracting place CIDs --- src/gmaps_scraper/parser.py | 2 ++ tests/test_parser.py | 20 ++++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/src/gmaps_scraper/parser.py b/src/gmaps_scraper/parser.py index 201b1db..0203ce1 100644 --- a/src/gmaps_scraper/parser.py +++ b/src/gmaps_scraper/parser.py @@ -644,6 +644,8 @@ def _find_cid_in_value(value: JSONValue | None) -> str | None: return _normalize_cid_token(value) if not isinstance(value, list): return None + if _parse_list_owner(value) is not None: + return None numeric_texts = [ text diff --git a/tests/test_parser.py b/tests/test_parser.py index 5897bc7..311f2c2 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -350,6 +350,26 @@ def test_does_not_use_owner_profile_id_as_place_cid(self) -> None: "https://www.google.com/maps/search/?api=1&query=Northwind+Cafe%2C+Example+District", ) + def test_does_not_use_owner_payload_inside_metadata_as_place_cid(self) -> None: + runtime_state = copy.deepcopy(["noise", _LIST_NODE]) + first_place = runtime_state[1][8][0] + assert isinstance(first_place, list) + first_metadata = first_place[1] + assert isinstance(first_metadata, list) + + first_metadata[6] = [ + "Fixture Owner", + "https://lh3.googleusercontent.com/a-/fixture-owner", + "104356373423434804635", + ] + + parsed = parse_saved_list_artifacts(_LIST_URL, runtime_state=runtime_state) + + self.assertEqual(len(parsed.places), 2) + self.assertEqual(parsed.places[0].cid, None) + self.assertEqual(parsed.places[0].google_id, "/g/11northwind") + self.assertNotEqual(parsed.places[0].cid, "104356373423434804635") + def test_extracts_favorite_and_note_from_user_payload_shape(self) -> None: runtime_state = [ "noise", From d2c6fffc3bab6f7d1ead093e6442d6a0b9b54167 Mon Sep 17 00:00:00 2001 From: Michael Wu Date: Wed, 13 May 2026 01:17:09 +0900 Subject: [PATCH 4/7] Allow numeric CID arrays while skipping owner payloads --- src/gmaps_scraper/parser.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/gmaps_scraper/parser.py b/src/gmaps_scraper/parser.py index 0203ce1..5fa2301 100644 --- a/src/gmaps_scraper/parser.py +++ b/src/gmaps_scraper/parser.py @@ -644,7 +644,8 @@ def _find_cid_in_value(value: JSONValue | None) -> str | None: return _normalize_cid_token(value) if not isinstance(value, list): return None - if _parse_list_owner(value) is not None: + owner = _parse_list_owner(value) + if owner is not None and (owner.photo_url is not None or owner.profile_id is not None): return None numeric_texts = [ From 5396fc6e9aaae0dff959a597f8fab2d8e95f4180 Mon Sep 17 00:00:00 2001 From: Michael Wu Date: Wed, 13 May 2026 07:59:02 +0900 Subject: [PATCH 5/7] Preserve search result panel text --- src/gmaps_scraper/place_scraper.py | 1 + tests/test_place_scraper.py | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/src/gmaps_scraper/place_scraper.py b/src/gmaps_scraper/place_scraper.py index fdb69d8..d4349a3 100644 --- a/src/gmaps_scraper/place_scraper.py +++ b/src/gmaps_scraper/place_scraper.py @@ -2421,6 +2421,7 @@ def _search_result_snapshot(candidate: str | Mapping[str, object]) -> dict[str, "category", "address", "search_result_description", + "panel_text", "body_text", ): value = candidate.get(key) diff --git a/tests/test_place_scraper.py b/tests/test_place_scraper.py index 3bafbad..799c314 100644 --- a/tests/test_place_scraper.py +++ b/tests/test_place_scraper.py @@ -51,6 +51,7 @@ _parse_price_amount, _parse_review_count, _search_result_candidate_url, + _search_result_snapshot, _seed_google_consent_cookies, _should_use_llm_repair, collect_place_snapshot, @@ -1993,6 +1994,30 @@ def test_build_place_details_rejects_sponsored_name_label(self) -> None: self.assertIsNone(details.name) + def test_search_result_snapshot_preserves_panel_text_for_fallback_parsing(self) -> None: + snapshot = _search_result_snapshot( + { + "href": "https://www.google.com/maps/place/Open+Kitchen", + "name": "Open Kitchen", + "panel_text": "\n".join( + [ + "Open Kitchen", + "Restaurant · MPF7+73 Shibuya, Tokyo, Japan", + ] + ), + } + ) + + details = _build_place_details( + "https://www.google.com/maps/place/Open+Kitchen", + resolved_url="https://www.google.com/maps/place/Open+Kitchen", + snapshot=snapshot, + ) + + self.assertEqual(snapshot["panel_text"], "Open Kitchen\nRestaurant · MPF7+73 Shibuya, Tokyo, Japan") + self.assertEqual(details.category, "Restaurant") + self.assertEqual(details.plus_code, "MPF7+73 Shibuya, Tokyo, Japan") + def test_build_place_details_rejects_ui_action_fallback_name_and_marks_diagnostics( self, ) -> None: From 6bac110bec281a874109956f137f00e60f88aaa3 Mon Sep 17 00:00:00 2001 From: Michael Wu Date: Wed, 13 May 2026 08:00:43 +0900 Subject: [PATCH 6/7] Fix lint in panel text test --- tests/test_place_scraper.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/test_place_scraper.py b/tests/test_place_scraper.py index 799c314..e7cb3cb 100644 --- a/tests/test_place_scraper.py +++ b/tests/test_place_scraper.py @@ -2014,7 +2014,10 @@ def test_search_result_snapshot_preserves_panel_text_for_fallback_parsing(self) snapshot=snapshot, ) - self.assertEqual(snapshot["panel_text"], "Open Kitchen\nRestaurant · MPF7+73 Shibuya, Tokyo, Japan") + self.assertEqual( + snapshot["panel_text"], + "Open Kitchen\nRestaurant · MPF7+73 Shibuya, Tokyo, Japan", + ) self.assertEqual(details.category, "Restaurant") self.assertEqual(details.plus_code, "MPF7+73 Shibuya, Tokyo, Japan") From b56905695316f3b60c053af51fccb59615381794 Mon Sep 17 00:00:00 2001 From: Michael Wu Date: Wed, 13 May 2026 13:12:00 +0900 Subject: [PATCH 7/7] Ignore tab strips in description fallback --- src/gmaps_scraper/place_scraper.py | 4 +++- tests/test_place_scraper.py | 4 ++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/gmaps_scraper/place_scraper.py b/src/gmaps_scraper/place_scraper.py index d4349a3..338a08b 100644 --- a/src/gmaps_scraper/place_scraper.py +++ b/src/gmaps_scraper/place_scraper.py @@ -113,6 +113,7 @@ "write a review", "claim this business", "suggest an edit", + "overview reviews about", "limited view of google maps", "get the most out of google maps", "our policies do not permit contributions to this type of place.", @@ -645,7 +646,8 @@ } if ( element.closest( - "button, a, [role='button'], [data-item-id], [data-review-id], div.F7nice", + "button, a, [role='button'], [role='tab'], [role='tablist'], " + + "[data-item-id], [data-review-id], div.F7nice", ) ) { continue; diff --git a/tests/test_place_scraper.py b/tests/test_place_scraper.py index e7cb3cb..53f9936 100644 --- a/tests/test_place_scraper.py +++ b/tests/test_place_scraper.py @@ -83,6 +83,7 @@ def test_place_js_extractor_skips_review_scoped_photo_nodes(self) -> None: self.assertIn("const descriptionValue = () => {", _PLACE_JS_EXTRACTOR) self.assertIn('firstText([".WeS02d", ".PYvSYb"])', _PLACE_JS_EXTRACTOR) self.assertIn("const descriptionBoundaryTop = () => {", _PLACE_JS_EXTRACTOR) + self.assertIn("[role='button'], [role='tab'], [role='tablist']", _PLACE_JS_EXTRACTOR) def test_place_js_extractor_uses_structural_panel_photo_selectors(self) -> None: self.assertIn("div.RZ66Rb button[jsaction*='heroHeaderImage'] img", _PLACE_JS_EXTRACTOR) @@ -1450,6 +1451,9 @@ def test_extract_preview_description_preserves_open_now_prose(self) -> None: def test_clean_description_text_rejects_sponsored_label(self) -> None: self.assertIsNone(_clean_description_text("Sponsored")) + def test_clean_description_text_rejects_place_tab_strip(self) -> None: + self.assertIsNone(_clean_description_text("Overview Reviews About")) + def test_clean_description_text_keeps_structured_marketing_summary(self) -> None: self.assertEqual( _clean_description_text(