From 3db35ffa9792880a9dda7fc08a2aa3d3b70dabe5 Mon Sep 17 00:00:00 2001 From: Abimael Martell Date: Wed, 20 May 2026 14:20:08 -0700 Subject: [PATCH] stage 3: also trigger fallback when chosen subtree is <5% of body full-text MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The suspiciously-small fallback trigger added in the previous patch compares text-excluding-links on both sides. On pages where nearly all text IS link text (table-layout listings of all-anchor rows), text_len_excluding_links is near-zero for both the body and the chosen subtree, so the disparity ratio can't be computed meaningfully and the trigger doesn't fire. Adds a parallel full-text comparison: trigger fallback when body_full_text >= 1000 AND kept_full_text * 100 < body_full_text * 5. The new condition is OR'd with the existing excl-links 15% check. The tighter 5% threshold and the 1000-char minimum body length avoid false positives on small marketing pages whose hero/footer are link-heavy. Empirical impact on a 23-URL spot-check: - One link-heavy listing page: 0.1 KB → 2.8 KB (28×); extraction_quality 0.10 → 0.20. Output is the actual list items, not just the intro paragraph the scored walk had been locking onto. - 22 other URLs: extraction output unchanged. - Golden corpus: 54/54 fixtures still pass. - 37/37 unit + integration + doctests pass. 
--- crates/html-extractor/src/lib.rs | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/crates/html-extractor/src/lib.rs b/crates/html-extractor/src/lib.rs index 1456e45..3d2a848 100644 --- a/crates/html-extractor/src/lib.rs +++ b/crates/html-extractor/src/lib.rs @@ -94,7 +94,25 @@ pub fn extract(html: &str, options: &ExtractOptions) -> Result= 200 && kept_text_len * 100 < body_text_len * 15; + let suspiciously_small_excl_links = + body_text_len >= 200 && kept_text_len * 100 < body_text_len * 15; + // Link-heavy variant: when nearly all body text IS link text (table- + // layout listings of all-anchor rows), text_len_excluding_links is near- + // zero for both the body and the chosen subtree, so the excl-links ratio + // above can't detect the disparity. Use full-text on both sides with a + // tighter 5% threshold and a minimum 1000-char body to avoid false- + // positive triggers on small marketing pages. + let kept_full_text = selected_root + .map(|idx| tree.full_text(idx).chars().count()) + .unwrap_or(0); + let body_full_text = if tree.body != usize::MAX { + tree.full_text(tree.body).chars().count() + } else { + 0 + }; + let suspiciously_small_full = + body_full_text >= 1000 && kept_full_text * 100 < body_full_text * 5; + let suspiciously_small = suspiciously_small_excl_links || suspiciously_small_full; let (final_root, quality, used_fallback) = if let Some(idx) = selected_root { if kept_text_len < min_len { // (b): too short to be useful — fall through.