From 43821db06234092ff5a6bf657830fc196dc6d5bb Mon Sep 17 00:00:00 2001 From: Abimael Martell Date: Wed, 20 May 2026 14:11:58 -0700 Subject: [PATCH] stage 3: fall back to justext/readability when picked subtree is suspiciously small MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The scored tree walk can lock onto a high-density-but-tiny element (an intro paragraph, a small component-grid wrapper) and miss the substantive main content elsewhere on the page. Symptom: extraction returns ≤5% of the body text on a page with thousands of chars of legitimate content, because the link-density penalty drove every link-dense subtree's aggregate score negative and the intro paragraph won by default. Adds a third fallback trigger: body_text_len >= 200 AND kept_text_len * 100 < body_text_len * 15 AND fallback_text > kept_text_len * 2. When all three hold, we treat Stage 3's choice as untrustworthy and use the fallback chain's result instead. The 15% threshold is empirical, tuned against a small real-world corpus. Empirical impact on a 23-URL spot-check: - One page where Stage 3 was picking a 3 KB component-nav menu instead of the substantive 30 KB body content: now correctly returns the body content; extraction_quality moves from 0.22 to 0.40 (above the 0.30 confidence-gate threshold the downstream caller uses). - 22 other URLs: extraction output unchanged. - Golden corpus: all 54 fixtures still pass. Known limitation NOT fixed by this patch: pages where almost all text is link text (e.g. table-layout listings of all-anchor rows). On these, text_len_excluding_links is near-zero for both body and chosen subtree, so the 15%-of-body trigger doesn't fire. A follow-up may need a separate "full text including links" comparison for that class of page. 
--- crates/html-extractor/src/lib.rs | 41 +++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/crates/html-extractor/src/lib.rs b/crates/html-extractor/src/lib.rs index fe62748..1456e45 100644 --- a/crates/html-extractor/src/lib.rs +++ b/crates/html-extractor/src/lib.rs @@ -68,15 +68,54 @@ pub fn extract(html: &str, options: &ExtractOptions) -> Result= 200 && kept_text_len * 100 < body_text_len * 15; let (final_root, quality, used_fallback) = if let Some(idx) = selected_root { if kept_text_len < min_len { + // (b): too short to be useful — fall through. let (fb_root, q) = fallback::fallback(&tree, options); (fb_root.or(Some(idx)), q.max(0.15), true) + } else if suspiciously_small { + // (c): try the fallback chain and pick whichever produced more + // text-excluding-links content. + let (fb_root, fb_q) = fallback::fallback(&tree, options); + let fb_text = fb_root + .map(|i| tree.text_len_excluding_links(i)) + .unwrap_or(0); + if fb_text > kept_text_len * 2 { + (fb_root, fb_q.max(0.2), true) + } else { + ( + Some(idx), + confidence_from_score(score, kept_text_len), + false, + ) + } } else { ( Some(idx),