From 43821db06234092ff5a6bf657830fc196dc6d5bb Mon Sep 17 00:00:00 2001
From: Abimael Martell <abimex@gmail.com>
Date: Wed, 20 May 2026 14:11:58 -0700
Subject: [PATCH] stage 3: fall back to justext/readability when picked subtree
 is suspiciously small
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The scored tree walk can lock onto a high-density-but-tiny element (an
intro paragraph, a small component-grid wrapper) and miss the substantive
main content elsewhere on the page. Symptom: extraction returns ≤5% of
the body text on a page with thousands of chars of legitimate content,
because the link-density penalty drove every link-dense subtree's
aggregate score negative and the intro paragraph won by default.

Adds a third fallback trigger:

  body_text_len >= 200
  AND kept_text_len * 100 < body_text_len * 15
  AND fallback_text > kept_text_len * 2

When all three hold, we treat Stage 3's choice as untrustworthy and use
the fallback chain's result instead. The 15% threshold is empirical,
tuned against a small real-world corpus.

Empirical impact on a 23-URL spot-check:

  - One page where Stage 3 was picking a 3 KB component-nav menu instead
    of the substantive 30 KB body content: now correctly returns the
    body content; extraction_quality moves from 0.22 to 0.40 (above the
    0.30 confidence-gate threshold the downstream caller uses).
  - 22 other URLs: extraction output unchanged.
  - Golden corpus: all 54 fixtures still pass.

Known limitation NOT fixed by this patch: pages where almost all text is
link text (e.g. table-layout listings of all-anchor rows). On these,
text_len_excluding_links is near-zero for both body and chosen subtree,
so the 15%-of-body trigger doesn't fire. A follow-up may need a separate
"full text including links" comparison for that class of page.
---
 crates/html-extractor/src/lib.rs | 41 +++++++++++++++++++++++++++++++-
 1 file changed, 40 insertions(+), 1 deletion(-)

diff --git a/crates/html-extractor/src/lib.rs b/crates/html-extractor/src/lib.rs
index fe62748..1456e45 100644
--- a/crates/html-extractor/src/lib.rs
+++ b/crates/html-extractor/src/lib.rs
@@ -68,15 +68,54 @@ pub fn extract(html: &str, options: &ExtractOptions) -> Result<ExtractResult, Ex
     let profile = scoring::profile_for(page_type);
     let (selected_root, score) = scoring::select_main(&tree, &profile, options);
 
-    // Stage 4: fallback if selected subtree is too small or absent.
+    // Stage 4: fallback if selected subtree is too small or absent. Three
+    // triggers:
+    //   (a) Stage 3 didn't pick anything (selected_root is None)
+    //   (b) The picked subtree's text is below the user-configured minimum
+    //   (c) The picked subtree is *suspiciously* small relative to the body
+    //       text — typically means the scored walk locked onto an intro
+    //       paragraph or product-nav and missed the real (link-dense, table-
+    //       layout, or component-grid) main content. We run the fallback
+    //       chain and adopt it only if it found >2x the text-excluding-links.
     let kept_text_len = selected_root
         .map(|idx| tree.text_len_excluding_links(idx))
         .unwrap_or(0);
     let min_len = options.min_extraction_length;
+    let body_text_len = if tree.body != usize::MAX {
+        tree.text_len_excluding_links(tree.body)
+    } else {
+        0
+    };
+    // Suspicious-pick threshold: chosen subtree has < 15% of body text-
+    // excluding-links AND body text is large enough (≥200 chars) that the
+    // disparity is meaningful. Catches a class of failure where the scored
+    // walk locks onto an intro paragraph or a small component-grid and misses
+    // the substantive main content elsewhere on the page (typically when the
+    // real content has high link density and the wider penalty regime drives
+    // its aggregate negative). 15% is empirical, tuned against a small real-
+    // world corpus.
+    let suspiciously_small = body_text_len >= 200 && kept_text_len * 100 < body_text_len * 15;
     let (final_root, quality, used_fallback) = if let Some(idx) = selected_root {
         if kept_text_len < min_len {
+            // (b): too short to be useful — fall through.
             let (fb_root, q) = fallback::fallback(&tree, options);
             (fb_root.or(Some(idx)), q.max(0.15), true)
+        } else if suspiciously_small {
+            // (c): try the fallback chain; switch to its result only when it
+            // has more than 2x the picked subtree's text-excluding-links.
+            let (fb_root, fb_q) = fallback::fallback(&tree, options);
+            let fb_text = fb_root
+                .map(|i| tree.text_len_excluding_links(i))
+                .unwrap_or(0);
+            if fb_text > kept_text_len * 2 {
+                (fb_root, fb_q.max(0.2), true)
+            } else {
+                (
+                    Some(idx),
+                    confidence_from_score(score, kept_text_len),
+                    false,
+                )
+            }
         } else {
             (
                 Some(idx),