From d550f3116fc2a8c8a2da76567ba8da10465551d4 Mon Sep 17 00:00:00 2001 From: hyperpolymath <6759885+hyperpolymath@users.noreply.github.com> Date: Tue, 26 May 2026 15:08:51 +0100 Subject: [PATCH] fix(assail): char-boundary-aware advance in Isabelle cartouche skip (regression from #49) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `skip_cartouche_end` advanced its byte index by 1 in the no-open/no-close branch, which puts `j` inside a multi-byte UTF-8 sequence when the cartouche body contains non-ASCII (`¬`, `∀`, `⟹`, `🎉`, etc.). The next iteration's `haystack[j..].starts_with(open)` then panics with thread 'main' panicked at src/assail/analyzer.rs:5648:20: start byte index 89 is not a char boundary; it is inside '¬' (...) discovered on `echidna` full-tree scans during the 2026-05-26 estate reconnaissance. Subsetting to `src/` worked around it; the proper fix is to advance by `len_utf8()` of the current char. Two regression tests: one with a sampled echidna text-cartouche body (`¬¬A`, `∀x`, `⟹`), one with a 4-byte UTF-8 emoji to exercise the full multi-byte range. Reproducer (full-tree assail on echidna) now completes cleanly with 44 weak points. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/assail/analyzer.rs | 41 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/src/assail/analyzer.rs b/src/assail/analyzer.rs index 0d697ba..8f1743d 100644 --- a/src/assail/analyzer.rs +++ b/src/assail/analyzer.rs @@ -5641,6 +5641,11 @@ fn skip_antiquotation_end(haystack: &str, start: usize) -> usize { /// Given a position just past the first `\` (depth = 1), scan forward and /// return the byte index just past the matching `\`. Nesting-aware. +/// +/// The byte-advance branch steps by `char.len_utf8()` rather than 1 so the +/// next iteration's `haystack[j..]` slice lands on a UTF-8 char boundary — +/// Isabelle prose routinely contains non-ASCII (e.g. `¬`, `∀`, `⟹`) inside +/// `text \...\` blocks. fn skip_cartouche_end(haystack: &str, after_open: usize, open: &str, close: &str) -> usize { let mut depth: i32 = 1; let mut j = after_open; @@ -5652,7 +5657,13 @@ fn skip_cartouche_end(haystack: &str, after_open: usize, open: &str, close: &str depth -= 1; j += close.len(); } else { - j += 1; + // Step by char width to keep `j` on a UTF-8 boundary. `chars()` + // is guaranteed non-empty here because `j < haystack.len()`. + j += haystack[j..] + .chars() + .next() + .map(|c| c.len_utf8()) + .unwrap_or(1); } } j @@ -5924,6 +5935,34 @@ mod tests { ); } + #[test] + fn isabelle_cartouche_with_non_ascii_does_not_panic() { + // Regression: prior to the char-boundary fix in skip_cartouche_end, + // any non-ASCII byte inside a `\...\` block panicked + // with "byte index N is not a char boundary" because the helper + // advanced by 1 byte at a time through multi-byte UTF-8 sequences. + // This content is shortened from a real echidna .thy file that + // triggered the panic on a full-tree assail scan. + let src = "text \\\n Double negation: ¬¬A ≡ A.\n ∀x. ¬(P x) ⟹ True\n\\\nlemma foo: True by simp"; + let stripped = strip_isabelle_prose(src); + assert!( + !stripped.contains("¬¬"), + "cartouche body containing non-ASCII not stripped: {stripped:?}" + ); + assert!(stripped.contains("lemma foo")); + } + + #[test] + fn isabelle_cartouche_emoji_grapheme_clusters() { + // 4-byte UTF-8 sequences (U+1F389 🎉) inside a cartouche also + // exercise the char-boundary path. A surrogate-pair-style multi- + // byte char being skipped by 1-byte advances is the same bug. + let src = "section \\release notes 🎉 — celebrate!\\\nlemma g: True by simp"; + let stripped = strip_isabelle_prose(src); + assert!(!stripped.contains("🎉"), "got: {stripped:?}"); + assert!(stripped.contains("lemma g")); + } + #[test] fn isabelle_nested_cartouches() { // \ can nest. A cartouche containing another cartouche