diff --git a/encodings/fsst/src/dfa/mod.rs b/encodings/fsst/src/dfa/mod.rs index 358fd3a7ab5..01abf95d3b9 100644 --- a/encodings/fsst/src/dfa/mod.rs +++ b/encodings/fsst/src/dfa/mod.rs @@ -208,6 +208,13 @@ enum LikeKind<'a> { impl<'a> LikeKind<'a> { fn parse(pattern: &'a [u8]) -> Option { + // The fast-path matchers below do not understand SQL LIKE escape sequences (e.g. `\%` + // matching a literal `%`). If the pattern contains a backslash we fall back to the + // general implementation, which correctly interprets escapes. + if pattern.contains(&b'\\') { + return None; + } + // `prefix%` (including just `%` where prefix is empty) if let Some(prefix) = pattern.strip_suffix(b"%") && !prefix.contains(&b'%') diff --git a/encodings/fsst/src/dfa/tests.rs b/encodings/fsst/src/dfa/tests.rs index 6ad30ca685d..320cdf3372c 100644 --- a/encodings/fsst/src/dfa/tests.rs +++ b/encodings/fsst/src/dfa/tests.rs @@ -64,6 +64,16 @@ fn test_like_kind_parse() { // Suffix and underscore patterns are not supported. assert!(LikeKind::parse(b"%suffix").is_none()); assert!(LikeKind::parse(b"a_c").is_none()); + + // Patterns containing the SQL LIKE escape character must not be parsed by the fast path, + // because that path treats `\` as an ordinary byte and would misinterpret escapes. For + // example, `%\%` (the pattern produced by Spark's `endsWith("%")`) means "ends with `%`", + // not "contains `\`". The fast path should bail so the general implementation handles it. + assert!(LikeKind::parse(br"%\%").is_none()); + assert!(LikeKind::parse(br"\%%").is_none()); + assert!(LikeKind::parse(br"%\_%").is_none()); + assert!(LikeKind::parse(br"\_%").is_none()); + assert!(LikeKind::parse(br"%\\%").is_none()); } /// No symbols — all bytes escaped. Simplest case to see the two tables.