From f2328982d4baa4520091dafc7aa0576cfe24bd12 Mon Sep 17 00:00:00 2001 From: Kazantsev Maksim Date: Thu, 5 Mar 2026 20:29:55 +0400 Subject: [PATCH 01/19] Spark soundex function implementation --- datafusion/spark/src/function/string/mod.rs | 4 + .../spark/src/function/string/soundex.rs | 135 ++++++++++++++++++ .../test_files/spark/string/soundex.slt | 17 ++- 3 files changed, 147 insertions(+), 9 deletions(-) create mode 100644 datafusion/spark/src/function/string/soundex.rs diff --git a/datafusion/spark/src/function/string/mod.rs b/datafusion/spark/src/function/string/mod.rs index 8859beca77996..7bcdac5d85474 100644 --- a/datafusion/spark/src/function/string/mod.rs +++ b/datafusion/spark/src/function/string/mod.rs @@ -25,6 +25,7 @@ pub mod ilike; pub mod length; pub mod like; pub mod luhn_check; +pub mod soundex; pub mod space; pub mod substring; @@ -45,6 +46,7 @@ make_udf_function!(format_string::FormatStringFunc, format_string); make_udf_function!(space::SparkSpace, space); make_udf_function!(substring::SparkSubstring, substring); make_udf_function!(base64::SparkUnBase64, unbase64); +make_udf_function!(soundex::SparkSoundex, soundex); pub mod expr_fn { use datafusion_functions::export_functions; @@ -110,6 +112,7 @@ pub mod expr_fn { "Decodes the input string `str` from a base64 string into binary data.", str )); + export_functions!((soundex, "Returns Soundex code of the string.", str)); } pub fn functions() -> Vec> { @@ -127,5 +130,6 @@ pub fn functions() -> Vec> { space(), substring(), unbase64(), + soundex(), ] } diff --git a/datafusion/spark/src/function/string/soundex.rs b/datafusion/spark/src/function/string/soundex.rs new file mode 100644 index 0000000000000..3b47de6ada9c6 --- /dev/null +++ b/datafusion/spark/src/function/string/soundex.rs @@ -0,0 +1,135 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::array::{ArrayRef, OffsetSizeTrait, StringArray}; +use arrow::datatypes::DataType; +use datafusion::logical_expr::{ColumnarValue, Signature, Volatility}; +use datafusion_common::cast::as_generic_string_array; +use datafusion_common::utils::take_function_args; +use datafusion_common::{Result, exec_err}; +use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl}; +use datafusion_functions::utils::make_scalar_function; +use std::any::Any; +use std::sync::Arc; + +/// Spark-compatible `soundex` expression +/// +#[derive(Debug, PartialEq, Eq, Hash)] +pub struct SparkSoundex { + signature: Signature, +} + +impl Default for SparkSoundex { + fn default() -> Self { + Self::new() + } +} + +impl SparkSoundex { + pub fn new() -> Self { + Self { + signature: Signature::string(1, Volatility::Immutable), + } + } +} + +impl ScalarUDFImpl for SparkSoundex { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "soundex" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Utf8) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + make_scalar_function(spark_soundex_inner, vec![])(&args.args) + } +} + +fn spark_soundex_inner(arg: &[ArrayRef]) -> Result { + let [array] = take_function_args("soundex", arg)?; + match &array.data_type() { + DataType::Utf8 | DataType::Utf8View => soundex::(array), + DataType::LargeUtf8 => soundex::(array), + other => { + exec_err!("unsupported data type {other:?} for function `soundex`") + } + } +} + +fn soundex(array: &ArrayRef) -> Result { + let str_array = as_generic_string_array::(array)?; + + let result = str_array + .iter() + .map(|s| s.map(compute_soundex)) + .collect::(); + + Ok(Arc::new(result)) +} + +fn compute_soundex(s: &str) -> String { + let mut chars = s.chars().filter(|c| c.is_ascii_alphabetic()); + + let first_ch = match chars.next() { + Some(c) => c.to_ascii_uppercase(), + None => return "".to_string(), + }; + + let mut result = String::with_capacity(4); + result.push(first_ch); + let mut last_code = classify_char(first_ch); + + for c in chars { + if result.len() >= 4 { + break; + } + let current = classify_char(c); + if let Some(digit) = current { + if current != last_code { + result.push(digit); + } + } + last_code = current; + } + + while result.len() < 4 { + result.push('0'); + } + result +} + +fn classify_char(c: char) -> Option { + match c.to_ascii_uppercase() { + 'B' | 'F' | 'P' | 'V' => Some('1'), + 'C' | 'G' | 'J' | 'K' | 'Q' | 'S' | 'X' | 'Z' => Some('2'), + 'D' | 'T' => Some('3'), + 'L' => Some('4'), + 'M' | 'N' => Some('5'), + 'R' => Some('6'), + _ => None, // A, E, I, O, U, H, W, Y + } +} diff --git a/datafusion/sqllogictest/test_files/spark/string/soundex.slt b/datafusion/sqllogictest/test_files/spark/string/soundex.slt index f0c46e10fd1de..5a109c294c716 100644 --- a/datafusion/sqllogictest/test_files/spark/string/soundex.slt +++ b/datafusion/sqllogictest/test_files/spark/string/soundex.slt @@ -15,13 +15,12 @@ # specific language governing permissions and limitations # under the License. -# This file was originally created by a porting script from: -# https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function -# This file is part of the implementation of the datafusion-spark function library. -# For more information, please see: -# https://github.com/apache/datafusion/issues/15914 +query T +SELECT soundex('Miller'::string); +---- +M460 -## Original Query: SELECT soundex('Miller'); -## PySpark 3.5.5 Result: {'soundex(Miller)': 'M460', 'typeof(soundex(Miller))': 'string', 'typeof(Miller)': 'string'} -#query -#SELECT soundex('Miller'::string); +query T +SELECT soundex(NULL); +---- +NULL From e2aadb341c2aa528580b631a555ede37eefc262e Mon Sep 17 00:00:00 2001 From: Kazantsev Maksim Date: Thu, 5 Mar 2026 20:41:39 +0400 Subject: [PATCH 02/19] Add more tests --- .../test_files/spark/string/soundex.slt | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/datafusion/sqllogictest/test_files/spark/string/soundex.slt b/datafusion/sqllogictest/test_files/spark/string/soundex.slt index 5a109c294c716..c6d905bcfb265 100644 --- a/datafusion/sqllogictest/test_files/spark/string/soundex.slt +++ b/datafusion/sqllogictest/test_files/spark/string/soundex.slt @@ -24,3 +24,23 @@ query T SELECT soundex(NULL); ---- NULL + +query T +SELECT soundex(''::string); +---- +(empty) + +query T +SELECT soundex('Apache Spark'::string); +---- +A122 + +query T +SELECT soundex('123'::string); +---- +123 + +query T +SELECT soundex('Datafusion'::string); +---- +D312 From 37c339065ce0cd5c633ac4bae344dc6080ce731b Mon Sep 17 00:00:00 2001 From: Kazantsev Maksim Date: Thu, 5 Mar 2026 23:01:46 +0400 Subject: [PATCH 03/19] Clippy fixing --- datafusion/spark/src/function/string/soundex.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/datafusion/spark/src/function/string/soundex.rs b/datafusion/spark/src/function/string/soundex.rs index 3b47de6ada9c6..f8a97c3d4d5dd 100644 --- a/datafusion/spark/src/function/string/soundex.rs +++ b/datafusion/spark/src/function/string/soundex.rs @@ -108,14 +108,11 @@ fn compute_soundex(s: &str) -> String { break; } let current = classify_char(c); - if let Some(digit) = current { - if current != last_code { - result.push(digit); - } + if let Some(digit) = current && current != last_code { + result.push(digit); } last_code = current; } - while result.len() < 4 { result.push('0'); } From 5058986e644c066d81fc788f3ecf330f61942cea Mon Sep 17 00:00:00 2001 From: Kazantsev Maksim Date: Thu, 5 Mar 2026 23:07:30 +0400 Subject: [PATCH 04/19] Clippy fixing --- datafusion/spark/src/function/string/soundex.rs | 6 ++++-- .../sqllogictest/test_files/spark/string/soundex.slt | 10 +++++----- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/datafusion/spark/src/function/string/soundex.rs b/datafusion/spark/src/function/string/soundex.rs index f8a97c3d4d5dd..8044363893684 100644 --- a/datafusion/spark/src/function/string/soundex.rs +++ b/datafusion/spark/src/function/string/soundex.rs @@ -72,7 +72,7 @@ impl ScalarUDFImpl for SparkSoundex { fn spark_soundex_inner(arg: &[ArrayRef]) -> Result { let [array] = take_function_args("soundex", arg)?; match &array.data_type() { - DataType::Utf8 | DataType::Utf8View => soundex::(array), + DataType::Utf8 => soundex::(array), DataType::LargeUtf8 => soundex::(array), other => { exec_err!("unsupported data type {other:?} for function `soundex`") @@ -108,7 +108,9 @@ fn compute_soundex(s: &str) -> String { break; } let current = classify_char(c); - if let Some(digit) = current && current != last_code { + if let Some(digit) = current + && current != last_code + { result.push(digit); } last_code = current; diff --git a/datafusion/sqllogictest/test_files/spark/string/soundex.slt b/datafusion/sqllogictest/test_files/spark/string/soundex.slt index c6d905bcfb265..1f202f9c8b686 100644 --- a/datafusion/sqllogictest/test_files/spark/string/soundex.slt +++ b/datafusion/sqllogictest/test_files/spark/string/soundex.slt @@ -16,7 +16,7 @@ # under the License. query T -SELECT soundex('Miller'::string); +SELECT soundex('Miller'); ---- M460 @@ -26,21 +26,21 @@ SELECT soundex(NULL); NULL query T -SELECT soundex(''::string); +SELECT soundex(''); ---- (empty) query T -SELECT soundex('Apache Spark'::string); +SELECT soundex('Apache Spark'); ---- A122 query T -SELECT soundex('123'::string); +SELECT soundex('123'); ---- 123 query T -SELECT soundex('Datafusion'::string); +SELECT soundex('Datafusion'); ---- D312 From 5682c4f945dffb566d2a9c01799874c7f28ad77a Mon Sep 17 00:00:00 2001 From: Kazantsev Maksim Date: Fri, 6 Mar 2026 20:43:53 +0400 Subject: [PATCH 05/19] Fix compute_soundex --- datafusion/spark/src/function/string/soundex.rs | 4 ++++ datafusion/sqllogictest/test_files/spark/string/soundex.slt | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/datafusion/spark/src/function/string/soundex.rs b/datafusion/spark/src/function/string/soundex.rs index 8044363893684..045a42511136b 100644 --- a/datafusion/spark/src/function/string/soundex.rs +++ b/datafusion/spark/src/function/string/soundex.rs @@ -99,6 +99,10 @@ fn compute_soundex(s: &str) -> String { None => return "".to_string(), }; + if first_ch.is_ascii_digit() { + return s.to_string() + } + let mut result = String::with_capacity(4); result.push(first_ch); let mut last_code = classify_char(first_ch); diff --git a/datafusion/sqllogictest/test_files/spark/string/soundex.slt b/datafusion/sqllogictest/test_files/spark/string/soundex.slt index 1f202f9c8b686..b1f98fe167d1e 100644 --- a/datafusion/sqllogictest/test_files/spark/string/soundex.slt +++ b/datafusion/sqllogictest/test_files/spark/string/soundex.slt @@ -40,6 +40,11 @@ SELECT soundex('123'); ---- 123 +query T +SELECT soundex('a123'); +---- +A000 + query T SELECT soundex('Datafusion'); ---- From 9b014ec3d05ef77aa86859e90d6aa73ff7630c9d Mon Sep 17 00:00:00 2001 From: Kazantsev Maksim Date: Fri, 6 Mar 2026 20:58:15 +0400 Subject: [PATCH 06/19] Fix compute_soundex --- datafusion/spark/src/function/string/soundex.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/datafusion/spark/src/function/string/soundex.rs b/datafusion/spark/src/function/string/soundex.rs index 045a42511136b..c7046baa73c73 100644 --- a/datafusion/spark/src/function/string/soundex.rs +++ b/datafusion/spark/src/function/string/soundex.rs @@ -92,6 +92,10 @@ fn soundex(array: &ArrayRef) -> Result { } fn compute_soundex(s: &str) -> String { + if s.chars().next().map_or(false, |c| c.is_ascii_digit()) { + return s.to_string() + } + let mut chars = s.chars().filter(|c| c.is_ascii_alphabetic()); let first_ch = match chars.next() { @@ -99,10 +103,6 @@ fn compute_soundex(s: &str) -> String { None => return "".to_string(), }; - if first_ch.is_ascii_digit() { - return s.to_string() - } - let mut result = String::with_capacity(4); result.push(first_ch); let mut last_code = classify_char(first_ch); From 74569b44c2b7ecd6d92feaf272fac2ba08dca089 Mon Sep 17 00:00:00 2001 From: Kazantsev Maksim Date: Fri, 6 Mar 2026 21:00:38 +0400 Subject: [PATCH 07/19] Fix compute_soundex --- datafusion/spark/src/function/string/soundex.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/spark/src/function/string/soundex.rs b/datafusion/spark/src/function/string/soundex.rs index c7046baa73c73..2ff9428d414ce 100644 --- a/datafusion/spark/src/function/string/soundex.rs +++ b/datafusion/spark/src/function/string/soundex.rs @@ -93,7 +93,7 @@ fn soundex(array: &ArrayRef) -> Result { fn compute_soundex(s: &str) -> String { if s.chars().next().map_or(false, |c| c.is_ascii_digit()) { - return s.to_string() + return s.to_string(); } let mut chars = s.chars().filter(|c| c.is_ascii_alphabetic()); From b89175e16a47e6e5a26e16cfa165b72ea8502dff Mon Sep 17 00:00:00 2001 From: Kazantsev Maksim Date: Fri, 6 Mar 2026 21:07:43 +0400 Subject: [PATCH 08/19] Clippy fixing --- datafusion/spark/src/function/string/soundex.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/spark/src/function/string/soundex.rs b/datafusion/spark/src/function/string/soundex.rs index 2ff9428d414ce..332d7bb51df7d 100644 --- a/datafusion/spark/src/function/string/soundex.rs +++ b/datafusion/spark/src/function/string/soundex.rs @@ -92,7 +92,7 @@ fn soundex(array: &ArrayRef) -> Result { } fn compute_soundex(s: &str) -> String { - if s.chars().next().map_or(false, |c| c.is_ascii_digit()) { + if s.chars().next().is_some_and(|c| c.is_ascii_digit()) { return s.to_string(); } From 25c763d27bd611e1e7cf9433affa621a565f269c Mon Sep 17 00:00:00 2001 From: Kazantsev Maksim Date: Sat, 14 Mar 2026 21:46:25 +0400 Subject: [PATCH 09/19] Ad more slt tests --- .../test_files/spark/string/soundex.slt | 150 ++++++++++++++++++ 1 file changed, 150 insertions(+) diff --git a/datafusion/sqllogictest/test_files/spark/string/soundex.slt b/datafusion/sqllogictest/test_files/spark/string/soundex.slt index b1f98fe167d1e..6e5b1f01af278 100644 --- a/datafusion/sqllogictest/test_files/spark/string/soundex.slt +++ b/datafusion/sqllogictest/test_files/spark/string/soundex.slt @@ -49,3 +49,153 @@ query T SELECT soundex('Datafusion'); ---- D312 + +query T +SELECT soundex('Ashcroft'); +---- +A261 + +query T +SELECT soundex('B1B'); +---- +B100 + +query T +SELECT soundex('B B'); +---- +B100 + +query T +SELECT soundex('BAB'); +---- +B100 + +query T +SELECT soundex('#hello'); +---- +#hello + +query T +SELECT soundex(' hello'); +---- + hello + +query T +SELECT soundex(' '); +---- +(empty) + +query T +SELECT soundex('\thello'); +---- + hello + +query T +SELECT soundex('😀hello'); +---- +😀hello + +query T +SELECT soundex('123'); +---- +123 + +query T +SELECT soundex('1abc'); +---- +1abc + +query T +SELECT soundex('A'); +---- +A000 + +query T +SELECT soundex('BFPV'); +---- +B000 + +query T +SELECT soundex('Robert'); +---- +R163 + +query T +SELECT soundex('Rupert'); +---- +R163 + +query T +SELECT soundex(NULL); +---- +NULL + +query T +SELECT soundex(''); +---- +(empty) + +query T +SELECT soundex('robert'); +---- +R163 + +query T +SELECT soundex('rObErT'); +---- +R163 + +query T +SELECT soundex('Müller'); +---- +M460 + +query T +SELECT soundex('Abcdefghijklmnop'); +---- +A123 + +query T +SELECT soundex('Lloyd'); +---- +L300 + +query T +SELECT soundex('BWB'); +---- +B000 + +query T +SELECT soundex('BHB'); +---- +B000 + +query T +SELECT soundex('Tymczak'); +---- +T522 + +query T +SELECT soundex('Aeiou'); +---- +A000 + +query T +SELECT soundex('1Robert'); +---- +1Robert + +query T +SELECT soundex('Smith-Jones'); +---- +S532 + +query T +SELECT soundex('#'); +---- +# + +query T +SELECT soundex('\nhello'); +---- +\nhello From 3965da1634f88881509e23361ed911514c1c50c0 Mon Sep 17 00:00:00 2001 From: Kazantsev Maksim Date: Sat, 14 Mar 2026 21:48:42 +0400 Subject: [PATCH 10/19] Add more slt tests --- datafusion/sqllogictest/test_files/spark/string/soundex.slt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/sqllogictest/test_files/spark/string/soundex.slt b/datafusion/sqllogictest/test_files/spark/string/soundex.slt index 6e5b1f01af278..c29f45dc43153 100644 --- a/datafusion/sqllogictest/test_files/spark/string/soundex.slt +++ b/datafusion/sqllogictest/test_files/spark/string/soundex.slt @@ -83,12 +83,12 @@ SELECT soundex(' hello'); query T SELECT soundex(' '); ---- -(empty) + query T SELECT soundex('\thello'); ---- - hello +\thello query T SELECT soundex('😀hello'); From 11531248a29246179b57dd0d183a48608b5eaea3 Mon Sep 17 00:00:00 2001 From: Kazantsev Maksim Date: Sun, 15 Mar 2026 10:22:15 +0400 Subject: [PATCH 11/19] fix --- .../spark/src/function/string/soundex.rs | 77 +++++++++++-------- 1 file changed, 43 insertions(+), 34 deletions(-) diff --git a/datafusion/spark/src/function/string/soundex.rs b/datafusion/spark/src/function/string/soundex.rs index 332d7bb51df7d..7c8e27d4486af 100644 --- a/datafusion/spark/src/function/string/soundex.rs +++ b/datafusion/spark/src/function/string/soundex.rs @@ -91,48 +91,57 @@ fn soundex(array: &ArrayRef) -> Result { Ok(Arc::new(result)) } +const US_ENGLISH_MAPPING: [u8; 26] = [ + b'0', b'1', b'2', b'3', b'0', b'1', b'2', b'7', b'0', b'2', b'2', b'4', b'5', b'5', + b'0', b'1', b'2', b'6', b'2', b'3', b'0', b'1', b'7', b'2', b'0', b'2', +]; + fn compute_soundex(s: &str) -> String { - if s.chars().next().is_some_and(|c| c.is_ascii_digit()) { - return s.to_string(); + let bytes = s.as_bytes(); + if bytes.is_empty() { + return String::new(); } - let mut chars = s.chars().filter(|c| c.is_ascii_alphabetic()); + let mut b = bytes[0]; + + if (b'a'..=b'z').contains(&b) { + b -= 32; + } else if !(b'A'..=b'Z').contains(&b) { + return s.to_string(); + } - let first_ch = match chars.next() { - Some(c) => c.to_ascii_uppercase(), - None => return "".to_string(), - }; + let mut sx = [b'0', b'0', b'0', b'0']; + sx[0] = b; + let mut sxi = 1; + let idx = (b - b'A') as usize; + let mut last_code = US_ENGLISH_MAPPING[idx]; - let mut result = String::with_capacity(4); - result.push(first_ch); - let mut last_code = classify_char(first_ch); + for i in bytes.iter().skip(1) { + let mut b = *i; - for c in chars { - if result.len() >= 4 { - break; + if (b'a'..=b'z').contains(&b) { + b -= 32; + } else if !(b'A'..=b'Z').contains(&b) { + last_code = b'0'; + continue; } - let current = classify_char(c); - if let Some(digit) = current - && current != last_code - { - result.push(digit); + + let idx = (b - b'A') as usize; + let code = US_ENGLISH_MAPPING[idx]; + + if code == b'7' { + continue; + } else { + if code != b'0' && code != last_code { + sx[sxi] = code; + sxi += 1; + if sxi > 3 { + break; + } + } + last_code = code; } - last_code = current; - } - while result.len() < 4 { - result.push('0'); } - result -} -fn classify_char(c: char) -> Option { - match c.to_ascii_uppercase() { - 'B' | 'F' | 'P' | 'V' => Some('1'), - 'C' | 'G' | 'J' | 'K' | 'Q' | 'S' | 'X' | 'Z' => Some('2'), - 'D' | 'T' => Some('3'), - 'L' => Some('4'), - 'M' | 'N' => Some('5'), - 'R' => Some('6'), - _ => None, // A, E, I, O, U, H, W, Y - } + String::from_utf8_lossy(&sx).to_string() } From 1a867bfe334e6bd38448016b3790741281e4e869 Mon Sep 17 00:00:00 2001 From: Kazantsev Maksim Date: Sun, 15 Mar 2026 10:30:46 +0400 Subject: [PATCH 12/19] fix --- datafusion/spark/src/function/string/soundex.rs | 8 ++++---- .../sqllogictest/test_files/spark/string/soundex.slt | 5 ----- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/datafusion/spark/src/function/string/soundex.rs b/datafusion/spark/src/function/string/soundex.rs index 7c8e27d4486af..692e9dff08cd9 100644 --- a/datafusion/spark/src/function/string/soundex.rs +++ b/datafusion/spark/src/function/string/soundex.rs @@ -104,9 +104,9 @@ fn compute_soundex(s: &str) -> String { let mut b = bytes[0]; - if (b'a'..=b'z').contains(&b) { + if b.is_ascii_lowercase() { b -= 32; - } else if !(b'A'..=b'Z').contains(&b) { + } else if !b.is_ascii_uppercase() { return s.to_string(); } @@ -119,9 +119,9 @@ fn compute_soundex(s: &str) -> String { for i in bytes.iter().skip(1) { let mut b = *i; - if (b'a'..=b'z').contains(&b) { + if b.is_ascii_lowercase() { b -= 32; - } else if !(b'A'..=b'Z').contains(&b) { + } else if !b.is_ascii_uppercase() { last_code = b'0'; continue; } diff --git a/datafusion/sqllogictest/test_files/spark/string/soundex.slt b/datafusion/sqllogictest/test_files/spark/string/soundex.slt index c29f45dc43153..0970914ed12a4 100644 --- a/datafusion/sqllogictest/test_files/spark/string/soundex.slt +++ b/datafusion/sqllogictest/test_files/spark/string/soundex.slt @@ -80,11 +80,6 @@ SELECT soundex(' hello'); ---- hello -query T -SELECT soundex(' '); ----- - - query T SELECT soundex('\thello'); ---- From 6d47d8369923066eb0acbc8f44d5b4ae48dc0e69 Mon Sep 17 00:00:00 2001 From: Kazantsev Maksim Date: Sun, 15 Mar 2026 10:47:46 +0400 Subject: [PATCH 13/19] fix --- datafusion/sqllogictest/test_files/spark/string/soundex.slt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/datafusion/sqllogictest/test_files/spark/string/soundex.slt b/datafusion/sqllogictest/test_files/spark/string/soundex.slt index 0970914ed12a4..d321ad36d3159 100644 --- a/datafusion/sqllogictest/test_files/spark/string/soundex.slt +++ b/datafusion/sqllogictest/test_files/spark/string/soundex.slt @@ -194,3 +194,8 @@ query T SELECT soundex('\nhello'); ---- \nhello + +query T +SELECT soundex(' '); +---- + From 061c6b168b62c122474ae03798d669c766f1881a Mon Sep 17 00:00:00 2001 From: Kazantsev Maksim Date: Sun, 15 Mar 2026 10:49:35 +0400 Subject: [PATCH 14/19] fix --- datafusion/sqllogictest/test_files/spark/string/soundex.slt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/sqllogictest/test_files/spark/string/soundex.slt b/datafusion/sqllogictest/test_files/spark/string/soundex.slt index d321ad36d3159..ec85c4bd40b24 100644 --- a/datafusion/sqllogictest/test_files/spark/string/soundex.slt +++ b/datafusion/sqllogictest/test_files/spark/string/soundex.slt @@ -196,6 +196,6 @@ SELECT soundex('\nhello'); \nhello query T -SELECT soundex(' '); +SELECT concat(soundex(' '), 'Spark') ---- - + Spark From d7852bf1c501e826dc582502aca2e29aad85def0 Mon Sep 17 00:00:00 2001 From: Kazantsev Maksim Date: Sun, 15 Mar 2026 18:04:59 +0400 Subject: [PATCH 15/19] fix --- datafusion/spark/src/function/string/soundex.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/datafusion/spark/src/function/string/soundex.rs b/datafusion/spark/src/function/string/soundex.rs index 692e9dff08cd9..b46462648bf29 100644 --- a/datafusion/spark/src/function/string/soundex.rs +++ b/datafusion/spark/src/function/string/soundex.rs @@ -110,8 +110,7 @@ fn compute_soundex(s: &str) -> String { return s.to_string(); } - let mut sx = [b'0', b'0', b'0', b'0']; - sx[0] = b; + let mut soundex_code = [b, b'0', b'0', b'0']; let mut sxi = 1; let idx = (b - b'A') as usize; let mut last_code = US_ENGLISH_MAPPING[idx]; @@ -133,7 +132,7 @@ fn compute_soundex(s: &str) -> String { continue; } else { if code != b'0' && code != last_code { - sx[sxi] = code; + soundex_code[sxi] = code; sxi += 1; if sxi > 3 { break; @@ -143,5 +142,5 @@ fn compute_soundex(s: &str) -> String { } } - String::from_utf8_lossy(&sx).to_string() + String::from_utf8_lossy(&soundex_code).to_string() } From cab229a23ddd8b3d99fa33dc6f4b95b190ada9a3 Mon Sep 17 00:00:00 2001 From: Kazantsev Maksim Date: Sun, 15 Mar 2026 18:07:17 +0400 Subject: [PATCH 16/19] fix --- .../spark/src/function/string/soundex.rs | 77 +++++++++---------- 1 file changed, 38 insertions(+), 39 deletions(-) diff --git a/datafusion/spark/src/function/string/soundex.rs b/datafusion/spark/src/function/string/soundex.rs index b46462648bf29..7b2a9daa0d2a1 100644 --- a/datafusion/spark/src/function/string/soundex.rs +++ b/datafusion/spark/src/function/string/soundex.rs @@ -91,56 +91,55 @@ fn soundex(array: &ArrayRef) -> Result { Ok(Arc::new(result)) } -const US_ENGLISH_MAPPING: [u8; 26] = [ - b'0', b'1', b'2', b'3', b'0', b'1', b'2', b'7', b'0', b'2', b'2', b'4', b'5', b'5', - b'0', b'1', b'2', b'6', b'2', b'3', b'0', b'1', b'7', b'2', b'0', b'2', -]; - fn compute_soundex(s: &str) -> String { - let bytes = s.as_bytes(); - if bytes.is_empty() { + if s.is_empty() { return String::new(); } - let mut b = bytes[0]; - - if b.is_ascii_lowercase() { - b -= 32; - } else if !b.is_ascii_uppercase() { - return s.to_string(); - } + let mut chars = s.chars(); + let first_char = chars.next().unwrap(); - let mut soundex_code = [b, b'0', b'0', b'0']; - let mut sxi = 1; - let idx = (b - b'A') as usize; - let mut last_code = US_ENGLISH_MAPPING[idx]; + let first_code = match classify_char(first_char) { + Some(code) => code, + None => return s.to_string(), + }; - for i in bytes.iter().skip(1) { - let mut b = *i; + let mut soundex_code = [first_code, b'0', b'0', b'0']; + let mut result_index = 1; + let mut last_code = first_code; - if b.is_ascii_lowercase() { - b -= 32; - } else if !b.is_ascii_uppercase() { - last_code = b'0'; - continue; - } + for c in chars { + let current_code = match classify_char(c) { + Some(code) => code, + None => { + last_code = b'0'; + continue; + } + }; - let idx = (b - b'A') as usize; - let code = US_ENGLISH_MAPPING[idx]; - - if code == b'7' { - continue; - } else { - if code != b'0' && code != last_code { - soundex_code[sxi] = code; - sxi += 1; - if sxi > 3 { - break; - } + if current_code != b'0' && current_code != last_code { + soundex_code[result_index] = current_code; + result_index += 1; + if result_index >= soundex_code.len() { + break; } - last_code = code; } + + last_code = current_code; } String::from_utf8_lossy(&soundex_code).to_string() } + +fn classify_char(c: char) -> Option { + match c.to_ascii_uppercase() { + 'A' | 'E' | 'I' | 'O' | 'U' | 'H' | 'W' | 'Y' => Some(b'0'), + 'B' | 'F' | 'P' | 'V' => Some(b'1'), + 'C' | 'G' | 'J' | 'K' | 'Q' | 'S' | 'X' | 'Z' => Some(b'2'), + 'D' | 'T' => Some(b'3'), + 'L' => Some(b'4'), + 'M' | 'N' => Some(b'5'), + 'R' => Some(b'6'), + _ => None, + } +} From 1af1d307cb7b9ae264678fc0ce6205394abfb108 Mon Sep 17 00:00:00 2001 From: Kazantsev Maksim Date: Sun, 15 Mar 2026 18:15:08 +0400 Subject: [PATCH 17/19] fix --- .../spark/src/function/string/soundex.rs | 77 ++++++++++--------- 1 file changed, 39 insertions(+), 38 deletions(-) diff --git a/datafusion/spark/src/function/string/soundex.rs b/datafusion/spark/src/function/string/soundex.rs index 7b2a9daa0d2a1..b46462648bf29 100644 --- a/datafusion/spark/src/function/string/soundex.rs +++ b/datafusion/spark/src/function/string/soundex.rs @@ -91,55 +91,56 @@ fn soundex(array: &ArrayRef) -> Result { Ok(Arc::new(result)) } +const US_ENGLISH_MAPPING: [u8; 26] = [ + b'0', b'1', b'2', b'3', b'0', b'1', b'2', b'7', b'0', b'2', b'2', b'4', b'5', b'5', + b'0', b'1', b'2', b'6', b'2', b'3', b'0', b'1', b'7', b'2', b'0', b'2', +]; + fn compute_soundex(s: &str) -> String { - if s.is_empty() { + let bytes = s.as_bytes(); + if bytes.is_empty() { return String::new(); } - let mut chars = s.chars(); - let first_char = chars.next().unwrap(); + let mut b = bytes[0]; - let first_code = match classify_char(first_char) { - Some(code) => code, - None => return s.to_string(), - }; + if b.is_ascii_lowercase() { + b -= 32; + } else if !b.is_ascii_uppercase() { + return s.to_string(); + } - let mut soundex_code = [first_code, b'0', b'0', b'0']; - let mut result_index = 1; - let mut last_code = first_code; + let mut soundex_code = [b, b'0', b'0', b'0']; + let mut sxi = 1; + let idx = (b - b'A') as usize; + let mut last_code = US_ENGLISH_MAPPING[idx]; - for c in chars { - let current_code = match classify_char(c) { - Some(code) => code, - None => { - last_code = b'0'; - continue; - } - }; + for i in bytes.iter().skip(1) { + let mut b = *i; - if current_code != b'0' && current_code != last_code { - soundex_code[result_index] = current_code; - result_index += 1; - if result_index >= soundex_code.len() { - break; - } + if b.is_ascii_lowercase() { + b -= 32; + } else if !b.is_ascii_uppercase() { + last_code = b'0'; + continue; } - last_code = current_code; + let idx = (b - b'A') as usize; + let code = US_ENGLISH_MAPPING[idx]; + + if code == b'7' { + continue; + } else { + if code != b'0' && code != last_code { + soundex_code[sxi] = code; + sxi += 1; + if sxi > 3 { + break; + } + } + last_code = code; + } } String::from_utf8_lossy(&soundex_code).to_string() } - -fn classify_char(c: char) -> Option { - match c.to_ascii_uppercase() { - 'A' | 'E' | 'I' | 'O' | 'U' | 'H' | 'W' | 'Y' => Some(b'0'), - 'B' | 'F' | 'P' | 'V' => Some(b'1'), - 'C' | 'G' | 'J' | 'K' | 'Q' | 'S' | 'X' | 'Z' => Some(b'2'), - 'D' | 'T' => Some(b'3'), - 'L' => Some(b'4'), - 'M' | 'N' => Some(b'5'), - 'R' => Some(b'6'), - _ => None, - } -} From a63709a585c962265ccfbc2a6a85f415837d01e3 Mon Sep 17 00:00:00 2001 From: Kazantsev Maksim Date: Sun, 15 Mar 2026 18:50:38 +0400 Subject: [PATCH 18/19] fix --- .../spark/src/function/string/soundex.rs | 82 +++++++++---------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/datafusion/spark/src/function/string/soundex.rs b/datafusion/spark/src/function/string/soundex.rs index b46462648bf29..4af23546d83d1 100644 --- a/datafusion/spark/src/function/string/soundex.rs +++ b/datafusion/spark/src/function/string/soundex.rs @@ -91,56 +91,56 @@ fn soundex(array: &ArrayRef) -> Result { Ok(Arc::new(result)) } -const US_ENGLISH_MAPPING: [u8; 26] = [ - b'0', b'1', b'2', b'3', b'0', b'1', b'2', b'7', b'0', b'2', b'2', b'4', b'5', b'5', - b'0', b'1', b'2', b'6', b'2', b'3', b'0', b'1', b'7', b'2', b'0', b'2', -]; - -fn compute_soundex(s: &str) -> String { - let bytes = s.as_bytes(); - if bytes.is_empty() { +pub fn compute_soundex(s: &str) -> String { + if s.is_empty() { return String::new(); } - let mut b = bytes[0]; + let mut chars = s.chars(); + let first_char = chars.next().unwrap(); - if b.is_ascii_lowercase() { - b -= 32; - } else if !b.is_ascii_uppercase() { - return s.to_string(); - } - - let mut soundex_code = [b, b'0', b'0', b'0']; - let mut sxi = 1; - let idx = (b - b'A') as usize; - let mut last_code = US_ENGLISH_MAPPING[idx]; + let first_code = match classify_char(first_char) { + Some(code) => code, + None => return s.to_string(), + }; - for i in bytes.iter().skip(1) { - let mut b = *i; + let mut result = [first_code, b'0', b'0', b'0']; + let mut result_index = 1; + let mut last_code = first_code; - if b.is_ascii_lowercase() { - b -= 32; - } else if !b.is_ascii_uppercase() { - last_code = b'0'; - continue; - } + for c in chars { + let current_code = match classify_char(c) { + Some(code) => code, + None => { + last_code = b'0'; + continue; + } + }; - let idx = (b - b'A') as usize; - let code = US_ENGLISH_MAPPING[idx]; - - if code == b'7' { - continue; - } else { - if code != b'0' && code != last_code { - soundex_code[sxi] = code; - sxi += 1; - if sxi > 3 { - break; - } + if current_code != b'0' && current_code != last_code { + result[result_index] = current_code; + result_index += 1; + if result_index >= result.len() { + break; } - last_code = code; } + + last_code = current_code; } - String::from_utf8_lossy(&soundex_code).to_string() + String::from_utf8_lossy(&result).to_string() +} + +fn classify_char(c: char) -> Option { + match c.to_ascii_uppercase() { + 'A' | 'E' | 'I' | 'O' | 'U' | 'Y' => Some(0), + 'B' | 'F' | 'P' | 'V' => Some(1), + 'C' | 'G' | 'J' | 'K' | 'Q' | 'S' | 'X' | 'Z' => Some(2), + 'D' | 'T' => Some(3), + 'L' => Some(4), + 'M' | 'N' => Some(5), + 'R' => Some(6), + 'H' | 'W' => Some(7), + _ => None, + } } From bb5f6f01c323fc4d04825aafa532e861f8a0e8cc Mon Sep 17 00:00:00 2001 From: Kazantsev Maksim Date: Sun, 15 Mar 2026 21:48:22 +0400 Subject: [PATCH 19/19] fix --- .../spark/src/function/string/soundex.rs | 82 +++++++++---------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/datafusion/spark/src/function/string/soundex.rs b/datafusion/spark/src/function/string/soundex.rs index 4af23546d83d1..d6e9dcc099161 100644 --- a/datafusion/spark/src/function/string/soundex.rs +++ b/datafusion/spark/src/function/string/soundex.rs @@ -91,56 +91,56 @@ fn soundex(array: &ArrayRef) -> Result { Ok(Arc::new(result)) } -pub fn compute_soundex(s: &str) -> String { - if s.is_empty() { +const US_ENGLISH_MAPPING: [u8; 26] = [ + b'0', b'1', b'2', b'3', b'0', b'1', b'2', b'7', b'0', b'2', b'2', b'4', b'5', b'5', + b'0', b'1', b'2', b'6', b'2', b'3', b'0', b'1', b'7', b'2', b'0', b'2', +]; + +fn compute_soundex(s: &str) -> String { + let bytes = s.as_bytes(); + if bytes.is_empty() { return String::new(); } - let mut chars = s.chars(); - let first_char = chars.next().unwrap(); + let mut first_ch = bytes[0]; - let first_code = match classify_char(first_char) { - Some(code) => code, - None => return s.to_string(), - }; + if first_ch.is_ascii_lowercase() { + first_ch -= 32; + } else if !first_ch.is_ascii_uppercase() { + return s.to_string(); + } - let mut result = [first_code, b'0', b'0', b'0']; - let mut result_index = 1; - let mut last_code = first_code; + let mut soundex_code = [first_ch, b'0', b'0', b'0']; + let mut sxi = 1; + let idx = (first_ch - b'A') as usize; + let mut last_code = US_ENGLISH_MAPPING[idx]; - for c in chars { - let current_code = match classify_char(c) { - Some(code) => code, - None => { - last_code = b'0'; - continue; - } - }; + for i in bytes.iter().skip(1) { + let mut b = *i; - if current_code != b'0' && current_code != last_code { - result[result_index] = current_code; - result_index += 1; - if result_index >= result.len() { - break; - } + if b.is_ascii_lowercase() { + b -= 32; + } else if !b.is_ascii_uppercase() { + last_code = b'0'; + continue; } - last_code = current_code; + let idx = (b - b'A') as usize; + let code = US_ENGLISH_MAPPING[idx]; + + if code == b'7' { + continue; + } else { + if code != b'0' && code != last_code { + soundex_code[sxi] = code; + sxi += 1; + if sxi > 3 { + break; + } + } + last_code = code; + } } - String::from_utf8_lossy(&result).to_string() -} - -fn classify_char(c: char) -> Option { - match c.to_ascii_uppercase() { - 'A' | 'E' | 'I' | 'O' | 'U' | 'Y' => Some(0), - 'B' | 'F' | 'P' | 'V' => Some(1), - 'C' | 'G' | 'J' | 'K' | 'Q' | 'S' | 'X' | 'Z' => Some(2), - 'D' | 'T' => Some(3), - 'L' => Some(4), - 'M' | 'N' => Some(5), - 'R' => Some(6), - 'H' | 'W' => Some(7), - _ => None, - } + String::from_utf8_lossy(&soundex_code).to_string() }