diff --git a/datafusion/spark/src/function/string/mod.rs b/datafusion/spark/src/function/string/mod.rs index 8859beca77996..7bcdac5d85474 100644 --- a/datafusion/spark/src/function/string/mod.rs +++ b/datafusion/spark/src/function/string/mod.rs @@ -25,6 +25,7 @@ pub mod ilike; pub mod length; pub mod like; pub mod luhn_check; +pub mod soundex; pub mod space; pub mod substring; @@ -45,6 +46,7 @@ make_udf_function!(format_string::FormatStringFunc, format_string); make_udf_function!(space::SparkSpace, space); make_udf_function!(substring::SparkSubstring, substring); make_udf_function!(base64::SparkUnBase64, unbase64); +make_udf_function!(soundex::SparkSoundex, soundex); pub mod expr_fn { use datafusion_functions::export_functions; @@ -110,6 +112,7 @@ pub mod expr_fn { "Decodes the input string `str` from a base64 string into binary data.", str )); + export_functions!((soundex, "Returns Soundex code of the string.", str)); } pub fn functions() -> Vec> { @@ -127,5 +130,6 @@ pub fn functions() -> Vec> { space(), substring(), unbase64(), + soundex(), ] } diff --git a/datafusion/spark/src/function/string/soundex.rs b/datafusion/spark/src/function/string/soundex.rs new file mode 100644 index 0000000000000..d6e9dcc099161 --- /dev/null +++ b/datafusion/spark/src/function/string/soundex.rs @@ -0,0 +1,146 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::array::{ArrayRef, OffsetSizeTrait, StringArray}; +use arrow::datatypes::DataType; +use datafusion::logical_expr::{ColumnarValue, Signature, Volatility}; +use datafusion_common::cast::as_generic_string_array; +use datafusion_common::utils::take_function_args; +use datafusion_common::{Result, exec_err}; +use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl}; +use datafusion_functions::utils::make_scalar_function; +use std::any::Any; +use std::sync::Arc; + +/// Spark-compatible `soundex` expression +/// +#[derive(Debug, PartialEq, Eq, Hash)] +pub struct SparkSoundex { + signature: Signature, +} + +impl Default for SparkSoundex { + fn default() -> Self { + Self::new() + } +} + +impl SparkSoundex { + pub fn new() -> Self { + Self { + signature: Signature::string(1, Volatility::Immutable), + } + } +} + +impl ScalarUDFImpl for SparkSoundex { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "soundex" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Utf8) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + make_scalar_function(spark_soundex_inner, vec![])(&args.args) + } +} + +fn spark_soundex_inner(arg: &[ArrayRef]) -> Result { + let [array] = take_function_args("soundex", arg)?; + match &array.data_type() { + DataType::Utf8 => soundex::(array), + DataType::LargeUtf8 => soundex::(array), + other => { + exec_err!("unsupported data type {other:?} for function `soundex`") + } + } +} + +fn soundex(array: &ArrayRef) -> Result { + let str_array = as_generic_string_array::(array)?; + + let result = str_array + .iter() + .map(|s| s.map(compute_soundex)) + .collect::(); + + Ok(Arc::new(result)) +} + +const US_ENGLISH_MAPPING: [u8; 26] = [ + b'0', b'1', b'2', b'3', b'0', b'1', b'2', b'7', b'0', b'2', b'2', b'4', b'5', b'5', + b'0', b'1', b'2', b'6', b'2', b'3', b'0', b'1', b'7', b'2', b'0', b'2', +]; + +fn compute_soundex(s: &str) -> String { + let bytes = s.as_bytes(); + if bytes.is_empty() { + return String::new(); + } + + let mut first_ch = bytes[0]; + + if first_ch.is_ascii_lowercase() { + first_ch -= 32; + } else if !first_ch.is_ascii_uppercase() { + return s.to_string(); + } + + let mut soundex_code = [first_ch, b'0', b'0', b'0']; + let mut sxi = 1; + let idx = (first_ch - b'A') as usize; + let mut last_code = US_ENGLISH_MAPPING[idx]; + + for i in bytes.iter().skip(1) { + let mut b = *i; + + if b.is_ascii_lowercase() { + b -= 32; + } else if !b.is_ascii_uppercase() { + last_code = b'0'; + continue; + } + + let idx = (b - b'A') as usize; + let code = US_ENGLISH_MAPPING[idx]; + + if code == b'7' { + continue; + } else { + if code != b'0' && code != last_code { + soundex_code[sxi] = code; + sxi += 1; + if sxi > 3 { + break; + } + } + last_code = code; + } + } + + String::from_utf8_lossy(&soundex_code).to_string() +} diff --git a/datafusion/sqllogictest/test_files/spark/string/soundex.slt b/datafusion/sqllogictest/test_files/spark/string/soundex.slt index f0c46e10fd1de..ec85c4bd40b24 100644 --- a/datafusion/sqllogictest/test_files/spark/string/soundex.slt +++ b/datafusion/sqllogictest/test_files/spark/string/soundex.slt @@ -15,13 +15,187 @@ # specific language governing permissions and limitations # under the License. -# This file was originally created by a porting script from: -# https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function -# This file is part of the implementation of the datafusion-spark function library. -# For more information, please see: -# https://github.com/apache/datafusion/issues/15914 - -## Original Query: SELECT soundex('Miller'); -## PySpark 3.5.5 Result: {'soundex(Miller)': 'M460', 'typeof(soundex(Miller))': 'string', 'typeof(Miller)': 'string'} -#query -#SELECT soundex('Miller'::string); +query T +SELECT soundex('Miller'); +---- +M460 + +query T +SELECT soundex(NULL); +---- +NULL + +query T +SELECT soundex(''); +---- +(empty) + +query T +SELECT soundex('Apache Spark'); +---- +A122 + +query T +SELECT soundex('123'); +---- +123 + +query T +SELECT soundex('a123'); +---- +A000 + +query T +SELECT soundex('Datafusion'); +---- +D312 + +query T +SELECT soundex('Ashcroft'); +---- +A261 + +query T +SELECT soundex('B1B'); +---- +B100 + +query T +SELECT soundex('B B'); +---- +B100 + +query T +SELECT soundex('BAB'); +---- +B100 + +query T +SELECT soundex('#hello'); +---- +#hello + +query T +SELECT soundex(' hello'); +---- + hello + +query T +SELECT soundex('\thello'); +---- +\thello + +query T +SELECT soundex('😀hello'); +---- +😀hello + +query T +SELECT soundex('123'); +---- +123 + +query T +SELECT soundex('1abc'); +---- +1abc + +query T +SELECT soundex('A'); +---- +A000 + +query T +SELECT soundex('BFPV'); +---- +B000 + +query T +SELECT soundex('Robert'); +---- +R163 + +query T +SELECT soundex('Rupert'); +---- +R163 + +query T +SELECT soundex(NULL); +---- +NULL + +query T +SELECT soundex(''); +---- +(empty) + +query T +SELECT soundex('robert'); +---- +R163 + +query T +SELECT soundex('rObErT'); +---- +R163 + +query T +SELECT soundex('Müller'); +---- +M460 + +query T +SELECT soundex('Abcdefghijklmnop'); +---- +A123 + +query T +SELECT soundex('Lloyd'); +---- +L300 + +query T +SELECT soundex('BWB'); +---- +B000 + +query T +SELECT soundex('BHB'); +---- +B000 + +query T +SELECT soundex('Tymczak'); +---- +T522 + +query T +SELECT soundex('Aeiou'); +---- +A000 + +query T +SELECT soundex('1Robert'); +---- +1Robert + +query T +SELECT soundex('Smith-Jones'); +---- +S532 + +query T +SELECT soundex('#'); +---- +# + +query T +SELECT soundex('\nhello'); +---- +\nhello + +query T +SELECT concat(soundex(' '), 'Spark') +---- + Spark