Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions datafusion/spark/src/function/string/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ pub mod ilike;
pub mod length;
pub mod like;
pub mod luhn_check;
pub mod soundex;
pub mod space;
pub mod substring;

Expand All @@ -45,6 +46,7 @@ make_udf_function!(format_string::FormatStringFunc, format_string);
make_udf_function!(space::SparkSpace, space);
make_udf_function!(substring::SparkSubstring, substring);
make_udf_function!(base64::SparkUnBase64, unbase64);
make_udf_function!(soundex::SparkSoundex, soundex);

pub mod expr_fn {
use datafusion_functions::export_functions;
Expand Down Expand Up @@ -110,6 +112,7 @@ pub mod expr_fn {
"Decodes the input string `str` from a base64 string into binary data.",
str
));
export_functions!((soundex, "Returns Soundex code of the string.", str));
}

pub fn functions() -> Vec<Arc<ScalarUDF>> {
Expand All @@ -127,5 +130,6 @@ pub fn functions() -> Vec<Arc<ScalarUDF>> {
space(),
substring(),
unbase64(),
soundex(),
]
}
146 changes: 146 additions & 0 deletions datafusion/spark/src/function/string/soundex.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use arrow::array::{ArrayRef, OffsetSizeTrait, StringArray};
use arrow::datatypes::DataType;
use datafusion::logical_expr::{ColumnarValue, Signature, Volatility};
use datafusion_common::cast::as_generic_string_array;
use datafusion_common::utils::take_function_args;
use datafusion_common::{Result, exec_err};
use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl};
use datafusion_functions::utils::make_scalar_function;
use std::any::Any;
use std::sync::Arc;

/// Scalar UDF providing Spark's `soundex` string function.
///
/// Reference: <https://spark.apache.org/docs/latest/api/sql/index.html#soundex>
#[derive(Debug, PartialEq, Eq, Hash)]
pub struct SparkSoundex {
    /// Argument signature; populated by [`SparkSoundex::new`].
    signature: Signature,
}

impl Default for SparkSoundex {
fn default() -> Self {
Self::new()
}
}

impl SparkSoundex {
    /// Builds the UDF with an immutable signature that takes exactly one
    /// string argument.
    pub fn new() -> Self {
        let signature = Signature::string(1, Volatility::Immutable);
        Self { signature }
    }
}

impl ScalarUDFImpl for SparkSoundex {
    /// Exposes the concrete type for downcasting.
    fn as_any(&self) -> &dyn Any {
        self
    }

    /// SQL-visible function name.
    fn name(&self) -> &str {
        "soundex"
    }

    /// Signature created in `SparkSoundex::new`.
    fn signature(&self) -> &Signature {
        &self.signature
    }

    /// The result is always `Utf8`, whatever string type was passed in.
    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
        Ok(DataType::Utf8)
    }

    /// Evaluates the function by handing the argument values to
    /// `spark_soundex_inner` through `make_scalar_function` (no hints).
    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
        let evaluate = make_scalar_function(spark_soundex_inner, vec![]);
        evaluate(&args.args)
    }
}

fn spark_soundex_inner(arg: &[ArrayRef]) -> Result<ArrayRef> {
let [array] = take_function_args("soundex", arg)?;
match &array.data_type() {
DataType::Utf8 => soundex::<i32>(array),
DataType::LargeUtf8 => soundex::<i64>(array),
other => {
exec_err!("unsupported data type {other:?} for function `soundex`")
}
}
}

/// Applies `compute_soundex` to every non-null value of a generic string
/// array (offset type `T`) and collects the codes into a `StringArray`.
///
/// Null entries are carried over as nulls. Fails if `array` cannot be viewed
/// as a `GenericStringArray<T>`.
fn soundex<T: OffsetSizeTrait>(array: &ArrayRef) -> Result<ArrayRef> {
    let strings = as_generic_string_array::<T>(array)?;

    let mut codes: Vec<Option<String>> = Vec::new();
    for value in strings.iter() {
        codes.push(value.map(compute_soundex));
    }

    let encoded: StringArray = codes.into_iter().collect();
    Ok(Arc::new(encoded))
}

/// Soundex digit for each uppercase ASCII letter, indexed by `letter - b'A'`.
///
/// `b'0'` is used for letters that emit no digit (A, E, I, O, U, Y) and
/// `b'7'` for H and W, which `compute_soundex` skips without updating the
/// previously seen code.
const US_ENGLISH_MAPPING: [u8; 26] = *b"01230127022455012623017202";

/// Computes the four-character Soundex code of `s`, working byte by byte.
///
/// * An empty input yields an empty string.
/// * If the first byte is not an ASCII letter, the input is returned as is.
/// * Otherwise the result is the upper-cased first letter followed by up to
///   three digits, padded with `'0'`.
///
/// Any byte that is not an ASCII letter (digits, spaces, the bytes of a
/// multi-byte UTF-8 character, ...) emits nothing but resets the previous
/// code, so the same digit may be emitted again on the other side of it.
fn compute_soundex(s: &str) -> String {
    let Some((&head, tail)) = s.as_bytes().split_first() else {
        return String::new();
    };

    if !head.is_ascii_alphabetic() {
        return s.to_string();
    }
    let initial = head.to_ascii_uppercase();

    let mut code = [initial, b'0', b'0', b'0'];
    let mut filled = 1;
    let mut previous = US_ENGLISH_MAPPING[usize::from(initial - b'A')];

    for &byte in tail {
        if !byte.is_ascii_alphabetic() {
            // Non-letters act as separators between equal codes.
            previous = b'0';
            continue;
        }

        let digit = US_ENGLISH_MAPPING[usize::from(byte.to_ascii_uppercase() - b'A')];
        match digit {
            // H and W are transparent: `previous` is left untouched.
            b'7' => {}
            // Vowel-like letters emit nothing but separate equal codes.
            b'0' => previous = b'0',
            // A repeat of the previous code collapses into it.
            _ if digit == previous => {}
            _ => {
                code[filled] = digit;
                filled += 1;
                if filled > 3 {
                    break;
                }
                previous = digit;
            }
        }
    }

    // Every byte in `code` is an ASCII letter or digit.
    code.iter().map(|&b| char::from(b)).collect()
}
194 changes: 184 additions & 10 deletions datafusion/sqllogictest/test_files/spark/string/soundex.slt
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,187 @@
# specific language governing permissions and limitations
# under the License.

# This file was originally created by a porting script from:
# https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
# This file is part of the implementation of the datafusion-spark function library.
# For more information, please see:
# https://github.com/apache/datafusion/issues/15914

## Original Query: SELECT soundex('Miller');
## PySpark 3.5.5 Result: {'soundex(Miller)': 'M460', 'typeof(soundex(Miller))': 'string', 'typeof(Miller)': 'string'}
#query
#SELECT soundex('Miller'::string);
query T
SELECT soundex('Miller');
----
M460

query T
SELECT soundex(NULL);
----
NULL

query T
SELECT soundex('');
----
(empty)

query T
SELECT soundex('Apache Spark');
----
A122

query T
SELECT soundex('123');
----
123

query T
SELECT soundex('a123');
----
A000

query T
SELECT soundex('Datafusion');
----
D312
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hey! I had actually started working on a Spark soundex implementation too and didn't realize there was already a PR for it. Happy to see this moving forward!

I had put together a battery of edge-case tests validated against Spark JVM that might be useful. The current SLT coverage is a bit thin — there are some tricky Soundex behaviors that are easy to get wrong:

tests = [
    # H/W transparency (must NOT separate same codes)
    ("H/W transparency", "SELECT soundex('Ashcroft') AS result"),
    # Separators (digit, space, vowel MUST separate same codes)
    ("Digit separates same-code", "SELECT soundex('B1B') AS result"),
    ("Space separates same-code", "SELECT soundex('B B') AS result"),
    ("Vowel separates same-code", "SELECT soundex('BAB') AS result"),
    # Non-alpha first character (returns input unchanged)
    ("Non-alpha first char", "SELECT soundex('#hello') AS result"),
    ("Space first char", "SELECT soundex(' hello') AS result"),
    ("Only spaces", "SELECT soundex('   ') AS result"),
    ("Tab prefix", "SELECT soundex('\thello') AS result"),
    ("Emoji prefix", "SELECT soundex('😀hello') AS result"),
    ("Only digits", "SELECT soundex('123') AS result"),
    ("Starts with digit", "SELECT soundex('1abc') AS result"),
    # Basic behavior
    ("Single character", "SELECT soundex('A') AS result"),
    ("All same-code letters", "SELECT soundex('BFPV') AS result"),
    ("Similar names Robert", "SELECT soundex('Robert') AS result"),
    ("Similar names Rupert", "SELECT soundex('Rupert') AS result"),
    ("NULL", "SELECT soundex(NULL) AS result"),
    ("Empty string", "SELECT soundex('') AS result"),
    # Case insensitivity
    ("Lowercase", "SELECT soundex('robert') AS result"),
    ("Mixed case same", "SELECT soundex('rObErT') AS result"),
    # Unicode
    ("Unicode umlaut", "SELECT soundex('Müller') AS result"),
    # Truncation (only first 3 codes after initial)
    ("Long string", "SELECT soundex('Abcdefghijklmnop') AS result"),
    # Extra edge cases
    ("Adjacent same codes collapse", "SELECT soundex('Lloyd') AS result"),
    ("W between same codes", "SELECT soundex('BWB') AS result"),
    ("H between same codes", "SELECT soundex('BHB') AS result"),
    ("Double letters", "SELECT soundex('Tymczak') AS result"),
    ("All vowels after first", "SELECT soundex('Aeiou') AS result"),
    ("First char digit rest alpha", "SELECT soundex('1Robert') AS result"),
    ("Hyphen in name", "SELECT soundex('Smith-Jones') AS result"),
    ("Single non-alpha", "SELECT soundex('#') AS result"),
    ("Newline prefix", "SELECT soundex('\nhello') AS result"),
]

for label, sql in tests:
    r = spark.sql(sql).collect()
    print(f"{label}: {repr(r[0].result)}")

# Multi-row column test
print("\nColumn test:")
spark.sql("""
    SELECT soundex(name) AS result 
    FROM VALUES ('Robert'), ('Rupert'), (NULL), (''), ('123') AS t(name)
""").show()

Spark-3.5

Image

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Big thanks to @davidlghellin for the test cases.


query T
SELECT soundex('Ashcroft');
----
A261

query T
SELECT soundex('B1B');
----
B100

query T
SELECT soundex('B B');
----
B100

query T
SELECT soundex('BAB');
----
B100

query T
SELECT soundex('#hello');
----
#hello

query T
SELECT soundex(' hello');
----
hello

query T
SELECT soundex('\thello');
----
\thello

query T
SELECT soundex('😀hello');
----
😀hello

query T
SELECT soundex('123');
----
123

query T
SELECT soundex('1abc');
----
1abc

query T
SELECT soundex('A');
----
A000

query T
SELECT soundex('BFPV');
----
B000

query T
SELECT soundex('Robert');
----
R163

query T
SELECT soundex('Rupert');
----
R163

query T
SELECT soundex(NULL);
----
NULL

query T
SELECT soundex('');
----
(empty)

query T
SELECT soundex('robert');
----
R163

query T
SELECT soundex('rObErT');
----
R163

query T
SELECT soundex('Müller');
----
M460

query T
SELECT soundex('Abcdefghijklmnop');
----
A123

query T
SELECT soundex('Lloyd');
----
L300

query T
SELECT soundex('BWB');
----
B000

query T
SELECT soundex('BHB');
----
B000

query T
SELECT soundex('Tymczak');
----
T522

query T
SELECT soundex('Aeiou');
----
A000

query T
SELECT soundex('1Robert');
----
1Robert

query T
SELECT soundex('Smith-Jones');
----
S532

query T
SELECT soundex('#');
----
#

query T
SELECT soundex('\nhello');
----
\nhello

query T
SELECT concat(soundex(' '), 'Spark')
----
Spark
Loading