From 7396c4c241d49e2d5473daedfc8dbc1f375dbee8 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Fri, 20 Mar 2026 13:59:37 -0400 Subject: [PATCH 1/4] Add benchmark for `array_sort()` --- datafusion/functions-nested/Cargo.toml | 4 + .../functions-nested/benches/array_sort.rs | 181 ++++++++++++++++++ 2 files changed, 185 insertions(+) create mode 100644 datafusion/functions-nested/benches/array_sort.rs diff --git a/datafusion/functions-nested/Cargo.toml b/datafusion/functions-nested/Cargo.toml index 5fce3e854eb3..2ce9532a22ee 100644 --- a/datafusion/functions-nested/Cargo.toml +++ b/datafusion/functions-nested/Cargo.toml @@ -109,3 +109,7 @@ name = "array_to_string" [[bench]] harness = false name = "array_position" + +[[bench]] +harness = false +name = "array_sort" diff --git a/datafusion/functions-nested/benches/array_sort.rs b/datafusion/functions-nested/benches/array_sort.rs new file mode 100644 index 000000000000..99772febc4d6 --- /dev/null +++ b/datafusion/functions-nested/benches/array_sort.rs @@ -0,0 +1,181 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::hint::black_box; +use std::sync::Arc; + +use arrow::array::{ArrayRef, BooleanBufferBuilder, Int32Array, ListArray, StringArray}; +use arrow::buffer::{NullBuffer, OffsetBuffer}; +use arrow::datatypes::{DataType, Field}; +use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main}; +use datafusion_functions_nested::sort::array_sort_inner; +use rand::SeedableRng; +use rand::rngs::StdRng; +use rand::seq::SliceRandom; + +const SEED: u64 = 42; +const NUM_ROWS: usize = 8192; + +fn create_int32_list_array( + num_rows: usize, + elements_per_row: usize, + with_nulls: bool, +) -> ArrayRef { + let mut rng = StdRng::seed_from_u64(SEED); + let total_values = num_rows * elements_per_row; + + let mut values: Vec = (0..total_values as i32).collect(); + values.shuffle(&mut rng); + + let values = Arc::new(Int32Array::from(values)); + let offsets: Vec = (0..=num_rows) + .map(|i| (i * elements_per_row) as i32) + .collect(); + + let nulls = if with_nulls { + // Every 10th row is null + Some(NullBuffer::from( + (0..num_rows).map(|i| i % 10 != 0).collect::>(), + )) + } else { + None + }; + + Arc::new(ListArray::new( + Arc::new(Field::new("item", DataType::Int32, true)), + OffsetBuffer::new(offsets.into()), + values, + nulls, + )) +} + +/// Creates a ListArray where ~10% of elements within each row are null. 
+fn create_int32_list_array_with_null_elements( + num_rows: usize, + elements_per_row: usize, +) -> ArrayRef { + let mut rng = StdRng::seed_from_u64(SEED); + let total_values = num_rows * elements_per_row; + + let mut values: Vec = (0..total_values as i32).collect(); + values.shuffle(&mut rng); + + // ~10% of elements are null + let mut validity = BooleanBufferBuilder::new(total_values); + for i in 0..total_values { + validity.append(i % 10 != 0); + } + let null_buffer = NullBuffer::from(validity.finish()); + + let values = Arc::new(Int32Array::new(values.into(), Some(null_buffer))); + let offsets: Vec = (0..=num_rows) + .map(|i| (i * elements_per_row) as i32) + .collect(); + + Arc::new(ListArray::new( + Arc::new(Field::new("item", DataType::Int32, true)), + OffsetBuffer::new(offsets.into()), + values, + None, + )) +} + +fn create_string_list_array(num_rows: usize, elements_per_row: usize) -> ArrayRef { + let mut rng = StdRng::seed_from_u64(SEED); + let total_values = num_rows * elements_per_row; + + let mut indices: Vec = (0..total_values).collect(); + indices.shuffle(&mut rng); + let string_values: Vec = + indices.iter().map(|i| format!("value_{i:06}")).collect(); + let values = Arc::new(StringArray::from(string_values)); + + let offsets: Vec = (0..=num_rows) + .map(|i| (i * elements_per_row) as i32) + .collect(); + + Arc::new(ListArray::new( + Arc::new(Field::new("item", DataType::Utf8, true)), + OffsetBuffer::new(offsets.into()), + values, + None, + )) +} + +/// Vary elements_per_row over [5, 20, 100, 1000]: for small arrays, per-row +/// overhead dominates, whereas for larger arrays the sort kernel dominates. 
+fn bench_array_sort(c: &mut Criterion) { + let mut group = c.benchmark_group("array_sort"); + + // Int32 arrays + for &elements_per_row in &[5, 20, 100, 1000] { + let array = create_int32_list_array(NUM_ROWS, elements_per_row, false); + group.bench_with_input( + BenchmarkId::new("int32", elements_per_row), + &elements_per_row, + |b, _| { + b.iter(|| { + black_box(array_sort_inner(std::slice::from_ref(&array)).unwrap()); + }); + }, + ); + } + + // Int32 with nulls in the outer list (10% null rows), single size + { + let array = create_int32_list_array(NUM_ROWS, 50, true); + group.bench_function("int32_with_nulls", |b| { + b.iter(|| { + black_box(array_sort_inner(std::slice::from_ref(&array)).unwrap()); + }); + }); + } + + // Int32 with null elements (~10% of elements within rows are null) + for &elements_per_row in &[5, 20, 100, 1000] { + let array = + create_int32_list_array_with_null_elements(NUM_ROWS, elements_per_row); + group.bench_with_input( + BenchmarkId::new("int32_null_elements", elements_per_row), + &elements_per_row, + |b, _| { + b.iter(|| { + black_box(array_sort_inner(std::slice::from_ref(&array)).unwrap()); + }); + }, + ); + } + + // String arrays + for &elements_per_row in &[5, 20, 100, 1000] { + let array = create_string_list_array(NUM_ROWS, elements_per_row); + group.bench_with_input( + BenchmarkId::new("string", elements_per_row), + &elements_per_row, + |b, _| { + b.iter(|| { + black_box(array_sort_inner(std::slice::from_ref(&array)).unwrap()); + }); + }, + ); + } + + group.finish(); +} + +criterion_group!(benches, bench_array_sort); +criterion_main!(benches); From 294d76921fc83c6bd1434d497a22c78940f31821 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Fri, 20 Mar 2026 13:59:59 -0400 Subject: [PATCH 2/4] Optimize array_sort --- datafusion/functions-nested/src/sort.rs | 362 ++++++++++++++---- datafusion/sqllogictest/test_files/array.slt | 86 +++++ .../source/user-guide/sql/scalar_functions.md | 4 +- 3 files changed, 383 insertions(+), 69 
deletions(-) diff --git a/datafusion/functions-nested/src/sort.rs b/datafusion/functions-nested/src/sort.rs index 256293169123..138eac0abbd0 100644 --- a/datafusion/functions-nested/src/sort.rs +++ b/datafusion/functions-nested/src/sort.rs @@ -18,14 +18,18 @@ //! [`ScalarUDFImpl`] definitions for array_sort function. use crate::utils::make_scalar_function; -use arrow::array::{Array, ArrayRef, GenericListArray, OffsetSizeTrait, new_null_array}; -use arrow::buffer::OffsetBuffer; -use arrow::compute::SortColumn; -use arrow::datatypes::{DataType, FieldRef}; -use arrow::{compute, compute::SortOptions}; +use arrow::array::BooleanBufferBuilder; +use arrow::array::{ + Array, ArrayRef, ArrowPrimitiveType, GenericListArray, OffsetSizeTrait, + PrimitiveArray, UInt32Array, UInt64Array, new_empty_array, new_null_array, +}; +use arrow::buffer::{NullBuffer, OffsetBuffer}; +use arrow::datatypes::{ArrowNativeTypeOp, DataType, FieldRef}; +use arrow::row::{RowConverter, SortField}; +use arrow::{compute, compute::SortOptions, downcast_primitive_array}; use datafusion_common::cast::{as_large_list_array, as_list_array, as_string_array}; use datafusion_common::utils::ListCoercion; -use datafusion_common::{Result, exec_err}; +use datafusion_common::{Result, exec_err, internal_datafusion_err}; use datafusion_expr::{ ArrayFunctionArgument, ArrayFunctionSignature, ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature, TypeSignature, Volatility, @@ -67,11 +71,11 @@ make_udf_expr_and_func!( ), argument( name = "desc", - description = "Whether to sort in descending order(`ASC` or `DESC`)." + description = "Whether to sort in ascending (`ASC`) or descending (`DESC`) order. The default is `ASC`." ), argument( name = "nulls_first", - description = "Whether to sort nulls first(`NULLS FIRST` or `NULLS LAST`)." + description = "Whether to sort nulls first (`NULLS FIRST`) or last (`NULLS LAST`). The default is `NULLS FIRST`." 
) )] #[derive(Debug, PartialEq, Eq, Hash)] @@ -148,7 +152,7 @@ impl ScalarUDFImpl for ArraySort { } } -fn array_sort_inner(args: &[ArrayRef]) -> Result { +pub fn array_sort_inner(args: &[ArrayRef]) -> Result { if args.is_empty() || args.len() > 3 { return exec_err!("array_sort expects one to three arguments"); } @@ -161,25 +165,20 @@ fn array_sort_inner(args: &[ArrayRef]) -> Result { return Ok(new_null_array(args[0].data_type(), args[0].len())); } - let sort_options = match args.len() { - 1 => None, - 2 => { - let sort = as_string_array(&args[1])?.value(0); - Some(SortOptions { - descending: order_desc(sort)?, - nulls_first: true, - }) - } - 3 => { - let sort = as_string_array(&args[1])?.value(0); - let nulls_first = as_string_array(&args[2])?.value(0); - Some(SortOptions { - descending: order_desc(sort)?, - nulls_first: order_nulls_first(nulls_first)?, - }) - } - // We guard at the top - _ => unreachable!(), + let sort_options = if args.len() >= 2 { + let order = as_string_array(&args[1])?.value(0); + let descending = order_desc(order)?; + let nulls_first = if args.len() >= 3 { + order_nulls_first(as_string_array(&args[2])?.value(0))? + } else { + true + }; + Some(SortOptions { + descending, + nulls_first, + }) + } else { + None }; match args[0].data_type() { @@ -206,54 +205,283 @@ fn array_sort_generic( field: FieldRef, sort_options: Option, ) -> Result { + let values = list_array.values(); + + if values.data_type().is_primitive() { + array_sort_primitive(list_array, field, sort_options) + } else { + array_sort_non_primitive(list_array, field, sort_options) + } +} + +/// Sort each row of a primitive-typed ListArray using a custom in-place sort +/// kernel. +fn array_sort_primitive( + list_array: &GenericListArray, + field: FieldRef, + sort_options: Option, +) -> Result { + let values = list_array.values().as_ref(); + downcast_primitive_array! 
{ + values => sort_primitive_list(values, list_array, field, sort_options), + _ => exec_err!("array_sort: unsupported primitive type") + } +} + +fn sort_primitive_list( + prim_values: &PrimitiveArray, + list_array: &GenericListArray, + field: FieldRef, + sort_options: Option, +) -> Result +where + T::Native: ArrowNativeTypeOp, +{ + if prim_values.null_count() > 0 { + sort_list_with_nulls(prim_values, list_array, field, sort_options) + } else { + sort_list_no_nulls(prim_values, list_array, field, sort_options) + } +} + +/// Fast path for primitive values with no element-level nulls. Copies all +/// values into a single `Vec` and sorts each row's slice in-place. +fn sort_list_no_nulls( + prim_values: &PrimitiveArray, + list_array: &GenericListArray, + field: FieldRef, + sort_options: Option, +) -> Result +where + T::Native: ArrowNativeTypeOp, +{ let row_count = list_array.len(); + let offsets = list_array.offsets(); + let values_start = offsets[0].as_usize(); + let values_end = offsets[row_count].as_usize(); + + let descending = sort_options.is_some_and(|o| o.descending); - let mut array_lengths = vec![]; - let mut arrays = vec![]; - for i in 0..row_count { - if list_array.is_null(i) { - array_lengths.push(0); + // Copy all values into a mutable buffer + let mut values: Vec = + prim_values.values()[values_start..values_end].to_vec(); + + for (row_index, window) in offsets.windows(2).enumerate() { + if list_array.is_null(row_index) { + continue; + } + let start = window[0].as_usize() - values_start; + let end = window[1].as_usize() - values_start; + let slice = &mut values[start..end]; + if descending { + slice.sort_unstable_by(|a, b| b.compare(*a)); } else { - let arr_ref = list_array.value(i); - - // arrow sort kernel does not support Structs, so use - // lexsort_to_indices instead: - // https://github.com/apache/arrow-rs/issues/6911#issuecomment-2562928843 - let sorted_array = match arr_ref.data_type() { - DataType::Struct(_) => { - let sort_columns: Vec = 
vec![SortColumn { - values: Arc::clone(&arr_ref), - options: sort_options, - }]; - let indices = compute::lexsort_to_indices(&sort_columns, None)?; - compute::take(arr_ref.as_ref(), &indices, None)? - } - _ => { - let arr_ref = arr_ref.as_ref(); - compute::sort(arr_ref, sort_options)? - } - }; - array_lengths.push(sorted_array.len()); - arrays.push(sorted_array); + slice.sort_unstable_by(|a, b| a.compare(*b)); } } - let elements = arrays - .iter() - .map(|a| a.as_ref()) - .collect::>(); + let new_offsets = rebase_offsets(offsets); + let sorted_values = Arc::new( + PrimitiveArray::::new(values.into(), None) + .with_data_type(prim_values.data_type().clone()), + ); - let list_arr = if elements.is_empty() { - GenericListArray::::new_null(field, row_count) - } else { - GenericListArray::::new( - field, - OffsetBuffer::from_lengths(array_lengths), - Arc::new(compute::concat(elements.as_slice())?), - list_array.nulls().cloned(), + Ok(Arc::new(GenericListArray::::try_new( + field, + new_offsets, + sorted_values, + list_array.nulls().cloned(), + )?)) +} + +/// Slow path for primitive values with element-level nulls. 
+fn sort_list_with_nulls( + prim_values: &PrimitiveArray, + list_array: &GenericListArray, + field: FieldRef, + sort_options: Option, +) -> Result +where + T::Native: ArrowNativeTypeOp, +{ + let row_count = list_array.len(); + let offsets = list_array.offsets(); + let values_start = offsets[0].as_usize(); + let values_end = offsets[row_count].as_usize(); + let total_values = values_end - values_start; + + let descending = sort_options.is_some_and(|o| o.descending); + let nulls_first = sort_options.is_none_or(|o| o.nulls_first); + + let mut out_values: Vec = vec![T::Native::default(); total_values]; + let mut validity = BooleanBufferBuilder::new(total_values); + + let src_nulls = prim_values.nulls().ok_or_else(|| { + internal_datafusion_err!( + "sort_list_with_nulls called but values have no null buffer" ) + })?; + let src_values = prim_values.values(); + + for (row_index, window) in offsets.windows(2).enumerate() { + let start = window[0].as_usize(); + let end = window[1].as_usize(); + let row_len = end - start; + let out_start = start - values_start; + + if list_array.is_null(row_index) || row_len == 0 { + validity.append_n(row_len, false); + continue; + } + + let null_count = src_nulls.slice(start, row_len).null_count(); + let valid_count = row_len - null_count; + + // Compact valid values directly into the target region of the output + // buffer: after nulls (if nulls_first) or at the start (if nulls_last). 
+ let valid_offset = if nulls_first { null_count } else { 0 }; + let mut write_pos = out_start + valid_offset; + for i in start..end { + if src_nulls.is_valid(i) { + out_values[write_pos] = src_values[i]; + write_pos += 1; + } + } + + let valid_slice = &mut out_values + [out_start + valid_offset..out_start + valid_offset + valid_count]; + if descending { + valid_slice.sort_unstable_by(|a, b| b.compare(*a)); + } else { + valid_slice.sort_unstable_by(|a, b| a.compare(*b)); + } + + // Build validity bits + if nulls_first { + validity.append_n(null_count, false); + validity.append_n(valid_count, true); + } else { + validity.append_n(valid_count, true); + validity.append_n(null_count, false); + } + } + + let new_offsets = rebase_offsets(offsets); + + let null_buffer = NullBuffer::from(validity.finish()); + let sorted_values = Arc::new( + PrimitiveArray::::new(out_values.into(), Some(null_buffer)) + .with_data_type(prim_values.data_type().clone()), + ); + + Ok(Arc::new(GenericListArray::::try_new( + field, + new_offsets, + sorted_values, + list_array.nulls().cloned(), + )?)) +} + +/// Sort a non-primitive-typed ListArray by converting all rows at once using +/// `RowConverter`, then sorting row indices by comparing encoded bytes (sort +/// direction and null ordering are baked into the encoding), and materializing +/// the result with a single `take()`. 
+fn array_sort_non_primitive( + list_array: &GenericListArray, + field: FieldRef, + sort_options: Option, +) -> Result { + let row_count = list_array.len(); + let values = list_array.values(); + let offsets = list_array.offsets(); + let values_start = offsets[0].as_usize(); + let total_values = offsets[row_count].as_usize() - values_start; + + let converter = RowConverter::new(vec![SortField::new_with_options( + values.data_type().clone(), + sort_options.unwrap_or_default(), + )])?; + let values_sliced = values.slice(values_start, total_values); + let rows = converter.convert_columns(&[Arc::clone(&values_sliced)])?; + + let mut indices: Vec = Vec::with_capacity(total_values); + let mut new_offsets = Vec::with_capacity(row_count + 1); + new_offsets.push(OffsetSize::usize_as(0)); + + let mut sort_scratch: Vec = Vec::new(); + + for (row_index, window) in offsets.windows(2).enumerate() { + let start = window[0]; + let end = window[1]; + + if list_array.is_null(row_index) { + new_offsets.push(new_offsets[row_index]); + continue; + } + + let len = (end - start).as_usize(); + let local_start = start.as_usize() - values_start; + + if len <= 1 { + indices.extend((local_start..local_start + len).map(OffsetSize::usize_as)); + } else { + sort_scratch.clear(); + sort_scratch.extend(local_start..local_start + len); + sort_scratch.sort_unstable_by(|&a, &b| rows.row(a).cmp(&rows.row(b))); + indices.extend(sort_scratch.iter().map(|&i| OffsetSize::usize_as(i))); + } + + new_offsets.push(new_offsets[row_index] + (end - start)); + } + + let sorted_values = if indices.is_empty() { + new_empty_array(values.data_type()) + } else { + take_by_indices(&values_sliced, indices)? }; - Ok(Arc::new(list_arr)) + + Ok(Arc::new(GenericListArray::::try_new( + field, + OffsetBuffer::::new(new_offsets.into()), + sorted_values, + list_array.nulls().cloned(), + )?)) +} + +/// Select elements from `values` at the given `indices` using `compute::take`. 
+/// We consume `indices` in order to avoid an intermediate copy. +fn take_by_indices( + values: &ArrayRef, + indices: Vec, +) -> Result { + let len = indices.len(); + let buffer = arrow::buffer::Buffer::from_vec(indices); + let indices_array: ArrayRef = if OffsetSize::IS_LARGE { + Arc::new(UInt64Array::new( + arrow::buffer::ScalarBuffer::new(buffer, 0, len), + None, + )) + } else { + Arc::new(UInt32Array::new( + arrow::buffer::ScalarBuffer::new(buffer, 0, len), + None, + )) + }; + Ok(compute::take(values.as_ref(), &indices_array, None)?) +} + +/// Rebase offsets so they start at 0. For non-sliced ListArrays (the common +/// case) offsets already start at 0 and we can clone the Arc-backed buffer +/// cheaply instead of allocating a new Vec. +fn rebase_offsets( + offsets: &OffsetBuffer, +) -> OffsetBuffer { + if offsets[0].as_usize() == 0 { + offsets.clone() + } else { + let rebased: Vec = offsets.iter().map(|o| *o - offsets[0]).collect(); + OffsetBuffer::new(rebased.into()) + } } fn order_desc(modifier: &str) -> Result { diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index 7e6050d8e62f..9fbd2f5eaf27 100644 --- a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -2535,6 +2535,14 @@ select array_sort([]); ---- [] +# empty-but-non-null string arrays should remain non-null, not become null +query ?B +select array_sort(column1), array_sort(column1) is null +from (values (arrow_cast(make_array('b', 'a'), 'List(Utf8)')), (arrow_cast([], 'List(Utf8)'))) as t(column1); +---- +[a, b] false +[] false + # test with null arguments query ? select array_sort(NULL); @@ -2602,6 +2610,14 @@ from values (array_sort(arrow_cast([1, 3, 5, -5], 'FixedSizeList(4 x non-null In ---- [-5, 1, 3, 5] List(non-null Int32) +# arrays of strings +query ??? 
+select array_sort(make_array('banana', 'apple', null, 'cherry')), + array_sort(make_array('banana', 'apple', null, 'cherry'), 'DESC', 'NULLS LAST'), + array_sort(make_array('banana', 'apple', null, 'cherry'), 'ASC', 'NULLS LAST'); +---- +[NULL, apple, banana, cherry] [cherry, banana, apple, NULL] [apple, banana, cherry, NULL] + query ? select array_sort([struct('foo', 3), struct('foo', 1), struct('bar', 1)]) ---- @@ -2625,6 +2641,76 @@ select array_sort([sum(a)]) from t1 where a > 100 group by b; statement ok drop table t1; +# float arrays with NaN and Infinity (NaN sorts after Infinity per IEEE totalOrder) +query ??? +select array_sort(make_array(1.0, 'NaN'::double, -1.0, 'Infinity'::double, '-Infinity'::double, null)), + array_sort(make_array(1.0, 'NaN'::double, -1.0, 'Infinity'::double, '-Infinity'::double, null), 'DESC', 'NULLS LAST'), + array_sort(make_array('NaN'::double, 'NaN'::double, 1.0)); +---- +[NULL, -inf, -1.0, 1.0, inf, NaN] [NaN, inf, 1.0, -1.0, -inf, NULL] [1.0, NaN, NaN] + +# float32 arrays +query ?? +select array_sort(arrow_cast(make_array(3.0, 1.0, 'NaN'::double, null, 2.0), 'List(Float32)')), + array_sort(arrow_cast(make_array(3.0, 1.0, 'NaN'::double, null, 2.0), 'List(Float32)'), 'DESC', 'NULLS LAST'); +---- +[NULL, 1.0, 2.0, 3.0, NaN] [NaN, 3.0, 2.0, 1.0, NULL] + +# element-level nulls with all sort option combinations +query ???? +select array_sort(make_array(3, null, 1, null, 2), 'ASC', 'NULLS FIRST'), + array_sort(make_array(3, null, 1, null, 2), 'ASC', 'NULLS LAST'), + array_sort(make_array(3, null, 1, null, 2), 'DESC', 'NULLS FIRST'), + array_sort(make_array(3, null, 1, null, 2), 'DESC', 'NULLS LAST'); +---- +[NULL, NULL, 1, 2, 3] [1, 2, 3, NULL, NULL] [NULL, NULL, 3, 2, 1] [3, 2, 1, NULL, NULL] + +# timestamp arrays +query ?? 
+select array_sort(make_array(arrow_cast('2024-01-15T10:00:00', 'Timestamp(Nanosecond, None)'), + arrow_cast('2024-01-01T00:00:00', 'Timestamp(Nanosecond, None)'), + null, + arrow_cast('2024-06-15T12:00:00', 'Timestamp(Nanosecond, None)'))), + array_sort(make_array(arrow_cast('2024-01-15T10:00:00', 'Timestamp(Nanosecond, None)'), + arrow_cast('2024-01-01T00:00:00', 'Timestamp(Nanosecond, None)'), + null, + arrow_cast('2024-06-15T12:00:00', 'Timestamp(Nanosecond, None)')), 'DESC', 'NULLS LAST'); +---- +[NULL, 2024-01-01T00:00:00, 2024-01-15T10:00:00, 2024-06-15T12:00:00] [2024-06-15T12:00:00, 2024-01-15T10:00:00, 2024-01-01T00:00:00, NULL] + +# date arrays +query ?? +select array_sort(make_array('2024-03-01'::date, '2024-01-01'::date, null, '2024-02-01'::date)), + array_sort(make_array('2024-03-01'::date, '2024-01-01'::date, null, '2024-02-01'::date), 'DESC', 'NULLS LAST'); +---- +[NULL, 2024-01-01, 2024-02-01, 2024-03-01] [2024-03-01, 2024-02-01, 2024-01-01, NULL] + +# struct arrays with nulls and DESC +query ?? +select array_sort([struct('b', 2), struct('a', 1), null, struct('a', 3)]), + array_sort([struct('b', 2), struct('a', 1), null, struct('a', 3)], 'DESC', 'NULLS LAST'); +---- +[NULL, {c0: a, c1: 1}, {c0: a, c1: 3}, {c0: b, c1: 2}] [{c0: b, c1: 2}, {c0: a, c1: 3}, {c0: a, c1: 1}, NULL] + +# boolean arrays +query ?? +select array_sort(make_array(true, false, null, true, false)), + array_sort(make_array(true, false, null, true, false), 'DESC', 'NULLS LAST'); +---- +[NULL, false, false, true, true] [true, true, false, false, NULL] + +# all-null array +query ? +select array_sort(make_array(null, null, null)); +---- +[NULL, NULL, NULL] + +# single-element arrays +query ?? +select array_sort(make_array(42)), array_sort(make_array(null::int)); +---- +[42] [NULL] + ## list_sort (aliases: `array_sort`) query ??? 
select list_sort(make_array(1, 3, null, 5, NULL, -5)), list_sort(make_array(1, 3, null, 2), 'ASC'), list_sort(make_array(1, 3, null, 2), 'desc', 'NULLS FIRST'); diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md index ae5dbd5bee75..d347e36d2783 100644 --- a/docs/source/user-guide/sql/scalar_functions.md +++ b/docs/source/user-guide/sql/scalar_functions.md @@ -4182,8 +4182,8 @@ array_sort(array, desc, nulls_first) #### Arguments - **array**: Array expression. Can be a constant, column, or function, and any combination of array operators. -- **desc**: Whether to sort in descending order(`ASC` or `DESC`). -- **nulls_first**: Whether to sort nulls first(`NULLS FIRST` or `NULLS LAST`). +- **desc**: Whether to sort in ascending (`ASC`) or descending (`DESC`) order. The default is `ASC`. +- **nulls_first**: Whether to sort nulls first (`NULLS FIRST`) or last (`NULLS LAST`). The default is `NULLS FIRST`. #### Example From d92f38e54ae3721d7f08399a9078fe31cc1a8f33 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Sat, 21 Mar 2026 11:24:54 -0400 Subject: [PATCH 3/4] Avoid making array_sort_inner public for benchmarks --- .../functions-nested/benches/array_sort.rs | 26 +++++++++++++++---- datafusion/functions-nested/src/sort.rs | 2 +- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/datafusion/functions-nested/benches/array_sort.rs b/datafusion/functions-nested/benches/array_sort.rs index 99772febc4d6..0698a180596d 100644 --- a/datafusion/functions-nested/benches/array_sort.rs +++ b/datafusion/functions-nested/benches/array_sort.rs @@ -22,7 +22,9 @@ use arrow::array::{ArrayRef, BooleanBufferBuilder, Int32Array, ListArray, String use arrow::buffer::{NullBuffer, OffsetBuffer}; use arrow::datatypes::{DataType, Field}; use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main}; -use datafusion_functions_nested::sort::array_sort_inner; +use datafusion_common::config::ConfigOptions; +use 
datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl}; +use datafusion_functions_nested::sort::ArraySort; use rand::SeedableRng; use rand::rngs::StdRng; use rand::seq::SliceRandom; @@ -116,10 +118,24 @@ fn create_string_list_array(num_rows: usize, elements_per_row: usize) -> ArrayRe )) } +fn invoke_array_sort(udf: &ArraySort, array: &ArrayRef) -> ColumnarValue { + udf.invoke_with_args(ScalarFunctionArgs { + args: vec![ColumnarValue::Array(Arc::clone(array))], + arg_fields: vec![ + Field::new("arr", array.data_type().clone(), true).into(), + ], + number_rows: array.len(), + return_field: Field::new("result", array.data_type().clone(), true).into(), + config_options: Arc::new(ConfigOptions::default()), + }) + .unwrap() +} + /// Vary elements_per_row over [5, 20, 100, 1000]: for small arrays, per-row /// overhead dominates, whereas for larger arrays the sort kernel dominates. fn bench_array_sort(c: &mut Criterion) { let mut group = c.benchmark_group("array_sort"); + let udf = ArraySort::new(); // Int32 arrays for &elements_per_row in &[5, 20, 100, 1000] { @@ -129,7 +145,7 @@ fn bench_array_sort(c: &mut Criterion) { &elements_per_row, |b, _| { b.iter(|| { - black_box(array_sort_inner(std::slice::from_ref(&array)).unwrap()); + black_box(invoke_array_sort(&udf, &array)); }); }, ); @@ -140,7 +156,7 @@ fn bench_array_sort(c: &mut Criterion) { let array = create_int32_list_array(NUM_ROWS, 50, true); group.bench_function("int32_with_nulls", |b| { b.iter(|| { - black_box(array_sort_inner(std::slice::from_ref(&array)).unwrap()); + black_box(invoke_array_sort(&udf, &array)); }); }); } @@ -154,7 +170,7 @@ fn bench_array_sort(c: &mut Criterion) { &elements_per_row, |b, _| { b.iter(|| { - black_box(array_sort_inner(std::slice::from_ref(&array)).unwrap()); + black_box(invoke_array_sort(&udf, &array)); }); }, ); @@ -168,7 +184,7 @@ fn bench_array_sort(c: &mut Criterion) { &elements_per_row, |b, _| { b.iter(|| { - 
black_box(array_sort_inner(std::slice::from_ref(&array)).unwrap()); + black_box(invoke_array_sort(&udf, &array)); }); }, ); diff --git a/datafusion/functions-nested/src/sort.rs b/datafusion/functions-nested/src/sort.rs index 138eac0abbd0..8678c680997a 100644 --- a/datafusion/functions-nested/src/sort.rs +++ b/datafusion/functions-nested/src/sort.rs @@ -152,7 +152,7 @@ impl ScalarUDFImpl for ArraySort { } } -pub fn array_sort_inner(args: &[ArrayRef]) -> Result { +fn array_sort_inner(args: &[ArrayRef]) -> Result { if args.is_empty() || args.len() > 3 { return exec_err!("array_sort expects one to three arguments"); } From 2ee21e19693ec536ec69dba7965f4d0d8754ee9a Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Sat, 21 Mar 2026 11:42:51 -0400 Subject: [PATCH 4/4] cargo fmt --- datafusion/functions-nested/benches/array_sort.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/datafusion/functions-nested/benches/array_sort.rs b/datafusion/functions-nested/benches/array_sort.rs index 0698a180596d..940c0396cbb0 100644 --- a/datafusion/functions-nested/benches/array_sort.rs +++ b/datafusion/functions-nested/benches/array_sort.rs @@ -121,9 +121,7 @@ fn create_string_list_array(num_rows: usize, elements_per_row: usize) -> ArrayRe fn invoke_array_sort(udf: &ArraySort, array: &ArrayRef) -> ColumnarValue { udf.invoke_with_args(ScalarFunctionArgs { args: vec![ColumnarValue::Array(Arc::clone(array))], - arg_fields: vec![ - Field::new("arr", array.data_type().clone(), true).into(), - ], + arg_fields: vec![Field::new("arr", array.data_type().clone(), true).into()], number_rows: array.len(), return_field: Field::new("result", array.data_type().clone(), true).into(), config_options: Arc::new(ConfigOptions::default()),