From 33ccecc97a81b54bd5da652fcad20dea0ce38df2 Mon Sep 17 00:00:00 2001 From: theirix Date: Fri, 20 Mar 2026 08:06:49 +0000 Subject: [PATCH 01/23] Refactor approx_percentile_cont UDFs to use numeric coercion --- .../src/approx_percentile_cont.rs | 59 ++++++++++----- .../src/approx_percentile_cont_with_weight.rs | 72 +++++++++++++------ 2 files changed, 91 insertions(+), 40 deletions(-) diff --git a/datafusion/functions-aggregate/src/approx_percentile_cont.rs b/datafusion/functions-aggregate/src/approx_percentile_cont.rs index 392a044d01394..174769a4182c1 100644 --- a/datafusion/functions-aggregate/src/approx_percentile_cont.rs +++ b/datafusion/functions-aggregate/src/approx_percentile_cont.rs @@ -30,17 +30,17 @@ use arrow::{ }, datatypes::{DataType, Field}, }; +use datafusion_common::types::{NativeType, logical_float64}; use datafusion_common::{ DataFusionError, Result, ScalarValue, downcast_value, internal_err, not_impl_err, plan_err, }; use datafusion_expr::expr::{AggregateFunction, Sort}; use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs}; -use datafusion_expr::type_coercion::aggregates::{INTEGERS, NUMERICS}; use datafusion_expr::utils::format_state_name; use datafusion_expr::{ - Accumulator, AggregateUDFImpl, Documentation, Expr, Signature, TypeSignature, - Volatility, + Accumulator, AggregateUDFImpl, Coercion, Documentation, Expr, Signature, + TypeSignature, TypeSignatureClass, Volatility, }; use datafusion_functions_aggregate_common::tdigest::{DEFAULT_MAX_SIZE, TDigest}; use datafusion_macros::user_doc; @@ -133,22 +133,45 @@ impl Default for ApproxPercentileCont { impl ApproxPercentileCont { /// Create a new [`ApproxPercentileCont`] aggregate function. pub fn new() -> Self { - let mut variants = Vec::with_capacity(NUMERICS.len() * (INTEGERS.len() + 1)); // Accept any numeric value paired with a float64 percentile - for num in NUMERICS { - variants.push(TypeSignature::Exact(vec![num.clone(), DataType::Float64])); - // Additionally accept an integer number of centroids for T-Digest - for int in INTEGERS { - variants.push(TypeSignature::Exact(vec![ - num.clone(), - DataType::Float64, - int.clone(), - ])) - } - } - Self { - signature: Signature::one_of(variants, Volatility::Immutable), - } + // Additionally accept an integer number of centroids for T-Digest + let signature = Signature::one_of( + vec![ + // 2 args - numeric, percentile (float) + TypeSignature::Coercible(vec![ + Coercion::new_implicit( + TypeSignatureClass::Float, + vec![TypeSignatureClass::Numeric], + NativeType::Float64, + ), + Coercion::new_implicit( + TypeSignatureClass::Native(logical_float64()), + vec![TypeSignatureClass::Numeric], + NativeType::Float64, + ), + ]), + // 3 args - numeric, percentile (float), centroid (integer) + TypeSignature::Coercible(vec![ + Coercion::new_implicit( + TypeSignatureClass::Float, + vec![TypeSignatureClass::Numeric], + NativeType::Float64, + ), + Coercion::new_implicit( + TypeSignatureClass::Native(logical_float64()), + vec![TypeSignatureClass::Numeric], + NativeType::Float64, + ), + Coercion::new_implicit( + TypeSignatureClass::Integer, + vec![TypeSignatureClass::Numeric], + NativeType::Int64, + ), + ]), + ], + Volatility::Immutable, + ); + Self { signature } } pub(crate) fn create_accumulator( diff --git a/datafusion/functions-aggregate/src/approx_percentile_cont_with_weight.rs b/datafusion/functions-aggregate/src/approx_percentile_cont_with_weight.rs index 6fd90130e6741..180f59d776d52 100644 --- a/datafusion/functions-aggregate/src/approx_percentile_cont_with_weight.rs +++ b/datafusion/functions-aggregate/src/approx_percentile_cont_with_weight.rs @@ -25,13 +25,13 @@ use arrow::compute::{and, filter, is_not_null}; use arrow::datatypes::FieldRef; use arrow::{array::ArrayRef, datatypes::DataType}; use datafusion_common::ScalarValue; +use datafusion_common::types::{NativeType, logical_float64}; use datafusion_common::{Result, not_impl_err, plan_err}; -use datafusion_expr::Volatility::Immutable; use datafusion_expr::expr::{AggregateFunction, Sort}; use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs}; -use datafusion_expr::type_coercion::aggregates::{INTEGERS, NUMERICS}; use datafusion_expr::{ - Accumulator, AggregateUDFImpl, Documentation, Expr, Signature, TypeSignature, + Accumulator, AggregateUDFImpl, Coercion, Documentation, Expr, Signature, + TypeSignature, TypeSignatureClass, Volatility, }; use datafusion_functions_aggregate_common::tdigest::{Centroid, TDigest}; use datafusion_macros::user_doc; @@ -126,26 +126,54 @@ impl Default for ApproxPercentileContWithWeight { impl ApproxPercentileContWithWeight { /// Create a new [`ApproxPercentileContWithWeight`] aggregate function. pub fn new() -> Self { - let mut variants = Vec::with_capacity(NUMERICS.len() * (INTEGERS.len() + 1)); - // Accept any numeric value paired with weight and float64 percentile - for num in NUMERICS { - variants.push(TypeSignature::Exact(vec![ - num.clone(), - num.clone(), - DataType::Float64, - ])); - // Additionally accept an integer number of centroids for T-Digest - for int in INTEGERS { - variants.push(TypeSignature::Exact(vec![ - num.clone(), - num.clone(), - DataType::Float64, - int.clone(), - ])); - } - } + let signature = Signature::one_of( + vec![ + // 3 args - numeric, weight (float), percentile (float) + TypeSignature::Coercible(vec![ + Coercion::new_implicit( + TypeSignatureClass::Float, + vec![TypeSignatureClass::Numeric], + NativeType::Float64, + ), + Coercion::new_implicit( + TypeSignatureClass::Float, + vec![TypeSignatureClass::Numeric], + NativeType::Float64, + ), + Coercion::new_implicit( + TypeSignatureClass::Native(logical_float64()), + vec![TypeSignatureClass::Numeric], + NativeType::Float64, + ), + ]), + // 4 args - numeric, weight (float), percentile (float), centroid (integer) + TypeSignature::Coercible(vec![ + Coercion::new_implicit( + TypeSignatureClass::Float, + vec![TypeSignatureClass::Numeric], + NativeType::Float64, + ), + Coercion::new_implicit( + TypeSignatureClass::Float, + vec![TypeSignatureClass::Numeric], + NativeType::Float64, + ), + Coercion::new_implicit( + TypeSignatureClass::Native(logical_float64()), + vec![TypeSignatureClass::Numeric], + NativeType::Float64, + ), + Coercion::new_implicit( + TypeSignatureClass::Integer, + vec![TypeSignatureClass::Numeric], + NativeType::Int64, + ), + ]), + ], + Volatility::Immutable, + ); Self { - signature: Signature::one_of(variants, Immutable), + signature, approx_percentile_cont: ApproxPercentileCont::new(), } } From 3b48e04fb1b9c896f0186666f63fd84fc4343786 Mon Sep 17 00:00:00 2001 From: theirix Date: Fri, 20 Mar 2026 08:06:59 +0000 Subject: [PATCH 02/23] Update SLT of approx_percentile_cont to return float --- datafusion/sqllogictest/test_files/clickbench_extended.slt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/sqllogictest/test_files/clickbench_extended.slt b/datafusion/sqllogictest/test_files/clickbench_extended.slt index ee3e33551ee3e..6b0d78cdba8f3 100644 --- a/datafusion/sqllogictest/test_files/clickbench_extended.slt +++ b/datafusion/sqllogictest/test_files/clickbench_extended.slt @@ -52,7 +52,7 @@ query IIIIII SELECT "ClientIP", "WatchID", COUNT(*) c, MIN("ResponseStartTiming") tmin, MEDIAN("ResponseStartTiming") tmed, MAX("ResponseStartTiming") tmax FROM hits WHERE "JavaEnable" = 0 GROUP BY "ClientIP", "WatchID" HAVING c > 1 ORDER BY tmed DESC LIMIT 10; ---- -query IIIIII +query IIIIRI SELECT "ClientIP", "WatchID", COUNT(*) c, MIN("ResponseStartTiming") tmin, APPROX_PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY "ResponseStartTiming") tp95, MAX("ResponseStartTiming") tmax FROM 'hits' WHERE "JavaEnable" = 0 GROUP BY "ClientIP", "WatchID" HAVING c > 1 ORDER BY tp95 DESC LIMIT 10; ---- From 4637df2593384305f42b3055fc9da01958cff99a Mon Sep 17 00:00:00 2001 From: theirix Date: Fri, 20 Mar 2026 08:07:06 +0000 Subject: [PATCH 03/23] Refactor test UDF stub to use numeric coercion --- datafusion/expr/src/test/function_stub.rs | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/datafusion/expr/src/test/function_stub.rs b/datafusion/expr/src/test/function_stub.rs index 26ac16d90d69f..c6434f9598462 100644 --- a/datafusion/expr/src/test/function_stub.rs +++ b/datafusion/expr/src/test/function_stub.rs @@ -31,13 +31,14 @@ use datafusion_common::plan_err; use datafusion_common::{Result, exec_err, not_impl_err, utils::take_function_args}; use crate::Volatility::Immutable; -use crate::type_coercion::aggregates::NUMERICS; use crate::{ - Accumulator, AggregateUDFImpl, Expr, GroupsAccumulator, ReversedUDAF, Signature, + Accumulator, AggregateUDFImpl, Coercion, Expr, GroupsAccumulator, ReversedUDAF, + Signature, TypeSignature, TypeSignatureClass, expr::AggregateFunction, function::{AccumulatorArgs, StateFieldsArgs}, utils::AggregateOrderSensitivity, }; +use datafusion_common::types::{NativeType, logical_float64}; macro_rules! create_func { ($UDAF:ty, $AGGREGATE_UDF_FN:ident) => { @@ -462,9 +463,25 @@ pub struct Avg { impl Avg { pub fn new() -> Self { + let signature = Signature::one_of( + vec![ + TypeSignature::Coercible(vec![Coercion::new_exact( + TypeSignatureClass::Decimal, + )]), + TypeSignature::Coercible(vec![Coercion::new_exact( + TypeSignatureClass::Duration, + )]), + TypeSignature::Coercible(vec![Coercion::new_implicit( + TypeSignatureClass::Native(logical_float64()), + vec![TypeSignatureClass::Integer, TypeSignatureClass::Float], + NativeType::Float64, + )]), + ], + Immutable, + ); Self { aliases: vec![String::from("mean")], - signature: Signature::uniform(1, NUMERICS.to_vec(), Immutable), + signature, } } } From 968a851f56bbec2039e6071e087db8a3fdc5ae90 Mon Sep 17 00:00:00 2001 From: theirix Date: Fri, 20 Mar 2026 08:07:15 +0000 Subject: [PATCH 04/23] Deprecate aggregates NUMERICS and INTEGERS arrays --- datafusion/expr-common/src/signature.rs | 15 ++++++++++++++- .../expr-common/src/type_coercion/aggregates.rs | 2 ++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/datafusion/expr-common/src/signature.rs b/datafusion/expr-common/src/signature.rs index 82759be9f75e8..c1b38dd40361e 100644 --- a/datafusion/expr-common/src/signature.rs +++ b/datafusion/expr-common/src/signature.rs @@ -21,7 +21,6 @@ use std::fmt::Display; use std::hash::Hash; use std::sync::Arc; -use crate::type_coercion::aggregates::NUMERICS; use arrow::datatypes::{ DECIMAL32_MAX_PRECISION, DECIMAL64_MAX_PRECISION, DECIMAL128_MAX_PRECISION, DataType, Decimal128Type, DecimalType, Field, IntervalUnit, TimeUnit, @@ -595,6 +594,20 @@ impl Display for ArrayFunctionArgument { } } +static NUMERICS: &[DataType] = &[ + DataType::Int8, + DataType::Int16, + DataType::Int32, + DataType::Int64, + DataType::UInt8, + DataType::UInt16, + DataType::UInt32, + DataType::UInt64, + DataType::Float16, + DataType::Float32, + DataType::Float64, +]; + impl TypeSignature { pub fn to_string_repr(&self) -> Vec { match self { diff --git a/datafusion/expr-common/src/type_coercion/aggregates.rs b/datafusion/expr-common/src/type_coercion/aggregates.rs index df86ff582d658..d2c33b1e602d7 100644 --- a/datafusion/expr-common/src/type_coercion/aggregates.rs +++ b/datafusion/expr-common/src/type_coercion/aggregates.rs @@ -22,6 +22,7 @@ use datafusion_common::{Result, internal_err, plan_err}; // TODO: remove usage of these (INTEGERS and NUMERICS) in favour of signatures // see https://github.com/apache/datafusion/issues/18092 +#[deprecated(since = "54.0.0", note = "Use functions signatures")] pub static INTEGERS: &[DataType] = &[ DataType::Int8, DataType::Int16, @@ -33,6 +34,7 @@ pub static INTEGERS: &[DataType] = &[ DataType::UInt64, ]; +#[deprecated(since = "54.0.0", note = "Use functions signatures")] pub static NUMERICS: &[DataType] = &[ DataType::Int8, DataType::Int16, From d2bccf35ababc3209a2c5db6d9c52680efbc226a Mon Sep 17 00:00:00 2001 From: theirix Date: Fri, 20 Mar 2026 09:08:09 +0000 Subject: [PATCH 05/23] Update unit test for approx_percentile_cont to return floats --- .../core/tests/dataframe/dataframe_functions.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/datafusion/core/tests/dataframe/dataframe_functions.rs b/datafusion/core/tests/dataframe/dataframe_functions.rs index 014f356cd64cd..d790f9cc25a1e 100644 --- a/datafusion/core/tests/dataframe/dataframe_functions.rs +++ b/datafusion/core/tests/dataframe/dataframe_functions.rs @@ -422,7 +422,7 @@ async fn test_fn_approx_percentile_cont() -> Result<()> { +---------------------------------------------------------------------------+ | approx_percentile_cont(Float64(0.5)) WITHIN GROUP [test.b ASC NULLS LAST] | +---------------------------------------------------------------------------+ - | 10 | + | 10.0 | +---------------------------------------------------------------------------+ "); @@ -437,7 +437,7 @@ async fn test_fn_approx_percentile_cont() -> Result<()> { +----------------------------------------------------------------------------+ | approx_percentile_cont(Float64(0.1)) WITHIN GROUP [test.b DESC NULLS LAST] | +----------------------------------------------------------------------------+ - | 100 | + | 100.0 | +----------------------------------------------------------------------------+ "); @@ -457,7 +457,7 @@ async fn test_fn_approx_percentile_cont() -> Result<()> { +--------------------------------------------------------------------+ | approx_percentile_cont(arg_2) WITHIN GROUP [test.b ASC NULLS LAST] | +--------------------------------------------------------------------+ - | 10 | + | 10.0 | +--------------------------------------------------------------------+ " ); @@ -477,7 +477,7 @@ async fn test_fn_approx_percentile_cont() -> Result<()> { +---------------------------------------------------------------------+ | approx_percentile_cont(arg_2) WITHIN GROUP [test.b DESC NULLS LAST] | +---------------------------------------------------------------------+ - | 100 | + | 100.0 | +---------------------------------------------------------------------+ " ); @@ -494,7 +494,7 @@ async fn test_fn_approx_percentile_cont() -> Result<()> { +------------------------------------------------------------------------------------+ | approx_percentile_cont(Float64(0.5),Int32(2)) WITHIN GROUP [test.b ASC NULLS LAST] | +------------------------------------------------------------------------------------+ - | 30 | + | 30.25 | +------------------------------------------------------------------------------------+ "); @@ -510,7 +510,7 @@ async fn test_fn_approx_percentile_cont() -> Result<()> { +-------------------------------------------------------------------------------------+ | approx_percentile_cont(Float64(0.1),Int32(2)) WITHIN GROUP [test.b DESC NULLS LAST] | +-------------------------------------------------------------------------------------+ - | 69 | + | 69.85 | +-------------------------------------------------------------------------------------+ "); From 6fd21488641b2a4c67090211aea0cf520fbd127c Mon Sep 17 00:00:00 2001 From: theirix Date: Fri, 20 Mar 2026 09:41:31 +0000 Subject: [PATCH 06/23] Update unit test for stubfunction --- datafusion/optimizer/src/analyzer/type_coercion.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index efc9984acb9b0..5ebfe3d5dea9e 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -1888,7 +1888,8 @@ mod test { .err() .unwrap() .strip_backtrace(); - assert!(err.starts_with("Error during planning: Failed to coerce arguments to satisfy a call to 'avg' function: coercion from Utf8 to the signature Uniform(1, [Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64, Float16, Float32, Float64]) failed")); + assert!(err.contains("Function 'avg' failed to match any signature")); + assert!(err.contains("Error during planning: Function 'avg' requires Float64, but received String (DataType: Utf8)")); Ok(()) } From ff1eb8bc6419e272b6463229f33577b7f2e8712f Mon Sep 17 00:00:00 2001 From: theirix Date: Fri, 20 Mar 2026 10:11:09 +0000 Subject: [PATCH 07/23] Update aggregate SLT to use float --- .../sqllogictest/test_files/aggregate.slt | 129 +++++++++--------- 1 file changed, 66 insertions(+), 63 deletions(-) diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt index cf894a494ad90..1de4a81a65616 100644 --- a/datafusion/sqllogictest/test_files/aggregate.slt +++ b/datafusion/sqllogictest/test_files/aggregate.slt @@ -144,28 +144,22 @@ statement error DataFusion error: Schema error: Schema contains duplicate unqual SELECT approx_distinct(c9) count_c9, approx_distinct(cast(c9 as varchar)) count_c9_str FROM aggregate_test_100 # csv_query_approx_percentile_cont_with_weight -statement error Failed to coerce arguments to satisfy a call to 'approx_percentile_cont_with_weight' function +statement error Function 'approx_percentile_cont_with_weight' failed to match any signature SELECT approx_percentile_cont_with_weight(c2, 0.95) WITHIN GROUP (ORDER BY c1) FROM aggregate_test_100 -statement error Failed to coerce arguments to satisfy a call to 'approx_percentile_cont_with_weight' function +statement error Function 'approx_percentile_cont_with_weight' failed to match any signature SELECT approx_percentile_cont_with_weight(c1, 0.95) WITHIN GROUP (ORDER BY c3) FROM aggregate_test_100 -statement error Failed to coerce arguments to satisfy a call to 'approx_percentile_cont_with_weight' function +statement error Function 'approx_percentile_cont_with_weight' failed to match any signature SELECT approx_percentile_cont_with_weight(c2, c1) WITHIN GROUP (ORDER BY c3) FROM aggregate_test_100 # csv_query_approx_percentile_cont_with_histogram_bins statement error DataFusion error: Error during planning: Tdigest max_size value for 'APPROX_PERCENTILE_CONT' must be UInt > 0 literal \(got data type Int64\)\. SELECT c1, approx_percentile_cont(0.95, -1000) WITHIN GROUP (ORDER BY c3) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 -statement error Failed to coerce arguments to satisfy a call to 'approx_percentile_cont' function +statement error Function 'approx_percentile_cont' failed to match any signature SELECT approx_percentile_cont(0.95, c1) WITHIN GROUP (ORDER BY c3) FROM aggregate_test_100 -statement error DataFusion error: Error during planning: Failed to coerce arguments to satisfy a call to 'approx_percentile_cont' function: coercion from Int16, Float64, Float64 to the signature OneOf(.*) failed(.|\n)* -SELECT approx_percentile_cont(0.95, 111.1) WITHIN GROUP (ORDER BY c3) FROM aggregate_test_100 - -statement error DataFusion error: Error during planning: Failed to coerce arguments to satisfy a call to 'approx_percentile_cont' function: coercion from Float64, Float64, Float64 to the signature OneOf(.*) failed(.|\n)* -SELECT approx_percentile_cont(0.95, 111.1) WITHIN GROUP (ORDER BY c12) FROM aggregate_test_100 - statement error DataFusion error: Error during planning: Percentile value for 'APPROX_PERCENTILE_CONT' must be a literal SELECT approx_percentile_cont(c12) WITHIN GROUP (ORDER BY c12) FROM aggregate_test_100 @@ -1986,26 +1980,26 @@ SELECT (ABS(1 - CAST(approx_percentile_cont(0.9) WITHIN GROUP (ORDER BY c11) AS true # percentile_cont_with_nulls -query I +query R SELECT APPROX_PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY v) FROM (VALUES (1), (2), (3), (NULL), (NULL), (NULL)) as t (v); ---- 2 # percentile_cont_with_nulls_only -query I +query R SELECT APPROX_PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY v) FROM (VALUES (CAST(NULL as INT))) as t (v); ---- NULL # percentile_cont_with_weight_with_nulls -query I +query R SELECT APPROX_PERCENTILE_CONT_WITH_WEIGHT(w, 0.5) WITHIN GROUP (ORDER BY v) FROM (VALUES (1, 1), (2, 1), (3, 1), (4, NULL), (NULL, 1), (NULL, NULL)) as t (v, w); ---- 2 # percentile_cont_with_weight_nulls_only -query I +query R SELECT APPROX_PERCENTILE_CONT_WITH_WEIGHT(1, 0.5) WITHIN GROUP (ORDER BY v) FROM (VALUES (CAST(NULL as INT))) as t (v); ---- NULL @@ -2286,36 +2280,36 @@ b 5 NULL 20135.4 b NULL NULL 7732.315789473684 # csv_query_approx_percentile_cont_with_weight -query TI +query TR SELECT c1, approx_percentile_cont(0.95) WITHIN GROUP (ORDER BY c3) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 ---- -a 73 +a 73.55 b 68 -c 122 -d 124 -e 115 +c 122.5 +d 124.2 +e 115.6 # csv_query_approx_percentile_cont_with_weight (should be the same as above) -query TI +query TR SELECT c1, approx_percentile_cont(c3, 0.95) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 ---- -a 73 +a 73.55 b 68 -c 122 -d 124 -e 115 +c 122.5 +d 124.2 +e 115.6 # using approx_percentile_cont on 2 columns with same signature -query TII +query TRR SELECT c1, approx_percentile_cont(c2, 0.95) AS c2, approx_percentile_cont(c3, 0.95) AS c3 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 ---- -a 5 73 +a 5 73.55 b 5 68 -c 5 122 -d 5 124 -e 5 115 +c 5 122.5 +d 5 124.2 +e 5 115.6 # error is unique to this UDAF query TRR @@ -2329,73 +2323,82 @@ e 3 40.333333333333 -query TI +query TR SELECT c1, approx_percentile_cont(0.95) WITHIN GROUP (ORDER BY c3 DESC) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 ---- a -101 -b -114 -c -109 -d -98 -e -93 +b -114.3 +c -109.475 +d -98.6 +e -93.65 # csv_query_approx_percentile_cont_with_weight (2) -query TI +query TR SELECT c1, approx_percentile_cont_with_weight(1, 0.95) WITHIN GROUP (ORDER BY c3) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 ---- -a 73 +a 73.55 b 68 -c 122 -d 124 -e 115 +c 122.5 +d 124.2 +e 115.6 # csv_query_approx_percentile_cont_with_weight alternate syntax -query TI +query TR SELECT c1, approx_percentile_cont_with_weight(c3, 1, 0.95) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 ---- -a 73 +a 73.55 b 68 -c 122 -d 124 -e 115 - +c 122.5 +d 124.2 +e 115.6 -query TI +query TR SELECT c1, approx_percentile_cont_with_weight(1, 0.95) WITHIN GROUP (ORDER BY c3 DESC) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 ---- a -101 -b -114 -c -109 -d -98 -e -93 +b -114.3 +c -109.475 +d -98.6 +e -93.65 # csv_query_approx_percentile_cont_with_histogram_bins -query TI +query TR SELECT c1, approx_percentile_cont(0.95, 200) WITHIN GROUP (ORDER BY c3) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 ---- -a 73 +a 73.55 b 68 -c 122 -d 124 -e 115 +c 122.5 +d 124.2 +e 115.6 -query TI +query TR +SELECT c1, approx_percentile_cont(0.95, 200.1) WITHIN GROUP (ORDER BY c3) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 +---- +a 73.55 +b 68 +c 122.5 +d 124.2 +e 115.6 + + +query TR SELECT c1, approx_percentile_cont_with_weight(c2, 0.95) WITHIN GROUP (ORDER BY c3) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 ---- a 65 b 68 c 122 -d 123 -e 110 +d 123.15 +e 110.266666666667 # approx_percentile_cont_with_weight with centroids -query TI +query TR SELECT c1, approx_percentile_cont_with_weight(c2, 0.95, 200) WITHIN GROUP (ORDER BY c3) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 ---- a 65 b 68 c 122 -d 123 -e 110 +d 123.15 +e 110.266666666667 # csv_query_sum_crossjoin query TTI @@ -3788,10 +3791,10 @@ SELECT COUNT(DISTINCT c1) FROM test # TODO: aggregate_with_alias # test_approx_percentile_cont_decimal_support -query TI +query TR SELECT c1, approx_percentile_cont(cast(0.85 as decimal(10,2))) WITHIN GROUP (ORDER BY c2) apc FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 ---- -a 4 +a 4.175 b 5 c 4 d 4 From c04052cb0e932905c614e63deeddedf7ce306129 Mon Sep 17 00:00:00 2001 From: theirix Date: Fri, 20 Mar 2026 10:34:35 +0000 Subject: [PATCH 08/23] Fix flaky test --- datafusion/optimizer/src/analyzer/type_coercion.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index 5ebfe3d5dea9e..7e76ef8e4bf66 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -1889,7 +1889,7 @@ mod test { .unwrap() .strip_backtrace(); assert!(err.contains("Function 'avg' failed to match any signature")); - assert!(err.contains("Error during planning: Function 'avg' requires Float64, but received String (DataType: Utf8)")); + assert!(err.contains("Function 'avg' requires Float64, but received String (DataType: Utf8)")); Ok(()) } From 0a17a19a08fb3ababd448a8e03dca31f122d05a5 Mon Sep 17 00:00:00 2001 From: theirix Date: Fri, 20 Mar 2026 10:39:30 +0000 Subject: [PATCH 09/23] Format --- datafusion/optimizer/src/analyzer/type_coercion.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index 7e76ef8e4bf66..5d847892a263a 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -1889,7 +1889,9 @@ mod test { .unwrap() .strip_backtrace(); assert!(err.contains("Function 'avg' failed to match any signature")); - assert!(err.contains("Function 'avg' requires Float64, but received String (DataType: Utf8)")); + assert!(err.contains( + "Function 'avg' requires Float64, but received String (DataType: Utf8)" + )); Ok(()) } From add22d8e11307c1cf6e900e23580cb5ce2ef787a Mon Sep 17 00:00:00 2001 From: theirix Date: Fri, 20 Mar 2026 11:01:05 +0000 Subject: [PATCH 10/23] Debug flaky test --- .../optimizer/src/analyzer/type_coercion.rs | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index 5d847892a263a..6ec7c5078f73d 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -1888,10 +1888,18 @@ mod test { .err() .unwrap() .strip_backtrace(); - assert!(err.contains("Function 'avg' failed to match any signature")); - assert!(err.contains( - "Function 'avg' requires Float64, but received String (DataType: Utf8)" - )); + assert!( + err.contains("Function 'avg' failed to match any signature"), + "Assertion failed: {:?}", + err + ); + assert!( + err.contains( + "Function 'avg' requires Float64, but received String (DataType: Utf8)" + ), + "Assertion failed: {:?}", + err + ); Ok(()) } From 98cdc250977c4f68ef580aec45dddef3ec5525c5 Mon Sep 17 00:00:00 2001 From: theirix Date: Fri, 20 Mar 2026 11:10:20 +0000 Subject: [PATCH 11/23] Remove flaky assertion --- datafusion/optimizer/src/analyzer/type_coercion.rs | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index 6ec7c5078f73d..74765539c13c3 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -1890,15 +1890,7 @@ mod test { .strip_backtrace(); assert!( err.contains("Function 'avg' failed to match any signature"), - "Assertion failed: {:?}", - err - ); - assert!( - err.contains( - "Function 'avg' requires Float64, but received String (DataType: Utf8)" - ), - "Assertion failed: {:?}", - err + "Err: {err:?}" ); Ok(()) } From 09c0b907febfb4652f453f326baf042cb16bc0a0 Mon Sep 17 00:00:00 2001 From: theirix Date: Sun, 22 Mar 2026 16:18:39 +0000 Subject: [PATCH 12/23] Use selection of numeric types for example --- datafusion/expr-common/src/signature.rs | 26 +++++++------------------ 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/datafusion/expr-common/src/signature.rs b/datafusion/expr-common/src/signature.rs index c1b38dd40361e..a933319bdb917 100644 --- a/datafusion/expr-common/src/signature.rs +++ b/datafusion/expr-common/src/signature.rs @@ -594,20 +594,6 @@ impl Display for ArrayFunctionArgument { } } -static NUMERICS: &[DataType] = &[ - DataType::Int8, - DataType::Int16, - DataType::Int32, - DataType::Int64, - DataType::UInt8, - DataType::UInt16, - DataType::UInt32, - DataType::UInt64, - DataType::Float16, - DataType::Float32, - DataType::Float64, -]; - impl TypeSignature { pub fn to_string_repr(&self) -> Vec { match self { @@ -926,11 +912,13 @@ impl TypeSignature { .cloned() .map(|data_type| vec![data_type]) .collect(), - TypeSignature::Numeric(arg_count) => NUMERICS - .iter() - .cloned() - .map(|numeric_type| vec![numeric_type; *arg_count]) - .collect(), + TypeSignature::Numeric(arg_count) => { + [DataType::Int8, DataType::UInt64, DataType::Float32] + .iter() + .cloned() + .map(|numeric_type| vec![numeric_type; *arg_count]) + .collect() + } TypeSignature::String(arg_count) => get_data_types(&NativeType::String) .into_iter() .map(|dt| vec![dt; *arg_count]) From abd8b09d4f0dddac64483ba55b5633977ffd1e2e Mon Sep 17 00:00:00 2001 From: theirix Date: Sun, 22 Mar 2026 16:18:51 +0000 Subject: [PATCH 13/23] Remove TODO as arrays are deprecated --- datafusion/expr-common/src/type_coercion/aggregates.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/datafusion/expr-common/src/type_coercion/aggregates.rs b/datafusion/expr-common/src/type_coercion/aggregates.rs index d2c33b1e602d7..ada0bd26b8d06 100644 --- a/datafusion/expr-common/src/type_coercion/aggregates.rs +++ b/datafusion/expr-common/src/type_coercion/aggregates.rs @@ -20,8 +20,6 @@ use arrow::datatypes::{DataType, FieldRef}; use datafusion_common::{Result, internal_err, plan_err}; -// TODO: remove usage of these (INTEGERS and NUMERICS) in favour of signatures -// see https://github.com/apache/datafusion/issues/18092 #[deprecated(since = "54.0.0", note = "Use functions signatures")] pub static INTEGERS: &[DataType] = &[ DataType::Int8, From 2ec73321a5db0486f12521628eb21a74931fac01 Mon Sep 17 00:00:00 2001 From: theirix Date: Sun, 22 Mar 2026 16:43:17 +0000 Subject: [PATCH 14/23] Revert "Use selection of numeric types for example" This reverts commit 09c0b907febfb4652f453f326baf042cb16bc0a0. --- datafusion/expr-common/src/signature.rs | 26 ++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/datafusion/expr-common/src/signature.rs b/datafusion/expr-common/src/signature.rs index a933319bdb917..c1b38dd40361e 100644 --- a/datafusion/expr-common/src/signature.rs +++ b/datafusion/expr-common/src/signature.rs @@ -594,6 +594,20 @@ impl Display for ArrayFunctionArgument { } } +static NUMERICS: &[DataType] = &[ + DataType::Int8, + DataType::Int16, + DataType::Int32, + DataType::Int64, + DataType::UInt8, + DataType::UInt16, + DataType::UInt32, + DataType::UInt64, + DataType::Float16, + DataType::Float32, + DataType::Float64, +]; + impl TypeSignature { pub fn to_string_repr(&self) -> Vec { match self { @@ -912,13 +926,11 @@ impl TypeSignature { .cloned() .map(|data_type| vec![data_type]) .collect(), - TypeSignature::Numeric(arg_count) => { - [DataType::Int8, DataType::UInt64, DataType::Float32] - .iter() - .cloned() - .map(|numeric_type| vec![numeric_type; *arg_count]) - .collect() - } + TypeSignature::Numeric(arg_count) => NUMERICS + .iter() + .cloned() + .map(|numeric_type| vec![numeric_type; *arg_count]) + .collect(), TypeSignature::String(arg_count) => get_data_types(&NativeType::String) .into_iter() .map(|dt| vec![dt; *arg_count]) From 2fd92598963fd2972d9b7b7d131ef44b48532bca Mon Sep 17 00:00:00 2001 From: theirix Date: Sun, 22 Mar 2026 18:09:54 +0000 Subject: [PATCH 15/23] Better error for null parameter --- datafusion/functions-aggregate/src/utils.rs | 5 +++++ datafusion/sqllogictest/test_files/aggregate.slt | 3 +++ 2 files changed, 8 insertions(+) diff --git a/datafusion/functions-aggregate/src/utils.rs b/datafusion/functions-aggregate/src/utils.rs index 5e1925fcdbb5d..6d816e54bdaf2 100644 --- a/datafusion/functions-aggregate/src/utils.rs +++ b/datafusion/functions-aggregate/src/utils.rs @@ -54,6 +54,11 @@ pub(crate) fn validate_percentile_expr( let percentile = match scalar_value { ScalarValue::Float32(Some(value)) => value as f64, ScalarValue::Float64(Some(value)) => value, + ScalarValue::Float32(None) | ScalarValue::Float64(None) => { + return plan_err!( + "Percentile value for '{fn_name}' must be Float32 or Float64 literal (got null)" + ); + } sv => { return plan_err!( "Percentile value for '{fn_name}' must be Float32 or Float64 literal (got data type {})", diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt index 1de4a81a65616..6190617255a2d 100644 --- a/datafusion/sqllogictest/test_files/aggregate.slt +++ b/datafusion/sqllogictest/test_files/aggregate.slt @@ -1991,6 +1991,9 @@ SELECT APPROX_PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY v) FROM (VALUES (CAST( ---- NULL +query error DataFusion error: Error during planning: Percentile value for 'APPROX_PERCENTILE_CONT' must be Float32 or Float64 literal \(got null\) +SELECT APPROX_PERCENTILE_CONT(NULL) WITHIN GROUP (ORDER BY v) FROM (VALUES (CAST(NULL as INT))) as t (v); + # percentile_cont_with_weight_with_nulls query R SELECT APPROX_PERCENTILE_CONT_WITH_WEIGHT(w, 0.5) WITHIN GROUP (ORDER BY v) From ae04043cb005868d23cf4f59f92a9a39556b75bf Mon Sep 17 00:00:00 2001 From: theirix Date: Thu, 16 Apr 2026 20:14:02 +0100 Subject: [PATCH 16/23] Change median return type to float --- datafusion/sqllogictest/test_files/aggregate.slt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt index 1ac2da5402212..8c71564ba1280 100644 --- a/datafusion/sqllogictest/test_files/aggregate.slt +++ b/datafusion/sqllogictest/test_files/aggregate.slt @@ -942,16 +942,16 @@ SELECT c2, var_samp(CASE WHEN c12 > 0.90 THEN c12 ELSE null END) FROM aggregate_ # csv_query_approx_median_1 -query I +query R SELECT approx_median(c2) FROM aggregate_test_100 ---- 3 # csv_query_approx_median_2 -query I +query R SELECT approx_median(c6) FROM aggregate_test_100 ---- -1146409980542786560 +1146409980542786600 # csv_query_approx_median_3 query R @@ -1000,7 +1000,7 @@ SELECT median(col_i8), median(distinct col_i8) FROM median_table -14 100 # approx_distinct_median_i8 -query I +query R SELECT approx_median(distinct col_i8) FROM median_table ---- 100 @@ -5968,7 +5968,7 @@ select median(a) from (select 1 as a where 1=0); ---- NULL -query I +query R select approx_median(a) from (select 1 as a where 1=0); ---- NULL From 793609eba27beaeb9b4ccec9adaa1bf2161f8288 Mon Sep 17 00:00:00 2001 From: theirix Date: Thu, 16 Apr 2026 20:14:35 +0100 Subject: [PATCH 17/23] Remove integer code path for approx_percentile --- .../src/approx_percentile_cont.rs | 57 +------------------ 1 file changed, 2 insertions(+), 55 deletions(-) diff --git a/datafusion/functions-aggregate/src/approx_percentile_cont.rs b/datafusion/functions-aggregate/src/approx_percentile_cont.rs index 1271af6d2ff7b..3b850dcfda46a 100644 --- a/datafusion/functions-aggregate/src/approx_percentile_cont.rs +++ b/datafusion/functions-aggregate/src/approx_percentile_cont.rs @@ -23,10 +23,7 @@ use arrow::array::{Array, Float16Array}; use arrow::compute::{filter, is_not_null}; use arrow::datatypes::FieldRef; use arrow::{ - array::{ - ArrayRef, Float32Array, Float64Array, Int8Array, Int16Array, Int32Array, - Int64Array, UInt8Array, UInt16Array, UInt32Array, UInt64Array, - }, + array::{ArrayRef, Float32Array, Float64Array}, datatypes::{DataType, Field}, }; use datafusion_common::types::{NativeType, logical_float64}; @@ -200,17 +197,7 @@ impl ApproxPercentileCont { let data_type = args.expr_fields[0].data_type(); let accumulator: ApproxPercentileAccumulator = match data_type { - DataType::UInt8 - | DataType::UInt16 - | DataType::UInt32 - | DataType::UInt64 - | DataType::Int8 - | DataType::Int16 - | DataType::Int32 - | DataType::Int64 - | DataType::Float16 - | DataType::Float32 - | DataType::Float64 => { + DataType::Float16 | DataType::Float32 | DataType::Float64 => { if let Some(max_size) = tdigest_max_size { ApproxPercentileAccumulator::new_with_max_size( percentile, @@ -397,38 +384,6 @@ impl ApproxPercentileAccumulator { .map(|v| v.to_f64()) .collect::>()) } - DataType::Int64 => { - let array = downcast_value!(values, Int64Array); - Ok(array.values().iter().map(|v| *v as f64).collect::>()) - } - DataType::Int32 => { - let array = downcast_value!(values, Int32Array); - Ok(array.values().iter().map(|v| *v as f64).collect::>()) - } - DataType::Int16 => { - let array = downcast_value!(values, Int16Array); - Ok(array.values().iter().map(|v| *v as f64).collect::>()) - } - DataType::Int8 => { - let array = downcast_value!(values, Int8Array); - Ok(array.values().iter().map(|v| *v as f64).collect::>()) - } - DataType::UInt64 => { - let array = downcast_value!(values, UInt64Array); - Ok(array.values().iter().map(|v| *v as f64).collect::>()) - } - DataType::UInt32 => { - let array = downcast_value!(values, UInt32Array); - Ok(array.values().iter().map(|v| *v as f64).collect::>()) - } - DataType::UInt16 => { - let array = downcast_value!(values, UInt16Array); - Ok(array.values().iter().map(|v| *v as f64).collect::>()) - } - DataType::UInt8 => { - let array = downcast_value!(values, UInt8Array); - Ok(array.values().iter().map(|v| *v as f64).collect::>()) - } e => internal_err!( "APPROX_PERCENTILE_CONT is not expected to receive the type {e:?}" ), @@ -462,14 +417,6 @@ impl Accumulator for ApproxPercentileAccumulator { // These acceptable return types MUST match the validation in // ApproxPercentile::create_accumulator. Ok(match &self.return_type { - DataType::Int8 => ScalarValue::Int8(Some(q as i8)), - DataType::Int16 => ScalarValue::Int16(Some(q as i16)), - DataType::Int32 => ScalarValue::Int32(Some(q as i32)), - DataType::Int64 => ScalarValue::Int64(Some(q as i64)), - DataType::UInt8 => ScalarValue::UInt8(Some(q as u8)), - DataType::UInt16 => ScalarValue::UInt16(Some(q as u16)), - DataType::UInt32 => ScalarValue::UInt32(Some(q as u32)), - DataType::UInt64 => ScalarValue::UInt64(Some(q as u64)), DataType::Float16 => ScalarValue::Float16(Some(half::f16::from_f64(q))), DataType::Float32 => ScalarValue::Float32(Some(q as f32)), DataType::Float64 => ScalarValue::Float64(Some(q)), From a08f2ffa5d682c3b3266180db7a0083a9711d69f Mon Sep 17 00:00:00 2001 From: theirix Date: Thu, 16 Apr 2026 20:15:01 +0100 Subject: [PATCH 18/23] Change signature of median (aliased to approx_percentile_cont) --- .../functions-aggregate/src/approx_median.rs | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/datafusion/functions-aggregate/src/approx_median.rs b/datafusion/functions-aggregate/src/approx_median.rs index c016ada695716..162dc224f2ccb 100644 --- a/datafusion/functions-aggregate/src/approx_median.rs +++ b/datafusion/functions-aggregate/src/approx_median.rs @@ -74,16 +74,11 @@ impl ApproxMedian { pub fn new() -> Self { Self { signature: Signature::one_of( - vec![ - TypeSignature::Coercible(vec![Coercion::new_exact( - TypeSignatureClass::Integer, - )]), - TypeSignature::Coercible(vec![Coercion::new_implicit( - TypeSignatureClass::Float, - vec![TypeSignatureClass::Decimal], - NativeType::Float64, - )]), - ], + vec![TypeSignature::Coercible(vec![Coercion::new_implicit( + TypeSignatureClass::Float, + vec![TypeSignatureClass::Numeric], + NativeType::Float64, + )])], Volatility::Immutable, ), } From b7b3020cbcc7456ba88a76a02910e1c196370c4c Mon Sep 17 00:00:00 2001 From: theirix Date: Thu, 16 Apr 2026 20:46:38 +0100 Subject: [PATCH 19/23] Update median unit tests --- .../tests/dataframe/dataframe_functions.rs | 2 +- datafusion/core/tests/dataframe/mod.rs | 40 +++++++++---------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/datafusion/core/tests/dataframe/dataframe_functions.rs b/datafusion/core/tests/dataframe/dataframe_functions.rs index d790f9cc25a1e..2ada0411f4f8c 100644 --- a/datafusion/core/tests/dataframe/dataframe_functions.rs +++ b/datafusion/core/tests/dataframe/dataframe_functions.rs @@ -402,7 +402,7 @@ async fn test_fn_approx_median() -> Result<()> { +-----------------------+ | approx_median(test.b) | +-----------------------+ - | 10 | + | 10.0 | +-----------------------+ "); diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs index 6e22aeab61089..0beb38f19f020 100644 --- a/datafusion/core/tests/dataframe/mod.rs +++ b/datafusion/core/tests/dataframe/mod.rs @@ -1204,26 +1204,26 @@ async fn window_using_aggregates() -> Result<()> { | first_value | last_val | approx_distinct | approx_median | median | max | min | c2 | c3 | +-------------+----------+-----------------+---------------+--------+-----+------+----+------+ | | | | | | | | 1 | -85 | - | -85 | -101 | 14 | -12 | -12 | 83 | -101 | 4 | -54 | - | -85 | -101 | 17 | -25 | -25 | 83 | -101 | 5 | -31 | - | -85 | -12 | 10 | -32 | -34 | 83 | -85 | 3 | 13 | - | -85 | -25 | 3 | -56 | -56 | -25 | -85 | 1 | -5 | - | -85 | -31 | 18 | -29 | -28 | 83 | -101 | 5 | 36 | - | -85 | -38 | 16 | -25 | -25 | 83 | -101 | 4 | 65 | - | -85 | -43 | 7 | -43 | -43 | 83 | -85 | 2 | 45 | - | -85 | -48 | 6 | -35 | -36 | 83 | -85 | 2 | -43 | - | -85 | -5 | 4 | -37 | -40 | -5 | -85 | 1 | 83 | - | -85 | -54 | 15 | -17 | -18 | 83 | -101 | 4 | -38 | - | -85 | -56 | 2 | -70 | -70 | -56 | -85 | 1 | -25 | - | -85 | -72 | 9 | -43 | -43 | 83 | -85 | 3 | -12 | - | -85 | -85 | 1 | -85 | -85 | -85 | -85 | 1 | -56 | - | -85 | 13 | 11 | -17 | -18 | 83 | -85 | 3 | 14 | - | -85 | 13 | 11 | -25 | -25 | 83 | -85 | 3 | 13 | - | -85 | 14 | 12 | -12 | -12 | 83 | -85 | 3 | 17 | - | -85 | 17 | 13 | -11 | -8 | 83 | -85 | 4 | -101 | - | -85 | 45 | 8 | -34 | -34 | 83 | -85 | 3 | -72 | - | -85 | 65 | 17 | -17 | -18 | 83 | -101 | 5 | -101 | - | -85 | 83 | 5 | -25 | -25 | 83 | -85 | 2 | -48 | + | -85 | -101 | 14 | -12.0 | -12 | 83 | -101 | 4 | -54 | + | -85 | -101 | 17 | -25.0 | -25 | 83 | -101 | 5 | -31 | + | -85 | -12 | 10 | -32.75 | -34 | 83 | -85 | 3 | 13 | + | -85 | -25 | 3 | -56.0 | -56 | -25 | -85 | 1 | -5 | + | -85 | -31 | 18 | -29.75 | -28 | 83 | -101 | 5 | 36 | + | -85 | -38 | 16 | -25.0 | -25 | 83 | -101 | 4 | 65 | + | -85 | -43 | 7 | -43.0 | -43 | 83 | -85 | 2 | 45 | + | -85 | -48 | 6 | -35.75 | -36 | 83 | -85 | 2 | -43 | + | -85 | -5 | 4 | -37.75 | -40 | -5 | -85 | 1 | 83 | + | -85 | -54 | 15 | -17.0 | -18 | 83 | -101 | 4 | -38 | + | -85 | -56 | 2 | -70.5 | -70 | -56 | -85 | 1 | -25 | + | -85 | -72 | 9 | -43.0 | -43 | 83 | -85 | 3 | -12 | + | -85 | -85 | 1 | -85.0 | -85 | -85 | -85 | 1 | -56 | + | -85 | 13 | 11 | -17.0 | -18 | 83 | -85 | 3 | 14 | + | -85 | 13 | 11 | -25.0 | -25 | 83 | -85 | 3 | 13 | + | -85 | 14 | 12 | -12.0 | -12 | 83 | -85 | 3 | 17 | + | -85 | 17 | 13 | -11.25 | -8 | 83 | -85 | 4 | -101 | + | -85 | 45 | 8 | -34.5 | -34 | 83 | -85 | 3 | -72 | + | -85 | 65 | 17 | -17.0 | -18 | 83 | -101 | 5 | -101 | + | -85 | 83 | 5 | -25.0 | -25 | 83 | -85 | 2 | -48 | +-------------+----------+-----------------+---------------+--------+-----+------+----+------+ " ); From 462bfa8ed80cd6764c9e42a4e51b0fa4ee3a9d55 Mon Sep 17 00:00:00 2001 From: theirix Date: Thu, 16 Apr 2026 22:03:02 +0100 Subject: [PATCH 20/23] Update median SLTs --- .../test_files/aggregate_skip_partial.slt | 26 +++++++++---------- .../sqllogictest/test_files/metadata.slt | 2 +- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/datafusion/sqllogictest/test_files/aggregate_skip_partial.slt b/datafusion/sqllogictest/test_files/aggregate_skip_partial.slt index e12ac5782e3a4..cd95426a9b0bb 100644 --- a/datafusion/sqllogictest/test_files/aggregate_skip_partial.slt +++ b/datafusion/sqllogictest/test_files/aggregate_skip_partial.slt @@ -145,7 +145,7 @@ GROUP BY 1, 2 ORDER BY 1 LIMIT 5; -2117946883 d -2117946883 NULL NULL NULL -2098805236 c -2098805236 NULL NULL NULL -query ITIIII +query ITRRRR SELECT c5, c1, APPROX_MEDIAN(c5), APPROX_MEDIAN(CASE WHEN c1 = 'a' THEN c5 ELSE NULL END), @@ -318,14 +318,14 @@ SELECT c2, median(c5), median(c11) FROM aggregate_test_100 GROUP BY c2 ORDER BY 5 604973998 0.49842384 # Test approx_median for int / float -query IIR +query IRR SELECT c2, approx_median(c5), approx_median(c11) FROM aggregate_test_100 GROUP BY c2 ORDER BY c2; ---- -1 191655437 0.59926736 +1 191655437.25 0.59926736 2 -587831330 0.43230486 3 240273900 0.40199697 4 762932956 0.48515016 -5 593204320 0.5156586 +5 593204320.5 0.5156586 # Test approx_distinct for varchar / int query III @@ -399,14 +399,14 @@ SELECT c2, median(c3), median(c11) FROM aggregate_test_100_null GROUP BY c2 ORDE 5 -35 0.5536642 # Test approx_median with nullable fields -query IIR +query IRR SELECT c2, approx_median(c3), approx_median(c11) FROM aggregate_test_100_null GROUP BY c2 ORDER BY c2; ---- 1 12 0.6067944 2 1 0.46076488 3 14 0.40154034 -4 -7 0.48515016 -5 -39 0.5536642 +4 -7.75 0.48515016 +5 -39.75 0.5536642 # Test approx_distinct with nullable fields query II @@ -516,7 +516,7 @@ FROM aggregate_test_100 GROUP BY c2 ORDER BY c2; 5 64 -59 # Test approx_median with filter -query III +query IRR SELECT c2, approx_median(c3) FILTER (WHERE c3 > 0), @@ -526,7 +526,7 @@ FROM aggregate_test_100 GROUP BY c2 ORDER BY c2; 1 57 -56 2 52 -60 3 71 -76 -4 65 -64 +4 65 -64.75 5 64 -59 # Test count with nullable fields and filter @@ -662,20 +662,20 @@ FROM aggregate_test_100_null GROUP BY c2 ORDER BY c2; 5 -22 # Test approx_median with nullable fields and filter -query IIR +query IRR SELECT c2, approx_median(c3) FILTER (WHERE c5 > 0), approx_median(c11) FILTER (WHERE c5 < 0) FROM aggregate_test_100_null GROUP BY c2 ORDER BY c2; ---- 1 -5 0.6623719 -2 12 0.52930677 +2 12.25 0.52930677 3 13 0.32792538 4 -38 0.49774808 -5 -21 0.47652745 +5 -21.75 0.47652745 # Test approx_median with nullable fields and nullable filter -query II +query IR SELECT c2, approx_median(c3) FILTER (WHERE c11 > 0.5) FROM aggregate_test_100_null GROUP BY c2 ORDER BY c2; diff --git a/datafusion/sqllogictest/test_files/metadata.slt b/datafusion/sqllogictest/test_files/metadata.slt index 11ed4fc632e2a..e06a3dd72498c 100644 --- a/datafusion/sqllogictest/test_files/metadata.slt +++ b/datafusion/sqllogictest/test_files/metadata.slt @@ -83,7 +83,7 @@ select count(distinct name) from table_with_metadata; 2 # Regression test: prevent field metadata loss per https://github.com/apache/datafusion/issues/12687 -query I +query R select approx_median(distinct id) from table_with_metadata; ---- 2 From e24229062da6c62a3d0cbb2059e5b342f0d6c7f9 Mon Sep 17 00:00:00 2001 From: theirix Date: Thu, 16 Apr 2026 23:00:20 +0100 Subject: [PATCH 21/23] Clarify signature --- datafusion/expr/src/test/function_stub.rs | 3 --- datafusion/functions-aggregate/src/approx_percentile_cont.rs | 3 +-- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/datafusion/expr/src/test/function_stub.rs b/datafusion/expr/src/test/function_stub.rs index 506d9ce89460a..a1f29b649b2f8 100644 --- a/datafusion/expr/src/test/function_stub.rs +++ b/datafusion/expr/src/test/function_stub.rs @@ -450,9 +450,6 @@ impl Avg { TypeSignature::Coercible(vec![Coercion::new_exact( TypeSignatureClass::Decimal, )]), - TypeSignature::Coercible(vec![Coercion::new_exact( - TypeSignatureClass::Duration, - )]), TypeSignature::Coercible(vec![Coercion::new_implicit( TypeSignatureClass::Native(logical_float64()), vec![TypeSignatureClass::Integer, TypeSignatureClass::Float], diff --git a/datafusion/functions-aggregate/src/approx_percentile_cont.rs b/datafusion/functions-aggregate/src/approx_percentile_cont.rs index 3b850dcfda46a..3f1adcca12362 100644 --- a/datafusion/functions-aggregate/src/approx_percentile_cont.rs +++ b/datafusion/functions-aggregate/src/approx_percentile_cont.rs @@ -130,7 +130,6 @@ impl ApproxPercentileCont { /// Create a new [`ApproxPercentileCont`] aggregate function. pub fn new() -> Self { // Accept any numeric value paired with a float64 percentile - // Additionally accept an integer number of centroids for T-Digest let signature = Signature::one_of( vec![ // 2 args - numeric, percentile (float) @@ -146,7 +145,7 @@ impl ApproxPercentileCont { NativeType::Float64, ), ]), - // 3 args - numeric, percentile (float), centroid (integer) + // 3 args - numeric, percentile (float), number of centroid for T-Digest (integer) TypeSignature::Coercible(vec![ Coercion::new_implicit( TypeSignatureClass::Float, From e6752c2cfd5325907710d06e0243b8b6bc3a4b21 Mon Sep 17 00:00:00 2001 From: theirix Date: Fri, 17 Apr 2026 17:29:15 +0100 Subject: [PATCH 22/23] Add upgrade notes for 54 --- .../library-user-guide/upgrading/54.0.0.md | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/docs/source/library-user-guide/upgrading/54.0.0.md b/docs/source/library-user-guide/upgrading/54.0.0.md index d5fe197854c4d..4bc442f41d72a 100644 --- a/docs/source/library-user-guide/upgrading/54.0.0.md +++ b/docs/source/library-user-guide/upgrading/54.0.0.md @@ -321,3 +321,30 @@ clusters (e.g., ZWJ emoji sequences). For ASCII and most common Unicode text, behavior is unchanged. [#17861]: https://github.com/apache/datafusion/pull/17861 + + +### `approx_percentile_cont`, `approx_percentile_cont_with_weight`, `approx_median` now coerce to floats + +The type signatures of `approx_percentile_cont`, `approx_percentile_cont_with_weight`, and +`approx_median` now coerce integer input values to `Float64` before computing the approximation. +As a result, these functions always return a float, even when the input column is an integer type. + +**Who is affected:** + +- Queries or downstream code that relied on `approx_percentile_cont` / `approx_percentile_cont_with_weight` / + `approx_median` returning an integer type when given an integer column. + +**Migration guide:** + +If downstream code checks or relies on the return type being an integer, add an explicit +`CAST` back to the desired integer type, or update the type assertion: + +```sql +-- Before (returned Int64): +SELECT approx_percentile_cont(quantity, 0.5) FROM orders; + +-- After (returns Float64); cast if an integer result is required: +SELECT CAST(approx_percentile_cont(quantity, 0.5) AS BIGINT) FROM orders; +``` + +[#21074]: https://github.com/apache/datafusion/pull/21074 From 858bbd89016ebf9e1f4cf1585e5707bb749180a5 Mon Sep 17 00:00:00 2001 From: theirix Date: Fri, 17 Apr 2026 17:32:30 +0100 Subject: [PATCH 23/23] Run prettier --- docs/source/library-user-guide/upgrading/54.0.0.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/source/library-user-guide/upgrading/54.0.0.md b/docs/source/library-user-guide/upgrading/54.0.0.md index 4bc442f41d72a..c277f69d0bee2 100644 --- a/docs/source/library-user-guide/upgrading/54.0.0.md +++ b/docs/source/library-user-guide/upgrading/54.0.0.md @@ -322,7 +322,6 @@ behavior is unchanged. [#17861]: https://github.com/apache/datafusion/pull/17861 - ### `approx_percentile_cont`, `approx_percentile_cont_with_weight`, `approx_median` now coerce to floats The type signatures of `approx_percentile_cont`, `approx_percentile_cont_with_weight`, and