From 475de9fbccf9e98b97f2cd4c33dc116f403bb8b0 Mon Sep 17 00:00:00 2001 From: Shiv Bhatia Date: Thu, 19 Mar 2026 20:07:08 +0000 Subject: [PATCH 1/7] Fix filter pushdown optimisation --- datafusion/optimizer/src/push_down_filter.rs | 73 ++++++++++ datafusion/physical-plan/src/sorts/sort.rs | 135 ++++++++++++++++-- .../push_down_filter_sort_fetch.slt | 55 +++++++ datafusion/sqllogictest/test_files/window.slt | 15 +- 4 files changed, 263 insertions(+), 15 deletions(-) create mode 100644 datafusion/sqllogictest/test_files/push_down_filter_sort_fetch.slt diff --git a/datafusion/optimizer/src/push_down_filter.rs b/datafusion/optimizer/src/push_down_filter.rs index 03a7a0b864177..183ea5efe06e4 100644 --- a/datafusion/optimizer/src/push_down_filter.rs +++ b/datafusion/optimizer/src/push_down_filter.rs @@ -832,6 +832,13 @@ impl OptimizerRule for PushDownFilter { insert_below(LogicalPlan::Distinct(distinct), new_filter) } LogicalPlan::Sort(sort) => { + // If the sort has a fetch (limit), pushing a filter below + // it would change semantics: the limit should apply before + // the filter, not after. + if sort.fetch.is_some() { + filter.input = Arc::new(LogicalPlan::Sort(sort)); + return Ok(Transformed::no(LogicalPlan::Filter(filter))); + } let new_filter = Filter::try_new(filter.predicate, Arc::clone(&sort.input)) .map(LogicalPlan::Filter)?; @@ -1130,6 +1137,13 @@ impl OptimizerRule for PushDownFilter { } LogicalPlan::Join(join) => push_down_join(join, Some(&filter.predicate)), LogicalPlan::TableScan(scan) => { + // If the scan has a fetch (limit), pushing filters into it + // would change semantics: the limit should apply before the + // filter, not after. 
+ if scan.fetch.is_some() { + filter.input = Arc::new(LogicalPlan::TableScan(scan)); + return Ok(Transformed::no(LogicalPlan::Filter(filter))); + } let filter_predicates = split_conjunction(&filter.predicate); let (volatile_filters, non_volatile_filters): (Vec<&Expr>, Vec<&Expr>) = @@ -4315,4 +4329,63 @@ mod tests { " ) } + + #[test] + fn filter_not_pushed_down_through_table_scan_with_fetch() -> Result<()> { + let scan = test_table_scan()?; + let scan_with_fetch = match scan { + LogicalPlan::TableScan(scan) => LogicalPlan::TableScan(TableScan { + fetch: Some(10), + ..scan + }), + _ => unreachable!(), + }; + let plan = LogicalPlanBuilder::from(scan_with_fetch) + .filter(col("a").gt(lit(10i64)))? + .build()?; + // Filter must NOT be pushed into the table scan when it has a fetch (limit) + assert_optimized_plan_equal!( + plan, + @r" + Filter: test.a > Int64(10) + TableScan: test, fetch=10 + " + ) + } + + #[test] + fn filter_push_down_through_sort_without_fetch() -> Result<()> { + let table_scan = test_table_scan()?; + let plan = LogicalPlanBuilder::from(table_scan) + .sort(vec![col("a").sort(true, true)])? + .filter(col("a").gt(lit(10i64)))? + .build()?; + // Filter should be pushed below the sort + assert_optimized_plan_equal!( + plan, + @r" + Sort: test.a ASC NULLS FIRST + TableScan: test, full_filters=[test.a > Int64(10)] + " + ) + } + + #[test] + fn filter_not_pushed_down_through_sort_with_fetch() -> Result<()> { + let table_scan = test_table_scan()?; + let plan = LogicalPlanBuilder::from(table_scan) + .sort_with_limit(vec![col("a").sort(true, true)], Some(5))? + .filter(col("a").gt(lit(10i64)))? + .build()?; + // Filter must NOT be pushed below the sort when it has a fetch (limit), + // because the limit should apply before the filter. 
+ assert_optimized_plan_equal!( + plan, + @r" + Filter: test.a > Int64(10) + Sort: test.a ASC NULLS FIRST, fetch=5 + TableScan: test + " + ) + } } diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index da2171847cc7b..e4a2024effece 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -1405,11 +1405,22 @@ impl ExecutionPlan for SortExec { config: &datafusion_common::config::ConfigOptions, ) -> Result { if phase != FilterPushdownPhase::Post { + if self.fetch.is_some() { + return Ok(FilterDescription::all_unsupported( + &parent_filters, + &self.children(), + )); + } return FilterDescription::from_children(parent_filters, &self.children()); } - let mut child = - ChildFilterDescription::from_child(&parent_filters, self.input())?; + // In Post phase: block parent filters when fetch is set, + // but still push the TopK dynamic filter (self-filter). + let mut child = if self.fetch.is_some() { + ChildFilterDescription::all_unsupported(&parent_filters) + } else { + ChildFilterDescription::from_child(&parent_filters, self.input())? 
+ }; if let Some(filter) = &self.filter && config.optimizer.enable_topk_dynamic_filter_pushdown @@ -1430,8 +1441,10 @@ mod tests { use super::*; use crate::coalesce_partitions::CoalescePartitionsExec; use crate::collect; + use crate::empty::EmptyExec; use crate::execution_plan::Boundedness; use crate::expressions::col; + use crate::filter_pushdown::{FilterPushdownPhase, PushedDown}; use crate::test; use crate::test::TestMemoryExec; use crate::test::exec::{BlockingExec, assert_strong_count_converges_to_zero}; @@ -1441,15 +1454,19 @@ mod tests { use arrow::compute::SortOptions; use arrow::datatypes::*; use datafusion_common::cast::as_primitive_array; + use datafusion_common::config::ConfigOptions; use datafusion_common::test_util::batches_to_string; use datafusion_common::{DataFusionError, Result, ScalarValue}; use datafusion_execution::RecordBatchStream; use datafusion_execution::config::SessionConfig; + use datafusion_execution::memory_pool::{ + GreedyMemoryPool, MemoryConsumer, MemoryPool, + }; use datafusion_execution::runtime_env::RuntimeEnvBuilder; use datafusion_physical_expr::EquivalenceProperties; use datafusion_physical_expr::expressions::{Column, Literal}; - use futures::{FutureExt, Stream}; + use futures::{FutureExt, Stream, TryStreamExt}; use insta::assert_snapshot; #[derive(Debug, Clone)] @@ -2748,11 +2765,6 @@ mod tests { /// those bytes become unaccounted-for reserved memory that nobody uses. #[tokio::test] async fn test_sort_merge_reservation_transferred_not_freed() -> Result<()> { - use datafusion_execution::memory_pool::{ - GreedyMemoryPool, MemoryConsumer, MemoryPool, - }; - use futures::TryStreamExt; - let sort_spill_reservation_bytes: usize = 10 * 1024; // 10 KB // Pool: merge reservation (10KB) + enough room for sort to work. 
@@ -2863,4 +2875,111 @@ mod tests { drop(contender); Ok(()) } + + #[test] + fn test_sort_with_fetch_blocks_filter_pushdown() -> Result<()> { + let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])); + let input = Arc::new(EmptyExec::new(Arc::clone(&schema))); + let sort = SortExec::new( + [PhysicalSortExpr::new_default(Arc::new(Column::new("a", 0)))].into(), + input, + ) + .with_fetch(Some(10)); + + let parent_filter: Arc = Arc::new(Column::new("a", 0)); + let config = ConfigOptions::new(); + + let desc = sort.gather_filters_for_pushdown( + FilterPushdownPhase::Pre, + vec![parent_filter], + &config, + )?; + + // Parent filter must be unsupported — it must not be pushed below + // a sort with fetch (TopK). + let parent_filters = desc.parent_filters(); + assert_eq!(parent_filters.len(), 1); + assert_eq!(parent_filters[0].len(), 1); + assert!( + matches!(parent_filters[0][0].discriminant, PushedDown::No), + "Parent filter should be unsupported when sort has fetch" + ); + + Ok(()) + } + + #[test] + fn test_sort_without_fetch_allows_filter_pushdown() -> Result<()> { + let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])); + let input = Arc::new(EmptyExec::new(Arc::clone(&schema))); + let sort = SortExec::new( + [PhysicalSortExpr::new_default(Arc::new(Column::new("a", 0)))].into(), + input, + ); + + let parent_filter: Arc = Arc::new(Column::new("a", 0)); + let config = ConfigOptions::new(); + + let desc = sort.gather_filters_for_pushdown( + FilterPushdownPhase::Pre, + vec![parent_filter], + &config, + )?; + + // Parent filter should be supported — plain sort (no fetch) is + // filter-commutative. 
+ let parent_filters = desc.parent_filters(); + assert_eq!(parent_filters.len(), 1); + assert_eq!(parent_filters[0].len(), 1); + assert!( + matches!(parent_filters[0][0].discriminant, PushedDown::Yes), + "Parent filter should be supported when sort has no fetch" + ); + + Ok(()) + } + + #[test] + fn test_sort_with_fetch_allows_topk_self_filter_in_post_phase() -> Result<()> { + let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])); + let input = Arc::new(EmptyExec::new(Arc::clone(&schema))); + let sort = SortExec::new( + [PhysicalSortExpr::new_default(Arc::new(Column::new("a", 0)))].into(), + input, + ) + .with_fetch(Some(10)); + + // with_fetch(Some(_)) creates the TopK dynamic filter automatically. + assert!(sort.filter.is_some(), "TopK filter should be created"); + + let parent_filter: Arc = Arc::new(Column::new("a", 0)); + let mut config = ConfigOptions::new(); + config.optimizer.enable_topk_dynamic_filter_pushdown = true; + + let desc = sort.gather_filters_for_pushdown( + FilterPushdownPhase::Post, + vec![parent_filter], + &config, + )?; + + // Parent filters should be blocked in Post phase when fetch is set. + let parent_filters = desc.parent_filters(); + assert_eq!(parent_filters.len(), 1); + assert_eq!(parent_filters[0].len(), 1); + assert!( + matches!(parent_filters[0][0].discriminant, PushedDown::No), + "Parent filter should be unsupported in Post phase when sort has fetch" + ); + + // The TopK self-filter should still be allowed through. 
+ let self_filters = desc.self_filters(); + assert_eq!(self_filters.len(), 1); + assert_eq!( + self_filters[0].len(), + 1, + "TopK dynamic self-filter should be pushed down" + ); + + Ok(()) + } } diff --git a/datafusion/sqllogictest/test_files/push_down_filter_sort_fetch.slt b/datafusion/sqllogictest/test_files/push_down_filter_sort_fetch.slt new file mode 100644 index 0000000000000..ab23fff030489 --- /dev/null +++ b/datafusion/sqllogictest/test_files/push_down_filter_sort_fetch.slt @@ -0,0 +1,55 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Tests for filter pushdown behavior with Sort + LIMIT (fetch). + +statement ok +CREATE TABLE t(id INT, value INT) AS VALUES +(1, 100), +(2, 200), +(3, 300), +(4, 400), +(5, 500); + +# Take the 3 smallest values (100, 200, 300), then filter value > 200. +query II +SELECT * FROM (SELECT * FROM t ORDER BY value LIMIT 3) sub WHERE sub.value > 200; +---- +3 300 + +# Take the 3 largest values (500, 400, 300), then filter value < 400. +query II +SELECT * FROM (SELECT * FROM t ORDER BY value DESC LIMIT 3) sub WHERE sub.value < 400; +---- +3 300 + +# The filter stays above the sort+fetch in the plan. 
+query TT +EXPLAIN SELECT * FROM (SELECT * FROM t ORDER BY value LIMIT 3) sub WHERE sub.value > 200; +---- +logical_plan +01)SubqueryAlias: sub +02)--Filter: t.value > Int32(200) +03)----Sort: t.value ASC NULLS LAST, fetch=3 +04)------TableScan: t projection=[id, value] +physical_plan +01)FilterExec: value@1 > 200 +02)--SortExec: TopK(fetch=3), expr=[value@1 ASC NULLS LAST], preserve_partitioning=[false] +03)----DataSourceExec: partitions=1, partition_sizes=[1] + +statement ok +DROP TABLE t; diff --git a/datafusion/sqllogictest/test_files/window.slt b/datafusion/sqllogictest/test_files/window.slt index 61faf4dc9650f..05e364a14bd66 100644 --- a/datafusion/sqllogictest/test_files/window.slt +++ b/datafusion/sqllogictest/test_files/window.slt @@ -3198,16 +3198,17 @@ EXPLAIN SELECT * FROM (SELECT *, ROW_NUMBER() OVER(ORDER BY a ASC) as rn1 ---- logical_plan 01)Sort: rn1 ASC NULLS LAST -02)--Sort: rn1 ASC NULLS LAST, fetch=5 -03)----Projection: annotated_data_infinite2.a0, annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.c, annotated_data_infinite2.d, row_number() ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS rn1 -04)------Filter: row_number() ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW < UInt64(50) +02)--Filter: rn1 < UInt64(50) +03)----Sort: rn1 ASC NULLS LAST, fetch=5 +04)------Projection: annotated_data_infinite2.a0, annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.c, annotated_data_infinite2.d, row_number() ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS rn1 05)--------WindowAggr: windowExpr=[[row_number() ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]] 06)----------TableScan: annotated_data_infinite2 projection=[a0, a, b, c, d] physical_plan -01)ProjectionExec: expr=[a0@0 
as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, row_number() ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@5 as rn1] -02)--FilterExec: row_number() ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@5 < 50, fetch=5 -03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "row_number() ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] -04)------StreamingTableExec: partition_sizes=1, projection=[a0, a, b, c, d], infinite_source=true, output_ordering=[a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST] +01)FilterExec: rn1@5 < 50 +02)--ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, row_number() ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@5 as rn1] +03)----GlobalLimitExec: skip=0, fetch=5 +04)------BoundedWindowAggExec: wdw=[row_number() ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "row_number() ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +05)--------StreamingTableExec: partition_sizes=1, projection=[a0, a, b, c, d], infinite_source=true, output_ordering=[a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST] # Top level sort is pushed down through BoundedWindowAggExec as its SUM result does already satisfy the required # global order. 
The existing sort is for the second-term lexicographical ordering requirement, which is being From 0e83891753d055f62dea152c9cacad2a9f4a642c Mon Sep 17 00:00:00 2001 From: Shiv Bhatia Date: Fri, 20 Mar 2026 18:59:08 +0000 Subject: [PATCH 2/7] Add fetch method to LogicalPlan --- datafusion/expr/src/logical_plan/plan.rs | 34 ++++++++++++++++++++ datafusion/optimizer/src/push_down_filter.rs | 20 ++++-------- 2 files changed, 40 insertions(+), 14 deletions(-) diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index b2a56971837f0..55fdd130db49c 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -1392,6 +1392,40 @@ impl LogicalPlan { } } + /// Returns the fetch (limit) of this plan node, if it has one. + /// + /// Only [`LogicalPlan::Sort`] and [`LogicalPlan::TableScan`] carry a fetch + /// value directly; all other variants return `None`. + pub fn fetch(&self) -> Option { + match self { + LogicalPlan::Sort(Sort { fetch, .. }) => *fetch, + LogicalPlan::TableScan(TableScan { fetch, .. 
}) => *fetch, + LogicalPlan::Projection(_) => None, + LogicalPlan::Filter(_) => None, + LogicalPlan::Window(_) => None, + LogicalPlan::Aggregate(_) => None, + LogicalPlan::Join(_) => None, + LogicalPlan::Repartition(_) => None, + LogicalPlan::Union(_) => None, + LogicalPlan::EmptyRelation(_) => None, + LogicalPlan::Subquery(_) => None, + LogicalPlan::SubqueryAlias(_) => None, + LogicalPlan::Limit(_) => None, + LogicalPlan::Statement(_) => None, + LogicalPlan::Values(_) => None, + LogicalPlan::Explain(_) => None, + LogicalPlan::Analyze(_) => None, + LogicalPlan::Extension(_) => None, + LogicalPlan::Distinct(_) => None, + LogicalPlan::Dml(_) => None, + LogicalPlan::Ddl(_) => None, + LogicalPlan::Copy(_) => None, + LogicalPlan::DescribeTable(_) => None, + LogicalPlan::Unnest(_) => None, + LogicalPlan::RecursiveQuery(_) => None, + } + } + /// If this node's expressions contains any references to an outer subquery pub fn contains_outer_reference(&self) -> bool { let mut contains = false; diff --git a/datafusion/optimizer/src/push_down_filter.rs b/datafusion/optimizer/src/push_down_filter.rs index 183ea5efe06e4..c2c16e9fe7803 100644 --- a/datafusion/optimizer/src/push_down_filter.rs +++ b/datafusion/optimizer/src/push_down_filter.rs @@ -796,6 +796,12 @@ impl OptimizerRule for PushDownFilter { filter.predicate = new_predicate; } + // If the child has a fetch (limit), pushing a filter below it would + // change semantics: the limit should apply before the filter, not after. 
+ if filter.input.fetch().is_some() { + return Ok(Transformed::no(LogicalPlan::Filter(filter))); + } + match Arc::unwrap_or_clone(filter.input) { LogicalPlan::Filter(child_filter) => { let parents_predicates = split_conjunction_owned(filter.predicate); @@ -832,13 +838,6 @@ impl OptimizerRule for PushDownFilter { insert_below(LogicalPlan::Distinct(distinct), new_filter) } LogicalPlan::Sort(sort) => { - // If the sort has a fetch (limit), pushing a filter below - // it would change semantics: the limit should apply before - // the filter, not after. - if sort.fetch.is_some() { - filter.input = Arc::new(LogicalPlan::Sort(sort)); - return Ok(Transformed::no(LogicalPlan::Filter(filter))); - } let new_filter = Filter::try_new(filter.predicate, Arc::clone(&sort.input)) .map(LogicalPlan::Filter)?; @@ -1137,13 +1136,6 @@ impl OptimizerRule for PushDownFilter { } LogicalPlan::Join(join) => push_down_join(join, Some(&filter.predicate)), LogicalPlan::TableScan(scan) => { - // If the scan has a fetch (limit), pushing filters into it - // would change semantics: the limit should apply before the - // filter, not after. 
- if scan.fetch.is_some() { - filter.input = Arc::new(LogicalPlan::TableScan(scan)); - return Ok(Transformed::no(LogicalPlan::Filter(filter))); - } let filter_predicates = split_conjunction(&filter.predicate); let (volatile_filters, non_volatile_filters): (Vec<&Expr>, Vec<&Expr>) = From db84bf8cd7b52f09d49c87e550971aebd38890e9 Mon Sep 17 00:00:00 2001 From: Shiv Bhatia Date: Sat, 21 Mar 2026 11:20:07 +0000 Subject: [PATCH 3/7] Add skip method to LogicalPlan, handle Limit correctly --- .github/workflows/codeql 2.yml | 55 +++ AGENTS 2.md | 34 ++ CLAUDE 2.md | 1 + benchmarks/src/util/latency_object_store 2.rs | 157 ++++++++ ...nsumers_with_mem_pool_type@no_track 2.snap | 23 ++ ...y_consumers_with_mem_pool_type@top2 2.snap | 26 ++ ..._with_unbounded_memory_pool@default 2.snap | 36 ++ datafusion/common/benches/stats_merge 2.rs | 85 ++++ datafusion/common/src/utils/aggregate 2.rs | 149 +++++++ datafusion/core/benches/topk_repartition 2.rs | 90 +++++ .../parquet_struct_filter_pushdown 2.rs | 353 +++++++++++++++++ .../src/test_data/ndv_test 2.parquet | Bin 0 -> 1141 bytes datafusion/expr/src/logical_plan/plan.rs | 98 +++-- datafusion/ffi/tests/ffi_execution_plan 2.rs | 108 +++++ .../benches/approx_distinct 2.rs | 128 ++++++ .../benches/array_concat 2.rs | 94 +++++ .../benches/array_to_string 2.rs | 188 +++++++++ datafusion/optimizer/src/push_down_filter.rs | 7 +- .../linear_aggregates 2.rs | 229 +++++++++++ .../benches/compare_nested 2.rs | 74 ++++ .../src/hash_join_buffering 2.rs | 103 +++++ .../src/topk_repartition 2.rs | 368 ++++++++++++++++++ .../src/function/array/array_contains 2.rs | 168 ++++++++ datafusion/sqllogictest/src/test_file 2.rs | 186 +++++++++ .../test_files/aggregates_simplify 2.slt | 358 +++++++++++++++++ .../push_down_filter_sort_fetch 2.slt | 55 +++ .../spark/array/array_contains 2.slt | 140 +++++++ .../test_files/window_topk_pushdown 2.slt | 141 +++++++ .../logical_plan/consumer/expr/nested 2.rs | 151 +++++++ .../nested_list_expressions.substrait 
2.json | 77 ++++ dev/changelog/52.2.0 2.md | 47 +++ dev/changelog/52.3.0 2.md | 50 +++ .../library-user-guide/upgrading/54.0.0 2.md | 124 ++++++ 33 files changed, 3872 insertions(+), 31 deletions(-) create mode 100644 .github/workflows/codeql 2.yml create mode 100644 AGENTS 2.md create mode 120000 CLAUDE 2.md create mode 100644 benchmarks/src/util/latency_object_store 2.rs create mode 100644 datafusion-cli/tests/snapshots/cli_top_memory_consumers_with_mem_pool_type@no_track 2.snap create mode 100644 datafusion-cli/tests/snapshots/cli_top_memory_consumers_with_mem_pool_type@top2 2.snap create mode 100644 datafusion-cli/tests/snapshots/cli_with_unbounded_memory_pool@default 2.snap create mode 100644 datafusion/common/benches/stats_merge 2.rs create mode 100644 datafusion/common/src/utils/aggregate 2.rs create mode 100644 datafusion/core/benches/topk_repartition 2.rs create mode 100644 datafusion/datasource-parquet/benches/parquet_struct_filter_pushdown 2.rs create mode 100644 datafusion/datasource-parquet/src/test_data/ndv_test 2.parquet create mode 100644 datafusion/ffi/tests/ffi_execution_plan 2.rs create mode 100644 datafusion/functions-aggregate/benches/approx_distinct 2.rs create mode 100644 datafusion/functions-nested/benches/array_concat 2.rs create mode 100644 datafusion/functions-nested/benches/array_to_string 2.rs create mode 100644 datafusion/optimizer/src/simplify_expressions/linear_aggregates 2.rs create mode 100644 datafusion/physical-expr-common/benches/compare_nested 2.rs create mode 100644 datafusion/physical-optimizer/src/hash_join_buffering 2.rs create mode 100644 datafusion/physical-optimizer/src/topk_repartition 2.rs create mode 100644 datafusion/spark/src/function/array/array_contains 2.rs create mode 100644 datafusion/sqllogictest/src/test_file 2.rs create mode 100644 datafusion/sqllogictest/test_files/aggregates_simplify 2.slt create mode 100644 datafusion/sqllogictest/test_files/push_down_filter_sort_fetch 2.slt create mode 100644 
datafusion/sqllogictest/test_files/spark/array/array_contains 2.slt create mode 100644 datafusion/sqllogictest/test_files/window_topk_pushdown 2.slt create mode 100644 datafusion/substrait/src/logical_plan/consumer/expr/nested 2.rs create mode 100644 datafusion/substrait/tests/testdata/test_plans/nested_list_expressions.substrait 2.json create mode 100644 dev/changelog/52.2.0 2.md create mode 100644 dev/changelog/52.3.0 2.md create mode 100644 docs/source/library-user-guide/upgrading/54.0.0 2.md diff --git a/.github/workflows/codeql 2.yml b/.github/workflows/codeql 2.yml new file mode 100644 index 0000000000000..d42c2b4aa8d39 --- /dev/null +++ b/.github/workflows/codeql 2.yml @@ -0,0 +1,55 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +name: "CodeQL" + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + schedule: + - cron: '16 4 * * 1' + +permissions: + contents: read + +jobs: + analyze: + name: Analyze Actions + runs-on: ubuntu-latest + permissions: + contents: read + security-events: write + packages: read + + steps: + - name: Checkout repository + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + with: + persist-credentials: false + + - name: Initialize CodeQL + uses: github/codeql-action/init@b1bff81932f5cdfc8695c7752dcee935dcd061c8 # v4 + with: + languages: actions + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@b1bff81932f5cdfc8695c7752dcee935dcd061c8 # v4 + with: + category: "/language:actions" diff --git a/AGENTS 2.md b/AGENTS 2.md new file mode 100644 index 0000000000000..eeedbd8bc45ec --- /dev/null +++ b/AGENTS 2.md @@ -0,0 +1,34 @@ +# Agent Guidelines for Apache DataFusion + +## Developer Documentation + +- [Contributor Guide](docs/source/contributor-guide/index.md) +- [Architecture Guide](docs/source/contributor-guide/architecture.md) + +## Before Committing + +Before committing any changes, you **must** run the following checks and fix any issues: + +```bash +cargo fmt --all +cargo clippy --all-targets --all-features -- -D warnings +``` + +- `cargo fmt` ensures consistent code formatting across the project. +- `cargo clippy` catches common mistakes and enforces idiomatic Rust patterns. All warnings must be resolved (treated as errors via `-D warnings`). + +Do not commit code that fails either of these checks. 
+ +## Testing + +Run relevant tests before submitting changes: + +```bash +cargo test --all-features +``` + +For SQL logic tests: + +```bash +cargo test -p datafusion-sqllogictest +``` diff --git a/CLAUDE 2.md b/CLAUDE 2.md new file mode 120000 index 0000000000000..47dc3e3d863cf --- /dev/null +++ b/CLAUDE 2.md @@ -0,0 +1 @@ +AGENTS.md \ No newline at end of file diff --git a/benchmarks/src/util/latency_object_store 2.rs b/benchmarks/src/util/latency_object_store 2.rs new file mode 100644 index 0000000000000..9ef8d1b78b751 --- /dev/null +++ b/benchmarks/src/util/latency_object_store 2.rs @@ -0,0 +1,157 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! An ObjectStore wrapper that adds simulated S3-like latency to get and list operations. +//! +//! Cycles through a fixed latency distribution inspired by real S3 performance: +//! - P50: ~30ms +//! - P75-P90: ~100-120ms +//! 
- P99: ~150-200ms + +use std::fmt; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::time::Duration; + +use async_trait::async_trait; +use futures::StreamExt; +use futures::stream::BoxStream; +use object_store::path::Path; +use object_store::{ + CopyOptions, GetOptions, GetResult, ListResult, MultipartUpload, ObjectMeta, + ObjectStore, PutMultipartOptions, PutOptions, PutPayload, PutResult, Result, +}; + +/// GET latency distribution, inspired by S3 latencies. +/// Deterministic but shuffled to avoid artificial patterns. +/// 20 values: 11x P50 (~25-35ms), 5x P75-P90 (~70-110ms), 2x P95 (~120-150ms), 2x P99 (~180-200ms) +/// Sorted: 25,25,28,28,30,30,30,30,32,32,35, 70,85,100,100,110, 130,150, 180,200 +/// P50≈32ms, P90≈110ms, P99≈200ms +const GET_LATENCIES_MS: &[u64] = &[ + 30, 100, 25, 85, 32, 200, 28, 130, 35, 70, 30, 150, 30, 110, 28, 180, 32, 25, 100, 30, +]; + +/// LIST latency distribution, generally higher than GET. +/// 20 values: 11x P50 (~40-70ms), 5x P75-P90 (~120-180ms), 2x P95 (~200-250ms), 2x P99 (~300-400ms) +/// Sorted: 40,40,50,50,55,55,60,60,65,65,70, 120,140,160,160,180, 210,250, 300,400 +/// P50≈65ms, P90≈180ms, P99≈400ms +const LIST_LATENCIES_MS: &[u64] = &[ + 55, 160, 40, 140, 65, 400, 50, 210, 70, 120, 60, 250, 55, 180, 50, 300, 65, 40, 160, + 60, +]; + +/// An ObjectStore wrapper that injects simulated latency on get and list calls. 
+#[derive(Debug)] +pub struct LatencyObjectStore { + inner: T, + get_counter: AtomicUsize, + list_counter: AtomicUsize, +} + +impl LatencyObjectStore { + pub fn new(inner: T) -> Self { + Self { + inner, + get_counter: AtomicUsize::new(0), + list_counter: AtomicUsize::new(0), + } + } + + fn next_get_latency(&self) -> Duration { + let idx = + self.get_counter.fetch_add(1, Ordering::Relaxed) % GET_LATENCIES_MS.len(); + Duration::from_millis(GET_LATENCIES_MS[idx]) + } + + fn next_list_latency(&self) -> Duration { + let idx = + self.list_counter.fetch_add(1, Ordering::Relaxed) % LIST_LATENCIES_MS.len(); + Duration::from_millis(LIST_LATENCIES_MS[idx]) + } +} + +impl fmt::Display for LatencyObjectStore { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "LatencyObjectStore({})", self.inner) + } +} + +#[async_trait] +impl ObjectStore for LatencyObjectStore { + async fn put_opts( + &self, + location: &Path, + payload: PutPayload, + opts: PutOptions, + ) -> Result { + self.inner.put_opts(location, payload, opts).await + } + + async fn put_multipart_opts( + &self, + location: &Path, + opts: PutMultipartOptions, + ) -> Result> { + self.inner.put_multipart_opts(location, opts).await + } + + async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { + tokio::time::sleep(self.next_get_latency()).await; + self.inner.get_opts(location, options).await + } + + async fn get_ranges( + &self, + location: &Path, + ranges: &[std::ops::Range], + ) -> Result> { + tokio::time::sleep(self.next_get_latency()).await; + self.inner.get_ranges(location, ranges).await + } + + fn delete_stream( + &self, + locations: BoxStream<'static, Result>, + ) -> BoxStream<'static, Result> { + self.inner.delete_stream(locations) + } + + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result> { + let latency = self.next_list_latency(); + let stream = self.inner.list(prefix); + futures::stream::once(async move { + tokio::time::sleep(latency).await; + 
futures::stream::empty() + }) + .flatten() + .chain(stream) + .boxed() + } + + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { + tokio::time::sleep(self.next_list_latency()).await; + self.inner.list_with_delimiter(prefix).await + } + + async fn copy_opts( + &self, + from: &Path, + to: &Path, + options: CopyOptions, + ) -> Result<()> { + self.inner.copy_opts(from, to, options).await + } +} diff --git a/datafusion-cli/tests/snapshots/cli_top_memory_consumers_with_mem_pool_type@no_track 2.snap b/datafusion-cli/tests/snapshots/cli_top_memory_consumers_with_mem_pool_type@no_track 2.snap new file mode 100644 index 0000000000000..25267ea1617e5 --- /dev/null +++ b/datafusion-cli/tests/snapshots/cli_top_memory_consumers_with_mem_pool_type@no_track 2.snap @@ -0,0 +1,23 @@ +--- +source: datafusion-cli/tests/cli_integration.rs +info: + program: datafusion-cli + args: + - "--memory-limit" + - 10M + - "--mem-pool-type" + - fair + - "--command" + - "select * from generate_series(1,500000) as t1(v1) order by v1;" + - "--top-memory-consumers" + - "0" +--- +success: false +exit_code: 1 +----- stdout ----- +[CLI_VERSION] +Error: Not enough memory to continue external sort. Consider increasing the memory limit config: 'datafusion.runtime.memory_limit', or decreasing the config: 'datafusion.execution.sort_spill_reservation_bytes'. 
+caused by +Resources exhausted: Failed to allocate + +----- stderr ----- diff --git a/datafusion-cli/tests/snapshots/cli_top_memory_consumers_with_mem_pool_type@top2 2.snap b/datafusion-cli/tests/snapshots/cli_top_memory_consumers_with_mem_pool_type@top2 2.snap new file mode 100644 index 0000000000000..6515050047107 --- /dev/null +++ b/datafusion-cli/tests/snapshots/cli_top_memory_consumers_with_mem_pool_type@top2 2.snap @@ -0,0 +1,26 @@ +--- +source: datafusion-cli/tests/cli_integration.rs +info: + program: datafusion-cli + args: + - "--memory-limit" + - 10M + - "--mem-pool-type" + - fair + - "--command" + - "select * from generate_series(1,500000) as t1(v1) order by v1;" + - "--top-memory-consumers" + - "2" +--- +success: false +exit_code: 1 +----- stdout ----- +[CLI_VERSION] +Error: Not enough memory to continue external sort. Consider increasing the memory limit config: 'datafusion.runtime.memory_limit', or decreasing the config: 'datafusion.execution.sort_spill_reservation_bytes'. +caused by +Resources exhausted: Additional allocation failed for ExternalSorter[0] with top memory consumers (across reservations) as: + Consumer(can spill: bool) consumed XB, peak XB, + Consumer(can spill: bool) consumed XB, peak XB. +Error: Failed to allocate + +----- stderr ----- diff --git a/datafusion-cli/tests/snapshots/cli_with_unbounded_memory_pool@default 2.snap b/datafusion-cli/tests/snapshots/cli_with_unbounded_memory_pool@default 2.snap new file mode 100644 index 0000000000000..7bdcd63dc7be6 --- /dev/null +++ b/datafusion-cli/tests/snapshots/cli_with_unbounded_memory_pool@default 2.snap @@ -0,0 +1,36 @@ +--- +source: datafusion-cli/tests/cli_integration.rs +info: + program: datafusion-cli + args: + - "--maxrows" + - "10" + - "--command" + - "select * from generate_series(1,500000) as t1(v1) order by v1;" +--- +success: true +exit_code: 0 +----- stdout ----- +[CLI_VERSION] ++----+ +| v1 | ++----+ +| 1 | +| 2 | +| 3 | +| 4 | +| 5 | +| 6 | +| 7 | +| 8 | +| 9 | +| 10 | +| . 
| +| . | +| . | ++----+ +500000 row(s) fetched. (First 10 displayed. Use --maxrows to adjust) +[ELAPSED] + + +----- stderr ----- diff --git a/datafusion/common/benches/stats_merge 2.rs b/datafusion/common/benches/stats_merge 2.rs new file mode 100644 index 0000000000000..73229b6379360 --- /dev/null +++ b/datafusion/common/benches/stats_merge 2.rs @@ -0,0 +1,85 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Benchmark for `Statistics::try_merge_iter`. 
+ +use std::sync::Arc; + +use arrow::datatypes::{DataType, Field, Schema}; +use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main}; +use datafusion_common::stats::Precision; +use datafusion_common::{ColumnStatistics, ScalarValue, Statistics}; + +/// Build a vector of `n` with `num_cols` columns +fn make_stats(n: usize, num_cols: usize) -> Vec { + (0..n) + .map(|i| { + let mut stats = Statistics::default() + .with_num_rows(Precision::Exact(100 + i)) + .with_total_byte_size(Precision::Exact(8000 + i * 80)); + for c in 0..num_cols { + let base = (i * num_cols + c) as i64; + stats = stats.add_column_statistics( + ColumnStatistics::new_unknown() + .with_null_count(Precision::Exact(i)) + .with_min_value(Precision::Exact(ScalarValue::Int64(Some(base)))) + .with_max_value(Precision::Exact(ScalarValue::Int64(Some( + base + 1000, + )))) + .with_sum_value(Precision::Exact(ScalarValue::Int64(Some( + base * 100, + )))), + ); + } + stats + }) + .collect() +} + +fn bench_stats_merge(c: &mut Criterion) { + let mut group = c.benchmark_group("stats_merge"); + + for &num_partitions in &[10, 100, 500] { + for &num_cols in &[1, 5, 20] { + let items = make_stats(num_partitions, num_cols); + let schema = Arc::new(Schema::new( + (0..num_cols) + .map(|i| Field::new(format!("col{i}"), DataType::Int64, true)) + .collect::>(), + )); + + let param = format!("{num_partitions}parts_{num_cols}cols"); + + group.bench_with_input( + BenchmarkId::new("try_merge_iter", ¶m), + &(&items, &schema), + |b, (items, schema)| { + b.iter(|| { + std::hint::black_box( + Statistics::try_merge_iter(*items, schema).unwrap(), + ); + }); + }, + ); + } + } + + group.finish(); +} + +criterion_group!(benches, bench_stats_merge); +criterion_main!(benches); diff --git a/datafusion/common/src/utils/aggregate 2.rs b/datafusion/common/src/utils/aggregate 2.rs new file mode 100644 index 0000000000000..43bc0676b2d3c --- /dev/null +++ b/datafusion/common/src/utils/aggregate 2.rs @@ -0,0 +1,149 @@ +// Licensed 
to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Scalar-level aggregation utilities for statistics merging. +//! +//! Provides a cheap pairwise [`ScalarValue`] addition that directly +//! extracts inner primitive values, avoiding the expensive +//! `ScalarValue::add` path (which round-trips through Arrow arrays). +use arrow::datatypes::i256; + +use crate::stats::Precision; +use crate::{Result, ScalarValue}; + +/// Saturating addition for [`i256`] (which lacks a built-in +/// `saturating_add`). Returns `i256::MAX` on positive overflow and +/// `i256::MIN` on negative overflow. +#[inline] +fn i256_saturating_add(a: i256, b: i256) -> i256 { + match a.checked_add(b) { + Some(sum) => sum, + None => { + // If b is non-negative the overflow is positive, otherwise + // negative. + if b >= i256::ZERO { + i256::MAX + } else { + i256::MIN + } + } + } +} + +/// Add two [`ScalarValue`]s by directly extracting and adding their +/// inner primitive values. +/// +/// This avoids `ScalarValue::add` which converts both operands to +/// single-element Arrow arrays, runs the `add_wrapping` kernel, and +/// converts the result back — 3 heap allocations per call. +/// +/// For non-primitive types, falls back to `ScalarValue::add`. 
+pub(crate) fn scalar_add(lhs: &ScalarValue, rhs: &ScalarValue) -> Result { + macro_rules! add_int { + ($lhs:expr, $rhs:expr, $VARIANT:ident) => { + match ($lhs, $rhs) { + (ScalarValue::$VARIANT(Some(a)), ScalarValue::$VARIANT(Some(b))) => { + Ok(ScalarValue::$VARIANT(Some(a.saturating_add(*b)))) + } + (ScalarValue::$VARIANT(None), other) + | (other, ScalarValue::$VARIANT(None)) => Ok(other.clone()), + _ => unreachable!(), + } + }; + } + + macro_rules! add_decimal { + ($lhs:expr, $rhs:expr, $VARIANT:ident) => { + match ($lhs, $rhs) { + ( + ScalarValue::$VARIANT(Some(a), p, s), + ScalarValue::$VARIANT(Some(b), _, _), + ) => Ok(ScalarValue::$VARIANT(Some(a.saturating_add(*b)), *p, *s)), + (ScalarValue::$VARIANT(None, _, _), other) + | (other, ScalarValue::$VARIANT(None, _, _)) => Ok(other.clone()), + _ => unreachable!(), + } + }; + } + + macro_rules! add_float { + ($lhs:expr, $rhs:expr, $VARIANT:ident) => { + match ($lhs, $rhs) { + (ScalarValue::$VARIANT(Some(a)), ScalarValue::$VARIANT(Some(b))) => { + Ok(ScalarValue::$VARIANT(Some(*a + *b))) + } + (ScalarValue::$VARIANT(None), other) + | (other, ScalarValue::$VARIANT(None)) => Ok(other.clone()), + _ => unreachable!(), + } + }; + } + + match lhs { + ScalarValue::Int8(_) => add_int!(lhs, rhs, Int8), + ScalarValue::Int16(_) => add_int!(lhs, rhs, Int16), + ScalarValue::Int32(_) => add_int!(lhs, rhs, Int32), + ScalarValue::Int64(_) => add_int!(lhs, rhs, Int64), + ScalarValue::UInt8(_) => add_int!(lhs, rhs, UInt8), + ScalarValue::UInt16(_) => add_int!(lhs, rhs, UInt16), + ScalarValue::UInt32(_) => add_int!(lhs, rhs, UInt32), + ScalarValue::UInt64(_) => add_int!(lhs, rhs, UInt64), + ScalarValue::Float16(_) => add_float!(lhs, rhs, Float16), + ScalarValue::Float32(_) => add_float!(lhs, rhs, Float32), + ScalarValue::Float64(_) => add_float!(lhs, rhs, Float64), + ScalarValue::Decimal32(_, _, _) => add_decimal!(lhs, rhs, Decimal32), + ScalarValue::Decimal64(_, _, _) => add_decimal!(lhs, rhs, Decimal64), + 
ScalarValue::Decimal128(_, _, _) => add_decimal!(lhs, rhs, Decimal128), + ScalarValue::Decimal256(_, _, _) => match (lhs, rhs) { + ( + ScalarValue::Decimal256(Some(a), p, s), + ScalarValue::Decimal256(Some(b), _, _), + ) => Ok(ScalarValue::Decimal256( + Some(i256_saturating_add(*a, *b)), + *p, + *s, + )), + (ScalarValue::Decimal256(None, _, _), other) + | (other, ScalarValue::Decimal256(None, _, _)) => Ok(other.clone()), + _ => unreachable!(), + }, + // Fallback: use the existing ScalarValue::add + _ => lhs.add(rhs), + } +} + +/// [`Precision`]-aware sum of two [`ScalarValue`] precisions using +/// cheap direct addition via [`scalar_add`]. +/// +/// Mirrors the semantics of `Precision::add` but avoids +/// the expensive `ScalarValue::add` round-trip through Arrow arrays. +pub(crate) fn precision_add( + lhs: &Precision, + rhs: &Precision, +) -> Precision { + match (lhs, rhs) { + (Precision::Exact(a), Precision::Exact(b)) => scalar_add(a, b) + .map(Precision::Exact) + .unwrap_or(Precision::Absent), + (Precision::Inexact(a), Precision::Exact(b)) + | (Precision::Exact(a), Precision::Inexact(b)) + | (Precision::Inexact(a), Precision::Inexact(b)) => scalar_add(a, b) + .map(Precision::Inexact) + .unwrap_or(Precision::Absent), + (_, _) => Precision::Absent, + } +} diff --git a/datafusion/core/benches/topk_repartition 2.rs b/datafusion/core/benches/topk_repartition 2.rs new file mode 100644 index 0000000000000..e1f14e4aaa633 --- /dev/null +++ b/datafusion/core/benches/topk_repartition 2.rs @@ -0,0 +1,90 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Benchmark for the TopKRepartition optimizer rule. +//! +//! Measures the benefit of pushing TopK (Sort with fetch) below hash +//! repartition when running partitioned window functions with LIMIT. + +mod data_utils; + +use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main}; +use data_utils::create_table_provider; +use datafusion::prelude::{SessionConfig, SessionContext}; +use parking_lot::Mutex; +use std::hint::black_box; +use std::sync::Arc; +use tokio::runtime::Runtime; + +#[expect(clippy::needless_pass_by_value)] +fn query(ctx: Arc>, rt: &Runtime, sql: &str) { + let df = rt.block_on(ctx.lock().sql(sql)).unwrap(); + black_box(rt.block_on(df.collect()).unwrap()); +} + +fn create_context( + partitions_len: usize, + target_partitions: usize, + enable_topk_repartition: bool, +) -> Arc> { + let array_len = 1024 * 1024; + let batch_size = 8 * 1024; + let mut config = SessionConfig::new().with_target_partitions(target_partitions); + config.options_mut().optimizer.enable_topk_repartition = enable_topk_repartition; + let ctx = SessionContext::new_with_config(config); + let rt = Runtime::new().unwrap(); + rt.block_on(async { + let provider = + create_table_provider(partitions_len, array_len, batch_size).unwrap(); + ctx.register_table("t", provider).unwrap(); + }); + Arc::new(Mutex::new(ctx)) +} + +fn criterion_benchmark(c: &mut Criterion) { + let rt = Runtime::new().unwrap(); + + let limits = [10, 1_000, 10_000, 100_000]; + let scans = 16; + let target_partitions = 4; + + let group = 
format!("topk_repartition_{scans}_to_{target_partitions}"); + let mut group = c.benchmark_group(group); + for limit in limits { + let sql = format!( + "SELECT \ + SUM(f64) OVER (PARTITION BY u64_narrow ORDER BY u64_wide ROWS UNBOUNDED PRECEDING) \ + FROM t \ + ORDER BY u64_narrow, u64_wide \ + LIMIT {limit}" + ); + + let ctx_disabled = create_context(scans, target_partitions, false); + group.bench_function(BenchmarkId::new("disabled", limit), |b| { + b.iter(|| query(ctx_disabled.clone(), &rt, &sql)) + }); + + let ctx_enabled = create_context(scans, target_partitions, true); + group.bench_function(BenchmarkId::new("enabled", limit), |b| { + b.iter(|| query(ctx_enabled.clone(), &rt, &sql)) + }); + } + group.finish(); +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/datafusion/datasource-parquet/benches/parquet_struct_filter_pushdown 2.rs b/datafusion/datasource-parquet/benches/parquet_struct_filter_pushdown 2.rs new file mode 100644 index 0000000000000..b52408d4222d8 --- /dev/null +++ b/datafusion/datasource-parquet/benches/parquet_struct_filter_pushdown 2.rs @@ -0,0 +1,353 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
Benchmarks for struct field filter pushdown in Parquet. +//! +//! Compares scanning with vs without row-level filter pushdown for +//! predicates on struct sub-fields (e.g. `get_field(s, 'id') = 42`). +//! +//! The dataset schema (in SQL-like notation): +//! +//! ```sql +//! CREATE TABLE t ( +//! id INT, -- top-level id, useful for correctness checks +//! large_string TEXT, -- wide column so SELECT * is expensive +//! s STRUCT< +//! id: INT, -- mirrors top-level id +//! large_string: TEXT -- wide sub-field; pushdown with proper projection +//! -- should avoid reading this when filtering on s.id +//! > +//! ); +//! ``` +//! +//! Benchmark queries: +//! +//! 1. `SELECT * FROM t WHERE get_field(s, 'id') = 42` +//! - no pushdown vs. row-level filter pushdown +//! 2. `SELECT * FROM t WHERE get_field(s, 'id') = id` +//! - cross-column predicate; no pushdown vs. row-level filter pushdown +//! 3. `SELECT id FROM t WHERE get_field(s, 'id') = 42` +//! - narrow projection; pushdown should avoid reading s.large_string + +use std::path::{Path, PathBuf}; +use std::sync::{Arc, LazyLock}; + +use arrow::array::{BooleanArray, Int32Array, RecordBatch, StringBuilder, StructArray}; +use arrow::datatypes::{DataType, Field, Fields, Schema, SchemaRef}; +use criterion::{Criterion, Throughput, criterion_group, criterion_main}; +use datafusion_common::ScalarValue; +use datafusion_datasource_parquet::{ParquetFileMetrics, build_row_filter}; +use datafusion_expr::{Expr, col}; +use datafusion_physical_expr::planner::logical2physical; +use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; +use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; +use parquet::arrow::{ArrowWriter, ProjectionMask}; +use parquet::file::properties::WriterProperties; +use tempfile::TempDir; + +const ROW_GROUP_ROW_COUNT: usize = 10_000; +const TOTAL_ROW_GROUPS: usize = 10; +const TOTAL_ROWS: usize = ROW_GROUP_ROW_COUNT * TOTAL_ROW_GROUPS; +/// Only one row group will contain the target value. 
+const TARGET_VALUE: i32 = 42; +const ID_COLUMN_NAME: &str = "id"; +const LARGE_STRING_COLUMN_NAME: &str = "large_string"; +const STRUCT_COLUMN_NAME: &str = "s"; +// Large string payload to emphasize decoding overhead when pushdown is disabled. +const LARGE_STRING_LEN: usize = 8 * 1024; + +struct BenchmarkDataset { + _tempdir: TempDir, + file_path: PathBuf, +} + +impl BenchmarkDataset { + fn path(&self) -> &Path { + &self.file_path + } +} + +static DATASET: LazyLock = LazyLock::new(|| { + create_dataset().expect("failed to prepare parquet benchmark dataset") +}); + +fn parquet_struct_filter_pushdown(c: &mut Criterion) { + let dataset_path = DATASET.path().to_owned(); + let mut group = c.benchmark_group("parquet_struct_filter_pushdown"); + group.throughput(Throughput::Elements(TOTAL_ROWS as u64)); + + // Scenario 1: SELECT * FROM t WHERE get_field(s, 'id') = 42 + group.bench_function("select_star/no_pushdown", |b| { + let file_schema = setup_reader(&dataset_path); + let predicate = logical2physical(&struct_id_eq_literal(), &file_schema); + b.iter(|| { + let matched = scan(&dataset_path, &predicate, false, ProjectionMask::all()) + .expect("scan succeeded"); + assert_eq!(matched, ROW_GROUP_ROW_COUNT); + }); + }); + + group.bench_function("select_star/with_pushdown", |b| { + let file_schema = setup_reader(&dataset_path); + let predicate = logical2physical(&struct_id_eq_literal(), &file_schema); + b.iter(|| { + let matched = scan(&dataset_path, &predicate, true, ProjectionMask::all()) + .expect("scan succeeded"); + assert_eq!(matched, ROW_GROUP_ROW_COUNT); + }); + }); + + // Scenario 2: SELECT * FROM t WHERE get_field(s, 'id') = id + group.bench_function("select_star_cross_col/no_pushdown", |b| { + let file_schema = setup_reader(&dataset_path); + let predicate = logical2physical(&struct_id_eq_top_id(), &file_schema); + b.iter(|| { + let matched = scan(&dataset_path, &predicate, false, ProjectionMask::all()) + .expect("scan succeeded"); + assert_eq!(matched, TOTAL_ROWS); 
+ }); + }); + + group.bench_function("select_star_cross_col/with_pushdown", |b| { + let file_schema = setup_reader(&dataset_path); + let predicate = logical2physical(&struct_id_eq_top_id(), &file_schema); + b.iter(|| { + let matched = scan(&dataset_path, &predicate, true, ProjectionMask::all()) + .expect("scan succeeded"); + assert_eq!(matched, TOTAL_ROWS); + }); + }); + + // Scenario 3: SELECT id FROM t WHERE get_field(s, 'id') = 42 + group.bench_function("select_id/no_pushdown", |b| { + let file_schema = setup_reader(&dataset_path); + let predicate = logical2physical(&struct_id_eq_literal(), &file_schema); + b.iter(|| { + // Without pushdown we must read all columns to evaluate the predicate. + let matched = scan(&dataset_path, &predicate, false, ProjectionMask::all()) + .expect("scan succeeded"); + assert_eq!(matched, ROW_GROUP_ROW_COUNT); + }); + }); + + group.bench_function("select_id/with_pushdown", |b| { + let file_schema = setup_reader(&dataset_path); + let predicate = logical2physical(&struct_id_eq_literal(), &file_schema); + let id_only = id_projection(&dataset_path); + b.iter(|| { + // With pushdown the filter runs first, then we only project `id`. 
+ let matched = scan(&dataset_path, &predicate, true, id_only.clone()) + .expect("scan succeeded"); + assert_eq!(matched, ROW_GROUP_ROW_COUNT); + }); + }); + + group.finish(); +} + +fn setup_reader(path: &Path) -> SchemaRef { + let file = std::fs::File::open(path).expect("failed to open file"); + let builder = + ParquetRecordBatchReaderBuilder::try_new(file).expect("failed to build reader"); + Arc::clone(builder.schema()) +} + +/// `get_field(s, 'id') = TARGET_VALUE` +fn struct_id_eq_literal() -> Expr { + let get_field_expr = datafusion_functions::core::get_field().call(vec![ + col(STRUCT_COLUMN_NAME), + Expr::Literal(ScalarValue::Utf8(Some("id".to_string())), None), + ]); + get_field_expr.eq(Expr::Literal(ScalarValue::Int32(Some(TARGET_VALUE)), None)) +} + +/// `get_field(s, 'id') = id` +fn struct_id_eq_top_id() -> Expr { + let get_field_expr = datafusion_functions::core::get_field().call(vec![ + col(STRUCT_COLUMN_NAME), + Expr::Literal(ScalarValue::Utf8(Some("id".to_string())), None), + ]); + get_field_expr.eq(col(ID_COLUMN_NAME)) +} + +/// Build a [`ProjectionMask`] that only reads the top-level `id` leaf column. +fn id_projection(path: &Path) -> ProjectionMask { + let file = std::fs::File::open(path).expect("failed to open file"); + let builder = + ParquetRecordBatchReaderBuilder::try_new(file).expect("failed to build reader"); + let parquet_schema = builder.metadata().file_metadata().schema_descr_ptr(); + // Leaf index 0 corresponds to the top-level `id` column. 
+ ProjectionMask::leaves(&parquet_schema, [0]) +} + +fn scan( + path: &Path, + predicate: &Arc, + pushdown: bool, + projection: ProjectionMask, +) -> datafusion_common::Result { + let file = std::fs::File::open(path)?; + let builder = ParquetRecordBatchReaderBuilder::try_new(file)?; + let metadata = builder.metadata().clone(); + let file_schema = builder.schema(); + + let metrics = ExecutionPlanMetricsSet::new(); + let file_metrics = ParquetFileMetrics::new(0, &path.display().to_string(), &metrics); + + let mut filter_applied = false; + let builder = if pushdown { + if let Some(row_filter) = + build_row_filter(predicate, file_schema, &metadata, false, &file_metrics)? + { + filter_applied = true; + builder.with_row_filter(row_filter) + } else { + builder + } + } else { + builder + }; + + // Only apply a narrow projection when the filter was actually pushed down. + // Otherwise we need all columns to evaluate the predicate manually. + let output_projection = if filter_applied { + projection + } else { + ProjectionMask::all() + }; + let reader = builder.with_projection(output_projection).build()?; + + let mut matched_rows = 0usize; + for batch in reader { + let batch = batch?; + if filter_applied { + // When the row filter was applied, rows are already filtered. 
+ matched_rows += batch.num_rows(); + } else { + matched_rows += count_matches(predicate, &batch)?; + } + } + + Ok(matched_rows) +} + +fn count_matches( + expr: &Arc, + batch: &RecordBatch, +) -> datafusion_common::Result { + let values = expr.evaluate(batch)?.into_array(batch.num_rows())?; + let bools = values + .as_any() + .downcast_ref::() + .expect("boolean filter result"); + + Ok(bools.iter().filter(|v| matches!(v, Some(true))).count()) +} + +fn schema() -> SchemaRef { + let struct_fields = Fields::from(vec![ + Field::new("id", DataType::Int32, false), + Field::new(LARGE_STRING_COLUMN_NAME, DataType::Utf8, false), + ]); + Arc::new(Schema::new(vec![ + Field::new(ID_COLUMN_NAME, DataType::Int32, false), + Field::new(LARGE_STRING_COLUMN_NAME, DataType::Utf8, false), + Field::new(STRUCT_COLUMN_NAME, DataType::Struct(struct_fields), false), + ])) +} + +fn create_dataset() -> datafusion_common::Result { + let tempdir = TempDir::new()?; + let file_path = tempdir.path().join("struct_filter.parquet"); + + let schema = schema(); + let writer_props = WriterProperties::builder() + .set_max_row_group_row_count(Some(ROW_GROUP_ROW_COUNT)) + .build(); + + let mut writer = ArrowWriter::try_new( + std::fs::File::create(&file_path)?, + Arc::clone(&schema), + Some(writer_props), + )?; + + // Each row group has a distinct `s.id` value. Only one row group + // matches the target, so pushdown should prune 90% of rows. 
+ for rg_idx in 0..TOTAL_ROW_GROUPS { + let id_value = if rg_idx == TOTAL_ROW_GROUPS - 1 { + TARGET_VALUE + } else { + (rg_idx as i32 + 1) * 1000 + }; + let batch = build_struct_batch(&schema, id_value, ROW_GROUP_ROW_COUNT)?; + writer.write(&batch)?; + } + + writer.close()?; + + let reader = + ParquetRecordBatchReaderBuilder::try_new(std::fs::File::open(&file_path)?)?; + assert_eq!(reader.metadata().row_groups().len(), TOTAL_ROW_GROUPS); + + Ok(BenchmarkDataset { + _tempdir: tempdir, + file_path, + }) +} + +fn build_struct_batch( + schema: &SchemaRef, + id_value: i32, + len: usize, +) -> datafusion_common::Result { + let large_string: String = "x".repeat(LARGE_STRING_LEN); + + // Top-level columns + let top_id_array = Arc::new(Int32Array::from(vec![id_value; len])); + let mut top_string_builder = StringBuilder::new(); + for _ in 0..len { + top_string_builder.append_value(&large_string); + } + let top_string_array = Arc::new(top_string_builder.finish()); + + // Struct sub-fields: s.id mirrors top-level id, s.large_string is the same payload + let struct_id_array = Arc::new(Int32Array::from(vec![id_value; len])); + let mut struct_string_builder = StringBuilder::new(); + for _ in 0..len { + struct_string_builder.append_value(&large_string); + } + let struct_string_array = Arc::new(struct_string_builder.finish()); + + let struct_array = StructArray::from(vec![ + ( + Arc::new(Field::new("id", DataType::Int32, false)), + struct_id_array as Arc, + ), + ( + Arc::new(Field::new(LARGE_STRING_COLUMN_NAME, DataType::Utf8, false)), + struct_string_array as Arc, + ), + ]); + + Ok(RecordBatch::try_new( + Arc::clone(schema), + vec![top_id_array, top_string_array, Arc::new(struct_array)], + )?) 
+} + +criterion_group!(benches, parquet_struct_filter_pushdown); +criterion_main!(benches); diff --git a/datafusion/datasource-parquet/src/test_data/ndv_test 2.parquet b/datafusion/datasource-parquet/src/test_data/ndv_test 2.parquet new file mode 100644 index 0000000000000000000000000000000000000000..3ecbe320f506efd450c6c2ebd31fd626571db80f GIT binary patch literal 1141 zcmZwHOHUI~6bJBgr_J)2zRYXL5Ay_(98U+#%5_W`z zDGL*HVOSbBZjCFK#uYKfL_dHbegId-rSU&ZYY1_YU+(nGz30A8I+Pnua^hbCd{gS? z8w~Ffmx$g-hKGnY7^5Hy`$-6fSUY}*%o{-5>o$_x_?S0@piF*7e!ov7B`8x91Rw~z zpcz6ChTYHtt-wKrJ)nUOZLk*%*a!Qe9U^c5I-nDx5CanqLL9o_5FCbXI08Lz6cUhx z6!bzG`rsHGhYa)wW$}cwlatO)vWT35({KhX$iV=dg*=>t0-T2na1k!SWhlZG7=)`Z z1lM2~uEPl2fKj*!x8OG1fx9pU_h1|*;69Xsa&{8u?L1|-AujS)jN3)t$`KhN+!`_T z6~F@`ctHXm#>@3_{>FIXF9e?1Dgn_gfmN2Smw=cHrA8Ll-+9sb52F-wrn7mz$Q5U{ zR1^h6Go|UuM1m`ngcMiez5k+VRMj`e3){0-$Lh&FxmqFL`8xcyHkD6zw1uCIHj^mR z4@_bi22lfPE1LXN&PzgNS+N9jaE@6f`|xqyDo`}DWbN$k6_;6rmg5wVoXT7wS{2E9 zF4P1@5jkOTK`-{9;_QX;BYSdUzC2Z#E}`_f7!x$1YR97Pt6VNUsXUyWTXF&cd=s6W z#)#AnrW<ni=Pc$6H3Dz@-I>y!Kbo2f4 zsNb2nCYx(MLz5di&7ZQNNn7`quD1!5 zRApz(N%ykJNvCeMV4tR}zEvn5&*KF11Chnb+ Result> { + match self { + LogicalPlan::Limit(limit) => match limit.get_skip_type()? 
{ + SkipType::Literal(0) => Ok(None), + SkipType::Literal(n) => Ok(Some(n)), + SkipType::UnsupportedExpr => Ok(None), + }, + LogicalPlan::Sort(_) => Ok(None), + LogicalPlan::TableScan(_) => Ok(None), + LogicalPlan::Projection(_) => Ok(None), + LogicalPlan::Filter(_) => Ok(None), + LogicalPlan::Window(_) => Ok(None), + LogicalPlan::Aggregate(_) => Ok(None), + LogicalPlan::Join(_) => Ok(None), + LogicalPlan::Repartition(_) => Ok(None), + LogicalPlan::Union(_) => Ok(None), + LogicalPlan::EmptyRelation(_) => Ok(None), + LogicalPlan::Subquery(_) => Ok(None), + LogicalPlan::SubqueryAlias(_) => Ok(None), + LogicalPlan::Statement(_) => Ok(None), + LogicalPlan::Values(_) => Ok(None), + LogicalPlan::Explain(_) => Ok(None), + LogicalPlan::Analyze(_) => Ok(None), + LogicalPlan::Extension(_) => Ok(None), + LogicalPlan::Distinct(_) => Ok(None), + LogicalPlan::Dml(_) => Ok(None), + LogicalPlan::Ddl(_) => Ok(None), + LogicalPlan::Copy(_) => Ok(None), + LogicalPlan::DescribeTable(_) => Ok(None), + LogicalPlan::Unnest(_) => Ok(None), + LogicalPlan::RecursiveQuery(_) => Ok(None), + } + } + /// Returns the fetch (limit) of this plan node, if it has one. /// - /// Only [`LogicalPlan::Sort`] and [`LogicalPlan::TableScan`] carry a fetch - /// value directly; all other variants return `None`. - pub fn fetch(&self) -> Option { + /// [`LogicalPlan::Sort`], [`LogicalPlan::TableScan`], and + /// [`LogicalPlan::Limit`] may carry a fetch value; all other variants + /// return `Ok(None)`. + pub fn fetch(&self) -> Result> { match self { - LogicalPlan::Sort(Sort { fetch, .. }) => *fetch, - LogicalPlan::TableScan(TableScan { fetch, .. 
}) => *fetch, - LogicalPlan::Projection(_) => None, - LogicalPlan::Filter(_) => None, - LogicalPlan::Window(_) => None, - LogicalPlan::Aggregate(_) => None, - LogicalPlan::Join(_) => None, - LogicalPlan::Repartition(_) => None, - LogicalPlan::Union(_) => None, - LogicalPlan::EmptyRelation(_) => None, - LogicalPlan::Subquery(_) => None, - LogicalPlan::SubqueryAlias(_) => None, - LogicalPlan::Limit(_) => None, - LogicalPlan::Statement(_) => None, - LogicalPlan::Values(_) => None, - LogicalPlan::Explain(_) => None, - LogicalPlan::Analyze(_) => None, - LogicalPlan::Extension(_) => None, - LogicalPlan::Distinct(_) => None, - LogicalPlan::Dml(_) => None, - LogicalPlan::Ddl(_) => None, - LogicalPlan::Copy(_) => None, - LogicalPlan::DescribeTable(_) => None, - LogicalPlan::Unnest(_) => None, - LogicalPlan::RecursiveQuery(_) => None, + LogicalPlan::Sort(Sort { fetch, .. }) => Ok(*fetch), + LogicalPlan::TableScan(TableScan { fetch, .. }) => Ok(*fetch), + LogicalPlan::Limit(limit) => match limit.get_fetch_type()? 
{ + FetchType::Literal(s) => Ok(s), + FetchType::UnsupportedExpr => Ok(None), + }, + LogicalPlan::Projection(_) => Ok(None), + LogicalPlan::Filter(_) => Ok(None), + LogicalPlan::Window(_) => Ok(None), + LogicalPlan::Aggregate(_) => Ok(None), + LogicalPlan::Join(_) => Ok(None), + LogicalPlan::Repartition(_) => Ok(None), + LogicalPlan::Union(_) => Ok(None), + LogicalPlan::EmptyRelation(_) => Ok(None), + LogicalPlan::Subquery(_) => Ok(None), + LogicalPlan::SubqueryAlias(_) => Ok(None), + LogicalPlan::Statement(_) => Ok(None), + LogicalPlan::Values(_) => Ok(None), + LogicalPlan::Explain(_) => Ok(None), + LogicalPlan::Analyze(_) => Ok(None), + LogicalPlan::Extension(_) => Ok(None), + LogicalPlan::Distinct(_) => Ok(None), + LogicalPlan::Dml(_) => Ok(None), + LogicalPlan::Ddl(_) => Ok(None), + LogicalPlan::Copy(_) => Ok(None), + LogicalPlan::DescribeTable(_) => Ok(None), + LogicalPlan::Unnest(_) => Ok(None), + LogicalPlan::RecursiveQuery(_) => Ok(None), } } diff --git a/datafusion/ffi/tests/ffi_execution_plan 2.rs b/datafusion/ffi/tests/ffi_execution_plan 2.rs new file mode 100644 index 0000000000000..d81f947dc80ed --- /dev/null +++ b/datafusion/ffi/tests/ffi_execution_plan 2.rs @@ -0,0 +1,108 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#[cfg(feature = "integration-tests")] +mod tests { + use arrow::datatypes::Field; + use arrow::datatypes::Schema; + use arrow_schema::DataType; + use datafusion_common::DataFusionError; + use datafusion_ffi::execution_plan::FFI_ExecutionPlan; + use datafusion_ffi::execution_plan::ForeignExecutionPlan; + use datafusion_ffi::execution_plan::{ExecutionPlanPrivateData, tests::EmptyExec}; + use datafusion_ffi::tests::utils::get_module; + use datafusion_physical_plan::ExecutionPlan; + use std::sync::Arc; + + #[test] + fn test_ffi_execution_plan_new_sets_runtimes_on_children() + -> Result<(), DataFusionError> { + // We want to test the case where we have two libraries. + // Library A will have a foreign plan from Library B, called child_plan. + // Library A will add a plan called grandchild_plan under child_plan + // Library A will create a plan called parent_plan, that has child_plan + // under it. So we should have: + // parent_plan (local) -> child_plan (foreign) -> grandchild_plan (local) + // Then we want to turn parent_plan into a FFI plan. + // Verify that grandchild_plan also gets the same runtime as parent_plan. + + let module = get_module()?; + + fn generate_local_plan() -> Arc { + let schema = + Arc::new(Schema::new(vec![Field::new("a", DataType::Float32, false)])); + + Arc::new(EmptyExec::new(schema)) + } + + let child_plan = + module + .create_empty_exec() + .ok_or(DataFusionError::NotImplemented( + "External module failed to implement create_empty_exec".to_string(), + ))?(); + let child_plan: Arc = (&child_plan) + .try_into() + .expect("should be able create plan"); + assert!(child_plan.as_any().is::()); + + let grandchild_plan = generate_local_plan(); + + let child_plan = child_plan.with_new_children(vec![grandchild_plan])?; + + unsafe { + // Originally the runtime is not set. 
We go through the unsafe casting + // of data here because the `inner()` function is private and this is + // only an integration test so we do not want to expose it. + let ffi_child = FFI_ExecutionPlan::new(Arc::clone(&child_plan), None); + let ffi_grandchild = + (ffi_child.children)(&ffi_child).into_iter().next().unwrap(); + + let grandchild_private_data = + ffi_grandchild.private_data as *const ExecutionPlanPrivateData; + assert!((*grandchild_private_data).runtime.is_none()); + } + + let parent_plan = generate_local_plan().with_new_children(vec![child_plan])?; + + // Adding the grandchild beneath this FFI plan should get the runtime passed down. + let runtime = tokio::runtime::Builder::new_current_thread() + .build() + .unwrap(); + let ffi_parent = + FFI_ExecutionPlan::new(parent_plan, Some(runtime.handle().clone())); + + unsafe { + let ffi_child = (ffi_parent.children)(&ffi_parent) + .into_iter() + .next() + .unwrap(); + let ffi_grandchild = + (ffi_child.children)(&ffi_child).into_iter().next().unwrap(); + assert_eq!( + (ffi_grandchild.library_marker_id)(), + (ffi_parent.library_marker_id)() + ); + + let grandchild_private_data = + ffi_grandchild.private_data as *const ExecutionPlanPrivateData; + assert!((*grandchild_private_data).runtime.is_some()); + } + + Ok(()) + } +} diff --git a/datafusion/functions-aggregate/benches/approx_distinct 2.rs b/datafusion/functions-aggregate/benches/approx_distinct 2.rs new file mode 100644 index 0000000000000..538103d991f1f --- /dev/null +++ b/datafusion/functions-aggregate/benches/approx_distinct 2.rs @@ -0,0 +1,128 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use arrow::array::{ArrayRef, Int64Array, StringArray, StringViewArray}; +use arrow::datatypes::{DataType, Field, Schema}; +use criterion::{Criterion, criterion_group, criterion_main}; +use datafusion_expr::function::AccumulatorArgs; +use datafusion_expr::{Accumulator, AggregateUDFImpl}; +use datafusion_functions_aggregate::approx_distinct::ApproxDistinct; +use datafusion_physical_expr::expressions::col; +use rand::rngs::StdRng; +use rand::{Rng, SeedableRng}; + +const BATCH_SIZE: usize = 8192; +const STRING_LENGTH: usize = 20; + +fn prepare_accumulator(data_type: DataType) -> Box { + let schema = Arc::new(Schema::new(vec![Field::new("f", data_type, true)])); + let expr = col("f", &schema).unwrap(); + let accumulator_args = AccumulatorArgs { + return_field: Field::new("f", DataType::UInt64, true).into(), + schema: &schema, + expr_fields: &[expr.return_field(&schema).unwrap()], + ignore_nulls: false, + order_bys: &[], + is_reversed: false, + name: "approx_distinct(f)", + is_distinct: false, + exprs: &[expr], + }; + ApproxDistinct::new().accumulator(accumulator_args).unwrap() +} + +/// Creates an Int64Array where values are drawn from `0..n_distinct`. +fn create_i64_array(n_distinct: usize) -> Int64Array { + let mut rng = StdRng::seed_from_u64(42); + (0..BATCH_SIZE) + .map(|_| Some(rng.random_range(0..n_distinct as i64))) + .collect() +} + +/// Creates a pool of `n_distinct` random strings. 
+fn create_string_pool(n_distinct: usize) -> Vec { + let mut rng = StdRng::seed_from_u64(42); + (0..n_distinct) + .map(|_| { + (0..STRING_LENGTH) + .map(|_| rng.random_range(b'a'..=b'z') as char) + .collect() + }) + .collect() +} + +/// Creates a StringArray where values are drawn from the given pool. +fn create_string_array(pool: &[String]) -> StringArray { + let mut rng = StdRng::seed_from_u64(99); + (0..BATCH_SIZE) + .map(|_| Some(pool[rng.random_range(0..pool.len())].as_str())) + .collect() +} + +/// Creates a StringViewArray where values are drawn from the given pool. +fn create_string_view_array(pool: &[String]) -> StringViewArray { + let mut rng = StdRng::seed_from_u64(99); + (0..BATCH_SIZE) + .map(|_| Some(pool[rng.random_range(0..pool.len())].as_str())) + .collect() +} + +fn approx_distinct_benchmark(c: &mut Criterion) { + for pct in [80, 99] { + let n_distinct = BATCH_SIZE * pct / 100; + + // --- Int64 benchmarks --- + let values = Arc::new(create_i64_array(n_distinct)) as ArrayRef; + c.bench_function(&format!("approx_distinct i64 {pct}% distinct"), |b| { + b.iter(|| { + let mut accumulator = prepare_accumulator(DataType::Int64); + accumulator + .update_batch(std::slice::from_ref(&values)) + .unwrap() + }) + }); + + let string_pool = create_string_pool(n_distinct); + + // --- Utf8 benchmarks --- + let values = Arc::new(create_string_array(&string_pool)) as ArrayRef; + c.bench_function(&format!("approx_distinct utf8 {pct}% distinct"), |b| { + b.iter(|| { + let mut accumulator = prepare_accumulator(DataType::Utf8); + accumulator + .update_batch(std::slice::from_ref(&values)) + .unwrap() + }) + }); + + // --- Utf8View benchmarks --- + let values = Arc::new(create_string_view_array(&string_pool)) as ArrayRef; + c.bench_function(&format!("approx_distinct utf8view {pct}% distinct"), |b| { + b.iter(|| { + let mut accumulator = prepare_accumulator(DataType::Utf8View); + accumulator + .update_batch(std::slice::from_ref(&values)) + .unwrap() + }) + }); + } +} + 
+criterion_group!(benches, approx_distinct_benchmark); +criterion_main!(benches); diff --git a/datafusion/functions-nested/benches/array_concat 2.rs b/datafusion/functions-nested/benches/array_concat 2.rs new file mode 100644 index 0000000000000..75dcc88f14737 --- /dev/null +++ b/datafusion/functions-nested/benches/array_concat 2.rs @@ -0,0 +1,94 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::hint::black_box; +use std::sync::Arc; + +use arrow::array::{ArrayRef, Int32Array, ListArray}; +use arrow::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer}; +use arrow::datatypes::{DataType, Field}; +use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main}; +use rand::rngs::StdRng; +use rand::{Rng, SeedableRng}; + +use datafusion_functions_nested::concat::array_concat_inner; + +const SEED: u64 = 42; + +/// Build a `ListArray` with `num_lists` rows, each containing +/// `elements_per_list` random i32 values. Every 10th row is null. 
+fn make_list_array( + rng: &mut StdRng, + num_lists: usize, + elements_per_list: usize, +) -> ArrayRef { + let total_values = num_lists * elements_per_list; + let values: Vec = (0..total_values).map(|_| rng.random()).collect(); + let values = Arc::new(Int32Array::from(values)); + + let offsets: Vec = (0..=num_lists) + .map(|i| (i * elements_per_list) as i32) + .collect(); + let offsets = OffsetBuffer::new(ScalarBuffer::from(offsets)); + + let nulls: Vec = (0..num_lists).map(|i| i % 10 != 0).collect(); + let nulls = Some(NullBuffer::from(nulls)); + + Arc::new(ListArray::new( + Arc::new(Field::new("item", DataType::Int32, false)), + offsets, + values, + nulls, + )) +} + +fn criterion_benchmark(c: &mut Criterion) { + let mut group = c.benchmark_group("array_concat"); + + // Benchmark: varying number of rows, 20 elements per list + for num_rows in [100, 1000, 10000] { + let mut rng = StdRng::seed_from_u64(SEED); + let list_a = make_list_array(&mut rng, num_rows, 20); + let list_b = make_list_array(&mut rng, num_rows, 20); + let args: Vec = vec![list_a, list_b]; + + group.bench_with_input(BenchmarkId::new("rows", num_rows), &args, |b, args| { + b.iter(|| black_box(array_concat_inner(args).unwrap())); + }); + } + + // Benchmark: 1000 rows, varying element counts per list + for elements_per_list in [5, 50, 500] { + let mut rng = StdRng::seed_from_u64(SEED); + let list_a = make_list_array(&mut rng, 1000, elements_per_list); + let list_b = make_list_array(&mut rng, 1000, elements_per_list); + let args: Vec = vec![list_a, list_b]; + + group.bench_with_input( + BenchmarkId::new("elements_per_list", elements_per_list), + &args, + |b, args| { + b.iter(|| black_box(array_concat_inner(args).unwrap())); + }, + ); + } + + group.finish(); +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/datafusion/functions-nested/benches/array_to_string 2.rs b/datafusion/functions-nested/benches/array_to_string 2.rs new file mode 100644 index 
0000000000000..286ed4eeb0003 --- /dev/null +++ b/datafusion/functions-nested/benches/array_to_string 2.rs @@ -0,0 +1,188 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::array::{ArrayRef, Float64Array, Int64Array, ListArray, StringArray}; +use arrow::buffer::OffsetBuffer; +use arrow::datatypes::{DataType, Field}; +use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main}; +use datafusion_common::ScalarValue; +use datafusion_common::config::ConfigOptions; +use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl}; +use datafusion_functions_nested::string::ArrayToString; +use rand::rngs::StdRng; +use rand::{Rng, SeedableRng}; +use std::hint::black_box; +use std::sync::Arc; + +const NUM_ROWS: usize = 1000; +const ARRAY_SIZES: &[usize] = &[5, 20, 100]; +const NESTED_ARRAY_SIZE: usize = 3; +const SEED: u64 = 42; +const NULL_DENSITY: f64 = 0.1; + +fn criterion_benchmark(c: &mut Criterion) { + bench_array_to_string(c, "array_to_string_int64", create_int64_list_array); + bench_array_to_string(c, "array_to_string_float64", create_float64_list_array); + bench_array_to_string(c, "array_to_string_string", create_string_list_array); + bench_array_to_string( + c, + 
"array_to_string_nested_int64", + create_nested_int64_list_array, + ); +} + +fn bench_array_to_string( + c: &mut Criterion, + group_name: &str, + make_array: impl Fn(usize) -> ArrayRef, +) { + let mut group = c.benchmark_group(group_name); + + for &array_size in ARRAY_SIZES { + let list_array = make_array(array_size); + let args = vec![ + ColumnarValue::Array(list_array.clone()), + ColumnarValue::Scalar(ScalarValue::Utf8(Some(",".to_string()))), + ]; + let arg_fields = vec![ + Field::new("array", list_array.data_type().clone(), true).into(), + Field::new("delimiter", DataType::Utf8, false).into(), + ]; + + group.bench_with_input( + BenchmarkId::from_parameter(array_size), + &array_size, + |b, _| { + let udf = ArrayToString::new(); + b.iter(|| { + black_box( + udf.invoke_with_args(ScalarFunctionArgs { + args: args.clone(), + arg_fields: arg_fields.clone(), + number_rows: NUM_ROWS, + return_field: Field::new("result", DataType::Utf8, true) + .into(), + config_options: Arc::new(ConfigOptions::default()), + }) + .unwrap(), + ) + }) + }, + ); + } + + group.finish(); +} + +fn create_int64_list_array(array_size: usize) -> ArrayRef { + let mut rng = StdRng::seed_from_u64(SEED); + let values = (0..NUM_ROWS * array_size) + .map(|_| { + if rng.random::() < NULL_DENSITY { + None + } else { + Some(rng.random_range(0..1000)) + } + }) + .collect::(); + let offsets = (0..=NUM_ROWS) + .map(|i| (i * array_size) as i32) + .collect::>(); + + Arc::new( + ListArray::try_new( + Arc::new(Field::new("item", DataType::Int64, true)), + OffsetBuffer::new(offsets.into()), + Arc::new(values), + None, + ) + .unwrap(), + ) +} + +fn create_nested_int64_list_array(array_size: usize) -> ArrayRef { + let inner = create_int64_list_array(array_size); + let inner_rows = NUM_ROWS; + let outer_rows = inner_rows / NESTED_ARRAY_SIZE; + let offsets = (0..=outer_rows) + .map(|i| (i * NESTED_ARRAY_SIZE) as i32) + .collect::>(); + Arc::new( + ListArray::try_new( + Arc::new(Field::new("item", 
inner.data_type().clone(), true)), + OffsetBuffer::new(offsets.into()), + inner, + None, + ) + .unwrap(), + ) +} + +fn create_float64_list_array(array_size: usize) -> ArrayRef { + let mut rng = StdRng::seed_from_u64(SEED); + let values = (0..NUM_ROWS * array_size) + .map(|_| { + if rng.random::() < NULL_DENSITY { + None + } else { + Some(rng.random_range(-1000.0..1000.0)) + } + }) + .collect::(); + let offsets = (0..=NUM_ROWS) + .map(|i| (i * array_size) as i32) + .collect::>(); + + Arc::new( + ListArray::try_new( + Arc::new(Field::new("item", DataType::Float64, true)), + OffsetBuffer::new(offsets.into()), + Arc::new(values), + None, + ) + .unwrap(), + ) +} + +fn create_string_list_array(array_size: usize) -> ArrayRef { + let mut rng = StdRng::seed_from_u64(SEED); + let values = (0..NUM_ROWS * array_size) + .map(|_| { + if rng.random::() < NULL_DENSITY { + None + } else { + Some(format!("value_{}", rng.random_range(0..100))) + } + }) + .collect::(); + let offsets = (0..=NUM_ROWS) + .map(|i| (i * array_size) as i32) + .collect::>(); + + Arc::new( + ListArray::try_new( + Arc::new(Field::new("item", DataType::Utf8, true)), + OffsetBuffer::new(offsets.into()), + Arc::new(values), + None, + ) + .unwrap(), + ) +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/datafusion/optimizer/src/push_down_filter.rs b/datafusion/optimizer/src/push_down_filter.rs index c2c16e9fe7803..0973da4b0a909 100644 --- a/datafusion/optimizer/src/push_down_filter.rs +++ b/datafusion/optimizer/src/push_down_filter.rs @@ -796,9 +796,10 @@ impl OptimizerRule for PushDownFilter { filter.predicate = new_predicate; } - // If the child has a fetch (limit), pushing a filter below it would - // change semantics: the limit should apply before the filter, not after. 
- if filter.input.fetch().is_some() { + // If the child has a fetch (limit) or skip (offset), pushing a filter + // below it would change semantics: the limit/offset should apply before + // the filter, not after. + if filter.input.fetch()?.is_some() || filter.input.skip()?.is_some() { return Ok(Transformed::no(LogicalPlan::Filter(filter))); } diff --git a/datafusion/optimizer/src/simplify_expressions/linear_aggregates 2.rs b/datafusion/optimizer/src/simplify_expressions/linear_aggregates 2.rs new file mode 100644 index 0000000000000..21389cf326c24 --- /dev/null +++ b/datafusion/optimizer/src/simplify_expressions/linear_aggregates 2.rs @@ -0,0 +1,229 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Simplification to refactor multiple aggregate functions to use the same aggregate function + +use datafusion_common::HashMap; +use datafusion_expr::expr::AggregateFunctionParams; +use datafusion_expr::{BinaryExpr, Expr}; +use datafusion_expr_common::operator::Operator; + +/// Threshold of the number of aggregates that share similar arguments before +/// triggering rewrite. 
+/// +/// There is a threshold because the canonical SUM rewrite described in +/// [`AggregateUDFImpl::simplify_expr_op_literal`] actually results in more +/// aggregates (2) for each original aggregate. It is important that CSE then +/// eliminate them. +/// +/// [`AggregateUDFImpl::simplify_expr_op_literal`]: datafusion_expr::AggregateUDFImpl::simplify_expr_op_literal +const DUPLICATE_THRESHOLD: usize = 2; + +/// Rewrites multiple aggregate expressions that have a common linear component +/// into multiple aggregate expressions that share that common component. +/// +/// For example, rewrites patterns such as +/// * `SUM(x + 1), SUM(x + 2), ...` +/// +/// Into +/// * `SUM(x) + 1 * COUNT(x), SUM(x) + 2 * COUNT(x), ...` +/// +/// See the background [`AggregateUDFImpl::simplify_expr_op_literal`] for details. +/// +/// Returns `true` if any of the arguments are rewritten (modified), `false` +/// otherwise. +/// +/// ## Design goals: +/// 1. Keep the aggregate specific logic out of the optimizer (can't depend directly on SUM) +/// 2. Optimize for the case that this rewrite will not apply (it almost never does) +/// +/// [`AggregateUDFImpl::simplify_expr_op_literal`]: datafusion_expr::AggregateUDFImpl::simplify_expr_op_literal +pub(super) fn rewrite_multiple_linear_aggregates( + agg_expr: &mut [Expr], +) -> datafusion_common::Result { + // map : count of expressions that have a common argument + let mut common_args = HashMap::new(); + + // First pass -- figure out any aggregates that can be split and have common + // expressions. 
+ for agg in agg_expr.iter() { + let Expr::AggregateFunction(agg_function) = agg else { + continue; + }; + + let Some(arg) = candidate_linear_param(&agg_function.params) else { + continue; + }; + + let Some(expr_literal) = ExprLiteral::try_new(arg) else { + continue; + }; + + let counter = common_args.entry(expr_literal.expr()).or_insert(0); + *counter += 1; + } + + // (agg_index, new_expr) + let mut new_aggs = vec![]; + + // Second pass, actually rewrite any aggregates that have a common + // expression and enough duplicates. + for (idx, agg) in agg_expr.iter().enumerate() { + let Expr::AggregateFunction(agg_function) = agg else { + continue; + }; + + let Some(arg) = candidate_linear_param(&agg_function.params) else { + continue; + }; + + let Some(expr_literal) = ExprLiteral::try_new(arg) else { + continue; + }; + + // Not enough common expressions to make it worth rewriting + if common_args.get(expr_literal.expr()).unwrap_or(&0) < &DUPLICATE_THRESHOLD { + continue; + } + + if let Some(new_agg_function) = agg_function.func.simplify_expr_op_literal( + agg_function, + expr_literal.expr(), + expr_literal.op(), + expr_literal.lit(), + expr_literal.arg_is_left(), + )? { + new_aggs.push((idx, new_agg_function)); + } + } + + if new_aggs.is_empty() { + return Ok(false); + } + + // Otherwise replace the aggregate expressions + drop(common_args); // release borrow + for (idx, new_agg) in new_aggs { + let orig_name = agg_expr[idx].name_for_alias()?; + agg_expr[idx] = new_agg.alias_if_changed(orig_name)? 
+ } + + Ok(true) +} + +/// Returns Some(&Expr) with the single argument if this is a suitable candidate +/// for the linear rewrite +fn candidate_linear_param(params: &AggregateFunctionParams) -> Option<&Expr> { + // Explicitly destructure to ensure we check all relevant fields + let AggregateFunctionParams { + args, + distinct, + filter, + order_by, + null_treatment, + } = params; + + // Disqualify anything "non standard" + if *distinct + || filter.is_some() + || !order_by.is_empty() + || null_treatment.is_some() + || args.len() != 1 + { + return None; + } + let arg = args.first()?; + if arg.is_volatile() { + return None; + }; + Some(arg) +} + +/// A view into a [`Expr::BinaryExpr`] that is arbitrary expression and a +/// literal +/// +/// This is an enum to distinguish the direction of the operator arguments +#[derive(Debug, Clone)] +pub enum ExprLiteral<'a> { + /// if the expression is ` ` + ArgOpLit { + arg: &'a Expr, + op: Operator, + lit: &'a Expr, + }, + /// if the expression is ` ` + LitOpArg { + lit: &'a Expr, + op: Operator, + arg: &'a Expr, + }, +} + +impl<'a> ExprLiteral<'a> { + /// Try and split the Expr into its parts + fn try_new(expr: &'a Expr) -> Option { + match expr { + // + Expr::BinaryExpr(BinaryExpr { left, op, right }) + if matches!(left.as_ref(), Expr::Literal(..)) => + { + Some(Self::LitOpArg { + arg: right, + lit: left, + op: *op, + }) + } + + // + + Expr::BinaryExpr(BinaryExpr { left, op, right }) + if matches!(right.as_ref(), Expr::Literal(..)) => + { + Some(Self::ArgOpLit { + arg: left, + lit: right, + op: *op, + }) + } + _ => None, + } + } + + fn expr(&self) -> &'a Expr { + match self { + Self::ArgOpLit { arg, .. } => arg, + Self::LitOpArg { arg, .. } => arg, + } + } + + fn lit(&self) -> &'a Expr { + match self { + Self::ArgOpLit { lit, .. } => lit, + Self::LitOpArg { lit, .. } => lit, + } + } + + fn op(&self) -> Operator { + match self { + Self::ArgOpLit { op, .. } => *op, + Self::LitOpArg { op, .. 
} => *op, + } + } + + fn arg_is_left(&self) -> bool { + matches!(self, Self::ArgOpLit { .. }) + } +} diff --git a/datafusion/physical-expr-common/benches/compare_nested 2.rs b/datafusion/physical-expr-common/benches/compare_nested 2.rs new file mode 100644 index 0000000000000..56c122fef9420 --- /dev/null +++ b/datafusion/physical-expr-common/benches/compare_nested 2.rs @@ -0,0 +1,74 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::array::{ArrayRef, Int32Array, Scalar, StringArray, StructArray}; +use arrow::datatypes::{DataType, Field, Fields}; +use criterion::{Criterion, criterion_group, criterion_main}; +use datafusion_expr_common::operator::Operator; +use datafusion_physical_expr_common::datum::compare_op_for_nested; +use rand::rngs::StdRng; +use rand::{Rng, SeedableRng}; +use std::hint::black_box; +use std::sync::Arc; + +/// Build a StructArray with fields {x: Int32, y: Utf8}. 
+fn make_struct_array(num_rows: usize, rng: &mut StdRng) -> ArrayRef { + let ints: Int32Array = (0..num_rows).map(|_| Some(rng.random::())).collect(); + + let strings: StringArray = (0..num_rows) + .map(|_| { + let s: String = (0..12) + .map(|_| rng.random_range(b'a'..=b'z') as char) + .collect(); + Some(s) + }) + .collect(); + + let fields = Fields::from(vec![ + Field::new("x", DataType::Int32, false), + Field::new("y", DataType::Utf8, false), + ]); + + Arc::new( + StructArray::try_new(fields, vec![Arc::new(ints), Arc::new(strings)], None) + .unwrap(), + ) +} + +fn criterion_benchmark(c: &mut Criterion) { + let num_rows = 8192; + let mut rng = StdRng::seed_from_u64(42); + + let lhs = make_struct_array(num_rows, &mut rng); + let rhs_array = make_struct_array(num_rows, &mut rng); + let rhs_scalar = Scalar::new(make_struct_array(1, &mut rng)); + + c.bench_function("compare_nested array_array", |b| { + b.iter(|| { + black_box(compare_op_for_nested(Operator::Eq, &lhs, &rhs_array).unwrap()) + }) + }); + + c.bench_function("compare_nested array_scalar", |b| { + b.iter(|| { + black_box(compare_op_for_nested(Operator::Eq, &lhs, &rhs_scalar).unwrap()) + }) + }); +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/datafusion/physical-optimizer/src/hash_join_buffering 2.rs b/datafusion/physical-optimizer/src/hash_join_buffering 2.rs new file mode 100644 index 0000000000000..3c29b46c0fa64 --- /dev/null +++ b/datafusion/physical-optimizer/src/hash_join_buffering 2.rs @@ -0,0 +1,103 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::PhysicalOptimizerRule; +use datafusion_common::JoinSide; +use datafusion_common::config::ConfigOptions; +use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; +use datafusion_physical_plan::ExecutionPlan; +use datafusion_physical_plan::buffer::BufferExec; +use datafusion_physical_plan::joins::HashJoinExec; +use std::sync::Arc; + +/// Looks for all the [HashJoinExec]s in the plan and places a [BufferExec] node with the +/// configured capacity in the probe side: +/// +/// ```text +/// ┌───────────────────┐ +/// │ HashJoinExec │ +/// └─────▲────────▲────┘ +/// ┌───────┘ └─────────┐ +/// │ │ +/// ┌────────────────┐ ┌─────────────────┐ +/// │ Build side │ + │ BufferExec │ +/// └────────────────┘ └────────▲────────┘ +/// │ +/// ┌────────┴────────┐ +/// │ Probe side │ +/// └─────────────────┘ +/// ``` +/// +/// Which allows eagerly pulling it even before the build side has completely finished. 
+#[derive(Debug, Default)] +pub struct HashJoinBuffering {} + +impl HashJoinBuffering { + pub fn new() -> Self { + Self::default() + } +} + +impl PhysicalOptimizerRule for HashJoinBuffering { + fn optimize( + &self, + plan: Arc, + config: &ConfigOptions, + ) -> datafusion_common::Result> { + let capacity = config.execution.hash_join_buffering_capacity; + if capacity == 0 { + return Ok(plan); + } + + plan.transform_down(|plan| { + let Some(node) = plan.as_any().downcast_ref::() else { + return Ok(Transformed::no(plan)); + }; + let plan = Arc::clone(&plan); + Ok(Transformed::yes( + if HashJoinExec::probe_side() == JoinSide::Left { + // Do not stack BufferExec nodes together. + if node.left.as_any().downcast_ref::().is_some() { + return Ok(Transformed::no(plan)); + } + plan.with_new_children(vec![ + Arc::new(BufferExec::new(Arc::clone(&node.left), capacity)), + Arc::clone(&node.right), + ])? + } else { + // Do not stack BufferExec nodes together. + if node.right.as_any().downcast_ref::().is_some() { + return Ok(Transformed::no(plan)); + } + plan.with_new_children(vec![ + Arc::clone(&node.left), + Arc::new(BufferExec::new(Arc::clone(&node.right), capacity)), + ])? + }, + )) + }) + .data() + } + + fn name(&self) -> &str { + "HashJoinBuffering" + } + + fn schema_check(&self) -> bool { + true + } +} diff --git a/datafusion/physical-optimizer/src/topk_repartition 2.rs b/datafusion/physical-optimizer/src/topk_repartition 2.rs new file mode 100644 index 0000000000000..668e0d273288b --- /dev/null +++ b/datafusion/physical-optimizer/src/topk_repartition 2.rs @@ -0,0 +1,368 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Push TopK (Sort with fetch) past Hash Repartition +//! +//! When a `SortExec` with a fetch limit (TopK) sits above a +//! `RepartitionExec(Hash)`, and the hash partition expressions are a prefix +//! of the sort expressions, this rule inserts a copy of the TopK below +//! the repartition to reduce the volume of data flowing through the shuffle. +//! +//! This is correct because the hash partition key being a prefix of the sort +//! key guarantees that all rows with the same partition key end up in the same +//! output partition. Therefore, rows that survive the final TopK after +//! repartitioning will always survive the pre-repartition TopK as well. +//! +//! ## Example +//! +//! Before: +//! ```text +//! SortExec: TopK(fetch=3), expr=[a ASC, b ASC] +//! RepartitionExec: Hash([a], 4) +//! DataSourceExec +//! ``` +//! +//! After: +//! ```text +//! SortExec: TopK(fetch=3), expr=[a ASC, b ASC] +//! RepartitionExec: Hash([a], 4) +//! SortExec: TopK(fetch=3), expr=[a ASC, b ASC] +//! DataSourceExec +//! ``` + +use crate::PhysicalOptimizerRule; +use datafusion_common::Result; +use datafusion_common::config::ConfigOptions; +use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; +use std::sync::Arc; +// CoalesceBatchesExec is deprecated on main (replaced by arrow-rs BatchCoalescer), +// but older DataFusion versions may still insert it between SortExec and RepartitionExec. 
+#[expect(deprecated)] +use datafusion_physical_plan::coalesce_batches::CoalesceBatchesExec; +use datafusion_physical_plan::repartition::RepartitionExec; +use datafusion_physical_plan::sorts::sort::SortExec; +use datafusion_physical_plan::{ExecutionPlan, Partitioning}; + +/// A physical optimizer rule that pushes TopK (Sort with fetch) past +/// hash repartition when the partition key is a prefix of the sort key. +/// +/// See module-level documentation for details. +#[derive(Debug, Clone, Default)] +pub struct TopKRepartition; + +impl TopKRepartition { + pub fn new() -> Self { + Self {} + } +} + +impl PhysicalOptimizerRule for TopKRepartition { + #[expect(deprecated)] // CoalesceBatchesExec: kept for older DataFusion versions + fn optimize( + &self, + plan: Arc, + config: &ConfigOptions, + ) -> Result> { + if !config.optimizer.enable_topk_repartition { + return Ok(plan); + } + plan.transform_down(|node| { + // Match SortExec with fetch (TopK) + let Some(sort_exec) = node.as_any().downcast_ref::() else { + return Ok(Transformed::no(node)); + }; + let Some(fetch) = sort_exec.fetch() else { + return Ok(Transformed::no(node)); + }; + + // The child might be a CoalesceBatchesExec; look through it + let sort_input = sort_exec.input(); + let sort_any = sort_input.as_any(); + let (repart_parent, repart_exec) = if let Some(rp) = + sort_any.downcast_ref::() + { + // found a RepartitionExec, use it + (None, rp) + } else if let Some(cb_exec) = sort_any.downcast_ref::() { + // There's a CoalesceBatchesExec between TopK & RepartitionExec + // in this case we will need to reconstruct both nodes + let cb_input = cb_exec.input(); + let Some(rp) = cb_input.as_any().downcast_ref::() else { + return Ok(Transformed::no(node)); + }; + (Some(Arc::clone(sort_input)), rp) + } else { + return Ok(Transformed::no(node)); + }; + + // Only handle Hash partitioning + let Partitioning::Hash(hash_exprs, num_partitions) = + repart_exec.partitioning() + else { + return Ok(Transformed::no(node)); + 
}; + + let sort_exprs = sort_exec.expr(); + + // Check that hash expressions are a prefix of the sort expressions. + // Each hash expression must match the corresponding sort expression + // (ignoring sort options like ASC/DESC since hash doesn't care about order). + if hash_exprs.len() > sort_exprs.len() { + return Ok(Transformed::no(node)); + } + for (hash_expr, sort_expr) in hash_exprs.iter().zip(sort_exprs.iter()) { + if !hash_expr.eq(&sort_expr.expr) { + return Ok(Transformed::no(node)); + } + } + + // Don't push if the input to the repartition is already bounded + // (e.g., another TopK), as it would be redundant. + let repart_input = repart_exec.input(); + if repart_input.as_any().downcast_ref::().is_some() { + return Ok(Transformed::no(node)); + } + + // Insert a copy of the TopK below the repartition + let new_sort: Arc = Arc::new( + SortExec::new(sort_exprs.clone(), Arc::clone(repart_input)) + .with_fetch(Some(fetch)) + .with_preserve_partitioning(sort_exec.preserve_partitioning()), + ); + + let new_partitioning = + Partitioning::Hash(hash_exprs.clone(), *num_partitions); + let new_repartition: Arc = + Arc::new(RepartitionExec::try_new(new_sort, new_partitioning)?); + + // Rebuild the tree above the repartition + let new_sort_input = if let Some(parent) = repart_parent { + parent.with_new_children(vec![new_repartition])? 
+ } else { + new_repartition + }; + + let new_top_sort: Arc = Arc::new( + SortExec::new(sort_exprs.clone(), new_sort_input) + .with_fetch(Some(fetch)) + .with_preserve_partitioning(sort_exec.preserve_partitioning()), + ); + + Ok(Transformed::yes(new_top_sort)) + }) + .data() + } + + fn name(&self) -> &str { + "TopKRepartition" + } + + fn schema_check(&self) -> bool { + true + } +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow::datatypes::{DataType, Field, Schema}; + use datafusion_physical_expr::expressions::col; + use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr}; + use datafusion_physical_plan::displayable; + use datafusion_physical_plan::test::scan_partitioned; + use insta::assert_snapshot; + use std::sync::Arc; + + fn schema() -> Arc { + Arc::new(Schema::new(vec![ + Field::new("a", DataType::Utf8, false), + Field::new("b", DataType::Int64, false), + ])) + } + + fn sort_exprs(schema: &Schema) -> LexOrdering { + LexOrdering::new(vec![ + PhysicalSortExpr::new_default(col("a", schema).unwrap()).asc(), + PhysicalSortExpr::new_default(col("b", schema).unwrap()).asc(), + ]) + .unwrap() + } + + /// TopK above Hash(a) repartition should get pushed below it, + /// because `a` is a prefix of the sort key `(a, b)`. 
+ #[test] + fn topk_pushed_below_hash_repartition() { + let s = schema(); + let input = scan_partitioned(1); + let ordering = sort_exprs(&s); + + let repartition = Arc::new( + RepartitionExec::try_new( + input, + Partitioning::Hash(vec![col("a", &s).unwrap()], 4), + ) + .unwrap(), + ); + + let sort = Arc::new( + SortExec::new(ordering, repartition) + .with_fetch(Some(3)) + .with_preserve_partitioning(true), + ); + + let config = ConfigOptions::new(); + let optimized = TopKRepartition::new().optimize(sort, &config).unwrap(); + + let display = displayable(optimized.as_ref()).indent(true).to_string(); + assert_snapshot!(display, @r" + SortExec: TopK(fetch=3), expr=[a@0 ASC, b@1 ASC], preserve_partitioning=[true], sort_prefix=[a@0 ASC] + RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=1, maintains_sort_order=true + SortExec: TopK(fetch=3), expr=[a@0 ASC, b@1 ASC], preserve_partitioning=[true] + DataSourceExec: partitions=1, partition_sizes=[1] + "); + } + + /// TopK with no fetch (unbounded sort) should NOT be pushed. + #[test] + fn unbounded_sort_not_pushed() { + let s = schema(); + let input = scan_partitioned(1); + let ordering = sort_exprs(&s); + + let repartition = Arc::new( + RepartitionExec::try_new( + input, + Partitioning::Hash(vec![col("a", &s).unwrap()], 4), + ) + .unwrap(), + ); + + let sort: Arc = Arc::new( + SortExec::new(ordering, repartition).with_preserve_partitioning(true), + ); + + let config = ConfigOptions::new(); + let optimized = TopKRepartition::new().optimize(sort, &config).unwrap(); + + let display = displayable(optimized.as_ref()).indent(true).to_string(); + assert_snapshot!(display, @r" + SortExec: expr=[a@0 ASC, b@1 ASC], preserve_partitioning=[true] + RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=1 + DataSourceExec: partitions=1, partition_sizes=[1] + "); + } + + /// Hash key NOT a prefix of sort key should NOT be pushed. 
+ #[test] + fn non_prefix_hash_key_not_pushed() { + let s = schema(); + let input = scan_partitioned(1); + let ordering = sort_exprs(&s); + + // Hash by `b`, but sort by `(a, b)` - b is not a prefix + let repartition = Arc::new( + RepartitionExec::try_new( + input, + Partitioning::Hash(vec![col("b", &s).unwrap()], 4), + ) + .unwrap(), + ); + + let sort: Arc = Arc::new( + SortExec::new(ordering, repartition) + .with_fetch(Some(3)) + .with_preserve_partitioning(true), + ); + + let config = ConfigOptions::new(); + let optimized = TopKRepartition::new().optimize(sort, &config).unwrap(); + + let display = displayable(optimized.as_ref()).indent(true).to_string(); + assert_snapshot!(display, @r" + SortExec: TopK(fetch=3), expr=[a@0 ASC, b@1 ASC], preserve_partitioning=[true] + RepartitionExec: partitioning=Hash([b@1], 4), input_partitions=1 + DataSourceExec: partitions=1, partition_sizes=[1] + "); + } + + /// TopK above CoalesceBatchesExec above Hash(a) repartition should + /// push through both, inserting a new TopK below the repartition. 
+ #[expect(deprecated)] + #[test] + fn topk_pushed_through_coalesce_batches() { + let s = schema(); + let input = scan_partitioned(1); + let ordering = sort_exprs(&s); + + let repartition = Arc::new( + RepartitionExec::try_new( + input, + Partitioning::Hash(vec![col("a", &s).unwrap()], 4), + ) + .unwrap(), + ); + + let coalesce: Arc = + Arc::new(CoalesceBatchesExec::new(repartition, 8192)); + + let sort = Arc::new( + SortExec::new(ordering, coalesce) + .with_fetch(Some(3)) + .with_preserve_partitioning(true), + ); + + let config = ConfigOptions::new(); + let optimized = TopKRepartition::new().optimize(sort, &config).unwrap(); + + let display = displayable(optimized.as_ref()).indent(true).to_string(); + assert_snapshot!(display, @r" + SortExec: TopK(fetch=3), expr=[a@0 ASC, b@1 ASC], preserve_partitioning=[true], sort_prefix=[a@0 ASC] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=1, maintains_sort_order=true + SortExec: TopK(fetch=3), expr=[a@0 ASC, b@1 ASC], preserve_partitioning=[true] + DataSourceExec: partitions=1, partition_sizes=[1] + "); + } + + /// RoundRobin repartition should NOT be pushed. 
+ #[test] + fn round_robin_not_pushed() { + let s = schema(); + let input = scan_partitioned(1); + let ordering = sort_exprs(&s); + + let repartition = Arc::new( + RepartitionExec::try_new(input, Partitioning::RoundRobinBatch(4)).unwrap(), + ); + + let sort: Arc = Arc::new( + SortExec::new(ordering, repartition) + .with_fetch(Some(3)) + .with_preserve_partitioning(true), + ); + + let config = ConfigOptions::new(); + let optimized = TopKRepartition::new().optimize(sort, &config).unwrap(); + + let display = displayable(optimized.as_ref()).indent(true).to_string(); + assert_snapshot!(display, @r" + SortExec: TopK(fetch=3), expr=[a@0 ASC, b@1 ASC], preserve_partitioning=[true] + RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 + DataSourceExec: partitions=1, partition_sizes=[1] + "); + } +} diff --git a/datafusion/spark/src/function/array/array_contains 2.rs b/datafusion/spark/src/function/array/array_contains 2.rs new file mode 100644 index 0000000000000..2bc5d64d8bff8 --- /dev/null +++ b/datafusion/spark/src/function/array/array_contains 2.rs @@ -0,0 +1,168 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use arrow::array::{ + Array, AsArray, BooleanArray, BooleanBufferBuilder, GenericListArray, OffsetSizeTrait, +}; +use arrow::buffer::{BooleanBuffer, NullBuffer}; +use arrow::datatypes::DataType; +use datafusion_common::{Result, exec_err}; +use datafusion_expr::{ + ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility, +}; +use datafusion_functions_nested::array_has::array_has_udf; +use std::any::Any; +use std::sync::Arc; + +/// Spark-compatible `array_contains` function. +/// +/// Calls DataFusion's `array_has` and then applies Spark's null semantics: +/// - If the result from `array_has` is `true`, return `true`. +/// - If the result is `false` and the input array row contains any null elements, +/// return `null` (because the element might have been the null). +/// - If the result is `false` and the input array row has no null elements, +/// return `false`. +#[derive(Debug, PartialEq, Eq, Hash)] +pub struct SparkArrayContains { + signature: Signature, +} + +impl Default for SparkArrayContains { + fn default() -> Self { + Self::new() + } +} + +impl SparkArrayContains { + pub fn new() -> Self { + Self { + signature: Signature::array_and_element(Volatility::Immutable), + } + } +} + +impl ScalarUDFImpl for SparkArrayContains { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "array_contains" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _: &[DataType]) -> Result { + Ok(DataType::Boolean) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + let haystack = args.args[0].clone(); + let array_has_result = array_has_udf().invoke_with_args(args)?; + + let result_array = array_has_result.to_array(1)?; + let patched = apply_spark_null_semantics(result_array.as_boolean(), &haystack)?; + Ok(ColumnarValue::Array(Arc::new(patched))) + } +} + +/// For each row where `array_has` returned `false`, set the output to null +/// if that row's input array contains any null 
elements. +fn apply_spark_null_semantics( + result: &BooleanArray, + haystack_arg: &ColumnarValue, +) -> Result { + // happy path + if result.false_count() == 0 || haystack_arg.data_type() == DataType::Null { + return Ok(result.clone()); + } + + let haystack = haystack_arg.to_array_of_size(result.len())?; + + let row_has_nulls = compute_row_has_nulls(&haystack)?; + + // A row keeps its validity when result is true OR the row has no nulls. + let keep_mask = result.values() | &!&row_has_nulls; + let new_validity = match result.nulls() { + Some(n) => n.inner() & &keep_mask, + None => keep_mask, + }; + + Ok(BooleanArray::new( + result.values().clone(), + Some(NullBuffer::new(new_validity)), + )) +} + +/// Returns a per-row bitmap where bit i is set if row i's list contains any null element. +fn compute_row_has_nulls(haystack: &dyn Array) -> Result { + match haystack.data_type() { + DataType::List(_) => generic_list_row_has_nulls(haystack.as_list::()), + DataType::LargeList(_) => generic_list_row_has_nulls(haystack.as_list::()), + DataType::FixedSizeList(_, _) => { + let list = haystack.as_fixed_size_list(); + let buf = match list.values().nulls() { + Some(nulls) => { + let validity = nulls.inner(); + let vl = list.value_length() as usize; + let mut builder = BooleanBufferBuilder::new(list.len()); + for i in 0..list.len() { + builder.append(validity.slice(i * vl, vl).count_set_bits() < vl); + } + builder.finish() + } + None => BooleanBuffer::new_unset(list.len()), + }; + Ok(mask_with_list_nulls(buf, list.nulls())) + } + dt => exec_err!("compute_row_has_nulls: unsupported data type {dt}"), + } +} + +/// Computes per-row null presence for `List` and `LargeList` arrays. 
+fn generic_list_row_has_nulls( + list: &GenericListArray, +) -> Result { + let buf = match list.values().nulls() { + Some(nulls) => { + let validity = nulls.inner(); + let offsets = list.offsets(); + let mut builder = BooleanBufferBuilder::new(list.len()); + for i in 0..list.len() { + let s = offsets[i].as_usize(); + let len = offsets[i + 1].as_usize() - s; + builder.append(validity.slice(s, len).count_set_bits() < len); + } + builder.finish() + } + None => BooleanBuffer::new_unset(list.len()), + }; + Ok(mask_with_list_nulls(buf, list.nulls())) +} + +/// Rows where the list itself is null should not be marked as "has nulls". +fn mask_with_list_nulls( + buf: BooleanBuffer, + list_nulls: Option<&NullBuffer>, +) -> BooleanBuffer { + match list_nulls { + Some(n) => &buf & n.inner(), + None => buf, + } +} diff --git a/datafusion/sqllogictest/src/test_file 2.rs b/datafusion/sqllogictest/src/test_file 2.rs new file mode 100644 index 0000000000000..c44cae133639b --- /dev/null +++ b/datafusion/sqllogictest/src/test_file 2.rs @@ -0,0 +1,186 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
use std::collections::HashMap;
use std::ffi::OsStr;
use std::path::{Path, PathBuf};
use std::sync::LazyLock;

/// Represents a parsed test file
///
/// Note there is a custom Ord implementation that sorts test files by:
/// 1. Hard coded test priority (lower runs first),
/// 2. Relative path as deterministic tie-breaker,
/// 3. Full path, so that `Ord` stays consistent with `Eq`.
#[derive(Debug, PartialEq, Eq)]
pub struct TestFile {
    /// The absolute path to the file
    pub path: PathBuf,
    /// The relative path of the file (used for display)
    pub relative_path: PathBuf,
}

impl TestFile {
    /// Create a new [`TestFile`] from the given path, stripping any of the
    /// known test directory prefixes for the relative path.
    ///
    /// The first entry of `prefixes` that `path` starts with (compared as
    /// text, after lossy UTF-8 conversion) is removed. If none matches, the
    /// relative path is empty.
    pub fn new(path: PathBuf, prefixes: &[&str]) -> Self {
        let relative_path = {
            let lossy = path.to_string_lossy();
            prefixes
                .iter()
                .find_map(|prefix| lossy.strip_prefix(*prefix))
                .map(PathBuf::from)
                .unwrap_or_default()
        };

        Self {
            path,
            relative_path,
        }
    }

    /// Returns true if the file has a .slt extension, indicating it is a sqllogictest file.
    pub fn is_slt_file(&self) -> bool {
        self.path.extension() == Some(OsStr::new("slt"))
    }

    /// Returns true if the relative path starts with the given prefix, which
    /// can be used to filter tests by subdirectory or filename patterns.
    ///
    /// The comparison is component-wise (see [`Path::starts_with`]), not a
    /// plain string-prefix test.
    pub fn relative_path_starts_with(&self, prefix: impl AsRef<Path>) -> bool {
        self.relative_path.starts_with(prefix)
    }
}

impl PartialOrd for TestFile {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for TestFile {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        let self_path = &self.relative_path;
        let other_path = &other.relative_path;

        let priority_self = TEST_PRIORITY.get(self_path).unwrap_or(&DEFAULT_PRIORITY);
        let priority_other = TEST_PRIORITY.get(other_path).unwrap_or(&DEFAULT_PRIORITY);

        priority_self
            .cmp(priority_other)
            // Tie-breaker: lexicographic order of relative paths.
            .then_with(|| self_path.cmp(other_path))
            // Final tie-breaker keeps Ord consistent with Eq when relative paths collide.
            .then_with(|| self.path.cmp(&other.path))
    }
}

/// TEST PRIORITY
///
/// Heuristically prioritize some tests to run earlier.
///
/// Prioritizes tests to run earlier if they are known to be long running (as
/// each test file itself is run sequentially, but multiple test files are run
/// in parallel).
///
/// Tests not listed here will run after the listed tests in deterministic
/// lexicographic order by relative path.
///
/// You can find the top longest running tests by running `--timing-summary`
/// mode. For example
///
/// ```shell
/// $ cargo test --profile=ci --test sqllogictests -- --timing-summary top
/// ...
/// Per-file elapsed summary (deterministic):
/// 1.    3.568s  aggregate.slt
/// 2.    3.464s  joins.slt
/// 3.    3.336s  imdb.slt
/// 4.    3.085s  push_down_filter_regression.slt
/// 5.    2.926s  aggregate_skip_partial.slt
/// 6.    2.453s  array.slt
/// 7.    2.399s  window.slt
/// 8.    2.198s  group_by.slt
/// 9.    1.281s  clickbench.slt
/// 10.   1.058s  datetime/timestamps.slt
/// ```
const TEST_PRIORITY_ENTRIES: &[&str] = &[
    "aggregate.slt", // longest-running files go first
    "joins.slt",
    "imdb.slt",
    "push_down_filter_regression.slt",
    "aggregate_skip_partial.slt",
    "array.slt",
    "window.slt",
    "group_by.slt",
    "clickbench.slt",
    "datetime/timestamps.slt",
];

/// Default priority for tests not in the priority map. Tests with lower
/// priority values run first.
const DEFAULT_PRIORITY: usize = 100;

/// Maps each relative path in [`TEST_PRIORITY_ENTRIES`] to its position in
/// that list, which is used as its priority.
static TEST_PRIORITY: LazyLock<HashMap<PathBuf, usize>> = LazyLock::new(|| {
    TEST_PRIORITY_ENTRIES
        .iter()
        .enumerate()
        .map(|(priority, path)| (PathBuf::from(path), priority))
        .collect()
});

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn prioritized_files_are_first() {
        let mut input = vec!["z_unlisted.slt", "a_unlisted.slt"];
        input.extend(TEST_PRIORITY_ENTRIES.iter().copied());
        input.push("q_unlisted.slt");

        let mut sorted = to_test_files(input);
        sorted.sort_unstable();

        println!("Sorted input: {sorted:?}");

        // the prioritized files should be first, in the order specified by TEST_PRIORITY_ENTRIES
        let prioritized: Vec<PathBuf> = sorted
            .iter()
            .take(TEST_PRIORITY_ENTRIES.len())
            .map(|file| file.relative_path.clone())
            .collect();
        let expected_prioritized: Vec<PathBuf> = TEST_PRIORITY_ENTRIES
            .iter()
            .map(|path| PathBuf::from(*path))
            .collect();
        assert_eq!(
            prioritized, expected_prioritized,
            "Expected prioritized files first, in priority order, in {sorted:?}"
        );

        // last three files should be the unlisted ones in deterministic order
        let expected_files =
            to_test_files(["a_unlisted.slt", "q_unlisted.slt", "z_unlisted.slt"]);
        assert!(
            sorted.ends_with(&expected_files),
            "Expected unlisted files {expected_files:?} at the end in deterministic order of {sorted:?}"
        );
    }

    /// Builds test files whose absolute and relative paths are both `f`.
    fn to_test_files<'a>(files: impl IntoIterator<Item = &'a str>) -> Vec<TestFile> {
        files
            .into_iter()
            .map(|f| TestFile {
                path: PathBuf::from(f),
                relative_path: PathBuf::from(f),
            })
            .collect()
    }
}
0000000000000..9aa3ecf7a29f8 --- /dev/null +++ b/datafusion/sqllogictest/test_files/aggregates_simplify 2.slt @@ -0,0 +1,358 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +####### +# Tests for aggregate optimizations / simplifications +####### + +statement ok +CREATE TABLE sum_simplify_t AS VALUES (1, 100), (1, 200), (2, 100), (NULL, NULL); + +# Baseline SUM of an expression +query I +SELECT SUM(column1 + 1) FROM sum_simplify_t; +---- +7 + +query TT +EXPLAIN SELECT SUM(column1 + 1) FROM sum_simplify_t; +---- +logical_plan +01)Aggregate: groupBy=[[]], aggr=[[sum(sum_simplify_t.column1 + Int64(1))]] +02)--TableScan: sum_simplify_t projection=[column1] +physical_plan +01)AggregateExec: mode=Single, gby=[], aggr=[sum(sum_simplify_t.column1 + Int64(1))] +02)--DataSourceExec: partitions=1, partition_sizes=[1] + + +# Mixed aggregate expressions with type validation +query TI +SELECT arrow_typeof(SUM(column1)), SUM(column1 + 1) FROM sum_simplify_t; +---- +Int64 7 + +query TT +EXPLAIN SELECT arrow_typeof(SUM(column1)), SUM(column1), SUM(column1 + 1) FROM sum_simplify_t; +---- +logical_plan +01)Projection: arrow_typeof(sum(sum_simplify_t.column1)), sum(sum_simplify_t.column1), sum(sum_simplify_t.column1 + Int64(1)) +02)--Aggregate: 
groupBy=[[]], aggr=[[sum(sum_simplify_t.column1), sum(sum_simplify_t.column1 + Int64(1))]] +03)----TableScan: sum_simplify_t projection=[column1] +physical_plan +01)ProjectionExec: expr=[arrow_typeof(sum(sum_simplify_t.column1)@0) as arrow_typeof(sum(sum_simplify_t.column1)), sum(sum_simplify_t.column1)@0 as sum(sum_simplify_t.column1), sum(sum_simplify_t.column1 + Int64(1))@1 as sum(sum_simplify_t.column1 + Int64(1))] +02)--AggregateExec: mode=Single, gby=[], aggr=[sum(sum_simplify_t.column1), sum(sum_simplify_t.column1 + Int64(1))] +03)----DataSourceExec: partitions=1, partition_sizes=[1] + +# Duplicate aggregate expressions +query II +SELECT SUM(column1 + 1) AS sum_plus_1_a, SUM(column1 + 1) AS sum_plus_1_b FROM sum_simplify_t; +---- +7 7 + +query TT +EXPLAIN SELECT SUM(column1 + 1) AS sum_plus_1_a, SUM(column1 + 1) AS sum_plus_1_b FROM sum_simplify_t; +---- +logical_plan +01)Projection: sum(sum_simplify_t.column1 + Int64(1)) AS sum_plus_1_a, sum(sum_simplify_t.column1 + Int64(1)) AS sum_plus_1_b +02)--Aggregate: groupBy=[[]], aggr=[[sum(sum_simplify_t.column1 + Int64(1))]] +03)----TableScan: sum_simplify_t projection=[column1] +physical_plan +01)ProjectionExec: expr=[sum(sum_simplify_t.column1 + Int64(1))@0 as sum_plus_1_a, sum(sum_simplify_t.column1 + Int64(1))@0 as sum_plus_1_b] +02)--AggregateExec: mode=Single, gby=[], aggr=[sum(sum_simplify_t.column1 + Int64(1))] +03)----DataSourceExec: partitions=1, partition_sizes=[1] + + +# constant aggregate expressions +query II +SELECT SUM(2+1), SUM(3) FROM sum_simplify_t; +---- +12 12 + +query TT +EXPLAIN SELECT SUM(2+1), SUM(3) FROM sum_simplify_t; +---- +logical_plan +01)Projection: __common_expr_1 AS sum(Int64(2) + Int64(1)), __common_expr_1 AS sum(Int64(3)) +02)--Aggregate: groupBy=[[]], aggr=[[sum(Int64(3)) AS __common_expr_1]] +03)----TableScan: sum_simplify_t projection=[] +physical_plan +01)ProjectionExec: expr=[__common_expr_1@0 as sum(Int64(2) + Int64(1)), __common_expr_1@0 as sum(Int64(3))] 
+02)--AggregateExec: mode=Single, gby=[], aggr=[__common_expr_1] +03)----DataSourceExec: partitions=1, partition_sizes=[1] + + +# Duplicated expression across multiple aggregate arguments. +query II +SELECT SUM(column1 + 1), SUM(column1 + 2) FROM sum_simplify_t; +---- +7 10 + + +query TT +EXPLAIN SELECT SUM(column1 + 1), SUM(column1 + 2) FROM sum_simplify_t; +---- +logical_plan +01)Projection: sum(sum_simplify_t.column1) + __common_expr_1 AS sum(sum_simplify_t.column1 + Int64(1)), sum(sum_simplify_t.column1) + Int64(2) * __common_expr_1 AS sum(sum_simplify_t.column1 + Int64(2)) +02)--Projection: CAST(count(sum_simplify_t.column1) AS Int64) AS __common_expr_1, sum(sum_simplify_t.column1) +03)----Aggregate: groupBy=[[]], aggr=[[sum(sum_simplify_t.column1), count(sum_simplify_t.column1)]] +04)------TableScan: sum_simplify_t projection=[column1] +physical_plan +01)ProjectionExec: expr=[sum(sum_simplify_t.column1)@0 + count(sum_simplify_t.column1)@1 as sum(sum_simplify_t.column1 + Int64(1)), sum(sum_simplify_t.column1)@0 + 2 * count(sum_simplify_t.column1)@1 as sum(sum_simplify_t.column1 + Int64(2))] +02)--AggregateExec: mode=Single, gby=[], aggr=[sum(sum_simplify_t.column1), count(sum_simplify_t.column1)] +03)----DataSourceExec: partitions=1, partition_sizes=[1] + +# Reordered expressions that still compute the same thing +query II +SELECT SUM(1 + column1), SUM(column1 + 2) FROM sum_simplify_t; +---- +7 10 + +query TT +EXPLAIN SELECT SUM(1 + column1), SUM(column1 + 2) FROM sum_simplify_t; +---- +logical_plan +01)Projection: sum(sum_simplify_t.column1) + __common_expr_1 AS sum(Int64(1) + sum_simplify_t.column1), sum(sum_simplify_t.column1) + Int64(2) * __common_expr_1 AS sum(sum_simplify_t.column1 + Int64(2)) +02)--Projection: CAST(count(sum_simplify_t.column1) AS Int64) AS __common_expr_1, sum(sum_simplify_t.column1) +03)----Aggregate: groupBy=[[]], aggr=[[sum(sum_simplify_t.column1), count(sum_simplify_t.column1)]] +04)------TableScan: sum_simplify_t 
projection=[column1] +physical_plan +01)ProjectionExec: expr=[sum(sum_simplify_t.column1)@0 + count(sum_simplify_t.column1)@1 as sum(Int64(1) + sum_simplify_t.column1), sum(sum_simplify_t.column1)@0 + 2 * count(sum_simplify_t.column1)@1 as sum(sum_simplify_t.column1 + Int64(2))] +02)--AggregateExec: mode=Single, gby=[], aggr=[sum(sum_simplify_t.column1), count(sum_simplify_t.column1)] +03)----DataSourceExec: partitions=1, partition_sizes=[1] + +# DISTINCT aggregates with different arguments +query II +SELECT SUM(DISTINCT column1 + 1), SUM(DISTINCT column1 + 2) FROM sum_simplify_t; +---- +5 7 + +query TT +EXPLAIN SELECT SUM(DISTINCT column1 + 1), SUM(DISTINCT column1 + 2) FROM sum_simplify_t; +---- +logical_plan +01)Aggregate: groupBy=[[]], aggr=[[sum(DISTINCT sum_simplify_t.column1 + Int64(1)), sum(DISTINCT sum_simplify_t.column1 + Int64(2))]] +02)--TableScan: sum_simplify_t projection=[column1] +physical_plan +01)AggregateExec: mode=Single, gby=[], aggr=[sum(DISTINCT sum_simplify_t.column1 + Int64(1)), sum(DISTINCT sum_simplify_t.column1 + Int64(2))] +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +# DISTINCT and non-DISTINCT aggregates +query II +SELECT SUM(DISTINCT column1 + 1), SUM(column1 + 1) FROM sum_simplify_t; +---- +5 7 + +query TT +EXPLAIN SELECT SUM(DISTINCT column1 + 1), SUM(column1 + 1) FROM sum_simplify_t; +---- +logical_plan +01)Projection: sum(alias1) AS sum(DISTINCT sum_simplify_t.column1 + Int64(1)), sum(alias2) AS sum(sum_simplify_t.column1 + Int64(1)) +02)--Aggregate: groupBy=[[]], aggr=[[sum(alias1), sum(alias2)]] +03)----Aggregate: groupBy=[[__common_expr_1 AS alias1]], aggr=[[sum(__common_expr_1) AS alias2]] +04)------Projection: sum_simplify_t.column1 + Int64(1) AS __common_expr_1 +05)--------TableScan: sum_simplify_t projection=[column1] +physical_plan +01)ProjectionExec: expr=[sum(alias1)@0 as sum(DISTINCT sum_simplify_t.column1 + Int64(1)), sum(alias2)@1 as sum(sum_simplify_t.column1 + Int64(1))] +02)--AggregateExec: 
mode=Final, gby=[], aggr=[sum(alias1), sum(alias2)] +03)----CoalescePartitionsExec +04)------AggregateExec: mode=Partial, gby=[], aggr=[sum(alias1), sum(alias2)] +05)--------AggregateExec: mode=FinalPartitioned, gby=[alias1@0 as alias1], aggr=[alias2] +06)----------RepartitionExec: partitioning=Hash([alias1@0], 4), input_partitions=1 +07)------------AggregateExec: mode=Partial, gby=[__common_expr_1@0 as alias1], aggr=[alias2] +08)--------------ProjectionExec: expr=[column1@0 + 1 as __common_expr_1] +09)----------------DataSourceExec: partitions=1, partition_sizes=[1] + +# FILTER clauses with different aggregate arguments +query II +SELECT SUM(column1 + 1) FILTER (WHERE column1 > 1), SUM(column1 + 2) FILTER (WHERE column1 > 2) FROM sum_simplify_t; +---- +3 NULL + +query TT +EXPLAIN SELECT SUM(column1 + 1) FILTER (WHERE column1 > 1), SUM(column1 + 2) FILTER (WHERE column1 > 2) FROM sum_simplify_t; +---- +logical_plan +01)Aggregate: groupBy=[[]], aggr=[[sum(sum_simplify_t.column1 + Int64(1)) FILTER (WHERE sum_simplify_t.column1 > Int64(1)), sum(sum_simplify_t.column1 + Int64(2)) FILTER (WHERE sum_simplify_t.column1 > Int64(2))]] +02)--TableScan: sum_simplify_t projection=[column1] +physical_plan +01)AggregateExec: mode=Single, gby=[], aggr=[sum(sum_simplify_t.column1 + Int64(1)) FILTER (WHERE sum_simplify_t.column1 > Int64(1)), sum(sum_simplify_t.column1 + Int64(2)) FILTER (WHERE sum_simplify_t.column1 > Int64(2))] +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +# FILTER clauses with the same aggregate argument +query II +SELECT + SUM(column1 + 1) FILTER (WHERE column1 > 1) AS filtered_sum_a, + SUM(column1 + 1) FILTER (WHERE column1 > 1) AS filtered_sum_b +FROM sum_simplify_t; +---- +3 3 + +query TT +EXPLAIN SELECT + SUM(column1 + 1) FILTER (WHERE column1 > 1) AS filtered_sum_a, + SUM(column1 + 1) FILTER (WHERE column1 > 1) AS filtered_sum_b +FROM sum_simplify_t; +---- +logical_plan +01)Projection: sum(sum_simplify_t.column1 + Int64(1)) FILTER (WHERE 
sum_simplify_t.column1 > Int64(1)) AS filtered_sum_a, sum(sum_simplify_t.column1 + Int64(1)) FILTER (WHERE sum_simplify_t.column1 > Int64(1)) AS filtered_sum_b +02)--Aggregate: groupBy=[[]], aggr=[[sum(sum_simplify_t.column1 + Int64(1)) FILTER (WHERE sum_simplify_t.column1 > Int64(1))]] +03)----TableScan: sum_simplify_t projection=[column1] +physical_plan +01)ProjectionExec: expr=[sum(sum_simplify_t.column1 + Int64(1)) FILTER (WHERE sum_simplify_t.column1 > Int64(1))@0 as filtered_sum_a, sum(sum_simplify_t.column1 + Int64(1)) FILTER (WHERE sum_simplify_t.column1 > Int64(1))@0 as filtered_sum_b] +02)--AggregateExec: mode=Single, gby=[], aggr=[sum(sum_simplify_t.column1 + Int64(1)) FILTER (WHERE sum_simplify_t.column1 > Int64(1))] +03)----DataSourceExec: partitions=1, partition_sizes=[1] + +# Same aggregate argument with different FILTER predicates +query II +SELECT SUM(column1 + 1) FILTER (WHERE column1 > 1), SUM(column1 + 1) FILTER (WHERE column1 > 0) FROM sum_simplify_t; +---- +3 7 + +query TT +EXPLAIN SELECT SUM(column1 + 1) FILTER (WHERE column1 > 1), SUM(column1 + 1) FILTER (WHERE column1 > 0) FROM sum_simplify_t; +---- +logical_plan +01)Aggregate: groupBy=[[]], aggr=[[sum(__common_expr_1 AS sum_simplify_t.column1 + Int64(1)) FILTER (WHERE sum_simplify_t.column1 > Int64(1)), sum(__common_expr_1 AS sum_simplify_t.column1 + Int64(1)) FILTER (WHERE sum_simplify_t.column1 > Int64(0))]] +02)--Projection: sum_simplify_t.column1 + Int64(1) AS __common_expr_1, sum_simplify_t.column1 +03)----TableScan: sum_simplify_t projection=[column1] +physical_plan +01)AggregateExec: mode=Single, gby=[], aggr=[sum(sum_simplify_t.column1 + Int64(1)) FILTER (WHERE sum_simplify_t.column1 > Int64(1)), sum(sum_simplify_t.column1 + Int64(1)) FILTER (WHERE sum_simplify_t.column1 > Int64(0))] +02)--ProjectionExec: expr=[column1@0 + 1 as __common_expr_1, column1@0 as column1] +03)----DataSourceExec: partitions=1, partition_sizes=[1] + +# volatile aggregate arguments +query B +SELECT 
SUM(random() + 1) < SUM(random() + 2) FROM sum_simplify_t; +---- +true + +query TT +EXPLAIN SELECT SUM(random() + 1) < SUM(random() + 2) FROM sum_simplify_t; +---- +logical_plan +01)Projection: sum(random() + Int64(2)) > sum(random() + Int64(1)) AS sum(random() + Int64(1)) < sum(random() + Int64(2)) +02)--Aggregate: groupBy=[[]], aggr=[[sum(random() + Float64(1)) AS sum(random() + Int64(1)), sum(random() + Float64(2)) AS sum(random() + Int64(2))]] +03)----TableScan: sum_simplify_t projection=[] +physical_plan +01)ProjectionExec: expr=[sum(random() + Int64(2))@1 > sum(random() + Int64(1))@0 as sum(random() + Int64(1)) < sum(random() + Int64(2))] +02)--AggregateExec: mode=Single, gby=[], aggr=[sum(random() + Int64(1)), sum(random() + Int64(2))] +03)----DataSourceExec: partitions=1, partition_sizes=[1] + +# Checks grouped aggregates with explicit ORDER BY return deterministic row order. +query III +SELECT column2, SUM(column1 + 1), SUM(column1 + 2) FROM sum_simplify_t GROUP BY column2 ORDER BY column2 DESC NULLS LAST; +---- +200 2 3 +100 5 7 +NULL NULL NULL + +query TT +EXPLAIN SELECT column2, SUM(column1 + 1), SUM(column1 + 2) FROM sum_simplify_t GROUP BY column2 ORDER BY column2 DESC NULLS LAST; +---- +logical_plan +01)Sort: sum_simplify_t.column2 DESC NULLS LAST +02)--Projection: sum_simplify_t.column2, sum(sum_simplify_t.column1) + __common_expr_1 AS sum(sum_simplify_t.column1 + Int64(1)), sum(sum_simplify_t.column1) + Int64(2) * __common_expr_1 AS sum(sum_simplify_t.column1 + Int64(2)) +03)----Projection: CAST(count(sum_simplify_t.column1) AS Int64) AS __common_expr_1, sum_simplify_t.column2, sum(sum_simplify_t.column1) +04)------Aggregate: groupBy=[[sum_simplify_t.column2]], aggr=[[sum(sum_simplify_t.column1), count(sum_simplify_t.column1)]] +05)--------TableScan: sum_simplify_t projection=[column1, column2] +physical_plan +01)SortPreservingMergeExec: [column2@0 DESC NULLS LAST] +02)--SortExec: expr=[column2@0 DESC NULLS LAST], preserve_partitioning=[true] 
+03)----ProjectionExec: expr=[column2@0 as column2, sum(sum_simplify_t.column1)@1 + count(sum_simplify_t.column1)@2 as sum(sum_simplify_t.column1 + Int64(1)), sum(sum_simplify_t.column1)@1 + 2 * count(sum_simplify_t.column1)@2 as sum(sum_simplify_t.column1 + Int64(2))] +04)------AggregateExec: mode=FinalPartitioned, gby=[column2@0 as column2], aggr=[sum(sum_simplify_t.column1), count(sum_simplify_t.column1)] +05)--------RepartitionExec: partitioning=Hash([column2@0], 4), input_partitions=1 +06)----------AggregateExec: mode=Partial, gby=[column2@1 as column2], aggr=[sum(sum_simplify_t.column1), count(sum_simplify_t.column1)] +07)------------DataSourceExec: partitions=1, partition_sizes=[1] + +# Checks commutative forms of equivalent aggregate arguments are simplified consistently. +query II +SELECT SUM(1 + column1), SUM(column1 + 1) FROM sum_simplify_t; +---- +7 7 + +query TT +EXPLAIN SELECT SUM(1 + column1), SUM(column1 + 1) FROM sum_simplify_t; +---- +logical_plan +01)Projection: __common_expr_1 AS sum(Int64(1) + sum_simplify_t.column1), __common_expr_1 AS sum(sum_simplify_t.column1 + Int64(1)) +02)--Projection: sum(sum_simplify_t.column1) + CAST(count(sum_simplify_t.column1) AS Int64) AS __common_expr_1 +03)----Aggregate: groupBy=[[]], aggr=[[sum(sum_simplify_t.column1), count(sum_simplify_t.column1)]] +04)------TableScan: sum_simplify_t projection=[column1] +physical_plan +01)ProjectionExec: expr=[__common_expr_1@0 as sum(Int64(1) + sum_simplify_t.column1), __common_expr_1@0 as sum(sum_simplify_t.column1 + Int64(1))] +02)--ProjectionExec: expr=[sum(sum_simplify_t.column1)@0 + count(sum_simplify_t.column1)@1 as __common_expr_1] +03)----AggregateExec: mode=Single, gby=[], aggr=[sum(sum_simplify_t.column1), count(sum_simplify_t.column1)] +04)------DataSourceExec: partitions=1, partition_sizes=[1] + +# Checks unsigned overflow edge case from PR discussion using transformed SUM arguments. 
+statement ok +CREATE TABLE IF NOT EXISTS tbl (val INTEGER UNSIGNED); + +statement ok +INSERT INTO tbl VALUES (4294967295); + +statement ok +INSERT INTO tbl VALUES (4294967295); + +# Checks transformed SUM results for unsigned max values are preserved. +query TII +SELECT arrow_typeof(SUM(val + 1)), SUM(val + 1), SUM(val + 2) FROM tbl; +---- +Int64 8589934592 8589934594 + +query TT +EXPLAIN SELECT arrow_typeof(SUM(val + 1)), SUM(val + 1), SUM(val + 2) FROM tbl; +---- +logical_plan +01)Projection: arrow_typeof(sum(tbl.val + Int64(1))), sum(tbl.val + Int64(1)), sum(tbl.val + Int64(2)) +02)--Projection: sum(tbl.val) + __common_expr_1 AS sum(tbl.val + Int64(1)), sum(tbl.val) + Int64(2) * __common_expr_1 AS sum(tbl.val + Int64(2)) +03)----Projection: CAST(count(tbl.val) AS Int64) AS __common_expr_1, sum(tbl.val) +04)------Aggregate: groupBy=[[]], aggr=[[sum(__common_expr_2 AS tbl.val), count(__common_expr_2 AS tbl.val)]] +05)--------Projection: CAST(tbl.val AS Int64) AS __common_expr_2 +06)----------TableScan: tbl projection=[val] +physical_plan +01)ProjectionExec: expr=[arrow_typeof(sum(tbl.val + Int64(1))@0) as arrow_typeof(sum(tbl.val + Int64(1))), sum(tbl.val + Int64(1))@0 as sum(tbl.val + Int64(1)), sum(tbl.val + Int64(2))@1 as sum(tbl.val + Int64(2))] +02)--ProjectionExec: expr=[sum(tbl.val)@0 + count(tbl.val)@1 as sum(tbl.val + Int64(1)), sum(tbl.val)@0 + 2 * count(tbl.val)@1 as sum(tbl.val + Int64(2))] +03)----AggregateExec: mode=Single, gby=[], aggr=[sum(tbl.val), count(tbl.val)] +04)------ProjectionExec: expr=[CAST(val@0 AS Int64) as __common_expr_2] +05)--------DataSourceExec: partitions=1, partition_sizes=[2] + +# Checks equivalent rewritten form (SUM + COUNT terms) matches transformed SUM semantics. 
+query RR +SELECT SUM(val) + 1 * COUNT(val), SUM(val) + 2 * COUNT(val) FROM tbl; +---- +8589934592 8589934594 + +query TT +EXPLAIN SELECT SUM(val) + 1 * COUNT(val), SUM(val) + 2 * COUNT(val) FROM tbl; +---- +logical_plan +01)Projection: __common_expr_1 + CAST(count(tbl.val) AS Decimal128(20, 0)) AS sum(tbl.val) + Int64(1) * count(tbl.val), __common_expr_1 AS sum(tbl.val) + CAST(Int64(2) * count(tbl.val) AS Decimal128(20, 0)) +02)--Projection: CAST(sum(tbl.val) AS Decimal128(20, 0)) AS __common_expr_1, count(tbl.val) +03)----Aggregate: groupBy=[[]], aggr=[[sum(CAST(tbl.val AS UInt64)), count(tbl.val)]] +04)------TableScan: tbl projection=[val] +physical_plan +01)ProjectionExec: expr=[__common_expr_1@0 + CAST(count(tbl.val)@1 AS Decimal128(20, 0)) as sum(tbl.val) + Int64(1) * count(tbl.val), __common_expr_1@0 + CAST(2 * count(tbl.val)@1 AS Decimal128(20, 0)) as sum(tbl.val) + Int64(2) * count(tbl.val)] +02)--ProjectionExec: expr=[CAST(sum(tbl.val)@0 AS Decimal128(20, 0)) as __common_expr_1, count(tbl.val)@1 as count(tbl.val)] +03)----AggregateExec: mode=Single, gby=[], aggr=[sum(tbl.val), count(tbl.val)] +04)------DataSourceExec: partitions=1, partition_sizes=[2] + +statement ok +DROP TABLE IF EXISTS tbl; + +statement ok +DROP TABLE sum_simplify_t; diff --git a/datafusion/sqllogictest/test_files/push_down_filter_sort_fetch 2.slt b/datafusion/sqllogictest/test_files/push_down_filter_sort_fetch 2.slt new file mode 100644 index 0000000000000..ab23fff030489 --- /dev/null +++ b/datafusion/sqllogictest/test_files/push_down_filter_sort_fetch 2.slt @@ -0,0 +1,55 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Tests for filter pushdown behavior with Sort + LIMIT (fetch). + +statement ok +CREATE TABLE t(id INT, value INT) AS VALUES +(1, 100), +(2, 200), +(3, 300), +(4, 400), +(5, 500); + +# Take the 3 smallest values (100, 200, 300), then filter value > 200. +query II +SELECT * FROM (SELECT * FROM t ORDER BY value LIMIT 3) sub WHERE sub.value > 200; +---- +3 300 + +# Take the 3 largest values (500, 400, 300), then filter value < 400. +query II +SELECT * FROM (SELECT * FROM t ORDER BY value DESC LIMIT 3) sub WHERE sub.value < 400; +---- +3 300 + +# The filter stays above the sort+fetch in the plan. +query TT +EXPLAIN SELECT * FROM (SELECT * FROM t ORDER BY value LIMIT 3) sub WHERE sub.value > 200; +---- +logical_plan +01)SubqueryAlias: sub +02)--Filter: t.value > Int32(200) +03)----Sort: t.value ASC NULLS LAST, fetch=3 +04)------TableScan: t projection=[id, value] +physical_plan +01)FilterExec: value@1 > 200 +02)--SortExec: TopK(fetch=3), expr=[value@1 ASC NULLS LAST], preserve_partitioning=[false] +03)----DataSourceExec: partitions=1, partition_sizes=[1] + +statement ok +DROP TABLE t; diff --git a/datafusion/sqllogictest/test_files/spark/array/array_contains 2.slt b/datafusion/sqllogictest/test_files/spark/array/array_contains 2.slt new file mode 100644 index 0000000000000..db9ac6b122e3f --- /dev/null +++ b/datafusion/sqllogictest/test_files/spark/array/array_contains 2.slt @@ -0,0 +1,140 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Tests for Spark-compatible array_contains function. +# Spark semantics: if element is found -> true; if not found and array has nulls -> null; if not found and no nulls -> false. + +### +### Scalar tests +### + +# Element found in array +query B +SELECT array_contains(array(1, 2, 3), 2); +---- +true + +# Element not found, no nulls in array +query B +SELECT array_contains(array(1, 2, 3), 4); +---- +false + +# Element not found, array has null elements -> null +query B +SELECT array_contains(array(1, NULL, 3), 2); +---- +NULL + +# Element found, array has null elements -> true (nulls don't matter) +query B +SELECT array_contains(array(1, NULL, 3), 1); +---- +true + +# Element found at the end, array has null elements -> true +query B +SELECT array_contains(array(1, NULL, 3), 3); +---- +true + +# Null array -> null +query B +SELECT array_contains(NULL, 1); +---- +NULL + +# Null element -> null +query B +SELECT array_contains(array(1, 2, 3), NULL); +---- +NULL + +# Empty array, element not found -> false +query B +SELECT array_contains(array(), 1); +---- +false + +# Array with only nulls, element not found -> null +query B +SELECT array_contains(array(NULL, NULL), 1); +---- +NULL + +# String array, element found +query B +SELECT array_contains(array('a', 'b', 'c'), 'b'); +---- +true + +# 
String array, element not found, no nulls +query B +SELECT array_contains(array('a', 'b', 'c'), 'd'); +---- +false + +# String array, element not found, has null +query B +SELECT array_contains(array('a', NULL, 'c'), 'd'); +---- +NULL + +### +### Columnar tests with a table +### + +statement ok +CREATE TABLE test_arrays AS VALUES + (1, make_array(1, 2, 3), 10), + (2, make_array(4, NULL, 6), 5), + (3, make_array(7, 8, 9), 10), + (4, NULL, 1), + (5, make_array(10, NULL, NULL), 10); + +# Column needle against column array +query IBB +SELECT column1, + array_contains(column2, column3), + array_contains(column2, 10) +FROM test_arrays +ORDER BY column1; +---- +1 false false +2 NULL NULL +3 false false +4 NULL NULL +5 true true + +statement ok +DROP TABLE test_arrays; + +### +### Nested array tests +### + +# Nested array element found +query B +SELECT array_contains(array(array(1, 2), array(3, 4)), array(3, 4)); +---- +true + +# Nested array element not found, no nulls +query B +SELECT array_contains(array(array(1, 2), array(3, 4)), array(5, 6)); +---- +false diff --git a/datafusion/sqllogictest/test_files/window_topk_pushdown 2.slt b/datafusion/sqllogictest/test_files/window_topk_pushdown 2.slt new file mode 100644 index 0000000000000..2c33566736745 --- /dev/null +++ b/datafusion/sqllogictest/test_files/window_topk_pushdown 2.slt @@ -0,0 +1,141 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Tests for the TopKRepartition optimizer rule. +# +# When a partitioned window function has ORDER BY + LIMIT, the optimizer +# can push a TopK (Sort with fetch) below the hash repartition to reduce +# the volume of data flowing through the shuffle. +# +# The optimization is correct when the hash partition key is a prefix of +# the sort key, because all rows with the same partition key land in the +# same output partition. + +statement ok +CREATE EXTERNAL TABLE employees ( + depname VARCHAR NOT NULL, + c2 TINYINT NOT NULL, + c3 SMALLINT NOT NULL, + c4 SMALLINT, + c5 INT, + c6 BIGINT NOT NULL, + c7 SMALLINT NOT NULL, + empno INT NOT NULL, + salary BIGINT UNSIGNED NOT NULL, + c10 VARCHAR NOT NULL, + c11 FLOAT NOT NULL, + c12 DOUBLE NOT NULL, + c13 VARCHAR NOT NULL, + hire_date DATE NOT NULL, + c15 TIMESTAMP NOT NULL +) +STORED AS CSV +LOCATION '../../testing/data/csv/aggregate_test_100_with_dates.csv' +OPTIONS ('format.has_header' 'true'); + +# Use multiple partitions to trigger hash repartitioning for the window function +statement ok +SET datafusion.execution.target_partitions = 4; + +### +### Results correctness: both enabled and disabled must produce the same output +### + +# Disabled: baseline results without the optimization +statement ok +SET datafusion.optimizer.enable_topk_repartition = false; + +query TI +SELECT depname, SUM(1) OVER (PARTITION BY depname ORDER BY empno ASC ROWS UNBOUNDED PRECEDING) as running_total +FROM employees +ORDER BY depname, empno +LIMIT 3; +---- +a 1 +a 2 +a 3 + +# Enabled: results must match baseline +statement ok +SET 
datafusion.optimizer.enable_topk_repartition = true; + +query TI +SELECT depname, SUM(1) OVER (PARTITION BY depname ORDER BY empno ASC ROWS UNBOUNDED PRECEDING) as running_total +FROM employees +ORDER BY depname, empno +LIMIT 3; +---- +a 1 +a 2 +a 3 + +### +### Plan shape: disabled should have TopK only above repartition +### + +statement ok +SET datafusion.optimizer.enable_topk_repartition = false; + +query TT +EXPLAIN SELECT depname, SUM(1) OVER (PARTITION BY depname ORDER BY empno ASC ROWS UNBOUNDED PRECEDING) as running_total +FROM employees +ORDER BY depname, empno +LIMIT 3; +---- +logical_plan +01)Projection: employees.depname, running_total +02)--Sort: employees.depname ASC NULLS LAST, employees.empno ASC NULLS LAST, fetch=3 +03)----Projection: employees.depname, sum(Int64(1)) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS running_total, employees.empno +04)------WindowAggr: windowExpr=[[sum(Int64(1)) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]] +05)--------TableScan: employees projection=[depname, empno] +physical_plan +01)ProjectionExec: expr=[depname@0 as depname, running_total@1 as running_total] +02)--SortPreservingMergeExec: [depname@0 ASC NULLS LAST, empno@2 ASC NULLS LAST], fetch=3 +03)----ProjectionExec: expr=[depname@0 as depname, sum(Int64(1)) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as running_total, empno@1 as empno] +04)------BoundedWindowAggExec: wdw=[sum(Int64(1)) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(Int64(1)) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT 
ROW], mode=[Sorted] +05)--------SortExec: TopK(fetch=3), expr=[depname@0 ASC NULLS LAST, empno@1 ASC NULLS LAST], preserve_partitioning=[true] +06)----------RepartitionExec: partitioning=Hash([depname@0], 4), input_partitions=1 +07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[depname, empno], file_type=csv, has_header=true + +### +### Plan shape: enabled should have TopK on BOTH sides of the repartition +### + +statement ok +SET datafusion.optimizer.enable_topk_repartition = true; + +query TT +EXPLAIN SELECT depname, SUM(1) OVER (PARTITION BY depname ORDER BY empno ASC ROWS UNBOUNDED PRECEDING) as running_total +FROM employees +ORDER BY depname, empno +LIMIT 3; +---- +logical_plan +01)Projection: employees.depname, running_total +02)--Sort: employees.depname ASC NULLS LAST, employees.empno ASC NULLS LAST, fetch=3 +03)----Projection: employees.depname, sum(Int64(1)) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS running_total, employees.empno +04)------WindowAggr: windowExpr=[[sum(Int64(1)) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]] +05)--------TableScan: employees projection=[depname, empno] +physical_plan +01)ProjectionExec: expr=[depname@0 as depname, running_total@1 as running_total] +02)--SortPreservingMergeExec: [depname@0 ASC NULLS LAST, empno@2 ASC NULLS LAST], fetch=3 +03)----ProjectionExec: expr=[depname@0 as depname, sum(Int64(1)) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as running_total, empno@1 as empno] +04)------BoundedWindowAggExec: wdw=[sum(Int64(1)) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(Int64(1)) PARTITION BY 
[employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +05)--------SortExec: TopK(fetch=3), expr=[depname@0 ASC NULLS LAST, empno@1 ASC NULLS LAST], preserve_partitioning=[true], sort_prefix=[depname@0 ASC NULLS LAST, empno@1 ASC NULLS LAST] +06)----------RepartitionExec: partitioning=Hash([depname@0], 4), input_partitions=1, maintains_sort_order=true +07)------------SortExec: TopK(fetch=3), expr=[depname@0 ASC NULLS LAST, empno@1 ASC NULLS LAST], preserve_partitioning=[true] +08)--------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[depname, empno], file_type=csv, has_header=true diff --git a/datafusion/substrait/src/logical_plan/consumer/expr/nested 2.rs b/datafusion/substrait/src/logical_plan/consumer/expr/nested 2.rs new file mode 100644 index 0000000000000..f94a701342826 --- /dev/null +++ b/datafusion/substrait/src/logical_plan/consumer/expr/nested 2.rs @@ -0,0 +1,151 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::logical_plan::consumer::SubstraitConsumer; +use datafusion::common::{DFSchema, not_impl_err, substrait_err}; +use datafusion::execution::FunctionRegistry; +use datafusion::logical_expr::Expr; +use substrait::proto::expression::Nested; +use substrait::proto::expression::nested::NestedType; + +/// Converts a Substrait [Nested] expression into a DataFusion [Expr]. +/// +/// Substrait Nested expressions represent complex type constructors (list, struct, map) +/// where elements are full expressions rather than just literals. This is used by +/// producers that emit `Nested { list: ... }` for array construction, as opposed to +/// `Literal { list: ... }` which only supports scalar values. +pub async fn from_nested( + consumer: &impl SubstraitConsumer, + nested: &Nested, + input_schema: &DFSchema, +) -> datafusion::common::Result { + let Some(nested_type) = &nested.nested_type else { + return substrait_err!("Nested expression requires a nested_type"); + }; + + match nested_type { + NestedType::List(list) => { + if list.values.is_empty() { + return substrait_err!( + "Empty Nested lists are not supported; use Literal.empty_list instead" + ); + } + + let mut args = Vec::with_capacity(list.values.len()); + for value in &list.values { + args.push(consumer.consume_expression(value, input_schema).await?); + } + + let make_array_udf = consumer.get_function_registry().udf("make_array")?; + Ok(Expr::ScalarFunction( + datafusion::logical_expr::expr::ScalarFunction::new_udf( + make_array_udf, + args, + ), + )) + } + NestedType::Struct(_) => { + not_impl_err!("Nested struct expressions are not yet supported") + } + NestedType::Map(_) => { + not_impl_err!("Nested map expressions are not yet supported") + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::logical_plan::consumer::utils::tests::test_consumer; + use substrait::proto::expression::Literal; + use substrait::proto::expression::nested::List; + use substrait::proto::{self, Expression}; + + fn 
make_i64_literal(value: i64) -> Expression { + Expression { + rex_type: Some(proto::expression::RexType::Literal(Literal { + nullable: false, + type_variation_reference: 0, + literal_type: Some(proto::expression::literal::LiteralType::I64(value)), + })), + } + } + + #[tokio::test] + async fn nested_list_with_literals() -> datafusion::common::Result<()> { + let consumer = test_consumer(); + let schema = DFSchema::empty(); + let nested = Nested { + nullable: false, + type_variation_reference: 0, + nested_type: Some(NestedType::List(List { + values: vec![ + make_i64_literal(1), + make_i64_literal(2), + make_i64_literal(3), + ], + })), + }; + + let expr = from_nested(&consumer, &nested, &schema).await?; + assert_eq!( + format!("{expr}"), + "make_array(Int64(1), Int64(2), Int64(3))" + ); + + Ok(()) + } + + #[tokio::test] + async fn nested_list_empty_rejected() -> datafusion::common::Result<()> { + let consumer = test_consumer(); + let schema = DFSchema::empty(); + let nested = Nested { + nullable: true, + type_variation_reference: 0, + nested_type: Some(NestedType::List(List { values: vec![] })), + }; + + let result = from_nested(&consumer, &nested, &schema).await; + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("Empty Nested lists are not supported") + ); + + Ok(()) + } + + #[tokio::test] + async fn nested_missing_type() -> datafusion::common::Result<()> { + let consumer = test_consumer(); + let schema = DFSchema::empty(); + let nested = Nested { + nullable: false, + type_variation_reference: 0, + nested_type: None, + }; + + let result = from_nested(&consumer, &nested, &schema).await; + assert!(result.is_err()); + assert!(result.unwrap_err().to_string().contains("nested_type")); + + Ok(()) + } +} diff --git a/datafusion/substrait/tests/testdata/test_plans/nested_list_expressions.substrait 2.json b/datafusion/substrait/tests/testdata/test_plans/nested_list_expressions.substrait 2.json new file mode 100644 index 
0000000000000..85a69c41c5eb1 --- /dev/null +++ b/datafusion/substrait/tests/testdata/test_plans/nested_list_expressions.substrait 2.json @@ -0,0 +1,77 @@ +{ + "relations": [ + { + "root": { + "input": { + "project": { + "common": { + "emit": { + "outputMapping": [2] + } + }, + "input": { + "read": { + "common": { + "direct": {} + }, + "baseSchema": { + "names": ["a", "b"], + "struct": { + "types": [ + { + "i32": { + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "i32": { + "nullability": "NULLABILITY_NULLABLE" + } + } + ], + "nullability": "NULLABILITY_REQUIRED" + } + }, + "namedTable": { + "names": ["DATA"] + } + } + }, + "expressions": [ + { + "nested": { + "nullable": false, + "list": { + "values": [ + { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": {} + } + }, + { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": {} + } + } + ] + } + } + } + ] + } + }, + "names": ["my_list"] + } + } + ] +} diff --git a/dev/changelog/52.2.0 2.md b/dev/changelog/52.2.0 2.md new file mode 100644 index 0000000000000..0801ec5e6a7ee --- /dev/null +++ b/dev/changelog/52.2.0 2.md @@ -0,0 +1,47 @@ + + +# Apache DataFusion 52.2.0 Changelog + +This release consists of 5 commits from 3 contributors. See credits at the end of this changelog for more information. + +See the [upgrade guide](https://datafusion.apache.org/library-user-guide/upgrading.html) for information on how to upgrade from previous versions. 
+ +**Other:** + +- [branch-52] fix: filter pushdown when merge filter (#20110) [#20289](https://github.com/apache/datafusion/pull/20289) (haohuaijin) +- [branch-52] FilterExec should remap indices of parent dynamic filters (#20286) [#20347](https://github.com/apache/datafusion/pull/20347) (alamb) +- [branch-52] fix: validate inter-file ordering in eq_properties() (#20329) [#20509](https://github.com/apache/datafusion/pull/20509) (alamb) +- Fix name tracker (#19856) [#20539](https://github.com/apache/datafusion/pull/20539) (hareshkh) +- [branch-52] fix: HashJoin panic with dictionary-encoded columns in multi-key joins (#20441) [#20512](https://github.com/apache/datafusion/pull/20512) (alamb) +- [branch-52] Fix incorrect `SortExec` removal before `AggregateExec` (#20247) [#20507](https://github.com/apache/datafusion/pull/20507) (alamb) +- [branch-52] Update aws-smithy, bytes and time for security audits [#20546](https://github.com/apache/datafusion/pull/20546) (alamb) +- [branch-52] Clamp early aggregation emit to the sort boundary when using partial group ordering (#20446) [#20558](https://github.com/apache/datafusion/pull/20558) (alamb) + +## Credits + +Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor. + +``` + 3 Andrew Lamb + 1 Haresh Khanna + 1 Huaijin +``` + +Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release. diff --git a/dev/changelog/52.3.0 2.md b/dev/changelog/52.3.0 2.md new file mode 100644 index 0000000000000..ed505b7fc2d0a --- /dev/null +++ b/dev/changelog/52.3.0 2.md @@ -0,0 +1,50 @@ + + +# Apache DataFusion 52.3.0 Changelog + +This release consists of 7 commits from 4 contributors. See credits at the end of this changelog for more information. + +See the [upgrade guide](https://datafusion.apache.org/library-user-guide/upgrading.html) for information on how to upgrade from previous versions. 
+ +**Performance related:** + +- [branch-52] perf: sort replace free()->try_grow() pattern with try_resize() to reduce memory pool interactions [#20732](https://github.com/apache/datafusion/pull/20732) (mbutrovich) + +**Other:** + +- [branch-52] Backport fix: SortMergeJoin don't wait for all input before emitting #20482 [#20699](https://github.com/apache/datafusion/pull/20699) (mbutrovich) +- [branch-52] Fix Arrow Spill Underrun (#20159) [#20684](https://github.com/apache/datafusion/pull/20684) (hareshkh) +- [branch-52] Fix constant value from stats (#20042) [#20709](https://github.com/apache/datafusion/pull/20709) (alamb) +- [branch-52] fix: `HashJoin` panic with String dictionary keys (don't flatten keys) (#20505) [#20708](https://github.com/apache/datafusion/pull/20708) (alamb) +- [branch-52] FFI_TableOptions are using default values only [#20705](https://github.com/apache/datafusion/pull/20705) (timsaucer) +- [branch-52] Fix repartition from dropping data when spilling (#20672) [#20777](https://github.com/apache/datafusion/pull/20777) (alamb) + +## Credits + +Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor. + +``` + 3 Andrew Lamb + 2 Matt Butrovich + 1 Haresh Khanna + 1 Tim Saucer +``` + +Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release. diff --git a/docs/source/library-user-guide/upgrading/54.0.0 2.md b/docs/source/library-user-guide/upgrading/54.0.0 2.md new file mode 100644 index 0000000000000..77b4fb6f71a35 --- /dev/null +++ b/docs/source/library-user-guide/upgrading/54.0.0 2.md @@ -0,0 +1,124 @@ + + +# Upgrade Guides + +## DataFusion 54.0.0 + +**Note:** DataFusion `54.0.0` has not been released yet. The information provided +in this section pertains to features and changes that have already been merged +to the main branch and are awaiting release in this version. 
+ +### `ExecutionPlan::apply_expressions` is now a required method + +`apply_expressions` has been added as a **required** method on the `ExecutionPlan` trait (no default implementation). The same applies to the `FileSource` and `DataSource` traits. Any custom implementation of these traits must now implement `apply_expressions`. + +**Who is affected:** + +- Users who implement custom `ExecutionPlan` nodes +- Users who implement custom `FileSource` or `DataSource` sources + +**Migration guide:** + +Add `apply_expressions` to your implementation. Call `f` on each top-level `PhysicalExpr` your node owns, using `visit_sibling` to correctly propagate `TreeNodeRecursion`: + +**Node with no expressions:** + +```rust,ignore +fn apply_expressions( + &self, + _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result, +) -> Result { + Ok(TreeNodeRecursion::Continue) +} +``` + +**Node with a single expression:** + +```rust,ignore +fn apply_expressions( + &self, + f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result, +) -> Result { + f(self.predicate.as_ref()) +} +``` + +**Node with multiple expressions:** + +```rust,ignore +fn apply_expressions( + &self, + f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result, +) -> Result { + let mut tnr = TreeNodeRecursion::Continue; + for expr in &self.expressions { + tnr = tnr.visit_sibling(|| f(expr.as_ref()))?; + } + Ok(tnr) +} +``` + +**Node whose only expressions are in `output_ordering()` (e.g. a synthetic test node with no owned expression fields):** + +````rust,ignore +fn apply_expressions( + &self, + f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result, +) -> Result { + let mut tnr = TreeNodeRecursion::Continue; + if let Some(ordering) = self.cache.output_ordering() { + for sort_expr in ordering { + tnr = tnr.visit_sibling(|| f(sort_expr.expr.as_ref()))?; + } + } + Ok(tnr) +} + +### `ExecutionPlan::partition_statistics` now returns `Arc` + +`ExecutionPlan::partition_statistics` now returns `Result>` instead of `Result`. 
This avoids cloning `Statistics` when it is shared across multiple consumers. + +**Before:** + +```rust,ignore +fn partition_statistics(&self, partition: Option) -> Result { + Ok(Statistics::new_unknown(&self.schema())) +} +```` + +**After:** + +```rust,ignore +fn partition_statistics(&self, partition: Option) -> Result> { + Ok(Arc::new(Statistics::new_unknown(&self.schema()))) +} +``` + +If you need an owned `Statistics` value (e.g. to mutate it), use `Arc::unwrap_or_clone`: + +```rust,ignore +// If you previously consumed the Statistics directly: +let stats = plan.partition_statistics(None)?; +stats.column_statistics[0].min_value = ...; + +// Now unwrap the Arc first: +let mut stats = Arc::unwrap_or_clone(plan.partition_statistics(None)?); +stats.column_statistics[0].min_value = ...; +``` From 79bdcac9838b654f5f5b17ea10b06192cea5e132 Mon Sep 17 00:00:00 2001 From: Shiv Bhatia Date: Sat, 21 Mar 2026 11:29:22 +0000 Subject: [PATCH 4/7] Remove spurious macOS duplicate files introduced by bad merge --- .github/workflows/codeql 2.yml | 55 --- AGENTS 2.md | 34 -- CLAUDE 2.md | 1 - benchmarks/src/util/latency_object_store 2.rs | 157 -------- ...nsumers_with_mem_pool_type@no_track 2.snap | 23 -- ...y_consumers_with_mem_pool_type@top2 2.snap | 26 -- ..._with_unbounded_memory_pool@default 2.snap | 36 -- datafusion/common/benches/stats_merge 2.rs | 85 ---- datafusion/common/src/utils/aggregate 2.rs | 149 ------- datafusion/core/benches/topk_repartition 2.rs | 90 ----- .../parquet_struct_filter_pushdown 2.rs | 353 ----------------- .../src/test_data/ndv_test 2.parquet | Bin 1141 -> 0 bytes datafusion/ffi/tests/ffi_execution_plan 2.rs | 108 ----- .../benches/approx_distinct 2.rs | 128 ------ .../benches/array_concat 2.rs | 94 ----- .../benches/array_to_string 2.rs | 188 --------- .../linear_aggregates 2.rs | 229 ----------- .../benches/compare_nested 2.rs | 74 ---- .../src/hash_join_buffering 2.rs | 103 ----- .../src/topk_repartition 2.rs | 368 ------------------ 
.../src/function/array/array_contains 2.rs | 168 -------- datafusion/sqllogictest/src/test_file 2.rs | 186 --------- .../test_files/aggregates_simplify 2.slt | 358 ----------------- .../push_down_filter_sort_fetch 2.slt | 55 --- .../spark/array/array_contains 2.slt | 140 ------- .../test_files/window_topk_pushdown 2.slt | 141 ------- .../logical_plan/consumer/expr/nested 2.rs | 151 ------- .../nested_list_expressions.substrait 2.json | 77 ---- dev/changelog/52.2.0 2.md | 47 --- dev/changelog/52.3.0 2.md | 50 --- .../library-user-guide/upgrading/54.0.0 2.md | 124 ------ 31 files changed, 3798 deletions(-) delete mode 100644 .github/workflows/codeql 2.yml delete mode 100644 AGENTS 2.md delete mode 120000 CLAUDE 2.md delete mode 100644 benchmarks/src/util/latency_object_store 2.rs delete mode 100644 datafusion-cli/tests/snapshots/cli_top_memory_consumers_with_mem_pool_type@no_track 2.snap delete mode 100644 datafusion-cli/tests/snapshots/cli_top_memory_consumers_with_mem_pool_type@top2 2.snap delete mode 100644 datafusion-cli/tests/snapshots/cli_with_unbounded_memory_pool@default 2.snap delete mode 100644 datafusion/common/benches/stats_merge 2.rs delete mode 100644 datafusion/common/src/utils/aggregate 2.rs delete mode 100644 datafusion/core/benches/topk_repartition 2.rs delete mode 100644 datafusion/datasource-parquet/benches/parquet_struct_filter_pushdown 2.rs delete mode 100644 datafusion/datasource-parquet/src/test_data/ndv_test 2.parquet delete mode 100644 datafusion/ffi/tests/ffi_execution_plan 2.rs delete mode 100644 datafusion/functions-aggregate/benches/approx_distinct 2.rs delete mode 100644 datafusion/functions-nested/benches/array_concat 2.rs delete mode 100644 datafusion/functions-nested/benches/array_to_string 2.rs delete mode 100644 datafusion/optimizer/src/simplify_expressions/linear_aggregates 2.rs delete mode 100644 datafusion/physical-expr-common/benches/compare_nested 2.rs delete mode 100644 datafusion/physical-optimizer/src/hash_join_buffering 
2.rs delete mode 100644 datafusion/physical-optimizer/src/topk_repartition 2.rs delete mode 100644 datafusion/spark/src/function/array/array_contains 2.rs delete mode 100644 datafusion/sqllogictest/src/test_file 2.rs delete mode 100644 datafusion/sqllogictest/test_files/aggregates_simplify 2.slt delete mode 100644 datafusion/sqllogictest/test_files/push_down_filter_sort_fetch 2.slt delete mode 100644 datafusion/sqllogictest/test_files/spark/array/array_contains 2.slt delete mode 100644 datafusion/sqllogictest/test_files/window_topk_pushdown 2.slt delete mode 100644 datafusion/substrait/src/logical_plan/consumer/expr/nested 2.rs delete mode 100644 datafusion/substrait/tests/testdata/test_plans/nested_list_expressions.substrait 2.json delete mode 100644 dev/changelog/52.2.0 2.md delete mode 100644 dev/changelog/52.3.0 2.md delete mode 100644 docs/source/library-user-guide/upgrading/54.0.0 2.md diff --git a/.github/workflows/codeql 2.yml b/.github/workflows/codeql 2.yml deleted file mode 100644 index d42c2b4aa8d39..0000000000000 --- a/.github/workflows/codeql 2.yml +++ /dev/null @@ -1,55 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# - -name: "CodeQL" - -on: - push: - branches: [ "main" ] - pull_request: - branches: [ "main" ] - schedule: - - cron: '16 4 * * 1' - -permissions: - contents: read - -jobs: - analyze: - name: Analyze Actions - runs-on: ubuntu-latest - permissions: - contents: read - security-events: write - packages: read - - steps: - - name: Checkout repository - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 - with: - persist-credentials: false - - - name: Initialize CodeQL - uses: github/codeql-action/init@b1bff81932f5cdfc8695c7752dcee935dcd061c8 # v4 - with: - languages: actions - - - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@b1bff81932f5cdfc8695c7752dcee935dcd061c8 # v4 - with: - category: "/language:actions" diff --git a/AGENTS 2.md b/AGENTS 2.md deleted file mode 100644 index eeedbd8bc45ec..0000000000000 --- a/AGENTS 2.md +++ /dev/null @@ -1,34 +0,0 @@ -# Agent Guidelines for Apache DataFusion - -## Developer Documentation - -- [Contributor Guide](docs/source/contributor-guide/index.md) -- [Architecture Guide](docs/source/contributor-guide/architecture.md) - -## Before Committing - -Before committing any changes, you **must** run the following checks and fix any issues: - -```bash -cargo fmt --all -cargo clippy --all-targets --all-features -- -D warnings -``` - -- `cargo fmt` ensures consistent code formatting across the project. -- `cargo clippy` catches common mistakes and enforces idiomatic Rust patterns. All warnings must be resolved (treated as errors via `-D warnings`). - -Do not commit code that fails either of these checks. 
- -## Testing - -Run relevant tests before submitting changes: - -```bash -cargo test --all-features -``` - -For SQL logic tests: - -```bash -cargo test -p datafusion-sqllogictest -``` diff --git a/CLAUDE 2.md b/CLAUDE 2.md deleted file mode 120000 index 47dc3e3d863cf..0000000000000 --- a/CLAUDE 2.md +++ /dev/null @@ -1 +0,0 @@ -AGENTS.md \ No newline at end of file diff --git a/benchmarks/src/util/latency_object_store 2.rs b/benchmarks/src/util/latency_object_store 2.rs deleted file mode 100644 index 9ef8d1b78b751..0000000000000 --- a/benchmarks/src/util/latency_object_store 2.rs +++ /dev/null @@ -1,157 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! An ObjectStore wrapper that adds simulated S3-like latency to get and list operations. -//! -//! Cycles through a fixed latency distribution inspired by real S3 performance: -//! - P50: ~30ms -//! - P75-P90: ~100-120ms -//! 
- P99: ~150-200ms - -use std::fmt; -use std::sync::atomic::{AtomicUsize, Ordering}; -use std::time::Duration; - -use async_trait::async_trait; -use futures::StreamExt; -use futures::stream::BoxStream; -use object_store::path::Path; -use object_store::{ - CopyOptions, GetOptions, GetResult, ListResult, MultipartUpload, ObjectMeta, - ObjectStore, PutMultipartOptions, PutOptions, PutPayload, PutResult, Result, -}; - -/// GET latency distribution, inspired by S3 latencies. -/// Deterministic but shuffled to avoid artificial patterns. -/// 20 values: 11x P50 (~25-35ms), 5x P75-P90 (~70-110ms), 2x P95 (~120-150ms), 2x P99 (~180-200ms) -/// Sorted: 25,25,28,28,30,30,30,30,32,32,35, 70,85,100,100,110, 130,150, 180,200 -/// P50≈32ms, P90≈110ms, P99≈200ms -const GET_LATENCIES_MS: &[u64] = &[ - 30, 100, 25, 85, 32, 200, 28, 130, 35, 70, 30, 150, 30, 110, 28, 180, 32, 25, 100, 30, -]; - -/// LIST latency distribution, generally higher than GET. -/// 20 values: 11x P50 (~40-70ms), 5x P75-P90 (~120-180ms), 2x P95 (~200-250ms), 2x P99 (~300-400ms) -/// Sorted: 40,40,50,50,55,55,60,60,65,65,70, 120,140,160,160,180, 210,250, 300,400 -/// P50≈65ms, P90≈180ms, P99≈400ms -const LIST_LATENCIES_MS: &[u64] = &[ - 55, 160, 40, 140, 65, 400, 50, 210, 70, 120, 60, 250, 55, 180, 50, 300, 65, 40, 160, - 60, -]; - -/// An ObjectStore wrapper that injects simulated latency on get and list calls. 
-#[derive(Debug)] -pub struct LatencyObjectStore { - inner: T, - get_counter: AtomicUsize, - list_counter: AtomicUsize, -} - -impl LatencyObjectStore { - pub fn new(inner: T) -> Self { - Self { - inner, - get_counter: AtomicUsize::new(0), - list_counter: AtomicUsize::new(0), - } - } - - fn next_get_latency(&self) -> Duration { - let idx = - self.get_counter.fetch_add(1, Ordering::Relaxed) % GET_LATENCIES_MS.len(); - Duration::from_millis(GET_LATENCIES_MS[idx]) - } - - fn next_list_latency(&self) -> Duration { - let idx = - self.list_counter.fetch_add(1, Ordering::Relaxed) % LIST_LATENCIES_MS.len(); - Duration::from_millis(LIST_LATENCIES_MS[idx]) - } -} - -impl fmt::Display for LatencyObjectStore { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "LatencyObjectStore({})", self.inner) - } -} - -#[async_trait] -impl ObjectStore for LatencyObjectStore { - async fn put_opts( - &self, - location: &Path, - payload: PutPayload, - opts: PutOptions, - ) -> Result { - self.inner.put_opts(location, payload, opts).await - } - - async fn put_multipart_opts( - &self, - location: &Path, - opts: PutMultipartOptions, - ) -> Result> { - self.inner.put_multipart_opts(location, opts).await - } - - async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { - tokio::time::sleep(self.next_get_latency()).await; - self.inner.get_opts(location, options).await - } - - async fn get_ranges( - &self, - location: &Path, - ranges: &[std::ops::Range], - ) -> Result> { - tokio::time::sleep(self.next_get_latency()).await; - self.inner.get_ranges(location, ranges).await - } - - fn delete_stream( - &self, - locations: BoxStream<'static, Result>, - ) -> BoxStream<'static, Result> { - self.inner.delete_stream(locations) - } - - fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result> { - let latency = self.next_list_latency(); - let stream = self.inner.list(prefix); - futures::stream::once(async move { - tokio::time::sleep(latency).await; - 
futures::stream::empty() - }) - .flatten() - .chain(stream) - .boxed() - } - - async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { - tokio::time::sleep(self.next_list_latency()).await; - self.inner.list_with_delimiter(prefix).await - } - - async fn copy_opts( - &self, - from: &Path, - to: &Path, - options: CopyOptions, - ) -> Result<()> { - self.inner.copy_opts(from, to, options).await - } -} diff --git a/datafusion-cli/tests/snapshots/cli_top_memory_consumers_with_mem_pool_type@no_track 2.snap b/datafusion-cli/tests/snapshots/cli_top_memory_consumers_with_mem_pool_type@no_track 2.snap deleted file mode 100644 index 25267ea1617e5..0000000000000 --- a/datafusion-cli/tests/snapshots/cli_top_memory_consumers_with_mem_pool_type@no_track 2.snap +++ /dev/null @@ -1,23 +0,0 @@ ---- -source: datafusion-cli/tests/cli_integration.rs -info: - program: datafusion-cli - args: - - "--memory-limit" - - 10M - - "--mem-pool-type" - - fair - - "--command" - - "select * from generate_series(1,500000) as t1(v1) order by v1;" - - "--top-memory-consumers" - - "0" ---- -success: false -exit_code: 1 ------ stdout ----- -[CLI_VERSION] -Error: Not enough memory to continue external sort. Consider increasing the memory limit config: 'datafusion.runtime.memory_limit', or decreasing the config: 'datafusion.execution.sort_spill_reservation_bytes'. 
-caused by -Resources exhausted: Failed to allocate - ------ stderr ----- diff --git a/datafusion-cli/tests/snapshots/cli_top_memory_consumers_with_mem_pool_type@top2 2.snap b/datafusion-cli/tests/snapshots/cli_top_memory_consumers_with_mem_pool_type@top2 2.snap deleted file mode 100644 index 6515050047107..0000000000000 --- a/datafusion-cli/tests/snapshots/cli_top_memory_consumers_with_mem_pool_type@top2 2.snap +++ /dev/null @@ -1,26 +0,0 @@ ---- -source: datafusion-cli/tests/cli_integration.rs -info: - program: datafusion-cli - args: - - "--memory-limit" - - 10M - - "--mem-pool-type" - - fair - - "--command" - - "select * from generate_series(1,500000) as t1(v1) order by v1;" - - "--top-memory-consumers" - - "2" ---- -success: false -exit_code: 1 ------ stdout ----- -[CLI_VERSION] -Error: Not enough memory to continue external sort. Consider increasing the memory limit config: 'datafusion.runtime.memory_limit', or decreasing the config: 'datafusion.execution.sort_spill_reservation_bytes'. -caused by -Resources exhausted: Additional allocation failed for ExternalSorter[0] with top memory consumers (across reservations) as: - Consumer(can spill: bool) consumed XB, peak XB, - Consumer(can spill: bool) consumed XB, peak XB. 
-Error: Failed to allocate - ------ stderr ----- diff --git a/datafusion-cli/tests/snapshots/cli_with_unbounded_memory_pool@default 2.snap b/datafusion-cli/tests/snapshots/cli_with_unbounded_memory_pool@default 2.snap deleted file mode 100644 index 7bdcd63dc7be6..0000000000000 --- a/datafusion-cli/tests/snapshots/cli_with_unbounded_memory_pool@default 2.snap +++ /dev/null @@ -1,36 +0,0 @@ ---- -source: datafusion-cli/tests/cli_integration.rs -info: - program: datafusion-cli - args: - - "--maxrows" - - "10" - - "--command" - - "select * from generate_series(1,500000) as t1(v1) order by v1;" ---- -success: true -exit_code: 0 ------ stdout ----- -[CLI_VERSION] -+----+ -| v1 | -+----+ -| 1 | -| 2 | -| 3 | -| 4 | -| 5 | -| 6 | -| 7 | -| 8 | -| 9 | -| 10 | -| . | -| . | -| . | -+----+ -500000 row(s) fetched. (First 10 displayed. Use --maxrows to adjust) -[ELAPSED] - - ------ stderr ----- diff --git a/datafusion/common/benches/stats_merge 2.rs b/datafusion/common/benches/stats_merge 2.rs deleted file mode 100644 index 73229b6379360..0000000000000 --- a/datafusion/common/benches/stats_merge 2.rs +++ /dev/null @@ -1,85 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Benchmark for `Statistics::try_merge_iter`. 
- -use std::sync::Arc; - -use arrow::datatypes::{DataType, Field, Schema}; -use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main}; -use datafusion_common::stats::Precision; -use datafusion_common::{ColumnStatistics, ScalarValue, Statistics}; - -/// Build a vector of `n` with `num_cols` columns -fn make_stats(n: usize, num_cols: usize) -> Vec { - (0..n) - .map(|i| { - let mut stats = Statistics::default() - .with_num_rows(Precision::Exact(100 + i)) - .with_total_byte_size(Precision::Exact(8000 + i * 80)); - for c in 0..num_cols { - let base = (i * num_cols + c) as i64; - stats = stats.add_column_statistics( - ColumnStatistics::new_unknown() - .with_null_count(Precision::Exact(i)) - .with_min_value(Precision::Exact(ScalarValue::Int64(Some(base)))) - .with_max_value(Precision::Exact(ScalarValue::Int64(Some( - base + 1000, - )))) - .with_sum_value(Precision::Exact(ScalarValue::Int64(Some( - base * 100, - )))), - ); - } - stats - }) - .collect() -} - -fn bench_stats_merge(c: &mut Criterion) { - let mut group = c.benchmark_group("stats_merge"); - - for &num_partitions in &[10, 100, 500] { - for &num_cols in &[1, 5, 20] { - let items = make_stats(num_partitions, num_cols); - let schema = Arc::new(Schema::new( - (0..num_cols) - .map(|i| Field::new(format!("col{i}"), DataType::Int64, true)) - .collect::>(), - )); - - let param = format!("{num_partitions}parts_{num_cols}cols"); - - group.bench_with_input( - BenchmarkId::new("try_merge_iter", ¶m), - &(&items, &schema), - |b, (items, schema)| { - b.iter(|| { - std::hint::black_box( - Statistics::try_merge_iter(*items, schema).unwrap(), - ); - }); - }, - ); - } - } - - group.finish(); -} - -criterion_group!(benches, bench_stats_merge); -criterion_main!(benches); diff --git a/datafusion/common/src/utils/aggregate 2.rs b/datafusion/common/src/utils/aggregate 2.rs deleted file mode 100644 index 43bc0676b2d3c..0000000000000 --- a/datafusion/common/src/utils/aggregate 2.rs +++ /dev/null @@ -1,149 +0,0 @@ -// 
Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Scalar-level aggregation utilities for statistics merging. -//! -//! Provides a cheap pairwise [`ScalarValue`] addition that directly -//! extracts inner primitive values, avoiding the expensive -//! `ScalarValue::add` path (which round-trips through Arrow arrays). -use arrow::datatypes::i256; - -use crate::stats::Precision; -use crate::{Result, ScalarValue}; - -/// Saturating addition for [`i256`] (which lacks a built-in -/// `saturating_add`). Returns `i256::MAX` on positive overflow and -/// `i256::MIN` on negative overflow. -#[inline] -fn i256_saturating_add(a: i256, b: i256) -> i256 { - match a.checked_add(b) { - Some(sum) => sum, - None => { - // If b is non-negative the overflow is positive, otherwise - // negative. - if b >= i256::ZERO { - i256::MAX - } else { - i256::MIN - } - } - } -} - -/// Add two [`ScalarValue`]s by directly extracting and adding their -/// inner primitive values. -/// -/// This avoids `ScalarValue::add` which converts both operands to -/// single-element Arrow arrays, runs the `add_wrapping` kernel, and -/// converts the result back — 3 heap allocations per call. -/// -/// For non-primitive types, falls back to `ScalarValue::add`. 
-pub(crate) fn scalar_add(lhs: &ScalarValue, rhs: &ScalarValue) -> Result { - macro_rules! add_int { - ($lhs:expr, $rhs:expr, $VARIANT:ident) => { - match ($lhs, $rhs) { - (ScalarValue::$VARIANT(Some(a)), ScalarValue::$VARIANT(Some(b))) => { - Ok(ScalarValue::$VARIANT(Some(a.saturating_add(*b)))) - } - (ScalarValue::$VARIANT(None), other) - | (other, ScalarValue::$VARIANT(None)) => Ok(other.clone()), - _ => unreachable!(), - } - }; - } - - macro_rules! add_decimal { - ($lhs:expr, $rhs:expr, $VARIANT:ident) => { - match ($lhs, $rhs) { - ( - ScalarValue::$VARIANT(Some(a), p, s), - ScalarValue::$VARIANT(Some(b), _, _), - ) => Ok(ScalarValue::$VARIANT(Some(a.saturating_add(*b)), *p, *s)), - (ScalarValue::$VARIANT(None, _, _), other) - | (other, ScalarValue::$VARIANT(None, _, _)) => Ok(other.clone()), - _ => unreachable!(), - } - }; - } - - macro_rules! add_float { - ($lhs:expr, $rhs:expr, $VARIANT:ident) => { - match ($lhs, $rhs) { - (ScalarValue::$VARIANT(Some(a)), ScalarValue::$VARIANT(Some(b))) => { - Ok(ScalarValue::$VARIANT(Some(*a + *b))) - } - (ScalarValue::$VARIANT(None), other) - | (other, ScalarValue::$VARIANT(None)) => Ok(other.clone()), - _ => unreachable!(), - } - }; - } - - match lhs { - ScalarValue::Int8(_) => add_int!(lhs, rhs, Int8), - ScalarValue::Int16(_) => add_int!(lhs, rhs, Int16), - ScalarValue::Int32(_) => add_int!(lhs, rhs, Int32), - ScalarValue::Int64(_) => add_int!(lhs, rhs, Int64), - ScalarValue::UInt8(_) => add_int!(lhs, rhs, UInt8), - ScalarValue::UInt16(_) => add_int!(lhs, rhs, UInt16), - ScalarValue::UInt32(_) => add_int!(lhs, rhs, UInt32), - ScalarValue::UInt64(_) => add_int!(lhs, rhs, UInt64), - ScalarValue::Float16(_) => add_float!(lhs, rhs, Float16), - ScalarValue::Float32(_) => add_float!(lhs, rhs, Float32), - ScalarValue::Float64(_) => add_float!(lhs, rhs, Float64), - ScalarValue::Decimal32(_, _, _) => add_decimal!(lhs, rhs, Decimal32), - ScalarValue::Decimal64(_, _, _) => add_decimal!(lhs, rhs, Decimal64), - 
ScalarValue::Decimal128(_, _, _) => add_decimal!(lhs, rhs, Decimal128), - ScalarValue::Decimal256(_, _, _) => match (lhs, rhs) { - ( - ScalarValue::Decimal256(Some(a), p, s), - ScalarValue::Decimal256(Some(b), _, _), - ) => Ok(ScalarValue::Decimal256( - Some(i256_saturating_add(*a, *b)), - *p, - *s, - )), - (ScalarValue::Decimal256(None, _, _), other) - | (other, ScalarValue::Decimal256(None, _, _)) => Ok(other.clone()), - _ => unreachable!(), - }, - // Fallback: use the existing ScalarValue::add - _ => lhs.add(rhs), - } -} - -/// [`Precision`]-aware sum of two [`ScalarValue`] precisions using -/// cheap direct addition via [`scalar_add`]. -/// -/// Mirrors the semantics of `Precision::add` but avoids -/// the expensive `ScalarValue::add` round-trip through Arrow arrays. -pub(crate) fn precision_add( - lhs: &Precision, - rhs: &Precision, -) -> Precision { - match (lhs, rhs) { - (Precision::Exact(a), Precision::Exact(b)) => scalar_add(a, b) - .map(Precision::Exact) - .unwrap_or(Precision::Absent), - (Precision::Inexact(a), Precision::Exact(b)) - | (Precision::Exact(a), Precision::Inexact(b)) - | (Precision::Inexact(a), Precision::Inexact(b)) => scalar_add(a, b) - .map(Precision::Inexact) - .unwrap_or(Precision::Absent), - (_, _) => Precision::Absent, - } -} diff --git a/datafusion/core/benches/topk_repartition 2.rs b/datafusion/core/benches/topk_repartition 2.rs deleted file mode 100644 index e1f14e4aaa633..0000000000000 --- a/datafusion/core/benches/topk_repartition 2.rs +++ /dev/null @@ -1,90 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Benchmark for the TopKRepartition optimizer rule. -//! -//! Measures the benefit of pushing TopK (Sort with fetch) below hash -//! repartition when running partitioned window functions with LIMIT. - -mod data_utils; - -use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main}; -use data_utils::create_table_provider; -use datafusion::prelude::{SessionConfig, SessionContext}; -use parking_lot::Mutex; -use std::hint::black_box; -use std::sync::Arc; -use tokio::runtime::Runtime; - -#[expect(clippy::needless_pass_by_value)] -fn query(ctx: Arc>, rt: &Runtime, sql: &str) { - let df = rt.block_on(ctx.lock().sql(sql)).unwrap(); - black_box(rt.block_on(df.collect()).unwrap()); -} - -fn create_context( - partitions_len: usize, - target_partitions: usize, - enable_topk_repartition: bool, -) -> Arc> { - let array_len = 1024 * 1024; - let batch_size = 8 * 1024; - let mut config = SessionConfig::new().with_target_partitions(target_partitions); - config.options_mut().optimizer.enable_topk_repartition = enable_topk_repartition; - let ctx = SessionContext::new_with_config(config); - let rt = Runtime::new().unwrap(); - rt.block_on(async { - let provider = - create_table_provider(partitions_len, array_len, batch_size).unwrap(); - ctx.register_table("t", provider).unwrap(); - }); - Arc::new(Mutex::new(ctx)) -} - -fn criterion_benchmark(c: &mut Criterion) { - let rt = Runtime::new().unwrap(); - - let limits = [10, 1_000, 10_000, 100_000]; - let scans = 16; - let target_partitions = 4; - - let group = 
format!("topk_repartition_{scans}_to_{target_partitions}"); - let mut group = c.benchmark_group(group); - for limit in limits { - let sql = format!( - "SELECT \ - SUM(f64) OVER (PARTITION BY u64_narrow ORDER BY u64_wide ROWS UNBOUNDED PRECEDING) \ - FROM t \ - ORDER BY u64_narrow, u64_wide \ - LIMIT {limit}" - ); - - let ctx_disabled = create_context(scans, target_partitions, false); - group.bench_function(BenchmarkId::new("disabled", limit), |b| { - b.iter(|| query(ctx_disabled.clone(), &rt, &sql)) - }); - - let ctx_enabled = create_context(scans, target_partitions, true); - group.bench_function(BenchmarkId::new("enabled", limit), |b| { - b.iter(|| query(ctx_enabled.clone(), &rt, &sql)) - }); - } - group.finish(); -} - -criterion_group!(benches, criterion_benchmark); -criterion_main!(benches); diff --git a/datafusion/datasource-parquet/benches/parquet_struct_filter_pushdown 2.rs b/datafusion/datasource-parquet/benches/parquet_struct_filter_pushdown 2.rs deleted file mode 100644 index b52408d4222d8..0000000000000 --- a/datafusion/datasource-parquet/benches/parquet_struct_filter_pushdown 2.rs +++ /dev/null @@ -1,353 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! 
Benchmarks for struct field filter pushdown in Parquet. -//! -//! Compares scanning with vs without row-level filter pushdown for -//! predicates on struct sub-fields (e.g. `get_field(s, 'id') = 42`). -//! -//! The dataset schema (in SQL-like notation): -//! -//! ```sql -//! CREATE TABLE t ( -//! id INT, -- top-level id, useful for correctness checks -//! large_string TEXT, -- wide column so SELECT * is expensive -//! s STRUCT< -//! id: INT, -- mirrors top-level id -//! large_string: TEXT -- wide sub-field; pushdown with proper projection -//! -- should avoid reading this when filtering on s.id -//! > -//! ); -//! ``` -//! -//! Benchmark queries: -//! -//! 1. `SELECT * FROM t WHERE get_field(s, 'id') = 42` -//! - no pushdown vs. row-level filter pushdown -//! 2. `SELECT * FROM t WHERE get_field(s, 'id') = id` -//! - cross-column predicate; no pushdown vs. row-level filter pushdown -//! 3. `SELECT id FROM t WHERE get_field(s, 'id') = 42` -//! - narrow projection; pushdown should avoid reading s.large_string - -use std::path::{Path, PathBuf}; -use std::sync::{Arc, LazyLock}; - -use arrow::array::{BooleanArray, Int32Array, RecordBatch, StringBuilder, StructArray}; -use arrow::datatypes::{DataType, Field, Fields, Schema, SchemaRef}; -use criterion::{Criterion, Throughput, criterion_group, criterion_main}; -use datafusion_common::ScalarValue; -use datafusion_datasource_parquet::{ParquetFileMetrics, build_row_filter}; -use datafusion_expr::{Expr, col}; -use datafusion_physical_expr::planner::logical2physical; -use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; -use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; -use parquet::arrow::{ArrowWriter, ProjectionMask}; -use parquet::file::properties::WriterProperties; -use tempfile::TempDir; - -const ROW_GROUP_ROW_COUNT: usize = 10_000; -const TOTAL_ROW_GROUPS: usize = 10; -const TOTAL_ROWS: usize = ROW_GROUP_ROW_COUNT * TOTAL_ROW_GROUPS; -/// Only one row group will contain the target value. 
-const TARGET_VALUE: i32 = 42; -const ID_COLUMN_NAME: &str = "id"; -const LARGE_STRING_COLUMN_NAME: &str = "large_string"; -const STRUCT_COLUMN_NAME: &str = "s"; -// Large string payload to emphasize decoding overhead when pushdown is disabled. -const LARGE_STRING_LEN: usize = 8 * 1024; - -struct BenchmarkDataset { - _tempdir: TempDir, - file_path: PathBuf, -} - -impl BenchmarkDataset { - fn path(&self) -> &Path { - &self.file_path - } -} - -static DATASET: LazyLock = LazyLock::new(|| { - create_dataset().expect("failed to prepare parquet benchmark dataset") -}); - -fn parquet_struct_filter_pushdown(c: &mut Criterion) { - let dataset_path = DATASET.path().to_owned(); - let mut group = c.benchmark_group("parquet_struct_filter_pushdown"); - group.throughput(Throughput::Elements(TOTAL_ROWS as u64)); - - // Scenario 1: SELECT * FROM t WHERE get_field(s, 'id') = 42 - group.bench_function("select_star/no_pushdown", |b| { - let file_schema = setup_reader(&dataset_path); - let predicate = logical2physical(&struct_id_eq_literal(), &file_schema); - b.iter(|| { - let matched = scan(&dataset_path, &predicate, false, ProjectionMask::all()) - .expect("scan succeeded"); - assert_eq!(matched, ROW_GROUP_ROW_COUNT); - }); - }); - - group.bench_function("select_star/with_pushdown", |b| { - let file_schema = setup_reader(&dataset_path); - let predicate = logical2physical(&struct_id_eq_literal(), &file_schema); - b.iter(|| { - let matched = scan(&dataset_path, &predicate, true, ProjectionMask::all()) - .expect("scan succeeded"); - assert_eq!(matched, ROW_GROUP_ROW_COUNT); - }); - }); - - // Scenario 2: SELECT * FROM t WHERE get_field(s, 'id') = id - group.bench_function("select_star_cross_col/no_pushdown", |b| { - let file_schema = setup_reader(&dataset_path); - let predicate = logical2physical(&struct_id_eq_top_id(), &file_schema); - b.iter(|| { - let matched = scan(&dataset_path, &predicate, false, ProjectionMask::all()) - .expect("scan succeeded"); - assert_eq!(matched, TOTAL_ROWS); 
- }); - }); - - group.bench_function("select_star_cross_col/with_pushdown", |b| { - let file_schema = setup_reader(&dataset_path); - let predicate = logical2physical(&struct_id_eq_top_id(), &file_schema); - b.iter(|| { - let matched = scan(&dataset_path, &predicate, true, ProjectionMask::all()) - .expect("scan succeeded"); - assert_eq!(matched, TOTAL_ROWS); - }); - }); - - // Scenario 3: SELECT id FROM t WHERE get_field(s, 'id') = 42 - group.bench_function("select_id/no_pushdown", |b| { - let file_schema = setup_reader(&dataset_path); - let predicate = logical2physical(&struct_id_eq_literal(), &file_schema); - b.iter(|| { - // Without pushdown we must read all columns to evaluate the predicate. - let matched = scan(&dataset_path, &predicate, false, ProjectionMask::all()) - .expect("scan succeeded"); - assert_eq!(matched, ROW_GROUP_ROW_COUNT); - }); - }); - - group.bench_function("select_id/with_pushdown", |b| { - let file_schema = setup_reader(&dataset_path); - let predicate = logical2physical(&struct_id_eq_literal(), &file_schema); - let id_only = id_projection(&dataset_path); - b.iter(|| { - // With pushdown the filter runs first, then we only project `id`. 
- let matched = scan(&dataset_path, &predicate, true, id_only.clone()) - .expect("scan succeeded"); - assert_eq!(matched, ROW_GROUP_ROW_COUNT); - }); - }); - - group.finish(); -} - -fn setup_reader(path: &Path) -> SchemaRef { - let file = std::fs::File::open(path).expect("failed to open file"); - let builder = - ParquetRecordBatchReaderBuilder::try_new(file).expect("failed to build reader"); - Arc::clone(builder.schema()) -} - -/// `get_field(s, 'id') = TARGET_VALUE` -fn struct_id_eq_literal() -> Expr { - let get_field_expr = datafusion_functions::core::get_field().call(vec![ - col(STRUCT_COLUMN_NAME), - Expr::Literal(ScalarValue::Utf8(Some("id".to_string())), None), - ]); - get_field_expr.eq(Expr::Literal(ScalarValue::Int32(Some(TARGET_VALUE)), None)) -} - -/// `get_field(s, 'id') = id` -fn struct_id_eq_top_id() -> Expr { - let get_field_expr = datafusion_functions::core::get_field().call(vec![ - col(STRUCT_COLUMN_NAME), - Expr::Literal(ScalarValue::Utf8(Some("id".to_string())), None), - ]); - get_field_expr.eq(col(ID_COLUMN_NAME)) -} - -/// Build a [`ProjectionMask`] that only reads the top-level `id` leaf column. -fn id_projection(path: &Path) -> ProjectionMask { - let file = std::fs::File::open(path).expect("failed to open file"); - let builder = - ParquetRecordBatchReaderBuilder::try_new(file).expect("failed to build reader"); - let parquet_schema = builder.metadata().file_metadata().schema_descr_ptr(); - // Leaf index 0 corresponds to the top-level `id` column. 
- ProjectionMask::leaves(&parquet_schema, [0]) -} - -fn scan( - path: &Path, - predicate: &Arc, - pushdown: bool, - projection: ProjectionMask, -) -> datafusion_common::Result { - let file = std::fs::File::open(path)?; - let builder = ParquetRecordBatchReaderBuilder::try_new(file)?; - let metadata = builder.metadata().clone(); - let file_schema = builder.schema(); - - let metrics = ExecutionPlanMetricsSet::new(); - let file_metrics = ParquetFileMetrics::new(0, &path.display().to_string(), &metrics); - - let mut filter_applied = false; - let builder = if pushdown { - if let Some(row_filter) = - build_row_filter(predicate, file_schema, &metadata, false, &file_metrics)? - { - filter_applied = true; - builder.with_row_filter(row_filter) - } else { - builder - } - } else { - builder - }; - - // Only apply a narrow projection when the filter was actually pushed down. - // Otherwise we need all columns to evaluate the predicate manually. - let output_projection = if filter_applied { - projection - } else { - ProjectionMask::all() - }; - let reader = builder.with_projection(output_projection).build()?; - - let mut matched_rows = 0usize; - for batch in reader { - let batch = batch?; - if filter_applied { - // When the row filter was applied, rows are already filtered. 
- matched_rows += batch.num_rows(); - } else { - matched_rows += count_matches(predicate, &batch)?; - } - } - - Ok(matched_rows) -} - -fn count_matches( - expr: &Arc, - batch: &RecordBatch, -) -> datafusion_common::Result { - let values = expr.evaluate(batch)?.into_array(batch.num_rows())?; - let bools = values - .as_any() - .downcast_ref::() - .expect("boolean filter result"); - - Ok(bools.iter().filter(|v| matches!(v, Some(true))).count()) -} - -fn schema() -> SchemaRef { - let struct_fields = Fields::from(vec![ - Field::new("id", DataType::Int32, false), - Field::new(LARGE_STRING_COLUMN_NAME, DataType::Utf8, false), - ]); - Arc::new(Schema::new(vec![ - Field::new(ID_COLUMN_NAME, DataType::Int32, false), - Field::new(LARGE_STRING_COLUMN_NAME, DataType::Utf8, false), - Field::new(STRUCT_COLUMN_NAME, DataType::Struct(struct_fields), false), - ])) -} - -fn create_dataset() -> datafusion_common::Result { - let tempdir = TempDir::new()?; - let file_path = tempdir.path().join("struct_filter.parquet"); - - let schema = schema(); - let writer_props = WriterProperties::builder() - .set_max_row_group_row_count(Some(ROW_GROUP_ROW_COUNT)) - .build(); - - let mut writer = ArrowWriter::try_new( - std::fs::File::create(&file_path)?, - Arc::clone(&schema), - Some(writer_props), - )?; - - // Each row group has a distinct `s.id` value. Only one row group - // matches the target, so pushdown should prune 90% of rows. 
- for rg_idx in 0..TOTAL_ROW_GROUPS { - let id_value = if rg_idx == TOTAL_ROW_GROUPS - 1 { - TARGET_VALUE - } else { - (rg_idx as i32 + 1) * 1000 - }; - let batch = build_struct_batch(&schema, id_value, ROW_GROUP_ROW_COUNT)?; - writer.write(&batch)?; - } - - writer.close()?; - - let reader = - ParquetRecordBatchReaderBuilder::try_new(std::fs::File::open(&file_path)?)?; - assert_eq!(reader.metadata().row_groups().len(), TOTAL_ROW_GROUPS); - - Ok(BenchmarkDataset { - _tempdir: tempdir, - file_path, - }) -} - -fn build_struct_batch( - schema: &SchemaRef, - id_value: i32, - len: usize, -) -> datafusion_common::Result { - let large_string: String = "x".repeat(LARGE_STRING_LEN); - - // Top-level columns - let top_id_array = Arc::new(Int32Array::from(vec![id_value; len])); - let mut top_string_builder = StringBuilder::new(); - for _ in 0..len { - top_string_builder.append_value(&large_string); - } - let top_string_array = Arc::new(top_string_builder.finish()); - - // Struct sub-fields: s.id mirrors top-level id, s.large_string is the same payload - let struct_id_array = Arc::new(Int32Array::from(vec![id_value; len])); - let mut struct_string_builder = StringBuilder::new(); - for _ in 0..len { - struct_string_builder.append_value(&large_string); - } - let struct_string_array = Arc::new(struct_string_builder.finish()); - - let struct_array = StructArray::from(vec![ - ( - Arc::new(Field::new("id", DataType::Int32, false)), - struct_id_array as Arc, - ), - ( - Arc::new(Field::new(LARGE_STRING_COLUMN_NAME, DataType::Utf8, false)), - struct_string_array as Arc, - ), - ]); - - Ok(RecordBatch::try_new( - Arc::clone(schema), - vec![top_id_array, top_string_array, Arc::new(struct_array)], - )?) 
-} - -criterion_group!(benches, parquet_struct_filter_pushdown); -criterion_main!(benches); diff --git a/datafusion/datasource-parquet/src/test_data/ndv_test 2.parquet b/datafusion/datasource-parquet/src/test_data/ndv_test 2.parquet deleted file mode 100644 index 3ecbe320f506efd450c6c2ebd31fd626571db80f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1141 zcmZwHOHUI~6bJBgr_J)2zRYXL5Ay_(98U+#%5_W`z zDGL*HVOSbBZjCFK#uYKfL_dHbegId-rSU&ZYY1_YU+(nGz30A8I+Pnua^hbCd{gS? z8w~Ffmx$g-hKGnY7^5Hy`$-6fSUY}*%o{-5>o$_x_?S0@piF*7e!ov7B`8x91Rw~z zpcz6ChTYHtt-wKrJ)nUOZLk*%*a!Qe9U^c5I-nDx5CanqLL9o_5FCbXI08Lz6cUhx z6!bzG`rsHGhYa)wW$}cwlatO)vWT35({KhX$iV=dg*=>t0-T2na1k!SWhlZG7=)`Z z1lM2~uEPl2fKj*!x8OG1fx9pU_h1|*;69Xsa&{8u?L1|-AujS)jN3)t$`KhN+!`_T z6~F@`ctHXm#>@3_{>FIXF9e?1Dgn_gfmN2Smw=cHrA8Ll-+9sb52F-wrn7mz$Q5U{ zR1^h6Go|UuM1m`ngcMiez5k+VRMj`e3){0-$Lh&FxmqFL`8xcyHkD6zw1uCIHj^mR z4@_bi22lfPE1LXN&PzgNS+N9jaE@6f`|xqyDo`}DWbN$k6_;6rmg5wVoXT7wS{2E9 zF4P1@5jkOTK`-{9;_QX;BYSdUzC2Z#E}`_f7!x$1YR97Pt6VNUsXUyWTXF&cd=s6W z#)#AnrW<ni=Pc$6H3Dz@-I>y!Kbo2f4 zsNb2nCYx(MLz5di&7ZQNNn7`quD1!5 zRApz(N%ykJNvCeMV4tR}zEvn5&*KF11Chnb+ Result<(), DataFusionError> { - // We want to test the case where we have two libraries. - // Library A will have a foreign plan from Library B, called child_plan. - // Library A will add a plan called grandchild_plan under child_plan - // Library A will create a plan called parent_plan, that has child_plan - // under it. So we should have: - // parent_plan (local) -> child_plan (foreign) -> grandchild_plan (local) - // Then we want to turn parent_plan into a FFI plan. - // Verify that grandchild_plan also gets the same runtime as parent_plan. 
- - let module = get_module()?; - - fn generate_local_plan() -> Arc { - let schema = - Arc::new(Schema::new(vec![Field::new("a", DataType::Float32, false)])); - - Arc::new(EmptyExec::new(schema)) - } - - let child_plan = - module - .create_empty_exec() - .ok_or(DataFusionError::NotImplemented( - "External module failed to implement create_empty_exec".to_string(), - ))?(); - let child_plan: Arc = (&child_plan) - .try_into() - .expect("should be able create plan"); - assert!(child_plan.as_any().is::()); - - let grandchild_plan = generate_local_plan(); - - let child_plan = child_plan.with_new_children(vec![grandchild_plan])?; - - unsafe { - // Originally the runtime is not set. We go through the unsafe casting - // of data here because the `inner()` function is private and this is - // only an integration test so we do not want to expose it. - let ffi_child = FFI_ExecutionPlan::new(Arc::clone(&child_plan), None); - let ffi_grandchild = - (ffi_child.children)(&ffi_child).into_iter().next().unwrap(); - - let grandchild_private_data = - ffi_grandchild.private_data as *const ExecutionPlanPrivateData; - assert!((*grandchild_private_data).runtime.is_none()); - } - - let parent_plan = generate_local_plan().with_new_children(vec![child_plan])?; - - // Adding the grandchild beneath this FFI plan should get the runtime passed down. 
- let runtime = tokio::runtime::Builder::new_current_thread() - .build() - .unwrap(); - let ffi_parent = - FFI_ExecutionPlan::new(parent_plan, Some(runtime.handle().clone())); - - unsafe { - let ffi_child = (ffi_parent.children)(&ffi_parent) - .into_iter() - .next() - .unwrap(); - let ffi_grandchild = - (ffi_child.children)(&ffi_child).into_iter().next().unwrap(); - assert_eq!( - (ffi_grandchild.library_marker_id)(), - (ffi_parent.library_marker_id)() - ); - - let grandchild_private_data = - ffi_grandchild.private_data as *const ExecutionPlanPrivateData; - assert!((*grandchild_private_data).runtime.is_some()); - } - - Ok(()) - } -} diff --git a/datafusion/functions-aggregate/benches/approx_distinct 2.rs b/datafusion/functions-aggregate/benches/approx_distinct 2.rs deleted file mode 100644 index 538103d991f1f..0000000000000 --- a/datafusion/functions-aggregate/benches/approx_distinct 2.rs +++ /dev/null @@ -1,128 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -use std::sync::Arc; - -use arrow::array::{ArrayRef, Int64Array, StringArray, StringViewArray}; -use arrow::datatypes::{DataType, Field, Schema}; -use criterion::{Criterion, criterion_group, criterion_main}; -use datafusion_expr::function::AccumulatorArgs; -use datafusion_expr::{Accumulator, AggregateUDFImpl}; -use datafusion_functions_aggregate::approx_distinct::ApproxDistinct; -use datafusion_physical_expr::expressions::col; -use rand::rngs::StdRng; -use rand::{Rng, SeedableRng}; - -const BATCH_SIZE: usize = 8192; -const STRING_LENGTH: usize = 20; - -fn prepare_accumulator(data_type: DataType) -> Box { - let schema = Arc::new(Schema::new(vec![Field::new("f", data_type, true)])); - let expr = col("f", &schema).unwrap(); - let accumulator_args = AccumulatorArgs { - return_field: Field::new("f", DataType::UInt64, true).into(), - schema: &schema, - expr_fields: &[expr.return_field(&schema).unwrap()], - ignore_nulls: false, - order_bys: &[], - is_reversed: false, - name: "approx_distinct(f)", - is_distinct: false, - exprs: &[expr], - }; - ApproxDistinct::new().accumulator(accumulator_args).unwrap() -} - -/// Creates an Int64Array where values are drawn from `0..n_distinct`. -fn create_i64_array(n_distinct: usize) -> Int64Array { - let mut rng = StdRng::seed_from_u64(42); - (0..BATCH_SIZE) - .map(|_| Some(rng.random_range(0..n_distinct as i64))) - .collect() -} - -/// Creates a pool of `n_distinct` random strings. -fn create_string_pool(n_distinct: usize) -> Vec { - let mut rng = StdRng::seed_from_u64(42); - (0..n_distinct) - .map(|_| { - (0..STRING_LENGTH) - .map(|_| rng.random_range(b'a'..=b'z') as char) - .collect() - }) - .collect() -} - -/// Creates a StringArray where values are drawn from the given pool. 
-fn create_string_array(pool: &[String]) -> StringArray { - let mut rng = StdRng::seed_from_u64(99); - (0..BATCH_SIZE) - .map(|_| Some(pool[rng.random_range(0..pool.len())].as_str())) - .collect() -} - -/// Creates a StringViewArray where values are drawn from the given pool. -fn create_string_view_array(pool: &[String]) -> StringViewArray { - let mut rng = StdRng::seed_from_u64(99); - (0..BATCH_SIZE) - .map(|_| Some(pool[rng.random_range(0..pool.len())].as_str())) - .collect() -} - -fn approx_distinct_benchmark(c: &mut Criterion) { - for pct in [80, 99] { - let n_distinct = BATCH_SIZE * pct / 100; - - // --- Int64 benchmarks --- - let values = Arc::new(create_i64_array(n_distinct)) as ArrayRef; - c.bench_function(&format!("approx_distinct i64 {pct}% distinct"), |b| { - b.iter(|| { - let mut accumulator = prepare_accumulator(DataType::Int64); - accumulator - .update_batch(std::slice::from_ref(&values)) - .unwrap() - }) - }); - - let string_pool = create_string_pool(n_distinct); - - // --- Utf8 benchmarks --- - let values = Arc::new(create_string_array(&string_pool)) as ArrayRef; - c.bench_function(&format!("approx_distinct utf8 {pct}% distinct"), |b| { - b.iter(|| { - let mut accumulator = prepare_accumulator(DataType::Utf8); - accumulator - .update_batch(std::slice::from_ref(&values)) - .unwrap() - }) - }); - - // --- Utf8View benchmarks --- - let values = Arc::new(create_string_view_array(&string_pool)) as ArrayRef; - c.bench_function(&format!("approx_distinct utf8view {pct}% distinct"), |b| { - b.iter(|| { - let mut accumulator = prepare_accumulator(DataType::Utf8View); - accumulator - .update_batch(std::slice::from_ref(&values)) - .unwrap() - }) - }); - } -} - -criterion_group!(benches, approx_distinct_benchmark); -criterion_main!(benches); diff --git a/datafusion/functions-nested/benches/array_concat 2.rs b/datafusion/functions-nested/benches/array_concat 2.rs deleted file mode 100644 index 75dcc88f14737..0000000000000 --- 
a/datafusion/functions-nested/benches/array_concat 2.rs +++ /dev/null @@ -1,94 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use std::hint::black_box; -use std::sync::Arc; - -use arrow::array::{ArrayRef, Int32Array, ListArray}; -use arrow::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer}; -use arrow::datatypes::{DataType, Field}; -use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main}; -use rand::rngs::StdRng; -use rand::{Rng, SeedableRng}; - -use datafusion_functions_nested::concat::array_concat_inner; - -const SEED: u64 = 42; - -/// Build a `ListArray` with `num_lists` rows, each containing -/// `elements_per_list` random i32 values. Every 10th row is null. 
-fn make_list_array( - rng: &mut StdRng, - num_lists: usize, - elements_per_list: usize, -) -> ArrayRef { - let total_values = num_lists * elements_per_list; - let values: Vec = (0..total_values).map(|_| rng.random()).collect(); - let values = Arc::new(Int32Array::from(values)); - - let offsets: Vec = (0..=num_lists) - .map(|i| (i * elements_per_list) as i32) - .collect(); - let offsets = OffsetBuffer::new(ScalarBuffer::from(offsets)); - - let nulls: Vec = (0..num_lists).map(|i| i % 10 != 0).collect(); - let nulls = Some(NullBuffer::from(nulls)); - - Arc::new(ListArray::new( - Arc::new(Field::new("item", DataType::Int32, false)), - offsets, - values, - nulls, - )) -} - -fn criterion_benchmark(c: &mut Criterion) { - let mut group = c.benchmark_group("array_concat"); - - // Benchmark: varying number of rows, 20 elements per list - for num_rows in [100, 1000, 10000] { - let mut rng = StdRng::seed_from_u64(SEED); - let list_a = make_list_array(&mut rng, num_rows, 20); - let list_b = make_list_array(&mut rng, num_rows, 20); - let args: Vec = vec![list_a, list_b]; - - group.bench_with_input(BenchmarkId::new("rows", num_rows), &args, |b, args| { - b.iter(|| black_box(array_concat_inner(args).unwrap())); - }); - } - - // Benchmark: 1000 rows, varying element counts per list - for elements_per_list in [5, 50, 500] { - let mut rng = StdRng::seed_from_u64(SEED); - let list_a = make_list_array(&mut rng, 1000, elements_per_list); - let list_b = make_list_array(&mut rng, 1000, elements_per_list); - let args: Vec = vec![list_a, list_b]; - - group.bench_with_input( - BenchmarkId::new("elements_per_list", elements_per_list), - &args, - |b, args| { - b.iter(|| black_box(array_concat_inner(args).unwrap())); - }, - ); - } - - group.finish(); -} - -criterion_group!(benches, criterion_benchmark); -criterion_main!(benches); diff --git a/datafusion/functions-nested/benches/array_to_string 2.rs b/datafusion/functions-nested/benches/array_to_string 2.rs deleted file mode 100644 index 
286ed4eeb0003..0000000000000 --- a/datafusion/functions-nested/benches/array_to_string 2.rs +++ /dev/null @@ -1,188 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use arrow::array::{ArrayRef, Float64Array, Int64Array, ListArray, StringArray}; -use arrow::buffer::OffsetBuffer; -use arrow::datatypes::{DataType, Field}; -use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main}; -use datafusion_common::ScalarValue; -use datafusion_common::config::ConfigOptions; -use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl}; -use datafusion_functions_nested::string::ArrayToString; -use rand::rngs::StdRng; -use rand::{Rng, SeedableRng}; -use std::hint::black_box; -use std::sync::Arc; - -const NUM_ROWS: usize = 1000; -const ARRAY_SIZES: &[usize] = &[5, 20, 100]; -const NESTED_ARRAY_SIZE: usize = 3; -const SEED: u64 = 42; -const NULL_DENSITY: f64 = 0.1; - -fn criterion_benchmark(c: &mut Criterion) { - bench_array_to_string(c, "array_to_string_int64", create_int64_list_array); - bench_array_to_string(c, "array_to_string_float64", create_float64_list_array); - bench_array_to_string(c, "array_to_string_string", create_string_list_array); - bench_array_to_string( - c, - 
"array_to_string_nested_int64", - create_nested_int64_list_array, - ); -} - -fn bench_array_to_string( - c: &mut Criterion, - group_name: &str, - make_array: impl Fn(usize) -> ArrayRef, -) { - let mut group = c.benchmark_group(group_name); - - for &array_size in ARRAY_SIZES { - let list_array = make_array(array_size); - let args = vec![ - ColumnarValue::Array(list_array.clone()), - ColumnarValue::Scalar(ScalarValue::Utf8(Some(",".to_string()))), - ]; - let arg_fields = vec![ - Field::new("array", list_array.data_type().clone(), true).into(), - Field::new("delimiter", DataType::Utf8, false).into(), - ]; - - group.bench_with_input( - BenchmarkId::from_parameter(array_size), - &array_size, - |b, _| { - let udf = ArrayToString::new(); - b.iter(|| { - black_box( - udf.invoke_with_args(ScalarFunctionArgs { - args: args.clone(), - arg_fields: arg_fields.clone(), - number_rows: NUM_ROWS, - return_field: Field::new("result", DataType::Utf8, true) - .into(), - config_options: Arc::new(ConfigOptions::default()), - }) - .unwrap(), - ) - }) - }, - ); - } - - group.finish(); -} - -fn create_int64_list_array(array_size: usize) -> ArrayRef { - let mut rng = StdRng::seed_from_u64(SEED); - let values = (0..NUM_ROWS * array_size) - .map(|_| { - if rng.random::() < NULL_DENSITY { - None - } else { - Some(rng.random_range(0..1000)) - } - }) - .collect::(); - let offsets = (0..=NUM_ROWS) - .map(|i| (i * array_size) as i32) - .collect::>(); - - Arc::new( - ListArray::try_new( - Arc::new(Field::new("item", DataType::Int64, true)), - OffsetBuffer::new(offsets.into()), - Arc::new(values), - None, - ) - .unwrap(), - ) -} - -fn create_nested_int64_list_array(array_size: usize) -> ArrayRef { - let inner = create_int64_list_array(array_size); - let inner_rows = NUM_ROWS; - let outer_rows = inner_rows / NESTED_ARRAY_SIZE; - let offsets = (0..=outer_rows) - .map(|i| (i * NESTED_ARRAY_SIZE) as i32) - .collect::>(); - Arc::new( - ListArray::try_new( - Arc::new(Field::new("item", 
inner.data_type().clone(), true)), - OffsetBuffer::new(offsets.into()), - inner, - None, - ) - .unwrap(), - ) -} - -fn create_float64_list_array(array_size: usize) -> ArrayRef { - let mut rng = StdRng::seed_from_u64(SEED); - let values = (0..NUM_ROWS * array_size) - .map(|_| { - if rng.random::() < NULL_DENSITY { - None - } else { - Some(rng.random_range(-1000.0..1000.0)) - } - }) - .collect::(); - let offsets = (0..=NUM_ROWS) - .map(|i| (i * array_size) as i32) - .collect::>(); - - Arc::new( - ListArray::try_new( - Arc::new(Field::new("item", DataType::Float64, true)), - OffsetBuffer::new(offsets.into()), - Arc::new(values), - None, - ) - .unwrap(), - ) -} - -fn create_string_list_array(array_size: usize) -> ArrayRef { - let mut rng = StdRng::seed_from_u64(SEED); - let values = (0..NUM_ROWS * array_size) - .map(|_| { - if rng.random::() < NULL_DENSITY { - None - } else { - Some(format!("value_{}", rng.random_range(0..100))) - } - }) - .collect::(); - let offsets = (0..=NUM_ROWS) - .map(|i| (i * array_size) as i32) - .collect::>(); - - Arc::new( - ListArray::try_new( - Arc::new(Field::new("item", DataType::Utf8, true)), - OffsetBuffer::new(offsets.into()), - Arc::new(values), - None, - ) - .unwrap(), - ) -} - -criterion_group!(benches, criterion_benchmark); -criterion_main!(benches); diff --git a/datafusion/optimizer/src/simplify_expressions/linear_aggregates 2.rs b/datafusion/optimizer/src/simplify_expressions/linear_aggregates 2.rs deleted file mode 100644 index 21389cf326c24..0000000000000 --- a/datafusion/optimizer/src/simplify_expressions/linear_aggregates 2.rs +++ /dev/null @@ -1,229 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Simplification to refactor multiple aggregate functions to use the same aggregate function - -use datafusion_common::HashMap; -use datafusion_expr::expr::AggregateFunctionParams; -use datafusion_expr::{BinaryExpr, Expr}; -use datafusion_expr_common::operator::Operator; - -/// Threshold of the number of aggregates that share similar arguments before -/// triggering rewrite. -/// -/// There is a threshold because the canonical SUM rewrite described in -/// [`AggregateUDFImpl::simplify_expr_op_literal`] actually results in more -/// aggregates (2) for each original aggregate. It is important that CSE then -/// eliminate them. -/// -/// [`AggregateUDFImpl::simplify_expr_op_literal`]: datafusion_expr::AggregateUDFImpl::simplify_expr_op_literal -const DUPLICATE_THRESHOLD: usize = 2; - -/// Rewrites multiple aggregate expressions that have a common linear component -/// into multiple aggregate expressions that share that common component. -/// -/// For example, rewrites patterns such as -/// * `SUM(x + 1), SUM(x + 2), ...` -/// -/// Into -/// * `SUM(x) + 1 * COUNT(x), SUM(x) + 2 * COUNT(x), ...` -/// -/// See the background [`AggregateUDFImpl::simplify_expr_op_literal`] for details. -/// -/// Returns `true` if any of the arguments are rewritten (modified), `false` -/// otherwise. -/// -/// ## Design goals: -/// 1. 
Keep the aggregate specific logic out of the optimizer (can't depend directly on SUM) -/// 2. Optimize for the case that this rewrite will not apply (it almost never does) -/// -/// [`AggregateUDFImpl::simplify_expr_op_literal`]: datafusion_expr::AggregateUDFImpl::simplify_expr_op_literal -pub(super) fn rewrite_multiple_linear_aggregates( - agg_expr: &mut [Expr], -) -> datafusion_common::Result { - // map : count of expressions that have a common argument - let mut common_args = HashMap::new(); - - // First pass -- figure out any aggregates that can be split and have common - // expressions. - for agg in agg_expr.iter() { - let Expr::AggregateFunction(agg_function) = agg else { - continue; - }; - - let Some(arg) = candidate_linear_param(&agg_function.params) else { - continue; - }; - - let Some(expr_literal) = ExprLiteral::try_new(arg) else { - continue; - }; - - let counter = common_args.entry(expr_literal.expr()).or_insert(0); - *counter += 1; - } - - // (agg_index, new_expr) - let mut new_aggs = vec![]; - - // Second pass, actually rewrite any aggregates that have a common - // expression and enough duplicates. - for (idx, agg) in agg_expr.iter().enumerate() { - let Expr::AggregateFunction(agg_function) = agg else { - continue; - }; - - let Some(arg) = candidate_linear_param(&agg_function.params) else { - continue; - }; - - let Some(expr_literal) = ExprLiteral::try_new(arg) else { - continue; - }; - - // Not enough common expressions to make it worth rewriting - if common_args.get(expr_literal.expr()).unwrap_or(&0) < &DUPLICATE_THRESHOLD { - continue; - } - - if let Some(new_agg_function) = agg_function.func.simplify_expr_op_literal( - agg_function, - expr_literal.expr(), - expr_literal.op(), - expr_literal.lit(), - expr_literal.arg_is_left(), - )? 
{ - new_aggs.push((idx, new_agg_function)); - } - } - - if new_aggs.is_empty() { - return Ok(false); - } - - // Otherwise replace the aggregate expressions - drop(common_args); // release borrow - for (idx, new_agg) in new_aggs { - let orig_name = agg_expr[idx].name_for_alias()?; - agg_expr[idx] = new_agg.alias_if_changed(orig_name)? - } - - Ok(true) -} - -/// Returns Some(&Expr) with the single argument if this is a suitable candidate -/// for the linear rewrite -fn candidate_linear_param(params: &AggregateFunctionParams) -> Option<&Expr> { - // Explicitly destructure to ensure we check all relevant fields - let AggregateFunctionParams { - args, - distinct, - filter, - order_by, - null_treatment, - } = params; - - // Disqualify anything "non standard" - if *distinct - || filter.is_some() - || !order_by.is_empty() - || null_treatment.is_some() - || args.len() != 1 - { - return None; - } - let arg = args.first()?; - if arg.is_volatile() { - return None; - }; - Some(arg) -} - -/// A view into a [`Expr::BinaryExpr`] that is arbitrary expression and a -/// literal -/// -/// This is an enum to distinguish the direction of the operator arguments -#[derive(Debug, Clone)] -pub enum ExprLiteral<'a> { - /// if the expression is ` ` - ArgOpLit { - arg: &'a Expr, - op: Operator, - lit: &'a Expr, - }, - /// if the expression is ` ` - LitOpArg { - lit: &'a Expr, - op: Operator, - arg: &'a Expr, - }, -} - -impl<'a> ExprLiteral<'a> { - /// Try and split the Expr into its parts - fn try_new(expr: &'a Expr) -> Option { - match expr { - // - Expr::BinaryExpr(BinaryExpr { left, op, right }) - if matches!(left.as_ref(), Expr::Literal(..)) => - { - Some(Self::LitOpArg { - arg: right, - lit: left, - op: *op, - }) - } - - // + - Expr::BinaryExpr(BinaryExpr { left, op, right }) - if matches!(right.as_ref(), Expr::Literal(..)) => - { - Some(Self::ArgOpLit { - arg: left, - lit: right, - op: *op, - }) - } - _ => None, - } - } - - fn expr(&self) -> &'a Expr { - match self { - Self::ArgOpLit { 
arg, .. } => arg, - Self::LitOpArg { arg, .. } => arg, - } - } - - fn lit(&self) -> &'a Expr { - match self { - Self::ArgOpLit { lit, .. } => lit, - Self::LitOpArg { lit, .. } => lit, - } - } - - fn op(&self) -> Operator { - match self { - Self::ArgOpLit { op, .. } => *op, - Self::LitOpArg { op, .. } => *op, - } - } - - fn arg_is_left(&self) -> bool { - matches!(self, Self::ArgOpLit { .. }) - } -} diff --git a/datafusion/physical-expr-common/benches/compare_nested 2.rs b/datafusion/physical-expr-common/benches/compare_nested 2.rs deleted file mode 100644 index 56c122fef9420..0000000000000 --- a/datafusion/physical-expr-common/benches/compare_nested 2.rs +++ /dev/null @@ -1,74 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use arrow::array::{ArrayRef, Int32Array, Scalar, StringArray, StructArray}; -use arrow::datatypes::{DataType, Field, Fields}; -use criterion::{Criterion, criterion_group, criterion_main}; -use datafusion_expr_common::operator::Operator; -use datafusion_physical_expr_common::datum::compare_op_for_nested; -use rand::rngs::StdRng; -use rand::{Rng, SeedableRng}; -use std::hint::black_box; -use std::sync::Arc; - -/// Build a StructArray with fields {x: Int32, y: Utf8}. 
-fn make_struct_array(num_rows: usize, rng: &mut StdRng) -> ArrayRef { - let ints: Int32Array = (0..num_rows).map(|_| Some(rng.random::())).collect(); - - let strings: StringArray = (0..num_rows) - .map(|_| { - let s: String = (0..12) - .map(|_| rng.random_range(b'a'..=b'z') as char) - .collect(); - Some(s) - }) - .collect(); - - let fields = Fields::from(vec![ - Field::new("x", DataType::Int32, false), - Field::new("y", DataType::Utf8, false), - ]); - - Arc::new( - StructArray::try_new(fields, vec![Arc::new(ints), Arc::new(strings)], None) - .unwrap(), - ) -} - -fn criterion_benchmark(c: &mut Criterion) { - let num_rows = 8192; - let mut rng = StdRng::seed_from_u64(42); - - let lhs = make_struct_array(num_rows, &mut rng); - let rhs_array = make_struct_array(num_rows, &mut rng); - let rhs_scalar = Scalar::new(make_struct_array(1, &mut rng)); - - c.bench_function("compare_nested array_array", |b| { - b.iter(|| { - black_box(compare_op_for_nested(Operator::Eq, &lhs, &rhs_array).unwrap()) - }) - }); - - c.bench_function("compare_nested array_scalar", |b| { - b.iter(|| { - black_box(compare_op_for_nested(Operator::Eq, &lhs, &rhs_scalar).unwrap()) - }) - }); -} - -criterion_group!(benches, criterion_benchmark); -criterion_main!(benches); diff --git a/datafusion/physical-optimizer/src/hash_join_buffering 2.rs b/datafusion/physical-optimizer/src/hash_join_buffering 2.rs deleted file mode 100644 index 3c29b46c0fa64..0000000000000 --- a/datafusion/physical-optimizer/src/hash_join_buffering 2.rs +++ /dev/null @@ -1,103 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::PhysicalOptimizerRule; -use datafusion_common::JoinSide; -use datafusion_common::config::ConfigOptions; -use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; -use datafusion_physical_plan::ExecutionPlan; -use datafusion_physical_plan::buffer::BufferExec; -use datafusion_physical_plan::joins::HashJoinExec; -use std::sync::Arc; - -/// Looks for all the [HashJoinExec]s in the plan and places a [BufferExec] node with the -/// configured capacity in the probe side: -/// -/// ```text -/// ┌───────────────────┐ -/// │ HashJoinExec │ -/// └─────▲────────▲────┘ -/// ┌───────┘ └─────────┐ -/// │ │ -/// ┌────────────────┐ ┌─────────────────┐ -/// │ Build side │ + │ BufferExec │ -/// └────────────────┘ └────────▲────────┘ -/// │ -/// ┌────────┴────────┐ -/// │ Probe side │ -/// └─────────────────┘ -/// ``` -/// -/// Which allows eagerly pulling it even before the build side has completely finished. 
-#[derive(Debug, Default)] -pub struct HashJoinBuffering {} - -impl HashJoinBuffering { - pub fn new() -> Self { - Self::default() - } -} - -impl PhysicalOptimizerRule for HashJoinBuffering { - fn optimize( - &self, - plan: Arc, - config: &ConfigOptions, - ) -> datafusion_common::Result> { - let capacity = config.execution.hash_join_buffering_capacity; - if capacity == 0 { - return Ok(plan); - } - - plan.transform_down(|plan| { - let Some(node) = plan.as_any().downcast_ref::() else { - return Ok(Transformed::no(plan)); - }; - let plan = Arc::clone(&plan); - Ok(Transformed::yes( - if HashJoinExec::probe_side() == JoinSide::Left { - // Do not stack BufferExec nodes together. - if node.left.as_any().downcast_ref::().is_some() { - return Ok(Transformed::no(plan)); - } - plan.with_new_children(vec![ - Arc::new(BufferExec::new(Arc::clone(&node.left), capacity)), - Arc::clone(&node.right), - ])? - } else { - // Do not stack BufferExec nodes together. - if node.right.as_any().downcast_ref::().is_some() { - return Ok(Transformed::no(plan)); - } - plan.with_new_children(vec![ - Arc::clone(&node.left), - Arc::new(BufferExec::new(Arc::clone(&node.right), capacity)), - ])? - }, - )) - }) - .data() - } - - fn name(&self) -> &str { - "HashJoinBuffering" - } - - fn schema_check(&self) -> bool { - true - } -} diff --git a/datafusion/physical-optimizer/src/topk_repartition 2.rs b/datafusion/physical-optimizer/src/topk_repartition 2.rs deleted file mode 100644 index 668e0d273288b..0000000000000 --- a/datafusion/physical-optimizer/src/topk_repartition 2.rs +++ /dev/null @@ -1,368 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Push TopK (Sort with fetch) past Hash Repartition -//! -//! When a `SortExec` with a fetch limit (TopK) sits above a -//! `RepartitionExec(Hash)`, and the hash partition expressions are a prefix -//! of the sort expressions, this rule inserts a copy of the TopK below -//! the repartition to reduce the volume of data flowing through the shuffle. -//! -//! This is correct because the hash partition key being a prefix of the sort -//! key guarantees that all rows with the same partition key end up in the same -//! output partition. Therefore, rows that survive the final TopK after -//! repartitioning will always survive the pre-repartition TopK as well. -//! -//! ## Example -//! -//! Before: -//! ```text -//! SortExec: TopK(fetch=3), expr=[a ASC, b ASC] -//! RepartitionExec: Hash([a], 4) -//! DataSourceExec -//! ``` -//! -//! After: -//! ```text -//! SortExec: TopK(fetch=3), expr=[a ASC, b ASC] -//! RepartitionExec: Hash([a], 4) -//! SortExec: TopK(fetch=3), expr=[a ASC, b ASC] -//! DataSourceExec -//! ``` - -use crate::PhysicalOptimizerRule; -use datafusion_common::Result; -use datafusion_common::config::ConfigOptions; -use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; -use std::sync::Arc; -// CoalesceBatchesExec is deprecated on main (replaced by arrow-rs BatchCoalescer), -// but older DataFusion versions may still insert it between SortExec and RepartitionExec. 
-#[expect(deprecated)] -use datafusion_physical_plan::coalesce_batches::CoalesceBatchesExec; -use datafusion_physical_plan::repartition::RepartitionExec; -use datafusion_physical_plan::sorts::sort::SortExec; -use datafusion_physical_plan::{ExecutionPlan, Partitioning}; - -/// A physical optimizer rule that pushes TopK (Sort with fetch) past -/// hash repartition when the partition key is a prefix of the sort key. -/// -/// See module-level documentation for details. -#[derive(Debug, Clone, Default)] -pub struct TopKRepartition; - -impl TopKRepartition { - pub fn new() -> Self { - Self {} - } -} - -impl PhysicalOptimizerRule for TopKRepartition { - #[expect(deprecated)] // CoalesceBatchesExec: kept for older DataFusion versions - fn optimize( - &self, - plan: Arc, - config: &ConfigOptions, - ) -> Result> { - if !config.optimizer.enable_topk_repartition { - return Ok(plan); - } - plan.transform_down(|node| { - // Match SortExec with fetch (TopK) - let Some(sort_exec) = node.as_any().downcast_ref::() else { - return Ok(Transformed::no(node)); - }; - let Some(fetch) = sort_exec.fetch() else { - return Ok(Transformed::no(node)); - }; - - // The child might be a CoalesceBatchesExec; look through it - let sort_input = sort_exec.input(); - let sort_any = sort_input.as_any(); - let (repart_parent, repart_exec) = if let Some(rp) = - sort_any.downcast_ref::() - { - // found a RepartitionExec, use it - (None, rp) - } else if let Some(cb_exec) = sort_any.downcast_ref::() { - // There's a CoalesceBatchesExec between TopK & RepartitionExec - // in this case we will need to reconstruct both nodes - let cb_input = cb_exec.input(); - let Some(rp) = cb_input.as_any().downcast_ref::() else { - return Ok(Transformed::no(node)); - }; - (Some(Arc::clone(sort_input)), rp) - } else { - return Ok(Transformed::no(node)); - }; - - // Only handle Hash partitioning - let Partitioning::Hash(hash_exprs, num_partitions) = - repart_exec.partitioning() - else { - return Ok(Transformed::no(node)); - 
}; - - let sort_exprs = sort_exec.expr(); - - // Check that hash expressions are a prefix of the sort expressions. - // Each hash expression must match the corresponding sort expression - // (ignoring sort options like ASC/DESC since hash doesn't care about order). - if hash_exprs.len() > sort_exprs.len() { - return Ok(Transformed::no(node)); - } - for (hash_expr, sort_expr) in hash_exprs.iter().zip(sort_exprs.iter()) { - if !hash_expr.eq(&sort_expr.expr) { - return Ok(Transformed::no(node)); - } - } - - // Don't push if the input to the repartition is already bounded - // (e.g., another TopK), as it would be redundant. - let repart_input = repart_exec.input(); - if repart_input.as_any().downcast_ref::().is_some() { - return Ok(Transformed::no(node)); - } - - // Insert a copy of the TopK below the repartition - let new_sort: Arc = Arc::new( - SortExec::new(sort_exprs.clone(), Arc::clone(repart_input)) - .with_fetch(Some(fetch)) - .with_preserve_partitioning(sort_exec.preserve_partitioning()), - ); - - let new_partitioning = - Partitioning::Hash(hash_exprs.clone(), *num_partitions); - let new_repartition: Arc = - Arc::new(RepartitionExec::try_new(new_sort, new_partitioning)?); - - // Rebuild the tree above the repartition - let new_sort_input = if let Some(parent) = repart_parent { - parent.with_new_children(vec![new_repartition])? 
- } else { - new_repartition - }; - - let new_top_sort: Arc = Arc::new( - SortExec::new(sort_exprs.clone(), new_sort_input) - .with_fetch(Some(fetch)) - .with_preserve_partitioning(sort_exec.preserve_partitioning()), - ); - - Ok(Transformed::yes(new_top_sort)) - }) - .data() - } - - fn name(&self) -> &str { - "TopKRepartition" - } - - fn schema_check(&self) -> bool { - true - } -} - -#[cfg(test)] -mod tests { - use super::*; - use arrow::datatypes::{DataType, Field, Schema}; - use datafusion_physical_expr::expressions::col; - use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr}; - use datafusion_physical_plan::displayable; - use datafusion_physical_plan::test::scan_partitioned; - use insta::assert_snapshot; - use std::sync::Arc; - - fn schema() -> Arc { - Arc::new(Schema::new(vec![ - Field::new("a", DataType::Utf8, false), - Field::new("b", DataType::Int64, false), - ])) - } - - fn sort_exprs(schema: &Schema) -> LexOrdering { - LexOrdering::new(vec![ - PhysicalSortExpr::new_default(col("a", schema).unwrap()).asc(), - PhysicalSortExpr::new_default(col("b", schema).unwrap()).asc(), - ]) - .unwrap() - } - - /// TopK above Hash(a) repartition should get pushed below it, - /// because `a` is a prefix of the sort key `(a, b)`. 
- #[test] - fn topk_pushed_below_hash_repartition() { - let s = schema(); - let input = scan_partitioned(1); - let ordering = sort_exprs(&s); - - let repartition = Arc::new( - RepartitionExec::try_new( - input, - Partitioning::Hash(vec![col("a", &s).unwrap()], 4), - ) - .unwrap(), - ); - - let sort = Arc::new( - SortExec::new(ordering, repartition) - .with_fetch(Some(3)) - .with_preserve_partitioning(true), - ); - - let config = ConfigOptions::new(); - let optimized = TopKRepartition::new().optimize(sort, &config).unwrap(); - - let display = displayable(optimized.as_ref()).indent(true).to_string(); - assert_snapshot!(display, @r" - SortExec: TopK(fetch=3), expr=[a@0 ASC, b@1 ASC], preserve_partitioning=[true], sort_prefix=[a@0 ASC] - RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=1, maintains_sort_order=true - SortExec: TopK(fetch=3), expr=[a@0 ASC, b@1 ASC], preserve_partitioning=[true] - DataSourceExec: partitions=1, partition_sizes=[1] - "); - } - - /// TopK with no fetch (unbounded sort) should NOT be pushed. - #[test] - fn unbounded_sort_not_pushed() { - let s = schema(); - let input = scan_partitioned(1); - let ordering = sort_exprs(&s); - - let repartition = Arc::new( - RepartitionExec::try_new( - input, - Partitioning::Hash(vec![col("a", &s).unwrap()], 4), - ) - .unwrap(), - ); - - let sort: Arc = Arc::new( - SortExec::new(ordering, repartition).with_preserve_partitioning(true), - ); - - let config = ConfigOptions::new(); - let optimized = TopKRepartition::new().optimize(sort, &config).unwrap(); - - let display = displayable(optimized.as_ref()).indent(true).to_string(); - assert_snapshot!(display, @r" - SortExec: expr=[a@0 ASC, b@1 ASC], preserve_partitioning=[true] - RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=1 - DataSourceExec: partitions=1, partition_sizes=[1] - "); - } - - /// Hash key NOT a prefix of sort key should NOT be pushed. 
- #[test] - fn non_prefix_hash_key_not_pushed() { - let s = schema(); - let input = scan_partitioned(1); - let ordering = sort_exprs(&s); - - // Hash by `b`, but sort by `(a, b)` - b is not a prefix - let repartition = Arc::new( - RepartitionExec::try_new( - input, - Partitioning::Hash(vec![col("b", &s).unwrap()], 4), - ) - .unwrap(), - ); - - let sort: Arc = Arc::new( - SortExec::new(ordering, repartition) - .with_fetch(Some(3)) - .with_preserve_partitioning(true), - ); - - let config = ConfigOptions::new(); - let optimized = TopKRepartition::new().optimize(sort, &config).unwrap(); - - let display = displayable(optimized.as_ref()).indent(true).to_string(); - assert_snapshot!(display, @r" - SortExec: TopK(fetch=3), expr=[a@0 ASC, b@1 ASC], preserve_partitioning=[true] - RepartitionExec: partitioning=Hash([b@1], 4), input_partitions=1 - DataSourceExec: partitions=1, partition_sizes=[1] - "); - } - - /// TopK above CoalesceBatchesExec above Hash(a) repartition should - /// push through both, inserting a new TopK below the repartition. 
- #[expect(deprecated)] - #[test] - fn topk_pushed_through_coalesce_batches() { - let s = schema(); - let input = scan_partitioned(1); - let ordering = sort_exprs(&s); - - let repartition = Arc::new( - RepartitionExec::try_new( - input, - Partitioning::Hash(vec![col("a", &s).unwrap()], 4), - ) - .unwrap(), - ); - - let coalesce: Arc = - Arc::new(CoalesceBatchesExec::new(repartition, 8192)); - - let sort = Arc::new( - SortExec::new(ordering, coalesce) - .with_fetch(Some(3)) - .with_preserve_partitioning(true), - ); - - let config = ConfigOptions::new(); - let optimized = TopKRepartition::new().optimize(sort, &config).unwrap(); - - let display = displayable(optimized.as_ref()).indent(true).to_string(); - assert_snapshot!(display, @r" - SortExec: TopK(fetch=3), expr=[a@0 ASC, b@1 ASC], preserve_partitioning=[true], sort_prefix=[a@0 ASC] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=1, maintains_sort_order=true - SortExec: TopK(fetch=3), expr=[a@0 ASC, b@1 ASC], preserve_partitioning=[true] - DataSourceExec: partitions=1, partition_sizes=[1] - "); - } - - /// RoundRobin repartition should NOT be pushed. 
- #[test] - fn round_robin_not_pushed() { - let s = schema(); - let input = scan_partitioned(1); - let ordering = sort_exprs(&s); - - let repartition = Arc::new( - RepartitionExec::try_new(input, Partitioning::RoundRobinBatch(4)).unwrap(), - ); - - let sort: Arc = Arc::new( - SortExec::new(ordering, repartition) - .with_fetch(Some(3)) - .with_preserve_partitioning(true), - ); - - let config = ConfigOptions::new(); - let optimized = TopKRepartition::new().optimize(sort, &config).unwrap(); - - let display = displayable(optimized.as_ref()).indent(true).to_string(); - assert_snapshot!(display, @r" - SortExec: TopK(fetch=3), expr=[a@0 ASC, b@1 ASC], preserve_partitioning=[true] - RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 - DataSourceExec: partitions=1, partition_sizes=[1] - "); - } -} diff --git a/datafusion/spark/src/function/array/array_contains 2.rs b/datafusion/spark/src/function/array/array_contains 2.rs deleted file mode 100644 index 2bc5d64d8bff8..0000000000000 --- a/datafusion/spark/src/function/array/array_contains 2.rs +++ /dev/null @@ -1,168 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -use arrow::array::{ - Array, AsArray, BooleanArray, BooleanBufferBuilder, GenericListArray, OffsetSizeTrait, -}; -use arrow::buffer::{BooleanBuffer, NullBuffer}; -use arrow::datatypes::DataType; -use datafusion_common::{Result, exec_err}; -use datafusion_expr::{ - ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility, -}; -use datafusion_functions_nested::array_has::array_has_udf; -use std::any::Any; -use std::sync::Arc; - -/// Spark-compatible `array_contains` function. -/// -/// Calls DataFusion's `array_has` and then applies Spark's null semantics: -/// - If the result from `array_has` is `true`, return `true`. -/// - If the result is `false` and the input array row contains any null elements, -/// return `null` (because the element might have been the null). -/// - If the result is `false` and the input array row has no null elements, -/// return `false`. -#[derive(Debug, PartialEq, Eq, Hash)] -pub struct SparkArrayContains { - signature: Signature, -} - -impl Default for SparkArrayContains { - fn default() -> Self { - Self::new() - } -} - -impl SparkArrayContains { - pub fn new() -> Self { - Self { - signature: Signature::array_and_element(Volatility::Immutable), - } - } -} - -impl ScalarUDFImpl for SparkArrayContains { - fn as_any(&self) -> &dyn Any { - self - } - - fn name(&self) -> &str { - "array_contains" - } - - fn signature(&self) -> &Signature { - &self.signature - } - - fn return_type(&self, _: &[DataType]) -> Result { - Ok(DataType::Boolean) - } - - fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { - let haystack = args.args[0].clone(); - let array_has_result = array_has_udf().invoke_with_args(args)?; - - let result_array = array_has_result.to_array(1)?; - let patched = apply_spark_null_semantics(result_array.as_boolean(), &haystack)?; - Ok(ColumnarValue::Array(Arc::new(patched))) - } -} - -/// For each row where `array_has` returned `false`, set the output to null -/// if that row's input array contains any null 
elements. -fn apply_spark_null_semantics( - result: &BooleanArray, - haystack_arg: &ColumnarValue, -) -> Result { - // happy path - if result.false_count() == 0 || haystack_arg.data_type() == DataType::Null { - return Ok(result.clone()); - } - - let haystack = haystack_arg.to_array_of_size(result.len())?; - - let row_has_nulls = compute_row_has_nulls(&haystack)?; - - // A row keeps its validity when result is true OR the row has no nulls. - let keep_mask = result.values() | &!&row_has_nulls; - let new_validity = match result.nulls() { - Some(n) => n.inner() & &keep_mask, - None => keep_mask, - }; - - Ok(BooleanArray::new( - result.values().clone(), - Some(NullBuffer::new(new_validity)), - )) -} - -/// Returns a per-row bitmap where bit i is set if row i's list contains any null element. -fn compute_row_has_nulls(haystack: &dyn Array) -> Result { - match haystack.data_type() { - DataType::List(_) => generic_list_row_has_nulls(haystack.as_list::()), - DataType::LargeList(_) => generic_list_row_has_nulls(haystack.as_list::()), - DataType::FixedSizeList(_, _) => { - let list = haystack.as_fixed_size_list(); - let buf = match list.values().nulls() { - Some(nulls) => { - let validity = nulls.inner(); - let vl = list.value_length() as usize; - let mut builder = BooleanBufferBuilder::new(list.len()); - for i in 0..list.len() { - builder.append(validity.slice(i * vl, vl).count_set_bits() < vl); - } - builder.finish() - } - None => BooleanBuffer::new_unset(list.len()), - }; - Ok(mask_with_list_nulls(buf, list.nulls())) - } - dt => exec_err!("compute_row_has_nulls: unsupported data type {dt}"), - } -} - -/// Computes per-row null presence for `List` and `LargeList` arrays. 
-fn generic_list_row_has_nulls( - list: &GenericListArray, -) -> Result { - let buf = match list.values().nulls() { - Some(nulls) => { - let validity = nulls.inner(); - let offsets = list.offsets(); - let mut builder = BooleanBufferBuilder::new(list.len()); - for i in 0..list.len() { - let s = offsets[i].as_usize(); - let len = offsets[i + 1].as_usize() - s; - builder.append(validity.slice(s, len).count_set_bits() < len); - } - builder.finish() - } - None => BooleanBuffer::new_unset(list.len()), - }; - Ok(mask_with_list_nulls(buf, list.nulls())) -} - -/// Rows where the list itself is null should not be marked as "has nulls". -fn mask_with_list_nulls( - buf: BooleanBuffer, - list_nulls: Option<&NullBuffer>, -) -> BooleanBuffer { - match list_nulls { - Some(n) => &buf & n.inner(), - None => buf, - } -} diff --git a/datafusion/sqllogictest/src/test_file 2.rs b/datafusion/sqllogictest/src/test_file 2.rs deleted file mode 100644 index c44cae133639b..0000000000000 --- a/datafusion/sqllogictest/src/test_file 2.rs +++ /dev/null @@ -1,186 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -use std::collections::HashMap; -use std::ffi::OsStr; -use std::path::{Path, PathBuf}; -use std::sync::LazyLock; - -/// Represents a parsed test file -/// -/// Note there is a custom Ord implementation that sorts test files by: -/// 1. Hard coded test priority (lower runs first), -/// 2. Relative path as deterministic tie-breaker. -#[derive(Debug, PartialEq, Eq)] -pub struct TestFile { - /// The absolute path to the file - pub path: PathBuf, - /// The relative path of the file (used for display) - pub relative_path: PathBuf, -} - -impl TestFile { - /// Create a new [`TestFile`] from the given path, stripping any of the - /// known test directory prefixes for the relative path. - pub fn new(path: PathBuf, prefixes: &[&str]) -> Self { - let p = path.to_string_lossy(); - for prefix in prefixes { - if p.starts_with(prefix) { - let relative_path = PathBuf::from(p.strip_prefix(prefix).unwrap()); - return Self { - path, - relative_path, - }; - } - } - let relative_path = PathBuf::from(""); - - Self { - path, - relative_path, - } - } - - /// Returns true if the file has a .slt extension, indicating it is a sqllogictest file. - pub fn is_slt_file(&self) -> bool { - self.path.extension() == Some(OsStr::new("slt")) - } - - /// Returns true if the relative path starts with the given prefix, which - /// can be used to filter tests by subdirectory or filename patterns. 
- pub fn relative_path_starts_with(&self, prefix: impl AsRef) -> bool { - self.relative_path.starts_with(prefix) - } -} - -impl PartialOrd for TestFile { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl Ord for TestFile { - fn cmp(&self, other: &Self) -> std::cmp::Ordering { - let self_path = &self.relative_path; - let other_path = &other.relative_path; - - let priority_self = TEST_PRIORITY.get(self_path).unwrap_or(&DEFAULT_PRIORITY); - let priority_other = TEST_PRIORITY.get(other_path).unwrap_or(&DEFAULT_PRIORITY); - - priority_self - .cmp(priority_other) - .then_with(|| self_path.cmp(other_path)) // Tie-breaker: lexicographic order of relative paths. - // Final tie-breaker keeps Ord consistent with Eq when relative paths collide. - .then_with(|| self.path.cmp(&other.path)) - } -} - -/// TEST PRIORITY -/// -/// Heuristically prioritize some test to run earlier. -/// -/// Prioritizes test to run earlier if they are known to be long running (as -/// each test file itself is run sequentially, but multiple test files are run -/// in parallel. -/// -/// Tests not listed here will run after the listed tests in deterministic -/// lexicographic order by relative path. -/// -/// You can find the top longest running tests by running `--timing-summary` -/// mode. For example -/// -/// ```shell -/// $ cargo test --profile=ci --test sqllogictests -- --timing-summary top -/// ... -/// Per-file elapsed summary (deterministic): -/// 1. 3.568s aggregate.slt -/// 2. 3.464s joins.slt -/// 3. 3.336s imdb.slt -/// 4. 3.085s push_down_filter_regression.slt -/// 5. 2.926s aggregate_skip_partial.slt -/// 6. 2.453s array.slt -/// 7. 2.399s window.slt -/// 8. 2.198s group_by.slt -/// 9. 1.281s clickbench.slt -/// 10. 
1.058s datetime/timestamps.slt -/// ``` -const TEST_PRIORITY_ENTRIES: &[&str] = &[ - "aggregate.slt", // longest-running files go first - "joins.slt", - "imdb.slt", - "push_down_filter_regression.slt", - "aggregate_skip_partial.slt", - "array.slt", - "window.slt", - "group_by.slt", - "clickbench.slt", - "datetime/timestamps.slt", -]; - -/// Default priority for tests not in the priority map. Tests with lower -/// priority values run first. -const DEFAULT_PRIORITY: usize = 100; - -static TEST_PRIORITY: LazyLock> = LazyLock::new(|| { - TEST_PRIORITY_ENTRIES - .iter() - .enumerate() - .map(|(priority, path)| (PathBuf::from(path), priority)) - .collect() -}); - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn prioritized_files_are_first() { - let mut input = vec!["z_unlisted.slt", "a_unlisted.slt"]; - input.extend(TEST_PRIORITY_ENTRIES.iter()); - input.push("q_unlisted.slt"); - - let mut sorted = to_test_files(input); - sorted.sort_unstable(); - - println!("Sorted input: {sorted:?}"); - - // the prioritized files should be first, in the order specified by TEST_PRIORITY_ENTRIES - for file in sorted.iter().take(TEST_PRIORITY_ENTRIES.len()) { - assert!( - TEST_PRIORITY.contains_key(&file.relative_path), - "Expected prioritized file {file:?} not found in input {sorted:?}" - ); - } - // last three files should be the unlisted ones in deterministic order - let expected_files = - to_test_files(["a_unlisted.slt", "q_unlisted.slt", "z_unlisted.slt"]); - assert!( - sorted.ends_with(&expected_files), - "Expected unlisted files {expected_files:?} at the end in deterministic order of {sorted:?}" - ); - } - - fn to_test_files<'a>(files: impl IntoIterator) -> Vec { - files - .into_iter() - .map(|f| TestFile { - path: PathBuf::from(f), - relative_path: PathBuf::from(f), - }) - .collect() - } -} diff --git a/datafusion/sqllogictest/test_files/aggregates_simplify 2.slt b/datafusion/sqllogictest/test_files/aggregates_simplify 2.slt deleted file mode 100644 index 
9aa3ecf7a29f8..0000000000000 --- a/datafusion/sqllogictest/test_files/aggregates_simplify 2.slt +++ /dev/null @@ -1,358 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -####### -# Tests for aggregate optimizations / simplifications -####### - -statement ok -CREATE TABLE sum_simplify_t AS VALUES (1, 100), (1, 200), (2, 100), (NULL, NULL); - -# Baseline SUM of an expression -query I -SELECT SUM(column1 + 1) FROM sum_simplify_t; ----- -7 - -query TT -EXPLAIN SELECT SUM(column1 + 1) FROM sum_simplify_t; ----- -logical_plan -01)Aggregate: groupBy=[[]], aggr=[[sum(sum_simplify_t.column1 + Int64(1))]] -02)--TableScan: sum_simplify_t projection=[column1] -physical_plan -01)AggregateExec: mode=Single, gby=[], aggr=[sum(sum_simplify_t.column1 + Int64(1))] -02)--DataSourceExec: partitions=1, partition_sizes=[1] - - -# Mixed aggregate expressions with type validation -query TI -SELECT arrow_typeof(SUM(column1)), SUM(column1 + 1) FROM sum_simplify_t; ----- -Int64 7 - -query TT -EXPLAIN SELECT arrow_typeof(SUM(column1)), SUM(column1), SUM(column1 + 1) FROM sum_simplify_t; ----- -logical_plan -01)Projection: arrow_typeof(sum(sum_simplify_t.column1)), sum(sum_simplify_t.column1), sum(sum_simplify_t.column1 + Int64(1)) -02)--Aggregate: 
groupBy=[[]], aggr=[[sum(sum_simplify_t.column1), sum(sum_simplify_t.column1 + Int64(1))]] -03)----TableScan: sum_simplify_t projection=[column1] -physical_plan -01)ProjectionExec: expr=[arrow_typeof(sum(sum_simplify_t.column1)@0) as arrow_typeof(sum(sum_simplify_t.column1)), sum(sum_simplify_t.column1)@0 as sum(sum_simplify_t.column1), sum(sum_simplify_t.column1 + Int64(1))@1 as sum(sum_simplify_t.column1 + Int64(1))] -02)--AggregateExec: mode=Single, gby=[], aggr=[sum(sum_simplify_t.column1), sum(sum_simplify_t.column1 + Int64(1))] -03)----DataSourceExec: partitions=1, partition_sizes=[1] - -# Duplicate aggregate expressions -query II -SELECT SUM(column1 + 1) AS sum_plus_1_a, SUM(column1 + 1) AS sum_plus_1_b FROM sum_simplify_t; ----- -7 7 - -query TT -EXPLAIN SELECT SUM(column1 + 1) AS sum_plus_1_a, SUM(column1 + 1) AS sum_plus_1_b FROM sum_simplify_t; ----- -logical_plan -01)Projection: sum(sum_simplify_t.column1 + Int64(1)) AS sum_plus_1_a, sum(sum_simplify_t.column1 + Int64(1)) AS sum_plus_1_b -02)--Aggregate: groupBy=[[]], aggr=[[sum(sum_simplify_t.column1 + Int64(1))]] -03)----TableScan: sum_simplify_t projection=[column1] -physical_plan -01)ProjectionExec: expr=[sum(sum_simplify_t.column1 + Int64(1))@0 as sum_plus_1_a, sum(sum_simplify_t.column1 + Int64(1))@0 as sum_plus_1_b] -02)--AggregateExec: mode=Single, gby=[], aggr=[sum(sum_simplify_t.column1 + Int64(1))] -03)----DataSourceExec: partitions=1, partition_sizes=[1] - - -# constant aggregate expressions -query II -SELECT SUM(2+1), SUM(3) FROM sum_simplify_t; ----- -12 12 - -query TT -EXPLAIN SELECT SUM(2+1), SUM(3) FROM sum_simplify_t; ----- -logical_plan -01)Projection: __common_expr_1 AS sum(Int64(2) + Int64(1)), __common_expr_1 AS sum(Int64(3)) -02)--Aggregate: groupBy=[[]], aggr=[[sum(Int64(3)) AS __common_expr_1]] -03)----TableScan: sum_simplify_t projection=[] -physical_plan -01)ProjectionExec: expr=[__common_expr_1@0 as sum(Int64(2) + Int64(1)), __common_expr_1@0 as sum(Int64(3))] 
-02)--AggregateExec: mode=Single, gby=[], aggr=[__common_expr_1] -03)----DataSourceExec: partitions=1, partition_sizes=[1] - - -# Duplicated expression across multiple aggregate arguments. -query II -SELECT SUM(column1 + 1), SUM(column1 + 2) FROM sum_simplify_t; ----- -7 10 - - -query TT -EXPLAIN SELECT SUM(column1 + 1), SUM(column1 + 2) FROM sum_simplify_t; ----- -logical_plan -01)Projection: sum(sum_simplify_t.column1) + __common_expr_1 AS sum(sum_simplify_t.column1 + Int64(1)), sum(sum_simplify_t.column1) + Int64(2) * __common_expr_1 AS sum(sum_simplify_t.column1 + Int64(2)) -02)--Projection: CAST(count(sum_simplify_t.column1) AS Int64) AS __common_expr_1, sum(sum_simplify_t.column1) -03)----Aggregate: groupBy=[[]], aggr=[[sum(sum_simplify_t.column1), count(sum_simplify_t.column1)]] -04)------TableScan: sum_simplify_t projection=[column1] -physical_plan -01)ProjectionExec: expr=[sum(sum_simplify_t.column1)@0 + count(sum_simplify_t.column1)@1 as sum(sum_simplify_t.column1 + Int64(1)), sum(sum_simplify_t.column1)@0 + 2 * count(sum_simplify_t.column1)@1 as sum(sum_simplify_t.column1 + Int64(2))] -02)--AggregateExec: mode=Single, gby=[], aggr=[sum(sum_simplify_t.column1), count(sum_simplify_t.column1)] -03)----DataSourceExec: partitions=1, partition_sizes=[1] - -# Reordered expressions that still compute the same thing -query II -SELECT SUM(1 + column1), SUM(column1 + 2) FROM sum_simplify_t; ----- -7 10 - -query TT -EXPLAIN SELECT SUM(1 + column1), SUM(column1 + 2) FROM sum_simplify_t; ----- -logical_plan -01)Projection: sum(sum_simplify_t.column1) + __common_expr_1 AS sum(Int64(1) + sum_simplify_t.column1), sum(sum_simplify_t.column1) + Int64(2) * __common_expr_1 AS sum(sum_simplify_t.column1 + Int64(2)) -02)--Projection: CAST(count(sum_simplify_t.column1) AS Int64) AS __common_expr_1, sum(sum_simplify_t.column1) -03)----Aggregate: groupBy=[[]], aggr=[[sum(sum_simplify_t.column1), count(sum_simplify_t.column1)]] -04)------TableScan: sum_simplify_t 
projection=[column1] -physical_plan -01)ProjectionExec: expr=[sum(sum_simplify_t.column1)@0 + count(sum_simplify_t.column1)@1 as sum(Int64(1) + sum_simplify_t.column1), sum(sum_simplify_t.column1)@0 + 2 * count(sum_simplify_t.column1)@1 as sum(sum_simplify_t.column1 + Int64(2))] -02)--AggregateExec: mode=Single, gby=[], aggr=[sum(sum_simplify_t.column1), count(sum_simplify_t.column1)] -03)----DataSourceExec: partitions=1, partition_sizes=[1] - -# DISTINCT aggregates with different arguments -query II -SELECT SUM(DISTINCT column1 + 1), SUM(DISTINCT column1 + 2) FROM sum_simplify_t; ----- -5 7 - -query TT -EXPLAIN SELECT SUM(DISTINCT column1 + 1), SUM(DISTINCT column1 + 2) FROM sum_simplify_t; ----- -logical_plan -01)Aggregate: groupBy=[[]], aggr=[[sum(DISTINCT sum_simplify_t.column1 + Int64(1)), sum(DISTINCT sum_simplify_t.column1 + Int64(2))]] -02)--TableScan: sum_simplify_t projection=[column1] -physical_plan -01)AggregateExec: mode=Single, gby=[], aggr=[sum(DISTINCT sum_simplify_t.column1 + Int64(1)), sum(DISTINCT sum_simplify_t.column1 + Int64(2))] -02)--DataSourceExec: partitions=1, partition_sizes=[1] - -# DISTINCT and non-DISTINCT aggregates -query II -SELECT SUM(DISTINCT column1 + 1), SUM(column1 + 1) FROM sum_simplify_t; ----- -5 7 - -query TT -EXPLAIN SELECT SUM(DISTINCT column1 + 1), SUM(column1 + 1) FROM sum_simplify_t; ----- -logical_plan -01)Projection: sum(alias1) AS sum(DISTINCT sum_simplify_t.column1 + Int64(1)), sum(alias2) AS sum(sum_simplify_t.column1 + Int64(1)) -02)--Aggregate: groupBy=[[]], aggr=[[sum(alias1), sum(alias2)]] -03)----Aggregate: groupBy=[[__common_expr_1 AS alias1]], aggr=[[sum(__common_expr_1) AS alias2]] -04)------Projection: sum_simplify_t.column1 + Int64(1) AS __common_expr_1 -05)--------TableScan: sum_simplify_t projection=[column1] -physical_plan -01)ProjectionExec: expr=[sum(alias1)@0 as sum(DISTINCT sum_simplify_t.column1 + Int64(1)), sum(alias2)@1 as sum(sum_simplify_t.column1 + Int64(1))] -02)--AggregateExec: 
mode=Final, gby=[], aggr=[sum(alias1), sum(alias2)] -03)----CoalescePartitionsExec -04)------AggregateExec: mode=Partial, gby=[], aggr=[sum(alias1), sum(alias2)] -05)--------AggregateExec: mode=FinalPartitioned, gby=[alias1@0 as alias1], aggr=[alias2] -06)----------RepartitionExec: partitioning=Hash([alias1@0], 4), input_partitions=1 -07)------------AggregateExec: mode=Partial, gby=[__common_expr_1@0 as alias1], aggr=[alias2] -08)--------------ProjectionExec: expr=[column1@0 + 1 as __common_expr_1] -09)----------------DataSourceExec: partitions=1, partition_sizes=[1] - -# FILTER clauses with different aggregate arguments -query II -SELECT SUM(column1 + 1) FILTER (WHERE column1 > 1), SUM(column1 + 2) FILTER (WHERE column1 > 2) FROM sum_simplify_t; ----- -3 NULL - -query TT -EXPLAIN SELECT SUM(column1 + 1) FILTER (WHERE column1 > 1), SUM(column1 + 2) FILTER (WHERE column1 > 2) FROM sum_simplify_t; ----- -logical_plan -01)Aggregate: groupBy=[[]], aggr=[[sum(sum_simplify_t.column1 + Int64(1)) FILTER (WHERE sum_simplify_t.column1 > Int64(1)), sum(sum_simplify_t.column1 + Int64(2)) FILTER (WHERE sum_simplify_t.column1 > Int64(2))]] -02)--TableScan: sum_simplify_t projection=[column1] -physical_plan -01)AggregateExec: mode=Single, gby=[], aggr=[sum(sum_simplify_t.column1 + Int64(1)) FILTER (WHERE sum_simplify_t.column1 > Int64(1)), sum(sum_simplify_t.column1 + Int64(2)) FILTER (WHERE sum_simplify_t.column1 > Int64(2))] -02)--DataSourceExec: partitions=1, partition_sizes=[1] - -# FILTER clauses with the same aggregate argument -query II -SELECT - SUM(column1 + 1) FILTER (WHERE column1 > 1) AS filtered_sum_a, - SUM(column1 + 1) FILTER (WHERE column1 > 1) AS filtered_sum_b -FROM sum_simplify_t; ----- -3 3 - -query TT -EXPLAIN SELECT - SUM(column1 + 1) FILTER (WHERE column1 > 1) AS filtered_sum_a, - SUM(column1 + 1) FILTER (WHERE column1 > 1) AS filtered_sum_b -FROM sum_simplify_t; ----- -logical_plan -01)Projection: sum(sum_simplify_t.column1 + Int64(1)) FILTER (WHERE 
sum_simplify_t.column1 > Int64(1)) AS filtered_sum_a, sum(sum_simplify_t.column1 + Int64(1)) FILTER (WHERE sum_simplify_t.column1 > Int64(1)) AS filtered_sum_b -02)--Aggregate: groupBy=[[]], aggr=[[sum(sum_simplify_t.column1 + Int64(1)) FILTER (WHERE sum_simplify_t.column1 > Int64(1))]] -03)----TableScan: sum_simplify_t projection=[column1] -physical_plan -01)ProjectionExec: expr=[sum(sum_simplify_t.column1 + Int64(1)) FILTER (WHERE sum_simplify_t.column1 > Int64(1))@0 as filtered_sum_a, sum(sum_simplify_t.column1 + Int64(1)) FILTER (WHERE sum_simplify_t.column1 > Int64(1))@0 as filtered_sum_b] -02)--AggregateExec: mode=Single, gby=[], aggr=[sum(sum_simplify_t.column1 + Int64(1)) FILTER (WHERE sum_simplify_t.column1 > Int64(1))] -03)----DataSourceExec: partitions=1, partition_sizes=[1] - -# Same aggregate argument with different FILTER predicates -query II -SELECT SUM(column1 + 1) FILTER (WHERE column1 > 1), SUM(column1 + 1) FILTER (WHERE column1 > 0) FROM sum_simplify_t; ----- -3 7 - -query TT -EXPLAIN SELECT SUM(column1 + 1) FILTER (WHERE column1 > 1), SUM(column1 + 1) FILTER (WHERE column1 > 0) FROM sum_simplify_t; ----- -logical_plan -01)Aggregate: groupBy=[[]], aggr=[[sum(__common_expr_1 AS sum_simplify_t.column1 + Int64(1)) FILTER (WHERE sum_simplify_t.column1 > Int64(1)), sum(__common_expr_1 AS sum_simplify_t.column1 + Int64(1)) FILTER (WHERE sum_simplify_t.column1 > Int64(0))]] -02)--Projection: sum_simplify_t.column1 + Int64(1) AS __common_expr_1, sum_simplify_t.column1 -03)----TableScan: sum_simplify_t projection=[column1] -physical_plan -01)AggregateExec: mode=Single, gby=[], aggr=[sum(sum_simplify_t.column1 + Int64(1)) FILTER (WHERE sum_simplify_t.column1 > Int64(1)), sum(sum_simplify_t.column1 + Int64(1)) FILTER (WHERE sum_simplify_t.column1 > Int64(0))] -02)--ProjectionExec: expr=[column1@0 + 1 as __common_expr_1, column1@0 as column1] -03)----DataSourceExec: partitions=1, partition_sizes=[1] - -# volatile aggregate arguments -query B -SELECT 
SUM(random() + 1) < SUM(random() + 2) FROM sum_simplify_t; ----- -true - -query TT -EXPLAIN SELECT SUM(random() + 1) < SUM(random() + 2) FROM sum_simplify_t; ----- -logical_plan -01)Projection: sum(random() + Int64(2)) > sum(random() + Int64(1)) AS sum(random() + Int64(1)) < sum(random() + Int64(2)) -02)--Aggregate: groupBy=[[]], aggr=[[sum(random() + Float64(1)) AS sum(random() + Int64(1)), sum(random() + Float64(2)) AS sum(random() + Int64(2))]] -03)----TableScan: sum_simplify_t projection=[] -physical_plan -01)ProjectionExec: expr=[sum(random() + Int64(2))@1 > sum(random() + Int64(1))@0 as sum(random() + Int64(1)) < sum(random() + Int64(2))] -02)--AggregateExec: mode=Single, gby=[], aggr=[sum(random() + Int64(1)), sum(random() + Int64(2))] -03)----DataSourceExec: partitions=1, partition_sizes=[1] - -# Checks grouped aggregates with explicit ORDER BY return deterministic row order. -query III -SELECT column2, SUM(column1 + 1), SUM(column1 + 2) FROM sum_simplify_t GROUP BY column2 ORDER BY column2 DESC NULLS LAST; ----- -200 2 3 -100 5 7 -NULL NULL NULL - -query TT -EXPLAIN SELECT column2, SUM(column1 + 1), SUM(column1 + 2) FROM sum_simplify_t GROUP BY column2 ORDER BY column2 DESC NULLS LAST; ----- -logical_plan -01)Sort: sum_simplify_t.column2 DESC NULLS LAST -02)--Projection: sum_simplify_t.column2, sum(sum_simplify_t.column1) + __common_expr_1 AS sum(sum_simplify_t.column1 + Int64(1)), sum(sum_simplify_t.column1) + Int64(2) * __common_expr_1 AS sum(sum_simplify_t.column1 + Int64(2)) -03)----Projection: CAST(count(sum_simplify_t.column1) AS Int64) AS __common_expr_1, sum_simplify_t.column2, sum(sum_simplify_t.column1) -04)------Aggregate: groupBy=[[sum_simplify_t.column2]], aggr=[[sum(sum_simplify_t.column1), count(sum_simplify_t.column1)]] -05)--------TableScan: sum_simplify_t projection=[column1, column2] -physical_plan -01)SortPreservingMergeExec: [column2@0 DESC NULLS LAST] -02)--SortExec: expr=[column2@0 DESC NULLS LAST], preserve_partitioning=[true] 
-03)----ProjectionExec: expr=[column2@0 as column2, sum(sum_simplify_t.column1)@1 + count(sum_simplify_t.column1)@2 as sum(sum_simplify_t.column1 + Int64(1)), sum(sum_simplify_t.column1)@1 + 2 * count(sum_simplify_t.column1)@2 as sum(sum_simplify_t.column1 + Int64(2))] -04)------AggregateExec: mode=FinalPartitioned, gby=[column2@0 as column2], aggr=[sum(sum_simplify_t.column1), count(sum_simplify_t.column1)] -05)--------RepartitionExec: partitioning=Hash([column2@0], 4), input_partitions=1 -06)----------AggregateExec: mode=Partial, gby=[column2@1 as column2], aggr=[sum(sum_simplify_t.column1), count(sum_simplify_t.column1)] -07)------------DataSourceExec: partitions=1, partition_sizes=[1] - -# Checks commutative forms of equivalent aggregate arguments are simplified consistently. -query II -SELECT SUM(1 + column1), SUM(column1 + 1) FROM sum_simplify_t; ----- -7 7 - -query TT -EXPLAIN SELECT SUM(1 + column1), SUM(column1 + 1) FROM sum_simplify_t; ----- -logical_plan -01)Projection: __common_expr_1 AS sum(Int64(1) + sum_simplify_t.column1), __common_expr_1 AS sum(sum_simplify_t.column1 + Int64(1)) -02)--Projection: sum(sum_simplify_t.column1) + CAST(count(sum_simplify_t.column1) AS Int64) AS __common_expr_1 -03)----Aggregate: groupBy=[[]], aggr=[[sum(sum_simplify_t.column1), count(sum_simplify_t.column1)]] -04)------TableScan: sum_simplify_t projection=[column1] -physical_plan -01)ProjectionExec: expr=[__common_expr_1@0 as sum(Int64(1) + sum_simplify_t.column1), __common_expr_1@0 as sum(sum_simplify_t.column1 + Int64(1))] -02)--ProjectionExec: expr=[sum(sum_simplify_t.column1)@0 + count(sum_simplify_t.column1)@1 as __common_expr_1] -03)----AggregateExec: mode=Single, gby=[], aggr=[sum(sum_simplify_t.column1), count(sum_simplify_t.column1)] -04)------DataSourceExec: partitions=1, partition_sizes=[1] - -# Checks unsigned overflow edge case from PR discussion using transformed SUM arguments. 
-statement ok -CREATE TABLE IF NOT EXISTS tbl (val INTEGER UNSIGNED); - -statement ok -INSERT INTO tbl VALUES (4294967295); - -statement ok -INSERT INTO tbl VALUES (4294967295); - -# Checks transformed SUM results for unsigned max values are preserved. -query TII -SELECT arrow_typeof(SUM(val + 1)), SUM(val + 1), SUM(val + 2) FROM tbl; ----- -Int64 8589934592 8589934594 - -query TT -EXPLAIN SELECT arrow_typeof(SUM(val + 1)), SUM(val + 1), SUM(val + 2) FROM tbl; ----- -logical_plan -01)Projection: arrow_typeof(sum(tbl.val + Int64(1))), sum(tbl.val + Int64(1)), sum(tbl.val + Int64(2)) -02)--Projection: sum(tbl.val) + __common_expr_1 AS sum(tbl.val + Int64(1)), sum(tbl.val) + Int64(2) * __common_expr_1 AS sum(tbl.val + Int64(2)) -03)----Projection: CAST(count(tbl.val) AS Int64) AS __common_expr_1, sum(tbl.val) -04)------Aggregate: groupBy=[[]], aggr=[[sum(__common_expr_2 AS tbl.val), count(__common_expr_2 AS tbl.val)]] -05)--------Projection: CAST(tbl.val AS Int64) AS __common_expr_2 -06)----------TableScan: tbl projection=[val] -physical_plan -01)ProjectionExec: expr=[arrow_typeof(sum(tbl.val + Int64(1))@0) as arrow_typeof(sum(tbl.val + Int64(1))), sum(tbl.val + Int64(1))@0 as sum(tbl.val + Int64(1)), sum(tbl.val + Int64(2))@1 as sum(tbl.val + Int64(2))] -02)--ProjectionExec: expr=[sum(tbl.val)@0 + count(tbl.val)@1 as sum(tbl.val + Int64(1)), sum(tbl.val)@0 + 2 * count(tbl.val)@1 as sum(tbl.val + Int64(2))] -03)----AggregateExec: mode=Single, gby=[], aggr=[sum(tbl.val), count(tbl.val)] -04)------ProjectionExec: expr=[CAST(val@0 AS Int64) as __common_expr_2] -05)--------DataSourceExec: partitions=1, partition_sizes=[2] - -# Checks equivalent rewritten form (SUM + COUNT terms) matches transformed SUM semantics. 
-query RR -SELECT SUM(val) + 1 * COUNT(val), SUM(val) + 2 * COUNT(val) FROM tbl; ----- -8589934592 8589934594 - -query TT -EXPLAIN SELECT SUM(val) + 1 * COUNT(val), SUM(val) + 2 * COUNT(val) FROM tbl; ----- -logical_plan -01)Projection: __common_expr_1 + CAST(count(tbl.val) AS Decimal128(20, 0)) AS sum(tbl.val) + Int64(1) * count(tbl.val), __common_expr_1 AS sum(tbl.val) + CAST(Int64(2) * count(tbl.val) AS Decimal128(20, 0)) -02)--Projection: CAST(sum(tbl.val) AS Decimal128(20, 0)) AS __common_expr_1, count(tbl.val) -03)----Aggregate: groupBy=[[]], aggr=[[sum(CAST(tbl.val AS UInt64)), count(tbl.val)]] -04)------TableScan: tbl projection=[val] -physical_plan -01)ProjectionExec: expr=[__common_expr_1@0 + CAST(count(tbl.val)@1 AS Decimal128(20, 0)) as sum(tbl.val) + Int64(1) * count(tbl.val), __common_expr_1@0 + CAST(2 * count(tbl.val)@1 AS Decimal128(20, 0)) as sum(tbl.val) + Int64(2) * count(tbl.val)] -02)--ProjectionExec: expr=[CAST(sum(tbl.val)@0 AS Decimal128(20, 0)) as __common_expr_1, count(tbl.val)@1 as count(tbl.val)] -03)----AggregateExec: mode=Single, gby=[], aggr=[sum(tbl.val), count(tbl.val)] -04)------DataSourceExec: partitions=1, partition_sizes=[2] - -statement ok -DROP TABLE IF EXISTS tbl; - -statement ok -DROP TABLE sum_simplify_t; diff --git a/datafusion/sqllogictest/test_files/push_down_filter_sort_fetch 2.slt b/datafusion/sqllogictest/test_files/push_down_filter_sort_fetch 2.slt deleted file mode 100644 index ab23fff030489..0000000000000 --- a/datafusion/sqllogictest/test_files/push_down_filter_sort_fetch 2.slt +++ /dev/null @@ -1,55 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# Tests for filter pushdown behavior with Sort + LIMIT (fetch). - -statement ok -CREATE TABLE t(id INT, value INT) AS VALUES -(1, 100), -(2, 200), -(3, 300), -(4, 400), -(5, 500); - -# Take the 3 smallest values (100, 200, 300), then filter value > 200. -query II -SELECT * FROM (SELECT * FROM t ORDER BY value LIMIT 3) sub WHERE sub.value > 200; ----- -3 300 - -# Take the 3 largest values (500, 400, 300), then filter value < 400. -query II -SELECT * FROM (SELECT * FROM t ORDER BY value DESC LIMIT 3) sub WHERE sub.value < 400; ----- -3 300 - -# The filter stays above the sort+fetch in the plan. -query TT -EXPLAIN SELECT * FROM (SELECT * FROM t ORDER BY value LIMIT 3) sub WHERE sub.value > 200; ----- -logical_plan -01)SubqueryAlias: sub -02)--Filter: t.value > Int32(200) -03)----Sort: t.value ASC NULLS LAST, fetch=3 -04)------TableScan: t projection=[id, value] -physical_plan -01)FilterExec: value@1 > 200 -02)--SortExec: TopK(fetch=3), expr=[value@1 ASC NULLS LAST], preserve_partitioning=[false] -03)----DataSourceExec: partitions=1, partition_sizes=[1] - -statement ok -DROP TABLE t; diff --git a/datafusion/sqllogictest/test_files/spark/array/array_contains 2.slt b/datafusion/sqllogictest/test_files/spark/array/array_contains 2.slt deleted file mode 100644 index db9ac6b122e3f..0000000000000 --- a/datafusion/sqllogictest/test_files/spark/array/array_contains 2.slt +++ /dev/null @@ -1,140 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# Tests for Spark-compatible array_contains function. -# Spark semantics: if element is found -> true; if not found and array has nulls -> null; if not found and no nulls -> false. - -### -### Scalar tests -### - -# Element found in array -query B -SELECT array_contains(array(1, 2, 3), 2); ----- -true - -# Element not found, no nulls in array -query B -SELECT array_contains(array(1, 2, 3), 4); ----- -false - -# Element not found, array has null elements -> null -query B -SELECT array_contains(array(1, NULL, 3), 2); ----- -NULL - -# Element found, array has null elements -> true (nulls don't matter) -query B -SELECT array_contains(array(1, NULL, 3), 1); ----- -true - -# Element found at the end, array has null elements -> true -query B -SELECT array_contains(array(1, NULL, 3), 3); ----- -true - -# Null array -> null -query B -SELECT array_contains(NULL, 1); ----- -NULL - -# Null element -> null -query B -SELECT array_contains(array(1, 2, 3), NULL); ----- -NULL - -# Empty array, element not found -> false -query B -SELECT array_contains(array(), 1); ----- -false - -# Array with only nulls, element not found -> null -query B -SELECT array_contains(array(NULL, NULL), 1); ----- -NULL - -# String array, element found -query B -SELECT array_contains(array('a', 'b', 'c'), 'b'); ----- -true - -# 
String array, element not found, no nulls -query B -SELECT array_contains(array('a', 'b', 'c'), 'd'); ----- -false - -# String array, element not found, has null -query B -SELECT array_contains(array('a', NULL, 'c'), 'd'); ----- -NULL - -### -### Columnar tests with a table -### - -statement ok -CREATE TABLE test_arrays AS VALUES - (1, make_array(1, 2, 3), 10), - (2, make_array(4, NULL, 6), 5), - (3, make_array(7, 8, 9), 10), - (4, NULL, 1), - (5, make_array(10, NULL, NULL), 10); - -# Column needle against column array -query IBB -SELECT column1, - array_contains(column2, column3), - array_contains(column2, 10) -FROM test_arrays -ORDER BY column1; ----- -1 false false -2 NULL NULL -3 false false -4 NULL NULL -5 true true - -statement ok -DROP TABLE test_arrays; - -### -### Nested array tests -### - -# Nested array element found -query B -SELECT array_contains(array(array(1, 2), array(3, 4)), array(3, 4)); ----- -true - -# Nested array element not found, no nulls -query B -SELECT array_contains(array(array(1, 2), array(3, 4)), array(5, 6)); ----- -false diff --git a/datafusion/sqllogictest/test_files/window_topk_pushdown 2.slt b/datafusion/sqllogictest/test_files/window_topk_pushdown 2.slt deleted file mode 100644 index 2c33566736745..0000000000000 --- a/datafusion/sqllogictest/test_files/window_topk_pushdown 2.slt +++ /dev/null @@ -1,141 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# Tests for the TopKRepartition optimizer rule. -# -# When a partitioned window function has ORDER BY + LIMIT, the optimizer -# can push a TopK (Sort with fetch) below the hash repartition to reduce -# the volume of data flowing through the shuffle. -# -# The optimization is correct when the hash partition key is a prefix of -# the sort key, because all rows with the same partition key land in the -# same output partition. - -statement ok -CREATE EXTERNAL TABLE employees ( - depname VARCHAR NOT NULL, - c2 TINYINT NOT NULL, - c3 SMALLINT NOT NULL, - c4 SMALLINT, - c5 INT, - c6 BIGINT NOT NULL, - c7 SMALLINT NOT NULL, - empno INT NOT NULL, - salary BIGINT UNSIGNED NOT NULL, - c10 VARCHAR NOT NULL, - c11 FLOAT NOT NULL, - c12 DOUBLE NOT NULL, - c13 VARCHAR NOT NULL, - hire_date DATE NOT NULL, - c15 TIMESTAMP NOT NULL -) -STORED AS CSV -LOCATION '../../testing/data/csv/aggregate_test_100_with_dates.csv' -OPTIONS ('format.has_header' 'true'); - -# Use multiple partitions to trigger hash repartitioning for the window function -statement ok -SET datafusion.execution.target_partitions = 4; - -### -### Results correctness: both enabled and disabled must produce the same output -### - -# Disabled: baseline results without the optimization -statement ok -SET datafusion.optimizer.enable_topk_repartition = false; - -query TI -SELECT depname, SUM(1) OVER (PARTITION BY depname ORDER BY empno ASC ROWS UNBOUNDED PRECEDING) as running_total -FROM employees -ORDER BY depname, empno -LIMIT 3; ----- -a 1 -a 2 -a 3 - -# Enabled: results must match baseline -statement ok -SET 
datafusion.optimizer.enable_topk_repartition = true; - -query TI -SELECT depname, SUM(1) OVER (PARTITION BY depname ORDER BY empno ASC ROWS UNBOUNDED PRECEDING) as running_total -FROM employees -ORDER BY depname, empno -LIMIT 3; ----- -a 1 -a 2 -a 3 - -### -### Plan shape: disabled should have TopK only above repartition -### - -statement ok -SET datafusion.optimizer.enable_topk_repartition = false; - -query TT -EXPLAIN SELECT depname, SUM(1) OVER (PARTITION BY depname ORDER BY empno ASC ROWS UNBOUNDED PRECEDING) as running_total -FROM employees -ORDER BY depname, empno -LIMIT 3; ----- -logical_plan -01)Projection: employees.depname, running_total -02)--Sort: employees.depname ASC NULLS LAST, employees.empno ASC NULLS LAST, fetch=3 -03)----Projection: employees.depname, sum(Int64(1)) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS running_total, employees.empno -04)------WindowAggr: windowExpr=[[sum(Int64(1)) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]] -05)--------TableScan: employees projection=[depname, empno] -physical_plan -01)ProjectionExec: expr=[depname@0 as depname, running_total@1 as running_total] -02)--SortPreservingMergeExec: [depname@0 ASC NULLS LAST, empno@2 ASC NULLS LAST], fetch=3 -03)----ProjectionExec: expr=[depname@0 as depname, sum(Int64(1)) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as running_total, empno@1 as empno] -04)------BoundedWindowAggExec: wdw=[sum(Int64(1)) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(Int64(1)) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT 
ROW], mode=[Sorted] -05)--------SortExec: TopK(fetch=3), expr=[depname@0 ASC NULLS LAST, empno@1 ASC NULLS LAST], preserve_partitioning=[true] -06)----------RepartitionExec: partitioning=Hash([depname@0], 4), input_partitions=1 -07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[depname, empno], file_type=csv, has_header=true - -### -### Plan shape: enabled should have TopK on BOTH sides of the repartition -### - -statement ok -SET datafusion.optimizer.enable_topk_repartition = true; - -query TT -EXPLAIN SELECT depname, SUM(1) OVER (PARTITION BY depname ORDER BY empno ASC ROWS UNBOUNDED PRECEDING) as running_total -FROM employees -ORDER BY depname, empno -LIMIT 3; ----- -logical_plan -01)Projection: employees.depname, running_total -02)--Sort: employees.depname ASC NULLS LAST, employees.empno ASC NULLS LAST, fetch=3 -03)----Projection: employees.depname, sum(Int64(1)) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS running_total, employees.empno -04)------WindowAggr: windowExpr=[[sum(Int64(1)) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]] -05)--------TableScan: employees projection=[depname, empno] -physical_plan -01)ProjectionExec: expr=[depname@0 as depname, running_total@1 as running_total] -02)--SortPreservingMergeExec: [depname@0 ASC NULLS LAST, empno@2 ASC NULLS LAST], fetch=3 -03)----ProjectionExec: expr=[depname@0 as depname, sum(Int64(1)) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as running_total, empno@1 as empno] -04)------BoundedWindowAggExec: wdw=[sum(Int64(1)) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(Int64(1)) PARTITION BY 
[employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] -05)--------SortExec: TopK(fetch=3), expr=[depname@0 ASC NULLS LAST, empno@1 ASC NULLS LAST], preserve_partitioning=[true], sort_prefix=[depname@0 ASC NULLS LAST, empno@1 ASC NULLS LAST] -06)----------RepartitionExec: partitioning=Hash([depname@0], 4), input_partitions=1, maintains_sort_order=true -07)------------SortExec: TopK(fetch=3), expr=[depname@0 ASC NULLS LAST, empno@1 ASC NULLS LAST], preserve_partitioning=[true] -08)--------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[depname, empno], file_type=csv, has_header=true diff --git a/datafusion/substrait/src/logical_plan/consumer/expr/nested 2.rs b/datafusion/substrait/src/logical_plan/consumer/expr/nested 2.rs deleted file mode 100644 index f94a701342826..0000000000000 --- a/datafusion/substrait/src/logical_plan/consumer/expr/nested 2.rs +++ /dev/null @@ -1,151 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -use crate::logical_plan::consumer::SubstraitConsumer; -use datafusion::common::{DFSchema, not_impl_err, substrait_err}; -use datafusion::execution::FunctionRegistry; -use datafusion::logical_expr::Expr; -use substrait::proto::expression::Nested; -use substrait::proto::expression::nested::NestedType; - -/// Converts a Substrait [Nested] expression into a DataFusion [Expr]. -/// -/// Substrait Nested expressions represent complex type constructors (list, struct, map) -/// where elements are full expressions rather than just literals. This is used by -/// producers that emit `Nested { list: ... }` for array construction, as opposed to -/// `Literal { list: ... }` which only supports scalar values. -pub async fn from_nested( - consumer: &impl SubstraitConsumer, - nested: &Nested, - input_schema: &DFSchema, -) -> datafusion::common::Result { - let Some(nested_type) = &nested.nested_type else { - return substrait_err!("Nested expression requires a nested_type"); - }; - - match nested_type { - NestedType::List(list) => { - if list.values.is_empty() { - return substrait_err!( - "Empty Nested lists are not supported; use Literal.empty_list instead" - ); - } - - let mut args = Vec::with_capacity(list.values.len()); - for value in &list.values { - args.push(consumer.consume_expression(value, input_schema).await?); - } - - let make_array_udf = consumer.get_function_registry().udf("make_array")?; - Ok(Expr::ScalarFunction( - datafusion::logical_expr::expr::ScalarFunction::new_udf( - make_array_udf, - args, - ), - )) - } - NestedType::Struct(_) => { - not_impl_err!("Nested struct expressions are not yet supported") - } - NestedType::Map(_) => { - not_impl_err!("Nested map expressions are not yet supported") - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::logical_plan::consumer::utils::tests::test_consumer; - use substrait::proto::expression::Literal; - use substrait::proto::expression::nested::List; - use substrait::proto::{self, Expression}; - - fn 
make_i64_literal(value: i64) -> Expression { - Expression { - rex_type: Some(proto::expression::RexType::Literal(Literal { - nullable: false, - type_variation_reference: 0, - literal_type: Some(proto::expression::literal::LiteralType::I64(value)), - })), - } - } - - #[tokio::test] - async fn nested_list_with_literals() -> datafusion::common::Result<()> { - let consumer = test_consumer(); - let schema = DFSchema::empty(); - let nested = Nested { - nullable: false, - type_variation_reference: 0, - nested_type: Some(NestedType::List(List { - values: vec![ - make_i64_literal(1), - make_i64_literal(2), - make_i64_literal(3), - ], - })), - }; - - let expr = from_nested(&consumer, &nested, &schema).await?; - assert_eq!( - format!("{expr}"), - "make_array(Int64(1), Int64(2), Int64(3))" - ); - - Ok(()) - } - - #[tokio::test] - async fn nested_list_empty_rejected() -> datafusion::common::Result<()> { - let consumer = test_consumer(); - let schema = DFSchema::empty(); - let nested = Nested { - nullable: true, - type_variation_reference: 0, - nested_type: Some(NestedType::List(List { values: vec![] })), - }; - - let result = from_nested(&consumer, &nested, &schema).await; - assert!(result.is_err()); - assert!( - result - .unwrap_err() - .to_string() - .contains("Empty Nested lists are not supported") - ); - - Ok(()) - } - - #[tokio::test] - async fn nested_missing_type() -> datafusion::common::Result<()> { - let consumer = test_consumer(); - let schema = DFSchema::empty(); - let nested = Nested { - nullable: false, - type_variation_reference: 0, - nested_type: None, - }; - - let result = from_nested(&consumer, &nested, &schema).await; - assert!(result.is_err()); - assert!(result.unwrap_err().to_string().contains("nested_type")); - - Ok(()) - } -} diff --git a/datafusion/substrait/tests/testdata/test_plans/nested_list_expressions.substrait 2.json b/datafusion/substrait/tests/testdata/test_plans/nested_list_expressions.substrait 2.json deleted file mode 100644 index 
85a69c41c5eb1..0000000000000 --- a/datafusion/substrait/tests/testdata/test_plans/nested_list_expressions.substrait 2.json +++ /dev/null @@ -1,77 +0,0 @@ -{ - "relations": [ - { - "root": { - "input": { - "project": { - "common": { - "emit": { - "outputMapping": [2] - } - }, - "input": { - "read": { - "common": { - "direct": {} - }, - "baseSchema": { - "names": ["a", "b"], - "struct": { - "types": [ - { - "i32": { - "nullability": "NULLABILITY_NULLABLE" - } - }, - { - "i32": { - "nullability": "NULLABILITY_NULLABLE" - } - } - ], - "nullability": "NULLABILITY_REQUIRED" - } - }, - "namedTable": { - "names": ["DATA"] - } - } - }, - "expressions": [ - { - "nested": { - "nullable": false, - "list": { - "values": [ - { - "selection": { - "directReference": { - "structField": { - "field": 0 - } - }, - "rootReference": {} - } - }, - { - "selection": { - "directReference": { - "structField": { - "field": 1 - } - }, - "rootReference": {} - } - } - ] - } - } - } - ] - } - }, - "names": ["my_list"] - } - } - ] -} diff --git a/dev/changelog/52.2.0 2.md b/dev/changelog/52.2.0 2.md deleted file mode 100644 index 0801ec5e6a7ee..0000000000000 --- a/dev/changelog/52.2.0 2.md +++ /dev/null @@ -1,47 +0,0 @@ - - -# Apache DataFusion 52.2.0 Changelog - -This release consists of 5 commits from 3 contributors. See credits at the end of this changelog for more information. - -See the [upgrade guide](https://datafusion.apache.org/library-user-guide/upgrading.html) for information on how to upgrade from previous versions. 
- -**Other:** - -- [branch-52] fix: filter pushdown when merge filter (#20110) [#20289](https://github.com/apache/datafusion/pull/20289) (haohuaijin) -- [branch-52] FilterExec should remap indices of parent dynamic filters (#20286) [#20347](https://github.com/apache/datafusion/pull/20347) (alamb) -- [branch-52] fix: validate inter-file ordering in eq_properties() (#20329) [#20509](https://github.com/apache/datafusion/pull/20509) (alamb) -- Fix name tracker (#19856) [#20539](https://github.com/apache/datafusion/pull/20539) (hareshkh) -- [branch-52] fix: HashJoin panic with dictionary-encoded columns in multi-key joins (#20441) [#20512](https://github.com/apache/datafusion/pull/20512) (alamb) -- [branch-52] Fix incorrect `SortExec` removal before `AggregateExec` (#20247) [#20507](https://github.com/apache/datafusion/pull/20507) (alamb) -- [branch-52] Update aws-smithy, bytes and time for security audits [#20546](https://github.com/apache/datafusion/pull/20546) (alamb) -- [branch-52] Clamp early aggregation emit to the sort boundary when using partial group ordering (#20446) [#20558](https://github.com/apache/datafusion/pull/20558) (alamb) - -## Credits - -Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor. - -``` - 3 Andrew Lamb - 1 Haresh Khanna - 1 Huaijin -``` - -Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release. diff --git a/dev/changelog/52.3.0 2.md b/dev/changelog/52.3.0 2.md deleted file mode 100644 index ed505b7fc2d0a..0000000000000 --- a/dev/changelog/52.3.0 2.md +++ /dev/null @@ -1,50 +0,0 @@ - - -# Apache DataFusion 52.3.0 Changelog - -This release consists of 7 commits from 4 contributors. See credits at the end of this changelog for more information. - -See the [upgrade guide](https://datafusion.apache.org/library-user-guide/upgrading.html) for information on how to upgrade from previous versions. 
- -**Performance related:** - -- [branch-52] perf: sort replace free()->try_grow() pattern with try_resize() to reduce memory pool interactions [#20732](https://github.com/apache/datafusion/pull/20732) (mbutrovich) - -**Other:** - -- [branch-52] Backport fix: SortMergeJoin don't wait for all input before emitting #20482 [#20699](https://github.com/apache/datafusion/pull/20699) (mbutrovich) -- [branch-52] Fix Arrow Spill Underrun (#20159) [#20684](https://github.com/apache/datafusion/pull/20684) (hareshkh) -- [branch-52] Fix constant value from stats (#20042) [#20709](https://github.com/apache/datafusion/pull/20709) (alamb) -- [branch-52] fix: `HashJoin` panic with String dictionary keys (don't flatten keys) (#20505) [#20708](https://github.com/apache/datafusion/pull/20708) (alamb) -- [branch-52] FFI_TableOptions are using default values only [#20705](https://github.com/apache/datafusion/pull/20705) (timsaucer) -- [branch-52] Fix repartition from dropping data when spilling (#20672) [#20777](https://github.com/apache/datafusion/pull/20777) (alamb) - -## Credits - -Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor. - -``` - 3 Andrew Lamb - 2 Matt Butrovich - 1 Haresh Khanna - 1 Tim Saucer -``` - -Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release. diff --git a/docs/source/library-user-guide/upgrading/54.0.0 2.md b/docs/source/library-user-guide/upgrading/54.0.0 2.md deleted file mode 100644 index 77b4fb6f71a35..0000000000000 --- a/docs/source/library-user-guide/upgrading/54.0.0 2.md +++ /dev/null @@ -1,124 +0,0 @@ - - -# Upgrade Guides - -## DataFusion 54.0.0 - -**Note:** DataFusion `54.0.0` has not been released yet. The information provided -in this section pertains to features and changes that have already been merged -to the main branch and are awaiting release in this version. 
- -### `ExecutionPlan::apply_expressions` is now a required method - -`apply_expressions` has been added as a **required** method on the `ExecutionPlan` trait (no default implementation). The same applies to the `FileSource` and `DataSource` traits. Any custom implementation of these traits must now implement `apply_expressions`. - -**Who is affected:** - -- Users who implement custom `ExecutionPlan` nodes -- Users who implement custom `FileSource` or `DataSource` sources - -**Migration guide:** - -Add `apply_expressions` to your implementation. Call `f` on each top-level `PhysicalExpr` your node owns, using `visit_sibling` to correctly propagate `TreeNodeRecursion`: - -**Node with no expressions:** - -```rust,ignore -fn apply_expressions( - &self, - _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result, -) -> Result { - Ok(TreeNodeRecursion::Continue) -} -``` - -**Node with a single expression:** - -```rust,ignore -fn apply_expressions( - &self, - f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result, -) -> Result { - f(self.predicate.as_ref()) -} -``` - -**Node with multiple expressions:** - -```rust,ignore -fn apply_expressions( - &self, - f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result, -) -> Result { - let mut tnr = TreeNodeRecursion::Continue; - for expr in &self.expressions { - tnr = tnr.visit_sibling(|| f(expr.as_ref()))?; - } - Ok(tnr) -} -``` - -**Node whose only expressions are in `output_ordering()` (e.g. a synthetic test node with no owned expression fields):** - -````rust,ignore -fn apply_expressions( - &self, - f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result, -) -> Result { - let mut tnr = TreeNodeRecursion::Continue; - if let Some(ordering) = self.cache.output_ordering() { - for sort_expr in ordering { - tnr = tnr.visit_sibling(|| f(sort_expr.expr.as_ref()))?; - } - } - Ok(tnr) -} - -### `ExecutionPlan::partition_statistics` now returns `Arc` - -`ExecutionPlan::partition_statistics` now returns `Result>` instead of `Result`. 
This avoids cloning `Statistics` when it is shared across multiple consumers. - -**Before:** - -```rust,ignore -fn partition_statistics(&self, partition: Option) -> Result { - Ok(Statistics::new_unknown(&self.schema())) -} -```` - -**After:** - -```rust,ignore -fn partition_statistics(&self, partition: Option) -> Result> { - Ok(Arc::new(Statistics::new_unknown(&self.schema()))) -} -``` - -If you need an owned `Statistics` value (e.g. to mutate it), use `Arc::unwrap_or_clone`: - -```rust,ignore -// If you previously consumed the Statistics directly: -let stats = plan.partition_statistics(None)?; -stats.column_statistics[0].min_value = ...; - -// Now unwrap the Arc first: -let mut stats = Arc::unwrap_or_clone(plan.partition_statistics(None)?); -stats.column_statistics[0].min_value = ...; -``` From b87fd67e621b08d68931f1cde04af6c9b33a4da3 Mon Sep 17 00:00:00 2001 From: Shiv Bhatia Date: Sat, 21 Mar 2026 11:49:56 +0000 Subject: [PATCH 5/7] Use helper function for tests in sort.rs --- datafusion/physical-plan/src/sorts/sort.rs | 107 ++++++--------------- 1 file changed, 32 insertions(+), 75 deletions(-) diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index e4a2024effece..4f47fb8baa2af 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -2876,110 +2876,67 @@ mod tests { Ok(()) } - #[test] - fn test_sort_with_fetch_blocks_filter_pushdown() -> Result<()> { + fn make_sort_exec_with_fetch(fetch: Option) -> SortExec { let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])); - let input = Arc::new(EmptyExec::new(Arc::clone(&schema))); - let sort = SortExec::new( + let input = Arc::new(EmptyExec::new(schema)); + SortExec::new( [PhysicalSortExpr::new_default(Arc::new(Column::new("a", 0)))].into(), input, ) - .with_fetch(Some(10)); - - let parent_filter: Arc = Arc::new(Column::new("a", 0)); - let config = ConfigOptions::new(); + .with_fetch(fetch) 
+ } + #[test] + fn test_sort_with_fetch_blocks_filter_pushdown() -> Result<()> { + let sort = make_sort_exec_with_fetch(Some(10)); let desc = sort.gather_filters_for_pushdown( FilterPushdownPhase::Pre, - vec![parent_filter], - &config, + vec![Arc::new(Column::new("a", 0))], + &ConfigOptions::new(), )?; - - // Parent filter must be unsupported — it must not be pushed below - // a sort with fetch (TopK). - let parent_filters = desc.parent_filters(); - assert_eq!(parent_filters.len(), 1); - assert_eq!(parent_filters[0].len(), 1); - assert!( - matches!(parent_filters[0][0].discriminant, PushedDown::No), - "Parent filter should be unsupported when sort has fetch" - ); - + // Sort with fetch (TopK) must not allow filters to be pushed below it. + assert!(matches!( + desc.parent_filters()[0][0].discriminant, + PushedDown::No + )); Ok(()) } #[test] fn test_sort_without_fetch_allows_filter_pushdown() -> Result<()> { - let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])); - let input = Arc::new(EmptyExec::new(Arc::clone(&schema))); - let sort = SortExec::new( - [PhysicalSortExpr::new_default(Arc::new(Column::new("a", 0)))].into(), - input, - ); - - let parent_filter: Arc = Arc::new(Column::new("a", 0)); - let config = ConfigOptions::new(); - + let sort = make_sort_exec_with_fetch(None); let desc = sort.gather_filters_for_pushdown( FilterPushdownPhase::Pre, - vec![parent_filter], - &config, + vec![Arc::new(Column::new("a", 0))], + &ConfigOptions::new(), )?; - - // Parent filter should be supported — plain sort (no fetch) is - // filter-commutative. - let parent_filters = desc.parent_filters(); - assert_eq!(parent_filters.len(), 1); - assert_eq!(parent_filters[0].len(), 1); - assert!( - matches!(parent_filters[0][0].discriminant, PushedDown::Yes), - "Parent filter should be supported when sort has no fetch" - ); - + // Plain sort (no fetch) is filter-commutative. 
+ assert!(matches!( + desc.parent_filters()[0][0].discriminant, + PushedDown::Yes + )); Ok(()) } #[test] fn test_sort_with_fetch_allows_topk_self_filter_in_post_phase() -> Result<()> { - let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])); - let input = Arc::new(EmptyExec::new(Arc::clone(&schema))); - let sort = SortExec::new( - [PhysicalSortExpr::new_default(Arc::new(Column::new("a", 0)))].into(), - input, - ) - .with_fetch(Some(10)); - - // with_fetch(Some(_)) creates the TopK dynamic filter automatically. + let sort = make_sort_exec_with_fetch(Some(10)); assert!(sort.filter.is_some(), "TopK filter should be created"); - let parent_filter: Arc = Arc::new(Column::new("a", 0)); let mut config = ConfigOptions::new(); config.optimizer.enable_topk_dynamic_filter_pushdown = true; - let desc = sort.gather_filters_for_pushdown( FilterPushdownPhase::Post, - vec![parent_filter], + vec![Arc::new(Column::new("a", 0))], &config, )?; - - // Parent filters should be blocked in Post phase when fetch is set. - let parent_filters = desc.parent_filters(); - assert_eq!(parent_filters.len(), 1); - assert_eq!(parent_filters[0].len(), 1); - assert!( - matches!(parent_filters[0][0].discriminant, PushedDown::No), - "Parent filter should be unsupported in Post phase when sort has fetch" - ); - - // The TopK self-filter should still be allowed through. - let self_filters = desc.self_filters(); - assert_eq!(self_filters.len(), 1); - assert_eq!( - self_filters[0].len(), - 1, - "TopK dynamic self-filter should be pushed down" - ); - + // Parent filters are still blocked in the Post phase. + assert!(matches!( + desc.parent_filters()[0][0].discriminant, + PushedDown::No + )); + // But the TopK self-filter should be pushed down. 
+ assert_eq!(desc.self_filters()[0].len(), 1); Ok(()) } } From eee6542bb539e9fdf8094e21c944018853464e3a Mon Sep 17 00:00:00 2001 From: Shiv Bhatia Date: Sat, 21 Mar 2026 12:01:12 +0000 Subject: [PATCH 6/7] Move filter pushdown + sort/fetch tests into limit.slt --- datafusion/sqllogictest/test_files/limit.slt | 39 ++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/datafusion/sqllogictest/test_files/limit.slt b/datafusion/sqllogictest/test_files/limit.slt index ff3c49485a286..f5ec26d304d41 100644 --- a/datafusion/sqllogictest/test_files/limit.slt +++ b/datafusion/sqllogictest/test_files/limit.slt @@ -869,6 +869,45 @@ limit 1000; statement ok DROP TABLE test_limit_with_partitions; +# Tests for filter pushdown behavior with Sort + LIMIT (fetch). + +statement ok +CREATE TABLE t(id INT, value INT) AS VALUES +(1, 100), +(2, 200), +(3, 300), +(4, 400), +(5, 500); + +# Take the 3 smallest values (100, 200, 300), then filter value > 200. +query II +SELECT * FROM (SELECT * FROM t ORDER BY value LIMIT 3) sub WHERE sub.value > 200; +---- +3 300 + +# Take the 3 largest values (500, 400, 300), then filter value < 400. +query II +SELECT * FROM (SELECT * FROM t ORDER BY value DESC LIMIT 3) sub WHERE sub.value < 400; +---- +3 300 + +# The filter stays above the sort+fetch in the plan. 
+query TT +EXPLAIN SELECT * FROM (SELECT * FROM t ORDER BY value LIMIT 3) sub WHERE sub.value > 200; +---- +logical_plan +01)SubqueryAlias: sub +02)--Filter: t.value > Int32(200) +03)----Sort: t.value ASC NULLS LAST, fetch=3 +04)------TableScan: t projection=[id, value] +physical_plan +01)FilterExec: value@1 > 200 +02)--SortExec: TopK(fetch=3), expr=[value@1 ASC NULLS LAST], preserve_partitioning=[false] +03)----DataSourceExec: partitions=1, partition_sizes=[1] + +statement ok +DROP TABLE t; + # Tear down src_table table: statement ok DROP TABLE src_table; From 2a42cf011f1d6ad319e4786eecf6139c68136e93 Mon Sep 17 00:00:00 2001 From: Shiv Bhatia Date: Sat, 21 Mar 2026 12:01:28 +0000 Subject: [PATCH 7/7] Remove push_down_filter_sort_fetch.slt, tests moved to limit.slt --- .../push_down_filter_sort_fetch.slt | 55 ------------------- 1 file changed, 55 deletions(-) delete mode 100644 datafusion/sqllogictest/test_files/push_down_filter_sort_fetch.slt diff --git a/datafusion/sqllogictest/test_files/push_down_filter_sort_fetch.slt b/datafusion/sqllogictest/test_files/push_down_filter_sort_fetch.slt deleted file mode 100644 index ab23fff030489..0000000000000 --- a/datafusion/sqllogictest/test_files/push_down_filter_sort_fetch.slt +++ /dev/null @@ -1,55 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. - -# Tests for filter pushdown behavior with Sort + LIMIT (fetch). - -statement ok -CREATE TABLE t(id INT, value INT) AS VALUES -(1, 100), -(2, 200), -(3, 300), -(4, 400), -(5, 500); - -# Take the 3 smallest values (100, 200, 300), then filter value > 200. -query II -SELECT * FROM (SELECT * FROM t ORDER BY value LIMIT 3) sub WHERE sub.value > 200; ----- -3 300 - -# Take the 3 largest values (500, 400, 300), then filter value < 400. -query II -SELECT * FROM (SELECT * FROM t ORDER BY value DESC LIMIT 3) sub WHERE sub.value < 400; ----- -3 300 - -# The filter stays above the sort+fetch in the plan. -query TT -EXPLAIN SELECT * FROM (SELECT * FROM t ORDER BY value LIMIT 3) sub WHERE sub.value > 200; ----- -logical_plan -01)SubqueryAlias: sub -02)--Filter: t.value > Int32(200) -03)----Sort: t.value ASC NULLS LAST, fetch=3 -04)------TableScan: t projection=[id, value] -physical_plan -01)FilterExec: value@1 > 200 -02)--SortExec: TopK(fetch=3), expr=[value@1 ASC NULLS LAST], preserve_partitioning=[false] -03)----DataSourceExec: partitions=1, partition_sizes=[1] - -statement ok -DROP TABLE t;