From cab96a64cb7663afce710ab8d4bbc08fe09da5ca Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Fri, 13 Mar 2026 18:17:00 -0400 Subject: [PATCH 1/2] fix stale doc --- benchmarks/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index 7e9818aef24f1..3aa4f4bb8640c 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -879,13 +879,13 @@ The benchmark includes queries that: The sorted dataset is automatically generated from the ClickBench partitioned dataset. You can configure the memory used during the sorting process with the `DATAFUSION_MEMORY_GB` environment variable. The default memory limit is 12GB. ```bash -./bench.sh data data_sorted_clickbench +./bench.sh data clickbench_sorted ``` To create the sorted dataset, for example with 16GB of memory, run: ```bash -DATAFUSION_MEMORY_GB=16 ./bench.sh data data_sorted_clickbench +DATAFUSION_MEMORY_GB=16 ./bench.sh data clickbench_sorted ``` This command will: @@ -896,7 +896,7 @@ This command will: #### Running the Benchmark ```bash -./bench.sh run data_sorted_clickbench +./bench.sh run clickbench_sorted ``` This runs queries against the pre-sorted dataset with the `--sorted-by EventTime` flag, which informs DataFusion that the data is pre-sorted, allowing it to optimize away redundant sort operations. From 5995e8fb26eda8c5b82ad968216b6ba790e8ff0e Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Sun, 15 Mar 2026 17:19:52 -0400 Subject: [PATCH 2/2] remove redundant heuristic --- .../datasource-parquet/src/row_filter.rs | 34 +++---------------- 1 file changed, 4 insertions(+), 30 deletions(-) diff --git a/datafusion/datasource-parquet/src/row_filter.rs b/datafusion/datasource-parquet/src/row_filter.rs index 2924208c5bd99..5ff88e6c5c237 100644 --- a/datafusion/datasource-parquet/src/row_filter.rs +++ b/datafusion/datasource-parquet/src/row_filter.rs @@ -50,21 +50,15 @@ //! 2. Determine whether each predicate can be evaluated as an `ArrowPredicate`. //! 3. Determine, for each predicate, the total compressed size of all //! columns required to evaluate the predicate. -//! 4. Determine, for each predicate, whether all columns required to -//! evaluate the expression are sorted. -//! 5. Re-order the predicate by total size (from step 3). -//! 6. Partition the predicates according to whether they are sorted (from step 4) -//! 7. "Compile" each predicate `Expr` to a `DatafusionArrowPredicate`. -//! 8. Build the `RowFilter` with the sorted predicates followed by -//! the unsorted predicates. Within each partition, predicates are -//! still be sorted by size. +//! 4. Re-order predicates by total size (from step 3). +//! 5. "Compile" each predicate `Expr` to a `DatafusionArrowPredicate`. +//! 6. Build the `RowFilter` from the ordered predicates. //! //! List-aware predicates (for example, `array_has`, `array_has_all`, and //! `array_has_any`) can be evaluated directly during Parquet decoding. Struct //! columns and other nested projections that are not explicitly supported will //! continue to be evaluated after the batches are materialized. -use std::cmp::Ordering; use std::collections::BTreeSet; use std::sync::Arc; @@ -186,8 +180,6 @@ pub(crate) struct FilterCandidate { /// the filter and to order the filters when `reorder_predicates` is true. /// This is generated by summing the compressed size of all columns that the filter references. required_bytes: usize, - /// Can this filter use an index (e.g. a page index) to prune rows? - can_use_index: bool, /// Column indices into the parquet file schema required to evaluate this filter. projection: LeafProjection, /// The Arrow schema containing only the columns required by this filter, @@ -251,12 +243,9 @@ impl FilterCandidateBuilder { let projected_schema = Arc::new(self.file_schema.project(&root_indices)?); let required_bytes = size_of_columns(&leaf_indices, metadata)?; - let can_use_index = columns_sorted(&leaf_indices, metadata)?; - Ok(Some(FilterCandidate { expr: self.expr, required_bytes, - can_use_index, projection: LeafProjection { leaf_indices }, filter_schema: projected_schema, })) @@ -547,16 +536,6 @@ fn size_of_columns(columns: &[usize], metadata: &ParquetMetaData) -> Result Result { - // TODO How do we know this? - Ok(false) -} - /// Build a [`RowFilter`] from the given predicate expression if possible. /// /// # Arguments @@ -611,12 +590,7 @@ pub fn build_row_filter( } if reorder_predicates { - candidates.sort_unstable_by(|c1, c2| { - match c1.can_use_index.cmp(&c2.can_use_index) { - Ordering::Equal => c1.required_bytes.cmp(&c2.required_bytes), - ord => ord, - } - }); + candidates.sort_unstable_by_key(|c| c.required_bytes); } // To avoid double-counting metrics when multiple predicates are used: