From c81c7fa5c69a50a9a9d7246253230c0b4a9beb88 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 21 Mar 2026 07:30:19 -0400 Subject: [PATCH 1/3] docs: Document the TableProvider evaluation order for filter, limit and projection --- datafusion/catalog/src/table.rs | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/datafusion/catalog/src/table.rs b/datafusion/catalog/src/table.rs index f31d4d52ce88b..3b72af624ac6f 100644 --- a/datafusion/catalog/src/table.rs +++ b/datafusion/catalog/src/table.rs @@ -87,7 +87,7 @@ pub trait TableProvider: Debug + Sync + Send { /// Create an [`ExecutionPlan`] for scanning the table with optionally /// specified `projection`, `filter` and `limit`, described below. /// - /// The `ExecutionPlan` is responsible scanning the datasource's + /// The returned `ExecutionPlan` is responsible scanning the datasource's /// partitions in a streaming, parallelized fashion. /// /// # Projection @@ -163,6 +163,29 @@ pub trait TableProvider: Debug + Sync + Send { /// because inexact filters do not guarantee that every filtered row is /// removed, so applying the limit could lead to too few rows being available /// to return as a final result. + /// + /// ## Evaluation Order + /// + /// The logical evaluation is first `filters` then `limit` and then `projection`. + /// + /// Note that `limit` applies to the filtered result, not to the unfiltered + /// input, and `projection` affects only which columns are returned, not + /// which rows qualify. + /// + /// For example, if a scan receives: + /// + /// - `projection = [a]` + /// - `filters = [b > 5]` + /// - `limit = Some(3)` + /// + /// It logically must produce results like: + /// + /// ```text + /// PROJECTION a (LIMIT 3 (SCAN WHERE b > 5)) + /// ``` + /// + /// As mentioned above, columns referenced only by pushed-down filters may + /// be absent from `projection`. async fn scan( &self, state: &dyn Session, From 61a8c2d6ea101f187bba2d19eb06b3fbebe052e7 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 21 Mar 2026 07:38:47 -0400 Subject: [PATCH 2/3] tighten language --- datafusion/catalog/src/table.rs | 67 ++++++++++++++++----------------- 1 file changed, 33 insertions(+), 34 deletions(-) diff --git a/datafusion/catalog/src/table.rs b/datafusion/catalog/src/table.rs index 3b72af624ac6f..423f5a0afb464 100644 --- a/datafusion/catalog/src/table.rs +++ b/datafusion/catalog/src/table.rs @@ -84,10 +84,10 @@ pub trait TableProvider: Debug + Sync + Send { None } - /// Create an [`ExecutionPlan`] for scanning the table with optionally - /// specified `projection`, `filter` and `limit`, described below. + /// Create an [`ExecutionPlan`] for scanning the table with optional + /// `projection`, `filter`, and `limit`, described below. /// - /// The returned `ExecutionPlan` is responsible scanning the datasource's + /// The returned `ExecutionPlan` is responsible for scanning the datasource's /// partitions in a streaming, parallelized fashion. /// /// # Projection @@ -96,33 +96,30 @@ pub trait TableProvider: Debug + Sync + Send { /// specified. The projection is a set of indexes of the fields in /// [`Self::schema`]. /// - /// DataFusion provides the projection to scan only the columns actually - /// used in the query to improve performance, an optimization called - /// "Projection Pushdown". Some datasources, such as Parquet, can use this - /// information to go significantly faster when only a subset of columns is - /// required. + /// DataFusion provides the projection so the scan reads only the columns + /// actually used in the query, an optimization called "Projection + /// Pushdown". Some datasources, such as Parquet, can use this information + /// to go significantly faster when only a subset of columns is required. /// /// # Filters /// /// A list of boolean filter [`Expr`]s to evaluate *during* the scan, in the /// manner specified by [`Self::supports_filters_pushdown`]. Only rows for - /// which *all* of the `Expr`s evaluate to `true` must be returned (aka the - /// expressions are `AND`ed together). + /// which *all* of the `Expr`s evaluate to `true` must be returned (that is, + /// the expressions are `AND`ed together). /// - /// To enable filter pushdown you must override - /// [`Self::supports_filters_pushdown`] as the default implementation does - /// not and `filters` will be empty. + /// To enable filter pushdown, override + /// [`Self::supports_filters_pushdown`]. The default implementation does not + /// push down filters, and `filters` will be empty. /// - /// DataFusion pushes filtering into the scans whenever possible - /// ("Filter Pushdown"), and depending on the format and the - /// implementation of the format, evaluating the predicate during the scan - /// can increase performance significantly. + /// DataFusion pushes filters into scans whenever possible ("Filter + /// Pushdown"). Depending on the data format and implementation, evaluating + /// predicates during the scan can significantly improve performance. /// /// ## Note: Some columns may appear *only* in Filters /// - /// In certain cases, a query may only use a certain column in a Filter that - /// has been completely pushed down to the scan. In this case, the - /// projection will not contain all the columns found in the filter + /// In some cases, a query may use a column only in a filter and the + /// projection will not contain all columns referenced by the filter /// expressions. /// /// For example, given the query `SELECT t.a FROM t WHERE t.b > 5`, @@ -154,19 +151,21 @@ pub trait TableProvider: Debug + Sync + Send { /// /// # Limit /// - /// If `limit` is specified, must only produce *at least* this many rows, - /// (though it may return more). Like Projection Pushdown and Filter - /// Pushdown, DataFusion pushes `LIMIT`s as far down in the plan as - /// possible, called "Limit Pushdown" as some sources can use this - /// information to improve their performance. Note that if there are any - /// Inexact filters pushed down, the LIMIT cannot be pushed down. This is - /// because inexact filters do not guarantee that every filtered row is - /// removed, so applying the limit could lead to too few rows being available - /// to return as a final result. + /// If `limit` is specified, the scan must produce *at least* this many + /// rows, though it may return more. Like Projection Pushdown and Filter + /// Pushdown, DataFusion pushes `LIMIT`s as far down in the plan as + /// possible. This is called "Limit Pushdown", and some sources can use the + /// information to improve performance. /// - /// ## Evaluation Order + /// Note: If any pushed-down filters are `Inexact`, the `LIMIT` cannot be + /// pushed down. Inexact filters do not guarantee that every filtered row is + /// removed, so applying the limit could leave too few rows to return in the + /// final result. /// - /// The logical evaluation is first `filters` then `limit` and then `projection`. + /// # Evaluation Order + /// + /// The logical evaluation order is `filters`, then `limit`, then + /// `projection`. /// /// Note that `limit` applies to the filtered result, not to the unfiltered /// input, and `projection` affects only which columns are returned, not @@ -178,14 +177,14 @@ pub trait TableProvider: Debug + Sync + Send { /// - `filters = [b > 5]` /// - `limit = Some(3)` /// - /// It logically must produce results like: + /// It must logically produce results like: /// /// ```text /// PROJECTION a (LIMIT 3 (SCAN WHERE b > 5)) /// ``` /// - /// As mentioned above, columns referenced only by pushed-down filters may - /// be absent from `projection`. + /// As noted above, columns referenced only by pushed-down filters may be + /// absent from `projection`. async fn scan( &self, state: &dyn Session, From 7662a4c981e64733a0f06109dbc9b511df50235e Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 21 Mar 2026 08:02:46 -0400 Subject: [PATCH 3/3] wording tweak --- datafusion/catalog/src/table.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/catalog/src/table.rs b/datafusion/catalog/src/table.rs index 423f5a0afb464..4249f8ebd5368 100644 --- a/datafusion/catalog/src/table.rs +++ b/datafusion/catalog/src/table.rs @@ -177,7 +177,7 @@ pub trait TableProvider: Debug + Sync + Send { /// - `filters = [b > 5]` /// - `limit = Some(3)` /// - /// It must logically produce results like: + /// It must logically produce results equivalent to: /// /// ```text /// PROJECTION a (LIMIT 3 (SCAN WHERE b > 5))