From 3068da7f1b9a47b9f9b61c904a29d93db0f76d39 Mon Sep 17 00:00:00 2001 From: Marko Grujic Date: Mon, 9 Mar 2026 12:45:37 +0100 Subject: [PATCH 1/4] fix: inherit field names from left projection in set expressions when missing Otherwise they might end up having the same column names, which is prohibited. This is only the case for literals, other expression types should have default unique names. --- datafusion/sql/src/set_expr.rs | 101 ++++++++++++++++++- datafusion/sql/tests/sql_integration.rs | 54 ++++++++++ datafusion/sqllogictest/test_files/union.slt | 24 +++++ 3 files changed, 176 insertions(+), 3 deletions(-) diff --git a/datafusion/sql/src/set_expr.rs b/datafusion/sql/src/set_expr.rs index d4e771cb48585..6e9b2ea5d5340 100644 --- a/datafusion/sql/src/set_expr.rs +++ b/datafusion/sql/src/set_expr.rs @@ -17,10 +17,12 @@ use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; use datafusion_common::{ - DataFusionError, Diagnostic, Result, Span, not_impl_err, plan_err, + DFSchemaRef, DataFusionError, Diagnostic, Result, Span, not_impl_err, plan_err, }; use datafusion_expr::{LogicalPlan, LogicalPlanBuilder}; -use sqlparser::ast::{SetExpr, SetOperator, SetQuantifier, Spanned}; +use sqlparser::ast::{ + Expr as SQLExpr, Ident, SelectItem, SetExpr, SetOperator, SetQuantifier, Spanned, +}; impl SqlToRel<'_, S> { #[cfg_attr(feature = "recursive_protection", recursive::recursive)] @@ -42,7 +44,28 @@ impl SqlToRel<'_, S> { let left_span = Span::try_from_sqlparser_span(left.span()); let right_span = Span::try_from_sqlparser_span(right.span()); let left_plan = self.set_expr_to_plan(*left, planner_context); - let right_plan = self.set_expr_to_plan(*right, planner_context); + + // For non-*ByName operations, add missing aliases to right side using left schema's + // column names. This allows queries like + // `SELECT 1 c1, 0 c2, 0 c3 UNION ALL SELECT 2, 0, 0` + // where the right side has duplicate literal values. + // We only do this if the left side succeeded. + let right = if let Ok(plan) = &left_plan + && plan.schema().fields().len() > 1 + && matches!( + set_quantifier, + SetQuantifier::All + | SetQuantifier::Distinct + | SetQuantifier::None + ) { + alias_set_expr(*right, plan.schema()) + } else { + *right + }; + + let right_plan = self.set_expr_to_plan(right, planner_context); + + // Handle errors from both sides, collecting them if both failed let (left_plan, right_plan) = match (left_plan, right_plan) { (Ok(left_plan), Ok(right_plan)) => (left_plan, right_plan), (Err(left_err), Err(right_err)) => { @@ -160,3 +183,75 @@ impl SqlToRel<'_, S> { } } } + +// Adds aliases to SELECT items in a SetExpr using the provided schema. +// This ensures that unnamed expressions on the right side of a UNION/INTERSECT/EXCEPT +// get aliased with the column names from the left side, allowing queries like +// `SELECT 1 AS a, 0 AS b, 0 AS c UNION ALL SELECT 2, 0, 0` to work correctly. +fn alias_set_expr(set_expr: SetExpr, schema: &DFSchemaRef) -> SetExpr { + match set_expr { + SetExpr::Select(mut select) => { + alias_select_items(&mut select.projection, schema); + SetExpr::Select(select) + } + SetExpr::SetOperation { + op, + left, + right, + set_quantifier, + } => { + // For nested set operations, only alias the leftmost branch + // since that's what determines the output column names + SetExpr::SetOperation { + op, + left: Box::new(alias_set_expr(*left, schema)), + right, + set_quantifier, + } + } + SetExpr::Query(mut query) => { + // Handle parenthesized queries like (SELECT ... UNION ALL SELECT ...) + query.body = Box::new(alias_set_expr(*query.body, schema)); + SetExpr::Query(query) + } + // For other cases (Values, etc.), return as-is + other => other, + } +} + +// Adds aliases to literal value expressions where missing, based on the input schema, as these are +// the ones that can cause duplicate name issues (e.g. `SELECT 0, 0` has two columns named `Int64(0)`). +// Other expressions typically have unique names. +fn alias_select_items(items: &mut [SelectItem], schema: &DFSchemaRef) { + let mut col_idx = 0; + for item in items.iter_mut() { + match item { + SelectItem::UnnamedExpr(expr) if is_literal_value(expr) => { + if let Some(field) = schema.fields().get(col_idx) { + *item = SelectItem::ExprWithAlias { + expr: expr.clone(), + alias: Ident::new(field.name()), + }; + } + col_idx += 1; + } + SelectItem::UnnamedExpr(_) | SelectItem::ExprWithAlias { .. } => { + col_idx += 1; + } + SelectItem::Wildcard(_) | SelectItem::QualifiedWildcard(_, _) => { + // Wildcards expand to multiple columns - skip position tracking + } + } + } +} + +/// Returns true if the expression is a literal value that could cause duplicate names. +fn is_literal_value(expr: &SQLExpr) -> bool { + matches!( + expr, + SQLExpr::Value(_) + | SQLExpr::UnaryOp { .. } + | SQLExpr::TypedString { .. } + | SQLExpr::Interval(_) + ) +} diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs index 29c17be69ce5f..e864fae9a9144 100644 --- a/datafusion/sql/tests/sql_integration.rs +++ b/datafusion/sql/tests/sql_integration.rs @@ -2655,6 +2655,60 @@ fn union_all_by_name_same_column_names() { ); } +#[test] +fn union_all_with_duplicate_literals() { + let sql = "SELECT 0 a, 0 b UNION ALL SELECT 1, 1"; + let plan = logical_plan(sql).unwrap(); + assert_snapshot!( + plan, + @r" + Union + Projection: Int64(0) AS a, Int64(0) AS b + EmptyRelation: rows=1 + Projection: Int64(1) AS a, Int64(1) AS b + EmptyRelation: rows=1 + " + ); +} + +#[test] +fn intersect_with_duplicate_literals() { + let sql = "SELECT 1 as a, 0 as b, 0 as c INTERSECT SELECT 1, 0, 0"; + let plan = logical_plan(sql).unwrap(); + assert_snapshot!( + plan, + @r" + LeftSemi Join: left.a = right.a, left.b = right.b, left.c = right.c + Distinct: + SubqueryAlias: left + Projection: Int64(1) AS a, Int64(0) AS b, Int64(0) AS c + EmptyRelation: rows=1 + SubqueryAlias: right + Projection: Int64(1) AS a, Int64(0) AS b, Int64(0) AS c + EmptyRelation: rows=1 + " + ); +} + +#[test] +fn except_with_duplicate_literals() { + let sql = "SELECT 1 as a, 0 as b, 0 as c EXCEPT SELECT 2, 0, 0"; + let plan = logical_plan(sql).unwrap(); + assert_snapshot!( + plan, + @r" + LeftAnti Join: left.a = right.a, left.b = right.b, left.c = right.c + Distinct: + SubqueryAlias: left + Projection: Int64(1) AS a, Int64(0) AS b, Int64(0) AS c + EmptyRelation: rows=1 + SubqueryAlias: right + Projection: Int64(2) AS a, Int64(0) AS b, Int64(0) AS c + EmptyRelation: rows=1 + " + ); +} + #[test] fn empty_over() { let sql = "SELECT order_id, MAX(order_id) OVER () from orders"; diff --git a/datafusion/sqllogictest/test_files/union.slt b/datafusion/sqllogictest/test_files/union.slt index d858d0ae3ea4e..d0ad5c8bb3c58 100644 --- a/datafusion/sqllogictest/test_files/union.slt +++ b/datafusion/sqllogictest/test_files/union.slt @@ -256,6 +256,30 @@ Bob_new John John_new +# Test UNION ALL with unaliased duplicate literal values on the right side. +# The second projection will inherit field names from the first one, and so +# pass the unique projection expression name check. +query TII rowsort +SELECT name, 1 as table, 1 as row FROM t1 WHERE id = 1 +UNION ALL +SELECT name, 2, 2 FROM t2 WHERE id = 2 +---- +Alex 1 1 +Bob 2 2 + +# Test nested UNION, EXCEPT, INTERSECT with duplicate unaliased literals. +# Only the first SELECT has column aliases, which should propagate to all projections. +query III rowsort +SELECT 1 as a, 0 as b, 0 as c +UNION ALL +((SELECT 2, 0, 0 UNION ALL SELECT 3, 0, 0) EXCEPT SELECT 3, 0, 0) +UNION ALL +(SELECT 4, 0, 0 INTERSECT SELECT 4, 0, 0) +---- +1 0 0 +2 0 0 +4 0 0 + # Plan is unnested query TT EXPLAIN SELECT name FROM t1 UNION ALL (SELECT name from t2 UNION ALL SELECT name || '_new' from t2) From f4464baac5107279a5aa833cceff5a9bc91a5d64 Mon Sep 17 00:00:00 2001 From: Marko Grujic Date: Thu, 12 Mar 2026 09:25:01 +0100 Subject: [PATCH 2/4] ref: alter set expressions directly instead of creating new ones --- datafusion/sql/src/set_expr.rs | 50 ++++++++++------------------------ 1 file changed, 15 insertions(+), 35 deletions(-) diff --git a/datafusion/sql/src/set_expr.rs b/datafusion/sql/src/set_expr.rs index 6e9b2ea5d5340..bb9875182b264 100644 --- a/datafusion/sql/src/set_expr.rs +++ b/datafusion/sql/src/set_expr.rs @@ -38,7 +38,7 @@ impl SqlToRel<'_, S> { SetExpr::SetOperation { op, left, - right, + mut right, set_quantifier, } => { let left_span = Span::try_from_sqlparser_span(left.span()); @@ -47,23 +47,22 @@ impl SqlToRel<'_, S> { // For non-*ByName operations, add missing aliases to right side using left schema's // column names. This allows queries like - // `SELECT 1 c1, 0 c2, 0 c3 UNION ALL SELECT 2, 0, 0` + // `SELECT 1 a, 1 b UNION ALL SELECT 2, 2` // where the right side has duplicate literal values. // We only do this if the left side succeeded. - let right = if let Ok(plan) = &left_plan + if let Ok(plan) = &left_plan && plan.schema().fields().len() > 1 && matches!( set_quantifier, SetQuantifier::All | SetQuantifier::Distinct | SetQuantifier::None - ) { - alias_set_expr(*right, plan.schema()) - } else { - *right - }; + ) + { + alias_set_expr(&mut right, plan.schema()) + } - let right_plan = self.set_expr_to_plan(right, planner_context); + let right_plan = self.set_expr_to_plan(*right, planner_context); // Handle errors from both sides, collecting them if both failed let (left_plan, right_plan) = match (left_plan, right_plan) { @@ -188,34 +187,15 @@ impl SqlToRel<'_, S> { // This ensures that unnamed expressions on the right side of a UNION/INTERSECT/EXCEPT // get aliased with the column names from the left side, allowing queries like // `SELECT 1 AS a, 0 AS b, 0 AS c UNION ALL SELECT 2, 0, 0` to work correctly. -fn alias_set_expr(set_expr: SetExpr, schema: &DFSchemaRef) -> SetExpr { +fn alias_set_expr(set_expr: &mut SetExpr, schema: &DFSchemaRef) { match set_expr { - SetExpr::Select(mut select) => { - alias_select_items(&mut select.projection, schema); - SetExpr::Select(select) - } - SetExpr::SetOperation { - op, - left, - right, - set_quantifier, - } => { - // For nested set operations, only alias the leftmost branch - // since that's what determines the output column names - SetExpr::SetOperation { - op, - left: Box::new(alias_set_expr(*left, schema)), - right, - set_quantifier, - } - } - SetExpr::Query(mut query) => { - // Handle parenthesized queries like (SELECT ... UNION ALL SELECT ...) - query.body = Box::new(alias_set_expr(*query.body, schema)); - SetExpr::Query(query) - } + SetExpr::Select(select) => alias_select_items(&mut select.projection, schema), + // For nested set operations, only alias the leftmost branch + SetExpr::SetOperation { left, .. } => alias_set_expr(left, schema), + // Handle parenthesized queries like (SELECT ... UNION ALL SELECT ...) + SetExpr::Query(query) => alias_set_expr(&mut query.body, schema), // For other cases (Values, etc.), return as-is - other => other, + _other => (), } } From 9887464e04c2783005e0645b8f4c589ec4d0d1cb Mon Sep 17 00:00:00 2001 From: Marko Grujic Date: Mon, 16 Mar 2026 09:22:25 +0100 Subject: [PATCH 3/4] fix: handle case of non-literal expressions with same name --- datafusion/sql/src/set_expr.rs | 27 ++++------ datafusion/sql/tests/sql_integration.rs | 67 +++++++++++++++++-------- 2 files changed, 57 insertions(+), 37 deletions(-) diff --git a/datafusion/sql/src/set_expr.rs b/datafusion/sql/src/set_expr.rs index bb9875182b264..62e1c841b6805 100644 --- a/datafusion/sql/src/set_expr.rs +++ b/datafusion/sql/src/set_expr.rs @@ -199,15 +199,19 @@ fn alias_set_expr(set_expr: &mut SetExpr, schema: &DFSchemaRef) { } } -// Adds aliases to literal value expressions where missing, based on the input schema, as these are -// the ones that can cause duplicate name issues (e.g. `SELECT 0, 0` has two columns named `Int64(0)`). -// Other expressions typically have unique names. +// Aliases unnamed expressions in the provided select items using the provided schema. +// This helps with set expression queries where the right side has duplicate expressions, +// but the left side has unique column names, which control the output schema anyway. fn alias_select_items(items: &mut [SelectItem], schema: &DFSchemaRef) { let mut col_idx = 0; for item in items.iter_mut() { match item { - SelectItem::UnnamedExpr(expr) if is_literal_value(expr) => { - if let Some(field) = schema.fields().get(col_idx) { + SelectItem::UnnamedExpr(expr) => { + if !matches!( + expr, + SQLExpr::Identifier(_) | SQLExpr::CompoundIdentifier(_) + ) && let Some(field) = schema.fields().get(col_idx) + { *item = SelectItem::ExprWithAlias { expr: expr.clone(), alias: Ident::new(field.name()), @@ -215,7 +219,7 @@ fn alias_select_items(items: &mut [SelectItem], schema: &DFSchemaRef) { } col_idx += 1; } - SelectItem::UnnamedExpr(_) | SelectItem::ExprWithAlias { .. } => { + SelectItem::ExprWithAlias { .. } => { col_idx += 1; } SelectItem::Wildcard(_) | SelectItem::QualifiedWildcard(_, _) => { @@ -224,14 +228,3 @@ fn alias_select_items(items: &mut [SelectItem], schema: &DFSchemaRef) { } } } - -/// Returns true if the expression is a literal value that could cause duplicate names. -fn is_literal_value(expr: &SQLExpr) -> bool { - matches!( - expr, - SQLExpr::Value(_) - | SQLExpr::UnaryOp { .. } - | SQLExpr::TypedString { .. } - | SQLExpr::Interval(_) - ) -} diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs index e864fae9a9144..422a415e7722b 100644 --- a/datafusion/sql/tests/sql_integration.rs +++ b/datafusion/sql/tests/sql_integration.rs @@ -2656,55 +2656,82 @@ fn union_all_by_name_same_column_names() { } #[test] -fn union_all_with_duplicate_literals() { - let sql = "SELECT 0 a, 0 b UNION ALL SELECT 1, 1"; +fn union_all_with_duplicate_expressions() { + let sql = "\ + SELECT 0 a, 0 b \ + UNION ALL SELECT 1, 1 \ + UNION ALL SELECT count(*), count(*) FROM orders"; let plan = logical_plan(sql).unwrap(); assert_snapshot!( plan, @r" Union - Projection: Int64(0) AS a, Int64(0) AS b - EmptyRelation: rows=1 - Projection: Int64(1) AS a, Int64(1) AS b - EmptyRelation: rows=1 + Union + Projection: Int64(0) AS a, Int64(0) AS b + EmptyRelation: rows=1 + Projection: Int64(1) AS a, Int64(1) AS b + EmptyRelation: rows=1 + Projection: count(*) AS a, count(*) AS b + Aggregate: groupBy=[[]], aggr=[[count(*)]] + TableScan: orders " ); } #[test] -fn intersect_with_duplicate_literals() { - let sql = "SELECT 1 as a, 0 as b, 0 as c INTERSECT SELECT 1, 0, 0"; +fn intersect_with_duplicate_expressions() { + let sql = "\ + SELECT 0 a, 0 b \ + INTERSECT SELECT 1, 1 \ + INTERSECT SELECT count(*), count(*) FROM orders"; let plan = logical_plan(sql).unwrap(); assert_snapshot!( plan, @r" - LeftSemi Join: left.a = right.a, left.b = right.b, left.c = right.c + LeftSemi Join: left.a = right.a, left.b = right.b Distinct: SubqueryAlias: left - Projection: Int64(1) AS a, Int64(0) AS b, Int64(0) AS c - EmptyRelation: rows=1 + LeftSemi Join: left.a = right.a, left.b = right.b + Distinct: + SubqueryAlias: left + Projection: Int64(0) AS a, Int64(0) AS b + EmptyRelation: rows=1 + SubqueryAlias: right + Projection: Int64(1) AS a, Int64(1) AS b + EmptyRelation: rows=1 SubqueryAlias: right - Projection: Int64(1) AS a, Int64(0) AS b, Int64(0) AS c - EmptyRelation: rows=1 + Projection: count(*) AS a, count(*) AS b + Aggregate: groupBy=[[]], aggr=[[count(*)]] + TableScan: orders " ); } #[test] -fn except_with_duplicate_literals() { - let sql = "SELECT 1 as a, 0 as b, 0 as c EXCEPT SELECT 2, 0, 0"; +fn except_with_duplicate_expressions() { + let sql = "\ + SELECT 0 a, 0 b \ + EXCEPT SELECT 1, 1 \ + EXCEPT SELECT count(*), count(*) FROM orders"; let plan = logical_plan(sql).unwrap(); assert_snapshot!( plan, @r" - LeftAnti Join: left.a = right.a, left.b = right.b, left.c = right.c + LeftAnti Join: left.a = right.a, left.b = right.b Distinct: SubqueryAlias: left - Projection: Int64(1) AS a, Int64(0) AS b, Int64(0) AS c - EmptyRelation: rows=1 + LeftAnti Join: left.a = right.a, left.b = right.b + Distinct: + SubqueryAlias: left + Projection: Int64(0) AS a, Int64(0) AS b + EmptyRelation: rows=1 + SubqueryAlias: right + Projection: Int64(1) AS a, Int64(1) AS b + EmptyRelation: rows=1 SubqueryAlias: right - Projection: Int64(2) AS a, Int64(0) AS b, Int64(0) AS c - EmptyRelation: rows=1 + Projection: count(*) AS a, count(*) AS b + Aggregate: groupBy=[[]], aggr=[[count(*)]] + TableScan: orders " ); } From 67e3df7c12b36d28164fd873c0120a2626490141 Mon Sep 17 00:00:00 2001 From: Marko Grujic Date: Thu, 19 Mar 2026 11:29:22 +0100 Subject: [PATCH 4/4] handle single wildcard as well --- datafusion/sql/src/set_expr.rs | 22 ++++++++++++++++++++-- datafusion/sql/tests/sql_integration.rs | 19 +++++++++++++++++++ 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/datafusion/sql/src/set_expr.rs b/datafusion/sql/src/set_expr.rs index 62e1c841b6805..186181eb92544 100644 --- a/datafusion/sql/src/set_expr.rs +++ b/datafusion/sql/src/set_expr.rs @@ -203,6 +203,21 @@ fn alias_set_expr(set_expr: &mut SetExpr, schema: &DFSchemaRef) { // This helps with set expression queries where the right side has duplicate expressions, // but the left side has unique column names, which control the output schema anyway. fn alias_select_items(items: &mut [SelectItem], schema: &DFSchemaRef) { + // Figure out how many (qualified) wildcards we got. We only handle + // the case of a single unqualified wildcard; for multiple or qualified + // wildcards we can't reliably determine column counts, so bail out. + let (wildcard_count, qualified_wildcard_count) = + items.iter().fold((0, 0), |(wc, qwc), item| match item { + SelectItem::Wildcard(_) => (wc + 1, qwc), + SelectItem::QualifiedWildcard(_, _) => (wc, qwc + 1), + _ => (wc, qwc), + }); + if qualified_wildcard_count > 0 || wildcard_count > 1 { + return; + } + + let wildcard_expansion = schema.fields().len().saturating_sub(items.len() - 1); + let mut col_idx = 0; for item in items.iter_mut() { match item { @@ -222,8 +237,11 @@ fn alias_select_items(items: &mut [SelectItem], schema: &DFSchemaRef) { SelectItem::ExprWithAlias { .. } => { col_idx += 1; } - SelectItem::Wildcard(_) | SelectItem::QualifiedWildcard(_, _) => { - // Wildcards expand to multiple columns - skip position tracking + SelectItem::Wildcard(_) => { + col_idx += wildcard_expansion; + } + SelectItem::QualifiedWildcard(_, _) => { + unreachable!("qualified wildcards are handled above") } } } diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs index 422a415e7722b..b1f4929eaa222 100644 --- a/datafusion/sql/tests/sql_integration.rs +++ b/datafusion/sql/tests/sql_integration.rs @@ -2678,6 +2678,25 @@ fn union_all_with_duplicate_expressions() { ); } +#[test] +fn union_with_qualified_and_duplicate_expressions() { + let sql = "\ + SELECT 0 a, id b, price c, 0 d FROM test_decimal \ + UNION SELECT 1, *, 1 FROM test_decimal"; + let plan = logical_plan(sql).unwrap(); + assert_snapshot!( + plan, + @" + Distinct: + Union + Projection: Int64(0) AS a, test_decimal.id AS b, test_decimal.price AS c, Int64(0) AS d + TableScan: test_decimal + Projection: Int64(1) AS a, test_decimal.id, test_decimal.price, Int64(1) AS d + TableScan: test_decimal + " + ); +} + #[test] fn intersect_with_duplicate_expressions() { let sql = "\