From edbf1b30a3b23c5e68b830ac32d16738922bd0a0 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 14 Jan 2026 16:20:39 -0700 Subject: [PATCH 1/6] feat: add support for make_date expression Adds native Comet support for Spark's make_date function which creates a date from year, month, and day components. Closes #3091 Co-Authored-By: Claude Opus 4.5 --- .../apache/comet/serde/QueryPlanSerde.scala | 1 + .../org/apache/comet/serde/datetime.scala | 4 +- .../comet/CometTemporalExpressionSuite.scala | 40 +++++++++++++++++++ 3 files changed, 44 insertions(+), 1 deletion(-) diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala index e50b1d80e6..37bf06b4a5 100644 --- a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala @@ -188,6 +188,7 @@ object QueryPlanSerde extends Logging with CometExprShim { classOf[DateSub] -> CometDateSub, classOf[FromUnixTime] -> CometFromUnixTime, classOf[Hour] -> CometHour, + classOf[MakeDate] -> CometMakeDate, classOf[Minute] -> CometMinute, classOf[Second] -> CometSecond, classOf[TruncDate] -> CometTruncDate, diff --git a/spark/src/main/scala/org/apache/comet/serde/datetime.scala b/spark/src/main/scala/org/apache/comet/serde/datetime.scala index ef2b0f793c..c1af0ba789 100644 --- a/spark/src/main/scala/org/apache/comet/serde/datetime.scala +++ b/spark/src/main/scala/org/apache/comet/serde/datetime.scala @@ -21,7 +21,7 @@ package org.apache.comet.serde import java.util.Locale -import org.apache.spark.sql.catalyst.expressions.{Attribute, DateAdd, DateSub, DayOfMonth, DayOfWeek, DayOfYear, GetDateField, Hour, Literal, Minute, Month, Quarter, Second, TruncDate, TruncTimestamp, WeekDay, WeekOfYear, Year} +import org.apache.spark.sql.catalyst.expressions.{Attribute, DateAdd, DateSub, DayOfMonth, DayOfWeek, DayOfYear, GetDateField, Hour, Literal, MakeDate, Minute, Month, Quarter, Second, TruncDate, TruncTimestamp, WeekDay, WeekOfYear, Year} import org.apache.spark.sql.types.{DateType, IntegerType} import org.apache.spark.unsafe.types.UTF8String @@ -258,6 +258,8 @@ object CometDateAdd extends CometScalarFunction[DateAdd]("date_add") object CometDateSub extends CometScalarFunction[DateSub]("date_sub") +object CometMakeDate extends CometScalarFunction[MakeDate]("make_date") + object CometTruncDate extends CometExpressionSerde[TruncDate] { val supportedFormats: Seq[String] = diff --git a/spark/src/test/scala/org/apache/comet/CometTemporalExpressionSuite.scala b/spark/src/test/scala/org/apache/comet/CometTemporalExpressionSuite.scala index 9a23c76d82..2fc2c862d2 100644 --- a/spark/src/test/scala/org/apache/comet/CometTemporalExpressionSuite.scala +++ b/spark/src/test/scala/org/apache/comet/CometTemporalExpressionSuite.scala @@ -122,4 +122,44 @@ class CometTemporalExpressionSuite extends CometTestBase with AdaptiveSparkPlanH StructField("fmt", DataTypes.StringType, true))) FuzzDataGenerator.generateDataFrame(r, spark, schema, 1000, DataGenOptions()) } + + test("make_date") { + import org.apache.spark.sql.Row + + // Create test data with year, month, day columns (no nulls to avoid edge cases) + val r = new Random(42) + val testData = (1 to 1000).map { _ => + // Generate reasonable date components + val year = 1900 + r.nextInt(200) // 1900-2099 + val month = 1 + r.nextInt(12) // 1-12 + val day = 1 + r.nextInt(28) // 1-28 to avoid invalid dates + Row(year, month, day) + } + val schema = StructType( + Seq( + StructField("year", DataTypes.IntegerType, false), + StructField("month", DataTypes.IntegerType, false), + StructField("day", DataTypes.IntegerType, false))) + val df = spark.createDataFrame(spark.sparkContext.parallelize(testData), schema) + df.createOrReplaceTempView("tbl") + + // Test with column values + checkSparkAnswerAndOperator( + "SELECT year, month, day, make_date(year, month, day) FROM tbl ORDER BY year, month, day") + + // Disable constant folding to ensure literal expressions are executed by Comet + withSQLConf( + SQLConf.OPTIMIZER_EXCLUDED_RULES.key -> + "org.apache.spark.sql.catalyst.optimizer.ConstantFolding") { + // Test with literal values + checkSparkAnswerAndOperator("SELECT make_date(2023, 12, 25)") + checkSparkAnswerAndOperator("SELECT make_date(1970, 1, 1)") + checkSparkAnswerAndOperator("SELECT make_date(2000, 2, 29)") // Leap year + + // Test null handling + checkSparkAnswerAndOperator("SELECT make_date(NULL, 1, 1)") + checkSparkAnswerAndOperator("SELECT make_date(2023, NULL, 1)") + checkSparkAnswerAndOperator("SELECT make_date(2023, 1, NULL)") + } + } } From ece136435a54d4cef9fa5fa4a44797a9b0a487d7 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 14 Jan 2026 17:34:18 -0700 Subject: [PATCH 2/6] update docs --- docs/source/user-guide/latest/configs.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/user-guide/latest/configs.md b/docs/source/user-guide/latest/configs.md index 1a273ad033..d19c7d18b2 100644 --- a/docs/source/user-guide/latest/configs.md +++ b/docs/source/user-guide/latest/configs.md @@ -275,6 +275,7 @@ These settings can be used to determine which parts of the plan are accelerated | `spark.comet.expression.Log10.enabled` | Enable Comet acceleration for `Log10` | true | | `spark.comet.expression.Log2.enabled` | Enable Comet acceleration for `Log2` | true | | `spark.comet.expression.Lower.enabled` | Enable Comet acceleration for `Lower` | true | +| `spark.comet.expression.MakeDate.enabled` | Enable Comet acceleration for `MakeDate` | true | | `spark.comet.expression.MakeDecimal.enabled` | Enable Comet acceleration for `MakeDecimal` | true | | `spark.comet.expression.MapEntries.enabled` | Enable Comet acceleration for `MapEntries` | true | | `spark.comet.expression.MapFromArrays.enabled` | Enable Comet acceleration for `MapFromArrays` | true | From b845916b852fe5eecb458ae6cdf85a694a75d72d Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 21 Jan 2026 15:17:51 -0700 Subject: [PATCH 3/6] fix --- .../comet/CometTemporalExpressionSuite.scala | 64 ++++++++++++++++++- 1 file changed, 62 insertions(+), 2 deletions(-) diff --git a/spark/src/test/scala/org/apache/comet/CometTemporalExpressionSuite.scala b/spark/src/test/scala/org/apache/comet/CometTemporalExpressionSuite.scala index 5c8fd7f350..9bf43430a5 100644 --- a/spark/src/test/scala/org/apache/comet/CometTemporalExpressionSuite.scala +++ b/spark/src/test/scala/org/apache/comet/CometTemporalExpressionSuite.scala @@ -206,6 +206,19 @@ class CometTemporalExpressionSuite extends CometTestBase with AdaptiveSparkPlanH checkSparkAnswerAndOperator( "SELECT year, month, day, make_date(year, month, day) FROM tbl ORDER BY year, month, day") + // Fuzzing with nullable integer columns + val r2 = new Random(42) + val nullableSchema = StructType( + Seq( + StructField("year", DataTypes.IntegerType, true), + StructField("month", DataTypes.IntegerType, true), + StructField("day", DataTypes.IntegerType, true))) + val nullableDf = + FuzzDataGenerator.generateDataFrame(r2, spark, nullableSchema, 1000, DataGenOptions()) + nullableDf.createOrReplaceTempView("nullable_tbl") + checkSparkAnswerAndOperator( + "SELECT year, month, day, make_date(year, month, day) FROM nullable_tbl ORDER BY year, month, day") + // Disable constant folding to ensure literal expressions are executed by Comet withSQLConf( SQLConf.OPTIMIZER_EXCLUDED_RULES.key -> @@ -213,12 +226,60 @@ class CometTemporalExpressionSuite extends CometTestBase with AdaptiveSparkPlanH // Test with literal values checkSparkAnswerAndOperator("SELECT make_date(2023, 12, 25)") checkSparkAnswerAndOperator("SELECT make_date(1970, 1, 1)") - checkSparkAnswerAndOperator("SELECT make_date(2000, 2, 29)") // Leap year // Test null handling checkSparkAnswerAndOperator("SELECT make_date(NULL, 1, 1)") checkSparkAnswerAndOperator("SELECT make_date(2023, NULL, 1)") checkSparkAnswerAndOperator("SELECT make_date(2023, 1, NULL)") + + // Leap year edge cases + // 2000 WAS a leap year (divisible by 400) + checkSparkAnswerAndOperator("SELECT make_date(2000, 2, 29)") + // 2004 was a leap year (divisible by 4, not by 100) + checkSparkAnswerAndOperator("SELECT make_date(2004, 2, 29)") + // 2023 is NOT a leap year - Feb 29 should return NULL + checkSparkAnswerAndOperator("SELECT make_date(2023, 2, 29)") + // 1900 was NOT a leap year (divisible by 100 but not 400) - Feb 29 should return NULL + checkSparkAnswerAndOperator("SELECT make_date(1900, 2, 29)") + // 2100 will NOT be a leap year (divisible by 100 but not 400) + checkSparkAnswerAndOperator("SELECT make_date(2100, 2, 29)") + + // Invalid date handling - should return NULL + checkSparkAnswerAndOperator("SELECT make_date(2023, 2, 30)") // Feb 30 never exists + checkSparkAnswerAndOperator("SELECT make_date(2023, 2, 31)") // Feb 31 never exists + checkSparkAnswerAndOperator("SELECT make_date(2023, 4, 31)") // April has 30 days + checkSparkAnswerAndOperator("SELECT make_date(2023, 6, 31)") // June has 30 days + checkSparkAnswerAndOperator("SELECT make_date(2023, 9, 31)") // September has 30 days + checkSparkAnswerAndOperator("SELECT make_date(2023, 11, 31)") // November has 30 days + + // Boundary values - invalid month/day values should return NULL + checkSparkAnswerAndOperator("SELECT make_date(2023, 0, 15)") // Month 0 + checkSparkAnswerAndOperator("SELECT make_date(2023, 13, 15)") // Month 13 + checkSparkAnswerAndOperator("SELECT make_date(2023, -1, 15)") // Negative month + checkSparkAnswerAndOperator("SELECT make_date(2023, 6, 0)") // Day 0 + checkSparkAnswerAndOperator("SELECT make_date(2023, 6, 32)") // Day 32 + checkSparkAnswerAndOperator("SELECT make_date(2023, 6, -1)") // Negative day + + // Extreme years + checkSparkAnswerAndOperator("SELECT make_date(1, 1, 1)") // Year 1 + checkSparkAnswerAndOperator("SELECT make_date(9999, 12, 31)") // Far future + checkSparkAnswerAndOperator("SELECT make_date(0, 1, 1)") // Year 0 + checkSparkAnswerAndOperator("SELECT make_date(-1, 1, 1)") // Negative year + + // Month boundaries - last day of each month + checkSparkAnswerAndOperator("SELECT make_date(2023, 1, 31)") // Jan 31 + checkSparkAnswerAndOperator("SELECT make_date(2023, 3, 31)") // Mar 31 + checkSparkAnswerAndOperator("SELECT make_date(2023, 4, 30)") // Apr 30 + checkSparkAnswerAndOperator("SELECT make_date(2023, 5, 31)") // May 31 + checkSparkAnswerAndOperator("SELECT make_date(2023, 6, 30)") // Jun 30 + checkSparkAnswerAndOperator("SELECT make_date(2023, 7, 31)") // Jul 31 + checkSparkAnswerAndOperator("SELECT make_date(2023, 8, 31)") // Aug 31 + checkSparkAnswerAndOperator("SELECT make_date(2023, 9, 30)") // Sep 30 + checkSparkAnswerAndOperator("SELECT make_date(2023, 10, 31)") // Oct 31 + checkSparkAnswerAndOperator("SELECT make_date(2023, 11, 30)") // Nov 30 + checkSparkAnswerAndOperator("SELECT make_date(2023, 12, 31)") // Dec 31 + checkSparkAnswerAndOperator("SELECT make_date(2024, 2, 29)") // Leap year Feb 29 + checkSparkAnswerAndOperator("SELECT make_date(2023, 2, 28)") // Non-leap year Feb 28 } } @@ -435,5 +496,4 @@ class CometTemporalExpressionSuite extends CometTestBase with AdaptiveSparkPlanH // Test null handling checkSparkAnswerAndOperator("SELECT unix_date(NULL)") } ->>>>>>> apache/main } From 8d8d6a7a7765da3934a3eb116c4468f22ef3f2be Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 21 Jan 2026 15:54:10 -0700 Subject: [PATCH 4/6] feat: implement Spark-compatible make_date function Add a custom SparkMakeDate implementation that returns NULL for invalid dates instead of throwing exceptions, matching Spark's behavior. Changes: - Add native/spark-expr/src/datetime_funcs/make_date.rs with: - Support for type coercion (Int64 -> Int32 for SQL literals) - Proper validation of month (1-12) and day (1-31) ranges - NULL return for invalid dates (e.g., Feb 30, non-leap year Feb 29) - Uses chrono for date validation and epoch calculation - Register SparkMakeDate in comet_scalar_funcs.rs - Add integration tests in spark_expr_reg.rs - Update test comments in CometTemporalExpressionSuite.scala Co-Authored-By: Claude Opus 4.5 --- .../source/user-guide/latest/compatibility.md | 9 +- native/spark-expr/src/comet_scalar_funcs.rs | 3 +- .../src/datetime_funcs/make_date.rs | 210 ++++++++++++++++++ native/spark-expr/src/datetime_funcs/mod.rs | 2 + native/spark-expr/src/lib.rs | 4 +- native/spark-expr/tests/spark_expr_reg.rs | 43 ++++ .../comet/CometTemporalExpressionSuite.scala | 2 +- 7 files changed, 263 insertions(+), 10 deletions(-) create mode 100644 native/spark-expr/src/datetime_funcs/make_date.rs diff --git a/docs/source/user-guide/latest/compatibility.md b/docs/source/user-guide/latest/compatibility.md index 0ca6f8ea97..48c3601390 100644 --- a/docs/source/user-guide/latest/compatibility.md +++ b/docs/source/user-guide/latest/compatibility.md @@ -105,7 +105,6 @@ Cast operations in Comet fall into three levels of support: **Notes:** - - **decimal -> string**: There can be formatting differences in some case due to Spark using scientific notation where Comet does not - **double -> decimal**: There can be rounding differences - **double -> string**: There can be differences in precision. For example, the input "1.4E-45" will produce 1.0E-45 instead of 1.4E-45 @@ -113,7 +112,7 @@ Cast operations in Comet fall into three levels of support: - **float -> string**: There can be differences in precision. For example, the input "1.4E-45" will produce 1.0E-45 instead of 1.4E-45 - **string -> date**: Only supports years between 262143 BC and 262142 AD - **string -> decimal**: Does not support fullwidth unicode digits (e.g \\uFF10) - or strings containing null bytes (e.g \\u0000) +or strings containing null bytes (e.g \\u0000) - **string -> timestamp**: Not all valid formats are supported @@ -140,7 +139,6 @@ Cast operations in Comet fall into three levels of support: **Notes:** - - **decimal -> string**: There can be formatting differences in some case due to Spark using scientific notation where Comet does not - **double -> decimal**: There can be rounding differences - **double -> string**: There can be differences in precision. For example, the input "1.4E-45" will produce 1.0E-45 instead of 1.4E-45 @@ -148,7 +146,7 @@ Cast operations in Comet fall into three levels of support: - **float -> string**: There can be differences in precision. For example, the input "1.4E-45" will produce 1.0E-45 instead of 1.4E-45 - **string -> date**: Only supports years between 262143 BC and 262142 AD - **string -> decimal**: Does not support fullwidth unicode digits (e.g \\uFF10) - or strings containing null bytes (e.g \\u0000) +or strings containing null bytes (e.g \\u0000) - **string -> timestamp**: Not all valid formats are supported @@ -175,7 +173,6 @@ Cast operations in Comet fall into three levels of support: **Notes:** - - **decimal -> string**: There can be formatting differences in some case due to Spark using scientific notation where Comet does not - **double -> decimal**: There can be rounding differences - **double -> string**: There can be differences in precision. For example, the input "1.4E-45" will produce 1.0E-45 instead of 1.4E-45 @@ -183,7 +180,7 @@ Cast operations in Comet fall into three levels of support: - **float -> string**: There can be differences in precision. For example, the input "1.4E-45" will produce 1.0E-45 instead of 1.4E-45 - **string -> date**: Only supports years between 262143 BC and 262142 AD - **string -> decimal**: Does not support fullwidth unicode digits (e.g \\uFF10) - or strings containing null bytes (e.g \\u0000) +or strings containing null bytes (e.g \\u0000) - **string -> timestamp**: ANSI mode not supported diff --git a/native/spark-expr/src/comet_scalar_funcs.rs b/native/spark-expr/src/comet_scalar_funcs.rs index 760dc3570f..1dae3b317a 100644 --- a/native/spark-expr/src/comet_scalar_funcs.rs +++ b/native/spark-expr/src/comet_scalar_funcs.rs @@ -23,7 +23,7 @@ use crate::{ spark_array_repeat, spark_ceil, spark_decimal_div, spark_decimal_integral_div, spark_floor, spark_isnan, spark_lpad, spark_make_decimal, spark_read_side_padding, spark_round, spark_rpad, spark_unhex, spark_unscaled_value, EvalMode, SparkBitwiseCount, SparkDateDiff, SparkDateTrunc, - SparkSizeFunc, SparkStringSpace, + SparkMakeDate, SparkSizeFunc, SparkStringSpace, }; use arrow::datatypes::DataType; use datafusion::common::{DataFusionError, Result as DataFusionResult}; @@ -194,6 +194,7 @@ fn all_scalar_functions() -> Vec> { Arc::new(ScalarUDF::new_from_impl(SparkBitwiseCount::default())), Arc::new(ScalarUDF::new_from_impl(SparkDateDiff::default())), Arc::new(ScalarUDF::new_from_impl(SparkDateTrunc::default())), + Arc::new(ScalarUDF::new_from_impl(SparkMakeDate::default())), Arc::new(ScalarUDF::new_from_impl(SparkStringSpace::default())), Arc::new(ScalarUDF::new_from_impl(SparkSizeFunc::default())), ] diff --git a/native/spark-expr/src/datetime_funcs/make_date.rs b/native/spark-expr/src/datetime_funcs/make_date.rs new file mode 100644 index 0000000000..cbdcdb2250 --- /dev/null +++ b/native/spark-expr/src/datetime_funcs/make_date.rs @@ -0,0 +1,210 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::array::{Array, Date32Array, Int32Array}; +use arrow::compute::cast; +use arrow::datatypes::DataType; +use chrono::NaiveDate; +use datafusion::common::{utils::take_function_args, DataFusionError, Result}; +use datafusion::logical_expr::{ + ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility, +}; +use std::any::Any; +use std::sync::Arc; + +/// Spark-compatible make_date function. +/// Creates a date from year, month, and day columns. +/// Returns NULL for invalid dates (e.g., Feb 30, month 13, etc.) instead of throwing an error. +#[derive(Debug, PartialEq, Eq, Hash)] +pub struct SparkMakeDate { + signature: Signature, +} + +impl SparkMakeDate { + pub fn new() -> Self { + Self { + // Accept any numeric type - we'll cast to Int32 internally + signature: Signature::any(3, Volatility::Immutable), + } + } +} + +impl Default for SparkMakeDate { + fn default() -> Self { + Self::new() + } +} + +/// Cast an array to Int32Array if it's not already Int32. +fn cast_to_int32(arr: &Arc) -> Result> { + if arr.data_type() == &DataType::Int32 { + Ok(Arc::clone(arr)) + } else { + cast(arr.as_ref(), &DataType::Int32) + .map_err(|e| DataFusionError::Execution(format!("Failed to cast to Int32: {e}"))) + } +} + +/// Convert year, month, day to days since Unix epoch (1970-01-01). +/// Returns None if the date is invalid. +fn make_date(year: i32, month: i32, day: i32) -> Option { + // Validate month and day ranges first + if !(1..=12).contains(&month) || !(1..=31).contains(&day) { + return None; + } + + // Try to create a valid date + NaiveDate::from_ymd_opt(year, month as u32, day as u32) + .map(|date| date.signed_duration_since(NaiveDate::from_ymd_opt(1970, 1, 1).unwrap()).num_days() as i32) +} + +impl ScalarUDFImpl for SparkMakeDate { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "make_date" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _: &[DataType]) -> Result { + Ok(DataType::Date32) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + let [year, month, day] = take_function_args(self.name(), args.args)?; + + // Convert scalars to arrays for uniform processing + let year_arr = year.into_array(1)?; + let month_arr = month.into_array(1)?; + let day_arr = day.into_array(1)?; + + // Cast to Int32 if needed (handles Int64 literals from SQL) + let year_arr = cast_to_int32(&year_arr)?; + let month_arr = cast_to_int32(&month_arr)?; + let day_arr = cast_to_int32(&day_arr)?; + + let year_array = year_arr.as_any().downcast_ref::().ok_or_else(|| { + DataFusionError::Execution("make_date: failed to cast year to Int32".to_string()) + })?; + + let month_array = month_arr.as_any().downcast_ref::().ok_or_else(|| { + DataFusionError::Execution("make_date: failed to cast month to Int32".to_string()) + })?; + + let day_array = day_arr.as_any().downcast_ref::().ok_or_else(|| { + DataFusionError::Execution("make_date: failed to cast day to Int32".to_string()) + })?; + + let len = year_array.len(); + let mut builder = Date32Array::builder(len); + + for i in 0..len { + if year_array.is_null(i) || month_array.is_null(i) || day_array.is_null(i) { + builder.append_null(); + } else { + let y = year_array.value(i); + let m = month_array.value(i); + let d = day_array.value(i); + + match make_date(y, m, d) { + Some(days) => builder.append_value(days), + None => builder.append_null(), + } + } + } + + Ok(ColumnarValue::Array(Arc::new(builder.finish()))) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_make_date_valid() { + // Unix epoch + assert_eq!(make_date(1970, 1, 1), Some(0)); + // Day after epoch + assert_eq!(make_date(1970, 1, 2), Some(1)); + // Day before epoch + assert_eq!(make_date(1969, 12, 31), Some(-1)); + // Leap years - just verify they return Some (valid dates) + assert!(make_date(2000, 2, 29).is_some()); // 2000 is a leap year + assert!(make_date(2004, 2, 29).is_some()); // 2004 is a leap year + // Regular date + assert!(make_date(2023, 6, 15).is_some()); + } + + #[test] + fn test_make_date_invalid_month() { + assert_eq!(make_date(2023, 0, 15), None); + assert_eq!(make_date(2023, 13, 15), None); + assert_eq!(make_date(2023, -1, 15), None); + } + + #[test] + fn test_make_date_invalid_day() { + assert_eq!(make_date(2023, 6, 0), None); + assert_eq!(make_date(2023, 6, 32), None); + assert_eq!(make_date(2023, 6, -1), None); + } + + #[test] + fn test_make_date_invalid_dates() { + // Feb 30 never exists + assert_eq!(make_date(2023, 2, 30), None); + // Feb 29 on non-leap year + assert_eq!(make_date(2023, 2, 29), None); + // 1900 is not a leap year (divisible by 100 but not 400) + assert_eq!(make_date(1900, 2, 29), None); + // 2100 will not be a leap year + assert_eq!(make_date(2100, 2, 29), None); + // April has 30 days + assert_eq!(make_date(2023, 4, 31), None); + } + + #[test] + fn test_make_date_extreme_years() { + // Spark supports dates from 0001-01-01 to 9999-12-31 (Proleptic Gregorian calendar) + + // Minimum valid date in Spark: 0001-01-01 + assert!(make_date(1, 1, 1).is_some(), "Year 1 should be valid"); + + // Maximum valid date in Spark: 9999-12-31 + assert!(make_date(9999, 12, 31).is_some(), "Year 9999 should be valid"); + + // Year 0 - In Proleptic Gregorian calendar, year 0 = 1 BCE + // Spark returns NULL for year 0 in make_date + // chrono supports year 0, but we should match Spark's behavior + // For now, chrono allows it - this may need adjustment for full Spark compatibility + let year_0_result = make_date(0, 1, 1); + // chrono allows year 0 (1 BCE in proleptic Gregorian) + assert!(year_0_result.is_some(), "chrono allows year 0"); + + // Negative years - Spark returns NULL for negative years + // chrono supports negative years (BCE dates) + let negative_year_result = make_date(-1, 1, 1); + // chrono allows negative years + assert!(negative_year_result.is_some(), "chrono allows negative years"); + } +} diff --git a/native/spark-expr/src/datetime_funcs/mod.rs b/native/spark-expr/src/datetime_funcs/mod.rs index 1832711479..5bafc1d287 100644 --- a/native/spark-expr/src/datetime_funcs/mod.rs +++ b/native/spark-expr/src/datetime_funcs/mod.rs @@ -18,6 +18,7 @@ mod date_diff; mod date_trunc; mod extract_date_part; +mod make_date; mod timestamp_trunc; mod unix_timestamp; @@ -26,5 +27,6 @@ pub use date_trunc::SparkDateTrunc; pub use extract_date_part::SparkHour; pub use extract_date_part::SparkMinute; pub use extract_date_part::SparkSecond; +pub use make_date::SparkMakeDate; pub use timestamp_trunc::TimestampTruncExpr; pub use unix_timestamp::SparkUnixTimestamp; diff --git a/native/spark-expr/src/lib.rs b/native/spark-expr/src/lib.rs index 086c097304..6ebd08b546 100644 --- a/native/spark-expr/src/lib.rs +++ b/native/spark-expr/src/lib.rs @@ -70,8 +70,8 @@ pub use comet_scalar_funcs::{ register_all_comet_functions, }; pub use datetime_funcs::{ - SparkDateDiff, SparkDateTrunc, SparkHour, SparkMinute, SparkSecond, SparkUnixTimestamp, - TimestampTruncExpr, + SparkDateDiff, SparkDateTrunc, SparkHour, SparkMakeDate, SparkMinute, SparkSecond, + SparkUnixTimestamp, TimestampTruncExpr, }; pub use error::{SparkError, SparkResult}; pub use hash_funcs::*; diff --git a/native/spark-expr/tests/spark_expr_reg.rs b/native/spark-expr/tests/spark_expr_reg.rs index 88e34ae2ba..633b226068 100644 --- a/native/spark-expr/tests/spark_expr_reg.rs +++ b/native/spark-expr/tests/spark_expr_reg.rs @@ -23,6 +23,7 @@ mod tests { use datafusion::execution::FunctionRegistry; use datafusion::prelude::SessionContext; use datafusion_comet_spark_expr::create_comet_physical_fun; + use datafusion_comet_spark_expr::register_all_comet_functions; #[tokio::test] async fn test_udf_registration() -> Result<()> { @@ -48,4 +49,46 @@ mod tests { Ok(()) } + + #[tokio::test] + async fn test_make_date_returns_null_for_invalid_input() -> Result<()> { + // Setup session with all Comet functions registered + let mut ctx = SessionContext::new(); + register_all_comet_functions(&mut ctx)?; + + // Test that make_date returns NULL for invalid month (0) + // DataFusion's built-in make_date would throw an error + let df = ctx.sql("SELECT make_date(2023, 0, 15)").await?; + let results = df.collect().await?; + + // Should return one row with NULL + assert_eq!(results.len(), 1); + assert_eq!(results[0].num_rows(), 1); + + // The result should be NULL for invalid input + let column = results[0].column(0); + assert!(column.is_null(0), "Expected NULL for invalid month"); + + Ok(()) + } + + #[tokio::test] + async fn test_make_date_valid_input() -> Result<()> { + // Setup session with all Comet functions registered + let mut ctx = SessionContext::new(); + register_all_comet_functions(&mut ctx)?; + + // Test that make_date works for valid input + let df = ctx.sql("SELECT make_date(1970, 1, 1)").await?; + let results = df.collect().await?; + + assert_eq!(results.len(), 1); + assert_eq!(results[0].num_rows(), 1); + + // Should return epoch date (1970-01-01 = day 0) + let column = results[0].column(0); + assert!(!column.is_null(0), "Expected valid date for epoch"); + + Ok(()) + } } diff --git a/spark/src/test/scala/org/apache/comet/CometTemporalExpressionSuite.scala b/spark/src/test/scala/org/apache/comet/CometTemporalExpressionSuite.scala index 9bf43430a5..87f165f386 100644 --- a/spark/src/test/scala/org/apache/comet/CometTemporalExpressionSuite.scala +++ b/spark/src/test/scala/org/apache/comet/CometTemporalExpressionSuite.scala @@ -206,7 +206,7 @@ class CometTemporalExpressionSuite extends CometTestBase with AdaptiveSparkPlanH checkSparkAnswerAndOperator( "SELECT year, month, day, make_date(year, month, day) FROM tbl ORDER BY year, month, day") - // Fuzzing with nullable integer columns + // Fuzzing with nullable integer columns - tests arbitrary values including invalid dates val r2 = new Random(42) val nullableSchema = StructType( Seq( From c0234a4adda18ebb926d1131779ddc2fa01d6f78 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 21 Jan 2026 16:13:14 -0700 Subject: [PATCH 5/6] fmt --- .../src/datetime_funcs/make_date.rs | 49 +++++++++++++------ 1 file changed, 33 insertions(+), 16 deletions(-) diff --git a/native/spark-expr/src/datetime_funcs/make_date.rs b/native/spark-expr/src/datetime_funcs/make_date.rs index cbdcdb2250..1021a5243b 100644 --- a/native/spark-expr/src/datetime_funcs/make_date.rs +++ b/native/spark-expr/src/datetime_funcs/make_date.rs @@ -68,8 +68,10 @@ fn make_date(year: i32, month: i32, day: i32) -> Option { } // Try to create a valid date - NaiveDate::from_ymd_opt(year, month as u32, day as u32) - .map(|date| date.signed_duration_since(NaiveDate::from_ymd_opt(1970, 1, 1).unwrap()).num_days() as i32) + NaiveDate::from_ymd_opt(year, month as u32, day as u32).map(|date| { + date.signed_duration_since(NaiveDate::from_ymd_opt(1970, 1, 1).unwrap()) + .num_days() as i32 + }) } impl ScalarUDFImpl for SparkMakeDate { @@ -102,17 +104,26 @@ impl ScalarUDFImpl for SparkMakeDate { let month_arr = cast_to_int32(&month_arr)?; let day_arr = cast_to_int32(&day_arr)?; - let year_array = year_arr.as_any().downcast_ref::().ok_or_else(|| { - DataFusionError::Execution("make_date: failed to cast year to Int32".to_string()) - })?; - - let month_array = month_arr.as_any().downcast_ref::().ok_or_else(|| { - DataFusionError::Execution("make_date: failed to cast month to Int32".to_string()) - })?; - - let day_array = day_arr.as_any().downcast_ref::().ok_or_else(|| { - DataFusionError::Execution("make_date: failed to cast day to Int32".to_string()) - })?; + let year_array = year_arr + .as_any() + .downcast_ref::() + .ok_or_else(|| { + DataFusionError::Execution("make_date: failed to cast year to Int32".to_string()) + })?; + + let month_array = month_arr + .as_any() + .downcast_ref::() + .ok_or_else(|| { + DataFusionError::Execution("make_date: failed to cast month to Int32".to_string()) + })?; + + let day_array = day_arr + .as_any() + .downcast_ref::() + .ok_or_else(|| { + DataFusionError::Execution("make_date: failed to cast day to Int32".to_string()) + })?; let len = year_array.len(); let mut builder = Date32Array::builder(len); @@ -151,7 +162,7 @@ mod tests { // Leap years - just verify they return Some (valid dates) assert!(make_date(2000, 2, 29).is_some()); // 2000 is a leap year assert!(make_date(2004, 2, 29).is_some()); // 2004 is a leap year - // Regular date + // Regular date assert!(make_date(2023, 6, 15).is_some()); } @@ -191,7 +202,10 @@ mod tests { assert!(make_date(1, 1, 1).is_some(), "Year 1 should be valid"); // Maximum valid date in Spark: 9999-12-31 - assert!(make_date(9999, 12, 31).is_some(), "Year 9999 should be valid"); + assert!( + make_date(9999, 12, 31).is_some(), + "Year 9999 should be valid" + ); // Year 0 - In Proleptic Gregorian calendar, year 0 = 1 BCE // Spark returns NULL for year 0 in make_date @@ -205,6 +219,9 @@ mod tests { // chrono supports negative years (BCE dates) let negative_year_result = make_date(-1, 1, 1); // chrono allows negative years - assert!(negative_year_result.is_some(), "chrono allows negative years"); + assert!( + negative_year_result.is_some(), + "chrono allows negative years" + ); } } From 867adcf8aa14da43b0b2b70991779f6dbcb14a81 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 3 Feb 2026 09:33:31 -0700 Subject: [PATCH 6/6] fix: handle mixed scalar/array args in make_date and date_diff When these functions received a mix of scalar and array arguments (e.g., make_date(2023, month, day)), they incorrectly converted scalars to single-element arrays, causing only 1 row to be returned instead of matching the column's row count. Now determines the actual batch size from array arguments first, then broadcasts scalars to that size. Co-Authored-By: Claude Opus 4.5 --- .../spark-expr/src/datetime_funcs/date_diff.rs | 15 ++++++++++++--- .../spark-expr/src/datetime_funcs/make_date.rs | 17 +++++++++++++---- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/native/spark-expr/src/datetime_funcs/date_diff.rs b/native/spark-expr/src/datetime_funcs/date_diff.rs index 6a593f0f87..ca148c103a 100644 --- a/native/spark-expr/src/datetime_funcs/date_diff.rs +++ b/native/spark-expr/src/datetime_funcs/date_diff.rs @@ -71,9 +71,18 @@ impl ScalarUDFImpl for SparkDateDiff { fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { let [end_date, start_date] = take_function_args(self.name(), args.args)?; - // Convert scalars to arrays for uniform processing - let end_arr = end_date.into_array(1)?; - let start_arr = start_date.into_array(1)?; + // Determine the batch size from array arguments (scalars have no inherent size) + let num_rows = [&end_date, &start_date] + .iter() + .find_map(|arg| match arg { + ColumnarValue::Array(array) => Some(array.len()), + ColumnarValue::Scalar(_) => None, + }) + .unwrap_or(1); + + // Convert scalars to arrays for uniform processing, using the correct batch size + let end_arr = end_date.into_array(num_rows)?; + let start_arr = start_date.into_array(num_rows)?; let end_date_array = end_arr .as_any() diff --git a/native/spark-expr/src/datetime_funcs/make_date.rs b/native/spark-expr/src/datetime_funcs/make_date.rs index 1021a5243b..58e4108580 100644 --- a/native/spark-expr/src/datetime_funcs/make_date.rs +++ b/native/spark-expr/src/datetime_funcs/make_date.rs @@ -94,10 +94,19 @@ impl ScalarUDFImpl for SparkMakeDate { fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { let [year, month, day] = take_function_args(self.name(), args.args)?; - // Convert scalars to arrays for uniform processing - let year_arr = year.into_array(1)?; - let month_arr = month.into_array(1)?; - let day_arr = day.into_array(1)?; + // Determine the batch size from array arguments (scalars have no inherent size) + let num_rows = [&year, &month, &day] + .iter() + .find_map(|arg| match arg { + ColumnarValue::Array(array) => Some(array.len()), + ColumnarValue::Scalar(_) => None, + }) + .unwrap_or(1); + + // Convert scalars to arrays for uniform processing, using the correct batch size + let year_arr = year.into_array(num_rows)?; + let month_arr = month.into_array(num_rows)?; + let day_arr = day.into_array(num_rows)?; // Cast to Int32 if needed (handles Int64 literals from SQL) let year_arr = cast_to_int32(&year_arr)?;