From f0a6458f3769f1ffbea9fc418fbaf245257607ac Mon Sep 17 00:00:00 2001 From: comphead Date: Wed, 20 May 2026 11:45:01 -0700 Subject: [PATCH 1/4] chore: wire time functions + `slice`, `shift` --- .../spark_expressions_support.md | 12 ++-- docs/source/user-guide/latest/expressions.md | 26 +++++--- native/core/src/execution/jni_api.rs | 10 +++ .../apache/comet/serde/QueryPlanSerde.scala | 8 ++- .../sql-tests/expressions/array/slice.sql | 62 ++++++++++++++++++ .../bitwise/shiftrightunsigned.sql | 63 +++++++++++++++++++ .../expressions/datetime/unix_micros.sql | 41 ++++++++++++ .../expressions/datetime/unix_millis.sql | 41 ++++++++++++ .../expressions/datetime/unix_seconds.sql | 41 ++++++++++++ 9 files changed, 287 insertions(+), 17 deletions(-) create mode 100644 spark/src/test/resources/sql-tests/expressions/array/slice.sql create mode 100644 spark/src/test/resources/sql-tests/expressions/bitwise/shiftrightunsigned.sql create mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/unix_micros.sql create mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/unix_millis.sql create mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/unix_seconds.sql diff --git a/docs/source/contributor-guide/spark_expressions_support.md b/docs/source/contributor-guide/spark_expressions_support.md index 9c19926a7c..81a132f357 100644 --- a/docs/source/contributor-guide/spark_expressions_support.md +++ b/docs/source/contributor-guide/spark_expressions_support.md @@ -151,7 +151,7 @@ - [x] get - [ ] sequence - [ ] shuffle -- [ ] slice +- [x] slice - [x] sort_array ### bitwise_funcs @@ -165,7 +165,7 @@ - [x] bit_get - [x] getbit - [x] shiftright -- [ ] shiftrightunsigned +- [x] shiftrightunsigned - [x] `|` - [x] `~` @@ -214,7 +214,7 @@ ### datetime_funcs -- [ ] add_months +- [x] add_months - [x] convert_timezone - [ ] curdate - [ ] current_date @@ -286,9 +286,9 @@ - [ ] try_to_time - [ ] try_to_timestamp - [x] unix_date -- [ ] unix_micros -- [ ] unix_millis -- [ ] unix_seconds +- [x] unix_micros +- [x] unix_millis +- [x] unix_seconds - [x] unix_timestamp - [x] weekday - [x] weekofyear diff --git a/docs/source/user-guide/latest/expressions.md b/docs/source/user-guide/latest/expressions.md index 37440980ef..7fca291361 100644 --- a/docs/source/user-guide/latest/expressions.md +++ b/docs/source/user-guide/latest/expressions.md @@ -101,6 +101,7 @@ of expressions that be disabled. | Expression | SQL | | ---------------- | ---------------------------- | +| AddMonths | `add_months` | | ConvertTimezone | `convert_timezone` | | CurrentTimeZone | `current_timezone` | | DateAdd | `date_add` | @@ -124,6 +125,9 @@ of expressions that be disabled. | TruncDate | `trunc` | | TruncTimestamp | `date_trunc` | | UnixDate | `unix_date` | +| UnixMicros | `unix_micros` | +| UnixMillis | `unix_millis` | +| UnixSeconds | `unix_seconds` | | UnixTimestamp | `unix_timestamp` | | Year | `year` | | Month | `month` | @@ -206,16 +210,17 @@ of expressions that be disabled. ## Bitwise Expressions -| Expression | SQL | -| ------------ | ---- | -| BitwiseAnd | `&` | -| BitwiseCount | | -| BitwiseGet | | -| BitwiseOr | `\|` | -| BitwiseNot | `~` | -| BitwiseXor | `^` | -| ShiftLeft | `<<` | -| ShiftRight | `>>` | +| Expression | SQL | +| ------------------ | ----- | +| BitwiseAnd | `&` | +| BitwiseCount | | +| BitwiseGet | | +| BitwiseOr | `\|` | +| BitwiseNot | `~` | +| BitwiseXor | `^` | +| ShiftLeft | `<<` | +| ShiftRight | `>>` | +| ShiftRightUnsigned | `>>>` | ## Aggregate Expressions @@ -286,6 +291,7 @@ Comet supports using the following aggregate functions within window contexts wi | Flatten | | GetArrayItem | | Size | +| Slice | | SortArray | ## Map Expressions diff --git a/native/core/src/execution/jni_api.rs b/native/core/src/execution/jni_api.rs index a19d1ee368..ccebda43c8 100644 --- a/native/core/src/execution/jni_api.rs +++ b/native/core/src/execution/jni_api.rs @@ -43,15 +43,19 @@ use datafusion::{ }; use datafusion_comet_proto::spark_operator::Operator; use datafusion_spark::function::array::array_contains::SparkArrayContains; +use datafusion_spark::function::array::slice::SparkSlice; use datafusion_spark::function::bitwise::bit_count::SparkBitCount; use datafusion_spark::function::bitwise::bit_get::SparkBitGet; +use datafusion_spark::function::bitwise::bit_shift::SparkBitShift; use datafusion_spark::function::bitwise::bitwise_not::SparkBitwiseNot; +use datafusion_spark::function::datetime::add_months::SparkAddMonths; use datafusion_spark::function::datetime::date_add::SparkDateAdd; use datafusion_spark::function::datetime::date_sub::SparkDateSub; use datafusion_spark::function::datetime::from_utc_timestamp::SparkFromUtcTimestamp; use datafusion_spark::function::datetime::last_day::SparkLastDay; use datafusion_spark::function::datetime::next_day::SparkNextDay; use datafusion_spark::function::datetime::to_utc_timestamp::SparkToUtcTimestamp; +use datafusion_spark::function::datetime::unix::SparkUnixTimestamp; use datafusion_spark::function::hash::crc32::SparkCrc32; use datafusion_spark::function::hash::sha1::SparkSha1; use datafusion_spark::function::hash::sha2::SparkSha2; @@ -601,6 +605,12 @@ fn register_datafusion_spark_function(session_ctx: &SessionContext) { session_ctx.register_udf(ScalarUDF::new_from_impl(SparkCsc::default())); session_ctx.register_udf(ScalarUDF::new_from_impl(SparkFactorial::default())); session_ctx.register_udf(ScalarUDF::new_from_impl(SparkSec::default())); + session_ctx.register_udf(ScalarUDF::new_from_impl(SparkAddMonths::default())); + session_ctx.register_udf(ScalarUDF::new_from_impl(SparkSlice::default())); + session_ctx.register_udf(ScalarUDF::new_from_impl(SparkBitShift::right_unsigned())); + session_ctx.register_udf(ScalarUDF::new_from_impl(SparkUnixTimestamp::microseconds())); + session_ctx.register_udf(ScalarUDF::new_from_impl(SparkUnixTimestamp::milliseconds())); + session_ctx.register_udf(ScalarUDF::new_from_impl(SparkUnixTimestamp::seconds())); } /// Prepares arrow arrays for output. diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala index 9c80f33d39..46e64284ac 100644 --- a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala @@ -63,6 +63,7 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim { classOf[ArrayPosition] -> CometArrayPosition, classOf[ArrayRemove] -> CometArrayRemove, classOf[ArrayRepeat] -> CometArrayRepeat, + classOf[Slice] -> CometScalarFunction("slice"), classOf[SortArray] -> CometSortArray, classOf[ArraysOverlap] -> CometArraysOverlap, classOf[ArrayUnion] -> CometArrayUnion, @@ -214,10 +215,12 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim { classOf[BitwiseNot] -> CometBitwiseNot, classOf[BitwiseXor] -> CometBitwiseXor, classOf[ShiftLeft] -> CometShiftLeft, - classOf[ShiftRight] -> CometShiftRight) + classOf[ShiftRight] -> CometShiftRight, + classOf[ShiftRightUnsigned] -> CometScalarFunction("shiftrightunsigned")) private[comet] val temporalExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] = Map( + classOf[AddMonths] -> CometScalarFunction("add_months"), classOf[ConvertTimezone] -> CometConvertTimezone, classOf[DateAdd] -> CometDateAdd, classOf[DateDiff] -> CometDateDiff, @@ -227,6 +230,9 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim { classOf[Hours] -> CometHours, classOf[DateSub] -> CometDateSub, classOf[UnixDate] -> CometUnixDate, + classOf[UnixMicros] -> CometScalarFunction("unix_micros"), + classOf[UnixMillis] -> CometScalarFunction("unix_millis"), + classOf[UnixSeconds] -> CometScalarFunction("unix_seconds"), classOf[FromUnixTime] -> CometFromUnixTime, classOf[FromUTCTimestamp] -> CometFromUTCTimestamp, classOf[ToUTCTimestamp] -> CometToUTCTimestamp, diff --git a/spark/src/test/resources/sql-tests/expressions/array/slice.sql b/spark/src/test/resources/sql-tests/expressions/array/slice.sql new file mode 100644 index 0000000000..a439d74820 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/array/slice.sql @@ -0,0 +1,62 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- ConfigMatrix: parquet.enable.dictionary=false,true + +statement +CREATE TABLE test_slice(arr array, s int, l int) USING parquet + +statement +INSERT INTO test_slice VALUES + (array(1, 2, 3, 4, 5), 2, 3), + (array(1, 2, 3, 4, 5), 1, 5), + (array(1, 2, 3, 4, 5), 1, 10), + (array(1, 2, 3, 4, 5), 3, 0), + (array(1, 2, 3, 4, 5), -2, 2), + (array(1, 2, 3, 4, 5), -10, 2), + (array(1, 2, 3, 4, 5), 10, 2), + (array(1, NULL, 3, NULL, 5), 1, 5), + (array(), 1, 3), + (NULL, 1, 3), + (array(1, 2, 3), NULL, 2), + (array(1, 2, 3), 1, NULL) + +-- column array, column start, column length +query +SELECT slice(arr, s, l) FROM test_slice + +-- column array, literal start and length +query +SELECT slice(arr, 2, 2) FROM test_slice + +-- column array, negative literal start +query +SELECT slice(arr, -1, 1) FROM test_slice + +-- string element type +statement +CREATE TABLE test_slice_string(arr array) USING parquet + +statement +INSERT INTO test_slice_string VALUES + (array('a', 'b', 'c', 'd')), + (array('é', '日本', '', 'x')), + (array('a', NULL, 'c')), + (NULL) + +query +SELECT slice(arr, 1, 2) FROM test_slice_string diff --git a/spark/src/test/resources/sql-tests/expressions/bitwise/shiftrightunsigned.sql b/spark/src/test/resources/sql-tests/expressions/bitwise/shiftrightunsigned.sql new file mode 100644 index 0000000000..37f51c43e5 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/bitwise/shiftrightunsigned.sql @@ -0,0 +1,63 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- ConfigMatrix: parquet.enable.dictionary=false,true + +-- Spark's ShiftRightUnsigned: first arg is Int or Long, second is Int. +-- Returns the same integer type as the first argument. Shift amount is +-- normalized to the bit width (Java semantics) for negative/large shifts. + +statement +CREATE TABLE test_shiftrightunsigned_int(v int, s int) USING parquet + +statement +INSERT INTO test_shiftrightunsigned_int VALUES + (1, 1), + (-1, 1), + (8, 2), + (2147483647, 1), + (-2147483648, 1), + (0, 0), + (1, 0), + (1, 31), + (1, 32), + (1, 33), + (1, -1), + (NULL, 1), + (1, NULL) + +query +SELECT shiftrightunsigned(v, s) FROM test_shiftrightunsigned_int + +statement +CREATE TABLE test_shiftrightunsigned_long(v bigint, s int) USING parquet + +statement +INSERT INTO test_shiftrightunsigned_long VALUES + (1, 1), + (-1, 1), + (9223372036854775807, 1), + (-9223372036854775808, 1), + (0, 0), + (1, 63), + (1, 64), + (1, -1), + (NULL, 1), + (1, NULL) + +query +SELECT shiftrightunsigned(v, s) FROM test_shiftrightunsigned_long diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/unix_micros.sql b/spark/src/test/resources/sql-tests/expressions/datetime/unix_micros.sql new file mode 100644 index 0000000000..8b278de839 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/unix_micros.sql @@ -0,0 +1,41 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- The result is microseconds since epoch in UTC, so it must not depend on the +-- session timezone. +-- ConfigMatrix: spark.sql.session.timeZone=UTC,America/Los_Angeles + +statement +CREATE TABLE test_unix_micros(ts timestamp) USING parquet + +statement +INSERT INTO test_unix_micros VALUES + (timestamp('1970-01-01 00:00:00')), + (timestamp('2024-01-15 12:34:56.123456')), + (timestamp('1969-12-31 23:59:59.999999')), + (timestamp('9999-12-31 23:59:59.999999')), + (timestamp('0001-01-01 00:00:00')), + (NULL) + +query +SELECT unix_micros(ts) FROM test_unix_micros + +-- literal arguments +query +SELECT unix_micros(timestamp('1970-01-01 00:00:00')), + unix_micros(timestamp('2024-01-15 12:34:56.123456')), + unix_micros(NULL) diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/unix_millis.sql b/spark/src/test/resources/sql-tests/expressions/datetime/unix_millis.sql new file mode 100644 index 0000000000..a6344aa02c --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/unix_millis.sql @@ -0,0 +1,41 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- The result is milliseconds since epoch in UTC, so it must not depend on the +-- session timezone. +-- ConfigMatrix: spark.sql.session.timeZone=UTC,America/Los_Angeles + +statement +CREATE TABLE test_unix_millis(ts timestamp) USING parquet + +statement +INSERT INTO test_unix_millis VALUES + (timestamp('1970-01-01 00:00:00')), + (timestamp('2024-01-15 12:34:56.123456')), + (timestamp('1969-12-31 23:59:59.999999')), + (timestamp('9999-12-31 23:59:59.999999')), + (timestamp('0001-01-01 00:00:00')), + (NULL) + +query +SELECT unix_millis(ts) FROM test_unix_millis + +-- literal arguments +query +SELECT unix_millis(timestamp('1970-01-01 00:00:00')), + unix_millis(timestamp('2024-01-15 12:34:56.123456')), + unix_millis(NULL) diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/unix_seconds.sql b/spark/src/test/resources/sql-tests/expressions/datetime/unix_seconds.sql new file mode 100644 index 0000000000..4d8771b0f0 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/unix_seconds.sql @@ -0,0 +1,41 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- The result is whole seconds since epoch in UTC, so it must not depend on +-- the session timezone. +-- ConfigMatrix: spark.sql.session.timeZone=UTC,America/Los_Angeles + +statement +CREATE TABLE test_unix_seconds(ts timestamp) USING parquet + +statement +INSERT INTO test_unix_seconds VALUES + (timestamp('1970-01-01 00:00:00')), + (timestamp('2024-01-15 12:34:56.123456')), + (timestamp('1969-12-31 23:59:59.999999')), + (timestamp('9999-12-31 23:59:59.999999')), + (timestamp('0001-01-01 00:00:00')), + (NULL) + +query +SELECT unix_seconds(ts) FROM test_unix_seconds + +-- literal arguments +query +SELECT unix_seconds(timestamp('1970-01-01 00:00:00')), + unix_seconds(timestamp('2024-01-15 12:34:56.123456')), + unix_seconds(NULL) From 3e462a3c64cd7e3ded093ff68a1036bb4e7697f9 Mon Sep 17 00:00:00 2001 From: comphead Date: Wed, 20 May 2026 11:50:57 -0700 Subject: [PATCH 2/4] chore: wire time functions + `slice`, `shift` --- .../expressions/datetime/add_months.sql | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/add_months.sql diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/add_months.sql b/spark/src/test/resources/sql-tests/expressions/datetime/add_months.sql new file mode 100644 index 0000000000..b4bf066797 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/add_months.sql @@ -0,0 +1,51 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- ConfigMatrix: parquet.enable.dictionary=false,true + +-- Spark's AddMonths expects (DateType, IntegerType) and rolls back the day +-- when the destination month has fewer days (e.g. Jan 31 + 1 month = Feb 28). + +statement +CREATE TABLE test_add_months(d date, n int) USING parquet + +statement +INSERT INTO test_add_months VALUES + (date('2024-01-31'), 1), + (date('2024-01-31'), 12), + (date('2024-01-31'), -1), + (date('2024-02-29'), 12), + (date('2024-02-29'), -12), + (date('2020-02-29'), 48), + (date('1970-01-01'), 0), + (date('1970-01-01'), 2147483647), + (date('1970-01-01'), -2147483648), + (date('9999-12-01'), 1), + (date('0001-01-01'), -1), + (NULL, 1), + (date('2024-01-15'), NULL) + +query +SELECT add_months(d, n) FROM test_add_months + +-- literal arguments +query +SELECT add_months(date('2024-01-31'), 1), + add_months(date('2024-01-31'), -1), + add_months(date('2020-02-29'), 12), + add_months(NULL, 1), + add_months(date('2024-01-15'), NULL) From fbd86f3d6afe5148b8cd22ab2ff3206b16d533eb Mon Sep 17 00:00:00 2001 From: comphead Date: Wed, 20 May 2026 11:58:49 -0700 Subject: [PATCH 3/4] chore: wire time functions + `shift` --- .../spark_expressions_support.md | 2 +- docs/source/user-guide/latest/expressions.md | 1 - native/core/src/execution/jni_api.rs | 2 - .../apache/comet/serde/QueryPlanSerde.scala | 1 - .../sql-tests/expressions/array/slice.sql | 62 ------------------- 5 files changed, 1 insertion(+), 67 deletions(-) delete mode 100644 spark/src/test/resources/sql-tests/expressions/array/slice.sql diff --git a/docs/source/contributor-guide/spark_expressions_support.md b/docs/source/contributor-guide/spark_expressions_support.md index 81a132f357..a4aa135364 100644 --- a/docs/source/contributor-guide/spark_expressions_support.md +++ b/docs/source/contributor-guide/spark_expressions_support.md @@ -151,7 +151,7 @@ - [x] get - [ ] sequence - [ ] shuffle -- [x] slice +- [ ] slice - [x] sort_array ### bitwise_funcs diff --git a/docs/source/user-guide/latest/expressions.md b/docs/source/user-guide/latest/expressions.md index 7fca291361..10979c9dcc 100644 --- a/docs/source/user-guide/latest/expressions.md +++ b/docs/source/user-guide/latest/expressions.md @@ -291,7 +291,6 @@ Comet supports using the following aggregate functions within window contexts wi | Flatten | | GetArrayItem | | Size | -| Slice | | SortArray | ## Map Expressions diff --git a/native/core/src/execution/jni_api.rs b/native/core/src/execution/jni_api.rs index ccebda43c8..2381c42c36 100644 --- a/native/core/src/execution/jni_api.rs +++ b/native/core/src/execution/jni_api.rs @@ -43,7 +43,6 @@ use datafusion::{ }; use datafusion_comet_proto::spark_operator::Operator; use datafusion_spark::function::array::array_contains::SparkArrayContains; -use datafusion_spark::function::array::slice::SparkSlice; use datafusion_spark::function::bitwise::bit_count::SparkBitCount; use datafusion_spark::function::bitwise::bit_get::SparkBitGet; use datafusion_spark::function::bitwise::bit_shift::SparkBitShift; @@ -606,7 +605,6 @@ fn register_datafusion_spark_function(session_ctx: &SessionContext) { session_ctx.register_udf(ScalarUDF::new_from_impl(SparkFactorial::default())); session_ctx.register_udf(ScalarUDF::new_from_impl(SparkSec::default())); session_ctx.register_udf(ScalarUDF::new_from_impl(SparkAddMonths::default())); - session_ctx.register_udf(ScalarUDF::new_from_impl(SparkSlice::default())); session_ctx.register_udf(ScalarUDF::new_from_impl(SparkBitShift::right_unsigned())); session_ctx.register_udf(ScalarUDF::new_from_impl(SparkUnixTimestamp::microseconds())); session_ctx.register_udf(ScalarUDF::new_from_impl(SparkUnixTimestamp::milliseconds())); diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala index 46e64284ac..e61955c4f1 100644 --- a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala @@ -63,7 +63,6 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim { classOf[ArrayPosition] -> CometArrayPosition, classOf[ArrayRemove] -> CometArrayRemove, classOf[ArrayRepeat] -> CometArrayRepeat, - classOf[Slice] -> CometScalarFunction("slice"), classOf[SortArray] -> CometSortArray, classOf[ArraysOverlap] -> CometArraysOverlap, classOf[ArrayUnion] -> CometArrayUnion, diff --git a/spark/src/test/resources/sql-tests/expressions/array/slice.sql b/spark/src/test/resources/sql-tests/expressions/array/slice.sql deleted file mode 100644 index a439d74820..0000000000 --- a/spark/src/test/resources/sql-tests/expressions/array/slice.sql +++ /dev/null @@ -1,62 +0,0 @@ --- Licensed to the Apache Software Foundation (ASF) under one --- or more contributor license agreements. See the NOTICE file --- distributed with this work for additional information --- regarding copyright ownership. The ASF licenses this file --- to you under the Apache License, Version 2.0 (the --- "License"); you may not use this file except in compliance --- with the License. You may obtain a copy of the License at --- --- http://www.apache.org/licenses/LICENSE-2.0 --- --- Unless required by applicable law or agreed to in writing, --- software distributed under the License is distributed on an --- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY --- KIND, either express or implied. See the License for the --- specific language governing permissions and limitations --- under the License. - --- ConfigMatrix: parquet.enable.dictionary=false,true - -statement -CREATE TABLE test_slice(arr array, s int, l int) USING parquet - -statement -INSERT INTO test_slice VALUES - (array(1, 2, 3, 4, 5), 2, 3), - (array(1, 2, 3, 4, 5), 1, 5), - (array(1, 2, 3, 4, 5), 1, 10), - (array(1, 2, 3, 4, 5), 3, 0), - (array(1, 2, 3, 4, 5), -2, 2), - (array(1, 2, 3, 4, 5), -10, 2), - (array(1, 2, 3, 4, 5), 10, 2), - (array(1, NULL, 3, NULL, 5), 1, 5), - (array(), 1, 3), - (NULL, 1, 3), - (array(1, 2, 3), NULL, 2), - (array(1, 2, 3), 1, NULL) - --- column array, column start, column length -query -SELECT slice(arr, s, l) FROM test_slice - --- column array, literal start and length -query -SELECT slice(arr, 2, 2) FROM test_slice - --- column array, negative literal start -query -SELECT slice(arr, -1, 1) FROM test_slice - --- string element type -statement -CREATE TABLE test_slice_string(arr array) USING parquet - -statement -INSERT INTO test_slice_string VALUES - (array('a', 'b', 'c', 'd')), - (array('é', '日本', '', 'x')), - (array('a', NULL, 'c')), - (NULL) - -query -SELECT slice(arr, 1, 2) FROM test_slice_string From 36db5431d4342ddd89fa3e2cc2bf2c3ee2a2c24f Mon Sep 17 00:00:00 2001 From: comphead Date: Wed, 20 May 2026 15:56:20 -0700 Subject: [PATCH 4/4] chore: wire function `shiftrightunsigned` --- .../spark_expressions_support.md | 8 +-- docs/source/user-guide/latest/expressions.md | 4 -- native/core/src/execution/jni_api.rs | 6 -- .../apache/comet/serde/QueryPlanSerde.scala | 4 -- .../sql-tests/expressions/bitwise/bitwise.sql | 47 ++++++++++++++ .../bitwise/shiftrightunsigned.sql | 63 ------------------- .../expressions/datetime/add_months.sql | 51 --------------- .../expressions/datetime/unix_micros.sql | 41 ------------ .../expressions/datetime/unix_millis.sql | 41 ------------ .../expressions/datetime/unix_seconds.sql | 41 ------------ 10 files changed, 51 insertions(+), 255 deletions(-) delete mode 100644 spark/src/test/resources/sql-tests/expressions/bitwise/shiftrightunsigned.sql delete mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/add_months.sql delete mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/unix_micros.sql delete mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/unix_millis.sql delete mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/unix_seconds.sql diff --git a/docs/source/contributor-guide/spark_expressions_support.md b/docs/source/contributor-guide/spark_expressions_support.md index a4aa135364..dc1cdde6fe 100644 --- a/docs/source/contributor-guide/spark_expressions_support.md +++ b/docs/source/contributor-guide/spark_expressions_support.md @@ -214,7 +214,7 @@ ### datetime_funcs -- [x] add_months +- [ ] add_months - [x] convert_timezone - [ ] curdate - [ ] current_date @@ -286,9 +286,9 @@ - [ ] try_to_time - [ ] try_to_timestamp - [x] unix_date -- [x] unix_micros -- [x] unix_millis -- [x] unix_seconds +- [ ] unix_micros +- [ ] unix_millis +- [ ] unix_seconds - [x] unix_timestamp - [x] weekday - [x] weekofyear diff --git a/docs/source/user-guide/latest/expressions.md b/docs/source/user-guide/latest/expressions.md index 10979c9dcc..0bfce08f4d 100644 --- a/docs/source/user-guide/latest/expressions.md +++ b/docs/source/user-guide/latest/expressions.md @@ -101,7 +101,6 @@ of expressions that be disabled. | Expression | SQL | | ---------------- | ---------------------------- | -| AddMonths | `add_months` | | ConvertTimezone | `convert_timezone` | | CurrentTimeZone | `current_timezone` | | DateAdd | `date_add` | @@ -125,9 +124,6 @@ of expressions that be disabled. | TruncDate | `trunc` | | TruncTimestamp | `date_trunc` | | UnixDate | `unix_date` | -| UnixMicros | `unix_micros` | -| UnixMillis | `unix_millis` | -| UnixSeconds | `unix_seconds` | | UnixTimestamp | `unix_timestamp` | | Year | `year` | | Month | `month` | diff --git a/native/core/src/execution/jni_api.rs b/native/core/src/execution/jni_api.rs index 2381c42c36..3454be18ff 100644 --- a/native/core/src/execution/jni_api.rs +++ b/native/core/src/execution/jni_api.rs @@ -47,14 +47,12 @@ use datafusion_spark::function::bitwise::bit_count::SparkBitCount; use datafusion_spark::function::bitwise::bit_get::SparkBitGet; use datafusion_spark::function::bitwise::bit_shift::SparkBitShift; use datafusion_spark::function::bitwise::bitwise_not::SparkBitwiseNot; -use datafusion_spark::function::datetime::add_months::SparkAddMonths; use datafusion_spark::function::datetime::date_add::SparkDateAdd; use datafusion_spark::function::datetime::date_sub::SparkDateSub; use datafusion_spark::function::datetime::from_utc_timestamp::SparkFromUtcTimestamp; use datafusion_spark::function::datetime::last_day::SparkLastDay; use datafusion_spark::function::datetime::next_day::SparkNextDay; use datafusion_spark::function::datetime::to_utc_timestamp::SparkToUtcTimestamp; -use datafusion_spark::function::datetime::unix::SparkUnixTimestamp; use datafusion_spark::function::hash::crc32::SparkCrc32; use datafusion_spark::function::hash::sha1::SparkSha1; use datafusion_spark::function::hash::sha2::SparkSha2; @@ -604,11 +602,7 @@ fn register_datafusion_spark_function(session_ctx: &SessionContext) { session_ctx.register_udf(ScalarUDF::new_from_impl(SparkCsc::default())); session_ctx.register_udf(ScalarUDF::new_from_impl(SparkFactorial::default())); session_ctx.register_udf(ScalarUDF::new_from_impl(SparkSec::default())); - session_ctx.register_udf(ScalarUDF::new_from_impl(SparkAddMonths::default())); session_ctx.register_udf(ScalarUDF::new_from_impl(SparkBitShift::right_unsigned())); - session_ctx.register_udf(ScalarUDF::new_from_impl(SparkUnixTimestamp::microseconds())); - session_ctx.register_udf(ScalarUDF::new_from_impl(SparkUnixTimestamp::milliseconds())); - session_ctx.register_udf(ScalarUDF::new_from_impl(SparkUnixTimestamp::seconds())); } /// Prepares arrow arrays for output. diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala index e61955c4f1..3095682375 100644 --- a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala @@ -219,7 +219,6 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim { private[comet] val temporalExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] = Map( - classOf[AddMonths] -> CometScalarFunction("add_months"), classOf[ConvertTimezone] -> CometConvertTimezone, classOf[DateAdd] -> CometDateAdd, classOf[DateDiff] -> CometDateDiff, @@ -229,9 +228,6 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim { classOf[Hours] -> CometHours, classOf[DateSub] -> CometDateSub, classOf[UnixDate] -> CometUnixDate, - classOf[UnixMicros] -> CometScalarFunction("unix_micros"), - classOf[UnixMillis] -> CometScalarFunction("unix_millis"), - classOf[UnixSeconds] -> CometScalarFunction("unix_seconds"), classOf[FromUnixTime] -> CometFromUnixTime, classOf[FromUTCTimestamp] -> CometFromUTCTimestamp, classOf[ToUTCTimestamp] -> CometToUTCTimestamp, diff --git a/spark/src/test/resources/sql-tests/expressions/bitwise/bitwise.sql b/spark/src/test/resources/sql-tests/expressions/bitwise/bitwise.sql index 8b7ade25f0..f4a481fb2f 100644 --- a/spark/src/test/resources/sql-tests/expressions/bitwise/bitwise.sql +++ b/spark/src/test/resources/sql-tests/expressions/bitwise/bitwise.sql @@ -44,6 +44,50 @@ SELECT shiftright(col1, 2), shiftright(col1, col2) FROM test query SELECT shiftleft(col1, 2), shiftleft(col1, col2) FROM test +-- ShiftRightUnsigned: first arg is Int or Long, second is Int. Returns the +-- same integer type as the first argument. Shift amount is normalized to the +-- bit width (Java semantics) for negative and large shifts. +statement +CREATE TABLE test_shiftrightunsigned_int(v int, s int) USING parquet + +statement +INSERT INTO test_shiftrightunsigned_int VALUES + (1, 1), + (-1, 1), + (8, 2), + (2147483647, 1), + (-2147483648, 1), + (0, 0), + (1, 0), + (1, 31), + (1, 32), + (1, 33), + (1, -1), + (NULL, 1), + (1, NULL) + +query +SELECT shiftrightunsigned(v, s) FROM test_shiftrightunsigned_int + +statement +CREATE TABLE test_shiftrightunsigned_long(v bigint, s int) USING parquet + +statement +INSERT INTO test_shiftrightunsigned_long VALUES + (1, 1), + (-1, 1), + (9223372036854775807, 1), + (-9223372036854775808, 1), + (0, 0), + (1, 63), + (1, 64), + (1, -1), + (NULL, 1), + (1, NULL) + +query +SELECT shiftrightunsigned(v, s) FROM test_shiftrightunsigned_long + query SELECT ~(11), ~col1, ~col2 FROM test @@ -79,3 +123,6 @@ SELECT bit_get(11, 0), bit_get(11, 1), bit_get(11, 2), bit_get(11, 3) query SELECT shiftright(1111, 2), shiftleft(1111, 2) + +query +SELECT shiftrightunsigned(1, 1), shiftrightunsigned(-1, 1), shiftrightunsigned(2147483647, 1), shiftrightunsigned(cast(-1 as bigint), 1) diff --git a/spark/src/test/resources/sql-tests/expressions/bitwise/shiftrightunsigned.sql b/spark/src/test/resources/sql-tests/expressions/bitwise/shiftrightunsigned.sql deleted file mode 100644 index 37f51c43e5..0000000000 --- a/spark/src/test/resources/sql-tests/expressions/bitwise/shiftrightunsigned.sql +++ /dev/null @@ -1,63 +0,0 @@ --- Licensed to the Apache Software Foundation (ASF) under one --- or more contributor license agreements. See the NOTICE file --- distributed with this work for additional information --- regarding copyright ownership. The ASF licenses this file --- to you under the Apache License, Version 2.0 (the --- "License"); you may not use this file except in compliance --- with the License. You may obtain a copy of the License at --- --- http://www.apache.org/licenses/LICENSE-2.0 --- --- Unless required by applicable law or agreed to in writing, --- software distributed under the License is distributed on an --- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY --- KIND, either express or implied. See the License for the --- specific language governing permissions and limitations --- under the License. - --- ConfigMatrix: parquet.enable.dictionary=false,true - --- Spark's ShiftRightUnsigned: first arg is Int or Long, second is Int. --- Returns the same integer type as the first argument. Shift amount is --- normalized to the bit width (Java semantics) for negative/large shifts. - -statement -CREATE TABLE test_shiftrightunsigned_int(v int, s int) USING parquet - -statement -INSERT INTO test_shiftrightunsigned_int VALUES - (1, 1), - (-1, 1), - (8, 2), - (2147483647, 1), - (-2147483648, 1), - (0, 0), - (1, 0), - (1, 31), - (1, 32), - (1, 33), - (1, -1), - (NULL, 1), - (1, NULL) - -query -SELECT shiftrightunsigned(v, s) FROM test_shiftrightunsigned_int - -statement -CREATE TABLE test_shiftrightunsigned_long(v bigint, s int) USING parquet - -statement -INSERT INTO test_shiftrightunsigned_long VALUES - (1, 1), - (-1, 1), - (9223372036854775807, 1), - (-9223372036854775808, 1), - (0, 0), - (1, 63), - (1, 64), - (1, -1), - (NULL, 1), - (1, NULL) - -query -SELECT shiftrightunsigned(v, s) FROM test_shiftrightunsigned_long diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/add_months.sql b/spark/src/test/resources/sql-tests/expressions/datetime/add_months.sql deleted file mode 100644 index b4bf066797..0000000000 --- a/spark/src/test/resources/sql-tests/expressions/datetime/add_months.sql +++ /dev/null @@ -1,51 +0,0 @@ --- Licensed to the Apache Software Foundation (ASF) under one --- or more contributor license agreements. See the NOTICE file --- distributed with this work for additional information --- regarding copyright ownership. The ASF licenses this file --- to you under the Apache License, Version 2.0 (the --- "License"); you may not use this file except in compliance --- with the License. You may obtain a copy of the License at --- --- http://www.apache.org/licenses/LICENSE-2.0 --- --- Unless required by applicable law or agreed to in writing, --- software distributed under the License is distributed on an --- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY --- KIND, either express or implied. See the License for the --- specific language governing permissions and limitations --- under the License. - --- ConfigMatrix: parquet.enable.dictionary=false,true - --- Spark's AddMonths expects (DateType, IntegerType) and rolls back the day --- when the destination month has fewer days (e.g. Jan 31 + 1 month = Feb 28). - -statement -CREATE TABLE test_add_months(d date, n int) USING parquet - -statement -INSERT INTO test_add_months VALUES - (date('2024-01-31'), 1), - (date('2024-01-31'), 12), - (date('2024-01-31'), -1), - (date('2024-02-29'), 12), - (date('2024-02-29'), -12), - (date('2020-02-29'), 48), - (date('1970-01-01'), 0), - (date('1970-01-01'), 2147483647), - (date('1970-01-01'), -2147483648), - (date('9999-12-01'), 1), - (date('0001-01-01'), -1), - (NULL, 1), - (date('2024-01-15'), NULL) - -query -SELECT add_months(d, n) FROM test_add_months - --- literal arguments -query -SELECT add_months(date('2024-01-31'), 1), - add_months(date('2024-01-31'), -1), - add_months(date('2020-02-29'), 12), - add_months(NULL, 1), - add_months(date('2024-01-15'), NULL) diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/unix_micros.sql b/spark/src/test/resources/sql-tests/expressions/datetime/unix_micros.sql deleted file mode 100644 index 8b278de839..0000000000 --- a/spark/src/test/resources/sql-tests/expressions/datetime/unix_micros.sql +++ /dev/null @@ -1,41 +0,0 @@ --- Licensed to the Apache Software Foundation (ASF) under one --- or more contributor license agreements. See the NOTICE file --- distributed with this work for additional information --- regarding copyright ownership. The ASF licenses this file --- to you under the Apache License, Version 2.0 (the --- "License"); you may not use this file except in compliance --- with the License. You may obtain a copy of the License at --- --- http://www.apache.org/licenses/LICENSE-2.0 --- --- Unless required by applicable law or agreed to in writing, --- software distributed under the License is distributed on an --- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY --- KIND, either express or implied. See the License for the --- specific language governing permissions and limitations --- under the License. - --- The result is microseconds since epoch in UTC, so it must not depend on the --- session timezone. --- ConfigMatrix: spark.sql.session.timeZone=UTC,America/Los_Angeles - -statement -CREATE TABLE test_unix_micros(ts timestamp) USING parquet - -statement -INSERT INTO test_unix_micros VALUES - (timestamp('1970-01-01 00:00:00')), - (timestamp('2024-01-15 12:34:56.123456')), - (timestamp('1969-12-31 23:59:59.999999')), - (timestamp('9999-12-31 23:59:59.999999')), - (timestamp('0001-01-01 00:00:00')), - (NULL) - -query -SELECT unix_micros(ts) FROM test_unix_micros - --- literal arguments -query -SELECT unix_micros(timestamp('1970-01-01 00:00:00')), - unix_micros(timestamp('2024-01-15 12:34:56.123456')), - unix_micros(NULL) diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/unix_millis.sql b/spark/src/test/resources/sql-tests/expressions/datetime/unix_millis.sql deleted file mode 100644 index a6344aa02c..0000000000 --- a/spark/src/test/resources/sql-tests/expressions/datetime/unix_millis.sql +++ /dev/null @@ -1,41 +0,0 @@ --- Licensed to the Apache Software Foundation (ASF) under one --- or more contributor license agreements. See the NOTICE file --- distributed with this work for additional information --- regarding copyright ownership. The ASF licenses this file --- to you under the Apache License, Version 2.0 (the --- "License"); you may not use this file except in compliance --- with the License. You may obtain a copy of the License at --- --- http://www.apache.org/licenses/LICENSE-2.0 --- --- Unless required by applicable law or agreed to in writing, --- software distributed under the License is distributed on an --- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY --- KIND, either express or implied. See the License for the --- specific language governing permissions and limitations --- under the License. - --- The result is milliseconds since epoch in UTC, so it must not depend on the --- session timezone. --- ConfigMatrix: spark.sql.session.timeZone=UTC,America/Los_Angeles - -statement -CREATE TABLE test_unix_millis(ts timestamp) USING parquet - -statement -INSERT INTO test_unix_millis VALUES - (timestamp('1970-01-01 00:00:00')), - (timestamp('2024-01-15 12:34:56.123456')), - (timestamp('1969-12-31 23:59:59.999999')), - (timestamp('9999-12-31 23:59:59.999999')), - (timestamp('0001-01-01 00:00:00')), - (NULL) - -query -SELECT unix_millis(ts) FROM test_unix_millis - --- literal arguments -query -SELECT unix_millis(timestamp('1970-01-01 00:00:00')), - unix_millis(timestamp('2024-01-15 12:34:56.123456')), - unix_millis(NULL) diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/unix_seconds.sql b/spark/src/test/resources/sql-tests/expressions/datetime/unix_seconds.sql deleted file mode 100644 index 4d8771b0f0..0000000000 --- a/spark/src/test/resources/sql-tests/expressions/datetime/unix_seconds.sql +++ /dev/null @@ -1,41 +0,0 @@ --- Licensed to the Apache Software Foundation (ASF) under one --- or more contributor license agreements. See the NOTICE file --- distributed with this work for additional information --- regarding copyright ownership. The ASF licenses this file --- to you under the Apache License, Version 2.0 (the --- "License"); you may not use this file except in compliance --- with the License. You may obtain a copy of the License at --- --- http://www.apache.org/licenses/LICENSE-2.0 --- --- Unless required by applicable law or agreed to in writing, --- software distributed under the License is distributed on an --- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY --- KIND, either express or implied. See the License for the --- specific language governing permissions and limitations --- under the License. - --- The result is whole seconds since epoch in UTC, so it must not depend on --- the session timezone. --- ConfigMatrix: spark.sql.session.timeZone=UTC,America/Los_Angeles - -statement -CREATE TABLE test_unix_seconds(ts timestamp) USING parquet - -statement -INSERT INTO test_unix_seconds VALUES - (timestamp('1970-01-01 00:00:00')), - (timestamp('2024-01-15 12:34:56.123456')), - (timestamp('1969-12-31 23:59:59.999999')), - (timestamp('9999-12-31 23:59:59.999999')), - (timestamp('0001-01-01 00:00:00')), - (NULL) - -query -SELECT unix_seconds(ts) FROM test_unix_seconds - --- literal arguments -query -SELECT unix_seconds(timestamp('1970-01-01 00:00:00')), - unix_seconds(timestamp('2024-01-15 12:34:56.123456')), - unix_seconds(NULL)