diff --git a/cpp/src/gandiva/function_registry.cc b/cpp/src/gandiva/function_registry.cc index 7ef9178d8d8a..a4fe56af9d93 100644 --- a/cpp/src/gandiva/function_registry.cc +++ b/cpp/src/gandiva/function_registry.cc @@ -92,7 +92,35 @@ Status FunctionRegistry::Add(NativeFunction func) { pc_registry_.emplace_back(std::move(func)); const auto& last_func = pc_registry_.back(); for (const auto& func_signature : last_func.signatures()) { - pc_registry_map_.emplace(&func_signature, &last_func); + auto [it, inserted] = pc_registry_map_.emplace(&func_signature, &last_func); + if (!inserted) { + ARROW_LOG(ERROR) << "Duplicate function signature registered: " + << func_signature.ToString() + << " (existing pc_name=" << it->second->pc_name() + << ", new pc_name=" << last_func.pc_name() + << "); the new registration will be ignored on lookup."; + continue; + } + // Also flag the weaker collision where name+params match but the return type + // differs — these are technically distinct entries in the signature map, but + // they create ambiguity at the SQL surface (e.g. `day(timestamp)` resolving + // to either extractDay → int64 or truncateDay → timestamp). + auto shape_key = func_signature.CallShape(); + auto shape_it = call_shape_map_.find(shape_key); + if (shape_it != call_shape_map_.end()) { + auto pc_it = pc_registry_map_.find(shape_it->second); + std::string existing_pc_name = + pc_it != pc_registry_map_.end() ? pc_it->second->pc_name() : ""; + ARROW_LOG(ERROR) << "Function alias collision: " << func_signature.ToString() + << " has the same name and parameter types as " + << shape_it->second->ToString() + << " (existing pc_name=" << existing_pc_name + << ", new pc_name=" << last_func.pc_name() + << "); callers of this function will get different results " + "depending on the inferred return type."; + } else { + call_shape_map_.emplace(std::move(shape_key), &func_signature); + } } return arrow::Status::OK(); } diff --git a/cpp/src/gandiva/function_registry.h b/cpp/src/gandiva/function_registry.h index 24b64fac5f3f..463e774817fa 100644 --- a/cpp/src/gandiva/function_registry.h +++ b/cpp/src/gandiva/function_registry.h @@ -85,6 +85,10 @@ class GANDIVA_EXPORT FunctionRegistry { private: std::vector pc_registry_; SignatureMap pc_registry_map_; + // Tracks name+param-types (return type ignored) to detect call-shape collisions + // where the same `name(args)` could resolve to two functions with different + // return types. + std::unordered_map call_shape_map_; std::vector> bitcode_memory_buffers_; std::vector> c_functions_; FunctionHolderMakerRegistry holder_maker_registry_; diff --git a/cpp/src/gandiva/function_registry_datetime.cc b/cpp/src/gandiva/function_registry_datetime.cc index a2fe75c80f3f..7b860ca1b748 100644 --- a/cpp/src/gandiva/function_registry_datetime.cc +++ b/cpp/src/gandiva/function_registry_datetime.cc @@ -21,7 +21,7 @@ namespace gandiva { -#define DATE_EXTRACTION_TRUNCATION_FNS(INNER, name) \ +#define DATE_EXTRACTION_FNS(INNER, name) \ DATE_TYPES(INNER, name##Millennium, {}), DATE_TYPES(INNER, name##Century, {}), \ DATE_TYPES(INNER, name##Decade, {}), DATE_TYPES(INNER, name##Year, {"year"}), \ DATE_TYPES(INNER, name##Quarter, ({"quarter"})), \ @@ -32,6 +32,14 @@ namespace gandiva { DATE_TYPES(INNER, name##Minute, {"minute"}), \ DATE_TYPES(INNER, name##Second, {"second"}) +#define DATE_TRUNCATION_FNS(INNER, name) \ + DATE_TYPES(INNER, name##Millennium, {}), DATE_TYPES(INNER, name##Century, {}), \ + DATE_TYPES(INNER, name##Decade, {}), DATE_TYPES(INNER, name##Year, {}), \ + DATE_TYPES(INNER, name##Quarter, {}), DATE_TYPES(INNER, name##Month, {}), \ + DATE_TYPES(INNER, name##Week, {}), DATE_TYPES(INNER, name##Day, {}), \ + DATE_TYPES(INNER, name##Hour, {}), DATE_TYPES(INNER, name##Minute, {}), \ + DATE_TYPES(INNER, name##Second, {}) + #define TO_TIMESTAMP_SAFE_NULL_IF_NULL(NAME, ALIASES, TYPE) \ NativeFunction(#NAME, std::vector ALIASES, DataTypeVector{TYPE()}, \ timestamp(), kResultNullIfNull, ARROW_STRINGIFY(NAME##_##TYPE)) @@ -51,8 +59,8 @@ std::vector GetDateTimeFunctionRegistry() { static std::vector date_time_fn_registry_ = { UNARY_SAFE_NULL_NEVER_BOOL(isnull, {}, day_time_interval), UNARY_SAFE_NULL_NEVER_BOOL(isnull, {}, month_interval), - DATE_EXTRACTION_TRUNCATION_FNS(EXTRACT_SAFE_NULL_IF_NULL, extract), - DATE_EXTRACTION_TRUNCATION_FNS(TRUNCATE_SAFE_NULL_IF_NULL, date_trunc_), + DATE_EXTRACTION_FNS(EXTRACT_SAFE_NULL_IF_NULL, extract), + DATE_TRUNCATION_FNS(TRUNCATE_SAFE_NULL_IF_NULL, date_trunc_), DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, extractDoy, {}), DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, extractDow, {}), diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc index be57ce4f4768..eee17b34ea9f 100644 --- a/cpp/src/gandiva/function_registry_string.cc +++ b/cpp/src/gandiva/function_registry_string.cc @@ -353,13 +353,12 @@ std::vector GetStringFunctionRegistry() { kResultNullIfNull, "byte_substr_binary_int32_int32", NativeFunction::kNeedsContext), - NativeFunction("convert_fromUTF8", {"convert_fromutf8"}, DataTypeVector{binary()}, - utf8(), kResultNullIfNull, "convert_fromUTF8_binary", + NativeFunction("convert_fromUTF8", {}, DataTypeVector{binary()}, utf8(), + kResultNullIfNull, "convert_fromUTF8_binary", NativeFunction::kNeedsContext), - NativeFunction("convert_replaceUTF8", {"convert_replaceutf8"}, - DataTypeVector{binary(), utf8()}, utf8(), kResultNullIfNull, - "convert_replace_invalid_fromUTF8_binary", + NativeFunction("convert_replaceUTF8", {}, DataTypeVector{binary(), utf8()}, utf8(), + kResultNullIfNull, "convert_replace_invalid_fromUTF8_binary", NativeFunction::kNeedsContext), NativeFunction("convert_toDOUBLE", {}, DataTypeVector{float64()}, binary(), diff --git a/cpp/src/gandiva/function_registry_test.cc b/cpp/src/gandiva/function_registry_test.cc index bbe72c0ee970..377c13594deb 100644 --- a/cpp/src/gandiva/function_registry_test.cc +++ b/cpp/src/gandiva/function_registry_test.cc @@ -20,7 +20,9 @@ #include #include #include +#include #include +#include #include #include "gandiva/tests/test_util.h" @@ -39,6 +41,90 @@ class TestFunctionRegistry : public ::testing::Test { } }; +TEST_F(TestFunctionRegistry, TestAddDuplicateSignatureKeepsFirstRegistration) { + auto registry = std::make_unique(); + auto buffer = arrow::Buffer::FromString(""); + + NativeFunction first("foo", {}, DataTypeVector{arrow::int32()}, arrow::int32(), + kResultNullIfNull, "foo_first"); + NativeFunction second("foo", {}, DataTypeVector{arrow::int32()}, arrow::int32(), + kResultNullIfNull, "foo_second"); + + ARROW_EXPECT_OK(registry->Register({first}, buffer)); + + testing::internal::CaptureStderr(); + ARROW_EXPECT_OK(registry->Register({second}, buffer)); + std::string log = testing::internal::GetCapturedStderr(); + + EXPECT_NE(log.find("Duplicate function signature registered"), std::string::npos) + << "stderr was: " << log; + EXPECT_NE(log.find("foo_first"), std::string::npos) << "stderr was: " << log; + EXPECT_NE(log.find("foo_second"), std::string::npos) << "stderr was: " << log; + + // The first registration wins; lookup returns the original pc_name. + FunctionSignature sig("foo", {arrow::int32()}, arrow::int32()); + const NativeFunction* fn = registry->LookupSignature(sig); + ASSERT_NE(fn, nullptr); + EXPECT_EQ(fn->pc_name(), "foo_first"); +} + +TEST_F(TestFunctionRegistry, TestAddCallShapeCollisionLogsAndKeepsBoth) { + auto registry = std::make_unique(); + auto buffer = arrow::Buffer::FromString(""); + + NativeFunction returns_int32("foo", {}, DataTypeVector{arrow::int32()}, arrow::int32(), + kResultNullIfNull, "foo_int32_to_int32"); + NativeFunction returns_int64("foo", {}, DataTypeVector{arrow::int32()}, arrow::int64(), + kResultNullIfNull, "foo_int32_to_int64"); + + ARROW_EXPECT_OK(registry->Register({returns_int32}, buffer)); + + testing::internal::CaptureStderr(); + ARROW_EXPECT_OK(registry->Register({returns_int64}, buffer)); + std::string log = testing::internal::GetCapturedStderr(); + + EXPECT_NE(log.find("Function alias collision"), std::string::npos) + << "stderr was: " << log; + EXPECT_NE(log.find("foo_int32_to_int32"), std::string::npos) << "stderr was: " << log; + EXPECT_NE(log.find("foo_int32_to_int64"), std::string::npos) << "stderr was: " << log; + + // Both signatures remain reachable because they differ in return type. + FunctionSignature as_int32("foo", {arrow::int32()}, arrow::int32()); + FunctionSignature as_int64("foo", {arrow::int32()}, arrow::int64()); + const NativeFunction* fn_int32 = registry->LookupSignature(as_int32); + const NativeFunction* fn_int64 = registry->LookupSignature(as_int64); + ASSERT_NE(fn_int32, nullptr); + ASSERT_NE(fn_int64, nullptr); + EXPECT_EQ(fn_int32->pc_name(), "foo_int32_to_int32"); + EXPECT_EQ(fn_int64->pc_name(), "foo_int32_to_int64"); +} + +TEST_F(TestFunctionRegistry, TestAddCallShapeCollisionAcrossDecimalPrecision) { + // FunctionSignature::operator== treats decimals as equal when byte_width matches, + // regardless of precision/scale. The alias-collision diagnostic must use the same + // identity rule, so two registrations differing only in decimal precision/scale + // should still be flagged. + auto registry = std::make_unique(); + auto buffer = arrow::Buffer::FromString(""); + + NativeFunction precise("foo", {}, DataTypeVector{arrow::decimal128(10, 2)}, + arrow::int32(), kResultNullIfNull, "foo_decimal128_precise"); + NativeFunction wide("foo", {}, DataTypeVector{arrow::decimal128(18, 4)}, arrow::int64(), + kResultNullIfNull, "foo_decimal128_wide"); + + ARROW_EXPECT_OK(registry->Register({precise}, buffer)); + + testing::internal::CaptureStderr(); + ARROW_EXPECT_OK(registry->Register({wide}, buffer)); + std::string log = testing::internal::GetCapturedStderr(); + + EXPECT_NE(log.find("Function alias collision"), std::string::npos) + << "stderr was: " << log; + EXPECT_NE(log.find("foo_decimal128_precise"), std::string::npos) + << "stderr was: " << log; + EXPECT_NE(log.find("foo_decimal128_wide"), std::string::npos) << "stderr was: " << log; +} + TEST_F(TestFunctionRegistry, TestFound) { FunctionSignature add_i32_i32("add", {arrow::int32(), arrow::int32()}, arrow::int32()); @@ -85,6 +171,11 @@ TEST_F(TestFunctionRegistry, TestNoDuplicates) { std::unordered_set native_func_duplicates; std::unordered_set func_sigs; std::unordered_set func_sig_duplicates; + // (name, param-types) -> ret_type seen first; ignores return type to detect + // call-shape ambiguity, where the same user-facing call could resolve to two + // functions with different return types. + std::unordered_map call_shapes; + std::unordered_set call_shape_duplicates; for (const auto& native_func_it : *registry_) { auto& first_sig = native_func_it.signatures().front(); auto pc_func_sig = FunctionSignature(native_func_it.pc_name(), @@ -103,6 +194,16 @@ TEST_F(TestFunctionRegistry, TestNoDuplicates) { } else { func_sig_duplicates.insert(sig_str); } + + auto call_shape = sig.CallShape(); + auto ret_str = sig.ret_type()->ToString(); + auto it = call_shapes.find(call_shape); + if (it == call_shapes.end()) { + call_shapes.emplace(call_shape, ret_str); + } else if (it->second != ret_str) { + call_shape_duplicates.insert(call_shape + " resolves to both " + it->second + + " and " + ret_str); + } } } std::ostringstream stream; @@ -114,6 +215,7 @@ TEST_F(TestFunctionRegistry, TestNoDuplicates) { "following precompiled functions:\n" << result; + stream.str(""); stream.clear(); std::copy(func_sig_duplicates.begin(), func_sig_duplicates.end(), std::ostream_iterator(stream, "\n")); @@ -121,5 +223,14 @@ TEST_F(TestFunctionRegistry, TestNoDuplicates) { << "The following signatures are defined more than once possibly pointing to " "different precompiled functions:\n" << stream.str(); + + std::ostringstream shape_stream; + std::copy(call_shape_duplicates.begin(), call_shape_duplicates.end(), + std::ostream_iterator(shape_stream, "\n")); + EXPECT_TRUE(call_shape_duplicates.empty()) + << "The following calls have the same name and parameter types but different " + "return types, so callers will get different results depending on the inferred " + "return type:\n" + << shape_stream.str(); } } // namespace gandiva diff --git a/cpp/src/gandiva/function_registry_timestamp_arithmetic.cc b/cpp/src/gandiva/function_registry_timestamp_arithmetic.cc index 188bd60d9eac..f79c542aa6cf 100644 --- a/cpp/src/gandiva/function_registry_timestamp_arithmetic.cc +++ b/cpp/src/gandiva/function_registry_timestamp_arithmetic.cc @@ -76,9 +76,6 @@ std::vector GetDateTimeArithmeticFunctionRegistry() { DATE_ADD_FNS(date_add, {}), DATE_ADD_FNS(add, {}), - NativeFunction("add", {}, DataTypeVector{date64(), int64()}, timestamp(), - kResultNullIfNull, "add_date64_int64"), - DATE_DIFF_FNS(date_sub, {}), DATE_DIFF_FNS(subtract, {}), DATE_DIFF_FNS(date_diff, {})}; diff --git a/cpp/src/gandiva/function_signature.cc b/cpp/src/gandiva/function_signature.cc index 136afca2d944..b21c0424280d 100644 --- a/cpp/src/gandiva/function_signature.cc +++ b/cpp/src/gandiva/function_signature.cc @@ -114,4 +114,23 @@ std::string FunctionSignature::ToString() const { return s.str(); } +std::string FunctionSignature::CallShape() const { + std::stringstream s; + s << AsciiToLower(base_name_) << "("; + for (uint32_t i = 0; i < param_types_.size(); i++) { + if (i > 0) { + s << ", "; + } + const auto& p = param_types_[i]; + if (p->id() == arrow::Type::DECIMAL) { + // Precision/scale aren't part of signature identity (see operator==). + s << "decimal" << (p->byte_width() * 8); + } else { + s << p->ToString(); + } + } + s << ")"; + return s.str(); +} + } // namespace gandiva diff --git a/cpp/src/gandiva/function_signature.h b/cpp/src/gandiva/function_signature.h index c3e3639495a9..8a52ff3ee527 100644 --- a/cpp/src/gandiva/function_signature.h +++ b/cpp/src/gandiva/function_signature.h @@ -46,6 +46,11 @@ class GANDIVA_EXPORT FunctionSignature { std::string ToString() const; + /// String identifying the call shape: lowercased base name + parameter types + /// (return type omitted). Useful for detecting two functions that share the + /// same `name(args)` shape but differ in return type. + std::string CallShape() const; + private: std::string base_name_; DataTypeVector param_types_; diff --git a/cpp/src/gandiva/precompiled/types.h b/cpp/src/gandiva/precompiled/types.h index c93b694fc777..092fa25bf25e 100644 --- a/cpp/src/gandiva/precompiled/types.h +++ b/cpp/src/gandiva/precompiled/types.h @@ -150,11 +150,11 @@ gdv_float64 nvl_float64_float64(gdv_float64 in, gdv_boolean is_valid_in, gdv_boolean nvl_boolean_boolean(gdv_boolean in, gdv_boolean is_valid_in, gdv_boolean replace, gdv_boolean is_valid_value); -gdv_int64 date_add_int32_timestamp(gdv_int32, gdv_timestamp); -gdv_int64 add_int64_timestamp(gdv_int64, gdv_timestamp); -gdv_int64 add_int32_timestamp(gdv_int32, gdv_timestamp); -gdv_int64 date_add_int64_timestamp(gdv_int64, gdv_timestamp); -gdv_timestamp add_date64_int64(gdv_date64, gdv_int64); +gdv_timestamp date_add_int32_timestamp(gdv_int32, gdv_timestamp); +gdv_timestamp add_int64_timestamp(gdv_int64, gdv_timestamp); +gdv_timestamp add_int32_timestamp(gdv_int32, gdv_timestamp); +gdv_timestamp date_add_int64_timestamp(gdv_int64, gdv_timestamp); +gdv_date64 add_date64_int64(gdv_date64, gdv_int64); gdv_timestamp to_timestamp_int32(gdv_int32); gdv_timestamp to_timestamp_int64(gdv_int64);