From 1c29f5898e6579a45d1196ce6c07d8ac86b4268a Mon Sep 17 00:00:00 2001 From: "lisizhuo.lsz" Date: Wed, 3 Jun 2026 07:30:57 +0000 Subject: [PATCH] feat: remove support for FLOAT/DOUBLE type partition field --- .../binary_row_partition_computer_test.cpp | 92 ++++++++----------- .../common/utils/data_converter_utils.h | 65 ------------- .../utils/data_converter_utils_test.cpp | 47 ++-------- src/paimon/core/io/field_mapping_reader.cpp | 10 -- .../core/io/field_mapping_reader_test.cpp | 71 ++++++-------- src/paimon/core/schema/schema_validation.cpp | 16 ++-- src/paimon/core/schema/schema_validation.h | 4 +- .../core/schema/schema_validation_test.cpp | 57 +++++++----- .../core/stats/simple_stats_collector.cpp | 34 ------- .../stats/simple_stats_collector_test.cpp | 39 ++++---- src/paimon/core/utils/field_mapping_test.cpp | 51 +++++----- src/paimon/testing/utils/data_generator.cpp | 8 -- test/inte/write_and_read_inte_test.cpp | 91 +++++++++--------- 13 files changed, 202 insertions(+), 383 deletions(-) diff --git a/src/paimon/common/utils/binary_row_partition_computer_test.cpp b/src/paimon/common/utils/binary_row_partition_computer_test.cpp index 17028aae8..8e8f84d25 100644 --- a/src/paimon/common/utils/binary_row_partition_computer_test.cpp +++ b/src/paimon/common/utils/binary_row_partition_computer_test.cpp @@ -43,16 +43,14 @@ TEST(BinaryRowPartitionComputerTest, TestToAndFromBinaryRow) { arrow::field("f6", arrow::int32()), arrow::field("f7", arrow::int64()), arrow::field("f8", arrow::int64()), - arrow::field("f9", arrow::float32()), - arrow::field("f10", arrow::float64()), - arrow::field("f11", arrow::utf8()), - arrow::field("f12", arrow::utf8()), - arrow::field("f13", arrow::date32()), + arrow::field("f9", arrow::utf8()), + arrow::field("f10", arrow::utf8()), + arrow::field("f11", arrow::date32()), arrow::field("non-partition-field", arrow::int32())}; auto schema = arrow::schema(fields); - std::vector partition_keys = {"f0", "f2", "f1", "f3", "f4", "f5", "f6", - "f7", "f8", "f9", "f10", "f11", "f12", "f13"}; + std::vector partition_keys = {"f0", "f2", "f1", "f3", "f4", "f5", + "f6", "f7", "f8", "f9", "f10", "f11"}; { // simple case with legacy_partition_name_enabled = true ASSERT_OK_AND_ASSIGN( @@ -69,14 +67,12 @@ TEST(BinaryRowPartitionComputerTest, TestToAndFromBinaryRow) { {"f6", "-448489"}, {"f7", "-9223372036854775808"}, {"f8", "182737474"}, - {"f9", "0.334"}, - {"f10", "467.66472"}, - {"f11", "abcde"}, - {"f12", "这是一个很长很长的中文"}, - {"f13", "5"}, + {"f9", "abcde"}, + {"f10", "这是一个很长很长的中文"}, + {"f11", "5"}, }; ASSERT_OK_AND_ASSIGN(BinaryRow row, computer->ToBinaryRow(partition_map)); - ASSERT_EQ(14, row.GetFieldCount()); + ASSERT_EQ(12, row.GetFieldCount()); ASSERT_EQ(true, row.GetBoolean(0)); ASSERT_EQ(-20, row.GetByte(1)); ASSERT_EQ(10, row.GetByte(2)); @@ -86,15 +82,13 @@ TEST(BinaryRowPartitionComputerTest, TestToAndFromBinaryRow) { ASSERT_EQ(-448489, row.GetInt(6)); ASSERT_EQ(std::numeric_limits::min(), row.GetLong(7)); ASSERT_EQ(182737474l, row.GetLong(8)); - ASSERT_NEAR(0.334, row.GetFloat(9), 0.0000001); - ASSERT_NEAR(467.66472, row.GetDouble(10), 0.0000001); - ASSERT_EQ("abcde", row.GetString(11).ToString()); - ASSERT_EQ("这是一个很长很长的中文", row.GetString(12).ToString()); - ASSERT_EQ(5, row.GetDate(13)); + ASSERT_EQ("abcde", row.GetString(9).ToString()); + ASSERT_EQ("这是一个很长很长的中文", row.GetString(10).ToString()); + ASSERT_EQ(5, row.GetDate(11)); std::vector> part_values; ASSERT_OK_AND_ASSIGN(part_values, computer->GeneratePartitionVector(row)); - ASSERT_EQ(14, part_values.size()); + ASSERT_EQ(12, part_values.size()); std::map actual_part_values_map; for (const auto& [key, value] : part_values) { actual_part_values_map[key] = value; @@ -117,14 +111,12 @@ TEST(BinaryRowPartitionComputerTest, TestToAndFromBinaryRow) { {"f6", "-448489"}, {"f7", "-9223372036854775808"}, {"f8", "182737474"}, - {"f9", "0.334"}, - {"f10", "467.66472"}, - {"f11", "abcde"}, - {"f12", "这是一个很长很长的中文"}, - {"f13", "1970-01-06"}, + {"f9", "abcde"}, + {"f10", "这是一个很长很长的中文"}, + {"f11", "1970-01-06"}, }; ASSERT_OK_AND_ASSIGN(BinaryRow row, computer->ToBinaryRow(partition_map)); - ASSERT_EQ(14, row.GetFieldCount()); + ASSERT_EQ(12, row.GetFieldCount()); ASSERT_EQ(true, row.GetBoolean(0)); ASSERT_EQ(-20, row.GetByte(1)); ASSERT_EQ(10, row.GetByte(2)); @@ -134,15 +126,13 @@ TEST(BinaryRowPartitionComputerTest, TestToAndFromBinaryRow) { ASSERT_EQ(-448489, row.GetInt(6)); ASSERT_EQ(std::numeric_limits::min(), row.GetLong(7)); ASSERT_EQ(182737474l, row.GetLong(8)); - ASSERT_NEAR(0.334, row.GetFloat(9), 0.0000001); - ASSERT_NEAR(467.66472, row.GetDouble(10), 0.0000001); - ASSERT_EQ("abcde", row.GetString(11).ToString()); - ASSERT_EQ("这是一个很长很长的中文", row.GetString(12).ToString()); - ASSERT_EQ(5, row.GetDate(13)); + ASSERT_EQ("abcde", row.GetString(9).ToString()); + ASSERT_EQ("这是一个很长很长的中文", row.GetString(10).ToString()); + ASSERT_EQ(5, row.GetDate(11)); std::vector> part_values; ASSERT_OK_AND_ASSIGN(part_values, computer->GeneratePartitionVector(row)); - ASSERT_EQ(14, part_values.size()); + ASSERT_EQ(12, part_values.size()); std::map actual_part_values_map; for (const auto& [key, value] : part_values) { actual_part_values_map[key] = value; @@ -165,14 +155,12 @@ TEST(BinaryRowPartitionComputerTest, TestToAndFromBinaryRow) { {"f6", "-448489"}, {"f7", "-9223372036854775808"}, {"f8", "182737474"}, - {"f9", "0.334"}, - {"f10", "467.66472"}, - {"f11", " "}, - {"f12", "__DEFAULT_PARTITION__"}, - {"f13", "5"}, + {"f9", " "}, + {"f10", "__DEFAULT_PARTITION__"}, + {"f11", "5"}, }; ASSERT_OK_AND_ASSIGN(BinaryRow row, computer->ToBinaryRow(partition_map)); - ASSERT_EQ(14, row.GetFieldCount()); + ASSERT_EQ(12, row.GetFieldCount()); ASSERT_EQ(true, row.GetBoolean(0)); ASSERT_EQ(-20, row.GetByte(1)); ASSERT_EQ(10, row.GetByte(2)); @@ -182,15 +170,13 @@ TEST(BinaryRowPartitionComputerTest, TestToAndFromBinaryRow) { ASSERT_EQ(-448489, row.GetInt(6)); ASSERT_EQ(std::numeric_limits::min(), row.GetLong(7)); ASSERT_EQ(182737474l, row.GetLong(8)); - ASSERT_NEAR(0.334, row.GetFloat(9), 0.0000001); - ASSERT_NEAR(467.66472, row.GetDouble(10), 0.0000001); - ASSERT_EQ(" ", row.GetString(11).ToString()); - ASSERT_TRUE(row.IsNullAt(12)); - ASSERT_EQ(5, row.GetInt(13)); + ASSERT_EQ(" ", row.GetString(9).ToString()); + ASSERT_TRUE(row.IsNullAt(10)); + ASSERT_EQ(5, row.GetInt(11)); std::vector> part_values; ASSERT_OK_AND_ASSIGN(part_values, computer->GeneratePartitionVector(row)); - ASSERT_EQ(14, part_values.size()); + ASSERT_EQ(12, part_values.size()); std::map actual_part_values_map; for (const auto& [key, value] : part_values) { actual_part_values_map[key] = value; @@ -205,11 +191,9 @@ TEST(BinaryRowPartitionComputerTest, TestToAndFromBinaryRow) { {"f6", "-448489"}, {"f7", "-9223372036854775808"}, {"f8", "182737474"}, - {"f9", "0.334"}, - {"f10", "467.66472"}, - {"f11", "__DEFAULT_PARTITION__"}, - {"f12", "__DEFAULT_PARTITION__"}, - {"f13", "5"}, + {"f9", "__DEFAULT_PARTITION__"}, + {"f10", "__DEFAULT_PARTITION__"}, + {"f11", "5"}, }; ASSERT_EQ(actual_part_values_map, expected_map); } @@ -227,10 +211,8 @@ TEST(BinaryRowPartitionComputerTest, TestToAndFromBinaryRow) { {"f6", "-448489"}, {"f7", "-9223372036854775808"}, {"f8", "182737474"}, - {"f9", "0.334"}, - {"f10", "467.66472"}, - {"f11", "abcde"}, - {"f12", "这是一个很长很长的中文"}}; + {"f9", "abcde"}, + {"f10", "这是一个很长很长的中文"}}; ASSERT_NOK_WITH_MSG(computer->ToBinaryRow(partition_map), "can not find partition key 'f4' in input partition"); @@ -251,10 +233,8 @@ TEST(BinaryRowPartitionComputerTest, TestToAndFromBinaryRow) { {"f6", "abcd"}, {"f7", "-9223372036854775808"}, {"f8", "182737474"}, - {"f9", "0.334"}, - {"f10", "467.66472"}, - {"f11", "abcde"}, - {"f12", "这是一个很长很长的中文"}}; + {"f9", "abcde"}, + {"f10", "这是一个很长很长的中文"}}; ASSERT_NOK_WITH_MSG(computer->ToBinaryRow(partition_map), "cannot convert field idx 6, field value abcd to type INT32"); } diff --git a/src/paimon/common/utils/data_converter_utils.h b/src/paimon/common/utils/data_converter_utils.h index 8645335b1..19a295b1e 100644 --- a/src/paimon/common/utils/data_converter_utils.h +++ b/src/paimon/common/utils/data_converter_utils.h @@ -113,26 +113,6 @@ class DataConverterUtils { return Status::OK(); }; break; - case arrow::Type::FLOAT: - converter = [](const std::string& value_str, int32_t field_idx, - BinaryRowWriter* writer) { - auto value = StringUtils::StringToValue(value_str); - RETURN_INVALID_WITH_FIELD_INFO(value, field_idx, value_str, - arrow::internal::ToString(arrow::Type::FLOAT)); - writer->WriteFloat(field_idx, value.value()); - return Status::OK(); - }; - break; - case arrow::Type::DOUBLE: - converter = [](const std::string& value_str, int32_t field_idx, - BinaryRowWriter* writer) { - auto value = StringUtils::StringToValue(value_str); - RETURN_INVALID_WITH_FIELD_INFO(value, field_idx, value_str, - arrow::internal::ToString(arrow::Type::DOUBLE)); - writer->WriteDouble(field_idx, value.value()); - return Status::OK(); - }; - break; case arrow::Type::STRING: converter = [pool](const std::string& value_str, int32_t field_idx, BinaryRowWriter* writer) { @@ -158,39 +138,6 @@ class DataConverterUtils { return converter; } - // support float and double - template - static std::string FloatValueToString(const T& value, int32_t precision) { - std::stringstream oss; - if (value >= 1e-3 && value <= 1e7) { - oss << std::fixed << std::setprecision(sizeof(T)) << value; - std::string result = oss.str(); - auto pos = result.find_last_not_of('0'); - result.erase(pos + (result[pos] == '.') + 1, std::string::npos); - return result; - } - oss << std::uppercase << std::scientific << std::setprecision(precision) << value; - std::string result = oss.str(); - auto e_pos = result.find('E'); - if (e_pos != std::string::npos) { - if (result[e_pos + 1] == '+') { - result.erase(e_pos + 1, 1 + (result[e_pos + 2] == '0')); - } else { - if (result[e_pos + 1] == '-' && result[e_pos + 2] == '0') { - result.erase(e_pos + 2, 1); - } - } - auto zero_pos = e_pos - 1; - while (zero_pos >= 1 && result[zero_pos] == '0' && result[zero_pos - 1] != '.') { - zero_pos--; - } - if (e_pos - zero_pos - 1 > 0) { - result.erase(zero_pos + 1, e_pos - zero_pos - 1); - } - } - return result; - } - static Result CreateBinaryRowFieldToStringConverter( arrow::Type::type type, bool legacy_partition_name_enabled) { BinaryRowFieldToStrConverter converter; @@ -226,18 +173,6 @@ class DataConverterUtils { return std::to_string(data); }; break; - case arrow::Type::FLOAT: - converter = [](const BinaryRow& row, int32_t field_idx) { - float data = row.GetFloat(field_idx); - return FloatValueToString(data, 6); - }; - break; - case arrow::Type::DOUBLE: - converter = [](const BinaryRow& row, int32_t field_idx) { - double data = row.GetDouble(field_idx); - return FloatValueToString(data, 15); - }; - break; case arrow::Type::STRING: converter = [](const BinaryRow& row, int32_t field_idx) { BinaryString data = row.GetString(field_idx); diff --git a/src/paimon/common/utils/data_converter_utils_test.cpp b/src/paimon/common/utils/data_converter_utils_test.cpp index 286bf205a..6e1acdb13 100644 --- a/src/paimon/common/utils/data_converter_utils_test.cpp +++ b/src/paimon/common/utils/data_converter_utils_test.cpp @@ -38,8 +38,6 @@ TEST(DataConverterUtilsTest, TestDataToBinaryRowConverterWithLegacyPartitionName {"-448489", arrow::Type::INT32}, {"279039", arrow::Type::INT64}, {"1234567", arrow::Type::INT64}, - {"0.334", arrow::Type::FLOAT}, - {"467.66472", arrow::Type::DOUBLE}, {"abcde", arrow::Type::STRING}, {"这是一个很长很长的中文", arrow::Type::STRING}, {"10440", arrow::Type::DATE32}}; @@ -77,11 +75,9 @@ TEST(DataConverterUtilsTest, TestDataToBinaryRowConverterWithLegacyPartitionName ASSERT_EQ(-448489, row.GetInt(6)); ASSERT_EQ(279039, row.GetLong(7)); ASSERT_EQ(1234567, row.GetLong(8)); - ASSERT_NEAR(0.334, row.GetFloat(9), 0.0000001); - ASSERT_NEAR(467.66472, row.GetDouble(10), 0.0000001); - ASSERT_EQ("abcde", row.GetString(11).ToString()); - ASSERT_EQ("这是一个很长很长的中文", row.GetString(12).ToString()); - ASSERT_EQ(10440, row.GetDate(13)); + ASSERT_EQ("abcde", row.GetString(9).ToString()); + ASSERT_EQ("这是一个很长很长的中文", row.GetString(10).ToString()); + ASSERT_EQ(10440, row.GetDate(11)); for (size_t idx = 0; idx < data.size(); idx++) { ASSERT_OK_AND_ASSIGN(auto partition_field_str, reconverters[idx](row, idx)); @@ -101,8 +97,6 @@ TEST(DataConverterUtilsTest, TestDataToBinaryRowConverterWithNoLegacyPartitionNa {"-448489", arrow::Type::INT32}, {"279039", arrow::Type::INT64}, {"1234567", arrow::Type::INT64}, - {"0.334", arrow::Type::FLOAT}, - {"467.66472", arrow::Type::DOUBLE}, {"abcde", arrow::Type::STRING}, {"这是一个很长很长的中文", arrow::Type::STRING}, {"1998-08-02", arrow::Type::DATE32}}; @@ -135,11 +129,9 @@ TEST(DataConverterUtilsTest, TestDataToBinaryRowConverterWithNoLegacyPartitionNa ASSERT_EQ(-448489, row.GetInt(6)); ASSERT_EQ(279039, row.GetLong(7)); ASSERT_EQ(1234567, row.GetLong(8)); - ASSERT_NEAR(0.334, row.GetFloat(9), 0.0000001); - ASSERT_NEAR(467.66472, row.GetDouble(10), 0.0000001); - ASSERT_EQ("abcde", row.GetString(11).ToString()); - ASSERT_EQ("这是一个很长很长的中文", row.GetString(12).ToString()); - ASSERT_EQ(10440, row.GetDate(13)); + ASSERT_EQ("abcde", row.GetString(9).ToString()); + ASSERT_EQ("这是一个很长很长的中文", row.GetString(10).ToString()); + ASSERT_EQ(10440, row.GetDate(11)); for (size_t idx = 0; idx < data.size(); idx++) { ASSERT_OK_AND_ASSIGN(auto partition_field_str, reconverters[idx](row, idx)); @@ -147,31 +139,4 @@ TEST(DataConverterUtilsTest, TestDataToBinaryRowConverterWithNoLegacyPartitionNa } } -TEST(DataConverterUtilsTest, TestValueToStringSimple) { - ASSERT_EQ("233.0", DataConverterUtils::FloatValueToString(static_cast(233), 6)); - ASSERT_EQ("3.0E-4", - DataConverterUtils::FloatValueToString(static_cast(0.0003), 6)); - ASSERT_EQ("3.478589E10", - DataConverterUtils::FloatValueToString(static_cast(34785895352), 6)); - ASSERT_EQ("1.0E9", - DataConverterUtils::FloatValueToString(static_cast(1000000000), 6)); - ASSERT_EQ("1000000.0", - DataConverterUtils::FloatValueToString(static_cast(1000000), 6)); - ASSERT_EQ("467.6647", - DataConverterUtils::FloatValueToString(static_cast(467.6647), 6)); - - ASSERT_EQ("233.0", - DataConverterUtils::FloatValueToString(static_cast(233), 15)); - ASSERT_EQ("3.4785895352E10", - DataConverterUtils::FloatValueToString(static_cast(34785895352), 15)); - ASSERT_EQ("1.0E9", - DataConverterUtils::FloatValueToString(static_cast(1000000000), 15)); - ASSERT_EQ("1000000.0", - DataConverterUtils::FloatValueToString(static_cast(1000000), 15)); - ASSERT_EQ("467.66472", - DataConverterUtils::FloatValueToString(static_cast(467.66472), 6)); - ASSERT_EQ("123456.123456", DataConverterUtils::FloatValueToString( - static_cast(123456.123456), 6)); -} - } // namespace paimon::test diff --git a/src/paimon/core/io/field_mapping_reader.cpp b/src/paimon/core/io/field_mapping_reader.cpp index fdb88c653..767e0db6c 100644 --- a/src/paimon/core/io/field_mapping_reader.cpp +++ b/src/paimon/core/io/field_mapping_reader.cpp @@ -218,16 +218,6 @@ Result> FieldMappingReader::GenerateSinglePartitio scalar = std::make_shared(value); break; } - case arrow::Type::type::FLOAT: { - float value = partition_.GetFloat(partition_info_.value().idx_in_partition[idx]); - scalar = std::make_shared(value); - break; - } - case arrow::Type::type::DOUBLE: { - double value = partition_.GetDouble(partition_info_.value().idx_in_partition[idx]); - scalar = std::make_shared(value); - break; - } case arrow::Type::type::STRING: { BinaryString value = partition_.GetString(partition_info_.value().idx_in_partition[idx]); diff --git a/src/paimon/core/io/field_mapping_reader_test.cpp b/src/paimon/core/io/field_mapping_reader_test.cpp index 491f67246..706281b33 100644 --- a/src/paimon/core/io/field_mapping_reader_test.cpp +++ b/src/paimon/core/io/field_mapping_reader_test.cpp @@ -211,72 +211,53 @@ class FieldMappingReaderTest : public ::testing::Test { TEST_F(FieldMappingReaderTest, TestGenerateSinglePartitionArray) { PartitionInfo partition_info; - // read schema: p9-p0 - // partition key: p0-p9 - partition_info.partition_read_schema = {DataField(9, arrow::field("p9", arrow::date32())), - DataField(8, arrow::field("p8", arrow::binary())), - DataField(7, arrow::field("p7", arrow::utf8())), - DataField(6, arrow::field("p6", arrow::float64())), - DataField(5, arrow::field("p5", arrow::float32())), + // read schema: p7-p0 + // partition key: p0-p7 + partition_info.partition_read_schema = {DataField(7, arrow::field("p7", arrow::date32())), + DataField(6, arrow::field("p6", arrow::binary())), + DataField(5, arrow::field("p5", arrow::utf8())), DataField(4, arrow::field("p4", arrow::int64())), DataField(3, arrow::field("p3", arrow::int32())), DataField(2, arrow::field("p2", arrow::int16())), DataField(1, arrow::field("p1", arrow::int8())), DataField(0, arrow::field("p0", arrow::boolean()))}; - partition_info.idx_in_target_read_schema = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - partition_info.idx_in_partition = {9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; + partition_info.idx_in_target_read_schema = {0, 1, 2, 3, 4, 5, 6, 7}; + partition_info.idx_in_partition = {7, 6, 5, 4, 3, 2, 1, 0}; NonPartitionInfo non_part_info; auto field_mapping = std::make_unique(partition_info, non_part_info, /*non_exist_field_info=*/std::nullopt); auto partition = BinaryRowGenerator::GenerateRow( {false, static_cast(1), static_cast(2), static_cast(3), - static_cast(4), static_cast(5.1), 6.21, std::string("7"), - std::make_shared("8", pool_.get()), 100}, + static_cast(4), std::string("5"), std::make_shared("6", pool_.get()), 100}, pool_.get()); auto mapping_reader = std::make_unique( - /*field_count=*/10, /*reader=*/nullptr, partition, std::move(field_mapping), pool_); + /*field_count=*/8, /*reader=*/nullptr, partition, std::move(field_mapping), pool_); - { - ASSERT_OK_AND_ASSIGN(auto p9_array, mapping_reader->GenerateSinglePartitionArray( - /*idx in read schema*/ 0, /*batch_size=*/2)); - ASSERT_EQ(p9_array->length(), 2); - ASSERT_EQ(arrow::internal::checked_cast(p9_array.get())->Value(0), - 100); - } - { - ASSERT_OK_AND_ASSIGN(auto p8_array, mapping_reader->GenerateSinglePartitionArray( - /*idx in read schema*/ 1, /*batch_size=*/2)); - ASSERT_EQ(p8_array->length(), 2); - ASSERT_EQ(arrow::internal::checked_cast(p8_array.get())->Value(0), - "8"); - } { ASSERT_OK_AND_ASSIGN(auto p7_array, mapping_reader->GenerateSinglePartitionArray( - /*idx in read schema*/ 2, /*batch_size=*/1)); - ASSERT_EQ(p7_array->length(), 1); - ASSERT_EQ(arrow::internal::checked_cast(p7_array.get())->Value(0), - "7"); + /*idx_in_read_schema=*/0, /*batch_size=*/2)); + ASSERT_EQ(p7_array->length(), 2); + ASSERT_EQ(arrow::internal::checked_cast(p7_array.get())->Value(0), + 100); } { ASSERT_OK_AND_ASSIGN(auto p6_array, mapping_reader->GenerateSinglePartitionArray( - /*idx in read schema*/ 3, /*batch_size=*/1)); - ASSERT_EQ( - arrow::internal::checked_cast*>(p6_array.get()) - ->Value(0), - static_cast(6.21)); + /*idx_in_read_schema=*/1, /*batch_size=*/2)); + ASSERT_EQ(p6_array->length(), 2); + ASSERT_EQ(arrow::internal::checked_cast(p6_array.get())->Value(0), + "6"); } { ASSERT_OK_AND_ASSIGN(auto p5_array, mapping_reader->GenerateSinglePartitionArray( - /*idx in read schema*/ 4, /*batch_size=*/1)); - ASSERT_EQ( - arrow::internal::checked_cast*>(p5_array.get()) - ->Value(0), - static_cast(5.1)); + /*idx_in_read_schema=*/2, /*batch_size=*/1)); + ASSERT_EQ(p5_array->length(), 1); + ASSERT_EQ(arrow::internal::checked_cast(p5_array.get())->Value(0), + "5"); } { ASSERT_OK_AND_ASSIGN(auto p4_array, mapping_reader->GenerateSinglePartitionArray( - /*idx in read schema*/ 5, /*batch_size=*/1)); + /*idx_in_read_schema=*/3, /*batch_size=*/1)); ASSERT_EQ( arrow::internal::checked_cast*>(p4_array.get()) ->Value(0), @@ -284,7 +265,7 @@ TEST_F(FieldMappingReaderTest, TestGenerateSinglePartitionArray) { } { ASSERT_OK_AND_ASSIGN(auto p3_array, mapping_reader->GenerateSinglePartitionArray( - /*idx in read schema*/ 6, /*batch_size=*/1)); + /*idx_in_read_schema=*/4, /*batch_size=*/1)); ASSERT_EQ( arrow::internal::checked_cast*>(p3_array.get()) ->Value(0), @@ -292,7 +273,7 @@ TEST_F(FieldMappingReaderTest, TestGenerateSinglePartitionArray) { } { ASSERT_OK_AND_ASSIGN(auto p2_array, mapping_reader->GenerateSinglePartitionArray( - /*idx in read schema*/ 7, /*batch_size=*/1)); + /*idx_in_read_schema=*/5, /*batch_size=*/1)); ASSERT_EQ( arrow::internal::checked_cast*>(p2_array.get()) ->Value(0), @@ -300,7 +281,7 @@ TEST_F(FieldMappingReaderTest, TestGenerateSinglePartitionArray) { } { ASSERT_OK_AND_ASSIGN(auto p1_array, mapping_reader->GenerateSinglePartitionArray( - /*idx in read schema*/ 8, /*batch_size=*/1)); + /*idx_in_read_schema=*/6, /*batch_size=*/1)); ASSERT_EQ( arrow::internal::checked_cast*>(p1_array.get()) ->Value(0), @@ -308,7 +289,7 @@ TEST_F(FieldMappingReaderTest, TestGenerateSinglePartitionArray) { } { ASSERT_OK_AND_ASSIGN(auto p0_array, mapping_reader->GenerateSinglePartitionArray( - /*idx in read schema*/ 9, /*batch_size=*/1)); + /*idx_in_read_schema=*/7, /*batch_size=*/1)); ASSERT_EQ(arrow::internal::checked_cast(p0_array.get())->Value(0), false); } diff --git a/src/paimon/core/schema/schema_validation.cpp b/src/paimon/core/schema/schema_validation.cpp index 343435c98..4af714124 100644 --- a/src/paimon/core/schema/schema_validation.cpp +++ b/src/paimon/core/schema/schema_validation.cpp @@ -70,9 +70,9 @@ Status SchemaValidation::ValidateTableSchema(const TableSchema& schema) { ValidateOnlyContainPrimitiveType(schema.Fields(), schema.PrimaryKeys(), "primary key")); PAIMON_RETURN_NOT_OK( ValidateOnlyContainPrimitiveType(schema.Fields(), schema.PartitionKeys(), "partition")); - // TODO(lisizhuo.lsz): C++ Paimon do not support timestamp & decimal type in partition keys for - // now. - PAIMON_RETURN_NOT_OK(ValidateNotContainComplexType(schema.Fields(), schema.PartitionKeys())); + // TODO(lisizhuo.lsz): C++ Paimon do not support timestamp & decimal & float & double type in + // partition keys for now. + PAIMON_RETURN_NOT_OK(ValidateNotContainSpecificType(schema.Fields(), schema.PartitionKeys())); PAIMON_ASSIGN_OR_RAISE(CoreOptions options, CoreOptions::FromMap(schema.Options())); PAIMON_RETURN_NOT_OK(ValidateBucket(schema, options)); @@ -160,7 +160,7 @@ Status SchemaValidation::ValidateOnlyContainPrimitiveType( return Status::OK(); } -Status SchemaValidation::ValidateNotContainComplexType( +Status SchemaValidation::ValidateNotContainSpecificType( const std::vector& fields, const std::vector& field_names) { if (field_names.empty()) { return Status::OK(); @@ -175,8 +175,12 @@ Status SchemaValidation::ValidateNotContainComplexType( auto field = it->second; if (IsComplexType(field)) { return Status::Invalid( - fmt::format("The field {} in partition field {} is unsupported", - field->ToString(), it->first)); + fmt::format("partition field {} cannot be TIMESTAMP/DECIMAL/BLOB", field_name)); + } + if (field->type()->id() == arrow::Type::FLOAT || + field->type()->id() == arrow::Type::DOUBLE) { + return Status::Invalid( + fmt::format("partition field {} cannot be FLOAT/DOUBLE", field_name)); } } else { assert(false); diff --git a/src/paimon/core/schema/schema_validation.h b/src/paimon/core/schema/schema_validation.h index 52c63f91a..1c1875403 100644 --- a/src/paimon/core/schema/schema_validation.h +++ b/src/paimon/core/schema/schema_validation.h @@ -51,8 +51,8 @@ class SchemaValidation { static Status ValidateOnlyContainPrimitiveType(const std::vector& fields, const std::vector& field_names, const std::string& error_message_intro); - static Status ValidateNotContainComplexType(const std::vector& fields, - const std::vector& field_names); + static Status ValidateNotContainSpecificType(const std::vector& fields, + const std::vector& field_names); static Status ValidateBucket(const TableSchema& schema, const CoreOptions& options); static Status ValidateDefaultValues(const TableSchema& schema) { return Status::NotImplemented("validate default values not implemented"); diff --git a/src/paimon/core/schema/schema_validation_test.cpp b/src/paimon/core/schema/schema_validation_test.cpp index 5e2c797d7..8907c3c64 100644 --- a/src/paimon/core/schema/schema_validation_test.cpp +++ b/src/paimon/core/schema/schema_validation_test.cpp @@ -390,19 +390,34 @@ TEST(SchemaValidationTest, NonPrimitivePartitionKeyStruct) { "field f1 is unsupported"); } -TEST(SchemaValidationTest, TestComplexPartitionKey) { - auto f0 = arrow::field("f0", arrow::utf8()); - auto f1 = arrow::field("f1", arrow::decimal128(5, 2)); - auto f2 = arrow::field("f2", arrow::float64()); - arrow::FieldVector fields = {f0, f1, f2}; - auto schema = arrow::schema(fields); - std::vector primary_keys = {"f0", "f1"}; - std::vector partition_keys = {"f1"}; - ASSERT_OK_AND_ASSIGN( - std::shared_ptr table_schema, - TableSchema::Create(/*schema_id=*/0, schema, partition_keys, primary_keys, {})); - ASSERT_NOK_WITH_MSG(SchemaValidation::ValidateTableSchema(*table_schema), - "partition field f1 is unsupported"); +TEST(SchemaValidationTest, TestSpecificPartitionKey) { + { + auto f0 = arrow::field("f0", arrow::utf8()); + auto f1 = arrow::field("f1", arrow::decimal128(5, 2)); + auto f2 = arrow::field("f2", arrow::float64()); + arrow::FieldVector fields = {f0, f1, f2}; + auto schema = arrow::schema(fields); + std::vector primary_keys = {"f0", "f1"}; + std::vector partition_keys = {"f1"}; + ASSERT_OK_AND_ASSIGN( + std::shared_ptr table_schema, + TableSchema::Create(/*schema_id=*/0, schema, partition_keys, primary_keys, {})); + ASSERT_NOK_WITH_MSG(SchemaValidation::ValidateTableSchema(*table_schema), + "partition field f1 cannot be TIMESTAMP/DECIMAL/BLOB"); + } + { + auto f0 = arrow::field("f0", arrow::utf8()); + auto f1 = arrow::field("f1", arrow::float64()); + arrow::FieldVector fields = {f0, f1}; + auto schema = arrow::schema(fields); + std::vector primary_keys = {"f0", "f1"}; + std::vector partition_keys = {"f1"}; + ASSERT_OK_AND_ASSIGN( + std::shared_ptr table_schema, + TableSchema::Create(/*schema_id=*/0, schema, partition_keys, primary_keys, {})); + ASSERT_NOK_WITH_MSG(SchemaValidation::ValidateTableSchema(*table_schema), + "partition field f1 cannot be FLOAT/DOUBLE"); + } } TEST(SchemaValidationTest, TestComplexPartitionKeyWithBlob) { @@ -416,7 +431,7 @@ TEST(SchemaValidationTest, TestComplexPartitionKeyWithBlob) { std::shared_ptr table_schema, TableSchema::Create(/*schema_id=*/0, schema, partition_keys, /*primary_keys=*/{}, {})); ASSERT_NOK_WITH_MSG(SchemaValidation::ValidateTableSchema(*table_schema), - "partition field f1 is unsupported"); + "partition field f1 cannot be TIMESTAMP/DECIMAL/BLOB"); } TEST(SchemaValidationTest, TestDateTypePartitionKey) { @@ -508,8 +523,8 @@ TEST(SchemaValidationTest, ValidateBucket) { "The number of buckets needs to be greater than 0."); } { - std::vector primary_keys = {"f0", "f1"}; - std::vector partition_keys = {"f2"}; + std::vector primary_keys = {"f0", "f2"}; + std::vector partition_keys = {"f1"}; std::map options = {{Options::BUCKET, "2"}, {Options::BUCKET_KEY, "f0"}}; ASSERT_OK_AND_ASSIGN( @@ -521,7 +536,7 @@ TEST(SchemaValidationTest, ValidateBucket) { } { std::vector primary_keys = {}; - std::vector partition_keys = {"f2"}; + std::vector partition_keys = {"f1"}; std::map options = {{Options::BUCKET, "2"}}; ASSERT_OK_AND_ASSIGN( std::shared_ptr table_schema, @@ -530,7 +545,7 @@ TEST(SchemaValidationTest, ValidateBucket) { "You should define a 'bucket-key' for bucketed append mode"); } { - std::vector partition_keys = {"f2"}; + std::vector partition_keys = {"f1"}; std::map options = {{"full-compaction.delta-commits", "2"}}; ASSERT_OK_AND_ASSIGN(std::shared_ptr table_schema, TableSchema::Create(/*schema_id=*/0, schema, partition_keys, @@ -630,11 +645,11 @@ TEST(SchemaValidationTest, ValidateSequenceField) { std::map options = {{Options::BUCKET, "-1"}, {Options::SEQUENCE_FIELD, "f0,f1,f2"}}; ASSERT_OK_AND_ASSIGN(std::shared_ptr table_schema, - TableSchema::Create(/*schema_id=*/0, schema, /*partition_keys=*/{"f2"}, - /*primary_keys=*/{"f0", "f1"}, options)); + TableSchema::Create(/*schema_id=*/0, schema, /*partition_keys=*/{"f1"}, + /*primary_keys=*/{"f0", "f2"}, options)); ASSERT_NOK_WITH_MSG(SchemaValidation::ValidateTableSchema(*table_schema), "You cannot use sequence.field in cross partition update case (Primary " - "key constraint 'f0, f1' not including all partition fields 'f2')."); + "key constraint 'f0, f2' not including all partition fields 'f1')."); } } diff --git a/src/paimon/core/stats/simple_stats_collector.cpp b/src/paimon/core/stats/simple_stats_collector.cpp index dffd2ec2a..359eee361 100644 --- a/src/paimon/core/stats/simple_stats_collector.cpp +++ b/src/paimon/core/stats/simple_stats_collector.cpp @@ -130,40 +130,6 @@ Status SimpleStatsCollector::Collect(const BinaryRow& row) { } break; } - case arrow::Type::FLOAT: { - if (column_stats_[i] == nullptr) { - column_stats_[i] = ColumnStats::CreateFloatColumnStats( - std::nullopt, std::nullopt, std::nullopt); - } - auto typed_stats = dynamic_cast(column_stats_[i].get()); - if (typed_stats == nullptr) { - assert(false); - return Status::Invalid("cast typed stats failed"); - } - if (!row.IsNullAt(i)) { - typed_stats->Collect(row.GetFloat(i)); - } else { - typed_stats->Collect(std::nullopt); - } - break; - } - case arrow::Type::DOUBLE: { - if (column_stats_[i] == nullptr) { - column_stats_[i] = ColumnStats::CreateDoubleColumnStats( - std::nullopt, std::nullopt, std::nullopt); - } - auto typed_stats = dynamic_cast(column_stats_[i].get()); - if (typed_stats == nullptr) { - assert(false); - return Status::Invalid("cast typed stats failed"); - } - if (!row.IsNullAt(i)) { - typed_stats->Collect(row.GetDouble(i)); - } else { - typed_stats->Collect(std::nullopt); - } - break; - } case arrow::Type::STRING: case arrow::Type::BINARY: { if (column_stats_[i] == nullptr) { diff --git a/src/paimon/core/stats/simple_stats_collector_test.cpp b/src/paimon/core/stats/simple_stats_collector_test.cpp index eea60e2e3..55f3c29c1 100644 --- a/src/paimon/core/stats/simple_stats_collector_test.cpp +++ b/src/paimon/core/stats/simple_stats_collector_test.cpp @@ -38,9 +38,8 @@ TEST(SimpleStatsCollectorTest, TestSimple) { arrow::FieldVector fields = { arrow::field("f0", arrow::boolean()), arrow::field("f1", arrow::int8()), arrow::field("f2", arrow::int16()), arrow::field("f3", arrow::int32()), - arrow::field("f4", arrow::int64()), arrow::field("f5", arrow::float32()), - arrow::field("f6", arrow::float64()), arrow::field("f7", arrow::utf8()), - arrow::field("f8", arrow::date32()), + arrow::field("f4", arrow::int64()), arrow::field("f5", arrow::utf8()), + arrow::field("f6", arrow::date32()), }; auto schema = arrow::schema(fields); @@ -48,25 +47,21 @@ TEST(SimpleStatsCollectorTest, TestSimple) { auto pool = GetDefaultPool(); ASSERT_OK(collector.Collect(BinaryRowGenerator::GenerateRow( {true, static_cast(1), static_cast(1), static_cast(1), - static_cast(1), static_cast(3.0), static_cast(3.0), - std::string("abc"), 2025}, + static_cast(1), std::string("abc"), 2025}, pool.get()))); ASSERT_OK(collector.Collect(BinaryRowGenerator::GenerateRow( {false, static_cast(2), static_cast(2), static_cast(2), - static_cast(2), static_cast(6.0), static_cast(6.0), - std::string("bcd"), 2026}, + static_cast(2), std::string("bcd"), 2026}, pool.get()))); ASSERT_OK_AND_ASSIGN(auto col_stats, collector.GetResult()); ASSERT_OK_AND_ASSIGN(SimpleStats stats, SimpleStatsConverter::ToBinary(col_stats, pool.get())); auto expected_stats = BinaryRowGenerator::GenerateStats( {false, static_cast(1), static_cast(1), static_cast(1), - static_cast(1), static_cast(3.0), static_cast(3.0), - std::string("abc"), 2025}, + static_cast(1), std::string("abc"), 2025}, {true, static_cast(2), static_cast(2), static_cast(2), - static_cast(2), static_cast(6.0), static_cast(6.0), - std::string("bcd"), 2026}, - std::vector({0, 0, 0, 0, 0, 0, 0, 0, 0}), GetDefaultPool().get()); + static_cast(2), std::string("bcd"), 2026}, + std::vector({0, 0, 0, 0, 0, 0, 0}), GetDefaultPool().get()); ASSERT_EQ(stats, expected_stats); } @@ -75,29 +70,27 @@ TEST(SimpleStatsCollectorTest, TestNull) { arrow::FieldVector fields = { arrow::field("f0", arrow::boolean()), arrow::field("f1", arrow::int8()), arrow::field("f2", arrow::int16()), arrow::field("f3", arrow::int32()), - arrow::field("f4", arrow::int64()), arrow::field("f5", arrow::float32()), - arrow::field("f6", arrow::float64()), arrow::field("f7", arrow::utf8()), - arrow::field("f8", arrow::date32()), arrow::field("key", arrow::int32()), + arrow::field("f4", arrow::int64()), arrow::field("f5", arrow::utf8()), + arrow::field("f6", arrow::date32()), arrow::field("key", arrow::int32()), }; auto schema = arrow::schema(fields); SimpleStatsCollector collector(schema); auto pool = GetDefaultPool(); - ASSERT_OK(collector.Collect( - BinaryRowGenerator::GenerateRow({NullType(), NullType(), NullType(), NullType(), NullType(), - NullType(), NullType(), NullType(), NullType(), 100}, - pool.get()))); + ASSERT_OK(collector.Collect(BinaryRowGenerator::GenerateRow( + {NullType(), NullType(), NullType(), NullType(), NullType(), NullType(), NullType(), 100}, + pool.get()))); ASSERT_OK_AND_ASSIGN(auto col_stats, collector.GetResult()); - ASSERT_EQ(10, col_stats.size()); + ASSERT_EQ(8, col_stats.size()); ASSERT_OK_AND_ASSIGN(SimpleStats stats, SimpleStatsConverter::ToBinary(col_stats, pool.get())); ASSERT_EQ(stats.MinValues(), stats.MaxValues()); - for (size_t i = 0; i < 9; ++i) { + for (size_t i = 0; i < 7; ++i) { ASSERT_TRUE(stats.MinValues().IsNullAt(i)); } - ASSERT_EQ(stats.MinValues().GetInt(9), 100); + ASSERT_EQ(stats.MinValues().GetInt(7), 100); ASSERT_OK_AND_ASSIGN(std::vector expected, stats.NullCounts().ToLongArray()); - ASSERT_EQ(expected, std::vector({1l, 1l, 1l, 1l, 1l, 1l, 1l, 1l, 1l, 0l})); + ASSERT_EQ(expected, std::vector({1l, 1l, 1l, 1l, 1l, 1l, 1l, 0l})); } TEST(SimpleStatsCollectorTest, TestInvalidPartition) { diff --git a/src/paimon/core/utils/field_mapping_test.cpp b/src/paimon/core/utils/field_mapping_test.cpp index c74083e26..4d9a87cd2 100644 --- a/src/paimon/core/utils/field_mapping_test.cpp +++ b/src/paimon/core/utils/field_mapping_test.cpp @@ -151,15 +151,19 @@ TEST_F(FieldMappingTest, TestPartitionKeysEqualSchema) { FieldType::INT, Literal(20)); ASSERT_OK_AND_ASSIGN(auto predicate, PredicateBuilder::And({equal, not_equal})); - std::vector partition_keys = {"f0", "f1", "f2", "f3"}; + std::vector fields = {DataField(0, arrow::field("f0", arrow::utf8())), + DataField(1, arrow::field("f1", arrow::int32())), + DataField(2, arrow::field("f2", arrow::int32()))}; + std::shared_ptr schema = DataField::ConvertDataFieldsToArrowSchema(fields); + std::vector partition_keys = {"f0", "f1", "f2"}; ASSERT_OK_AND_ASSIGN(auto mapping_builder, - FieldMappingBuilder::Create(schema_, partition_keys, predicate)); - ASSERT_OK_AND_ASSIGN(auto mapping, mapping_builder->CreateFieldMapping(schema_)); + FieldMappingBuilder::Create(schema, partition_keys, predicate)); + ASSERT_OK_AND_ASSIGN(auto mapping, mapping_builder->CreateFieldMapping(schema)); PartitionInfo expected_part_info; - expected_part_info.partition_read_schema = fields_; - expected_part_info.idx_in_target_read_schema = {0, 1, 2, 3}; - expected_part_info.idx_in_partition = {0, 1, 2, 3}; + expected_part_info.partition_read_schema = fields; + expected_part_info.idx_in_target_read_schema = {0, 1, 2}; + expected_part_info.idx_in_partition = {0, 1, 2}; expected_part_info.partition_filter = std::dynamic_pointer_cast(predicate); CheckPartitionInfo(mapping->partition_info.value(), expected_part_info); @@ -353,7 +357,7 @@ TEST_F(FieldMappingTest, TestSchemaEvolution) { // add field / delete field / rename / casting // without predicate std::vector data_fields = {DataField(0, arrow::field("key0", arrow::int32())), - DataField(1, arrow::field("key1", arrow::float64())), + DataField(1, arrow::field("key1", arrow::int64())), DataField(2, arrow::field("a", arrow::int32())), DataField(3, arrow::field("b", arrow::int32())), DataField(4, arrow::field("c", arrow::int32())), @@ -362,7 +366,7 @@ TEST_F(FieldMappingTest, TestSchemaEvolution) { DataField::ConvertDataFieldsToArrowSchema(data_fields); std::vector read_fields = {DataField(0, arrow::field("key0", arrow::int32())), - DataField(1, arrow::field("key1", arrow::float64())), + DataField(1, arrow::field("key1", arrow::int64())), DataField(3, arrow::field("c", arrow::int64())), DataField(5, arrow::field("a", arrow::float32())), DataField(7, arrow::field("d", arrow::int32())), @@ -377,9 +381,8 @@ TEST_F(FieldMappingTest, TestSchemaEvolution) { ASSERT_OK_AND_ASSIGN(auto mapping, mapping_builder->CreateFieldMapping(data_fields)); PartitionInfo expected_part_info; - expected_part_info.partition_read_schema = { - DataField(0, arrow::field("key0", arrow::int32())), - DataField(1, arrow::field("key1", arrow::float64()))}; + expected_part_info.partition_read_schema = {DataField(0, arrow::field("key0", arrow::int32())), + DataField(1, arrow::field("key1", arrow::int64()))}; expected_part_info.idx_in_target_read_schema = {0, 1}; expected_part_info.idx_in_partition = {0, 1}; expected_part_info.partition_filter = nullptr; @@ -417,7 +420,7 @@ TEST_F(FieldMappingTest, TestSchemaEvolutionWithPredicate) { // field_7 is added to the middle; // field_8 is added to the last field. std::vector data_fields = {DataField(0, arrow::field("key0", arrow::int32())), - DataField(1, arrow::field("key1", arrow::float64())), + DataField(1, arrow::field("key1", arrow::int64())), DataField(2, arrow::field("a", arrow::int32())), DataField(3, arrow::field("b", arrow::decimal128(5, 2))), DataField(4, arrow::field("c", arrow::int32())), @@ -425,7 +428,7 @@ TEST_F(FieldMappingTest, TestSchemaEvolutionWithPredicate) { DataField(6, arrow::field("k", arrow::int32()))}; std::vector table_fields = {DataField(0, arrow::field("key0", arrow::int32())), - DataField(1, arrow::field("key1", arrow::float64())), + DataField(1, arrow::field("key1", arrow::int64())), DataField(6, arrow::field("k", arrow::int32())), DataField(3, arrow::field("c", arrow::decimal128(6, 3))), DataField(7, arrow::field("d", arrow::int32())), @@ -438,7 +441,7 @@ TEST_F(FieldMappingTest, TestSchemaEvolutionWithPredicate) { auto greater_or_equal = PredicateBuilder::GreaterOrEqual( /*field_index=*/0, /*field_name=*/"key0", FieldType::INT, Literal(4)); auto equal = PredicateBuilder::Equal(/*field_index=*/1, /*field_name=*/"key1", - FieldType::DOUBLE, Literal(3.0)); + FieldType::BIGINT, Literal(3l)); auto less_or_equal = PredicateBuilder::LessOrEqual(/*field_index=*/2, /*field_name=*/"k", FieldType::INT, Literal(10)); // greater_than will not be pushed down, as with casting, only integer predicates can be pushed @@ -463,9 +466,8 @@ TEST_F(FieldMappingTest, TestSchemaEvolutionWithPredicate) { ASSERT_OK_AND_ASSIGN(auto mapping, mapping_builder->CreateFieldMapping(data_fields)); PartitionInfo expected_part_info; - expected_part_info.partition_read_schema = { - DataField(0, arrow::field("key0", arrow::int32())), - DataField(1, arrow::field("key1", arrow::float64()))}; + expected_part_info.partition_read_schema = {DataField(0, arrow::field("key0", arrow::int32())), + DataField(1, arrow::field("key1", arrow::int64()))}; expected_part_info.idx_in_target_read_schema = {0, 1}; expected_part_info.idx_in_partition = {0, 1}; expected_part_info.partition_filter = @@ -511,7 +513,7 @@ TEST_F(FieldMappingTest, TestSchemaEvolutionWithPredicate2) { // field_7 is added to the middle; // field_8 is added to the last field. std::vector data_fields = {DataField(0, arrow::field("key0", arrow::int32())), - DataField(1, arrow::field("key1", arrow::float64())), + DataField(1, arrow::field("key1", arrow::int64())), DataField(2, arrow::field("a", arrow::int32())), DataField(3, arrow::field("b", arrow::decimal128(5, 2))), DataField(4, arrow::field("c", arrow::int32())), @@ -519,7 +521,7 @@ TEST_F(FieldMappingTest, TestSchemaEvolutionWithPredicate2) { DataField(6, arrow::field("k", arrow::int32()))}; std::vector table_fields = {DataField(0, arrow::field("key0", arrow::int32())), - DataField(1, arrow::field("key1", arrow::float64())), + DataField(1, arrow::field("key1", arrow::int64())), DataField(6, arrow::field("k", arrow::int32())), DataField(3, arrow::field("c", arrow::decimal128(6, 3))), DataField(7, arrow::field("d", arrow::int32())), @@ -528,7 +530,7 @@ TEST_F(FieldMappingTest, TestSchemaEvolutionWithPredicate2) { // the field order of read schema and table schema is inconsistent std::vector read_fields = {DataField(7, arrow::field("d", arrow::int32())), - DataField(1, arrow::field("key1", arrow::float64())), + DataField(1, arrow::field("key1", arrow::int64())), DataField(5, arrow::field("a", arrow::int64())), DataField(8, arrow::field("e", arrow::int32())), DataField(6, arrow::field("k", arrow::int32())), @@ -540,7 +542,7 @@ TEST_F(FieldMappingTest, TestSchemaEvolutionWithPredicate2) { auto greater_or_equal = PredicateBuilder::GreaterOrEqual( /*field_index=*/6, /*field_name=*/"key0", FieldType::INT, Literal(4)); auto equal = PredicateBuilder::Equal(/*field_index=*/1, /*field_name=*/"key1", - FieldType::DOUBLE, Literal(3.0)); + FieldType::BIGINT, Literal(3l)); auto less_or_equal = PredicateBuilder::LessOrEqual(/*field_index=*/4, /*field_name=*/"k", FieldType::INT, Literal(10)); // greater_than will not be pushed down, as with casting, only integer predicates can be pushed @@ -565,15 +567,14 @@ TEST_F(FieldMappingTest, TestSchemaEvolutionWithPredicate2) { ASSERT_OK_AND_ASSIGN(auto mapping, mapping_builder->CreateFieldMapping(data_fields)); PartitionInfo expected_part_info; - expected_part_info.partition_read_schema = { - DataField(0, arrow::field("key0", arrow::int32())), - DataField(1, arrow::field("key1", arrow::float64()))}; + expected_part_info.partition_read_schema = {DataField(0, arrow::field("key0", arrow::int32())), + DataField(1, arrow::field("key1", arrow::int64()))}; expected_part_info.idx_in_target_read_schema = {6, 1}; expected_part_info.idx_in_partition = {0, 1}; auto greater_or_equal_new = PredicateBuilder::GreaterOrEqual( /*field_index=*/0, /*field_name=*/"key0", FieldType::INT, Literal(4)); auto equal_new = PredicateBuilder::Equal(/*field_index=*/1, /*field_name=*/"key1", - FieldType::DOUBLE, Literal(3.0)); + FieldType::BIGINT, Literal(3l)); expected_part_info.partition_filter = PredicateBuilder::And({greater_or_equal_new, equal_new}).value_or(nullptr); CheckPartitionInfo(mapping->partition_info.value(), expected_part_info); diff --git a/src/paimon/testing/utils/data_generator.cpp b/src/paimon/testing/utils/data_generator.cpp index 1e9883c29..d160fa6b6 100644 --- a/src/paimon/testing/utils/data_generator.cpp +++ b/src/paimon/testing/utils/data_generator.cpp @@ -78,14 +78,6 @@ Status DataGenerator::WriteBinaryRow(const BinaryRow& src_row, int32_t src_field target_row_writer->WriteLong(target_field_id, src_row.GetLong(src_field_id)); break; } - case arrow::Type::type::FLOAT: { - target_row_writer->WriteFloat(target_field_id, src_row.GetFloat(src_field_id)); - break; - } - case arrow::Type::type::DOUBLE: { - target_row_writer->WriteDouble(target_field_id, src_row.GetDouble(src_field_id)); - break; - } case arrow::Type::type::STRING: { target_row_writer->WriteString(target_field_id, src_row.GetString(src_field_id)); break; diff --git a/test/inte/write_and_read_inte_test.cpp b/test/inte/write_and_read_inte_test.cpp index 1cc66e37b..fe015224f 100644 --- a/test/inte/write_and_read_inte_test.cpp +++ b/test/inte/write_and_read_inte_test.cpp @@ -675,11 +675,10 @@ TEST_P(WriteAndReadInteTest, TestPKWithSequenceFieldPartialInPKField) { TEST_P(WriteAndReadInteTest, TestWriteSamePartitionTwiceWithAllBasicTypesForAppend) { arrow::FieldVector fields = { - arrow::field("f_bool", arrow::boolean()), arrow::field("f_int8", arrow::int8()), - arrow::field("f_int16", arrow::int16()), arrow::field("f_int32", arrow::int32()), - arrow::field("f_int64", arrow::int64()), arrow::field("f_float", arrow::float32()), - arrow::field("f_double", arrow::float64()), arrow::field("f_string", arrow::utf8()), - arrow::field("f_date", arrow::date32()), arrow::field("f_value", arrow::int32())}; + arrow::field("f_bool", arrow::boolean()), arrow::field("f_int8", arrow::int8()), + arrow::field("f_int16", arrow::int16()), arrow::field("f_int32", arrow::int32()), + arrow::field("f_int64", arrow::int64()), arrow::field("f_string", arrow::utf8()), + arrow::field("f_date", arrow::date32()), arrow::field("f_value", arrow::int32())}; auto schema = arrow::schema(fields); auto [file_format, file_system] = GetParam(); std::map options = { @@ -697,20 +696,20 @@ TEST_P(WriteAndReadInteTest, TestWriteSamePartitionTwiceWithAllBasicTypesForAppe auto helper, TestHelper::Create(test_dir_, schema, /*partition_keys=*/ {"f_bool", "f_int8", "f_int16", "f_int32", "f_int64", - "f_float", "f_double", "f_string", "f_date"}, + "f_string", "f_date"}, /*primary_keys=*/{}, options, /*is_streaming_mode=*/true)); int64_t commit_identifier = 0; { std::map partition_map = { - {"f_bool", "true"}, {"f_int8", "1"}, {"f_int16", "100"}, - {"f_int32", "10000"}, {"f_int64", "100000"}, {"f_float", "1.5"}, - {"f_double", "2.5"}, {"f_string", "hello"}, {"f_date", "1970-01-02"}}; + {"f_bool", "true"}, {"f_int8", "1"}, {"f_int16", "100"}, + {"f_int32", "10000"}, {"f_int64", "100000"}, {"f_string", "hello"}, + {"f_date", "1970-01-02"}}; // First write to the same partition std::string data1 = R"([ - [true, 1, 100, 10000, 100000, 1.5, 2.5, "hello", 1, 10], - [true, 1, 100, 10000, 100000, 1.5, 2.5, "hello", 1, 20] + [true, 1, 100, 10000, 100000, "hello", 1, 10], + [true, 1, 100, 10000, 100000, "hello", 1, 20] ])"; ASSERT_OK_AND_ASSIGN( std::unique_ptr batch1, @@ -722,8 +721,8 @@ TEST_P(WriteAndReadInteTest, TestWriteSamePartitionTwiceWithAllBasicTypesForAppe // Second write to the same partition std::string data2 = R"([ - [true, 1, 100, 10000, 100000, 1.5, 2.5, "hello", 1, 30], - [true, 1, 100, 10000, 100000, 1.5, 2.5, "hello", 1, 40] + [true, 1, 100, 10000, 100000, "hello", 1, 30], + [true, 1, 100, 10000, 100000, "hello", 1, 40] ])"; ASSERT_OK_AND_ASSIGN( std::unique_ptr batch2, @@ -736,13 +735,12 @@ TEST_P(WriteAndReadInteTest, TestWriteSamePartitionTwiceWithAllBasicTypesForAppe { // write all null for partition fields std::map partition_map = { - {"f_bool", "null"}, {"f_int8", "null"}, {"f_int16", "null"}, - {"f_int32", "null"}, {"f_int64", "null"}, {"f_float", "null"}, - {"f_double", "null"}, {"f_string", "null"}, {"f_date", "null"}}; + {"f_bool", "null"}, {"f_int8", "null"}, {"f_int16", "null"}, {"f_int32", "null"}, + {"f_int64", "null"}, {"f_string", "null"}, {"f_date", "null"}}; // First write to the same partition std::string data1 = R"([ - [null, null, null, null, null, null, null, null, null, 50] + [null, null, null, null, null, null, null, 50] ])"; ASSERT_OK_AND_ASSIGN( std::unique_ptr batch1, @@ -754,7 +752,7 @@ TEST_P(WriteAndReadInteTest, TestWriteSamePartitionTwiceWithAllBasicTypesForAppe // Second write to the same partition std::string data2 = R"([ - [null, null, null, null, null, null, null, null, null, 60] + [null, null, null, null, null, null, null, 60] ])"; ASSERT_OK_AND_ASSIGN( std::unique_ptr batch2, @@ -772,12 +770,12 @@ TEST_P(WriteAndReadInteTest, TestWriteSamePartitionTwiceWithAllBasicTypesForAppe ASSERT_OK_AND_ASSIGN(std::vector> data_splits, helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt)); std::string expected_data = R"([ - [0, true, 1, 100, 10000, 100000, 1.5, 2.5, "hello", 1, 10], - [0, true, 1, 100, 10000, 100000, 1.5, 2.5, "hello", 1, 20], - [0, true, 1, 100, 10000, 100000, 1.5, 2.5, "hello", 1, 30], - [0, true, 1, 100, 10000, 100000, 1.5, 2.5, "hello", 1, 40], - [0, null, null, null, null, null, null, null, null, null, 50], - [0, null, null, null, null, null, null, null, null, null, 60] + [0, true, 1, 100, 10000, 100000, "hello", 1, 10], + [0, true, 1, 100, 10000, 100000, "hello", 1, 20], + [0, true, 1, 100, 10000, 100000, "hello", 1, 30], + [0, true, 1, 100, 10000, 100000, "hello", 1, 40], + [0, null, null, null, null, null, null, null, 50], + [0, null, null, null, null, null, null, null, 60] ])"; ASSERT_OK_AND_ASSIGN(bool success, helper->ReadAndCheckResult(data_type, data_splits, expected_data)); @@ -786,11 +784,10 @@ TEST_P(WriteAndReadInteTest, TestWriteSamePartitionTwiceWithAllBasicTypesForAppe TEST_P(WriteAndReadInteTest, TestWriteSamePartitionTwiceWithAllBasicTypesForPk) { arrow::FieldVector fields = { - arrow::field("f_bool", arrow::boolean()), arrow::field("f_int8", arrow::int8()), - arrow::field("f_int16", arrow::int16()), arrow::field("f_int32", arrow::int32()), - arrow::field("f_int64", arrow::int64()), arrow::field("f_float", arrow::float32()), - arrow::field("f_double", arrow::float64()), arrow::field("f_string", arrow::utf8()), - arrow::field("f_date", arrow::date32()), arrow::field("f_value", arrow::int32()), + arrow::field("f_bool", arrow::boolean()), arrow::field("f_int8", arrow::int8()), + arrow::field("f_int16", arrow::int16()), arrow::field("f_int32", arrow::int32()), + arrow::field("f_int64", arrow::int64()), arrow::field("f_string", arrow::utf8()), + arrow::field("f_date", arrow::date32()), arrow::field("f_value", arrow::int32()), arrow::field("pk", arrow::utf8())}; auto schema = arrow::schema(fields); auto [file_format, file_system] = GetParam(); @@ -804,26 +801,26 @@ TEST_P(WriteAndReadInteTest, TestWriteSamePartitionTwiceWithAllBasicTypesForPk) options = AddOptionsForJindo(options); } ASSERT_OK_AND_ASSIGN( - auto helper, TestHelper::Create(test_dir_, schema, - /*partition_keys=*/ - {"f_bool", "f_int8", "f_int16", "f_int32", "f_int64", - "f_float", "f_double", "f_string", "f_date"}, - /*primary_keys=*/ - {"pk", "f_bool", "f_int8", "f_int16", "f_int32", "f_int64", - "f_float", "f_double", "f_string", "f_date"}, - options, /*is_streaming_mode=*/true)); + auto helper, + TestHelper::Create( + test_dir_, schema, + /*partition_keys=*/ + {"f_bool", "f_int8", "f_int16", "f_int32", "f_int64", "f_string", "f_date"}, + /*primary_keys=*/ + {"pk", "f_bool", "f_int8", "f_int16", "f_int32", "f_int64", "f_string", "f_date"}, + options, /*is_streaming_mode=*/true)); int64_t commit_identifier = 0; { std::map partition_map = { - {"f_bool", "true"}, {"f_int8", "1"}, {"f_int16", "100"}, - {"f_int32", "10000"}, {"f_int64", "100000"}, {"f_float", "1.5"}, - {"f_double", "2.5"}, {"f_string", "hello"}, {"f_date", "1970-01-02"}}; + {"f_bool", "true"}, {"f_int8", "1"}, {"f_int16", "100"}, + {"f_int32", "10000"}, {"f_int64", "100000"}, {"f_string", "hello"}, + {"f_date", "1970-01-02"}}; // First write to the same partition std::string data1 = R"([ - [true, 1, 100, 10000, 100000, 1.5, 2.5, "hello", 1, 10, "pk1"], - [true, 1, 100, 10000, 100000, 1.5, 2.5, "hello", 1, 20, "pk2"] + [true, 1, 100, 10000, 100000,"hello", 1, 10, "pk1"], + [true, 1, 100, 10000, 100000,"hello", 1, 20, "pk2"] ])"; ASSERT_OK_AND_ASSIGN( std::unique_ptr batch1, @@ -835,8 +832,8 @@ TEST_P(WriteAndReadInteTest, TestWriteSamePartitionTwiceWithAllBasicTypesForPk) // Second write to the same partition std::string data2 = R"([ - [true, 1, 100, 10000, 100000, 1.5, 2.5, "hello", 1, 30, "pk1"], - [true, 1, 100, 10000, 100000, 1.5, 2.5, "hello", 1, 40, "pk3"] + [true, 1, 100, 10000, 100000,"hello", 1, 30, "pk1"], + [true, 1, 100, 10000, 100000,"hello", 1, 40, "pk3"] ])"; ASSERT_OK_AND_ASSIGN( std::unique_ptr batch2, @@ -854,9 +851,9 @@ TEST_P(WriteAndReadInteTest, TestWriteSamePartitionTwiceWithAllBasicTypesForPk) ASSERT_OK_AND_ASSIGN(std::vector> data_splits, helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt)); std::string expected_data = R"([ - [0, true, 1, 100, 10000, 100000, 1.5, 2.5, "hello", 1, 30, "pk1"], - [0, true, 1, 100, 10000, 100000, 1.5, 2.5, "hello", 1, 20, "pk2"], - [0, true, 1, 100, 10000, 100000, 1.5, 2.5, "hello", 1, 40, "pk3"] + [0, true, 1, 100, 10000, 100000, "hello", 1, 30, "pk1"], + [0, true, 1, 100, 10000, 100000, "hello", 1, 20, "pk2"], + [0, true, 1, 100, 10000, 100000, "hello", 1, 40, "pk3"] ])"; ASSERT_OK_AND_ASSIGN(bool success, helper->ReadAndCheckResult(data_type, data_splits, expected_data));